diff --git a/docs/sql-ref-syntax-qry-select-distribute-by.md b/docs/sql-ref-syntax-qry-select-distribute-by.md new file mode 100644 index 0000000000000..a1b3fcbfb5993 --- /dev/null +++ b/docs/sql-ref-syntax-qry-select-distribute-by.md @@ -0,0 +1,84 @@ +--- +layout: global +title: DISTRIBUTE BY Clause +displayTitle: DISTRIBUTE BY Clause +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- +The DISTRIBUTE BY clause is used to repartition the data based +on the input expressions. Unlike the `CLUSTER BY` clause, this does not +sort the data within each partition. + +### Syntax +{% highlight sql %} +DISTRIBUTE BY { expression [ , ... ] } +{% endhighlight %} + +### Parameters +
+
expression
+
+ Specifies combination of one or more values, operators and SQL functions that results in a value. +
+
+ +### Examples +{% highlight sql %} +CREATE TABLE person (name STRING, age INT); +INSERT INTO person VALUES + ('Zen Hui', 25), + ('Anil B', 18), + ('Shone S', 16), + ('Mike A', 25), + ('John A', 18), + ('Jack N', 16); + +-- Reduce the number of shuffle partitions to 2 to illustrate the behavior of `DISTRIBUTE BY`. +-- It's easier to see the clustering and sorting behavior with less number of partitions. +SET spark.sql.shuffle.partitions = 2; + +-- Select the rows with no ordering. Please note that without any sort directive, the result +-- of the query is not deterministic. It's included here to just contrast it with the +-- behavior of `DISTRIBUTE BY`. The query below produces rows where age columns are not +-- clustered together. +SELECT age, name FROM person; + + +---+-------+ + |age|name | + +---+-------+ + |16 |Shone S| + |25 |Zen Hui| + |16 |Jack N | + |25 |Mike A | + |18 |John A | + |18 |Anil B | + +---+-------+ + +-- Produces rows clustered by age. Persons with same age are clustered together. +-- Unlike `CLUSTER BY` clause, the rows are not sorted within a partition. +SELECT age, name FROM person DISTRIBUTE BY age; + + +---+-------+ + |age|name | + +---+-------+ + |25 |Zen Hui| + |25 |Mike A | + |18 |John A | + |18 |Anil B | + |16 |Shone S| + |16 |Jack N | + +---+-------+ +{% endhighlight %}