apache · dilipbiswal · Jan 20, 2020 · Jan 21, 2020 · Jan 23, 2020 · Jan 23, 2020
diff --git a/docs/sql-ref-syntax-qry-select-distribute-by.md b/docs/sql-ref-syntax-qry-select-distribute-by.md
@@ -0,0 +1,84 @@
+---
+layout: global
+title: DISTRIBUTE BY Clause
+displayTitle: DISTRIBUTE BY Clause
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+The <code>DISTRIBUTE BY</code> clause is used to repartition the data based
+on the input expressions. Unlike the `CLUSTER BY` clause, this does not
+sort the data within each partition. 
+
+### Syntax
+{% highlight sql %}
+DISTRIBUTE BY { expression [ , ... ] }
+{% endhighlight %}
+
+### Parameters
+<dl>
+  <dt><code><em>expression</em></code></dt>
+  <dd>
+    Specifies combination of one or more values, operators and SQL functions that results in a value.
+  </dd>
+</dl>
+
+### Examples
+{% highlight sql %}
+CREATE TABLE person (name STRING, age INT);
+INSERT INTO person VALUES
+    ('Zen Hui', 25), 
+    ('Anil B', 18), 
+    ('Shone S', 16), 
+    ('Mike A', 25),
+    ('John A', 18), 
+    ('Jack N', 16);
+
+-- Reduce the number of shuffle partitions to 2 to illustrate the behavior of `DISTRIBUTE BY`.
+-- It's easier to see the clustering and sorting behavior with less number of partitions.
+SET spark.sql.shuffle.partitions = 2;
+
+-- Select the rows with no ordering. Please note that without any sort directive, the result
+-- of the query is not deterministic. It's included here to just contrast it with the 
+-- behavior of `DISTRIBUTE BY`. The query below produces rows where age columns are not
+-- clustered together.
+SELECT age, name FROM person;
+
+  +---+-------+
+  |age|name   |
+  +---+-------+
+  |16 |Shone S|
+  |25 |Zen Hui|
+  |16 |Jack N |
+  |25 |Mike A |
+  |18 |John A |
+  |18 |Anil B |
+  +---+-------+
+
+-- Produces rows clustered by age. Persons with same age are clustered together.
+-- Unlike `CLUSTER BY` clause, the rows are not sorted within a partition.
+SELECT age, name FROM person DISTRIBUTE BY age;
+
+  +---+-------+
+  |age|name   |
+  +---+-------+
+  |25 |Zen Hui|
+  |25 |Mike A |
+  |18 |John A |
+  |18 |Anil B |
+  |16 |Shone S|
+  |16 |Jack N |
+  +---+-------+
+{% endhighlight %}