Merged

26 commits
33cd2cf
Extend Jedis pipeline to return key
ntviet18 Oct 25, 2018
80cac05
Cleanup k/v hgetall test
ntviet18 Oct 25, 2018
f6f278d
Clarify k/v builder
ntviet18 Oct 25, 2018
6a62e94
Implement k/v hmget
ntviet18 Oct 25, 2018
bce5228
Implement table key value extractor
ntviet18 Oct 26, 2018
0d05625
Read key column from Redis keys
ntviet18 Oct 26, 2018
3467849
Add table key extractor with keys and empty patterns
ntviet18 Oct 26, 2018
56586e4
Exclude user defined key from hash dataframe values
ntviet18 Oct 28, 2018
73501f9
Fix cannot load user defined keys
ntviet18 Oct 28, 2018
f46ac35
Replace implicit conversion with key order zipping
ntviet18 Oct 28, 2018
c103a0a
Simplify dataframe row decoding
ntviet18 Oct 28, 2018
c788085
Remove redundant type parameters
ntviet18 Oct 28, 2018
58ad507
Extract dataframe key parameters
ntviet18 Oct 28, 2018
5885961
Document key column storage
ntviet18 Oct 28, 2018
5b0cfc0
Document key column storage limitations
ntviet18 Oct 28, 2018
33695e2
Remove redundant fields from dataframe documentation
ntviet18 Oct 28, 2018
9766345
Generalize Redis prefix key pattern
ntviet18 Oct 30, 2018
f9b2fa5
Test extract key column with prefix pattern
ntviet18 Oct 30, 2018
b7f28eb
Use explicit type casts to avoid type erasure warnings
ntviet18 Oct 30, 2018
5cf3ab8
Document Redis row encoding
ntviet18 Oct 30, 2018
4c3c512
Document Redis row decoding
ntviet18 Oct 30, 2018
d19ebbb
Decouple schema inference from row decoding
ntviet18 Oct 30, 2018
e019d94
Cache filtered schema while scanning rows
ntviet18 Oct 30, 2018
2e16f7d
Prune unreachable code while persisting dataframe rows
ntviet18 Oct 30, 2018
9328b32
Document read _id from hashes with Redis key
ntviet18 Oct 30, 2018
0316628
Loan scan rows connection
ntviet18 Oct 30, 2018
73 changes: 71 additions & 2 deletions doc/dataframe.md
@@ -93,6 +93,40 @@ The keys in Redis:
2) "person:Peter"
```
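For reference, keys of this shape are produced by a write that names the key column. A minimal sketch, assuming a simple `Person(name, age)` case class (the full write example is collapsed above in this diff):

```scala
// Sketch only: persist a dataframe using `name` as the Redis key column,
// yielding keys "person:John" and "person:Peter".
case class Person(name: String, age: Int)

val df = spark.createDataFrame(Seq(Person("John", 30), Person("Peter", 45)))
df.write
  .format("org.apache.spark.sql.redis")
  .option("table", "person")
  .option("key.column", "name")
  .save()
```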

The keys will not be persisted in the Redis hashes:
Contributor: I think we should also update the 'Reading Redis Hashes' and 'DataFrame options' sections


```bash
127.0.0.1:6379> hgetall person:John
1) "age"
2) "30"
```

To load the keys back, you also need to specify
the key column parameter when reading:

```scala
val df = spark.read
.format("org.apache.spark.sql.redis")
.option("table", "person")
.option("key.column", "name")
.load()
```

Otherwise, a field named `_id` of type `String` will be populated:

```bash
root
|-- _id: string (nullable = true)
|-- age: integer (nullable = false)

+-----+---+
| _id|age|
+-----+---+
| John| 30|
|Peter| 45|
+-----+---+
```
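For comparison, the `_id` output above comes from a read that omits `key.column`. A minimal sketch against the same `person` table:

```scala
// Sketch only: without key.column, the extracted Redis key suffix is
// exposed as a String field named `_id`.
val df = spark.read
  .format("org.apache.spark.sql.redis")
  .option("table", "person")
  .load()
df.printSchema()
df.show()
```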

### Save Modes

Spark-redis supports all DataFrame [SaveMode](https://spark.apache.org/docs/latest/sql-programming-guide.html#save-modes)'s: `Append`,
@@ -213,7 +247,7 @@ root
+-----+---+
| John| 30|
|Peter| 45|
+-----+---+
```

To read with a Spark SQL:
@@ -262,8 +296,42 @@ The output is:
root
|-- name: string (nullable = true)
|-- age: string (nullable = true)
|-- _id: string (nullable = true)
```

Note: if your schema has a field named `_id`, or the schema was inferred, the
Redis key will be stored in that field. Spark-Redis will also try to
extract the key value based on your pattern (you can also change the name
of the key column; please refer to [Specifying Redis key](#specifying-redis-key)):
- if the pattern ends with `*` and it is the only wildcard, the
trailing value will be extracted (see the read sketch after this list), e.g.
```scala
df.show()
```
```bash
+-----+---+-----+
| name|age| _id|
+-----+---+-----+
| John| 30| John|
|Peter| 45|Peter|
+-----+---+-----+
```
- otherwise, the full Redis key will be kept as is, e.g.
```scala
val df = // code omitted...
.option("keys.pattern", "p*:*")
.load()
df.show()
```
```bash
+-----+---+------------+
| name|age| _id|
+-----+---+------------+
| John| 30| person:John|
|Peter| 45|person:Peter|
+-----+---+------------+
```
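The first case above corresponds to a read with a single-wildcard prefix pattern. A sketch, assuming keys of the form `person:*` and schema inference:

```scala
// Sketch only: one trailing wildcard, so just the suffix after
// "person:" ("John", "Peter") is extracted into the _id column.
val df = spark.read
  .format("org.apache.spark.sql.redis")
  .option("keys.pattern", "person:*")
  .option("infer.schema", "true")
  .load()
df.show()
```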

## DataFrame options

| Name | Description | Type | Default |
@@ -279,4 +347,5 @@ root

## Known limitations

- Nested DataFrame fields are not currently supported with Hash model. Consider making DataFrame schema flat or using Binary persistence model.
- Key column deserialization relies on the key pattern prefix, e.g. `keysPattern:*`, `tableName:$key`
src/main/scala/com/redislabs/provider/redis/util/ConnectionUtils.scala
@@ -1,15 +1,13 @@
package com.redislabs.provider.redis.util

import com.redislabs.provider.redis.RedisEndpoint
import redis.clients.jedis.Jedis

/**
* @author The Viet Nguyen
*/
object ConnectionUtils {

def withConnection[A](endpoint: RedisEndpoint)(body: Jedis => A): A = {
val conn = endpoint.connect()
def withConnection[A](conn: Jedis)(body: Jedis => A): A = {
val res = body(conn)
conn.close()
res
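A usage sketch of the revised loan-style helper: the caller now owns connection creation, and `withConnection` closes the connection after the body runs (the calling code below is illustrative):

```scala
import scala.collection.JavaConverters._

import com.redislabs.provider.redis.util.ConnectionUtils.withConnection
import redis.clients.jedis.Jedis

// Illustrative caller: pass an open Jedis connection; withConnection
// evaluates the body, closes the connection, and returns the result.
val keys: Set[String] = withConnection(new Jedis("localhost", 6379)) { conn =>
  conn.keys("person:*").asScala.toSet
}
```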
src/main/scala/org/apache/spark/sql/redis/BinaryRedisPersistence.scala
@@ -25,13 +25,14 @@ class BinaryRedisPersistence extends RedisPersistence[Array[Byte]] {
override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit =
pipeline.get(key.getBytes(UTF_8))

override def encodeRow(value: Row): Array[Byte] = {
override def encodeRow(keyName: String, value: Row): Array[Byte] = {
val fields = value.schema.fields.map(_.name)
val valuesArray = fields.map(f => value.getAs[Any](f))
SerializationUtils.serialize(valuesArray)
}

override def decodeRow(value: Array[Byte], schema: => StructType, inferSchema: Boolean): Row = {
override def decodeRow(keyMap: (String, String), value: Array[Byte], schema: StructType,
requiredColumns: Seq[String]): Row = {
val valuesArray: Array[Any] = SerializationUtils.deserialize(value)
new GenericRowWithSchema(valuesArray, schema)
}
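Outside the `Pipeline` plumbing, the binary model's row codec amounts to Java serialization of the field values in schema order. A standalone sketch of the round trip:

```scala
import org.apache.commons.lang3.SerializationUtils

// Field values in schema order, as encodeRow collects them.
val valuesArray: Array[Any] = Array("John", 30)
val bytes: Array[Byte] = SerializationUtils.serialize(valuesArray)

// decodeRow deserializes the same array and rebuilds the row against
// the provided schema.
val restored: Array[Any] = SerializationUtils.deserialize(bytes)
assert(restored.sameElements(valuesArray))
```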
src/main/scala/org/apache/spark/sql/redis/HashRedisPersistence.scala
@@ -1,6 +1,7 @@
package org.apache.spark.sql.redis

import java.lang.{Boolean => JBoolean, Byte => JByte, Double => JDouble, Float => JFloat, Long => JLong, Short => JShort}
import java.util.{List => JList}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@@ -12,47 +13,44 @@ import scala.collection.JavaConverters._
/**
* @author The Viet Nguyen
*/
class HashRedisPersistence extends RedisPersistence[Map[String, String]] {
class HashRedisPersistence extends RedisPersistence[Any] {

override def save(pipeline: Pipeline, key: String, value: Map[String, String], ttl: Int): Unit = {
val javaValue = value.asJava
override def save(pipeline: Pipeline, key: String, value: Any, ttl: Int): Unit = {
val javaValue = value.asInstanceOf[Map[String, String]].asJava
pipeline.hmset(key, javaValue)
if (ttl > 0) {
pipeline.expire(key, ttl)
}
}

override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit = {
if (requiredColumns.isEmpty) {
pipeline.hgetAll(key)
} else {
pipeline.hmget(key, requiredColumns: _*)
}
pipeline.hmget(key, requiredColumns: _*)
}

override def encodeRow(value: Row): Map[String, String] = {
override def encodeRow(keyName: String, value: Row): Map[String, String] = {
val fields = value.schema.fields.map(_.name)
val kvMap = value.getValuesMap[Any](fields)
kvMap
.filter { case (k, v) =>
.filter { case (_, v) =>
// don't store null values
v != null
}
.filter { case (k, _) =>
// don't store key values
k != keyName
}
.map { case (k, v) =>
k -> String.valueOf(v)
}
}

override def decodeRow(value: Map[String, String], schema: => StructType,
inferSchema: Boolean): Row = {
val actualSchema = if (!inferSchema) schema else {
val fields = value.keys
.map(StructField(_, StringType))
.toArray
StructType(fields)
}
val fieldsValue = parseFields(value, actualSchema)
new GenericRowWithSchema(fieldsValue, actualSchema)
override def decodeRow(keyMap: (String, String), value: Any, schema: StructType,
requiredColumns: Seq[String]): Row = {
val scalaValue = value.asInstanceOf[JList[String]].asScala
val values = requiredColumns.zip(scalaValue)
val results = values :+ keyMap
val fieldsValue = parseFields(results.toMap, schema)
new GenericRowWithSchema(fieldsValue, schema)
}

private def parseFields(value: Map[String, String], schema: StructType): Array[Any] =
23 changes: 20 additions & 3 deletions src/main/scala/org/apache/spark/sql/redis/RedisPersistence.scala
@@ -13,9 +13,26 @@ trait RedisPersistence[T] extends Serializable {

def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit

def encodeRow(value: Row): T

def decodeRow(value: T, schema: => StructType, inferSchema: Boolean): Row
/**
* Encode dataframe row before storing it in Redis.
*
* @param keyName field name that should be encoded in a special way, e.g. in Redis keys.
* @param value row to encode.
* @return encoded row
*/
def encodeRow(keyName: String, value: Row): T

/**
* Decode dataframe row stored in Redis.
*
* @param keyMap name/value of the key column extracted from the Redis key
* @param value encoded row
* @param schema row schema
* @param requiredColumns required columns to decode
* @return decoded row
*/
def decodeRow(keyMap: (String, String), value: T, schema: StructType,
requiredColumns: Seq[String]): Row
}

object RedisPersistence {
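To make the new contract concrete, here is a sketch of an encode/decode round trip through the hash model. The `HashRedisPersistence` calls mirror this PR; the driver code and data are illustrative:

```scala
import java.util.Arrays

import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.redis.HashRedisPersistence
import org.apache.spark.sql.types._

val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)))
val row = new GenericRowWithSchema(Array("John", 30), schema)

val persistence = new HashRedisPersistence

// encodeRow drops the key column, so only "age" is stored in the hash.
val hash = persistence.encodeRow("name", row) // Map("age" -> "30")

// Simulate the HMGET reply for requiredColumns = Seq("age"); decodeRow
// re-attaches the key column value extracted from the Redis key.
val reply = Arrays.asList("30")
val decoded = persistence.decodeRow("name" -> "John", reply, schema, Seq("age"))
```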