Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions dataset/src/main/scala/frameless/TypedColumn.scala
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,31 @@ abstract class AbstractTypedColumn[T, U]
/** Casts the column to a new type `A`, provided the cast is allowed
  * by Catalyst (witnessed by `CatalystCast[U, A]`).
  *
  * @tparam A the target type of the cast
  */
def cast[A: TypedEncoder](implicit c: CatalystCast[U, A]): ThisType[T, A] = {
  val targetRepr = TypedEncoder[A].catalystRepr
  typed(self.untyped.cast(targetRepr))
}

/** An expression that returns a substring of the column value.
  * {{{
  *   df.select(df('a).substr(0, 5))
  * }}}
  *
  * Only available on `String` columns (witnessed by `U =:= String`).
  * Note: Spark exposes only `substr(Int, Int)` and `substr(Column, Column)`;
  * mixing a literal with a column is not supported.
  *
  * @param startPos starting position
  * @param len length of the substring
  */
def substr(startPos: Int, len: Int)(implicit ev: U =:= String): ThisType[T, String] = {
  val sliced = self.untyped.substr(startPos, len)
  typed(sliced)
}

/** An expression that returns a substring whose bounds are themselves columns.
  * {{{
  *   df.select(df('a).substr(df('b), df('c)))
  * }}}
  *
  * Only available on `String` columns (witnessed by `U =:= String`).
  *
  * Two independent type parameters (`TT1`/`TT2`) are used so that `startPos`
  * and `len` may come from DIFFERENT datasets (e.g. the two sides of a join);
  * a single shared parameter would force both columns to originate from the
  * same dataset. The two `With.Aux` steps fold the owners into the result
  * owner `W2`.
  *
  * @param startPos expression for the starting position
  * @param len expression for the length of the substring
  */
def substr[TT1, TT2, W1, W2](startPos: ThisType[TT1, Int], len: ThisType[TT2, Int])
  (implicit
    i0: U =:= String,
    i1: With.Aux[T, TT1, W1],
    i2: With.Aux[W1, TT2, W2]
  ): ThisType[W2, String] =
    typed(self.untyped.substr(startPos.untyped, len.untyped))

/** String contains another string literal.
* {{{
* df.filter ( df.col('a).contains("foo") )
Expand Down
53 changes: 53 additions & 0 deletions dataset/src/test/scala/frameless/ColumnTests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,59 @@ class ColumnTests extends TypedDatasetSuite {
}
}

test("substr") {
  val spark = session
  import spark.implicits._

  // Column-based bounds: typed substr must agree with vanilla Spark substr.
  check {
    forAll { (str: String, start: Int, length: Int) =>
      val ds = TypedDataset.create(X3(str, start, length) :: Nil)

      val typedResult = ds
        .select(ds('a).substr(ds('b), ds('c)))
        .collect()
        .run()
        .toList

      val df = ds.toDF()
      val untypedResult = df
        .select(df("a").substr(df("b"), df("c")))
        .as[String]
        .collect()
        .toList

      typedResult ?= untypedResult
    }
  }

  // Literal bounds: same agreement check with Int arguments.
  check {
    forAll { (str: String, start: Int, length: Int) =>
      val ds = TypedDataset.create(X1(str) :: Nil)

      val typedResult = ds
        .select(ds('a).substr(start, length))
        .collect()
        .run()
        .toList

      val df = ds.toDF()
      val untypedResult = df
        .select(df("a").substr(start, length))
        .as[String]
        .collect()
        .toList

      typedResult ?= untypedResult
    }
  }

  // substr is only defined for String columns; anything else must not compile.
  val ds1 = TypedDataset.create((1, false, 2.0) :: Nil)
  illTyped("""ds1.select(ds1('_1).substr(0, 5))""")
  illTyped("""ds1.select(ds1('_2).substr(0, 5))""")
  illTyped("""ds1.select(ds1('_3).substr(0, 5))""")
  illTyped("""ds1.select(ds1('_1).substr(ds1('_2), ds1('_3)))""")
}

test("contains") {
val spark = session
import spark.implicits._
Expand Down