Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions dataset/src/main/scala/frameless/TypedColumn.scala
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,31 @@ abstract class AbstractTypedColumn[T, U]
/** Casts the column to a new type `A`, provided the cast is allowed
  * by Catalyst (witnessed by `CatalystCast[U, A]`).
  *
  * @tparam A the target type of the cast
  */
def cast[A: TypedEncoder](implicit c: CatalystCast[U, A]): ThisType[T, A] = {
  val targetRepr = TypedEncoder[A].catalystRepr
  typed(self.untyped.cast(targetRepr))
}

/** An expression that returns a substring of the column value.
  * {{{
  *   df.select(df('a).substr(0, 5))
  * }}}
  *
  * Only available on `String` columns (witnessed by `U =:= String`).
  * Note: Spark exposes only `substr(Int, Int)` and `substr(Column, Column)`;
  * mixing a literal with a column is not supported.
  *
  * @param startPos starting position
  * @param len length of the substring
  */
def substr(startPos: Int, len: Int)(implicit ev: U =:= String): ThisType[T, String] = {
  val sliced = self.untyped.substr(startPos, len)
  typed(sliced)
}

/** An expression that returns a substring whose bounds are themselves columns.
  * {{{
  *   df.select(df('a).substr(df('b), df('c)))
  * }}}
  *
  * Only available on `String` columns (witnessed by `U =:= String`).
  *
  * Two independent type parameters (`TT1`/`TT2`) are used so that `startPos`
  * and `len` may come from DIFFERENT datasets (e.g. the two sides of a join);
  * a single shared parameter would force both columns to originate from the
  * same dataset. The two `With.Aux` steps fold the owners into the result
  * owner `W2`.
  *
  * @param startPos expression for the starting position
  * @param len expression for the length of the substring
  */
def substr[TT1, TT2, W1, W2](startPos: ThisType[TT1, Int], len: ThisType[TT2, Int])
  (implicit
    i0: U =:= String,
    i1: With.Aux[T, TT1, W1],
    i2: With.Aux[W1, TT2, W2]
  ): ThisType[W2, String] =
    typed(self.untyped.substr(startPos.untyped, len.untyped))

/** String contains another string literal.
* {{{
* df.filter ( df.col('a).contains("foo") )
Expand Down
53 changes: 53 additions & 0 deletions dataset/src/test/scala/frameless/ColumnTests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,59 @@ class ColumnTests extends TypedDatasetSuite {
}
}

test("substr") {
  val spark = session
  import spark.implicits._

  // Column-based bounds: typed substr must agree with vanilla Spark substr.
  check {
    forAll { (str: String, start: Int, length: Int) =>
      val ds = TypedDataset.create(X3(str, start, length) :: Nil)

      val typedResult = ds
        .select(ds('a).substr(ds('b), ds('c)))
        .collect()
        .run()
        .toList

      val df = ds.toDF()
      val untypedResult = df
        .select(df("a").substr(df("b"), df("c")))
        .as[String]
        .collect()
        .toList

      typedResult ?= untypedResult
    }
  }

  // Literal bounds: same agreement check with Int arguments.
  check {
    forAll { (str: String, start: Int, length: Int) =>
      val ds = TypedDataset.create(X1(str) :: Nil)

      val typedResult = ds
        .select(ds('a).substr(start, length))
        .collect()
        .run()
        .toList

      val df = ds.toDF()
      val untypedResult = df
        .select(df("a").substr(start, length))
        .as[String]
        .collect()
        .toList

      typedResult ?= untypedResult
    }
  }

  // substr is only defined for String columns; anything else must not compile.
  val ds1 = TypedDataset.create((1, false, 2.0) :: Nil)
  illTyped("""ds1.select(ds1('_1).substr(0, 5))""")
  illTyped("""ds1.select(ds1('_2).substr(0, 5))""")
  illTyped("""ds1.select(ds1('_3).substr(0, 5))""")
  illTyped("""ds1.select(ds1('_1).substr(ds1('_2), ds1('_3)))""")
}

test("contains") {
val spark = session
import spark.implicits._
Expand Down