Skip to content

AWS Glue error converting data frame to dynamic frame #49

@shanmukhakota

Description

@shanmukhakota

Here's my code where I am trying to create a new data frame out of the result set of my left join on other 2 data frames and then trying to convert it to a dynamic frame.

dfs = sqlContext.read.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("query", "SELECT hashkey as hash From randomtable").load()

#Source
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "test", table_name = "randomtable", transformation_ctx = "datasource0")

#add hash value
df = datasource0.toDF()
df.cache()
df = df.withColumn("hashkey", sha2(concat_ws("||", *df.columns), 256))

#drop dupes
df1 = df.dropDuplicates(subset=['hashkey'])

#read incremental data
inc = df1.join(dfs, df1["hashkey"] == dfs["hash"], how='left').filter(col('hash').isNull())

#convert it back to glue context
datasource1 = DynamicFrame.fromDF(inc, glueContext, "datasource1")

Here is the error I get when trying to convert a data frame to a dynamic frame.

datasource1 = DynamicFrame.fromDF(inc, glueContext, "datasource1")
File
"/mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/PyGlue.zip/awsglue/dynamicframe.py",

line 150, in fromDF
File "/mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/py4j-0.10.4-src.zip/py4j/java_gateway.py",
line 1133, in call
File "/mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/pyspark.zip/pyspark/sql/utils.py",
line 63, in deco
File "/mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/py4j-0.10.4-src.zip/py4j/protocol.py",
line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:com.amazonaws.services.glue.DynamicFrame.apply.
: java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.AttributeReference.(Ljava/lang/String;Lorg/apache/spark/sql/types/DataType;ZLorg/apache/spark/sql/types/Metadata;Lorg/apache/spark/sql/catalyst/expressions/ExprId;Lscala/collection/Seq;)V
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryHelper$$anonfun$8.apply(QueryHelper.scala:66)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryHelper$$anonfun$8.apply(QueryHelper.scala:65)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:285)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryHelper.(QueryHelper.scala:64)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.SourceQuery.(SnowflakeQuery.scala:100)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.net$snowflake$spark$snowflake$pushdowns$querygeneration$QueryBuilder$$generateQueries(QueryBuilder.scala:98)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.liftedTree1$1(QueryBuilder.scala:63)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.treeRoot$lzycompute(QueryBuilder.scala:61)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.treeRoot(QueryBuilder.scala:60)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.tryBuild$lzycompute(QueryBuilder.scala:34)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.tryBuild(QueryBuilder.scala:33)
at net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder$.getRDDFromPlan(QueryBuilder.scala:179)
at net.snowflake.spark.snowflake.pushdowns.SnowflakeStrategy.buildQueryRDD(SnowflakeStrategy.scala:42)
at net.snowflake.spark.snowflake.pushdowns.SnowflakeStrategy.apply(SnowflakeStrategy.scala:24)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:62)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:62)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:77)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:74)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:74)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:66)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:77)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:74)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:74)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:66)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:77)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:74)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)

Any help is greatly appreciated.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions