From c04a83dc311c5e073c8227665ac318647be9a0e4 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 23 Jul 2014 13:25:11 -0700 Subject: [PATCH 1/4] some default configs for PySpark --- python/pyspark/context.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index e21be0e10a3f..af1fea8a5dc2 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -37,6 +37,15 @@ from py4j.java_collections import ListConverter +# These are special default configs for PySpark, they will overwrite +# the default ones for Spark if they are not configured by user. +DEFAULT_CONFIGS = { + "spark.serializer": "org.apache.spark.serializer.KryoSerializer", + "spark.kryo.referenceTracking": False, + "spark.serializer.objectStreamReset": 1, +} + + class SparkContext(object): """ Main entry point for Spark functionality. A SparkContext represents the @@ -112,6 +121,9 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, if environment: for key, value in environment.iteritems(): self._conf.setExecutorEnv(key, value) + for key, value in DEFAULT_CONFIGS.items(): + if self._conf.get(key) is None: + self._conf.set(key, value) # Check that we have at least the required parameters if not self._conf.contains("spark.master"): From 8bc9f08686ce9bf9a0bde4da968fa6a41e03b40b Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 24 Jul 2014 11:34:52 -0700 Subject: [PATCH 2/4] fix unittest --- python/pyspark/context.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index af1fea8a5dc2..4095549f4a63 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -41,8 +41,7 @@ # the default ones for Spark if they are not configured by user. DEFAULT_CONFIGS = { "spark.serializer": "org.apache.spark.serializer.KryoSerializer", - "spark.kryo.referenceTracking": False, - "spark.serializer.objectStreamReset": 1, + "spark.serializer.objectStreamReset": 100, } From f71a355409f11c7f7fad42cd665aa49d17b7030f Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 25 Jul 2014 10:55:18 -0700 Subject: [PATCH 3/4] rebase to master, add spark.rdd.compress = True --- python/pyspark/context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index abf5171a3835..d402ec13c548 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -42,6 +42,7 @@ DEFAULT_CONFIGS = { "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.serializer.objectStreamReset": 100, + "spark.rdd.compress": True, } @@ -121,8 +122,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, for key, value in environment.iteritems(): self._conf.setExecutorEnv(key, value) for key, value in DEFAULT_CONFIGS.items(): - if self._conf.get(key) is None: - self._conf.set(key, value) + self._conf.setIfMissing(key, value) # Check that we have at least the required parameters if not self._conf.contains("spark.master"): From cd316f1c6c2026f2d65228c5fd830b6de5e95bf3 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 25 Jul 2014 18:19:12 -0700 Subject: [PATCH 4/4] remove duplicated line --- python/pyspark/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index d402ec13c548..58bf89d7091d 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -110,7 +110,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, else: self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize) - self._conf.setIfMissing("spark.rdd.compress", "true") + # Set any parameters passed directly to us on the conf if master: self._conf.setMaster(master)