
Commit cba9463

move show_profiles and dump_profiles to SparkContext
1 parent fb9565b commit cba9463

4 files changed: +49, -43 lines

docs/configuration.md

Lines changed: 8 additions & 5 deletions
@@ -210,17 +210,20 @@ Apart from these, the following properties are also available, and may be useful
   <td><code>spark.python.profile</code></td>
   <td>false</td>
   <td>
-    Enable profiling in Python worker, the profile result will show up by `rdd.show_profile()`,
-    or it will show up before the driver exit. It also can be dumped into disk by
-    `rdd.dump_profile(path)`.
+    Enable profiling in Python workers; the profile results will show up via `sc.show_profiles()`,
+    or be printed before the driver exits. They can also be dumped to disk with
+    `sc.dump_profiles(path)`. If some of the profile results have already been displayed
+    manually, they will not be displayed again automatically before the driver exits.
   </td>
 </tr>
 <tr>
   <td><code>spark.python.profile.dump</code></td>
   <td>(none)</td>
   <td>
-    The directory which is used to dump the profile result. The results will be dumped
-    as sepereted file for each RDD. They can be loaded by ptats.Stats().
+    The directory used to dump the profile results before the driver exits.
+    The results are dumped as a separate file for each RDD and can be loaded
+    with `pstats.Stats()`. If this is specified, the profile results will not be
+    displayed automatically.
 </tr>
 <tr>
   <td><code>spark.python.worker.reuse</code></td>
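
For reference, a minimal usage sketch of the workflow these two properties describe (not part of this commit; the master, app name, and dump directory are made up):

    from pyspark import SparkConf, SparkContext

    # Turn on the Python worker profiler before the context is created.
    conf = SparkConf().set("spark.python.profile", "true")
    sc = SparkContext("local", "profile-demo", conf=conf)

    # Any job that runs Python code is profiled per RDD.
    sc.parallelize(range(1000)).map(lambda x: x * x).count()

    sc.show_profiles()                     # print the collected cProfile stats
    sc.dump_profiles("/tmp/pyspark-prof")  # or write one rdd_<id>.pstats file per RDD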

python/pyspark/context.py

Lines changed: 38 additions & 1 deletion
@@ -20,6 +20,7 @@
 import sys
 from threading import Lock
 from tempfile import NamedTemporaryFile
+import atexit

 from pyspark import accumulators
 from pyspark.accumulators import Accumulator
@@ -30,7 +31,6 @@
 from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \
     PairDeserializer, CompressedSerializer
 from pyspark.storagelevel import StorageLevel
-from pyspark import rdd
 from pyspark.rdd import RDD
 from pyspark.traceback_utils import CallSite, first_spark_call

@@ -193,6 +193,9 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize,
         self._temp_dir = \
             self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()

+        # profiling stats collected for each PythonRDD
+        self._profile_stats = []
+
     def _initialize_context(self, jconf):
         """
         Initialize SparkContext in function to allow subclass specific initialization
@@ -793,6 +796,40 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
         return list(mappedRDD._collect_iterator_through_file(it))

+    def _add_profile(self, id, profileAcc):
+        if not self._profile_stats:
+            dump_path = self._conf.get("spark.python.profile.dump")
+            if dump_path:
+                atexit.register(self.dump_profiles, dump_path)
+            else:
+                atexit.register(self.show_profiles)
+
+        self._profile_stats.append([id, profileAcc, False])
+
+    def show_profiles(self):
+        """ Print the profile stats to stdout """
+        for i, (id, acc, showed) in enumerate(self._profile_stats):
+            stats = acc.value
+            if not showed and stats:
+                print "=" * 60
+                print "Profile of RDD<id=%d>" % id
+                print "=" * 60
+                stats.sort_stats("tottime", "cumtime").print_stats()
+                # mark it as showed
+                self._profile_stats[i][2] = True
+
+    def dump_profiles(self, path):
+        """ Dump the profile stats into directory `path`
+        """
+        if not os.path.exists(path):
+            os.makedirs(path)
+        for id, acc, _ in self._profile_stats:
+            stats = acc.value
+            if stats:
+                p = os.path.join(path, "rdd_%d.pstats" % id)
+                stats.dump_stats(p)
+        self._profile_stats = []
+

 def _test():
     import atexit
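
Because dump_profiles() writes one rdd_<id>.pstats file per profiled RDD, the output can be inspected offline with the standard-library pstats module. A small sketch (the directory and RDD id are hypothetical):

    import pstats

    # Load a file produced by sc.dump_profiles("/tmp/pyspark-prof") and show the
    # ten most expensive calls, sorted the same way show_profiles() sorts them.
    stats = pstats.Stats("/tmp/pyspark-prof/rdd_3.pstats")
    stats.sort_stats("tottime", "cumtime").print_stats(10)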

python/pyspark/rdd.py

Lines changed: 1 addition & 34 deletions
@@ -28,7 +28,6 @@
 import warnings
 import heapq
 import bisect
-import atexit
 from random import Random
 from math import sqrt, log, isinf, isnan

@@ -2088,41 +2087,9 @@ def _jrdd(self):

         if enable_profile:
             self._id = self._jrdd_val.id()
-            if not self._created_profiles:
-                dump_path = self.ctx._conf.get("spark.python.profile.dump")
-                if dump_path:
-                    atexit.register(PipelinedRDD.dump_profile, dump_path)
-                else:
-                    atexit.register(PipelinedRDD.show_profile)
-            self._created_profiles.append((self._id, profileStats))
-
+            self.ctx._add_profile(self._id, profileStats)
         return self._jrdd_val

-    @classmethod
-    def show_profile(cls):
-        """ Print the profile stats to stdout """
-        for id, acc in cls._created_profiles:
-            stats = acc.value
-            if stats:
-                print "=" * 60
-                print "Profile of RDD<id=%d>" % id
-                print "=" * 60
-                stats.sort_stats("tottime", "cumtime").print_stats()
-        cls._created_profiles = []
-
-    @classmethod
-    def dump_profile(cls, dump_path):
-        """ Dump the profile stats into directory `dump_path`
-        """
-        if not os.path.exists(dump_path):
-            os.makedirs(dump_path)
-        for id, acc in cls._created_profiles:
-            stats = acc.value
-            if stats:
-                path = os.path.join(dump_path, "rdd_%d.pstats" % id)
-                stats.dump_stats(path)
-        cls._created_profiles = []
-
     def id(self):
         if self._id is None:
             self._id = self._jrdd.id()

python/pyspark/tests.py

Lines changed: 2 additions & 3 deletions
@@ -616,10 +616,9 @@ def heavy_foo(x):
             for i in range(1 << 20):
                 x = 1
         rdd = self.sc.parallelize(range(100)).foreach(heavy_foo)
-        from pyspark.rdd import PipelinedRDD
-        profiles = PipelinedRDD._created_profiles
+        profiles = self.sc._profile_stats
         self.assertEqual(1, len(profiles))
-        id, acc = profiles.pop()
+        id, acc, _ = profiles.pop()
         stats = acc.value
         self.assertTrue(stats is not None)
         width, stat_list = stats.get_print_list([])
