Skip to content

Commit 85246e5

Browse files
committed
add docs for use_unicode
1 parent a0295e1 commit 85246e5

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

python/pyspark/context.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,10 @@ def textFile(self, name, minPartitions=None, use_unicode=True):
320320
nodes), or any Hadoop-supported file system URI, and return it as an
321321
RDD of Strings.
322322
323+
If use_unicode is False, the strings will be kept as `str` (encoded
324+
as `utf-8`), which is faster and smaller than unicode. (Added in
325+
Spark 1.1)
326+
323327
>>> path = os.path.join(tempdir, "sample-text.txt")
324328
>>> with open(path, "w") as testFile:
325329
... testFile.write("Hello world!")
@@ -339,6 +343,10 @@ def wholeTextFiles(self, path, minPartitions=None, use_unicode=True):
339343
key-value pair, where the key is the path of each file, the
340344
value is the content of each file.
341345
346+
If use_unicode is False, the strings will be kept as `str` (encoded
347+
as `utf-8`), which is faster and smaller than unicode. (Added in
348+
Spark 1.1)
349+
342350
For example, if you have the following files::
343351
344352
hdfs://a-hdfs-path/part-00000

python/pyspark/serializers.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -412,18 +412,15 @@ class UTF8Deserializer(Serializer):
412412
def __init__(self, use_unicode=False):
413413
self.use_unicode = use_unicode
414414

415-
def loads(self, stream):
416-
length = read_int(stream)
417-
return stream.read(length)
418-
419415
def load_stream(self, stream):
420416
try:
417+
_read_int = read_int # faster than global lookup
421418
if self.use_unicode:
422419
while True:
423-
yield self.loads(stream).decode("utf-8")
420+
yield stream.read(_read_int(stream)).decode("utf-8")
424421
else:
425422
while True:
426-
yield self.loads(stream)
423+
yield stream.read(_read_int(stream))
427424
except struct.error:
428425
return
429426
except EOFError:

0 commit comments

Comments
 (0)