diff --git a/python/pyspark/cloudpickle/__init__.py b/python/pyspark/cloudpickle/__init__.py index 4e85b637800dc..56506d95fa1be 100644 --- a/python/pyspark/cloudpickle/__init__.py +++ b/python/pyspark/cloudpickle/__init__.py @@ -4,4 +4,8 @@ from pyspark.cloudpickle.cloudpickle import * # noqa from pyspark.cloudpickle.cloudpickle_fast import CloudPickler, dumps, dump # noqa -__version__ = '1.5.0' +# Conform to the convention used by python serialization libraries, which +# expose their Pickler subclass at top-level under the "Pickler" name. +Pickler = CloudPickler + +__version__ = '1.6.0' diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index 58c274bd79720..05d52afa0da96 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -88,7 +88,7 @@ def g(): DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL # Track the provenance of reconstructed dynamic classes to make it possible to -# reconstruct instances from the matching singleton class definition when +# recontruct instances from the matching singleton class definition when # appropriate and preserve the usual "isinstance" semantics of Python objects. _DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary() _DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() @@ -236,7 +236,7 @@ def _extract_code_globals(co): out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresponding to the one + # syntax generates a constant code object corresonding to the one # of the nested function's As the nested function may itself need # global variables, we need to introspect its code, extract its # globals, (look for code object in it's co_consts attribute..) and @@ -457,7 +457,7 @@ def _is_parametrized_type_hint(obj): is_typing = getattr(obj, '__origin__', None) is not None # typing_extensions.Literal - is_literal = getattr(obj, '__values__', None) is not None + is_litteral = getattr(obj, '__values__', None) is not None # typing_extensions.Final is_final = getattr(obj, '__type__', None) is not None @@ -469,7 +469,7 @@ def _is_parametrized_type_hint(obj): getattr(obj, '__result__', None) is not None and getattr(obj, '__args__', None) is not None ) - return any((is_typing, is_literal, is_final, is_union, is_tuple, + return any((is_typing, is_litteral, is_final, is_union, is_tuple, is_callable)) def _create_parametrized_type_hint(origin, args): @@ -699,7 +699,7 @@ def _make_skel_func(code, cell_count, base_globals=None): """ # This function is deprecated and should be removed in cloudpickle 1.7 warnings.warn( - "A pickle file created using an old (<=1.4.1) version of cloudpickle " + "A pickle file created using an old (<=1.4.1) version of cloudpicke " "is currently being loaded. This is not supported by cloudpickle and " "will break in cloudpickle 1.7", category=UserWarning ) @@ -828,3 +828,15 @@ def _get_bases(typ): # For regular class objects bases_attr = '__bases__' return getattr(typ, bases_attr) + + +def _make_dict_keys(obj): + return dict.fromkeys(obj).keys() + + +def _make_dict_values(obj): + return {i: _ for i, _ in enumerate(obj)}.values() + + +def _make_dict_items(obj): + return obj.items() diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index 3c48ff7b0a885..fa8da0f635c49 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -6,10 +6,11 @@ is only available for Python versions 3.8+, a lot of backward-compatibility code is also removed. -Note that the C Pickler subclassing API is CPython-specific. Therefore, some +Note that the C Pickler sublassing API is CPython-specific. Therefore, some guards present in cloudpickle.py that were written to handle PyPy specificities are not present in cloudpickle_fast.py """ +import _collections_abc import abc import copyreg import io @@ -33,8 +34,8 @@ _typevar_reduce, _get_bases, _make_cell, _make_empty_cell, CellType, _is_parametrized_type_hint, PYPY, cell_set, parametrized_type_hint_getinitargs, _create_parametrized_type_hint, - builtin_code_type - + builtin_code_type, + _make_dict_keys, _make_dict_values, _make_dict_items, ) @@ -179,7 +180,7 @@ def _class_getstate(obj): clsdict.pop('__weakref__', None) if issubclass(type(obj), abc.ABCMeta): - # If obj is an instance of an ABCMeta subclass, don't pickle the + # If obj is an instance of an ABCMeta subclass, dont pickle the # cache/negative caches populated during isinstance/issubclass # checks, but pickle the list of registered subclasses of obj. clsdict.pop('_abc_cache', None) @@ -400,6 +401,24 @@ def _class_reduce(obj): return NotImplemented +def _dict_keys_reduce(obj): + # Safer not to ship the full dict as sending the rest might + # be unintended and could potentially cause leaking of + # sensitive information + return _make_dict_keys, (list(obj), ) + + +def _dict_values_reduce(obj): + # Safer not to ship the full dict as sending the rest might + # be unintended and could potentially cause leaking of + # sensitive information + return _make_dict_values, (list(obj), ) + + +def _dict_items_reduce(obj): + return _make_dict_items, (dict(obj), ) + + # COLLECTIONS OF OBJECTS STATE SETTERS # ------------------------------------ # state setters are called at unpickling time, once the object is created and @@ -407,7 +426,7 @@ def _class_reduce(obj): def _function_setstate(obj, state): - """Update the state of a dynamic function. + """Update the state of a dynaamic function. As __closure__ and __globals__ are readonly attributes of a function, we cannot rely on the native setstate routine of pickle.load_build, that calls @@ -473,6 +492,10 @@ class CloudPickler(Pickler): _dispatch_table[types.MappingProxyType] = _mappingproxy_reduce _dispatch_table[weakref.WeakSet] = _weakset_reduce _dispatch_table[typing.TypeVar] = _typevar_reduce + _dispatch_table[_collections_abc.dict_keys] = _dict_keys_reduce + _dispatch_table[_collections_abc.dict_values] = _dict_values_reduce + _dispatch_table[_collections_abc.dict_items] = _dict_items_reduce + dispatch_table = ChainMap(_dispatch_table, copyreg.dispatch_table) @@ -556,7 +579,7 @@ def dump(self, obj): # `dispatch` attribute. Earlier versions of the protocol 5 CloudPickler # used `CloudPickler.dispatch` as a class-level attribute storing all # reducers implemented by cloudpickle, but the attribute name was not a - # great choice given the meaning of `CloudPickler.dispatch` when + # great choice given the meaning of `Cloudpickler.dispatch` when # `CloudPickler` extends the pure-python pickler. dispatch = dispatch_table @@ -630,7 +653,7 @@ def reducer_override(self, obj): return self._function_reduce(obj) else: # fallback to save_global, including the Pickler's - # dispatch_table + # distpatch_table return NotImplemented else: