#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
import sys
import os
import operator
import shlex
import warnings
import heapq
import bisect
import random
from subprocess import Popen, PIPE
from threading import Thread
from collections import defaultdict
from itertools import chain
from functools import reduce
from math import sqrt, log, isinf, isnan, pow, ceil
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Hashable,
    Iterable,
    Iterator,
    IO,
    List,
    NoReturn,
    Optional,
    Sequence,
    Tuple,
    Union,
    TypeVar,
    cast,
    overload,
    TYPE_CHECKING,
)

from pyspark.java_gateway import local_connect_and_auth
from pyspark.serializers import (
    AutoBatchedSerializer,
    BatchedSerializer,
    NoOpSerializer,
    CartesianDeserializer,
    CloudPickleSerializer,
    PairDeserializer,
    CPickleSerializer,
    Serializer,
    pack_long,
    read_int,
    write_int,
)
from pyspark.join import (
    python_join,
    python_left_outer_join,
    python_right_outer_join,
    python_full_outer_join,
    python_cogroup,
)
from pyspark.statcounter import StatCounter
from pyspark.rddsampler import RDDSampler, RDDRangeSampler, RDDStratifiedSampler
from pyspark.storagelevel import StorageLevel
from pyspark.resource.requests import ExecutorResourceRequests, TaskResourceRequests
from pyspark.resource.profile import ResourceProfile
from pyspark.resultiterable import ResultIterable
from pyspark.shuffle import (
    Aggregator,
    ExternalMerger,
    get_used_memory,
    ExternalSorter,
    ExternalGroupBy,
)
from pyspark.traceback_utils import SCCallSiteSync
from pyspark.util import fail_on_stopiteration, _parse_memory
from pyspark.errors import PySparkRuntimeError

if TYPE_CHECKING:
    import socket
    import io

    from pyspark._typing import NonUDFType
    from pyspark._typing import S, NumberOrArray
    from pyspark.context import SparkContext
    from pyspark.sql.pandas._typing import (
        PandasScalarUDFType,
        PandasGroupedMapUDFType,
        PandasGroupedAggUDFType,
        PandasWindowAggUDFType,
        PandasScalarIterUDFType,
        PandasMapIterUDFType,
        PandasCogroupedMapUDFType,
        ArrowMapIterUDFType,
        PandasGroupedMapUDFWithStateType,
    )
    from pyspark.sql.dataframe import DataFrame
    from pyspark.sql.types import AtomicType, StructType
    from pyspark.sql._typing import (
        AtomicValue,
        RowLike,
        SQLArrowBatchedUDFType,
        SQLArrowTableUDFType,
        SQLBatchedUDFType,
        SQLTableUDFType,
    )

    from py4j.java_gateway import JavaObject
    from py4j.java_collections import JavaArray

T = TypeVar("T")
T_co = TypeVar("T_co", covariant=True)
U = TypeVar("U")
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")
V1 = TypeVar("V1")
V2 = TypeVar("V2")
V3 = TypeVar("V3")

__all__ = ["RDD"]


class PythonEvalType:
    """
    Evaluation type of python rdd.

    These values are internal to PySpark.

    These values should match values in org.apache.spark.api.python.PythonEvalType.
    """

    # Plain (non-UDF) evaluation.
    NON_UDF: "NonUDFType" = 0

    # Row-at-a-time SQL UDFs (1xx).
    SQL_BATCHED_UDF: "SQLBatchedUDFType" = 100
    SQL_ARROW_BATCHED_UDF: "SQLArrowBatchedUDFType" = 101

    # Pandas / Arrow based SQL UDFs (2xx).
    SQL_SCALAR_PANDAS_UDF: "PandasScalarUDFType" = 200
    SQL_GROUPED_MAP_PANDAS_UDF: "PandasGroupedMapUDFType" = 201
    SQL_GROUPED_AGG_PANDAS_UDF: "PandasGroupedAggUDFType" = 202
    SQL_WINDOW_AGG_PANDAS_UDF: "PandasWindowAggUDFType" = 203
    SQL_SCALAR_PANDAS_ITER_UDF: "PandasScalarIterUDFType" = 204
    SQL_MAP_PANDAS_ITER_UDF: "PandasMapIterUDFType" = 205
    SQL_COGROUPED_MAP_PANDAS_UDF: "PandasCogroupedMapUDFType" = 206
    SQL_MAP_ARROW_ITER_UDF: "ArrowMapIterUDFType" = 207
    SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE: "PandasGroupedMapUDFWithStateType" = 208

    # Table-valued SQL UDFs (3xx).
    SQL_TABLE_UDF: "SQLTableUDFType" = 300
    SQL_ARROW_TABLE_UDF: "SQLArrowTableUDFType" = 301


def portable_hash(x: Hashable) -> int:
    """
    This function returns consistent hash code for builtin types, especially
    for None and tuple with None.

    The algorithm is similar to that one used by CPython 2.7

    Raises :class:`PySparkRuntimeError` when ``PYTHONHASHSEED`` is absent from
    the process environment.

    Examples
    --------
    >>> portable_hash(None)
    0
    >>> portable_hash((None, 1)) & 0xffffffff
    219750521
    """
    if "PYTHONHASHSEED" not in os.environ:
        raise PySparkRuntimeError(
            error_class="PYTHON_HASH_SEED_NOT_SET",
            message_parameters={},
        )

    if x is None:
        return 0
    if not isinstance(x, tuple):
        return hash(x)

    # Tuples are folded element by element, recursing so that nested
    # None / tuple members also go through this function.
    acc = 0x345678
    for element in x:
        acc ^= portable_hash(element)
        acc *= 1000003
        acc &= sys.maxsize
    acc ^= len(x)
    if acc == -1:
        acc = -2
    return int(acc)


class BoundedFloat(float):
    """
    Bounded value is generated by approximate job, with confidence and low
    bound and high bound.

    Examples
    --------
    >>> BoundedFloat(100.0, 0.95, 95.0, 105.0)
    100.0
    """

    confidence: float
    low: float
    high: float

    def __new__(cls, mean: float, confidence: float, low: float, high: float) -> "BoundedFloat":
        # The float value itself is the mean; the bounds ride along as attributes.
        bounded = float.__new__(cls, mean)
        bounded.confidence = confidence
        bounded.low = low
        bounded.high = high
        return bounded


def _create_local_socket(sock_info: "JavaArray") -> "io.BufferedRWPair":
    """
    Create a local socket that can be used to load deserialized data from the JVM

    Parameters
    ----------
    sock_info : tuple
        Tuple containing port number and authentication secret for a local socket.

    Returns
    -------
    sockfile file descriptor of the local socket
    """
    sockfile: "io.BufferedRWPair"
    sock: "socket.socket"
    port: int = sock_info[0]
    auth_secret: str = sock_info[1]

    sockfile, sock = local_connect_and_auth(port, auth_secret)
    # The RDD materialization time is unpredictable, if we set a timeout for socket reading
    # operation, it will very possibly fail. See SPARK-18281.
    sock.settimeout(None)
    return sockfile


def _load_from_socket(sock_info: "JavaArray", serializer: Serializer) -> Iterator[Any]:
    """
    Connect to a local socket described by sock_info and use the given serializer to yield data

    Parameters
    ----------
    sock_info : tuple
        Tuple containing port number and authentication secret for a local socket.
    serializer : class:`Serializer`
        The PySpark serializer to use

    Returns
    -------
    result of meth:`Serializer.load_stream`,
    usually a generator that yields deserialized data
    """
    # The socket will be automatically closed when garbage-collected.
    return serializer.load_stream(_create_local_socket(sock_info))


def _local_iterator_from_socket(sock_info: "JavaArray", serializer: Serializer) -> Iterator[Any]:
    """
    Return an iterator that pulls data from the JVM over a local socket,
    requesting one partition at a time.
    """

    class PyLocalIterable:
        """Create a synchronous local iterable over a socket"""

        def __init__(self, _sock_info: "JavaArray", _serializer: Serializer):
            port: int
            auth_secret: str
            jsocket_auth_server: "JavaObject"
            port, auth_secret, self.jsocket_auth_server = _sock_info
            self._sockfile = _create_local_socket((port, auth_secret))
            self._serializer = _serializer
            self._read_iter: Iterator[Any] = iter([])  # Initialize as empty iterator
            self._read_status = 1

        def __iter__(self) -> Iterator[Any]:
            while self._read_status == 1:
                # Request next partition data from Java
                write_int(1, self._sockfile)
                self._sockfile.flush()

                # If response is 1 then there is a partition to read, if 0 then fully consumed
                self._read_status = read_int(self._sockfile)
                if self._read_status == 1:
                    # Load the partition data as a stream and read each item.
                    # An explicit loop is used rather than ``yield from`` so that
                    # closing this generator does not close ``_read_iter``, which
                    # ``__del__`` still needs to drain.
                    self._read_iter = self._serializer.load_stream(self._sockfile)
                    for item in self._read_iter:
                        yield item

                # An error occurred, join serving thread and raise any exceptions from the JVM
                elif self._read_status == -1:
                    self.jsocket_auth_server.getResult()

        def __del__(self) -> None:
            # Nothing to clean up once the stream has been fully consumed.
            if self._read_status != 1:
                return
            try:
                # Finish consuming partition data stream
                for _ in self._read_iter:
                    pass
                # Tell Java to stop sending data and close connection
                write_int(0, self._sockfile)
                self._sockfile.flush()
            except Exception:
                # Ignore any errors, socket is automatically closed when garbage-collected
                pass

    return iter(PyLocalIterable(sock_info, serializer))


class Partitioner:
    """
    Pairs a partition count with a key-to-integer function; calling the
    instance maps a key to ``partitionFunc(key) % numPartitions``.
    """

    def __init__(self, numPartitions: int, partitionFunc: Callable[[Any], int]):
        self.numPartitions = numPartitions
        self.partitionFunc = partitionFunc

    def __eq__(self, other: Any) -> bool:
        # Equal only to another Partitioner with the same count and function.
        if not isinstance(other, Partitioner):
            return False
        return (
            self.numPartitions == other.numPartitions
            and self.partitionFunc == other.partitionFunc
        )

    def __call__(self, k: Any) -> int:
        return self.partitionFunc(k) % self.numPartitions
class RDD(Generic[T_co]):
    """
    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
    Represents an immutable, partitioned collection of elements that can be
    operated on in parallel.
    """

    def __init__(
        self,
        jrdd: "JavaObject",
        ctx: "SparkContext",
        jrdd_deserializer: Serializer = AutoBatchedSerializer(CPickleSerializer()),
    ):
        # Handles to the JVM-side RDD and the owning context.
        self._jrdd = jrdd
        self.ctx = ctx
        self._jrdd_deserializer = jrdd_deserializer

        # Python-side bookkeeping flags; all start out unset.
        self.is_cached = False
        self.is_checkpointed = False
        self.has_resource_profile = False

        # The id is read from the Java RDD once and kept locally.
        self._id = jrdd.id()
        self.partitioner: Optional[Partitioner] = None

    def _pickled(self: "RDD[T]") -> "RDD[T]":
        """Return this RDD reserialized with the auto-batched pickle serializer."""
        pickle_serializer = AutoBatchedSerializer(CPickleSerializer())
        return self._reserialize(pickle_serializer)
def id(self) -> int:
    """
    A unique ID for this RDD (within its SparkContext).

    .. versionadded:: 0.7.0

    Returns
    -------
    int
        The unique ID for this :class:`RDD`

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.id()  # doctest: +SKIP
    3
    """
    # Served from the cached attribute; no JVM call is made here.
    rdd_id: int = self._id
    return rdd_id
def __repr__(self) -> str:
    """Return the ``toString()`` of the underlying Java RDD."""
    description: str = self._jrdd.toString()
    return description


def __getnewargs__(self) -> NoReturn:
    """Always raise, so that an RDD can never be pickled."""
    # This method is called when attempting to pickle an RDD, which is always an error:
    raise PySparkRuntimeError(
        error_class="RDD_TRANSFORM_ONLY_VALID_ON_DRIVER",
        message_parameters={},
    )


@property
def context(self) -> "SparkContext":
    """
    The :class:`SparkContext` that this RDD was created on.

    .. versionadded:: 0.7.0

    Returns
    -------
    :class:`SparkContext`
        The :class:`SparkContext` that this RDD was created on

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.context
    <SparkContext ...>
    >>> rdd.context is sc
    True
    """
    owning_context = self.ctx
    return owning_context
def cache(self: "RDD[T]") -> "RDD[T]":
    """
    Persist this RDD with the default storage level (`MEMORY_ONLY`).

    .. versionadded:: 0.7.0

    Returns
    -------
    :class:`RDD`
        The same :class:`RDD` with storage level set to `MEMORY_ONLY`

    See Also
    --------
    :meth:`RDD.persist`
    :meth:`RDD.unpersist`
    :meth:`RDD.getStorageLevel`

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd2 = rdd.cache()
    >>> rdd2 is rdd
    True
    >>> str(rdd.getStorageLevel())
    'Memory Serialized 1x Replicated'
    >>> _ = rdd.unpersist()
    """
    # The flag is raised before delegating, so it is set even if persist() fails.
    self.is_cached = True
    default_level = StorageLevel.MEMORY_ONLY
    self.persist(default_level)
    return self
def persist(
    self: "RDD[T]", storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY
) -> "RDD[T]":
    """
    Set this RDD's storage level to persist its values across operations
    after the first time it is computed. This can only be used to assign
    a new storage level if the RDD does not have a storage level set yet.
    If no storage level is specified defaults to (`MEMORY_ONLY`).

    .. versionadded:: 0.9.1

    Parameters
    ----------
    storageLevel : :class:`StorageLevel`, default `MEMORY_ONLY`
        the target storage level

    Returns
    -------
    :class:`RDD`
        The same :class:`RDD` with storage level set to `storageLevel`.

    See Also
    --------
    :meth:`RDD.cache`
    :meth:`RDD.unpersist`
    :meth:`RDD.getStorageLevel`

    Examples
    --------
    >>> rdd = sc.parallelize(["b", "a", "c"])
    >>> rdd.persist().is_cached
    True
    >>> str(rdd.getStorageLevel())
    'Memory Serialized 1x Replicated'
    >>> _ = rdd.unpersist()
    >>> rdd.is_cached
    False

    >>> from pyspark import StorageLevel
    >>> rdd2 = sc.range(5)
    >>> _ = rdd2.persist(StorageLevel.MEMORY_AND_DISK)
    >>> rdd2.is_cached
    True
    >>> str(rdd2.getStorageLevel())
    'Disk Memory Serialized 1x Replicated'

    Can not override existing storage level

    >>> _ = rdd2.persist(StorageLevel.MEMORY_ONLY_2)
    Traceback (most recent call last):
        ...
    py4j.protocol.Py4JJavaError: ...

    Assign another storage level after `unpersist`

    >>> _ = rdd2.unpersist()
    >>> rdd2.is_cached
    False
    >>> _ = rdd2.persist(StorageLevel.MEMORY_ONLY_2)
    >>> str(rdd2.getStorageLevel())
    'Memory Serialized 2x Replicated'
    >>> rdd2.is_cached
    True
    >>> _ = rdd2.unpersist()
    """
    # The Python-side flag is set first, before any JVM call is attempted.
    self.is_cached = True
    # Translate the Python storage level to its Java counterpart and apply it.
    self._jrdd.persist(self.ctx._getJavaStorageLevel(storageLevel))
    return self
def unpersist(self: "RDD[T]", blocking: bool = False) -> "RDD[T]":
    """
    Mark the RDD as non-persistent, and remove all blocks for it from
    memory and disk.

    .. versionadded:: 0.9.1

    Parameters
    ----------
    blocking : bool, optional, default False
        whether to block until all blocks are deleted

        .. versionadded:: 3.0.0

    Returns
    -------
    :class:`RDD`
        The same :class:`RDD`

    See Also
    --------
    :meth:`RDD.cache`
    :meth:`RDD.persist`
    :meth:`RDD.getStorageLevel`

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.is_cached
    False
    >>> _ = rdd.unpersist()
    >>> rdd.is_cached
    False
    >>> _ = rdd.cache()
    >>> rdd.is_cached
    True
    >>> _ = rdd.unpersist()
    >>> rdd.is_cached
    False
    >>> _ = rdd.unpersist()
    """
    # Clear the Python-side flag, then ask the Java RDD to drop its blocks.
    self.is_cached = False
    java_rdd = self._jrdd
    java_rdd.unpersist(blocking)
    return self
def checkpoint(self) -> None:
    """
    Mark this RDD for checkpointing. It will be saved to a file inside the
    checkpoint directory set with :meth:`SparkContext.setCheckpointDir` and
    all references to its parent RDDs will be removed. This function must
    be called before any job has been executed on this RDD. It is strongly
    recommended that this RDD is persisted in memory, otherwise saving it
    on a file will require recomputation.

    .. versionadded:: 0.7.0

    See Also
    --------
    :meth:`RDD.isCheckpointed`
    :meth:`RDD.getCheckpointFile`
    :meth:`RDD.localCheckpoint`
    :meth:`SparkContext.setCheckpointDir`
    :meth:`SparkContext.getCheckpointDir`

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.is_checkpointed
    False
    >>> rdd.getCheckpointFile() == None
    True

    >>> rdd.checkpoint()
    >>> rdd.is_checkpointed
    True
    >>> rdd.getCheckpointFile() == None
    True

    >>> rdd.count()
    5
    >>> rdd.is_checkpointed
    True
    >>> rdd.getCheckpointFile() == None
    False
    """
    # The Python-side flag is set before the JVM is contacted.
    self.is_checkpointed = True
    scala_rdd = self._jrdd.rdd()
    scala_rdd.checkpoint()
def isCheckpointed(self) -> bool:
    """
    Return whether this RDD is checkpointed and materialized, either reliably or locally.

    .. versionadded:: 0.7.0

    Returns
    -------
    bool
        whether this :class:`RDD` is checkpointed and materialized, either reliably or locally

    See Also
    --------
    :meth:`RDD.checkpoint`
    :meth:`RDD.getCheckpointFile`
    :meth:`SparkContext.setCheckpointDir`
    :meth:`SparkContext.getCheckpointDir`
    """
    # The answer comes from the RDD behind the Java wrapper, not from
    # the Python-side ``is_checkpointed`` flag.
    scala_rdd = self._jrdd.rdd()
    return scala_rdd.isCheckpointed()
def localCheckpoint(self) -> None:
    """
    Mark this RDD for local checkpointing using Spark's existing caching layer.

    This method is for users who wish to truncate RDD lineages while skipping the expensive
    step of replicating the materialized data in a reliable distributed file system. This is
    useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX).

    Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed
    data is written to ephemeral local storage in the executors instead of to a reliable,
    fault-tolerant storage. The effect is that if an executor fails during the computation,
    the checkpointed data may no longer be accessible, causing an irrecoverable job failure.

    This is NOT safe to use with dynamic allocation, which removes executors along
    with their cached blocks. If you must use both features, you are advised to set
    `spark.dynamicAllocation.cachedExecutorIdleTimeout` to a high value.

    The checkpoint directory set through :meth:`SparkContext.setCheckpointDir` is not used.

    .. versionadded:: 2.2.0

    See Also
    --------
    :meth:`RDD.checkpoint`
    :meth:`RDD.isLocallyCheckpointed`

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.isLocallyCheckpointed()
    False

    >>> rdd.localCheckpoint()
    >>> rdd.isLocallyCheckpointed()
    True
    """
    # Delegates entirely to the RDD behind the Java wrapper.
    scala_rdd = self._jrdd.rdd()
    scala_rdd.localCheckpoint()
def isLocallyCheckpointed(self) -> bool:
    """
    Return whether this RDD is marked for local checkpointing.

    Exposed for testing.

    .. versionadded:: 2.2.0

    Returns
    -------
    bool
        whether this :class:`RDD` is marked for local checkpointing

    See Also
    --------
    :meth:`RDD.localCheckpoint`
    """
    scala_rdd = self._jrdd.rdd()
    return scala_rdd.isLocallyCheckpointed()
def getCheckpointFile(self) -> Optional[str]:
    """
    Gets the name of the file to which this RDD was checkpointed

    Not defined if RDD is checkpointed locally.

    .. versionadded:: 0.7.0

    Returns
    -------
    str
        the name of the file to which this :class:`RDD` was checkpointed,
        or None when no checkpoint file is defined

    See Also
    --------
    :meth:`RDD.checkpoint`
    :meth:`SparkContext.setCheckpointDir`
    :meth:`SparkContext.getCheckpointDir`
    """
    # The JVM hands back an option-like object; unwrap it into str-or-None.
    maybe_file = self._jrdd.rdd().getCheckpointFile()
    if maybe_file.isDefined():
        return maybe_file.get()
    return None
def cleanShuffleDependencies(self, blocking: bool = False) -> None:
    """
    Removes an RDD's shuffles and its non-persisted ancestors.

    When running without a shuffle service, cleaning up shuffle files enables downscaling.
    If you use the RDD after this call, you should checkpoint and materialize it first.

    .. versionadded:: 3.3.0

    Parameters
    ----------
    blocking : bool, optional, default False
       whether to block on shuffle cleanup tasks

    Notes
    -----
    This API is a developer API.
    """
    scala_rdd = self._jrdd.rdd()
    scala_rdd.cleanShuffleDependencies(blocking)
def map(self: "RDD[T]", f: Callable[[T], U], preservesPartitioning: bool = False) -> "RDD[U]":
    """
    Return a new RDD by applying a function to each element of this RDD.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to run on each element of the RDD
    preservesPartitioning : bool, optional, default False
        indicates whether the input function preserves the partitioner,
        which should be False unless this is a pair RDD and the input
        function does not modify the keys

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` by applying a function to all elements

    See Also
    --------
    :meth:`RDD.flatMap`
    :meth:`RDD.mapPartitions`
    :meth:`RDD.mapPartitionsWithIndex`
    :meth:`RDD.mapPartitionsWithSplit`

    Examples
    --------
    >>> rdd = sc.parallelize(["b", "a", "c"])
    >>> sorted(rdd.map(lambda x: (x, 1)).collect())
    [('a', 1), ('b', 1), ('c', 1)]
    """

    def apply_to_partition(_: int, elements: Iterable[T]) -> Iterable[U]:
        # The partition index is ignored. ``f`` is wrapped per call via
        # ``fail_on_stopiteration`` before being mapped lazily.
        guarded = fail_on_stopiteration(f)
        return map(guarded, elements)

    return self.mapPartitionsWithIndex(apply_to_partition, preservesPartitioning)
def flatMap(
    self: "RDD[T]", f: Callable[[T], Iterable[U]], preservesPartitioning: bool = False
) -> "RDD[U]":
    """
    Return a new RDD by first applying a function to all elements of this
    RDD, and then flattening the results.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to turn a T into a sequence of U
    preservesPartitioning : bool, optional, default False
        indicates whether the input function preserves the partitioner,
        which should be False unless this is a pair RDD and the input
        function does not modify the keys

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` by applying a function to all elements

    See Also
    --------
    :meth:`RDD.map`
    :meth:`RDD.mapPartitions`
    :meth:`RDD.mapPartitionsWithIndex`
    :meth:`RDD.mapPartitionsWithSplit`

    Examples
    --------
    >>> rdd = sc.parallelize([2, 3, 4])
    >>> sorted(rdd.flatMap(lambda x: range(1, x)).collect())
    [1, 1, 1, 2, 2, 3]
    >>> sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())
    [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]
    """

    def flatten_partition(_: int, elements: Iterable[T]) -> Iterable[U]:
        # Map each element to an iterable, then lazily concatenate them.
        per_element = map(fail_on_stopiteration(f), elements)
        return chain.from_iterable(per_element)

    return self.mapPartitionsWithIndex(flatten_partition, preservesPartitioning)
def mapPartitions(
    self: "RDD[T]", f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = False
) -> "RDD[U]":
    """
    Return a new RDD by applying a function to each partition of this RDD.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to run on each partition of the RDD
    preservesPartitioning : bool, optional, default False
        indicates whether the input function preserves the partitioner,
        which should be False unless this is a pair RDD and the input
        function does not modify the keys

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` by applying a function to each partition

    See Also
    --------
    :meth:`RDD.map`
    :meth:`RDD.flatMap`
    :meth:`RDD.mapPartitionsWithIndex`
    :meth:`RDD.mapPartitionsWithSplit`
    :meth:`RDDBarrier.mapPartitions`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
    >>> def f(iterator): yield sum(iterator)
    ...
    >>> rdd.mapPartitions(f).collect()
    [3, 7]
    """

    def drop_index(_: int, partition: Iterable[T]) -> Iterable[U]:
        # Adapt ``f`` to the (index, iterator) signature by discarding the index.
        return f(partition)

    return self.mapPartitionsWithIndex(drop_index, preservesPartitioning)
def mapPartitionsWithIndex(
    self: "RDD[T]",
    f: Callable[[int, Iterable[T]], Iterable[U]],
    preservesPartitioning: bool = False,
) -> "RDD[U]":
    """
    Return a new RDD by applying a function to each partition of this RDD,
    while tracking the index of the original partition.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to run on each partition of the RDD
    preservesPartitioning : bool, optional, default False
        indicates whether the input function preserves the partitioner,
        which should be False unless this is a pair RDD and the input
        function does not modify the keys

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` by applying a function to each partition

    See Also
    --------
    :meth:`RDD.map`
    :meth:`RDD.flatMap`
    :meth:`RDD.mapPartitions`
    :meth:`RDD.mapPartitionsWithSplit`
    :meth:`RDDBarrier.mapPartitionsWithIndex`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2, 3, 4], 4)
    >>> def f(splitIndex, iterator): yield splitIndex
    ...
    >>> rdd.mapPartitionsWithIndex(f).sum()
    6
    """
    # The other map-style methods in this class all funnel into this constructor call.
    pipelined: "RDD[U]" = PipelinedRDD(self, f, preservesPartitioning)
    return pipelined
def mapPartitionsWithSplit(
    self: "RDD[T]",
    f: Callable[[int, Iterable[T]], Iterable[U]],
    preservesPartitioning: bool = False,
) -> "RDD[U]":
    """
    Return a new RDD by applying a function to each partition of this RDD,
    while tracking the index of the original partition.

    .. versionadded:: 0.7.0

    .. deprecated:: 0.9.0
        use meth:`RDD.mapPartitionsWithIndex` instead.

    Parameters
    ----------
    f : function
        a function to run on each partition of the RDD
    preservesPartitioning : bool, optional, default False
        indicates whether the input function preserves the partitioner,
        which should be False unless this is a pair RDD and the input
        function does not modify the keys

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` by applying a function to each partition

    See Also
    --------
    :meth:`RDD.map`
    :meth:`RDD.flatMap`
    :meth:`RDD.mapPartitions`
    :meth:`RDD.mapPartitionsWithIndex`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2, 3, 4], 4)
    >>> def f(splitIndex, iterator): yield splitIndex
    ...
    >>> rdd.mapPartitionsWithSplit(f).sum()
    6
    """
    # stacklevel=2 attributes the warning to the caller of this method.
    deprecation_message = (
        "mapPartitionsWithSplit is deprecated; use mapPartitionsWithIndex instead"
    )
    warnings.warn(deprecation_message, FutureWarning, stacklevel=2)
    return self.mapPartitionsWithIndex(f, preservesPartitioning)
def getNumPartitions(self) -> int:
    """
    Returns the number of partitions in RDD

    .. versionadded:: 1.1.0

    Returns
    -------
    int
        number of partitions

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
    >>> rdd.getNumPartitions()
    2
    """
    # Counted from the Java RDD's partition collection.
    java_partitions = self._jrdd.partitions()
    return java_partitions.size()
def filter(self: "RDD[T]", f: Callable[[T], bool]) -> "RDD[T]":
    """
    Return a new RDD containing only the elements that satisfy a predicate.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to run on each element of the RDD

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` by applying a function to each element

    See Also
    --------
    :meth:`RDD.map`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2, 3, 4, 5])
    >>> rdd.filter(lambda x: x % 2 == 0).collect()
    [2, 4]
    """

    def keep_matching(elements: Iterable[T]) -> Iterable[T]:
        predicate = fail_on_stopiteration(f)
        return filter(predicate, elements)

    # Passed with preservesPartitioning=True.
    return self.mapPartitions(keep_matching, True)
def distinct(self: "RDD[T]", numPartitions: Optional[int] = None) -> "RDD[T]":
    """
    Return a new RDD containing the distinct elements in this RDD.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` containing the distinct elements

    See Also
    --------
    :meth:`RDD.countApproxDistinct`

    Examples
    --------
    >>> sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())
    [1, 2, 3]
    """
    # Use each element as a key with a dummy value, collapse duplicate keys
    # with reduceByKey, then strip the dummy value again.
    keyed = self.map(lambda x: (x, None))
    deduplicated = keyed.reduceByKey(lambda x, _: x, numPartitions)
    return deduplicated.map(lambda x: x[0])
def sample(
    self: "RDD[T]", withReplacement: bool, fraction: float, seed: Optional[int] = None
) -> "RDD[T]":
    """
    Return a sampled subset of this RDD.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    withReplacement : bool
        can elements be sampled multiple times (replaced when sampled out)
    fraction : float
        expected size of the sample as a fraction of this RDD's size
        without replacement: probability that each element is chosen; fraction must be [0, 1]
        with replacement: expected number of times each element is chosen; fraction must be >= 0
    seed : int, optional
        seed for the random number generator

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` containing a sampled subset of elements

    Raises
    ------
    ValueError
        if `fraction` is not greater than or equal to zero

    See Also
    --------
    :meth:`RDD.takeSample`
    :meth:`RDD.sampleByKey`
    :meth:`pyspark.sql.DataFrame.sample`

    Notes
    -----
    This is not guaranteed to provide exactly the fraction specified of the total
    count of the given :class:`RDD`.

    Examples
    --------
    >>> rdd = sc.parallelize(range(100), 4)
    >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14
    True
    """
    # Written as ``not (… >= 0)`` rather than ``< 0`` so that NaN is rejected too.
    if not (fraction >= 0):
        raise ValueError("Fraction must be nonnegative.")
    sampler = RDDSampler(withReplacement, fraction, seed)
    return self.mapPartitionsWithIndex(sampler.func, True)
def randomSplit(
    self: "RDD[T]", weights: Sequence[Union[int, float]], seed: Optional[int] = None
) -> "List[RDD[T]]":
    """
    Randomly splits this RDD with the provided weights.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    weights : list
        weights for splits, will be normalized if they don't sum to 1
    seed : int, optional
        random seed

    Returns
    -------
    list
        split :class:`RDD`\\s in a list

    Raises
    ------
    ValueError
        if any weight is negative, or if the weights do not sum to a positive value

    See Also
    --------
    :meth:`pyspark.sql.DataFrame.randomSplit`

    Examples
    --------
    >>> rdd = sc.parallelize(range(500), 1)
    >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 17)
    >>> len(rdd1.collect() + rdd2.collect())
    500
    >>> 150 < rdd1.count() < 250
    True
    >>> 250 < rdd2.count() < 350
    True
    """
    if any(not (w >= 0) for w in weights):
        raise ValueError("Weights must be nonnegative")
    s = float(sum(weights))
    if not (s > 0):
        raise ValueError("Sum of weights must be positive")

    # Cumulative normalized weights: consecutive pairs are the [lower, upper)
    # bounds handed to each split's range sampler.
    running = 0.0
    bounds = [running]
    for weight in weights:
        running = running + weight / s
        bounds.append(running)

    # Every split must share one seed, so pick it once up front.
    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    splits = []
    for lower, upper in zip(bounds, bounds[1:]):
        sampler = RDDRangeSampler(lower, upper, seed)
        splits.append(self.mapPartitionsWithIndex(sampler.func, True))
    return splits
# this is ported from scala/spark/RDD.scala
def takeSample(
    self: "RDD[T]", withReplacement: bool, num: int, seed: Optional[int] = None
) -> List[T]:
    """
    Return a fixed-size sampled subset of this RDD.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    withReplacement : bool
        whether sampling is done with replacement
    num : int
        size of the returned sample
    seed : int, optional
        random seed

    Returns
    -------
    list
        a fixed-size sampled subset of this :class:`RDD` in an array

    Raises
    ------
    ValueError
        if `num` is negative or exceeds the maximum supported sample size

    See Also
    --------
    :meth:`RDD.sample`

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> import sys
    >>> rdd = sc.parallelize(range(0, 10))
    >>> len(rdd.takeSample(True, 20, 1))
    20
    >>> len(rdd.takeSample(False, 5, 2))
    5
    >>> len(rdd.takeSample(False, 15, 3))
    10
    >>> sc.range(0, 10).takeSample(False, sys.maxsize)
    Traceback (most recent call last):
        ...
    ValueError: Sample size cannot be greater than ...
    """
    numStDev = 10.0
    maxSampleSize = sys.maxsize - int(numStDev * sqrt(sys.maxsize))

    # Argument validation happens before any job is launched.
    if num < 0:
        raise ValueError("Sample size cannot be negative.")
    if num > maxSampleSize:
        raise ValueError("Sample size cannot be greater than %d." % maxSampleSize)

    if num == 0 or self.getNumPartitions() == 0:
        return []

    initialCount = self.count()
    if initialCount == 0:
        return []

    rand = random.Random(seed)

    wants_everything = (not withReplacement) and num >= initialCount
    if wants_everything:
        # shuffle current RDD and return
        everything = self.collect()
        rand.shuffle(everything)
        return everything

    fraction = RDD._computeFractionForSampleSize(num, initialCount, withReplacement)
    samples = self.sample(withReplacement, fraction, seed).collect()

    # If the first sample didn't turn out large enough, keep trying to take samples;
    # this shouldn't happen often because we use a big multiplier for their initial size.
    # See: scala/spark/RDD.scala
    while len(samples) < num:
        # TODO: add log warning for when more than one iteration was run
        seed = rand.randint(0, sys.maxsize)
        samples = self.sample(withReplacement, fraction, seed).collect()

    rand.shuffle(samples)
    return samples[:num]
@staticmethoddef_computeFractionForSampleSize(sampleSizeLowerBound:int,total:int,withReplacement:bool)->float:""" Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time. How the sampling rate is determined: Let p = num / total, where num is the sample size and total is the total number of data points in the RDD. We're trying to compute q > p such that - when sampling with replacement, we're drawing each data point with prob_i ~ Pois(q), where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total), i.e. the failure rate of not having a sufficiently large sample < 0.0001. Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for num > 12, but we need a slightly larger q (9 empirically determined). - when sampling without replacement, we're drawing each data point with prob_i ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success rate, where success rate is defined the same as in sampling with replacement. """fraction=float(sampleSizeLowerBound)/totalifwithReplacement:numStDev=5ifsampleSizeLowerBound<12:numStDev=9returnfraction+numStDev*sqrt(fraction/total)else:delta=0.00005gamma=-log(delta)/totalreturnmin(1,fraction+gamma+sqrt(gamma*gamma+2*gamma*fraction))
def union(self: "RDD[T]", other: "RDD[U]") -> "RDD[Union[T, U]]":
    """
    Return the union of this RDD and another one.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`

    Returns
    -------
    :class:`RDD`
        the union of this :class:`RDD` and another one

    See Also
    --------
    :meth:`SparkContext.union`
    :meth:`pyspark.sql.DataFrame.union`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 1, 2, 3])
    >>> rdd.union(rdd).collect()
    [1, 1, 2, 3, 1, 1, 2, 3]
    """
    same_format = self._jrdd_deserializer == other._jrdd_deserializer
    if same_format:
        merged: "RDD[Union[T, U]]" = RDD(
            self._jrdd.union(other._jrdd), self.ctx, self._jrdd_deserializer
        )
    else:
        # These RDDs contain data in different serialized formats, so we
        # must normalize them to the default serializer.
        left = self._reserialize()
        right = other._reserialize()
        merged = RDD(left._jrdd.union(right._jrdd), self.ctx, self.ctx.serializer)

    # Keep the partitioner only when both inputs share it and the union did
    # not change the number of partitions.
    if (
        self.partitioner == other.partitioner
        and self.getNumPartitions() == merged.getNumPartitions()
    ):
        merged.partitioner = self.partitioner
    return merged
def intersection(self: "RDD[T]", other: "RDD[T]") -> "RDD[T]":
    """
    Return the intersection of this RDD and another one. The output
    contains no duplicate elements, even if the input RDDs did.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`

    Returns
    -------
    :class:`RDD`
        the intersection of this :class:`RDD` and another one

    See Also
    --------
    :meth:`pyspark.sql.DataFrame.intersect`

    Notes
    -----
    This method performs a shuffle internally.

    Examples
    --------
    >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
    >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
    >>> rdd1.intersection(rdd2).collect()
    [1, 2, 3]
    """
    # Turn every element into a key with a dummy value so cogroup can line
    # the two sides up per element.
    left = self.map(lambda v: (v, None))
    right = other.map(lambda v: (v, None))
    grouped = left.cogroup(right)
    # Keep a key only when both of its value groups are non-empty.
    in_both = grouped.filter(lambda k_vs: all(k_vs[1]))
    return in_both.keys()
def _reserialize(self: "RDD[T]", serializer: Optional[Serializer] = None) -> "RDD[T]":
    """
    Return an RDD whose deserializer is `serializer`.

    When no serializer is given, the context's default serializer is used.
    If this RDD already uses the target serializer it is returned unchanged;
    otherwise an identity ``map`` (preserving partitioning) is applied and the
    resulting RDD is tagged with the target serializer.
    """
    target = serializer or self.ctx.serializer
    if self._jrdd_deserializer != target:
        converted = self.map(lambda x: x, preservesPartitioning=True)
        converted._jrdd_deserializer = target
        return converted
    return self

def __add__(self: "RDD[T]", other: "RDD[U]") -> "RDD[Union[T, U]]":
    """
    Return the union of this RDD and another one.

    Raises
    ------
    TypeError
        if `other` is not an :class:`RDD`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 1, 2, 3])
    >>> (rdd + rdd).collect()
    [1, 1, 2, 3, 1, 1, 2, 3]
    """
    if isinstance(other, RDD):
        return self.union(other)
    raise TypeError

# Typing overloads: sorting by the key itself requires a sortable key type,
# while supplying `keyfunc` (positionally or by keyword) lifts that requirement.
@overload
def repartitionAndSortWithinPartitions(
    self: "RDD[Tuple[S, V]]",
    numPartitions: Optional[int] = ...,
    partitionFunc: Callable[["S"], int] = ...,
    ascending: bool = ...,
) -> "RDD[Tuple[S, V]]":
    ...

@overload
def repartitionAndSortWithinPartitions(
    self: "RDD[Tuple[K, V]]",
    numPartitions: Optional[int],
    partitionFunc: Callable[[K], int],
    ascending: bool,
    keyfunc: Callable[[K], "S"],
) -> "RDD[Tuple[K, V]]":
    ...

@overload
def repartitionAndSortWithinPartitions(
    self: "RDD[Tuple[K, V]]",
    numPartitions: Optional[int] = ...,
    partitionFunc: Callable[[K], int] = ...,
    ascending: bool = ...,
    *,
    keyfunc: Callable[[K], "S"],
) -> "RDD[Tuple[K, V]]":
    ...
def repartitionAndSortWithinPartitions(
    self: "RDD[Tuple[Any, Any]]",
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[Any], int] = portable_hash,
    ascending: bool = True,
    keyfunc: Callable[[Any], Any] = lambda x: x,
) -> "RDD[Tuple[Any, Any]]":
    """
    Repartition the RDD with the given partitioner and sort the records of
    each resulting partition by key.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        a function to compute the partition index
    ascending : bool, optional, default True
        sort the keys in ascending or descending order
    keyfunc : function, optional, default identity mapping
        a function to compute the key

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD`

    See Also
    --------
    :meth:`RDD.repartition`
    :meth:`RDD.partitionBy`
    :meth:`RDD.sortBy`
    :meth:`RDD.sortByKey`

    Examples
    --------
    >>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])
    >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, True)
    >>> rdd2.glom().collect()
    [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]
    """
    if numPartitions is None:
        numPartitions = self._defaultReducePartitions()

    mem_limit = self._memory_limit()
    ser = self._jrdd_deserializer

    def sortPartition(iterator: Iterable[Tuple[K, V]]) -> Iterable[Tuple[K, V]]:
        # The sorter is given 90% of the memory limit.
        sorter = ExternalSorter(mem_limit * 0.9, ser)
        ordered = sorter.sorted(
            iterator, key=lambda k_v: keyfunc(k_v[0]), reverse=(not ascending)
        )
        return iter(ordered)

    repartitioned = self.partitionBy(numPartitions, partitionFunc)
    return repartitioned.mapPartitions(sortPartition, True)
def sortByKey(
    self: "RDD[Tuple[K, V]]",
    ascending: Optional[bool] = True,
    numPartitions: Optional[int] = None,
    keyfunc: Callable[[Any], Any] = lambda x: x,
) -> "RDD[Tuple[K, V]]":
    """
    Sorts this RDD, which is assumed to consist of (key, value) pairs.

    .. versionadded:: 0.9.1

    Parameters
    ----------
    ascending : bool, optional, default True
        sort the keys in ascending or descending order
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    keyfunc : function, optional, default identity mapping
        a function to compute the key

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD`

    See Also
    --------
    :meth:`RDD.sortBy`
    :meth:`pyspark.sql.DataFrame.sort`

    Examples
    --------
    >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
    >>> sc.parallelize(tmp).sortByKey().first()
    ('1', 3)
    >>> sc.parallelize(tmp).sortByKey(True, 1).collect()
    [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
    >>> sc.parallelize(tmp).sortByKey(True, 2).collect()
    [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
    >>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]
    >>> tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])
    >>> sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()
    [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5),...('white', 9), ('whose', 6)]
    """
    if numPartitions is None:
        numPartitions = self._defaultReducePartitions()

    memory = self._memory_limit()
    serializer = self._jrdd_deserializer

    def sortPartition(iterator: Iterable[Tuple[K, V]]) -> Iterable[Tuple[K, V]]:
        sort = ExternalSorter(memory * 0.9, serializer).sorted
        return iter(sort(iterator, key=lambda kv: keyfunc(kv[0]), reverse=(not ascending)))

    if numPartitions == 1:
        # A single output partition needs no range partitioning: bring the
        # data into one partition and sort it there.
        if self.getNumPartitions() > 1:
            self = self.coalesce(1)
        return self.mapPartitions(sortPartition, True)

    # first compute the boundary of each part via sampling: we want to partition
    # the key-space into bins such that the bins have roughly the same
    # number of (key, value) pairs falling into them
    rddSize = self.count()
    if not rddSize:
        return self  # empty RDD
    maxSampleSize = numPartitions * 20.0  # constant from Spark's RangePartitioner
    fraction = min(maxSampleSize / max(rddSize, 1), 1.0)
    samples = self.sample(False, fraction, 1).map(lambda kv: kv[0]).collect()
    samples = sorted(samples, key=keyfunc)

    # we have numPartitions many parts but one of the them has
    # an implicit boundary
    #
    # The bounds are stored as keyfunc(sample), not as the raw sample: they
    # are searched with keyfunc(k) below, and bisect needs a list that is
    # sorted in the same domain as the probe value. Raw samples are only
    # sorted *by* keyfunc, so for a non-identity keyfunc (e.g. negation)
    # they would be unsorted from bisect's point of view.
    bounds = [
        keyfunc(samples[int(len(samples) * (i + 1) / numPartitions)])
        for i in range(0, numPartitions - 1)
    ]

    def rangePartitioner(k: K) -> int:
        p = bisect.bisect_left(bounds, keyfunc(k))
        if ascending:
            return p
        else:
            # Descending order mirrors the partition index.
            return numPartitions - 1 - p  # type: ignore[operator]

    return self.partitionBy(numPartitions, rangePartitioner).mapPartitions(sortPartition, True)
def sortBy(
    self: "RDD[T]",
    keyfunc: Callable[[T], "S"],
    ascending: bool = True,
    numPartitions: Optional[int] = None,
) -> "RDD[T]":
    """
    Sort this RDD by the key that `keyfunc` computes for each element.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    keyfunc : function
        a function to compute the key
    ascending : bool, optional, default True
        sort the keys in ascending or descending order
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD`

    See Also
    --------
    :meth:`RDD.sortByKey`
    :meth:`pyspark.sql.DataFrame.sort`

    Examples
    --------
    >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
    >>> sc.parallelize(tmp).sortBy(lambda x: x[0]).collect()
    [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
    >>> sc.parallelize(tmp).sortBy(lambda x: x[1]).collect()
    [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
    """
    # Key each element, sort the pairs by key, then drop the keys again.
    keyed = self.keyBy(keyfunc)  # type: ignore[type-var]
    ordered = keyed.sortByKey(ascending, numPartitions)
    return ordered.values()
def glom(self: "RDD[T]") -> "RDD[List[T]]":
    """
    Return an RDD in which each partition's elements are gathered into
    a single list.

    .. versionadded:: 0.7.0

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` coalescing all elements within each partition into a list

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
    >>> sorted(rdd.glom().collect())
    [[1, 2], [3, 4]]
    """

    def gather(partition: Iterable[T]) -> Iterable[List[T]]:
        # One output element per partition: the list of everything in it.
        yield list(partition)

    return self.mapPartitions(gather)
def cartesian(self: "RDD[T]", other: "RDD[U]") -> "RDD[Tuple[T, U]]":
    """
    Return the Cartesian product of this RDD and another one, that is, the
    RDD of all pairs of elements ``(a, b)`` where ``a`` is in `self` and
    ``b`` is in `other`.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`

    Returns
    -------
    :class:`RDD`
        the Cartesian product of this :class:`RDD` and another one

    See Also
    --------
    :meth:`pyspark.sql.DataFrame.crossJoin`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2])
    >>> sorted(rdd.cartesian(rdd).collect())
    [(1, 1), (1, 2), (2, 1), (2, 2)]
    """
    # Due to batching, we can't use the Java cartesian method.
    pair_deserializer = CartesianDeserializer(
        self._jrdd_deserializer, other._jrdd_deserializer
    )
    product = self._jrdd.cartesian(other._jrdd)
    return RDD(product, self.ctx, pair_deserializer)
def groupBy(
    self: "RDD[T]",
    f: Callable[[T], K],
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, Iterable[T]]]":
    """
    Return an RDD of items grouped by the key that `f` computes for them.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to compute the key
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        a function to compute the partition index

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` of grouped items

    See Also
    --------
    :meth:`RDD.groupByKey`
    :meth:`pyspark.sql.DataFrame.groupBy`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
    >>> result = rdd.groupBy(lambda x: x % 2).collect()
    >>> sorted([(x, sorted(y)) for (x, y) in result])
    [(0, [2, 8]), (1, [1, 1, 3, 5])]
    """
    # Pair each element with its key and delegate the grouping to groupByKey.
    keyed = self.map(lambda x: (f(x), x))
    return keyed.groupByKey(numPartitions, partitionFunc)
def pipe(
    self, command: str, env: Optional[Dict[str, str]] = None, checkCode: bool = False
) -> "RDD[str]":
    """
    Return an RDD created by piping elements to a forked external process.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    command : str
        command to run.
    env : dict, optional
        environment variables to set.
    checkCode : bool, optional
        whether to check the return value of the shell command.

    Returns
    -------
    :class:`RDD`
        a new :class:`RDD` of strings

    Examples
    --------
    >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()
    ['1', '2', '', '3']
    """
    if env is None:
        env = {}

    def func(iterator: Iterable[T]) -> Iterable[str]:
        proc = Popen(shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)

        def feed_stdin(out: IO[bytes]) -> None:
            # One element per line, UTF-8 encoded, with exactly one trailing newline.
            for obj in iterator:
                line = str(obj).rstrip("\n") + "\n"
                out.write(line.encode("utf-8"))
            out.close()

        # Write the input from a separate thread while this one reads stdout.
        Thread(target=feed_stdin, args=(proc.stdin,)).start()

        def check_return_code() -> Iterable[int]:
            # Chained after the stdout lines, so it only runs once the output
            # is exhausted. It never yields a value.
            proc.wait()
            if checkCode and proc.returncode:
                raise PySparkRuntimeError(
                    error_class="PIPE_FUNCTION_EXITED",
                    message_parameters={
                        "func_name": command,
                        "error_code": str(proc.returncode),
                    },
                )
            yield from ()

        raw_lines = iter(cast(IO[bytes], proc.stdout).readline, b"")
        everything = chain(raw_lines, check_return_code())
        return (cast(bytes, x).rstrip(b"\n").decode("utf-8") for x in everything)

    return self.mapPartitions(func)
def foreach(self: "RDD[T]", f: Callable[[T], None]) -> None:
    """
    Apply a function to every element of this RDD.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function applied to each element

    See Also
    --------
    :meth:`RDD.foreachPartition`
    :meth:`pyspark.sql.DataFrame.foreach`
    :meth:`pyspark.sql.DataFrame.foreachPartition`

    Examples
    --------
    >>> def f(x): print(x)
    ...
    >>> sc.parallelize([1, 2, 3, 4, 5]).foreach(f)
    """
    f = fail_on_stopiteration(f)

    def processPartition(iterator: Iterable[T]) -> Iterable[Any]:
        for element in iterator:
            f(element)
        # Nothing to emit; the function is called only for its side effects.
        return iter([])

    applied = self.mapPartitions(processPartition)
    applied.count()  # Force evaluation
def foreachPartition(self: "RDD[T]", f: Callable[[Iterable[T]], None]) -> None:
    """
    Apply a function to each partition of this RDD.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    f : function
        a function applied to each partition

    See Also
    --------
    :meth:`RDD.foreach`
    :meth:`pyspark.sql.DataFrame.foreach`
    :meth:`pyspark.sql.DataFrame.foreachPartition`

    Examples
    --------
    >>> def f(iterator):
    ...     for x in iterator:
    ...          print(x)
    ...
    >>> sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)
    """

    def runOnPartition(partition: Iterable[T]) -> Iterable[Any]:
        outcome = f(partition)
        # mapPartitions needs an iterable back; if `f` returned something
        # that is not iterable (e.g. None), substitute an empty iterator.
        try:
            return iter(outcome)  # type: ignore[call-overload]
        except TypeError:
            return iter([])

    applied = self.mapPartitions(runOnPartition)
    applied.count()  # Force evaluation
def collect(self: "RDD[T]") -> List[T]:
    """
    Return a list that contains all the elements in this RDD.

    .. versionadded:: 0.7.0

    Returns
    -------
    list
        a list containing all the elements

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    See Also
    --------
    :meth:`RDD.toLocalIterator`
    :meth:`pyspark.sql.DataFrame.collect`

    Examples
    --------
    >>> sc.range(5).collect()
    [0, 1, 2, 3, 4]
    >>> sc.parallelize(["x", "y", "z"]).collect()
    ['x', 'y', 'z']
    """
    with SCCallSiteSync(self.context):
        jvm = self.ctx._jvm
        assert jvm is not None
        served = jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    # Read the served results back and deserialize them with this RDD's deserializer.
    elements = _load_from_socket(served, self._jrdd_deserializer)
    return list(elements)
def collectWithJobGroup(
    self: "RDD[T]", groupId: str, description: str, interruptOnCancel: bool = False
) -> "List[T]":
    """
    Collect this RDD while assigning the job to the given job group.

    .. versionadded:: 3.0.0

    .. deprecated:: 3.1.0
        Use :class:`pyspark.InheritableThread` with the pinned thread mode enabled.

    Parameters
    ----------
    groupId : str
        The group ID to assign.
    description : str
        The description to set for the job group.
    interruptOnCancel : bool, optional, default False
        whether to interrupt jobs on job cancellation.

    Returns
    -------
    list
        a list containing all the elements

    See Also
    --------
    :meth:`RDD.collect`
    :meth:`SparkContext.setJobGroup`
    """
    warnings.warn(
        "Deprecated in 3.1, Use pyspark.InheritableThread with "
        "the pinned thread mode enabled.",
        FutureWarning,
    )

    with SCCallSiteSync(self.context):
        jvm = self.ctx._jvm
        assert jvm is not None
        served = jvm.PythonRDD.collectAndServeWithJobGroup(
            self._jrdd.rdd(), groupId, description, interruptOnCancel
        )
    elements = _load_from_socket(served, self._jrdd_deserializer)
    return list(elements)
def reduce(self: "RDD[T]", f: Callable[[T, T], T]) -> T:
    """
    Reduces the elements of this RDD using the specified commutative and
    associative binary operator. Currently reduces partitions locally.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        the reduce function

    Returns
    -------
    T
        the aggregated result

    Raises
    ------
    ValueError
        if the RDD is empty

    See Also
    --------
    :meth:`RDD.treeReduce`
    :meth:`RDD.aggregate`
    :meth:`RDD.treeAggregate`

    Examples
    --------
    >>> from operator import add
    >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add)
    15
    >>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add)
    10
    >>> sc.parallelize([]).reduce(add)
    Traceback (most recent call last):
        ...
    ValueError: Can not reduce() empty RDD
    """
    f = fail_on_stopiteration(f)

    def reducePartition(iterator: Iterable[T]) -> Iterable[T]:
        elements = iter(iterator)
        try:
            first = next(elements)
        except StopIteration:
            # An empty partition contributes nothing.
            return
        yield reduce(f, elements, first)

    partials = self.mapPartitions(reducePartition).collect()
    if not partials:
        raise ValueError("Can not reduce() empty RDD")
    return reduce(f, partials)
def treeReduce(self: "RDD[T]", f: Callable[[T, T], T], depth: int = 2) -> T:
    """
    Reduces the elements of this RDD in a multi-level tree pattern.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    f : function
        the reduce function
    depth : int, optional, default 2
        suggested depth of the tree (default: 2)

    Returns
    -------
    T
        the aggregated result

    Raises
    ------
    ValueError
        if `depth` is smaller than 1, or the RDD is empty

    See Also
    --------
    :meth:`RDD.reduce`
    :meth:`RDD.aggregate`
    :meth:`RDD.treeAggregate`

    Examples
    --------
    >>> add = lambda x, y: x + y
    >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)
    >>> rdd.treeReduce(add)
    -5
    >>> rdd.treeReduce(add, 1)
    -5
    >>> rdd.treeReduce(add, 2)
    -5
    >>> rdd.treeReduce(add, 5)
    -5
    >>> rdd.treeReduce(add, 10)
    -5
    """
    if depth < 1:
        raise ValueError("Depth cannot be smaller than 1 but got %d." % depth)

    # Every value travels as (value, is_dummy); the zero value is the only
    # dummy, which lets `combine` tell "nothing seen yet" from a real value.
    placeholder: Tuple[T, bool] = (None, True)  # type: ignore[assignment]

    def combine(left: Tuple[T, bool], right: Tuple[T, bool]) -> Tuple[T, bool]:
        if left[1]:
            return right
        if right[1]:
            return left
        return f(left[0], right[0]), False

    tagged = self.map(lambda x: (x, False))
    result = tagged.treeAggregate(placeholder, combine, combine, depth)
    if result[1]:
        raise ValueError("Cannot reduce empty RDD.")
    return result[0]
def fold(self: "RDD[T]", zeroValue: T, op: Callable[[T, T], T]) -> T:
    """
    Aggregate the elements of each partition, and then the results for all
    the partitions, using a given associative function and a neutral "zero value."

    The function ``op(t1, t2)`` is allowed to modify ``t1`` and return it
    as its result value to avoid object allocation; however, it should not
    modify ``t2``.

    This behaves somewhat differently from fold operations implemented
    for non-distributed collections in functional languages like Scala.
    This fold operation may be applied to partitions individually, and then
    fold those results into the final result, rather than apply the fold
    to each element sequentially in some defined ordering. For functions
    that are not commutative, the result may differ from that of a fold
    applied to a non-distributed collection.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    zeroValue : T
        the initial value for the accumulated result of each partition
    op : function
        a function used to both accumulate results within a partition and
        combine results from different partitions

    Returns
    -------
    T
        the aggregated result

    See Also
    --------
    :meth:`RDD.reduce`
    :meth:`RDD.aggregate`

    Examples
    --------
    >>> from operator import add
    >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)
    15
    """
    op = fail_on_stopiteration(op)

    def foldPartition(iterator: Iterable[T]) -> Iterable[T]:
        # Left fold of the partition, seeded with the zero value.
        yield reduce(op, iterator, zeroValue)

    # collecting result of mapPartitions here ensures that the copy of
    # zeroValue provided to each partition is unique from the one provided
    # to the final reduce call
    partials = self.mapPartitions(foldPartition).collect()
    return reduce(op, partials, zeroValue)
def aggregate(
    self: "RDD[T]", zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U]
) -> U:
    """
    Aggregate the elements of each partition, and then the results for all
    the partitions, using a given combine functions and a neutral "zero
    value."

    The functions ``op(t1, t2)`` is allowed to modify ``t1`` and return it
    as its result value to avoid object allocation; however, it should not
    modify ``t2``.

    The first function (seqOp) can return a different result type, U, than
    the type of this RDD. Thus, we need one operation for merging a T into
    an U and one operation for merging two U

    .. versionadded:: 1.1.0

    Parameters
    ----------
    zeroValue : U
        the initial value for the accumulated result of each partition
    seqOp : function
        a function used to accumulate results within a partition
    combOp : function
        an associative function used to combine results from different partitions

    Returns
    -------
    U
        the aggregated result

    See Also
    --------
    :meth:`RDD.reduce`
    :meth:`RDD.fold`

    Examples
    --------
    >>> seqOp = (lambda x, y: (x[0] + y, x[1] + 1))
    >>> combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
    >>> sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)
    (10, 4)
    >>> sc.parallelize([]).aggregate((0, 0), seqOp, combOp)
    (0, 0)
    """
    seqOp = fail_on_stopiteration(seqOp)
    combOp = fail_on_stopiteration(combOp)

    def aggregatePartition(iterator: Iterable[T]) -> Iterable[U]:
        # Left fold of the partition with seqOp, seeded with the zero value.
        yield reduce(seqOp, iterator, zeroValue)

    # collecting result of mapPartitions here ensures that the copy of
    # zeroValue provided to each partition is unique from the one provided
    # to the final reduce call
    partials = self.mapPartitions(aggregatePartition).collect()
    return reduce(combOp, partials, zeroValue)
def treeAggregate(
    self: "RDD[T]",
    zeroValue: U,
    seqOp: Callable[[U, T], U],
    combOp: Callable[[U, U], U],
    depth: int = 2,
) -> U:
    """
    Aggregates the elements of this RDD in a multi-level tree pattern.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    zeroValue : U
        the initial value for the accumulated result of each partition
    seqOp : function
        a function used to accumulate results within a partition
    combOp : function
        an associative function used to combine results from different partitions
    depth : int, optional, default 2
        suggested depth of the tree

    Returns
    -------
    U
        the aggregated result

    Raises
    ------
    ValueError
        if `depth` is smaller than 1

    See Also
    --------
    :meth:`RDD.aggregate`
    :meth:`RDD.treeReduce`

    Examples
    --------
    >>> add = lambda x, y: x + y
    >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10)
    >>> rdd.treeAggregate(0, add, add)
    -5
    >>> rdd.treeAggregate(0, add, add, 1)
    -5
    >>> rdd.treeAggregate(0, add, add, 2)
    -5
    >>> rdd.treeAggregate(0, add, add, 5)
    -5
    >>> rdd.treeAggregate(0, add, add, 10)
    -5
    """
    if depth < 1:
        raise ValueError("Depth cannot be smaller than 1 but got %d." % depth)

    if self.getNumPartitions() == 0:
        return zeroValue

    def aggregatePartition(iterator: Iterable[T]) -> Iterable[U]:
        running = zeroValue
        for element in iterator:
            running = seqOp(running, element)
        yield running

    level = self.mapPartitions(aggregatePartition)
    remaining = level.getNumPartitions()
    scale = max(int(ceil(pow(remaining, 1.0 / depth))), 2)

    # If creating an extra level doesn't help reduce the wall-clock time, we stop the tree
    # aggregation.
    while remaining > scale + remaining / scale:
        remaining /= scale  # type: ignore[assignment]
        curNumPartitions = int(remaining)

        def mapPartition(i: int, iterator: Iterable[U]) -> Iterable[Tuple[int, U]]:
            # Key each partial result by the partition it is merged into.
            for partial in iterator:
                yield (i % curNumPartitions, partial)

        keyed = level.mapPartitionsWithIndex(mapPartition)
        level = keyed.reduceByKey(combOp, curNumPartitions).values()

    return level.reduce(combOp)
def max(self: "RDD[T]", key: Optional[Callable[[T], "S"]] = None) -> T:
    """
    Find the maximum item in this RDD.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    key : function, optional
        A function used to generate key for comparing

    Returns
    -------
    T
        the maximum item

    See Also
    --------
    :meth:`RDD.min`

    Examples
    --------
    >>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
    >>> rdd.max()
    43.0
    >>> rdd.max(key=str)
    5.0
    """
    if key is not None:
        # Compare items through `key` rather than directly.
        return self.reduce(lambda a, b: max(a, b, key=key))  # type: ignore[arg-type]
    return self.reduce(max)  # type: ignore[arg-type]
def min(self: "RDD[T]", key: Optional[Callable[[T], "S"]] = None) -> T:
    """
    Find the minimum item in this RDD.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    key : function, optional
        A function used to generate key for comparing

    Returns
    -------
    T
        the minimum item

    See Also
    --------
    :meth:`RDD.max`

    Examples
    --------
    >>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])
    >>> rdd.min()
    2.0
    >>> rdd.min(key=str)
    10.0
    """
    if key is not None:
        # Compare items through `key` rather than directly.
        return self.reduce(lambda a, b: min(a, b, key=key))  # type: ignore[arg-type]
    return self.reduce(min)  # type: ignore[arg-type]
def sum(self: "RDD[NumberOrArray]") -> "NumberOrArray":
    """
    Add up the elements in this RDD.

    .. versionadded:: 0.7.0

    Returns
    -------
    float, int, or complex
        the sum of all elements

    See Also
    --------
    :meth:`RDD.mean`
    :meth:`RDD.sumApprox`

    Examples
    --------
    >>> sc.parallelize([1.0, 2.0, 3.0]).sum()
    6.0
    """
    # Sum each partition first, then add the per-partition totals starting from 0.
    partition_totals = self.mapPartitions(lambda x: [sum(x)])
    return partition_totals.fold(0, operator.add)  # type: ignore[return-value]
def count(self) -> int:
    """
    Return the number of elements in this RDD.

    .. versionadded:: 0.7.0

    Returns
    -------
    int
        the number of elements

    See Also
    --------
    :meth:`RDD.countApprox`
    :meth:`pyspark.sql.DataFrame.count`

    Examples
    --------
    >>> sc.parallelize([2, 3, 4]).count()
    3
    """
    # Count each partition without materializing it, then add the counts up.
    partition_sizes = self.mapPartitions(lambda i: [sum(1 for _ in i)])
    return partition_sizes.sum()
def stats(self: "RDD[NumberOrArray]") -> StatCounter:
    """
    Return a :class:`StatCounter` object that captures the mean, variance
    and count of the RDD's elements in one operation.

    .. versionadded:: 0.9.1

    Returns
    -------
    :class:`StatCounter`
        a :class:`StatCounter` capturing the mean, variance and count of all elements

    See Also
    --------
    :meth:`RDD.stdev`
    :meth:`RDD.sampleStdev`
    :meth:`RDD.variance`
    :meth:`RDD.sampleVariance`
    :meth:`RDD.histogram`
    :meth:`pyspark.sql.DataFrame.stat`
    """

    def mergeCounters(accumulated: StatCounter, incoming: StatCounter) -> StatCounter:
        return accumulated.mergeStats(incoming)

    # One StatCounter per partition, merged pairwise into a single counter.
    per_partition = self.mapPartitions(lambda i: [StatCounter(i)])
    return per_partition.reduce(mergeCounters)  # type: ignore[arg-type]
def histogram(
    self: "RDD[S]", buckets: Union[int, List["S"], Tuple["S", ...]]
) -> Tuple[Sequence["S"], List[int]]:
    """
    Compute a histogram using the provided buckets. The buckets
    are all open to the right except for the last which is closed.
    e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
    which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
    and 50 we would have a histogram of 1,0,1.

    If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
    this can be switched from an O(log n) insertion to O(1) per
    element (where n is the number of buckets).

    Buckets must be sorted, not contain any duplicates, and have
    at least two elements.

    If `buckets` is a number, it will generate buckets which are
    evenly spaced between the minimum and maximum of the RDD. For
    example, if the min value is 0 and the max is 100, given `buckets`
    as 2, the resulting buckets will be [0,50) [50,100]. `buckets` must
    be at least 1. An exception is raised if the RDD contains infinity.
    If the elements in the RDD do not vary (max == min), a single bucket
    will be used.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    buckets : int, or list, or tuple
        if `buckets` is a number, it computes a histogram of the data using
        `buckets` number of buckets evenly, otherwise, `buckets` is the provided
        buckets to bin the data.

    Returns
    -------
    tuple
        a tuple of buckets and histogram

    Raises
    ------
    ValueError
        if `buckets` is a number below 1, the RDD is empty or contains
        infinity while buckets must be generated, or the provided buckets
        are too short, unsorted, duplicated, or contain None/NaN
    TypeError
        if `buckets` is neither a number nor a list/tuple, or buckets must
        be generated from non-numeric elements

    See Also
    --------
    :meth:`RDD.stats`

    Examples
    --------
    >>> rdd = sc.parallelize(range(51))
    >>> rdd.histogram(2)
    ([0, 25, 50], [25, 26])
    >>> rdd.histogram([0, 5, 25, 50])
    ([0, 5, 25, 50], [5, 20, 26])
    >>> rdd.histogram([0, 15, 30, 45, 60])  # evenly spaced buckets
    ([0, 15, 30, 45, 60], [15, 15, 15, 6])
    >>> rdd = sc.parallelize(["ab", "ac", "b", "bd", "ef"])
    >>> rdd.histogram(("a", "b", "c"))
    (('a', 'b', 'c'), [2, 2])
    """

    if isinstance(buckets, int):
        if buckets < 1:
            raise ValueError("number of buckets must be >= 1")

        # filter out non-comparable elements
        def comparable(x: Any) -> bool:
            if x is None:
                return False
            if type(x) is float and isnan(x):
                return False
            return True

        filtered = self.filter(comparable)

        # faster than stats()
        def minmax(a: Tuple["S", "S"], b: Tuple["S", "S"]) -> Tuple["S", "S"]:
            return min(a[0], b[0]), max(a[1], b[1])

        try:
            minv, maxv = filtered.map(lambda x: (x, x)).reduce(minmax)
        except (TypeError, ValueError) as e:
            # reduce() reports an empty RDD with a ValueError, so that type
            # must be caught here too; otherwise the dedicated message below
            # could never be produced. Any other error is passed through.
            if " empty " in str(e):
                raise ValueError("can not generate buckets from empty RDD")
            raise

        if minv == maxv or buckets == 1:
            return [minv, maxv], [filtered.count()]

        try:
            inc = (maxv - minv) / buckets  # type: ignore[operator]
        except TypeError:
            raise TypeError("Can not generate buckets with non-number in RDD")

        if isinf(inc):
            raise ValueError("Can not generate buckets with infinite value")

        # keep them as integer if possible
        inc = int(inc)
        if inc * buckets != maxv - minv:  # type: ignore[operator]
            inc = (maxv - minv) * 1.0 / buckets  # type: ignore[operator]

        buckets = [i * inc + minv for i in range(buckets)]
        buckets.append(maxv)  # fix accumulated error
        even = True

    elif isinstance(buckets, (list, tuple)):
        if len(buckets) < 2:
            raise ValueError("buckets should have more than one value")

        if any(i is None or isinstance(i, float) and isnan(i) for i in buckets):
            raise ValueError("can not have None or NaN in buckets")

        if sorted(buckets) != list(buckets):
            raise ValueError("buckets should be sorted")

        if len(set(buckets)) != len(buckets):
            raise ValueError("buckets should not contain duplicated values")

        minv = buckets[0]
        maxv = buckets[-1]
        even = False
        inc = None
        try:
            steps = [
                upper - lower  # type: ignore[operator]
                for lower, upper in zip(buckets, buckets[1:])
            ]
        except TypeError:
            pass  # objects in buckets do not support '-'
        else:
            if max(steps) - min(steps) < 1e-10:  # handle precision errors
                even = True
                inc = (maxv - minv) / (len(buckets) - 1)  # type: ignore[operator]

    else:
        raise TypeError("buckets should be a list or tuple or number(int or long)")

    def countPartition(iterator: Iterable["S"]) -> Iterable[List[int]]:
        # One counter per boundary; the extra last slot only ever receives
        # values equal to the top boundary and is folded into the last bucket.
        counters = [0] * len(buckets)  # type: ignore[arg-type]
        for i in iterator:
            if i is None or (isinstance(i, float) and isnan(i)) or i > maxv or i < minv:
                continue
            if even:
                # Evenly spaced buckets: the index is computed directly.
                t = int((i - minv) / inc)  # type: ignore[operator]
            else:
                t = bisect.bisect_right(buckets, i) - 1  # type: ignore[arg-type]
            counters[t] += 1
        # add last two together
        last = counters.pop()
        counters[-1] += last
        return [counters]

    def mergeCounters(a: List[int], b: List[int]) -> List[int]:
        return [i + j for i, j in zip(a, b)]

    return buckets, self.mapPartitions(countPartition).reduce(mergeCounters)
def mean(self: "RDD[NumberOrArray]") -> float:
    """
    Compute the mean of this RDD's elements.

    .. versionadded:: 0.9.1

    Returns
    -------
    float
        the mean of all elements

    See Also
    --------
    :meth:`RDD.stats`
    :meth:`RDD.sum`
    :meth:`RDD.meanApprox`

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).mean()
    2.0
    """
    summary = self.stats()
    return summary.mean()
def variance(self: "RDD[NumberOrArray]") -> float:
    """
    Compute the variance of this RDD's elements.

    .. versionadded:: 0.9.1

    Returns
    -------
    float
        the variance of all elements

    See Also
    --------
    :meth:`RDD.stats`
    :meth:`RDD.sampleVariance`
    :meth:`RDD.stdev`
    :meth:`RDD.sampleStdev`

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).variance()
    0.666...
    """
    summary = self.stats()
    return summary.variance()
def stdev(self: "RDD[NumberOrArray]") -> float:
    """
    Compute the standard deviation of this RDD's elements.

    .. versionadded:: 0.9.1

    Returns
    -------
    float
        the standard deviation of all elements

    See Also
    --------
    :meth:`RDD.stats`
    :meth:`RDD.sampleStdev`
    :meth:`RDD.variance`
    :meth:`RDD.sampleVariance`

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).stdev()
    0.816...
    """
    summary = self.stats()
    return summary.stdev()
def sampleStdev(self: "RDD[NumberOrArray]") -> float:
    """
    Compute the sample standard deviation of this RDD's elements (which
    corrects for bias in estimating the standard deviation by dividing by
    N-1 instead of N).

    .. versionadded:: 0.9.1

    Returns
    -------
    float
        the sample standard deviation of all elements

    See Also
    --------
    :meth:`RDD.stats`
    :meth:`RDD.stdev`
    :meth:`RDD.variance`
    :meth:`RDD.sampleVariance`

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).sampleStdev()
    1.0
    """
    summary = self.stats()
    return summary.sampleStdev()
def sampleVariance(self: "RDD[NumberOrArray]") -> float:
    """
    Compute the sample variance of this RDD's elements (which corrects
    for bias in estimating the variance by dividing by N-1 instead of N).

    .. versionadded:: 0.9.1

    Returns
    -------
    float
        the sample variance of all elements

    See Also
    --------
    :meth:`RDD.stats`
    :meth:`RDD.variance`
    :meth:`RDD.stdev`
    :meth:`RDD.sampleStdev`

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).sampleVariance()
    1.0
    """
    summary = self.stats()
    return summary.sampleVariance()
def countByValue(self: "RDD[K]") -> Dict[K, int]:
    """
    Return the count of each unique value in this RDD as a dictionary of
    (value, count) pairs.

    .. versionadded:: 0.7.0

    Returns
    -------
    dict
        a dictionary of (value, count) pairs

    See Also
    --------
    :meth:`RDD.collectAsMap`
    :meth:`RDD.countByKey`

    Examples
    --------
    >>> sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())
    [(1, 2), (2, 3)]
    """

    def countPartition(iterator: Iterable[K]) -> Iterable[Dict[K, int]]:
        tally: Dict[K, int] = defaultdict(int)
        for value in iterator:
            tally[value] += 1
        yield tally

    def mergeMaps(m1: Dict[K, int], m2: Dict[K, int]) -> Dict[K, int]:
        # Fold the right-hand tally into the left-hand one, in place.
        for value, occurrences in m2.items():
            m1[value] += occurrences
        return m1

    per_partition = self.mapPartitions(countPartition)
    return per_partition.reduce(mergeMaps)
def top(self: "RDD[T]", num: int, key: Optional[Callable[[T], "S"]] = None) -> List[T]:
    """
    Get the top N elements from an RDD.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    num : int
        top N
    key : function, optional
        a function used to generate key for comparing

    Returns
    -------
    list
        the top N elements, sorted in descending order; an empty list
        when `num` is 0 or the RDD has no partitions

    See Also
    --------
    :meth:`RDD.takeOrdered`
    :meth:`RDD.max`
    :meth:`RDD.min`

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
    [12]
    >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
    [6, 5]
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
    [4, 3, 2]
    >>> sc.emptyRDD().top(3)
    []
    """
    # Same guard as takeOrdered(): with no partitions there is nothing for
    # reduce() to fold, so answer "no elements" instead of failing.
    if num == 0 or self.getNumPartitions() == 0:
        return []

    def topIterator(iterator: Iterable[T]) -> Iterable[List[T]]:
        # One candidate list per partition.
        yield heapq.nlargest(num, iterator, key=key)

    def merge(a: List[T], b: List[T]) -> List[T]:
        # Keep only the best `num` of two candidate lists.
        return heapq.nlargest(num, a + b, key=key)

    return self.mapPartitions(topIterator).reduce(merge)
def takeOrdered(self: "RDD[T]", num: int, key: Optional[Callable[[T], "S"]] = None) -> List[T]:
    """
    Get the first N elements of an RDD in ascending order, or in the order
    induced by the optional key function.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    num : int
        number of elements to return
    key : function, optional
        a function used to generate key for comparing

    Returns
    -------
    list
        the N smallest elements

    Raises
    ------
    ValueError
        if `num` is negative

    See Also
    --------
    :meth:`RDD.top`
    :meth:`RDD.max`
    :meth:`RDD.min`

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
    [1, 2, 3, 4, 5, 6]
    >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)
    [10, 9, 7, 6, 5, 4]
    >>> sc.emptyRDD().takeOrdered(3)
    []
    """
    if num < 0:
        raise ValueError("top N cannot be negative.")
    # Nothing requested, or nothing to scan: skip running a job.
    if num == 0 or self.getNumPartitions() == 0:
        return []

    def smallest(candidates: Iterable[T]) -> List[T]:
        return heapq.nsmallest(num, candidates, key)

    def per_partition(it: Iterable[T]) -> List[List[T]]:
        # One candidate list per partition.
        return [smallest(it)]

    def merge(a: List[T], b: List[T]) -> List[T]:
        return smallest(a + b)

    return self.mapPartitions(per_partition).reduce(merge)
def take(self: "RDD[T]", num: int) -> List[T]:
    """
    Take the first num elements of the RDD.

    One partition is scanned first; the number of elements it produced is
    then used to estimate how many further partitions must be scanned to
    satisfy the limit.

    Translated from the Scala implementation in RDD#take().

    .. versionadded:: 0.7.0

    Parameters
    ----------
    num : int
        first number of elements

    Returns
    -------
    list
        the first `num` elements

    See Also
    --------
    :meth:`RDD.first`
    :meth:`pyspark.sql.DataFrame.take`

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
    [2, 3]
    >>> sc.parallelize([2, 3, 4, 5, 6]).take(10)
    [2, 3, 4, 5, 6]
    >>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3)
    [91, 92, 93]
    """
    collected: List[T] = []
    partition_count = self.getNumPartitions()
    scanned = 0

    while len(collected) < num and scanned < partition_count:
        # How many partitions to try in this round. The value may overshoot
        # partition_count; the range handed to runJob is capped below.
        if scanned == 0:
            batch = 1
        elif len(collected) == 0:
            # Nothing found so far: quadruple and retry.
            batch = scanned * 4
        else:
            # Interpolate from the hit rate so far, overestimating by 50%,
            # then clamp to [1, 4 * scanned].
            estimate = int(1.5 * num * scanned / len(collected)) - scanned
            batch = min(max(estimate, 1), scanned * 4)

        left = num - len(collected)

        def takeUpToNumLeft(iterator: Iterable[T]) -> Iterable[T]:
            # Pull at most `left` elements without draining the partition.
            source = iter(iterator)
            for _ in range(left):
                try:
                    yield next(source)
                except StopIteration:
                    return

        upper = min(scanned + batch, partition_count)
        collected += self.context.runJob(self, takeUpToNumLeft, range(scanned, upper))
        scanned += batch

    return collected[:num]
def first(self: "RDD[T]") -> T:
    """
    Return the first element in this RDD.

    .. versionadded:: 0.7.0

    Returns
    -------
    T
        the first element

    Raises
    ------
    ValueError
        if the RDD contains no elements

    See Also
    --------
    :meth:`RDD.take`
    :meth:`pyspark.sql.DataFrame.first`
    :meth:`pyspark.sql.DataFrame.head`

    Examples
    --------
    >>> sc.parallelize([2, 3, 4]).first()
    2
    >>> sc.parallelize([]).first()
    Traceback (most recent call last):
        ...
    ValueError: RDD is empty
    """
    head = self.take(1)
    if not head:
        raise ValueError("RDD is empty")
    return head[0]
def isEmpty(self) -> bool:
    """
    Tell whether the RDD contains no elements at all.

    .. versionadded:: 1.3.0

    Returns
    -------
    bool
        whether the :class:`RDD` is empty

    See Also
    --------
    :meth:`RDD.first`
    :meth:`pyspark.sql.DataFrame.isEmpty`

    Notes
    -----
    An RDD may be empty even when it has at least 1 partition.

    Examples
    --------
    >>> sc.parallelize([]).isEmpty()
    True
    >>> sc.parallelize([1]).isEmpty()
    False
    """
    # No partitions means no elements; no need to ask for one.
    if self.getNumPartitions() == 0:
        return True
    return len(self.take(1)) == 0
def saveAsNewAPIHadoopDataset(
    self: "RDD[Tuple[K, V]]",
    conf: Dict[str, str],
    keyConverter: Optional[str] = None,
    valueConverter: Optional[str] = None,
) -> None:
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the new Hadoop OutputFormat API (mapreduce
    package). Keys/values are converted for output using either user
    specified converters or, by default,
    "org.apache.spark.api.python.JavaToWritableConverter".

    .. versionadded:: 1.1.0

    Parameters
    ----------
    conf : dict
        Hadoop job configuration
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)

    See Also
    --------
    :meth:`SparkContext.newAPIHadoopRDD`
    :meth:`RDD.saveAsHadoopDataset`
    :meth:`RDD.saveAsHadoopFile`
    :meth:`RDD.saveAsNewAPIHadoopFile`
    :meth:`RDD.saveAsSequenceFile`

    Examples
    --------
    >>> import os
    >>> import tempfile

    Set the related classes

    >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
    >>> input_format_class = "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat"
    >>> key_class = "org.apache.hadoop.io.IntWritable"
    >>> value_class = "org.apache.hadoop.io.Text"

    >>> with tempfile.TemporaryDirectory() as d:
    ...     path = os.path.join(d, "new_hadoop_file")
    ...
    ...     # Create the conf for writing
    ...     write_conf = {
    ...         "mapreduce.job.outputformat.class": (output_format_class),
    ...         "mapreduce.job.output.key.class": key_class,
    ...         "mapreduce.job.output.value.class": value_class,
    ...         "mapreduce.output.fileoutputformat.outputdir": path,
    ...     }
    ...
    ...     # Write a temporary Hadoop file
    ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
    ...     rdd.saveAsNewAPIHadoopDataset(conf=write_conf)
    ...
    ...     # Create the conf for reading
    ...     read_conf = {"mapreduce.input.fileinputformat.inputdir": path}
    ...
    ...     # Load this Hadoop file as an RDD
    ...     loaded = sc.newAPIHadoopRDD(input_format_class,
    ...         key_class, value_class, conf=read_conf)
    ...     sorted(loaded.collect())
    [(1, ''), (1, 'a'), (3, 'x')]
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    jvm = self.ctx._jvm
    assert jvm is not None
    # The trailing True is the only difference from the old-API call made by
    # saveAsHadoopDataset (which passes False). The first True is presumably
    # a "batch serialized" flag -- confirm against the JVM-side PythonRDD.
    jvm.PythonRDD.saveAsHadoopDataset(
        pickled._jrdd, True, java_conf, keyConverter, valueConverter, True
    )
def saveAsNewAPIHadoopFile(
    self: "RDD[Tuple[K, V]]",
    path: str,
    outputFormatClass: str,
    keyClass: Optional[str] = None,
    valueClass: Optional[str] = None,
    keyConverter: Optional[str] = None,
    valueConverter: Optional[str] = None,
    conf: Optional[Dict[str, str]] = None,
) -> None:
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the new Hadoop OutputFormat API (mapreduce
    package). Key and value types will be inferred if not specified. Keys
    and values are converted for output using either user specified
    converters or "org.apache.spark.api.python.JavaToWritableConverter".
    The `conf` is applied on top of the base Hadoop conf associated with
    the SparkContext of this RDD to create a merged Hadoop MapReduce job
    configuration for saving the data.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    path : str
        path to Hadoop file
    outputFormatClass : str
        fully qualified classname of Hadoop OutputFormat
        (e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
    keyClass : str, optional
        fully qualified classname of key Writable class
        (e.g. "org.apache.hadoop.io.IntWritable", None by default)
    valueClass : str, optional
        fully qualified classname of value Writable class
        (e.g. "org.apache.hadoop.io.Text", None by default)
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)
    conf : dict, optional
        Hadoop job configuration (None by default)

    See Also
    --------
    :meth:`SparkContext.newAPIHadoopFile`
    :meth:`RDD.saveAsHadoopDataset`
    :meth:`RDD.saveAsNewAPIHadoopDataset`
    :meth:`RDD.saveAsHadoopFile`
    :meth:`RDD.saveAsSequenceFile`

    Examples
    --------
    >>> import os
    >>> import tempfile

    Set the class of output format

    >>> output_format_class = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"

    >>> with tempfile.TemporaryDirectory() as d:
    ...     path = os.path.join(d, "hadoop_file")
    ...
    ...     # Write a temporary Hadoop file
    ...     rdd = sc.parallelize([(1, {3.0: "bb"}), (2, {1.0: "aa"}), (3, {2.0: "dd"})])
    ...     rdd.saveAsNewAPIHadoopFile(path, output_format_class)
    ...
    ...     # Load this Hadoop file as an RDD
    ...     sorted(sc.sequenceFile(path).collect())
    [(1, {3.0: 'bb'}), (2, {1.0: 'aa'}), (3, {2.0: 'dd'})]
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    jvm = self.ctx._jvm
    assert jvm is not None
    # The literal True is presumably a "batch serialized" flag -- confirm
    # against the JVM-side PythonRDD.
    jvm.PythonRDD.saveAsNewAPIHadoopFile(
        pickled._jrdd,
        True,
        path,
        outputFormatClass,
        keyClass,
        valueClass,
        keyConverter,
        valueConverter,
        java_conf,
    )
def saveAsHadoopDataset(
    self: "RDD[Tuple[K, V]]",
    conf: Dict[str, str],
    keyConverter: Optional[str] = None,
    valueConverter: Optional[str] = None,
) -> None:
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the old Hadoop OutputFormat API (mapred
    package). Keys/values are converted for output using either user
    specified converters or, by default,
    "org.apache.spark.api.python.JavaToWritableConverter".

    .. versionadded:: 1.1.0

    Parameters
    ----------
    conf : dict
        Hadoop job configuration
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)

    See Also
    --------
    :meth:`SparkContext.hadoopRDD`
    :meth:`RDD.saveAsNewAPIHadoopDataset`
    :meth:`RDD.saveAsHadoopFile`
    :meth:`RDD.saveAsNewAPIHadoopFile`
    :meth:`RDD.saveAsSequenceFile`

    Examples
    --------
    >>> import os
    >>> import tempfile

    Set the related classes

    >>> output_format_class = "org.apache.hadoop.mapred.TextOutputFormat"
    >>> input_format_class = "org.apache.hadoop.mapred.TextInputFormat"
    >>> key_class = "org.apache.hadoop.io.IntWritable"
    >>> value_class = "org.apache.hadoop.io.Text"

    >>> with tempfile.TemporaryDirectory() as d:
    ...     path = os.path.join(d, "old_hadoop_file")
    ...
    ...     # Create the conf for writing
    ...     write_conf = {
    ...         "mapred.output.format.class": output_format_class,
    ...         "mapreduce.job.output.key.class": key_class,
    ...         "mapreduce.job.output.value.class": value_class,
    ...         "mapreduce.output.fileoutputformat.outputdir": path,
    ...     }
    ...
    ...     # Write a temporary Hadoop file
    ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
    ...     rdd.saveAsHadoopDataset(conf=write_conf)
    ...
    ...     # Create the conf for reading
    ...     read_conf = {"mapreduce.input.fileinputformat.inputdir": path}
    ...
    ...     # Load this Hadoop file as an RDD
    ...     loaded = sc.hadoopRDD(input_format_class, key_class, value_class, conf=read_conf)
    ...     sorted(loaded.collect())
    [(0, '1\\t'), (0, '1\\ta'), (0, '3\\tx')]
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    jvm = self.ctx._jvm
    assert jvm is not None
    # The trailing False is the only difference from the new-API call made by
    # saveAsNewAPIHadoopDataset (which passes True). The first True is
    # presumably a "batch serialized" flag -- confirm against PythonRDD.
    jvm.PythonRDD.saveAsHadoopDataset(
        pickled._jrdd, True, java_conf, keyConverter, valueConverter, False
    )
def saveAsHadoopFile(
    self: "RDD[Tuple[K, V]]",
    path: str,
    outputFormatClass: str,
    keyClass: Optional[str] = None,
    valueClass: Optional[str] = None,
    keyConverter: Optional[str] = None,
    valueConverter: Optional[str] = None,
    conf: Optional[Dict[str, str]] = None,
    compressionCodecClass: Optional[str] = None,
) -> None:
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the old Hadoop OutputFormat API (mapred
    package). Key and value types will be inferred if not specified. Keys
    and values are converted for output using either user specified
    converters or "org.apache.spark.api.python.JavaToWritableConverter".

    The `conf` is applied on top of the base Hadoop conf associated with
    the SparkContext of this RDD to create a merged Hadoop MapReduce job
    configuration for saving the data.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    path : str
        path to Hadoop file
    outputFormatClass : str
        fully qualified classname of Hadoop OutputFormat
        (e.g. "org.apache.hadoop.mapred.SequenceFileOutputFormat")
    keyClass : str, optional
        fully qualified classname of key Writable class
        (e.g. "org.apache.hadoop.io.IntWritable", None by default)
    valueClass : str, optional
        fully qualified classname of value Writable class
        (e.g. "org.apache.hadoop.io.Text", None by default)
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)
    conf : dict, optional
        Hadoop job configuration (None by default)
    compressionCodecClass : str, optional
        fully qualified classname of the compression codec class
        e.g. "org.apache.hadoop.io.compress.GzipCodec" (None by default)

    See Also
    --------
    :meth:`SparkContext.hadoopFile`
    :meth:`RDD.saveAsNewAPIHadoopFile`
    :meth:`RDD.saveAsHadoopDataset`
    :meth:`RDD.saveAsNewAPIHadoopDataset`
    :meth:`RDD.saveAsSequenceFile`

    Examples
    --------
    >>> import os
    >>> import tempfile

    Set the related classes

    >>> output_format_class = "org.apache.hadoop.mapred.TextOutputFormat"
    >>> input_format_class = "org.apache.hadoop.mapred.TextInputFormat"
    >>> key_class = "org.apache.hadoop.io.IntWritable"
    >>> value_class = "org.apache.hadoop.io.Text"

    >>> with tempfile.TemporaryDirectory() as d:
    ...     path = os.path.join(d, "old_hadoop_file")
    ...
    ...     # Write a temporary Hadoop file
    ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
    ...     rdd.saveAsHadoopFile(path, output_format_class, key_class, value_class)
    ...
    ...     # Load this Hadoop file as an RDD
    ...     loaded = sc.hadoopFile(path, input_format_class, key_class, value_class)
    ...     sorted(loaded.collect())
    [(0, '1\\t'), (0, '1\\ta'), (0, '3\\tx')]
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    jvm = self.ctx._jvm
    assert jvm is not None
    # The literal True is presumably a "batch serialized" flag -- confirm
    # against the JVM-side PythonRDD.
    jvm.PythonRDD.saveAsHadoopFile(
        pickled._jrdd,
        True,
        path,
        outputFormatClass,
        keyClass,
        valueClass,
        keyConverter,
        valueConverter,
        java_conf,
        compressionCodecClass,
    )
def saveAsSequenceFile(
    self: "RDD[Tuple[K, V]]", path: str, compressionCodecClass: Optional[str] = None
) -> None:
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system, using the "org.apache.hadoop.io.Writable" types
    that we convert from the RDD's key and value types. The mechanism is
    as follows:

        1. Pickle is used to convert pickled Python RDD into RDD of Java objects.
        2. Keys and values of this Java RDD are converted to Writables and written out.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    path : str
        path to sequence file
    compressionCodecClass : str, optional
        fully qualified classname of the compression codec class
        e.g. "org.apache.hadoop.io.compress.GzipCodec" (None by default)

    See Also
    --------
    :meth:`SparkContext.sequenceFile`
    :meth:`RDD.saveAsHadoopFile`
    :meth:`RDD.saveAsNewAPIHadoopFile`
    :meth:`RDD.saveAsHadoopDataset`
    :meth:`RDD.saveAsNewAPIHadoopDataset`

    Examples
    --------
    >>> import os
    >>> import tempfile

    >>> with tempfile.TemporaryDirectory() as d:
    ...     path = os.path.join(d, "sequence_file")
    ...
    ...     # Write a temporary sequence file
    ...     rdd = sc.parallelize([(1, ""), (1, "a"), (3, "x")])
    ...     rdd.saveAsSequenceFile(path)
    ...
    ...     # Load this sequence file as an RDD
    ...     loaded = sc.sequenceFile(path)
    ...     sorted(loaded.collect())
    [(1, ''), (1, 'a'), (3, 'x')]
    """
    pickled = self._pickled()
    jvm = self.ctx._jvm
    assert jvm is not None
    # The literal True is presumably a "batch serialized" flag -- confirm
    # against the JVM-side PythonRDD.
    jvm.PythonRDD.saveAsSequenceFile(pickled._jrdd, True, path, compressionCodecClass)
def saveAsPickleFile(self, path: str, batchSize: int = 10) -> None:
    """
    Save this RDD as a SequenceFile of serialized objects. The serializer
    used is :class:`pyspark.serializers.CPickleSerializer`, default batch
    size is 10.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    path : str
        path to pickled file
    batchSize : int, optional, default 10
        the number of Python objects represented as a single Java object.
        A value of 0 selects automatic batching.

    See Also
    --------
    :meth:`SparkContext.pickleFile`

    Examples
    --------
    >>> import os
    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as d:
    ...     path = os.path.join(d, "pickle_file")
    ...
    ...     # Write a temporary pickled file
    ...     sc.parallelize(range(10)).saveAsPickleFile(path, 3)
    ...
    ...     # Load pickled file as an RDD
    ...     sorted(sc.pickleFile(path, 3).collect())
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    """
    pickler = CPickleSerializer()
    # batchSize == 0 delegates the batch size choice to AutoBatchedSerializer.
    ser: Serializer = (
        AutoBatchedSerializer(pickler)
        if batchSize == 0
        else BatchedSerializer(pickler, batchSize)
    )
    reserialized = self._reserialize(ser)
    reserialized._jrdd.saveAsObjectFile(path)
def saveAsTextFile(self, path: str, compressionCodecClass: Optional[str] = None) -> None:
    """
    Save this RDD as a text file, using string representations of elements.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    path : str
        path to text file
    compressionCodecClass : str, optional
        fully qualified classname of the compression codec class
        e.g. "org.apache.hadoop.io.compress.GzipCodec" (None by default)

    See Also
    --------
    :meth:`SparkContext.textFile`
    :meth:`SparkContext.wholeTextFiles`

    Examples
    --------
    >>> import os
    >>> import tempfile
    >>> from fileinput import input
    >>> from glob import glob
    >>> with tempfile.TemporaryDirectory() as d1:
    ...     path1 = os.path.join(d1, "text_file1")
    ...
    ...     # Write a temporary text file
    ...     sc.parallelize(range(10)).saveAsTextFile(path1)
    ...
    ...     # Load text file as an RDD
    ...     ''.join(sorted(input(glob(path1 + "/part-0000*"))))
    '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'

    Empty lines are tolerated when saving to text files.

    >>> with tempfile.TemporaryDirectory() as d2:
    ...     path2 = os.path.join(d2, "text2_file2")
    ...
    ...     # Write another temporary text file
    ...     sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(path2)
    ...
    ...     # Load text file as an RDD
    ...     ''.join(sorted(input(glob(path2 + "/part-0000*"))))
    '\\n\\n\\nbar\\nfoo\\n'

    Using compressionCodecClass

    >>> from fileinput import input, hook_compressed
    >>> with tempfile.TemporaryDirectory() as d3:
    ...     path3 = os.path.join(d3, "text3")
    ...     codec = "org.apache.hadoop.io.compress.GzipCodec"
    ...
    ...     # Write another temporary text file with specified codec
    ...     sc.parallelize(['foo', 'bar']).saveAsTextFile(path3, codec)
    ...
    ...     # Load text file as an RDD
    ...     result = sorted(input(glob(path3 + "/part*.gz"), openhook=hook_compressed))
    ...     ''.join([r.decode('utf-8') if isinstance(r, bytes) else r for r in result])
    'bar\\nfoo\\n'
    """

    def func(split: int, iterator: Iterable[Any]) -> Iterable[bytes]:
        # Every element leaves Python as UTF-8 bytes; bytes pass through as-is.
        for record in iterator:
            if isinstance(record, bytes):
                yield record
                continue
            text = record if isinstance(record, str) else str(record)
            yield text.encode("utf-8")

    keyed = self.mapPartitionsWithIndex(func)
    # func already emits raw bytes, so the RDD's serializer is bypassed.
    keyed._bypass_serializer = True  # type: ignore[attr-defined]
    jvm = self.ctx._jvm
    assert jvm is not None
    if not compressionCodecClass:
        keyed._jrdd.map(jvm.BytesToString()).saveAsTextFile(path)
        return
    # Resolve the codec class on the JVM before building the string RDD.
    codec = jvm.java.lang.Class.forName(compressionCodecClass)
    keyed._jrdd.map(jvm.BytesToString()).saveAsTextFile(path, codec)
# Pair functions
def collectAsMap(self: "RDD[Tuple[K, V]]") -> Dict[K, V]:
    """
    Collect the key-value pairs of this RDD on the master as a dictionary.

    .. versionadded:: 0.7.0

    Returns
    -------
    :class:`dict`
        a dictionary of (key, value) pairs

    See Also
    --------
    :meth:`RDD.countByValue`

    Notes
    -----
    This method should only be used if the resulting data is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
    >>> m[1]
    2
    >>> m[3]
    4
    """
    pairs = self.collect()
    # dict() keeps the last value seen for a repeated key.
    return dict(pairs)
def keys(self: "RDD[Tuple[K, V]]") -> "RDD[K]":
    """
    Build an RDD holding the key (first item) of each tuple.

    .. versionadded:: 0.7.0

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` only containing the keys

    See Also
    --------
    :meth:`RDD.values`

    Examples
    --------
    >>> rdd = sc.parallelize([(1, 2), (3, 4)]).keys()
    >>> rdd.collect()
    [1, 3]
    """

    def key_of(pair):  # type: ignore[no-untyped-def]
        return pair[0]

    return self.map(key_of)
def values(self: "RDD[Tuple[K, V]]") -> "RDD[V]":
    """
    Build an RDD holding the value (second item) of each tuple.

    .. versionadded:: 0.7.0

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` only containing the values

    See Also
    --------
    :meth:`RDD.keys`

    Examples
    --------
    >>> rdd = sc.parallelize([(1, 2), (3, 4)]).values()
    >>> rdd.collect()
    [2, 4]
    """

    def value_of(pair):  # type: ignore[no-untyped-def]
        return pair[1]

    return self.map(value_of)
def reduceByKey(
    self: "RDD[Tuple[K, V]]",
    func: Callable[[V, V], V],
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, V]]":
    """
    Merge the values for each key using an associative and commutative
    reduce function.

    This will also perform the merging locally on each mapper before
    sending results to a reducer, similarly to a "combiner" in MapReduce.

    Output will be partitioned with `numPartitions` partitions, or
    the default parallelism level if `numPartitions` is not specified.
    Default partitioner is hash-partition.

    .. versionadded:: 1.6.0

    Parameters
    ----------
    func : function
        the reduce function
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        function to compute the partition index

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the aggregated result for each key

    See Also
    --------
    :meth:`RDD.reduceByKeyLocally`
    :meth:`RDD.combineByKey`
    :meth:`RDD.aggregateByKey`
    :meth:`RDD.foldByKey`
    :meth:`RDD.groupByKey`

    Examples
    --------
    >>> from operator import add
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.reduceByKey(add).collect())
    [('a', 2), ('b', 1)]
    """

    def as_is(value):  # type: ignore[no-untyped-def]
        # The first value of a key is its own initial combiner.
        return value

    # `func` is both the map-side value merger and the combiner merger.
    reduced = self.combineByKey(as_is, func, func, numPartitions, partitionFunc)
    return reduced
def reduceByKeyLocally(self: "RDD[Tuple[K, V]]", func: Callable[[V, V], V]) -> Dict[K, V]:
    """
    Merge the values for each key using an associative and commutative
    reduce function, but return the results immediately to the master as
    a dictionary.

    This will also perform the merging locally on each mapper before
    sending results to a reducer, similarly to a "combiner" in MapReduce.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    func : function
        the reduce function

    Returns
    -------
    dict
        a dict containing the keys and the aggregated result for each key

    See Also
    --------
    :meth:`RDD.reduceByKey`
    :meth:`RDD.aggregateByKey`

    Examples
    --------
    >>> from operator import add
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.reduceByKeyLocally(add).items())
    [('a', 2), ('b', 1)]
    """
    guarded = fail_on_stopiteration(func)

    def absorb(acc: Dict[K, V], pairs: Iterable[Tuple[K, V]]) -> Dict[K, V]:
        # Fold (key, value) pairs into `acc` in place and hand it back.
        for key, value in pairs:
            if key in acc:
                acc[key] = guarded(acc[key], value)
            else:
                acc[key] = value
        return acc

    def reducePartition(iterator: Iterable[Tuple[K, V]]) -> Iterable[Dict[K, V]]:
        # One dictionary per partition, emitted as a single element.
        yield absorb({}, iterator)

    def mergeMaps(m1: Dict[K, V], m2: Dict[K, V]) -> Dict[K, V]:
        return absorb(m1, m2.items())

    return self.mapPartitions(reducePartition).reduce(mergeMaps)
def countByKey(self: "RDD[Tuple[K, V]]") -> Dict[K, int]:
    """
    Count the number of elements for each key, and return the result to
    the master as a dictionary.

    .. versionadded:: 0.7.0

    Returns
    -------
    dict
        a dictionary of (key, count) pairs

    See Also
    --------
    :meth:`RDD.collectAsMap`
    :meth:`RDD.countByValue`

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.countByKey().items())
    [('a', 2), ('b', 1)]
    """

    def key_of(pair):  # type: ignore[no-untyped-def]
        return pair[0]

    # Counting keys is counting values of the key-only projection.
    only_keys = self.map(key_of)
    return only_keys.countByValue()
def join(
    self: "RDD[Tuple[K, V]]",
    other: "RDD[Tuple[K, U]]",
    numPartitions: Optional[int] = None,
) -> "RDD[Tuple[K, Tuple[V, U]]]":
    """
    Return an RDD containing all pairs of elements with matching keys in
    `self` and `other`.

    Each pair of elements will be returned as a (k, (v1, v2)) tuple, where
    (k, v1) is in `self` and (k, v2) is in `other`.

    Performs a hash join across the cluster.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing all pairs of elements with matching keys

    See Also
    --------
    :meth:`RDD.leftOuterJoin`
    :meth:`RDD.rightOuterJoin`
    :meth:`RDD.fullOuterJoin`
    :meth:`RDD.cogroup`
    :meth:`RDD.groupWith`
    :meth:`pyspark.sql.DataFrame.join`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])
    >>> rdd2 = sc.parallelize([("a", 2), ("a", 3)])
    >>> sorted(rdd1.join(rdd2).collect())
    [('a', (1, 2)), ('a', (1, 3))]
    """
    # The join itself is implemented by pyspark.join.python_join.
    joined = python_join(self, other, numPartitions)
    return joined
def leftOuterJoin(
    self: "RDD[Tuple[K, V]]",
    other: "RDD[Tuple[K, U]]",
    numPartitions: Optional[int] = None,
) -> "RDD[Tuple[K, Tuple[V, Optional[U]]]]":
    """
    Perform a left outer join of `self` and `other`.

    For each element (k, v) in `self`, the resulting RDD will either
    contain all pairs (k, (v, w)) for w in `other`, or the pair
    (k, (v, None)) if no elements in `other` have key k.

    Hash-partitions the resulting RDD into the given number of partitions.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` with one entry per match for every element of
        `self`, and a ``(k, (v, None))`` entry where `other` has no match

    See Also
    --------
    :meth:`RDD.join`
    :meth:`RDD.rightOuterJoin`
    :meth:`RDD.fullOuterJoin`
    :meth:`pyspark.sql.DataFrame.join`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])
    >>> rdd2 = sc.parallelize([("a", 2)])
    >>> sorted(rdd1.leftOuterJoin(rdd2).collect())
    [('a', (1, 2)), ('b', (4, None))]
    """
    # The join itself is implemented by pyspark.join.python_left_outer_join.
    joined = python_left_outer_join(self, other, numPartitions)
    return joined
def rightOuterJoin(
    self: "RDD[Tuple[K, V]]",
    other: "RDD[Tuple[K, U]]",
    numPartitions: Optional[int] = None,
) -> "RDD[Tuple[K, Tuple[Optional[V], U]]]":
    """
    Perform a right outer join of `self` and `other`.

    For each element (k, w) in `other`, the resulting RDD will either
    contain all pairs (k, (v, w)) for v in `self`, or the pair
    (k, (None, w)) if no elements in `self` have key k.

    Hash-partitions the resulting RDD into the given number of partitions.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` with one entry per match for every element of
        `other`, and a ``(k, (None, w))`` entry where `self` has no match

    See Also
    --------
    :meth:`RDD.join`
    :meth:`RDD.leftOuterJoin`
    :meth:`RDD.fullOuterJoin`
    :meth:`pyspark.sql.DataFrame.join`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])
    >>> rdd2 = sc.parallelize([("a", 2)])
    >>> sorted(rdd2.rightOuterJoin(rdd1).collect())
    [('a', (2, 1)), ('b', (None, 4))]
    """
    # The join itself is implemented by pyspark.join.python_right_outer_join.
    joined = python_right_outer_join(self, other, numPartitions)
    return joined
def fullOuterJoin(
    self: "RDD[Tuple[K, V]]",
    other: "RDD[Tuple[K, U]]",
    numPartitions: Optional[int] = None,
) -> "RDD[Tuple[K, Tuple[Optional[V], Optional[U]]]]":
    """
    Perform a full outer join of `self` and `other`.

    For each element (k, v) in `self`, the resulting RDD will either
    contain all pairs (k, (v, w)) for w in `other`, or the pair
    (k, (v, None)) if no elements in `other` have key k.

    Similarly, for each element (k, w) in `other`, the resulting RDD will
    either contain all pairs (k, (v, w)) for v in `self`, or the pair
    (k, (None, w)) if no elements in `self` have key k.

    Hash-partitions the resulting RDD into the given number of partitions.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` with every key of either side, using None for the
        side that has no element with that key

    See Also
    --------
    :meth:`RDD.join`
    :meth:`RDD.leftOuterJoin`
    :meth:`RDD.rightOuterJoin`
    :meth:`pyspark.sql.DataFrame.join`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])
    >>> rdd2 = sc.parallelize([("a", 2), ("c", 8)])
    >>> sorted(rdd1.fullOuterJoin(rdd2).collect())
    [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
    """
    # The join itself is implemented by pyspark.join.python_full_outer_join.
    joined = python_full_outer_join(self, other, numPartitions)
    return joined
# TODO: add option to control map-side combining
# portable_hash is used as default, because builtin hash of None is different
# across machines.
def partitionBy(
    self: "RDD[Tuple[K, V]]",
    numPartitions: Optional[int],
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, V]]":
    """
    Return a copy of the RDD partitioned using the specified partitioner.

    If this RDD's partitioner already equals the requested one, the RDD
    itself is returned and nothing is shuffled.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    numPartitions : int or None
        the number of partitions in new :class:`RDD`; when None, the value
        of ``self._defaultReducePartitions()`` is used. The parameter has
        no default and must be passed explicitly.
    partitionFunc : function, optional, default `portable_hash`
        function to compute the partition index; its result is taken
        modulo `numPartitions`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` partitioned using the specified partitioner

    See Also
    --------
    :meth:`RDD.repartition`
    :meth:`RDD.repartitionAndSortWithinPartitions`

    Examples
    --------
    >>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))
    >>> sets = pairs.partitionBy(2).glom().collect()
    >>> len(set(sets[0]).intersection(set(sets[1])))
    0
    """
    if numPartitions is None:
        numPartitions = self._defaultReducePartitions()
    partitioner = Partitioner(numPartitions, partitionFunc)
    # Already partitioned the requested way: nothing to do.
    if self.partitioner == partitioner:
        return self

    # Transferring O(n) objects to Java is too expensive.
    # Instead, we'll form the hash buckets in Python,
    # transferring O(numPartitions) objects to Java.
    # Each object is a (splitNumber, [objects]) pair.
    # In order to avoid too huge objects, the objects are
    # grouped into chunks.
    outputSerializer = self.ctx._unbatched_serializer
    # Flush threshold: half of the memory limit (same unit as
    # get_used_memory(); presumably megabytes -- TODO confirm).
    limit = self._memory_limit() / 2

    def add_shuffle_key(split: int, iterator: Iterable[Tuple[K, V]]) -> Iterable[bytes]:
        # Emits an alternating stream: pack_long(bucket id), then the
        # serialized list of (k, v) pairs belonging to that bucket.
        # NOTE: the `split` parameter is not read; the name is reused below
        # as the loop variable over bucket ids.

        buckets = defaultdict(list)
        # c counts items since the last flush; batch is the item count that
        # forces a flush.
        c, batch = 0, min(10 * numPartitions, 1000)  # type: ignore[operator]

        for k, v in iterator:
            buckets[partitionFunc(k) % numPartitions].append((k, v))  # type: ignore[operator]
            c += 1

            # check used memory and avg size of chunk of objects
            # (`and` binds tighter than `or`: flush when memory is checked
            # and found over the limit, or whenever c exceeds batch)
            if c % 1000 == 0 and get_used_memory() > limit or c > batch:
                n, size = len(buckets), 0
                # Iterate over a copy of the keys: buckets are deleted as
                # they are flushed.
                for split in list(buckets.keys()):
                    yield pack_long(split)
                    d = outputSerializer.dumps(buckets[split])
                    del buckets[split]
                    yield d
                    size += len(d)

                # Average serialized chunk length, shifted right by 20 bits
                # (i.e. divided by 2**20).
                avg = int(size / n) >> 20
                # let 1M < avg < 10M
                if avg < 1:
                    # Chunks too small: allow 1.5x more items per flush.
                    batch = min(sys.maxsize, batch * 1.5)  # type: ignore[assignment]
                elif avg > 10:
                    # Chunks too large: flush 1.5x sooner, but never below 1.
                    batch = max(int(batch / 1.5), 1)
                c = 0

        # Final flush of whatever is still buffered.
        for split, items in buckets.items():
            yield pack_long(split)
            yield outputSerializer.dumps(items)

    keyed = self.mapPartitionsWithIndex(add_shuffle_key, preservesPartitioning=True)
    # add_shuffle_key already yields serialized bytes.
    keyed._bypass_serializer = True  # type: ignore[attr-defined]
    assert self.ctx._jvm is not None

    with SCCallSiteSync(self.context):
        pairRDD = self.ctx._jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
        # id(partitionFunc) is handed to the JVM partitioner; presumably it
        # only serves to tell partition functions apart -- TODO confirm.
        jpartitioner = self.ctx._jvm.PythonPartitioner(numPartitions, id(partitionFunc))
    jrdd = self.ctx._jvm.PythonRDD.valueOfPair(pairRDD.partitionBy(jpartitioner))
    rdd: "RDD[Tuple[K, V]]" = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer))
    # Record the partitioner so a later identical partitionBy can short-circuit.
    rdd.partitioner = partitioner
    return rdd
# TODO: add control over map-side aggregation
def combineByKey(
    self: "RDD[Tuple[K, V]]",
    createCombiner: Callable[[V], U],
    mergeValue: Callable[[U, V], U],
    mergeCombiners: Callable[[U, U], U],
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, U]]":
    """
    Generic function to combine the elements for each key using a custom
    set of aggregation functions.

    Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined
    type" C.

    To avoid memory allocation, both mergeValue and mergeCombiners are
    allowed to modify and return their first argument instead of creating
    a new C.

    In addition, users can control the partitioning of the output RDD.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    createCombiner : function
        a function to turn a V into a C
    mergeValue : function
        a function to merge a V into a C
    mergeCombiners : function
        a function to combine two C's into a single one
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        function to compute the partition index

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the aggregated result for each key

    See Also
    --------
    :meth:`RDD.reduceByKey`
    :meth:`RDD.aggregateByKey`
    :meth:`RDD.foldByKey`
    :meth:`RDD.groupByKey`

    Notes
    -----
    V and C can be different -- for example, one might group an RDD of type
    (Int, Int) into an RDD of type (Int, List[Int]).

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 2)])
    >>> def to_list(a):
    ...     return [a]
    ...
    >>> def append(a, b):
    ...     a.append(b)
    ...     return a
    ...
    >>> def extend(a, b):
    ...     a.extend(b)
    ...     return a
    ...
    >>> sorted(rdd.combineByKey(to_list, append, extend).collect())
    [('a', [1, 2]), ('b', [1])]
    """
    if numPartitions is None:
        numPartitions = self._defaultReducePartitions()

    ser = self.ctx.serializer
    mem_limit = self._memory_limit()
    aggregator = Aggregator(createCombiner, mergeValue, mergeCombiners)

    def combineLocally(iterator: Iterable[Tuple[K, V]]) -> Iterable[Tuple[K, U]]:
        # Before the shuffle: merge raw values, with the merger given 90%
        # of the memory limit.
        local_merger = ExternalMerger(aggregator, mem_limit * 0.9, ser)
        local_merger.mergeValues(iterator)
        return local_merger.items()

    def _mergeCombiners(iterator: Iterable[Tuple[K, U]]) -> Iterable[Tuple[K, U]]:
        # After the shuffle: merge the partial combiners of each key.
        final_merger = ExternalMerger(aggregator, mem_limit, ser)
        final_merger.mergeCombiners(iterator)
        return final_merger.items()

    return (
        self.mapPartitions(combineLocally, preservesPartitioning=True)
        .partitionBy(numPartitions, partitionFunc)
        .mapPartitions(_mergeCombiners, preservesPartitioning=True)
    )
def aggregateByKey(
    self: "RDD[Tuple[K, V]]",
    zeroValue: U,
    seqFunc: Callable[[U, V], U],
    combFunc: Callable[[U, U], U],
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, U]]":
    """
    Aggregate the values of each key, using given combine functions and a neutral
    "zero value". This function can return a different result type, U, than the type
    of the values in this RDD, V. Thus, we need one operation for merging a V into
    a U and one operation for merging two U's. The former operation is used for merging
    values within a partition, and the latter is used for merging values between
    partitions. To avoid memory allocation, both of these functions are
    allowed to modify and return their first argument instead of creating a new U.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    zeroValue : U
        the initial value for the accumulated result of each partition
    seqFunc : function
        a function to merge a V into a U
    combFunc : function
        a function to combine two U's into a single one
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        function to compute the partition index

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the aggregated result for each key

    See Also
    --------
    :meth:`RDD.reduceByKey`
    :meth:`RDD.combineByKey`
    :meth:`RDD.foldByKey`
    :meth:`RDD.groupByKey`

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 2)])
    >>> seqFunc = (lambda x, y: (x[0] + y, x[1] + 1))
    >>> combFunc = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
    >>> sorted(rdd.aggregateByKey((0, 0), seqFunc, combFunc).collect())
    [('a', (3, 2)), ('b', (1, 1))]
    """

    def start_from_zero(first_value: V) -> U:
        # Each new combiner starts from its own deep copy of zeroValue, so a
        # mutating seqFunc never touches the caller's object.
        return seqFunc(copy.deepcopy(zeroValue), first_value)

    return self.combineByKey(
        start_from_zero, seqFunc, combFunc, numPartitions, partitionFunc
    )
def foldByKey(
    self: "RDD[Tuple[K, V]]",
    zeroValue: V,
    func: Callable[[V, V], V],
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, V]]":
    """
    Merge the values for each key using an associative function "func"
    and a neutral "zeroValue" which may be added to the result an
    arbitrary number of times, and must not change the result
    (e.g., 0 for addition, or 1 for multiplication.).

    .. versionadded:: 1.1.0

    Parameters
    ----------
    zeroValue : V
        the initial value for the accumulated result of each partition
    func : function
        a function to combine two V's into a single one
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        function to compute the partition index

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the aggregated result for each key

    See Also
    --------
    :meth:`RDD.reduceByKey`
    :meth:`RDD.combineByKey`
    :meth:`RDD.aggregateByKey`
    :meth:`RDD.groupByKey`

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> from operator import add
    >>> sorted(rdd.foldByKey(0, add).collect())
    [('a', 2), ('b', 1)]
    """

    def fold_first(first_value: V) -> V:
        # A fresh deep copy of zeroValue seeds every new combiner.
        return func(copy.deepcopy(zeroValue), first_value)

    # The same function serves as both the value merger and the combiner merger.
    return self.combineByKey(fold_first, func, func, numPartitions, partitionFunc)
def _memory_limit(self) -> int:
    """
    Return the parsed value of the ``spark.python.worker.memory`` setting,
    falling back to ``"512m"`` when the setting is absent.
    """
    configured = self.ctx._conf.get("spark.python.worker.memory", "512m")
    return _parse_memory(configured)


# TODO: support variant with custom partitioner
def groupByKey(
    self: "RDD[Tuple[K, V]]",
    numPartitions: Optional[int] = None,
    partitionFunc: Callable[[K], int] = portable_hash,
) -> "RDD[Tuple[K, Iterable[V]]]":
    """
    Group the values for each key in the RDD into a single sequence.
    Hash-partitions the resulting RDD with numPartitions partitions.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`
    partitionFunc : function, optional, default `portable_hash`
        function to compute the partition index

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the grouped result for each key

    See Also
    --------
    :meth:`RDD.reduceByKey`
    :meth:`RDD.combineByKey`
    :meth:`RDD.aggregateByKey`
    :meth:`RDD.foldByKey`

    Notes
    -----
    If you are grouping in order to perform an aggregation (such as a
    sum or average) over each key, using reduceByKey or aggregateByKey will
    provide much better performance.

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.groupByKey().mapValues(len).collect())
    [('a', 2), ('b', 1)]
    >>> sorted(rdd.groupByKey().mapValues(list).collect())
    [('a', [1, 1]), ('b', [1])]
    """

    def start_group(value: V) -> List[V]:
        return [value]

    def add_to_group(group: List[V], value: V) -> List[V]:
        group.append(value)
        return group

    def join_groups(left: List[V], right: List[V]) -> List[V]:
        left.extend(right)
        return left

    limit = self._memory_limit()
    ser = self._jrdd_deserializer
    aggregator = Aggregator(start_group, add_to_group, join_groups)

    def combine_before_shuffle(
        iterator: Iterable[Tuple[K, V]]
    ) -> Iterable[Tuple[K, List[V]]]:
        # Map-side grouping, with the merger given 90% of the memory limit.
        local_merger = ExternalMerger(aggregator, limit * 0.9, ser)
        local_merger.mergeValues(iterator)
        return local_merger.items()

    def group_after_shuffle(
        it: Iterable[Tuple[K, List[V]]]
    ) -> Iterable[Tuple[K, List[V]]]:
        # Reduce-side grouping of the partial lists produced above.
        grouper = ExternalGroupBy(aggregator, limit, ser)
        grouper.mergeCombiners(it)
        return grouper.items()

    pre_grouped = self.mapPartitions(combine_before_shuffle, preservesPartitioning=True)
    repartitioned = pre_grouped.partitionBy(numPartitions, partitionFunc)
    grouped = repartitioned.mapPartitions(group_after_shuffle, True)
    return grouped.mapValues(ResultIterable)
def flatMapValues(
    self: "RDD[Tuple[K, V]]", f: Callable[[V], Iterable[U]]
) -> "RDD[Tuple[K, U]]":
    """
    Pass each value in the key-value pair RDD through a flatMap function
    without changing the keys; this also retains the original RDD's
    partitioning.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to turn a V into a sequence of U

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the flat-mapped value

    See Also
    --------
    :meth:`RDD.flatMap`
    :meth:`RDD.mapValues`

    Examples
    --------
    >>> rdd = sc.parallelize([("a", ["x", "y", "z"]), ("b", ["p", "r"])])
    >>> def f(x): return x
    ...
    >>> rdd.flatMapValues(f).collect()
    [('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]
    """

    def expand_pair(kv: Tuple[K, V]) -> Iterable[Tuple[K, U]]:
        # Pair the unchanged key with every element produced from the value.
        return ((kv[0], produced) for produced in f(kv[1]))

    return self.flatMap(expand_pair, preservesPartitioning=True)
def mapValues(self: "RDD[Tuple[K, V]]", f: Callable[[V], U]) -> "RDD[Tuple[K, U]]":
    """
    Pass each value in the key-value pair RDD through a map function
    without changing the keys; this also retains the original RDD's
    partitioning.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    f : function
        a function to turn a V into a U

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and the mapped value

    See Also
    --------
    :meth:`RDD.map`
    :meth:`RDD.flatMapValues`

    Examples
    --------
    >>> rdd = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])])
    >>> def f(x): return len(x)
    ...
    >>> rdd.mapValues(f).collect()
    [('a', 3), ('b', 1)]
    """

    def transform_pair(kv: Tuple[K, V]) -> Tuple[K, U]:
        # The key passes through untouched; only the value is transformed.
        return (kv[0], f(kv[1]))

    return self.map(transform_pair, preservesPartitioning=True)
def groupWith(  # type: ignore[misc]
    self: "RDD[Tuple[Any, Any]]",
    other: "RDD[Tuple[Any, Any]]",
    *others: "RDD[Tuple[Any, Any]]",
) -> "RDD[Tuple[Any, Tuple[ResultIterable[Any], ...]]]":
    """
    Alias for cogroup but with support for multiple RDDs.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    others : :class:`RDD`
        other :class:`RDD`\\s

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and cogrouped values

    See Also
    --------
    :meth:`RDD.cogroup`
    :meth:`RDD.join`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 5), ("b", 6)])
    >>> rdd2 = sc.parallelize([("a", 1), ("b", 4)])
    >>> rdd3 = sc.parallelize([("a", 2)])
    >>> rdd4 = sc.parallelize([("b", 42)])
    >>> [(x, tuple(map(list, y))) for x, y in
    ...     sorted(list(rdd1.groupWith(rdd2, rdd3, rdd4).collect()))]
    [('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))]
    """
    # This RDD comes first, followed by the others in the order given.
    participants = (self, other) + others
    return python_cogroup(participants, numPartitions=None)
# TODO: add variant with custom partitioner
def cogroup(
    self: "RDD[Tuple[K, V]]",
    other: "RDD[Tuple[K, U]]",
    numPartitions: Optional[int] = None,
) -> "RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[U]]]]":
    """
    For each key k in `self` or `other`, return a resulting RDD that
    contains a tuple with the list of values for that key in `self` as
    well as `other`.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        forwarded unchanged to the underlying cogroup helper

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the keys and cogrouped values

    See Also
    --------
    :meth:`RDD.groupWith`
    :meth:`RDD.join`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4)])
    >>> rdd2 = sc.parallelize([("a", 2)])
    >>> [(x, tuple(map(list, y))) for x, y in sorted(list(rdd1.cogroup(rdd2).collect()))]
    [('a', ([1], [2])), ('b', ([4], []))]
    """
    both_sides = (self, other)
    return python_cogroup(both_sides, numPartitions)
def sampleByKey(
    self: "RDD[Tuple[K, V]]",
    withReplacement: bool,
    fractions: Dict[K, Union[float, int]],
    seed: Optional[int] = None,
) -> "RDD[Tuple[K, V]]":
    """
    Return a subset of this RDD sampled by key (via stratified sampling).
    Create a sample of this RDD using variable sampling rates for
    different keys as specified by fractions, a key to sampling rate map.

    .. versionadded:: 0.7.0

    Parameters
    ----------
    withReplacement : bool
        whether to sample with or without replacement
    fractions : dict
        map of specific keys to sampling rates
    seed : int, optional
        seed for the random number generator

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the stratified sampling result

    See Also
    --------
    :meth:`RDD.sample`

    Examples
    --------
    >>> fractions = {"a": 0.2, "b": 0.1}
    >>> rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000)))
    >>> sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect())
    >>> 100 < len(sample["a"]) < 300 and 50 < len(sample["b"]) < 150
    True
    >>> max(sample["a"]) <= 999 and min(sample["a"]) >= 0
    True
    >>> max(sample["b"]) <= 999 and min(sample["b"]) >= 0
    True
    """
    # Every sampling rate is checked before any sampler is constructed.
    for rate in fractions.values():
        assert rate >= 0.0, "Negative fraction value: %s" % rate

    sampler = RDDStratifiedSampler(withReplacement, fractions, seed)
    return self.mapPartitionsWithIndex(sampler.func, True)
def subtractByKey(
    self: "RDD[Tuple[K, V]]",
    other: "RDD[Tuple[K, Any]]",
    numPartitions: Optional[int] = None,
) -> "RDD[Tuple[K, V]]":
    """
    Return each (key, value) pair in `self` that has no pair with matching
    key in `other`.

    .. versionadded:: 0.9.1

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` with the pairs from this whose keys are not in `other`

    See Also
    --------
    :meth:`RDD.subtract`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 2)])
    >>> rdd2 = sc.parallelize([("a", 3), ("c", None)])
    >>> sorted(rdd1.subtractByKey(rdd2).collect())
    [('b', 4), ('b', 5)]
    """

    def only_in_self(pair: Tuple[K, Tuple[V, Any]]) -> bool:
        # Keep a key when this RDD contributed values and `other` contributed none.
        _key, (mine, theirs) = pair
        return mine and not theirs  # type: ignore[return-value]

    cogrouped = self.cogroup(other, numPartitions)
    survivors = cogrouped.filter(only_in_self)  # type: ignore[arg-type]
    # Unpack the surviving groups back into individual (key, value) pairs.
    return survivors.flatMapValues(lambda groups: groups[0])
def subtract(
    self: "RDD[T]", other: "RDD[T]", numPartitions: Optional[int] = None
) -> "RDD[T]":
    """
    Return each value in `self` that is not contained in `other`.

    .. versionadded:: 0.9.1

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`
    numPartitions : int, optional
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` with the elements from this that are not in `other`

    See Also
    --------
    :meth:`RDD.subtractByKey`

    Examples
    --------
    >>> rdd1 = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)])
    >>> rdd2 = sc.parallelize([("a", 3), ("c", None)])
    >>> sorted(rdd1.subtract(rdd2).collect())
    [('a', 1), ('b', 4), ('b', 5)]
    """
    # Each element becomes a key so subtractByKey can do the work; the paired
    # `True` is only a placeholder value.
    excluded = other.map(lambda x: (x, True))
    candidates = self.map(lambda x: (x, True))
    return candidates.subtractByKey(excluded, numPartitions).keys()
def keyBy(self: "RDD[T]", f: Callable[[T], K]) -> "RDD[Tuple[K, T]]":
    """
    Creates tuples of the elements in this RDD by applying `f`.

    Each element ``x`` is turned into the pair ``(f(x), x)``.

    .. versionadded:: 0.9.1

    Parameters
    ----------
    f : function
        a function to compute the key

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` of ``(f(x), x)`` pairs, one per element of this RDD

    See Also
    --------
    :meth:`RDD.map`
    :meth:`RDD.keys`
    :meth:`RDD.values`

    Examples
    --------
    >>> rdd1 = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)
    >>> rdd2 = sc.parallelize(zip(range(0,5), range(0,5)))
    >>> [(x, list(map(list, y))) for x, y in sorted(rdd1.cogroup(rdd2).collect())]
    [(0, [[0], [0]]), (1, [[1], [1]]), (2, [[], [2]]), (3, [[], [3]]), (4, [[2], [4]])]
    """
    return self.map(lambda x: (f(x), x))
def repartition(self: "RDD[T]", numPartitions: int) -> "RDD[T]":
    """
    Return a new RDD that has exactly numPartitions partitions.

    Can increase or decrease the level of parallelism in this RDD.
    Internally, this uses a shuffle to redistribute data.
    If you are decreasing the number of partitions in this RDD, consider
    using `coalesce`, which can avoid performing a shuffle.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    numPartitions : int
        the number of partitions in new :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` with exactly numPartitions partitions

    See Also
    --------
    :meth:`RDD.coalesce`
    :meth:`RDD.partitionBy`
    :meth:`RDD.repartitionAndSortWithinPartitions`

    Examples
    --------
    >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
    >>> sorted(rdd.glom().collect())
    [[1], [2, 3], [4, 5], [6, 7]]
    >>> len(rdd.repartition(2).glom().collect())
    2
    >>> len(rdd.repartition(10).glom().collect())
    10
    """
    # Delegates to coalesce with the shuffle step always enabled.
    reshuffled = self.coalesce(numPartitions, shuffle=True)
    return reshuffled
def coalesce(self: "RDD[T]", numPartitions: int, shuffle: bool = False) -> "RDD[T]":
    """
    Return a new RDD that is reduced into `numPartitions` partitions.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    numPartitions : int
        the number of partitions in new :class:`RDD`
    shuffle : bool, optional, default False
        whether to add a shuffle step

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` that is reduced into `numPartitions` partitions

    Raises
    ------
    ValueError
        if `numPartitions` is not greater than zero

    See Also
    --------
    :meth:`RDD.repartition`

    Examples
    --------
    >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
    [[1], [2, 3], [4, 5]]
    >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
    [[1, 2, 3, 4, 5]]
    """
    if not numPartitions > 0:
        raise ValueError("Number of partitions must be positive.")

    source = self
    if shuffle:
        # Decrease the batch size in order to distribute evenly the elements across output
        # partitions. Otherwise, repartition will possibly produce highly skewed partitions.
        batchSize = min(10, self.ctx._batchSize or 1024)
        source = self._reserialize(BatchedSerializer(CPickleSerializer(), batchSize))

    # The deserializer is read before `_jrdd`, in the same order as the original.
    jrdd_deserializer = source._jrdd_deserializer
    jrdd = source._jrdd.coalesce(numPartitions, shuffle)
    return RDD(jrdd, self.ctx, jrdd_deserializer)
def zip(self: "RDD[T]", other: "RDD[U]") -> "RDD[Tuple[T, U]]":
    """
    Zips this RDD with another one, returning key-value pairs made of the
    first element in each RDD, the second element in each RDD, etc. Assumes
    that the two RDDs have the same number of partitions and the same
    number of elements in each partition (e.g. one was made through
    a map on the other).

    .. versionadded:: 1.0.0

    Parameters
    ----------
    other : :class:`RDD`
        another :class:`RDD`

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the zipped key-value pairs

    Raises
    ------
    ValueError
        if the two RDDs have a different number of partitions

    See Also
    --------
    :meth:`RDD.zipWithIndex`
    :meth:`RDD.zipWithUniqueId`

    Examples
    --------
    >>> rdd1 = sc.parallelize(range(0,5))
    >>> rdd2 = sc.parallelize(range(1000, 1005))
    >>> rdd1.zip(rdd2).collect()
    [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]
    """

    def batch_size_of(ser: Serializer) -> int:
        # A serializer that is not batched counts as batch size 1.
        return ser.batchSize if isinstance(ser, BatchedSerializer) else 1

    def rebatch(rdd: "RDD[V]", batchSize: int) -> "RDD[V]":
        return rdd._reserialize(BatchedSerializer(CPickleSerializer(), batchSize))

    left, right = self, other
    left_batch = batch_size_of(left._jrdd_deserializer)
    right_batch = batch_size_of(right._jrdd_deserializer)

    if left_batch != right_batch or not left_batch:
        # use the smallest batchSize for both of them
        batchSize = min(left_batch, right_batch)
        if batchSize <= 0:
            # auto batched or unlimited
            batchSize = 100
        right = rebatch(right, batchSize)
        left = rebatch(left, batchSize)

    if left.getNumPartitions() != right.getNumPartitions():
        raise ValueError("Can only zip with RDD which has the same number of partitions")

    # There will be an Exception in JVM if there are different number
    # of items in each partitions.
    pairRDD = left._jrdd.zip(right._jrdd)
    deserializer = PairDeserializer(left._jrdd_deserializer, right._jrdd_deserializer)
    return RDD(pairRDD, left.ctx, deserializer)
def zipWithIndex(self: "RDD[T]") -> "RDD[Tuple[T, int]]":
    """
    Zips this RDD with its element indices.

    The ordering is first based on the partition index and then the
    ordering of items within each partition. So the first item in
    the first partition gets index 0, and the last item in the last
    partition receives the largest index.

    This method needs to trigger a spark job when this RDD contains
    more than one partition.

    .. versionadded:: 1.2.0

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the zipped key-index pairs

    See Also
    --------
    :meth:`RDD.zip`
    :meth:`RDD.zipWithUniqueId`

    Examples
    --------
    >>> sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()
    [('a', 0), ('b', 1), ('c', 2), ('d', 3)]
    """
    # offsets[k] is the index given to the first element of partition k.
    offsets = [0]
    if self.getNumPartitions() > 1:
        sizes = self.mapPartitions(lambda it: [sum(1 for _ in it)]).collect()
        running_total = 0
        # The last partition's size is never needed as an offset.
        for size in sizes[:-1]:
            running_total += size
            offsets.append(running_total)

    def func(k: int, it: Iterable[T]) -> Iterable[Tuple[T, int]]:
        for index, element in enumerate(it, offsets[k]):
            yield element, index

    return self.mapPartitionsWithIndex(func)
def zipWithUniqueId(self: "RDD[T]") -> "RDD[Tuple[T, int]]":
    """
    Zips this RDD with generated unique Long ids.

    Items in the kth partition will get ids k, n+k, 2*n+k, ..., where
    n is the number of partitions. So there may exist gaps, but this
    method won't trigger a spark job, which is different from
    :meth:`zipWithIndex`.

    .. versionadded:: 1.2.0

    Returns
    -------
    :class:`RDD`
        a :class:`RDD` containing the zipped key-UniqueId pairs

    See Also
    --------
    :meth:`RDD.zip`
    :meth:`RDD.zipWithIndex`

    Examples
    --------
    >>> sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()
    [('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]
    """
    partition_count = self.getNumPartitions()

    def func(k: int, it: Iterable[T]) -> Iterable[Tuple[T, int]]:
        # The element at position p of partition k gets id p * n + k.
        for position, element in enumerate(it):
            yield element, position * partition_count + k

    return self.mapPartitionsWithIndex(func)
def name(self) -> Optional[str]:
    """
    Return the name of this RDD.

    .. versionadded:: 1.0.0

    Returns
    -------
    str
        :class:`RDD` name, or None when the underlying name is empty or unset

    See Also
    --------
    :meth:`RDD.setName`

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.name() == None
    True
    """
    # Any falsy name coming back from the Java RDD is reported as None.
    return self._jrdd.name() or None
def setName(self: "RDD[T]", name: str) -> "RDD[T]":
    """
    Assign a name to this RDD.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    name : str
        new name

    Returns
    -------
    :class:`RDD`
        the same :class:`RDD` with name updated

    See Also
    --------
    :meth:`RDD.name`

    Examples
    --------
    >>> rdd = sc.parallelize([1, 2])
    >>> rdd.setName('I am an RDD').name()
    'I am an RDD'
    """
    java_rdd = self._jrdd
    java_rdd.setName(name)
    # Returning self lets calls be chained.
    return self
def toDebugString(self) -> Optional[bytes]:
    """
    A description of this RDD and its recursive dependencies for debugging.

    .. versionadded:: 1.0.0

    Returns
    -------
    bytes
        debugging information of this :class:`RDD`, UTF-8 encoded, or None
        when the underlying description is empty or unset

    Examples
    --------
    >>> rdd = sc.range(5)
    >>> rdd.toDebugString()
    b'...PythonRDD...ParallelCollectionRDD...'
    """
    description = self._jrdd.toDebugString()
    if not description:
        return None
    return description.encode("utf-8")
def getStorageLevel(self) -> StorageLevel:
    """
    Get the RDD's current storage level.

    .. versionadded:: 1.0.0

    Returns
    -------
    :class:`StorageLevel`
        current :class:`StorageLevel`

    See Also
    --------
    :meth:`RDD.name`

    Examples
    --------
    >>> rdd = sc.parallelize([1,2])
    >>> rdd.getStorageLevel()
    StorageLevel(False, False, False, False, 1)
    >>> print(rdd.getStorageLevel())
    Serialized 1x Replicated
    """
    jlevel = self._jrdd.getStorageLevel()
    # Copy each flag of the Java storage level into the Python class.
    return StorageLevel(
        jlevel.useDisk(),
        jlevel.useMemory(),
        jlevel.useOffHeap(),
        jlevel.deserialized(),
        jlevel.replication(),
    )
def _defaultReducePartitions(self) -> int:
    """
    Returns the default number of partitions to use during reduce tasks (e.g., groupBy).
    If spark.default.parallelism is set, then we'll use the value from SparkContext
    defaultParallelism, otherwise we'll use the number of partitions in this RDD.

    This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce
    the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will
    be inherent.
    """
    parallelism_configured = self.ctx._conf.contains("spark.default.parallelism")
    if not parallelism_configured:
        return self.getNumPartitions()
    return self.ctx.defaultParallelism
def lookup(self: "RDD[Tuple[K, V]]", key: K) -> List[V]:
    """
    Return the list of values in the RDD for key `key`. This operation
    is done efficiently if the RDD has a known partitioner by only
    searching the partition that the key maps to.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    key : K
        the key to look up

    Returns
    -------
    list
        the list of values in the :class:`RDD` for key `key`

    Examples
    --------
    >>> l = range(1000)
    >>> rdd = sc.parallelize(zip(l, l), 10)
    >>> rdd.lookup(42)  # slow
    [42]
    >>> sorted = rdd.sortByKey()
    >>> sorted.lookup(42)  # fast
    [42]
    >>> sorted.lookup(1024)
    []
    >>> rdd2 = sc.parallelize([(('a', 'b'), 'c')]).groupByKey()
    >>> list(rdd2.lookup(('a', 'b'))[0])
    ['c']
    """
    matching = self.filter(lambda kv: kv[0] == key).values()

    if self.partitioner is None:
        # No partitioner known: collect the matches from every partition.
        return matching.collect()

    # Run the job only on the partition that the partitioner assigns to `key`.
    target_partition = self.partitioner(key)
    return self.ctx.runJob(matching, lambda x: x, [target_partition])
def _to_java_object_rdd(self) -> "JavaObject":
    """Return a JavaRDD of Object by unpickling

    It will convert each Python object into Java object by Pickle, whenever the
    RDD is serialized in batch or not.
    """
    pickled = self._pickled()
    jvm = self.ctx._jvm
    assert jvm is not None
    return jvm.SerDeUtil.pythonToJava(pickled._jrdd, True)
def countApprox(self, timeout: int, confidence: float = 0.95) -> int:
    """
    Approximate version of count() that returns a potentially incomplete
    result within a timeout, even if not all tasks have finished.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    timeout : int
        maximum time to wait for the job, in milliseconds
    confidence : float
        the desired statistical confidence in the result

    Returns
    -------
    int
        a potentially incomplete result, with error bounds

    See Also
    --------
    :meth:`RDD.count`

    Examples
    --------
    >>> rdd = sc.parallelize(range(1000), 10)
    >>> rdd.countApprox(1000, 1.0)
    1000
    """
    # Reduce each partition to its element count (as a float), then reuse
    # the approximate sum over those counts.
    per_partition_counts = self.mapPartitions(
        lambda it: [float(sum(1 for _ in it))]
    )
    approximate_total = per_partition_counts.sumApprox(timeout, confidence)
    return int(approximate_total)
def sumApprox(
    self: "RDD[Union[float, int]]", timeout: int, confidence: float = 0.95
) -> BoundedFloat:
    """
    Approximate operation to return the sum within a timeout
    or meet the confidence.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    timeout : int
        maximum time to wait for the job, in milliseconds
    confidence : float
        the desired statistical confidence in the result

    Returns
    -------
    :class:`BoundedFloat`
        a potentially incomplete result, with error bounds

    See Also
    --------
    :meth:`RDD.sum`

    Examples
    --------
    >>> rdd = sc.parallelize(range(1000), 10)
    >>> r = sum(range(1000))
    >>> abs(rdd.sumApprox(1000) - r) / r < 0.05
    True
    """
    # Each partition is collapsed to a single float sum before being handed to Java.
    partial_sums = self.mapPartitions(lambda it: [float(sum(it))])
    jrdd = partial_sums._to_java_object_rdd()
    assert self.ctx._jvm is not None
    java_doubles = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())
    outcome = java_doubles.sumApprox(timeout, confidence).getFinalValue()
    return BoundedFloat(
        outcome.mean(), outcome.confidence(), outcome.low(), outcome.high()
    )
def meanApprox(
    self: "RDD[Union[float, int]]", timeout: int, confidence: float = 0.95
) -> BoundedFloat:
    """
    Approximate operation to return the mean within a timeout
    or meet the confidence.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    timeout : int
        maximum time to wait for the job, in milliseconds
    confidence : float
        the desired statistical confidence in the result

    Returns
    -------
    :class:`BoundedFloat`
        a potentially incomplete result, with error bounds

    See Also
    --------
    :meth:`RDD.mean`

    Examples
    --------
    >>> rdd = sc.parallelize(range(1000), 10)
    >>> r = sum(range(1000)) / 1000.0
    >>> abs(rdd.meanApprox(1000) - r) / r < 0.05
    True
    """
    # Every element is converted to float before being handed to Java.
    as_floats = self.map(float)
    jrdd = as_floats._to_java_object_rdd()
    assert self.ctx._jvm is not None
    java_doubles = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())
    outcome = java_doubles.meanApprox(timeout, confidence).getFinalValue()
    return BoundedFloat(
        outcome.mean(), outcome.confidence(), outcome.low(), outcome.high()
    )
def countApproxDistinct(self: "RDD[T]", relativeSD: float = 0.05) -> int:
    """
    Return approximate number of distinct elements in the RDD.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    relativeSD : float, optional
        Relative accuracy. Smaller values create
        counters that require more space.
        It must be greater than 0.000017.

    Returns
    -------
    int
        approximate number of distinct elements

    Raises
    ------
    ValueError
        if `relativeSD` is below 0.000017

    See Also
    --------
    :meth:`RDD.distinct`

    Notes
    -----
    The algorithm used is based on streamlib's implementation of
    `"HyperLogLog in Practice: Algorithmic Engineering of a State
    of The Art Cardinality Estimation Algorithm", available here
    <https://doi.org/10.1145/2452376.2452456>`_.

    Examples
    --------
    >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
    >>> 900 < n < 1100
    True
    >>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()
    >>> 16 < n < 24
    True
    """
    if relativeSD < 0.000017:
        raise ValueError("relativeSD should be greater than 0.000017")

    def to_32bit_hash(element: T) -> int:
        # the hash space in Java is 2^32
        return portable_hash(element) & 0xFFFFFFFF

    hashed = self.map(to_32bit_hash)
    return hashed._to_java_object_rdd().countApproxDistinct(relativeSD)
def toLocalIterator(self: "RDD[T]", prefetchPartitions: bool = False) -> Iterator[T]:
    """
    Return an iterator that contains all of the elements in this RDD.
    The iterator will consume as much memory as the largest partition in this RDD.
    With prefetch it may consume up to the memory of the 2 largest partitions.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    prefetchPartitions : bool, optional
        If Spark should pre-fetch the next partition
        before it is needed.

    Returns
    -------
    :class:`collections.abc.Iterator`
        an iterator that contains all of the elements in this :class:`RDD`

    See Also
    --------
    :meth:`RDD.collect`
    :meth:`pyspark.sql.DataFrame.toLocalIterator`

    Examples
    --------
    >>> rdd = sc.parallelize(range(10))
    >>> [x for x in rdd.toLocalIterator()]
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    """
    assert self.ctx._jvm is not None

    with SCCallSiteSync(self.context):
        python_rdd = self.ctx._jvm.PythonRDD
        sock_info = python_rdd.toLocalIteratorAndServe(
            self._jrdd.rdd(), prefetchPartitions
        )
    # The results are read back from the socket with this RDD's deserializer.
    return _local_iterator_from_socket(sock_info, self._jrdd_deserializer)
def barrier(self: "RDD[T]") -> "RDDBarrier[T]":
    """
    Marks the current stage as a barrier stage, where Spark must launch all tasks together.
    In case of a task failure, instead of only restarting the failed task, Spark will abort the
    entire stage and relaunch all tasks for this stage.
    The barrier execution mode feature is experimental and it only handles limited scenarios.
    Please read the linked SPIP and design docs to understand the limitations and future plans.

    .. versionadded:: 2.4.0

    Returns
    -------
    :class:`RDDBarrier`
        instance that provides actions within a barrier stage.

    See Also
    --------
    :class:`pyspark.BarrierTaskContext`

    Notes
    -----
    For additional information see

    - `SPIP: Barrier Execution Mode <http://jira.apache.org/jira/browse/SPARK-24374>`_
    - `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_

    This API is experimental
    """
    wrapped = RDDBarrier(self)
    return wrapped
def _is_barrier(self) -> bool:
    """
    Whether this RDD is in a barrier stage.
    """
    # The answer comes from the underlying Java RDD.
    java_rdd = self._jrdd.rdd()
    return java_rdd.isBarrier()
def withResources(self: "RDD[T]", profile: ResourceProfile) -> "RDD[T]":
    """
    Specify a :class:`pyspark.resource.ResourceProfile` to use when calculating this RDD.
    This is only supported on certain cluster managers and currently requires dynamic
    allocation to be enabled. It will result in new executors with the resources specified
    being acquired to calculate the RDD.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    profile : :class:`pyspark.resource.ResourceProfile`
        a resource profile

    Returns
    -------
    :class:`RDD`
        the same :class:`RDD` with user specified profile

    See Also
    --------
    :meth:`RDD.getResourceProfile`

    Notes
    -----
    This API is experimental
    """
    self.has_resource_profile = True

    jrp = profile._java_resource_profile
    if jrp is None:
        # No Java-side profile exists yet, so build one from the profile's
        # executor and task resource requests.
        assert self.ctx._jvm is not None
        jvm = self.ctx._jvm
        builder = jvm.org.apache.spark.resource.ResourceProfileBuilder()
        ereqs = ExecutorResourceRequests(jvm, profile._executor_resource_requests)
        treqs = TaskResourceRequests(jvm, profile._task_resource_requests)
        builder.require(ereqs._java_executor_resource_requests)
        builder.require(treqs._java_task_resource_requests)
        jrp = builder.build()

    self._jrdd.withResources(jrp)
    return self
def getResourceProfile(self) -> Optional[ResourceProfile]:
    """
    Get the :class:`pyspark.resource.ResourceProfile` specified with this RDD or None
    if it wasn't specified.

    .. versionadded:: 3.1.0

    Returns
    -------
    :class:`pyspark.resource.ResourceProfile`
        The user specified profile or None if none were specified

    See Also
    --------
    :meth:`RDD.withResources`

    Notes
    -----
    This API is experimental
    """
    java_profile = self._jrdd.getResourceProfile()
    if java_profile is None:
        return None
    # Wrap the Java-side profile in its Python counterpart.
    return ResourceProfile(_java_resource_profile=java_profile)
def _prepare_for_python_RDD(
    sc: "SparkContext", command: Any
) -> Tuple[bytes, Any, Any, Any]:
    """
    Pickle `command` and collect what is needed to ship it to the JVM.

    Returns a tuple of the pickled command, the Java broadcast handles taken
    from ``sc._pickled_broadcast_vars``, ``sc.environment`` and
    ``sc._python_includes``.
    """
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    payload = ser.dumps(command)
    assert sc._jvm is not None
    threshold = sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc)  # Default 1M
    if len(payload) > threshold:
        # The broadcast will have same life cycle as created PythonRDD
        broadcast = sc.broadcast(payload)
        payload = ser.dumps(broadcast)
    java_broadcasts = [bv._jbroadcast for bv in sc._pickled_broadcast_vars]
    # Emptied after reading so the handles are handed out only once.
    sc._pickled_broadcast_vars.clear()
    return payload, java_broadcasts, sc.environment, sc._python_includes


def _wrap_function(
    sc: "SparkContext",
    func: Callable,
    deserializer: Any,
    serializer: Any,
    profiler: Any = None,
) -> "JavaObject":
    """
    Bundle `func`, `profiler` and the two serializers into a JVM
    ``SimplePythonFunction``.
    """
    assert deserializer, "deserializer should not be empty"
    assert serializer, "serializer should not be empty"
    bundle = (func, profiler, deserializer, serializer)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, bundle)
    assert sc._jvm is not None
    return sc._jvm.SimplePythonFunction(
        bytearray(pickled_command),
        env,
        includes,
        sc.pythonExec,
        sc.pythonVer,
        broadcast_vars,
        sc._javaAccumulator,
    )
class RDDBarrier(Generic[T]):

    """
    Wraps an RDD in a barrier stage, which forces Spark to launch tasks of this stage together.
    :class:`RDDBarrier` instances are created by :meth:`RDD.barrier`.

    .. versionadded:: 2.4.0

    Notes
    -----
    This API is experimental
    """

    def __init__(self, rdd: RDD[T]):
        # The wrapped RDD whose partitions the barrier-stage functions run on.
        self.rdd = rdd

    def mapPartitions(
        self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = False
    ) -> RDD[U]:
        """
        Returns a new RDD by applying a function to each partition of the wrapped RDD,
        where tasks are launched together in a barrier stage.
        The interface is the same as :meth:`RDD.mapPartitions`.
        Please see the API doc there.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        f : function
           a function to run on each partition of the RDD
        preservesPartitioning : bool, optional, default False
            indicates whether the input function preserves the partitioner,
            which should be False unless this is a pair RDD and the input

        Returns
        -------
        :class:`RDD`
            a new :class:`RDD` by applying a function to each partition

        See Also
        --------
        :meth:`RDD.mapPartitions`

        Notes
        -----
        This API is experimental

        Examples
        --------
        >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
        >>> def f(iterator): yield sum(iterator)
        ...
        >>> barrier = rdd.barrier()
        >>> barrier
        <pyspark.rdd.RDDBarrier ...>
        >>> barrier.mapPartitions(f).collect()
        [3, 7]
        """

        def ignore_index(_split: int, partition: Iterable[T]) -> Iterable[U]:
            # PipelinedRDD passes the partition index first; `f` takes only the iterator.
            return f(partition)

        return PipelinedRDD(
            self.rdd, ignore_index, preservesPartitioning, isFromBarrier=True
        )

    def mapPartitionsWithIndex(
        self,
        f: Callable[[int, Iterable[T]], Iterable[U]],
        preservesPartitioning: bool = False,
    ) -> RDD[U]:
        """
        Returns a new RDD by applying a function to each partition of the wrapped RDD, while
        tracking the index of the original partition. And all tasks are launched together
        in a barrier stage.
        The interface is the same as :meth:`RDD.mapPartitionsWithIndex`.
        Please see the API doc there.

        .. versionadded:: 3.0.0

        Parameters
        ----------
        f : function
           a function to run on each partition of the RDD
        preservesPartitioning : bool, optional, default False
            indicates whether the input function preserves the partitioner,
            which should be False unless this is a pair RDD and the input

        Returns
        -------
        :class:`RDD`
            a new :class:`RDD` by applying a function to each partition

        See Also
        --------
        :meth:`RDD.mapPartitionsWithIndex`

        Notes
        -----
        This API is experimental

        Examples
        --------
        >>> rdd = sc.parallelize([1, 2, 3, 4], 4)
        >>> def f(splitIndex, iterator): yield splitIndex
        ...
        >>> barrier = rdd.barrier()
        >>> barrier
        <pyspark.rdd.RDDBarrier ...>
        >>> barrier.mapPartitionsWithIndex(f).sum()
        6
        """
        # `f` already has the (index, iterator) signature, so it is passed through as is.
        wrapped = self.rdd
        return PipelinedRDD(wrapped, f, preservesPartitioning, isFromBarrier=True)
class PipelinedRDD(RDD[U], Generic[T, U]):

    """
    An RDD whose per-partition function is composed with that of its parent
    whenever the parent is itself a pipelinable :class:`PipelinedRDD`.

    Examples
    --------
    Pipelined maps:

    >>> rdd = sc.parallelize([1, 2, 3, 4])
    >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()
    [4, 8, 12, 16]
    >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect()
    [4, 8, 12, 16]

    Pipelined reduces:

    >>> from operator import add
    >>> rdd.map(lambda x: 2 * x).reduce(add)
    20
    >>> rdd.flatMap(lambda x: [x, x]).reduce(add)
    20
    """

    def __init__(
        self,
        prev: RDD[T],
        func: Callable[[int, Iterable[T]], Iterable[U]],
        preservesPartitioning: bool = False,
        isFromBarrier: bool = False,
    ):
        """
        Parameters
        ----------
        prev : :class:`RDD`
            the parent RDD this transformation is applied to
        func : function
            callable taking ``(partition index, iterator)`` and returning an iterable
        preservesPartitioning : bool, optional, default False
            whether ``func`` keeps the parent's partitioner valid
        isFromBarrier : bool, optional, default False
            whether this RDD was created through a barrier stage
        """
        if not isinstance(prev, PipelinedRDD) or not prev._is_pipelinable():
            # This transformation is the first in its stage:
            self.func = func
            self.preservesPartitioning = preservesPartitioning
            self._prev_jrdd = prev._jrdd
            self._prev_jrdd_deserializer = prev._jrdd_deserializer
        else:
            prev_func: Callable[[int, Iterable[V]], Iterable[T]] = prev.func

            # Fuse the parent's function with ours so that both are applied
            # by a single composed callable.
            def pipeline_func(split: int, iterator: Iterable[V]) -> Iterable[U]:
                return func(split, prev_func(split, iterator))

            self.func = pipeline_func
            # Partitioning survives only if every fused step preserves it.
            self.preservesPartitioning = prev.preservesPartitioning and preservesPartitioning
            self._prev_jrdd = prev._prev_jrdd  # maintain the pipeline
            self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer
        self.is_cached = False
        self.has_resource_profile = False
        self.is_checkpointed = False
        self.ctx = prev.ctx
        self.prev = prev
        # Cache for the JVM-side RDD; populated on first access of ``_jrdd``.
        self._jrdd_val: Optional["JavaObject"] = None
        self._id = None
        self._jrdd_deserializer = self.ctx.serializer
        self._bypass_serializer = False
        self.partitioner = prev.partitioner if self.preservesPartitioning else None
        self.is_barrier = isFromBarrier or prev._is_barrier()

    def getNumPartitions(self) -> int:
        """Return the number of partitions of the Java RDD this pipeline starts from."""
        return self._prev_jrdd.partitions().size()

    @property
    def _jrdd(self) -> "JavaObject":
        """
        Return the JVM-side RDD for this pipeline, creating it on first access.

        The result is stored in ``_jrdd_val`` and returned directly on later
        accesses. When a profiler is created, it is registered with the
        context's profiler collector under the new RDD's id.
        """
        if self._jrdd_val:
            return self._jrdd_val
        if self._bypass_serializer:
            self._jrdd_deserializer = NoOpSerializer()

        # A profiler is attached only when a collector exists and profiling
        # is switched on in the configuration.
        if (
            self.ctx.profiler_collector
            and self.ctx._conf.get("spark.python.profile", "false") == "true"
        ):
            profiler = self.ctx.profiler_collector.new_profiler(self.ctx)
        else:
            profiler = None

        wrapped_func = _wrap_function(
            self.ctx, self.func, self._prev_jrdd_deserializer, self._jrdd_deserializer, profiler
        )

        assert self.ctx._jvm is not None
        python_rdd = self.ctx._jvm.PythonRDD(
            self._prev_jrdd.rdd(), wrapped_func, self.preservesPartitioning, self.is_barrier
        )
        self._jrdd_val = python_rdd.asJavaRDD()

        if profiler:
            assert self._jrdd_val is not None
            self._id = self._jrdd_val.id()
            self.ctx.profiler_collector.add_profiler(self._id, profiler)
        return self._jrdd_val

    def id(self) -> int:
        """Return the id of the JVM-side RDD, fetching and caching it on first call."""
        if self._id is None:
            self._id = self._jrdd.id()
        return self._id

    def _is_pipelinable(self) -> bool:
        """Return True unless this RDD is cached, checkpointed or has a resource profile."""
        return not (self.is_cached or self.is_checkpointed or self.has_resource_profile)

    def _is_barrier(self) -> bool:
        """Return the ``is_barrier`` flag computed in ``__init__``."""
        return self.is_barrier


def _test() -> None:
    """
    Run this module's doctests against a local SparkContext.

    A temporary directory is used as the checkpoint directory. The context is
    stopped and the directory removed even if setting up or running the
    doctests raises. The process exits with status -1 when any doctest fails.
    """
    import doctest
    import tempfile
    from pyspark.context import SparkContext

    globs = globals().copy()
    # The ``with`` block removes the directory exactly once, after the
    # context has been stopped.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        sc = SparkContext("local[4]", "PythonTest")
        try:
            globs["sc"] = sc
            sc.setCheckpointDir(checkpoint_dir)
            failure_count, _ = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        finally:
            sc.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()