diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 9b477ffecfd23..e1d87e99c8a5e 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -23,6 +23,12 @@ from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import JavaLoader, JavaSaveable +from typing import Dict, Optional, Tuple, Union, overload, TYPE_CHECKING +from pyspark.rdd import RDD + +if TYPE_CHECKING: + from pyspark.mllib._typing import VectorLike + __all__ = [ "DecisionTreeModel", @@ -40,7 +46,15 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): .. versionadded:: 1.3.0 """ - def predict(self, x): + @overload + def predict(self, x: "VectorLike") -> float: + ... + + @overload + def predict(self, x: RDD["VectorLike"]) -> RDD[float]: + ... + + def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]: """ Predict values for a single data point or an RDD of points using the model trained. @@ -60,37 +74,45 @@ def predict(self, x): return self.call("predict", _convert_to_vector(x)) @since("1.3.0") - def numTrees(self): + def numTrees(self) -> int: """ Get number of trees in ensemble. """ return self.call("numTrees") @since("1.3.0") - def totalNumNodes(self): + def totalNumNodes(self) -> int: """ Get total number of nodes, summed over all trees in the ensemble. """ return self.call("totalNumNodes") - def __repr__(self): + def __repr__(self) -> str: """Summary of model""" return self._java_model.toString() @since("1.3.0") - def toDebugString(self): + def toDebugString(self) -> str: """Full model""" return self._java_model.toDebugString() -class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): +class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader["DecisionTreeModel"]): """ A decision tree model for classification or regression. .. versionadded:: 1.1.0 """ - def predict(self, x): + @overload + def predict(self, x: "VectorLike") -> float: + ... + + @overload + def predict(self, x: RDD["VectorLike"]) -> RDD[float]: + ... + + def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]: """ Predict the label of one or more examples. @@ -115,29 +137,29 @@ def predict(self, x): return self.call("predict", _convert_to_vector(x)) @since("1.1.0") - def numNodes(self): + def numNodes(self) -> int: """Get number of nodes in tree, including leaf nodes.""" return self._java_model.numNodes() @since("1.1.0") - def depth(self): + def depth(self) -> int: """ Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1 means 1 internal node + 2 leaf nodes). """ return self._java_model.depth() - def __repr__(self): + def __repr__(self) -> str: """summary of model.""" return self._java_model.toString() @since("1.2.0") - def toDebugString(self): + def toDebugString(self) -> str: """full model.""" return self._java_model.toDebugString() @classmethod - def _java_loader_class(cls): + def _java_loader_class(cls) -> str: return "org.apache.spark.mllib.tree.model.DecisionTreeModel" @@ -152,16 +174,16 @@ class DecisionTree: @classmethod def _train( cls, - data, - type, - numClasses, - features, - impurity="gini", - maxDepth=5, - maxBins=32, - minInstancesPerNode=1, - minInfoGain=0.0, - ): + data: RDD[LabeledPoint], + type: str, + numClasses: int, + features: Dict[int, int], + impurity: str = "gini", + maxDepth: int = 5, + maxBins: int = 32, + minInstancesPerNode: int = 1, + minInfoGain: float = 0.0, + ) -> DecisionTreeModel: first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" model = callMLlibFunc( @@ -181,15 +203,15 @@ def _train( @classmethod def trainClassifier( cls, - data, - numClasses, - categoricalFeaturesInfo, - impurity="gini", - maxDepth=5, - maxBins=32, - minInstancesPerNode=1, - minInfoGain=0.0, - ): + data: RDD[LabeledPoint], + numClasses: int, + categoricalFeaturesInfo: Dict[int, int], + impurity: str = "gini", + maxDepth: int = 5, + maxBins: int = 32, + minInstancesPerNode: int = 1, + minInfoGain: float = 0.0, + ) -> DecisionTreeModel: """ Train a decision tree model for classification. @@ -276,14 +298,14 @@ def trainClassifier( @since("1.1.0") def trainRegressor( cls, - data, - categoricalFeaturesInfo, - impurity="variance", - maxDepth=5, - maxBins=32, - minInstancesPerNode=1, - minInfoGain=0.0, - ): + data: RDD[LabeledPoint], + categoricalFeaturesInfo: Dict[int, int], + impurity: str = "variance", + maxDepth: int = 5, + maxBins: int = 32, + minInstancesPerNode: int = 1, + minInfoGain: float = 0.0, + ) -> DecisionTreeModel: """ Train a decision tree model for regression. @@ -354,7 +376,7 @@ def trainRegressor( @inherit_doc -class RandomForestModel(TreeEnsembleModel, JavaLoader): +class RandomForestModel(TreeEnsembleModel, JavaLoader["RandomForestModel"]): """ Represents a random forest model. @@ -362,7 +384,7 @@ class RandomForestModel(TreeEnsembleModel, JavaLoader): """ @classmethod - def _java_loader_class(cls): + def _java_loader_class(cls) -> str: return "org.apache.spark.mllib.tree.model.RandomForestModel" @@ -374,22 +396,22 @@ class RandomForest: .. versionadded:: 1.2.0 """ - supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") + supportedFeatureSubsetStrategies: Tuple[str, ...] = ("auto", "all", "sqrt", "log2", "onethird") @classmethod def _train( cls, - data, - algo, - numClasses, - categoricalFeaturesInfo, - numTrees, - featureSubsetStrategy, - impurity, - maxDepth, - maxBins, - seed, - ): + data: RDD[LabeledPoint], + algo: str, + numClasses: int, + categoricalFeaturesInfo: Dict[int, int], + numTrees: int, + featureSubsetStrategy: str, + impurity: str, + maxDepth: int, + maxBins: int, + seed: Optional[int], + ) -> RandomForestModel: first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies: @@ -414,16 +436,16 @@ def _train( @classmethod def trainClassifier( cls, - data, - numClasses, - categoricalFeaturesInfo, - numTrees, - featureSubsetStrategy="auto", - impurity="gini", - maxDepth=4, - maxBins=32, - seed=None, - ): + data: RDD[LabeledPoint], + numClasses: int, + categoricalFeaturesInfo: Dict[int, int], + numTrees: int, + featureSubsetStrategy: str = "auto", + impurity: str = "gini", + maxDepth: int = 4, + maxBins: int = 32, + seed: Optional[int] = None, + ) -> RandomForestModel: """ Train a random forest model for binary or multiclass classification. @@ -530,15 +552,15 @@ def trainClassifier( @classmethod def trainRegressor( cls, - data, - categoricalFeaturesInfo, - numTrees, - featureSubsetStrategy="auto", - impurity="variance", - maxDepth=4, - maxBins=32, - seed=None, - ): + data: RDD[LabeledPoint], + categoricalFeaturesInfo: Dict[int, int], + numTrees: int, + featureSubsetStrategy: str = "auto", + impurity: str = "variance", + maxDepth: int = 4, + maxBins: int = 32, + seed: Optional[int] = None, + ) -> RandomForestModel: """ Train a random forest model for regression. @@ -625,7 +647,7 @@ def trainRegressor( @inherit_doc -class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader): +class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader["GradientBoostedTreesModel"]): """ Represents a gradient-boosted tree model. @@ -633,7 +655,7 @@ class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader): """ @classmethod - def _java_loader_class(cls): + def _java_loader_class(cls) -> str: return "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel" @@ -648,15 +670,15 @@ class GradientBoostedTrees: @classmethod def _train( cls, - data, - algo, - categoricalFeaturesInfo, - loss, - numIterations, - learningRate, - maxDepth, - maxBins, - ): + data: RDD[LabeledPoint], + algo: str, + categoricalFeaturesInfo: Dict[int, int], + loss: str, + numIterations: int, + learningRate: float, + maxDepth: int, + maxBins: int, + ) -> GradientBoostedTreesModel: first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" model = callMLlibFunc( @@ -675,14 +697,14 @@ def _train( @classmethod def trainClassifier( cls, - data, - categoricalFeaturesInfo, - loss="logLoss", - numIterations=100, - learningRate=0.1, - maxDepth=3, - maxBins=32, - ): + data: RDD[LabeledPoint], + categoricalFeaturesInfo: Dict[int, int], + loss: str = "logLoss", + numIterations: int = 100, + learningRate: float = 0.1, + maxDepth: int = 3, + maxBins: int = 32, + ) -> GradientBoostedTreesModel: """ Train a gradient-boosted trees model for classification. @@ -765,14 +787,14 @@ def trainClassifier( @classmethod def trainRegressor( cls, - data, - categoricalFeaturesInfo, - loss="leastSquaresError", - numIterations=100, - learningRate=0.1, - maxDepth=3, - maxBins=32, - ): + data: RDD[LabeledPoint], + categoricalFeaturesInfo: Dict[int, int], + loss: str = "leastSquaresError", + numIterations: int = 100, + learningRate: float = 0.1, + maxDepth: int = 3, + maxBins: int = 32, + ) -> GradientBoostedTreesModel: """ Train a gradient-boosted trees model for regression. @@ -851,7 +873,7 @@ def trainRegressor( ) -def _test(): +def _test() -> None: import doctest globs = globals().copy() diff --git a/python/pyspark/mllib/tree.pyi b/python/pyspark/mllib/tree.pyi deleted file mode 100644 index fedb494f19062..0000000000000 --- a/python/pyspark/mllib/tree.pyi +++ /dev/null @@ -1,124 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import overload -from typing import Dict, Optional, Tuple -from pyspark.mllib._typing import VectorLike -from pyspark.rdd import RDD -from pyspark.mllib.common import JavaModelWrapper -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import JavaLoader, JavaSaveable - -class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): - @overload - def predict(self, x: VectorLike) -> float: ... - @overload - def predict(self, x: RDD[VectorLike]) -> RDD[VectorLike]: ... - def numTrees(self) -> int: ... - def totalNumNodes(self) -> int: ... - def toDebugString(self) -> str: ... - -class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader[DecisionTreeModel]): - @overload - def predict(self, x: VectorLike) -> float: ... - @overload - def predict(self, x: RDD[VectorLike]) -> RDD[VectorLike]: ... - def numNodes(self) -> int: ... - def depth(self) -> int: ... - def toDebugString(self) -> str: ... - -class DecisionTree: - @classmethod - def trainClassifier( - cls, - data: RDD[LabeledPoint], - numClasses: int, - categoricalFeaturesInfo: Dict[int, int], - impurity: str = ..., - maxDepth: int = ..., - maxBins: int = ..., - minInstancesPerNode: int = ..., - minInfoGain: float = ..., - ) -> DecisionTreeModel: ... - @classmethod - def trainRegressor( - cls, - data: RDD[LabeledPoint], - categoricalFeaturesInfo: Dict[int, int], - impurity: str = ..., - maxDepth: int = ..., - maxBins: int = ..., - minInstancesPerNode: int = ..., - minInfoGain: float = ..., - ) -> DecisionTreeModel: ... - -class RandomForestModel(TreeEnsembleModel, JavaLoader[RandomForestModel]): ... - -class RandomForest: - supportedFeatureSubsetStrategies: Tuple[str, ...] - @classmethod - def trainClassifier( - cls, - data: RDD[LabeledPoint], - numClasses: int, - categoricalFeaturesInfo: Dict[int, int], - numTrees: int, - featureSubsetStrategy: str = ..., - impurity: str = ..., - maxDepth: int = ..., - maxBins: int = ..., - seed: Optional[int] = ..., - ) -> RandomForestModel: ... - @classmethod - def trainRegressor( - cls, - data: RDD[LabeledPoint], - categoricalFeaturesInfo: Dict[int, int], - numTrees: int, - featureSubsetStrategy: str = ..., - impurity: str = ..., - maxDepth: int = ..., - maxBins: int = ..., - seed: Optional[int] = ..., - ) -> RandomForestModel: ... - -class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader[GradientBoostedTreesModel]): ... - -class GradientBoostedTrees: - @classmethod - def trainClassifier( - cls, - data: RDD[LabeledPoint], - categoricalFeaturesInfo: Dict[int, int], - loss: str = ..., - numIterations: int = ..., - learningRate: float = ..., - maxDepth: int = ..., - maxBins: int = ..., - ) -> GradientBoostedTreesModel: ... - @classmethod - def trainRegressor( - cls, - data: RDD[LabeledPoint], - categoricalFeaturesInfo: Dict[int, int], - loss: str = ..., - numIterations: int = ..., - learningRate: float = ..., - maxDepth: int = ..., - maxBins: int = ..., - ) -> GradientBoostedTreesModel: ...