Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 101 additions & 4 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,18 +246,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
"be used in the model. Supported options: auto, binomial, multinomial",
typeConverter=TypeConverters.toString)

lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients",
"The lower bounds on coefficients if fitting under bound "
"constrained optimization. The bound matrix must be "
"compatible with the shape "
"(1, number of features) for binomial regression, or "
"(number of classes, number of features) "
"for multinomial regression.",
typeConverter=TypeConverters.toMatrix)

upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients",
"The upper bounds on coefficients if fitting under bound "
"constrained optimization. The bound matrix must be "
"compatible with the shape "
"(1, number of features) for binomial regression, or "
"(number of classes, number of features) "
"for multinomial regression.",
typeConverter=TypeConverters.toMatrix)

lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts",
"The lower bounds on intercepts if fitting under bound "
"constrained optimization. The bounds vector size must be"
"equal with 1 for binomial regression, or the number of"
"lasses for multinomial regression.",
typeConverter=TypeConverters.toVector)

upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts",
"The upper bounds on intercepts if fitting under bound "
"constrained optimization. The bound vector size must be "
"equal with 1 for binomial regression, or the number of "
"classes for multinomial regression.",
typeConverter=TypeConverters.toVector)

@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
aggregationDepth=2, family="auto"):
aggregationDepth=2, family="auto",
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):

"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
aggregationDepth=2, family="auto")
aggregationDepth=2, family="auto", \
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
If the threshold and thresholds Params are both set, they must be equivalent.
"""
super(LogisticRegression, self).__init__()
Expand All @@ -274,13 +311,17 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
aggregationDepth=2, family="auto"):
aggregationDepth=2, family="auto",
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
aggregationDepth=2, family="auto")
aggregationDepth=2, family="auto", \
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.
"""
Expand Down Expand Up @@ -375,6 +416,62 @@ def getFamily(self):
"""
return self.getOrDefault(self.family)

@since("2.3.0")
def setLowerBoundsOnCoefficients(self, value):
"""
Sets the value of :py:attr:`lowerBoundsOnCoefficients`
"""
return self._set(lowerBoundsOnCoefficients=value)

@since("2.3.0")
def getLowerBoundsOnCoefficients(self):
"""
Gets the value of :py:attr:`lowerBoundsOnCoefficients`
"""
return self.getOrDefault(self.lowerBoundsOnCoefficients)

@since("2.3.0")
def setUpperBoundsOnCoefficients(self, value):
"""
Sets the value of :py:attr:`upperBoundsOnCoefficients`
"""
return self._set(upperBoundsOnCoefficients=value)

@since("2.3.0")
def getUpperBoundsOnCoefficients(self):
"""
Gets the value of :py:attr:`upperBoundsOnCoefficients`
"""
return self.getOrDefault(self.upperBoundsOnCoefficients)

@since("2.3.0")
def setLowerBoundsOnIntercepts(self, value):
"""
Sets the value of :py:attr:`lowerBoundsOnIntercepts`
"""
return self._set(lowerBoundsOnIntercepts=value)

@since("2.3.0")
def getLowerBoundsOnIntercepts(self):
"""
Gets the value of :py:attr:`lowerBoundsOnIntercepts`
"""
return self.getOrDefault(self.lowerBoundsOnIntercepts)

@since("2.3.0")
def setUpperBoundsOnIntercepts(self, value):
"""
Sets the value of :py:attr:`upperBoundsOnIntercepts`
"""
return self._set(upperBoundsOnIntercepts=value)

@since("2.3.0")
def getUpperBoundsOnIntercepts(self):
"""
Gets the value of :py:attr:`upperBoundsOnIntercepts`
"""
return self.getOrDefault(self.upperBoundsOnIntercepts)


class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable):
"""
Expand Down
11 changes: 10 additions & 1 deletion python/pyspark/ml/param/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

from py4j.java_gateway import JavaObject

from pyspark.ml.linalg import DenseVector, Vector
from pyspark.ml.linalg import DenseVector, Vector, Matrix
from pyspark.ml.util import Identifiable


Expand Down Expand Up @@ -169,6 +169,15 @@ def toVector(value):
return DenseVector(value)
raise TypeError("Could not convert %s to vector" % value)

@staticmethod
def toMatrix(value):
"""
Convert a value to a MLlib Matrix, if possible.
"""
if isinstance(value, Matrix):
return value
raise TypeError("Could not convert %s to matrix" % value)

@staticmethod
def toFloat(value):
"""
Expand Down
37 changes: 37 additions & 0 deletions python/pyspark/ml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,6 +1271,43 @@ def test_tweedie_distribution(self):
self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))


class LogisticRegressionTest(SparkSessionTestCase):

def test_binomial_logistic_regression_with_bound(self):

df = self.spark.createDataFrame(
[(1.0, 1.0, Vectors.dense(0.0, 5.0)),
(0.0, 2.0, Vectors.dense(1.0, 2.0)),
(1.0, 3.0, Vectors.dense(2.0, 1.0)),
(0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

lor = LogisticRegression(regParam=0.01, weightCol="weight",
lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
upperBoundsOnIntercepts=Vectors.dense(0.0))
model = lor.fit(df)
self.assertTrue(
np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))

def test_multinomial_logistic_regression_with_bound(self):

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = self.spark.read.format("libsvm").load(data_path)

lor = LogisticRegression(regParam=0.01,
lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
model = lor.fit(df)
expected = [[4.593, 4.5516, 9.0099, 12.2904],
[1.0, 8.1093, 7.0, 10.0],
[3.041, 5.0, 8.0, 11.0]]
for i in range(0, len(expected)):
self.assertTrue(
np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
self.assertTrue(
np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))


class FPGrowthTests(SparkSessionTestCase):
def setUp(self):
super(FPGrowthTests, self).setUp()
Expand Down