91 changes: 87 additions & 4 deletions python/pyspark/ml/classification.py
@@ -246,18 +246,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
"be used in the model. Supported options: auto, binomial, multinomial",
typeConverter=TypeConverters.toString)

lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients",
"The lower bounds on coefficients if fitting under bound "
"constrained optimization. The bound matrix must be "
"compatible with the shape "
"(1, number of features) for binomial regression, or "
"(number of classes, number of features) "
"for multinomial regression.",
typeConverter=TypeConverters.toMatrix)

upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients",
"The upper bounds on coefficients if fitting under bound "
"constrained optimization. The bound matrix must be "
"compatible with the shape "
"(1, number of features) for binomial regression, or "
"(number of classes, number of features) "
"for multinomial regression.",
typeConverter=TypeConverters.toMatrix)
Member
I think you can condense this and the above text blocks some more


    lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts",
                                    "The lower bounds on intercepts if fitting under bound "
                                    "constrained optimization. The bounds vector size must be "
                                    "equal with 1 for binomial regression, or the number of "
                                    "classes for multinomial regression.",
                                    typeConverter=TypeConverters.toVector)

upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts",
"The upper bounds on intercepts if fitting under bound "
"constrained optimization. The bound vector size must be "
"equal with 1 for binomial regression, or the number of "
"classes for multinomial regression.",
typeConverter=TypeConverters.toVector)
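As a concrete illustration of the shapes these Param docstrings describe, a minimal sketch (the feature and class counts below are assumptions for the example, not values from this patch):

from pyspark.ml.linalg import Matrices, Vectors

# Binomial case with 3 features: coefficient bounds are a 1 x 3 matrix,
# intercept bounds are a length-1 vector.
binomial_lower_coef = Matrices.dense(1, 3, [0.0, -1.0, 0.0])
binomial_lower_intercept = Vectors.dense([0.0])

# Multinomial case with 4 classes and 3 features: coefficient bounds are
# a 4 x 3 matrix, intercept bounds are a length-4 vector.
multinomial_upper_coef = Matrices.dense(4, 3, [1.0] * 12)
multinomial_upper_intercept = Vectors.dense([10.0] * 4)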

@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
aggregationDepth=2, family="auto"):
aggregationDepth=2, family="auto",
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
Member
should fill up the previous line before starting another, here and below

lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):

"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
aggregationDepth=2, family="auto")
aggregationDepth=2, family="auto", \
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
                 lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
If the threshold and thresholds Params are both set, they must be equivalent.
"""
super(LogisticRegression, self).__init__()
@@ -274,13 +311,17 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
aggregationDepth=2, family="auto"):
aggregationDepth=2, family="auto",
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
aggregationDepth=2, family="auto")
aggregationDepth=2, family="auto", \
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
                  lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.
"""
@@ -375,6 +416,48 @@ def getFamily(self):
"""
return self.getOrDefault(self.family)

@since("2.3.0")
def setLowerBoundsOnCoefficients(self, value):
"""
Sets the value of :py:attr:`lowerBoundsOnCoefficients`
"""
return self._set(lowerBoundsOnCoefficients=value)

@since("2.3.0")
def getLowerBoundsOnCoefficients(self):
"""
Gets the value of :py:attr:`lowerBoundsOnCoefficients`
"""
return self.getOrDefault(self.lowerBoundsOnCoefficients)

@since("2.3.0")
def setUpperBoundsOnCoefficients(self, value):
"""
Sets the value of :py:attr:`upperBoundsOnCoefficients`
"""
return self._set(upperBoundsOnCoefficients=value)

@since("2.3.0")
def getUpperBoundsOnCoefficients(self):
"""
Gets the value of :py:attr:`upperBoundsOnCoefficients`
"""
return self.getOrDefault(self.upperBoundsOnCoefficients)

@since("2.3.0")
def setLowerBoundsOnIntercepts(self, value):
"""
Sets the value of :py:attr:`lowerBoundsOnIntercepts`
"""
return self._set(lowerBoundsOnIntercepts=value)

@since("2.3.0")
def getLowerBoundsOnIntercepts(self):
"""
Gets the value of :py:attr:`lowerBoundsOnIntercepts`
"""
return self.getOrDefault(self.lowerBoundsOnIntercepts)
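These accessors follow the usual Params pattern; a quick sketch with illustrative values (setters return self, so calls chain):

from pyspark.ml.linalg import Matrices

lr = LogisticRegression()
lr.setLowerBoundsOnCoefficients(Matrices.dense(1, 3, [0.0, -1.0, 0.0])) \
  .setUpperBoundsOnCoefficients(Matrices.dense(1, 3, [1.0, 0.0, 1.0]))
assert lr.getLowerBoundsOnCoefficients().numRows == 1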


class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable):
"""
11 changes: 10 additions & 1 deletion python/pyspark/ml/param/__init__.py
@@ -27,7 +27,7 @@

from py4j.java_gateway import JavaObject

from pyspark.ml.linalg import DenseVector, Vector
from pyspark.ml.linalg import DenseVector, Vector, Matrix
from pyspark.ml.util import Identifiable


@@ -169,6 +169,15 @@ def toVector(value):
return DenseVector(value)
raise TypeError("Could not convert %s to vector" % value)

@staticmethod
def toMatrix(value):
"""
Convert a value to ML Matrix, if possible
Contributor
ML -> MLlib; MLlib is the only official name for both the spark.mllib and spark.ml packages.

Member Author
While I am aware of this, the distinction between ml.linalg and mllib.linalg is a common source of confusion for PySpark users. Of course we could be more forgiving and automatically convert objects to the required class.

Contributor
This is not a big issue, but you can still refer to the clarification in the MLlib user guide for the naming convention in MLlib.
"""
if isinstance(value, Matrix):
return value
Member
Is this method really necessary? It's not actually converting anything, just checking the type. If this wasn't here and the user put something other than a Matrix, what error would be raised?

raise TypeError("Could not convert %s to Matrix" % value)
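On the reviewer's question above: as written, the converter passes Matrix instances through unchanged and rejects anything else with a TypeError at set time. A small sketch of that behavior (the list argument is just an illustration):

from pyspark.ml.linalg import Matrices
from pyspark.ml.param import TypeConverters

m = Matrices.dense(1, 2, [0.0, 1.0])
assert TypeConverters.toMatrix(m) is m      # a Matrix passes through unchanged

try:
    TypeConverters.toMatrix([[0.0, 1.0]])   # anything else is rejected, not converted
except TypeError as e:
    print(e)                                # Could not convert [[0.0, 1.0]] to Matrix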

@staticmethod
def toFloat(value):
"""
118 changes: 118 additions & 0 deletions python/pyspark/ml/tests.py
@@ -71,6 +71,34 @@
ser = PickleSerializer()


def generate_multinomial_logistic_input(
weights, x_mean, x_variance, add_intercept, n_points, seed=None):
"""Creates multinomial logistic dataset"""

if seed:
np.random.seed(seed)
n_features = x_mean.shape[0]

x = np.random.randn(n_points, n_features)
x = x * np.sqrt(x_variance) + x_mean

if add_intercept:
x = np.hstack([x, np.ones((n_points, 1))])

# Compute margins
margins = np.hstack([np.zeros((n_points, 1)), x.dot(weights.T)])
# Shift to avoid overflow and compute probs
probs = np.exp(np.subtract(margins, margins.max(axis=1).reshape(n_points, -1)))
# Compute cumulative prob
cum_probs = np.cumsum(probs / probs.sum(axis=1).reshape(n_points, -1), axis=1)
# Assign class
    classes = np.apply_along_axis(
        lambda row: np.searchsorted(row, np.random.random()),
        axis=1, arr=cum_probs)
    # Drop the synthetic intercept column (if present) before emitting feature vectors
    return [(float(label), DenseVector(features))
            for (label, features) in zip(classes, x[:, :-1] if add_intercept else x)]
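A minimal usage sketch for the generator above (the weights and moments are arbitrary example values, not taken from the tests):

import numpy as np

# two non-baseline classes x (2 features + intercept) -> a 3-class problem
weights = np.array([[1.0, -0.5, 0.3],
                    [-0.7, 0.2, 1.1]])
rows = generate_multinomial_logistic_input(
    weights, np.array([0.0, 0.0]), np.array([1.0, 1.0]),
    add_intercept=True, n_points=5, seed=42)
# each element is (label, DenseVector of the 2 original features)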


class MLlibTestCase(unittest.TestCase):
def setUp(self):
self.sc = SparkContext('local[4]', "MLlib tests")
@@ -832,6 +860,96 @@ def test_logistic_regression(self):
except OSError:
pass

    def test_logistic_regression_check_thresholds(self):
        self.assertIsInstance(
            LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]),
            LogisticRegression
        )

self.assertRaisesRegexp(
ValueError,
"Logistic Regression getThreshold found inconsistent.*$",
LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
)

def test_binomial_logistic_regression_bounds(self):
Contributor
@zero323 We usually run PySpark MLlib tests by loading a dataset from data/mllib/ or by manually generating a dummy/hard-coded dataset, rather than rewriting the same test case as Scala. We keep PySpark tests as simple as possible. You can refer to this test case. Thanks.

Member Author
Example datasets are not that good for checking constraints, and a generator seems like a better idea than creating a large enough example by hand. I can of course remove it if this is an issue.

Contributor (@yanboliang, Jun 23, 2017)
For PySpark, we should only check that the output is consistent with Scala. The most straightforward way for this test is to load the data directly and run constrained LR on it:

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = spark.read.format("libsvm").load(data_path)
......

This will make the test case simple and time-saving. Thanks.

Member
I agree this is probably overkill for testing this. The functionality is already in Scala and should be tested there; here in Python we are just setting the parameters.
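For reference, a hedged sketch of the simpler test shape the reviewers suggest, combining the libsvm load above with the bound parameters added in this patch (the specific bounds and maxIter are illustrative only; Matrices and Vectors come from pyspark.ml.linalg as used elsewhere in these tests):

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = spark.read.format("libsvm").load(data_path)

# The sample file has 4 features and 3 classes, so coefficient bounds are a
# 3 x 4 matrix and intercept bounds have length 3.
lr = LogisticRegression(
    maxIter=5,
    lowerBoundsOnCoefficients=Matrices.dense(3, 4, [0.0] * 12),
    upperBoundsOnIntercepts=Vectors.dense([1.0, 1.0, 1.0]))
model = lr.fit(df)
assert np.all(model.coefficientMatrix.toArray() >= 0.0)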

x_mean = np.array([5.843, 3.057, 3.758, 1.199])
x_variance = np.array([0.6856, 0.1899, 3.116, 0.581])

coefficients = np.array(
[[-0.57997, 0.912083, -0.371077, -0.819866, 2.688191]]
)

dataset = self.spark.createDataFrame(
generate_multinomial_logistic_input(
coefficients, x_mean, x_variance, True, 1000),
["label", "features"]
)

lower_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, -1.0, 0.0, -1.0])
upper_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, 1.0, 1.0, 0.0])
lower_bounds_on_intercepts = Vectors.dense([0.0])
upper_bounds_on_intercepts = Vectors.dense([1.0])

lr = LogisticRegression(
standardization=True, fitIntercept=True,
lowerBoundsOnCoefficients=lower_bounds_on_coefficients,
upperBoundsOnCoefficients=upper_bounds_on_coefficients,
lowerBoundsOnIntercepts=lower_bounds_on_intercepts,
upperBoundsOnIntercepts=upper_bounds_on_intercepts
)

lrm = lr.fit(dataset)

self.assertIsInstance(lrm, LogisticRegressionModel)
self.assertTrue(np.all(
lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray()
))

self.assertTrue(np.all(
lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray()
))

def test_multinomial_regression_bounds(self):
x_mean = np.array([5.843, 3.057, 3.758, 1.199])
x_variance = np.array([0.6856, 0.1899, 3.116, 0.581])

coefficients = np.array([
[-0.57997, 0.912083, -0.371077, -0.819866, 2.688191],
[-0.16624, -0.84355, -0.048509, -0.301789, 4.170682]
])

dataset = self.spark.createDataFrame(
generate_multinomial_logistic_input(
coefficients, x_mean, x_variance, True, 1000),
["label", "features"]
)

lower_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(-10.0, 12))
upper_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(10.0, 12))
lower_bounds_on_intercepts = Vectors.dense(np.repeat(-3.0, 3))
upper_bounds_on_intercepts = Vectors.dense(np.repeat(3.0, 3))

lr = LogisticRegression(
standardization=True, fitIntercept=True,
lowerBoundsOnCoefficients=lower_bounds_on_coefficients,
upperBoundsOnCoefficients=upper_bounds_on_coefficients,
lowerBoundsOnIntercepts=lower_bounds_on_intercepts,
upperBoundsOnIntercepts=upper_bounds_on_intercepts
)

lrm = lr.fit(dataset)

self.assertIsInstance(lrm, LogisticRegressionModel)

self.assertTrue(np.all(
lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray()
))

self.assertTrue(np.all(
lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray()
))

def _compare_params(self, m1, m2, param):
"""
Compare 2 ML Params instances for the given param, and assert both have the same param value