-
Notifications
You must be signed in to change notification settings - Fork 695
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SEDONA-661] add local outlier factor implementation. (#1623)
* add local outlier factor implementation. * LOF docs * precommit changes * precommit formatting changes --------- Co-authored-by: jameswillis <[email protected]>
- Loading branch information
1 parent
dca28e1
commit b1ceb1e
Showing
8 changed files
with
479 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
"""Algorithms for detecting outliers in spatial datasets.""" |
60 changes: 60 additions & 0 deletions
60
python/sedona/stats/outlier_detection/local_outlier_factor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
"""Functions related to calculating the local outlier factor of a dataset.""" | ||
from typing import Optional | ||
|
||
from pyspark.sql import DataFrame, SparkSession | ||
|
||
# Reserved column names. Neither constant is referenced elsewhere in this module;
# presumably they mirror internal columns used by the JVM implementation - TODO confirm.
ID_COLUMN_NAME = "__id" | ||
CONTENTS_COLUMN_NAME = "__contents" | ||
|
||
|
||
def local_outlier_factor(
    dataframe: DataFrame,
    k: int = 20,
    geometry: Optional[str] = None,
    handle_ties: bool = False,
    use_spheroid: bool = False,
) -> DataFrame:
    """Annotate a dataframe with a column containing the local outlier factor of each record.

    The dataframe should contain at least one GeometryType column. Rows must be unique. If exactly one
    geometry column is present it is used automatically. If several are present, the one named 'geometry'
    is used. If several are present and none is named 'geometry', the column name must be provided.

    The computation itself is delegated to the JVM-side
    ``org.apache.sedona.stats.outlierDetection.LocalOutlierFactor.localOutlierFactor`` through the
    active Spark session.

    Args:
        dataframe: Apache Sedona DataFrame containing the point geometries
        k: number of nearest neighbors that will be considered for the LOF calculation
        geometry: name of the geometry column
        handle_ties: whether to handle ties in the k-distance calculation. Default is false
        use_spheroid: whether to use a spheroidal (true) or cartesian (false) distance calculation.
            Default is false

    Returns:
        A PySpark DataFrame containing the lof for each row

    Raises:
        RuntimeError: if no SparkSession is active in the current thread.
    """
    sedona = SparkSession.getActiveSession()
    # getActiveSession() returns None when no session is active; fail with a clear
    # message instead of an AttributeError on ``None._jvm``.
    if sedona is None:
        raise RuntimeError(
            "local_outlier_factor requires an active SparkSession, but none was found"
        )

    jvm_lof = sedona._jvm.org.apache.sedona.stats.outlierDetection.LocalOutlierFactor
    result_df = jvm_lof.localOutlierFactor(
        dataframe._jdf,
        k,
        geometry,
        handle_ties,
        use_spheroid,
    )

    # Wrap the returned JVM dataframe handle back into a Python DataFrame.
    return DataFrame(result_df, sedona)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
import numpy as np | ||
import pyspark.sql.functions as f | ||
import pytest | ||
from pyspark.sql import DataFrame | ||
from pyspark.sql.types import DoubleType, IntegerType, StructField, StructType | ||
from sklearn.neighbors import LocalOutlierFactor | ||
from tests.test_base import TestBase | ||
|
||
from sedona.sql.st_constructors import ST_MakePoint | ||
from sedona.sql.st_functions import ST_X, ST_Y | ||
from sedona.stats.outlier_detection.local_outlier_factor import local_outlier_factor | ||
|
||
|
||
class TestLOF(TestBase):
    """Tests for ``local_outlier_factor``, using scikit-learn's LocalOutlierFactor as the reference."""

    def get_small_data(self) -> DataFrame:
        """Build a three-row dataframe with an ``id`` column and a point ``geometry`` column."""
        fields = [
            StructField("id", IntegerType(), True),
            StructField("x", DoubleType(), True),
            StructField("y", DoubleType(), True),
        ]
        rows = [
            (1, 1.0, 2.0),
            (2, 2.0, 2.0),
            (3, 3.0, 3.0),
        ]
        coordinates = self.spark.createDataFrame(rows, StructType(fields))
        return coordinates.select("id", ST_MakePoint("x", "y").alias("geometry"))

    def get_medium_data(self):
        """Generate a reproducible (220, 2) array: two shifted inlier clusters plus uniform outliers."""
        # Fixed seed, and the same order of random draws, keeps the sample reproducible.
        np.random.seed(42)

        cluster = 0.3 * np.random.randn(100, 2)
        inliers = np.concatenate((cluster + 2, cluster - 2), axis=0)
        outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
        return np.concatenate((inliers, outliers), axis=0)

    def get_medium_dataframe(self, data):
        """Turn rows of x/y pairs into a dataframe of points with an extra random column."""
        fields = [
            StructField("x", DoubleType(), True),
            StructField("y", DoubleType(), True),
        ]
        coordinates = self.spark.createDataFrame(data, StructType(fields))
        points = coordinates.select(ST_MakePoint("x", "y").alias("geometry"))
        return points.withColumn("anotherColumn", f.rand())

    def compare_results(self, actual, expected, k):
        """Assert both mappings have the same keys and values agreeing within a relative error of 1e-10.

        ``k`` is accepted for the caller's convenience but is not used by the comparison.
        """
        assert len(actual) == len(expected)

        absent = set(expected) - set(actual)
        assert not absent

        tolerance = 0.0000000001
        mismatches = {}
        for key, value in actual.items():
            reference = expected[key]
            relative_error = abs(1 - value / reference)
            if relative_error > tolerance:
                # Keep both values and the error so a failure shows what diverged.
                mismatches[key] = (value, reference, relative_error)
        assert not mismatches

    @pytest.mark.parametrize("k", [5, 21, 3])
    def test_lof_matches_sklearn(self, k):
        """LOF values keyed by point coordinates must match scikit-learn's for the same ``k``."""
        data = self.get_medium_data()

        scored = local_outlier_factor(self.get_medium_dataframe(data.tolist()), k)
        rows = scored.select(
            f.array(ST_X("geometry"), ST_Y("geometry")), "lof"
        ).collect()
        actual = {tuple(row[0]): row[1] for row in rows}

        clf = LocalOutlierFactor(n_neighbors=k, contamination="auto")
        clf.fit_predict(data)
        # scikit-learn stores the negated factor, so flip the sign before comparing.
        expected = {
            tuple(point): float(-score)
            for point, score in zip(data, clf.negative_outlier_factor_)
        }
        self.compare_results(actual, expected, k)

    # TODO uncomment when KNN join supports empty dfs
    # def test_handle_empty_dataframe(self):
    #     empty_df = self.spark.createDataFrame([], self.get_small_data().schema)
    #     result_df = local_outlier_factor(empty_df, 2)
    #
    #     assert 0 == result_df.count()

    def test_raise_error_for_invalid_k_value(self):
        """A negative ``k`` must make ``local_outlier_factor`` raise."""
        invalid_k = -1
        with pytest.raises(Exception):
            local_outlier_factor(self.get_small_data(), invalid_k)
Oops, something went wrong.