Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
1b7844c
Create qcdataframe.py
d33bs Jun 10, 2024
9784841
linting
d33bs Jun 10, 2024
c39e2b0
add qcdataframe
d33bs Jun 10, 2024
f3003e1
linting
d33bs Jun 10, 2024
b97f3a5
adding tests
d33bs Jun 10, 2024
1069a4a
linting
d33bs Jun 10, 2024
d52a89f
update name, tests
d33bs Jun 11, 2024
1bf8262
add back compat for self type
d33bs Jun 11, 2024
772f895
back compat for isinstance
d33bs Jun 11, 2024
d0ea33c
linting
d33bs Jun 11, 2024
fe3fcf0
add cli for cosmicqc
d33bs Jun 11, 2024
30aae3e
linting
d33bs Jun 11, 2024
2bd3d5f
add tests and wrappers
d33bs Jun 11, 2024
b5c9997
linting and test adjustment
d33bs Jun 12, 2024
bf40aed
attempting wrapper
d33bs Jun 12, 2024
feec9a4
patch python-fire; fix tests
d33bs Jun 12, 2024
d7519a8
add docstring to top of test
d33bs Jun 12, 2024
f8773b2
add csv.gz compatibility
d33bs Jun 14, 2024
0ad522f
add export capabilities
d33bs Jun 18, 2024
6d33de4
rename file to correct module name
d33bs Jun 18, 2024
34cf9bf
add export capabilities
d33bs Jun 18, 2024
c016bde
Merge branch 'add-file-data-ingest' into add-cli
d33bs Jun 18, 2024
09d815c
add output capabilities
d33bs Jun 18, 2024
fbedb7d
Apply suggestions from code review
d33bs Jun 18, 2024
0257014
update tests and docs
d33bs Jun 18, 2024
985a6dd
fix tests
d33bs Jun 19, 2024
5034a07
update tests; add constructor path for scdataframe
d33bs Jun 19, 2024
fd81868
linting
d33bs Jun 19, 2024
42fedf5
Merge branch 'add-file-data-ingest' into add-cli
d33bs Jun 19, 2024
3bab60f
modify tests
d33bs Jun 19, 2024
6d61bf3
enable pd.series compatibility
d33bs Jun 19, 2024
11cd3b0
Merge branch 'add-file-data-ingest' into add-cli
d33bs Jun 19, 2024
90c2088
update for exports via cli
d33bs Jun 19, 2024
e35f97e
Merge remote-tracking branch 'upstream/main' into add-cli
d33bs Jun 25, 2024
c966ce8
fix docstring
d33bs Jun 25, 2024
ac734ca
add return types for test util
d33bs Jun 25, 2024
812b171
fix deps
d33bs Jun 26, 2024
2e18455
add to docs on exports
d33bs Jun 26, 2024
273acf1
add docs for context
d33bs Jun 26, 2024
77c27e1
note about ignore rule
d33bs Jun 26, 2024
4d57177
remove todo
d33bs Jun 26, 2024
1900c65
minor comment about display
d33bs Jun 26, 2024
0a46f40
retain code comment
d33bs Jun 26, 2024
5d5257b
correct code comment
d33bs Jun 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,example_feature
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10
113 changes: 75 additions & 38 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ scipy = [
]
pyarrow = "^16.0.0"
pyyaml = "^6.0.1"
fire = "^0.6.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.0"

[tool.poetry.scripts]
cosmicqc = "cosmicqc.cli:cli_analyze"

[tool.isort]
profile = "black"

Expand Down
45 changes: 39 additions & 6 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def identify_outliers(
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
export_path: Optional[str] = None,
) -> Union[pd.Series, pd.DataFrame]:
"""
This function uses z-scoring to format the data for detecting outlier
Expand All @@ -35,8 +36,6 @@ def identify_outliers(
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
Expand All @@ -48,6 +47,13 @@ def identify_outliers(
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
include_threshold_scores: bool
Whether to include the threshold scores in addition to whether
the threshold set passes per row.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
capabilities. If None, no export is performed.
Note: compatible export formats are CSV, TSV, and Parquet.

Returns:
Union[pd.Series, pd.DataFrame]:
Expand Down Expand Up @@ -95,7 +101,7 @@ def identify_outliers(
condition = outlier_df[zscore_columns[feature]] < threshold
conditions.append(condition)

return (
result = (
# create a boolean pd.series identifier for dataframe
# based on all conditions for use within other functions.
reduce(operator.and_, conditions)
Expand All @@ -111,12 +117,18 @@ def identify_outliers(
)
)

if export_path is not None:
SCDataFrame(data=result).export(file_path=export_path)

return result


def find_outliers(
df: Union[SCDataFrame, pd.DataFrame, str],
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
export_path: Optional[str] = None,
) -> pd.DataFrame:
"""
This function uses identify_outliers to return a dataframe
Expand All @@ -139,6 +151,10 @@ def find_outliers(
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
capabilities. If None, no export is performed.
Note: compatible export formats are CSV, TSV, and Parquet.

Returns:
pd.DataFrame:
Expand Down Expand Up @@ -174,15 +190,22 @@ def find_outliers(
# Include metadata columns in the output DataFrame
columns_to_include = list(feature_thresholds.keys()) + metadata_columns

result = outliers_df[columns_to_include]

# export the file if specified
if export_path is not None:
SCDataFrame(data=result).export(file_path=export_path)

# Return outliers DataFrame with specified columns
return outliers_df[columns_to_include]
return result


def label_outliers(
df: Union[SCDataFrame, pd.DataFrame, str],
feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
export_path: Optional[str] = None,
) -> pd.DataFrame:
"""
Use identify_outliers to label the original dataset for
Expand All @@ -206,6 +229,10 @@ def label_outliers(
include_threshold_scores: bool = False
Whether to include the scores in addition to whether an outlier
was detected or not.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
capabilities. If None, no export is performed.
Note: compatible export formats are CSV, TSV, and Parquet.

Returns:
pd.DataFrame:
Expand All @@ -224,7 +251,7 @@ def label_outliers(
feature_thresholds_file=feature_thresholds_file,
include_threshold_scores=include_threshold_scores,
)
return pd.concat(
result = pd.concat(
[
df,
(
Expand Down Expand Up @@ -265,7 +292,13 @@ def label_outliers(
axis=1,
)
# return a dataframe with columns deduplicated by name
return labeled_df.loc[:, ~labeled_df.columns.duplicated()]
result = labeled_df.loc[:, ~labeled_df.columns.duplicated()]

# export the file if specified
if export_path is not None:
SCDataFrame(data=result).export(file_path=export_path)

return result


def read_thresholds_set_from_file(
Expand Down
Loading