
Commit ac58c15

Merge pull request #56 from TAPE-Lab/testv03: Dependency updates

2 parents (00c3a1a + 84e83a1) → commit ac58c15

8 files changed (+190 -17 lines)

README.md (+7 -1)
```diff
@@ -1,5 +1,11 @@
 <!-- [![Documentation Status](https://readthedocs.org/projects/cytof-dataanalysis/badge/?version=latest)](https://cytof-dataanalysis.readthedocs.io/en/latest/?badge=latest) -->
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4587193.svg)](https://doi.org/10.5281/zenodo.4587193)
+[![zenodo:10.5281/zenodo.4587193](https://img.shields.io/badge/Zenodo-10.5281%2Fzenodo.4587193-4B81BE.svg)](https://doi.org/10.5281/zenodo.4587193) <!-- 4B81BE is the colour for Zenodo -->
+[![natprotocols:10.1038/s41596-021-00603-4](https://img.shields.io/badge/DOI-10.1038%2Fs41596--021--00603--4-644B96.svg)](https://doi.org/10.1038/s41596-021-00603-4) <!-- 644B96 is the colour for Nature Protocols -->
+
+<!-- Development branch to add some new functionality related to Dimensionality reduction
+(and perhaps clustering), adding ARM support (for Apple silicon), and improving
+code annotation for our functions.
+Take also as an opportunity to redefine the conda environment file. -->
 
 # **Cy**TOF Si**gn**alling An**al**ysis (*CyGNAL*)
 
```

code/4-dremi.py (+1 -1)
```diff
@@ -145,7 +145,7 @@
         if num_outliers_total == 0:
             df_info_dict[colname_arc] = "-" # this is a placeholder
         # Store the info for each marker pair in df_info
-        df_info = df_info.append(df_info_dict, ignore_index=True)
+        df_info = pd.concat([df_info, df_info_dict], ignore_index=True)
 
 
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Save to file~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
```
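Note: `pd.concat` only accepts Series/DataFrame objects, so passing `df_info_dict` directly (as in the `+` line above) will raise a `TypeError` on current pandas; the `aux3_emd.py` hunk further down wraps the dict in a one-row DataFrame, which is likely what is intended here too. A minimal sketch of the working pattern (column and marker names made up):

```python
import pandas as pd

df_info = pd.DataFrame(columns=["marker_x", "marker_y"])
df_info_dict = {"marker_x": "89Y_CD45", "marker_y": "115In_pRB"}  # hypothetical row

# DataFrame.append() was removed in pandas 2.0; the drop-in replacement is
# pd.concat with the dict wrapped in a one-row DataFrame.
df_info = pd.concat([df_info, pd.DataFrame([df_info_dict])], ignore_index=True)
```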

code/aux/aux1_data_preprocess.py (+32 -2)
```diff
@@ -9,8 +9,17 @@
 #Idea is to rename all columns and then filter non-relevant ones (less optimal,
 # easier and more compatible with writing new reduced file in the last step)
 
-#Filtering
+#Filtering: The function allows you to filter out columns based on a specific pattern, which can be useful for removing columns that are not relevant to your analysis.
 def filter_columns(renamed_columns):
+    """
+    Filters out unnecessary columns from the list of renamed columns.
+
+    Args:
+        renamed_columns (list): List of renamed columns.
+
+    Returns:
+        tuple: A tuple containing two lists - columns to keep and filtered columns.
+    """
     reg_filter = re.compile("^\d+[A-Za-z]+$") #Removes columns with just isotope
     filtered_columns = [] #Stores the columns that where deemed unnecessary
     columns_to_keep = [] #Columns that the reduced file should have
```
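To make the filter concrete, here is what that pattern keeps and drops on some made-up CyTOF column names:

```python
import re

reg_filter = re.compile(r"^\d+[A-Za-z]+$")  # matches isotope-only names

# Hypothetical columns: bare isotope channels are dropped, while
# isotope_marker composites and metadata columns are kept.
cols = ["89Y", "140Ce", "89Y_CD45", "Event_length"]
dropped = [c for c in cols if reg_filter.match(c)]
print(dropped)  # ['89Y', '140Ce']
```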
```diff
@@ -22,8 +31,17 @@ def filter_columns(renamed_columns):
     return columns_to_keep, filtered_columns
 
 
-#Renaming
+#Renaming: The function helps standardize and clean the column names, making them more consistent and suitable for further analysis.
 def rename_columns(df_file_cols):
+    """
+    Renames the column names by removing specific patterns and applying renaming rules.
+
+    Args:
+        df_file_cols (list): List of column names.
+
+    Returns:
+        list: List of renamed column names.
+    """
     reg_rename = re.compile("(__[a-z].*$|__\d.*$|_\(.*$|___.*$)")
     #First two options match ending constructs with double underscores
     #Third option matches endings within brackets
```
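As an illustration, the suffix-stripping regex in action on some invented raw channel names:

```python
import re

reg_rename = re.compile(r"(__[a-z].*$|__\d.*$|_\(.*$|___.*$)")

# Hypothetical raw names as exported by acquisition software; the regex
# strips double-underscore and bracketed suffixes from the end of the name.
raw = ["89Y_CD45__v2", "115In_pRB___dup", "146Nd_CD8_(v3)"]
print([reg_rename.sub("", c) for c in raw])
# ['89Y_CD45', '115In_pRB', '146Nd_CD8']
```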
```diff
@@ -59,7 +77,19 @@ def rename_columns(df_file_cols):
 #Add also the generation of a .csv file with the markers in the panel.
 #It should be ok to do it here b4 concatenation in the next step because if
 # they are to be concatenaded they shpould already have the same panel of markers
+#The function generates a panel markers file that can be used to indicate the selection status of markers. The file will contain marker names along with "N" values indicating that none of the markers are selected.
 def write_panel_markers(cols, output_dir, info_run):
+    """
+    Writes the panel markers to a panel markers file.
+
+    Args:
+        cols (list): List of markers (column names).
+        output_dir (str): Output directory path.
+        info_run (str): Information about the run.
+
+    Returns:
+        None
+    """
     all_markers = cols[0]
     counter_marker = []
     for i in all_markers:
```
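Based on the description above (and the `to_csv` call visible in `aux_functions.py` below), the generated file is presumably a two-column, headerless CSV; a sketch of what producing it would look like, with made-up markers:

```python
import pandas as pd

# Every marker starts unselected ("N"); users flip entries to "Y" to pick
# the markers downstream steps should use.
markers = pd.DataFrame({"marker": ["89Y_CD45", "115In_pRB"],
                        "flag": ["N", "N"]})
markers.to_csv("panel_markers.csv", index=False, header=False)
```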

code/aux/aux2_umap.py (+12 -5)
```diff
@@ -12,8 +12,19 @@
 # UMAP function
 # umap embedding calculation; result saved in a pandas dataframe
 # the names of the umap info columns are also defined here
-
+#The function applies UMAP dimensionality reduction to the input data and combines it with the original data, providing the UMAP-transformed data as the output.
 def perform_umap(umap_params, all_together_vs_marks, no_arc):
+    """
+    Performs UMAP dimensionality reduction on the given data.
+
+    Args:
+        umap_params (dict): UMAP parameters including "info", "n", "m", "d", "comp", "rs", and "nsr".
+        all_together_vs_marks (array-like): Input data for UMAP transformation.
+        no_arc (DataFrame): Untransformed data.
+
+    Returns:
+        DataFrame: UMAP-transformed data with added UMAP dimension columns.
+    """
     info_run = umap_params["info"]
     run_name = "UMAP_"+info_run
     #Calculate UMAP on arc tranf data (all_together...)
@@ -33,7 +44,3 @@ def perform_umap(umap_params, all_together_vs_marks, no_arc):
     no_arc = no_arc.join(umap_emb)
 
     return no_arc
-
-
-
-
```
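A minimal sketch of the embedding step this function wraps, using umap-learn directly (toy data; the parameter values and UMAP column names here are illustrative, not CyGNAL's):

```python
import pandas as pd
import umap  # umap-learn

# Toy arcsinh-transformed marker matrix (rows = cells).
marks = pd.DataFrame({"89Y_CD45": [0.1, 0.5, 0.9, 0.2, 0.7],
                      "115In_pRB": [0.3, 0.7, 0.4, 0.8, 0.2]})

# n_neighbors kept tiny because the toy data has only five rows.
reducer = umap.UMAP(n_neighbors=3, min_dist=0.1, n_components=2, random_state=42)
embedding = reducer.fit_transform(marks)

# Join the embedding back onto the untransformed data, as perform_umap does.
umap_emb = pd.DataFrame(embedding, columns=["UMAP_dim1", "UMAP_dim2"], index=marks.index)
no_arc = marks.join(umap_emb)  # `marks` stands in for the untransformed DataFrame
```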

code/aux/aux3_emd.py (+15 -2)
```diff
@@ -6,9 +6,22 @@
 import scprep
 
 # Function to calculate EMD
-
+#The function calculates the EMD between two distributions for each marker and stores the results in a DataFrame. The EMD measures the minimum amount of work required to transform one distribution into another, considering the distances between individual data points.
 def calculate_emd(marker_list, emd_infodict, compare_from, compare_to,
                     emd_df):
+    """
+    Calculates Earth Mover's Distance (EMD) between two distributions for each marker in the given marker list.
+
+    Args:
+        marker_list (list): List of markers.
+        emd_infodict (dict): EMD information dictionary.
+        compare_from (DataFrame): Data for comparison from.
+        compare_to (DataFrame): Data for comparison to.
+        emd_df (DataFrame): DataFrame to store EMD results.
+
+    Returns:
+        DataFrame: Updated DataFrame with EMD results.
+    """
     deprecated_string = "no_norm" #No normalisation implemented. Deprecate
     for marker in marker_list:
         emd_infodict["marker"] = marker
@@ -30,6 +43,6 @@ def calculate_emd(marker_list, emd_infodict, compare_from, compare_to,
                                     compare_from[marker],
                                     compare_to[marker])
     #Add EMD score to the output dataframe
-    emd_df = emd_df.append(emd_infodict, ignore_index=True)
+    emd_df = pd.concat([emd_df, pd.DataFrame([emd_infodict])], ignore_index=True)
 
     return emd_df
```
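For reference, the underlying scprep call on two toy one-marker distributions (values invented):

```python
import numpy as np
import scprep

# Two hypothetical single-marker intensity distributions.
compare_from = np.array([0.1, 0.4, 0.5, 0.9])
compare_to = np.array([0.2, 0.6, 0.7, 1.1])

# scprep.stats.EMD returns the earth mover's distance between two 1-D
# distributions; CyGNAL records one such score per marker.
print(scprep.stats.EMD(compare_from, compare_to))
```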

code/aux/aux4_dremi.py (+14)
```diff
@@ -12,7 +12,21 @@
 # find outliers for both marker_x and marker_y based on cufoffs of standard deviations
 # return the number of outliers and a dataframe after outlier removal
 # update the df_info_dict with the number of outliers
+#The function identifies outliers based on the absolute difference between each marker's value and its mean, normalized by the marker's standard deviation. Rows that have values exceeding the cutoff multiplied by the standard deviation are considered outliers.
 def outlier_removal(df, cutoff, marker_x, marker_y, df_info_dict):
+    """
+    Removes outliers from a DataFrame based on a cutoff value and specific markers.
+
+    Args:
+        df (DataFrame): Input DataFrame.
+        cutoff (float): Cutoff value for outlier removal.
+        marker_x (str): Marker column name for x-axis.
+        marker_y (str): Marker column name for y-axis.
+        df_info_dict (dict): Dictionary to store outlier information.
+
+    Returns:
+        tuple: A tuple containing the number of total outliers removed and the DataFrame without outliers.
+    """
     num_outliers_total = 0
     num_outliers_x = 0
     num_outliers_y = 0
```
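That description boils down to a z-score mask; a compact sketch with toy data (the cutoff is deliberately tight so the outlier actually trips it):

```python
import pandas as pd

df = pd.DataFrame({"marker_x": [0.10, 0.20, 0.15, 5.0],
                   "marker_y": [0.30, 0.30, 0.30, 0.3]})
cutoff = 1.0  # unusually tight, just for the toy data

# Keep rows within `cutoff` standard deviations of the mean on both markers.
keep = ((df["marker_x"] - df["marker_x"].mean()).abs() <= cutoff * df["marker_x"].std()) \
     & ((df["marker_y"] - df["marker_y"].mean()).abs() <= cutoff * df["marker_y"].std())

df_wo_outliers = df[keep]
num_outliers_total = int((~keep).sum())  # 1: the (5.0, 0.3) row
```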

code/aux/aux_functions.py (+106 -3)
```diff
@@ -11,6 +11,17 @@
 
 #Read broken FCS through r.flowCore
 def read_rFCS(file_path):
+    """
+    Reads an FCS file using R packages and returns a pandas DataFrame containing the data.
+
+    Args:
+        file_path (str): The path to the FCS file.
+
+    Returns:
+        df_file (pandas.DataFrame): A DataFrame containing the FCS data.
+        no_filter (bool): Indicates whether filtering was applied to the columns.
+            True if no filtering was applied, False otherwise.
+    """
     from rpy2.robjects import globalenv, pandas2ri, r
     from rpy2.robjects.packages import importr
     from rpy2.rinterface_lib.callbacks import logger
```
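A rough sketch of the rpy2/flowCore round-trip this function builds on (file name hypothetical; rpy2's conversion API has shifted across versions, so this uses the older `activate()` shim, and the real read_rFCS adds error handling, renaming, and filtering omitted here):

```python
import pandas as pd
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr

pandas2ri.activate()  # global converter; fine for a short script

flowCore = importr("flowCore")  # requires flowCore in the local R library

# read.FCS parses the file into a flowFrame (rpy2 maps read.FCS to read_FCS);
# exprs() pulls out the raw event matrix, handed back as an R data.frame so
# the pandas converter can return a DataFrame.
ff = flowCore.read_FCS("example.fcs", transformation=False)
df_file = r["as.data.frame"](flowCore.exprs(ff))
```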
```diff
@@ -87,6 +98,17 @@ def read_rFCS(file_path):
 
 #Arcsinh transform the data
 def arcsinh_transf(cofactor, no_arc):
+    """
+    Applies the arcsinh transformation to selected columns of a DataFrame.
+
+    Args:
+        cofactor (float): The scaling factor for the arcsinh transformation.
+        no_arc (pandas.DataFrame): The DataFrame containing the data to be transformed.
+
+    Returns:
+        arc (pandas.DataFrame): The DataFrame with the selected columns transformed using arcsinh.
+        cols (list): The list of column names that were transformed.
+    """
     #Select only the columns containing the markers (as they start with a number for the isotope)
     cols = [x for x in no_arc.columns if x[0].isdigit()]
     #Apply the arcsinh only to those columns (don't want to change time or any other)
```
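The transform itself is just `arcsinh(x / cofactor)` applied to the isotope-prefixed columns; a toy version (cofactor 5 is a common CyTOF choice, used here only as an example):

```python
import numpy as np
import pandas as pd

no_arc = pd.DataFrame({"89Y_CD45": [0.0, 10.0, 100.0], "Time": [1, 2, 3]})
cofactor = 5

# Marker columns start with the isotope mass, so select on a leading digit;
# Time and other metadata columns are left untouched.
cols = [x for x in no_arc.columns if x[0].isdigit()]
arc = no_arc.copy()
arc[cols] = np.arcsinh(no_arc[cols] / cofactor)
```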
```diff
@@ -100,6 +122,16 @@ def arcsinh_transf(cofactor, no_arc):
 
 #Function to concatenate all files: Read input .txt and .fcs. Sanity check. Concatenate
 def concatenate_fcs(input_dir):
+    """
+    Reads and concatenates multiple mass cytometry FCS or text files from a directory.
+
+    Args:
+        input_dir (str): The directory path containing the FCS and text files.
+
+    Returns:
+        no_arc (pandas.DataFrame): The concatenated DataFrame containing the data from all files.
+        filelist (list): The list of file names that were read and concatenated.
+    """
     txt_filelist = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
     fcs_filelist = [f for f in os.listdir(input_dir) if f.endswith(".fcs")]
     filelist = txt_filelist+fcs_filelist
@@ -139,11 +171,21 @@ def concatenate_fcs(input_dir):
                     lambda x: str(fcounter)+"-"+str(x))
         except KeyError:
             sys.exit("ERROR: Cell_Index missing from data. Have you preprocessed it?")
-        no_arc = no_arc.append(df, ignore_index=True)
+        no_arc = pd.concat([no_arc, df], ignore_index=True)
     return no_arc, filelist
 
 #Function to concatenate all files and save as txt -> DEPRECATE IN THE NEAR FUTURE!
 def concatenate_save(input_dir, output_dir):
+    """
+    Reads and concatenates multiple tab-separated mass cytometry text files from a directory and saves the concatenated data to a single file.
+
+    Args:
+        input_dir (str): The directory path containing the input text files.
+        output_dir (str): The directory path where the concatenated file will be saved.
+
+    Returns:
+        None
+    """
     input_files = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
     concat = pd.DataFrame()
     #Add counter to keep track of the number of files in input ->
```
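The pattern both concatenation functions share, in miniature (in-memory stand-ins replace the file reads):

```python
import pandas as pd

# Hypothetical per-file frames standing in for parsed .txt/.fcs inputs.
parsed_files = [pd.DataFrame({"Cell_Index": [0, 1]}),
                pd.DataFrame({"Cell_Index": [0, 1]})]

no_arc = pd.DataFrame()
for fcounter, df in enumerate(parsed_files, start=1):
    # Prefix each cell index with a file counter so indices stay unique
    # (and traceable to their source file) after concatenation.
    df["Sample_ID-Cell_Index"] = df["Cell_Index"].apply(lambda x: f"{fcounter}-{x}")
    no_arc = pd.concat([no_arc, df], ignore_index=True)
```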
```diff
@@ -157,14 +199,26 @@ def concatenate_save(input_dir, output_dir):
         df["Sample_ID-Cell_Index"] = df["Cell_Index"].apply(
             lambda x: str(fcounter)+"-"+str(x)) #File+ID #This way the cell-index will be preserved after Cytobank upload
         # df["Cell_Index"] = df["Cell_Index"].apply(lambda x: str(fcounter)+"-"+str(x)) #File+ID
-        concat = concat.append(df, ignore_index=True)
+        concat = pd.concat([concat, df], ignore_index=True)
     print("Concatenating...")
     concat.to_csv(f'{output_dir}/concat_{name}.txt', index = False, sep = '\t')
     print(f"Concatenated file saved as:\nconcat_{name}.txt")
 
 #Downsample dataframe by column and save to file which IDs were removed
 def downsample_data(no_arc, info_run, output_dir,
-                    split_bycol="file_identifier"):
+                    split_bycol="file_identifier"):
+    """
+    Performs downsampling on a DataFrame based on a specified column for splitting the data.
+
+    Args:
+        no_arc (pandas.DataFrame): The input DataFrame containing the data to be downsampled.
+        info_run (str): The name or identifier for the downsampling run.
+        output_dir (str): The directory path where the downsampling status file will be saved.
+        split_bycol (str, optional): The column name to split the data for downsampling. Defaults to "file_identifier".
+
+    Returns:
+        reduced_df (pandas.DataFrame): The downsampled DataFrame.
+    """
     downsampled_dframe = no_arc.copy()
     #Defiine downsampling size (N) per file: at least N cells in all input files
     downsample_size = downsampled_dframe[split_bycol].value_counts().min()
@@ -190,11 +244,30 @@
 
 # Random downsampling of a dataframe to n rows
 def downsample_df(df, n):
+    """
+    Performs downsampling on a DataFrame by randomly selecting a specified number of rows.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame to be downsampled.
+        n (int): The number of rows to be randomly selected.
+
+    Returns:
+        df_downsampled (pandas.DataFrame): The downsampled DataFrame.
+    """
     df_downsampled = df.sample(n)
     return df_downsampled
 
 #Function to read a .csv file of the panel's markers with some to be selected
 def read_marker_csv(input_dir):
+    """
+    Reads a marker CSV file from the input directory and extracts the markers that are flagged for use.
+
+    Args:
+        input_dir (str): The directory path containing the marker CSV file.
+
+    Returns:
+        selected_markers (list): A list of selected markers.
+    """
     marker_files = [f for f in os.listdir(f"{input_dir}") if f.endswith(".csv")]
     if len(marker_files) != 1: #Sanity check
         sys.exit("ERROR: There should be ONE .csv file with the markers to use in the input folder!")
```
```diff
@@ -205,6 +278,16 @@ def read_marker_csv(input_dir):
     return selected_markers
 
 def write_panel_emd(df, input_dir):
+    """
+    Writes a panel markers CSV file based on the unique markers present in a DataFrame of EMD scores.
+
+    Args:
+        df (pandas.DataFrame): The input EMD DataFrame containing marker information.
+        input_dir (str): The directory path where the panel markers CSV file will be written.
+
+    Returns:
+        None
+    """
     all_markers = list(set(df['marker']))
     counter_marker = []
     for i in all_markers:
@@ -213,6 +296,16 @@ def write_panel_emd(df, input_dir):
     markers.to_csv(f"{input_dir}/panel_markers.csv", index=False, header=False)
 
 def write_panel_dremi(df, input_dir):
+    """
+    Writes a panel markers CSV file based on the unique markers present in a DataFrame of DREMI scores.
+
+    Args:
+        df (pandas.DataFrame): The input DREMI DataFrame containing marker information.
+        input_dir (str): The directory path where the panel markers CSV file will be written.
+
+    Returns:
+        None
+    """
     all_markers = list(set(df['marker_x']))
     counter_marker = []
     for i in all_markers:
@@ -222,6 +315,16 @@ def write_panel_dremi(df, input_dir):
 
 #Simple yes or no input function (default NO)
 def yes_or_NO(question, default="NO"):
+    """
+    Prompts the user with a yes or no question and returns a boolean value based on the response.
+
+    Args:
+        question (str): The question to prompt the user.
+        default (str): The default response. Options: "NO" (default) or "YES".
+
+    Returns:
+        bool: True if the user's response is yes, False otherwise.
+    """
     if default.lower() == "no":
         while True:
             reply = str(input(question+' (y/[N]): ')).lower().strip()
```

conda_env.yml (+3 -3)
```diff
@@ -3,9 +3,9 @@ channels:
 #  - bioconda
   - conda-forge
 dependencies:
-  - python=3.8
-  - pip=21
-  - r-base=4.0
+  - python=3
+  - pip
+  - r-base=4
   - natsort
   - numpy #ALLarch
   - pandas=1.5 #ALLarch
```
