
Commit ac58c15

Merge pull request #56 from TAPE-Lab/testv03: Dependency updates

2 parents (00c3a1a + 84e83a1) → commit ac58c15

8 files changed (+190 -17 lines)

README.md (+7 -1)
```diff
@@ -1,5 +1,11 @@
 <!-- [![Documentation Status](https://readthedocs.org/projects/cytof-dataanalysis/badge/?version=latest)](https://cytof-dataanalysis.readthedocs.io/en/latest/?badge=latest) -->
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4587193.svg)](https://doi.org/10.5281/zenodo.4587193)
+[![zenodo:10.5281/zenodo.4587193](https://img.shields.io/badge/Zenodo-10.5281%2Fzenodo.4587193-4B81BE.svg)](https://doi.org/10.5281/zenodo.4587193) <!-- 4B81BE is the colour for Zenodo -->
+[![natprotocols:10.1038/s41596-021-00603-4](https://img.shields.io/badge/DOI-10.1038%2Fs41596--021--00603--4-644B96.svg)](https://doi.org/10.1038/s41596-021-00603-4) <!-- 644B96 is the colour for Nature Protocols -->
+
+<!-- Development branch to add some new functionality related to Dimensionality reduction
+(and perhaps clustering), adding ARM support (for Apple silicon), and improving
+code annotation for our functions.
+Take also as an opportunity to redefine the conda environment file. -->
 
 # **Cy**TOF Si**gn**alling An**al**ysis (*CyGNAL*)
 
```

code/4-dremi.py (+1 -1)
```diff
@@ -145,7 +145,7 @@
         if num_outliers_total == 0:
             df_info_dict[colname_arc] = "-" # this is a placeholder
         # Store the info for each marker pair in df_info
-        df_info = df_info.append(df_info_dict, ignore_index=True)
+        df_info = pd.concat([df_info, df_info_dict], ignore_index=True)
 
 
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Save to file~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
```
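Note: `pd.concat` only accepts Series/DataFrame objects, so passing `df_info_dict` directly (as in the `+` line above) will raise a `TypeError` on current pandas; the `aux3_emd.py` hunk further down wraps the dict in a one-row DataFrame, which is likely what is intended here too. A minimal sketch of the working pattern (column and marker names made up):

```python
import pandas as pd

df_info = pd.DataFrame(columns=["marker_x", "marker_y"])
df_info_dict = {"marker_x": "89Y_CD45", "marker_y": "115In_pRB"}  # hypothetical row

# DataFrame.append() was removed in pandas 2.0; the drop-in replacement is
# pd.concat with the dict wrapped in a one-row DataFrame.
df_info = pd.concat([df_info, pd.DataFrame([df_info_dict])], ignore_index=True)
```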

code/aux/aux1_data_preprocess.py (+32 -2)
```diff
@@ -9,8 +9,17 @@
 #Idea is to rename all columns and then filter non-relevant ones (less optimal,
 # easier and more compatible with writing new reduced file in the last step)
 
-#Filtering
+#Filtering: The function allows you to filter out columns based on a specific pattern, which can be useful for removing columns that are not relevant to your analysis.
 def filter_columns(renamed_columns):
+    """
+    Filters out unnecessary columns from the list of renamed columns.
+
+    Args:
+        renamed_columns (list): List of renamed columns.
+
+    Returns:
+        tuple: A tuple containing two lists - columns to keep and filtered columns.
+    """
     reg_filter = re.compile("^\d+[A-Za-z]+$") #Removes columns with just isotope
     filtered_columns = [] #Stores the columns that where deemed unnecessary
     columns_to_keep = [] #Columns that the reduced file should have
```
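To make the filter concrete, here is what that pattern keeps and drops on some made-up CyTOF column names:

```python
import re

reg_filter = re.compile(r"^\d+[A-Za-z]+$")  # matches isotope-only names

# Hypothetical columns: bare isotope channels are dropped, while
# isotope_marker composites and metadata columns are kept.
cols = ["89Y", "140Ce", "89Y_CD45", "Event_length"]
dropped = [c for c in cols if reg_filter.match(c)]
print(dropped)  # ['89Y', '140Ce']
```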
```diff
@@ -22,8 +31,17 @@ def filter_columns(renamed_columns):
     return columns_to_keep, filtered_columns
 
 
-#Renaming
+#Renaming: The function helps standardize and clean the column names, making them more consistent and suitable for further analysis.
 def rename_columns(df_file_cols):
+    """
+    Renames the column names by removing specific patterns and applying renaming rules.
+
+    Args:
+        df_file_cols (list): List of column names.
+
+    Returns:
+        list: List of renamed column names.
+    """
     reg_rename = re.compile("(__[a-z].*$|__\d.*$|_\(.*$|___.*$)")
     #First two options match ending constructs with double underscores
     #Third option matches endings within brackets
```
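As an illustration, the suffix-stripping regex in action on some invented raw channel names:

```python
import re

reg_rename = re.compile(r"(__[a-z].*$|__\d.*$|_\(.*$|___.*$)")

# Hypothetical raw names as exported by acquisition software; the regex
# strips double-underscore and bracketed suffixes from the end of the name.
raw = ["89Y_CD45__v2", "115In_pRB___dup", "146Nd_CD8_(v3)"]
print([reg_rename.sub("", c) for c in raw])
# ['89Y_CD45', '115In_pRB', '146Nd_CD8']
```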
```diff
@@ -59,7 +77,19 @@ def rename_columns(df_file_cols):
 #Add also the generation of a .csv file with the markers in the panel.
 #It should be ok to do it here b4 concatenation in the next step because if
 # they are to be concatenaded they shpould already have the same panel of markers
+#The function generates a panel markers file that can be used to indicate the selection status of markers. The file will contain marker names along with "N" values indicating that none of the markers are selected.
 def write_panel_markers(cols, output_dir, info_run):
+    """
+    Writes the panel markers to a panel markers file.
+
+    Args:
+        cols (list): List of markers (column names).
+        output_dir (str): Output directory path.
+        info_run (str): Information about the run.
+
+    Returns:
+        None
+    """
     all_markers = cols[0]
     counter_marker = []
     for i in all_markers:
```
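Based on the description above (and the `to_csv` call visible in `aux_functions.py` below), the generated file is presumably a two-column, headerless CSV; a sketch of what producing it would look like, with made-up markers:

```python
import pandas as pd

# Every marker starts unselected ("N"); users flip entries to "Y" to pick
# the markers downstream steps should use.
markers = pd.DataFrame({"marker": ["89Y_CD45", "115In_pRB"],
                        "flag": ["N", "N"]})
markers.to_csv("panel_markers.csv", index=False, header=False)
```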

code/aux/aux2_umap.py (+12 -5)
```diff
@@ -12,8 +12,19 @@
 # UMAP function
 # umap embedding calculation; result saved in a pandas dataframe
 # the names of the umap info columns are also defined here
-
+#The function applies UMAP dimensionality reduction to the input data and combines it with the original data, providing the UMAP-transformed data as the output.
 def perform_umap(umap_params, all_together_vs_marks, no_arc):
+    """
+    Performs UMAP dimensionality reduction on the given data.
+
+    Args:
+        umap_params (dict): UMAP parameters including "info", "n", "m", "d", "comp", "rs", and "nsr".
+        all_together_vs_marks (array-like): Input data for UMAP transformation.
+        no_arc (DataFrame): Untransformed data.
+
+    Returns:
+        DataFrame: UMAP-transformed data with added UMAP dimension columns.
+    """
     info_run = umap_params["info"]
     run_name = "UMAP_"+info_run
     #Calculate UMAP on arc tranf data (all_together...)
@@ -33,7 +44,3 @@ def perform_umap(umap_params, all_together_vs_marks, no_arc):
     no_arc = no_arc.join(umap_emb)
 
     return no_arc
-
-
-
-
```
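A minimal sketch of the embedding step this function wraps, using umap-learn directly (toy data; the parameter values and UMAP column names here are illustrative, not CyGNAL's):

```python
import pandas as pd
import umap  # umap-learn

# Toy arcsinh-transformed marker matrix (rows = cells).
marks = pd.DataFrame({"89Y_CD45": [0.1, 0.5, 0.9, 0.2, 0.7],
                      "115In_pRB": [0.3, 0.7, 0.4, 0.8, 0.2]})

# n_neighbors kept tiny because the toy data has only five rows.
reducer = umap.UMAP(n_neighbors=3, min_dist=0.1, n_components=2, random_state=42)
embedding = reducer.fit_transform(marks)

# Join the embedding back onto the untransformed data, as perform_umap does.
umap_emb = pd.DataFrame(embedding, columns=["UMAP_dim1", "UMAP_dim2"], index=marks.index)
no_arc = marks.join(umap_emb)  # `marks` stands in for the untransformed DataFrame
```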

code/aux/aux3_emd.py (+15 -2)
```diff
@@ -6,9 +6,22 @@
 import scprep
 
 # Function to calculate EMD
-
+#The function calculates the EMD between two distributions for each marker and stores the results in a DataFrame. The EMD measures the minimum amount of work required to transform one distribution into another, considering the distances between individual data points.
 def calculate_emd(marker_list, emd_infodict, compare_from, compare_to,
                     emd_df):
+    """
+    Calculates Earth Mover's Distance (EMD) between two distributions for each marker in the given marker list.
+
+    Args:
+        marker_list (list): List of markers.
+        emd_infodict (dict): EMD information dictionary.
+        compare_from (DataFrame): Data for comparison from.
+        compare_to (DataFrame): Data for comparison to.
+        emd_df (DataFrame): DataFrame to store EMD results.
+
+    Returns:
+        DataFrame: Updated DataFrame with EMD results.
+    """
     deprecated_string = "no_norm" #No normalisation implemented. Deprecate
     for marker in marker_list:
         emd_infodict["marker"] = marker
@@ -30,6 +43,6 @@ def calculate_emd(marker_list, emd_infodict, compare_from, compare_to,
                                     compare_from[marker],
                                     compare_to[marker])
     #Add EMD score to the output dataframe
-    emd_df = emd_df.append(emd_infodict, ignore_index=True)
+    emd_df = pd.concat([emd_df, pd.DataFrame([emd_infodict])], ignore_index=True)
 
     return emd_df
```
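For reference, the underlying scprep call on two toy one-marker distributions (values invented):

```python
import numpy as np
import scprep

# Two hypothetical single-marker intensity distributions.
compare_from = np.array([0.1, 0.4, 0.5, 0.9])
compare_to = np.array([0.2, 0.6, 0.7, 1.1])

# scprep.stats.EMD returns the earth mover's distance between two 1-D
# distributions; CyGNAL records one such score per marker.
print(scprep.stats.EMD(compare_from, compare_to))
```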

code/aux/aux4_dremi.py (+14)
```diff
@@ -12,7 +12,21 @@
 # find outliers for both marker_x and marker_y based on cufoffs of standard deviations
 # return the number of outliers and a dataframe after outlier removal
 # update the df_info_dict with the number of outliers
+#The function identifies outliers based on the absolute difference between each marker's value and its mean, normalized by the marker's standard deviation. Rows that have values exceeding the cutoff multiplied by the standard deviation are considered outliers.
 def outlier_removal(df, cutoff, marker_x, marker_y, df_info_dict):
+    """
+    Removes outliers from a DataFrame based on a cutoff value and specific markers.
+
+    Args:
+        df (DataFrame): Input DataFrame.
+        cutoff (float): Cutoff value for outlier removal.
+        marker_x (str): Marker column name for x-axis.
+        marker_y (str): Marker column name for y-axis.
+        df_info_dict (dict): Dictionary to store outlier information.
+
+    Returns:
+        tuple: A tuple containing the number of total outliers removed and the DataFrame without outliers.
+    """
     num_outliers_total = 0
     num_outliers_x = 0
     num_outliers_y = 0
```
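That description boils down to a z-score mask; a compact sketch with toy data (the cutoff is deliberately tight so the outlier actually trips it):

```python
import pandas as pd

df = pd.DataFrame({"marker_x": [0.10, 0.20, 0.15, 5.0],
                   "marker_y": [0.30, 0.30, 0.30, 0.3]})
cutoff = 1.0  # unusually tight, just for the toy data

# Keep rows within `cutoff` standard deviations of the mean on both markers.
keep = ((df["marker_x"] - df["marker_x"].mean()).abs() <= cutoff * df["marker_x"].std()) \
     & ((df["marker_y"] - df["marker_y"].mean()).abs() <= cutoff * df["marker_y"].std())

df_wo_outliers = df[keep]
num_outliers_total = int((~keep).sum())  # 1: the (5.0, 0.3) row
```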

code/aux/aux_functions.py (+106 -3)
```diff
@@ -11,6 +11,17 @@
 
 #Read broken FCS through r.flowCore
 def read_rFCS(file_path):
+    """
+    Reads an FCS file using R packages and returns a pandas DataFrame containing the data.
+
+    Args:
+        file_path (str): The path to the FCS file.
+
+    Returns:
+        df_file (pandas.DataFrame): A DataFrame containing the FCS data.
+        no_filter (bool): Indicates whether filtering was applied to the columns.
+            True if no filtering was applied, False otherwise.
+    """
     from rpy2.robjects import globalenv, pandas2ri, r
     from rpy2.robjects.packages import importr
     from rpy2.rinterface_lib.callbacks import logger
```
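A rough sketch of the rpy2/flowCore round-trip this function builds on (file name hypothetical; rpy2's conversion API has shifted across versions, so this uses the older `activate()` shim, and the real read_rFCS adds error handling, renaming, and filtering omitted here):

```python
import pandas as pd
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr

pandas2ri.activate()  # global converter; fine for a short script

flowCore = importr("flowCore")  # requires flowCore in the local R library

# read.FCS parses the file into a flowFrame (rpy2 maps read.FCS to read_FCS);
# exprs() pulls out the raw event matrix, handed back as an R data.frame so
# the pandas converter can return a DataFrame.
ff = flowCore.read_FCS("example.fcs", transformation=False)
df_file = r["as.data.frame"](flowCore.exprs(ff))
```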
```diff
@@ -87,6 +98,17 @@ def read_rFCS(file_path):
 
 #Arcsinh transform the data
 def arcsinh_transf(cofactor, no_arc):
+    """
+    Applies the arcsinh transformation to selected columns of a DataFrame.
+
+    Args:
+        cofactor (float): The scaling factor for the arcsinh transformation.
+        no_arc (pandas.DataFrame): The DataFrame containing the data to be transformed.
+
+    Returns:
+        arc (pandas.DataFrame): The DataFrame with the selected columns transformed using arcsinh.
+        cols (list): The list of column names that were transformed.
+    """
     #Select only the columns containing the markers (as they start with a number for the isotope)
     cols = [x for x in no_arc.columns if x[0].isdigit()]
     #Apply the arcsinh only to those columns (don't want to change time or any other)
```
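The transform itself is just `arcsinh(x / cofactor)` applied to the isotope-prefixed columns; a toy version (cofactor 5 is a common CyTOF choice, used here only as an example):

```python
import numpy as np
import pandas as pd

no_arc = pd.DataFrame({"89Y_CD45": [0.0, 10.0, 100.0], "Time": [1, 2, 3]})
cofactor = 5

# Marker columns start with the isotope mass, so select on a leading digit;
# Time and other metadata columns are left untouched.
cols = [x for x in no_arc.columns if x[0].isdigit()]
arc = no_arc.copy()
arc[cols] = np.arcsinh(no_arc[cols] / cofactor)
```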
```diff
@@ -100,6 +122,16 @@ def arcsinh_transf(cofactor, no_arc):
 
 #Function to concatenate all files: Read input .txt and .fcs. Sanity check. Concatenate
 def concatenate_fcs(input_dir):
+    """
+    Reads and concatenates multiple mass cytometry FCS or text files from a directory.
+
+    Args:
+        input_dir (str): The directory path containing the FCS and text files.
+
+    Returns:
+        no_arc (pandas.DataFrame): The concatenated DataFrame containing the data from all files.
+        filelist (list): The list of file names that were read and concatenated.
+    """
     txt_filelist = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
     fcs_filelist = [f for f in os.listdir(input_dir) if f.endswith(".fcs")]
     filelist = txt_filelist+fcs_filelist
@@ -139,11 +171,21 @@ def concatenate_fcs(input_dir):
                     lambda x: str(fcounter)+"-"+str(x))
         except KeyError:
             sys.exit("ERROR: Cell_Index missing from data. Have you preprocessed it?")
-        no_arc = no_arc.append(df, ignore_index=True)
+        no_arc = pd.concat([no_arc, df], ignore_index=True)
     return no_arc, filelist
 
 #Function to concatenate all files and save as txt -> DEPRECATE IN THE NEAR FUTURE!
 def concatenate_save(input_dir, output_dir):
+    """
+    Reads and concatenates multiple tab-separated mass cytometry text files from a directory and saves the concatenated data to a single file.
+
+    Args:
+        input_dir (str): The directory path containing the input text files.
+        output_dir (str): The directory path where the concatenated file will be saved.
+
+    Returns:
+        None
+    """
     input_files = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
     concat = pd.DataFrame()
     #Add counter to keep track of the number of files in input ->
```
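The pattern both concatenation functions share, in miniature (in-memory stand-ins replace the file reads):

```python
import pandas as pd

# Hypothetical per-file frames standing in for parsed .txt/.fcs inputs.
parsed_files = [pd.DataFrame({"Cell_Index": [0, 1]}),
                pd.DataFrame({"Cell_Index": [0, 1]})]

no_arc = pd.DataFrame()
for fcounter, df in enumerate(parsed_files, start=1):
    # Prefix each cell index with a file counter so indices stay unique
    # (and traceable to their source file) after concatenation.
    df["Sample_ID-Cell_Index"] = df["Cell_Index"].apply(lambda x: f"{fcounter}-{x}")
    no_arc = pd.concat([no_arc, df], ignore_index=True)
```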
```diff
@@ -157,14 +199,26 @@ def concatenate_save(input_dir, output_dir):
         df["Sample_ID-Cell_Index"] = df["Cell_Index"].apply(
             lambda x: str(fcounter)+"-"+str(x)) #File+ID #This way the cell-index will be preserved after Cytobank upload
         # df["Cell_Index"] = df["Cell_Index"].apply(lambda x: str(fcounter)+"-"+str(x)) #File+ID
-        concat = concat.append(df, ignore_index=True)
+        concat = pd.concat([concat, df], ignore_index=True)
     print("Concatenating...")
     concat.to_csv(f'{output_dir}/concat_{name}.txt', index = False, sep = '\t')
     print(f"Concatenated file saved as:\nconcat_{name}.txt")
 
 #Downsample dataframe by column and save to file which IDs were removed
 def downsample_data(no_arc, info_run, output_dir,
-                    split_bycol="file_identifier"):
+                    split_bycol="file_identifier"):
+    """
+    Performs downsampling on a DataFrame based on a specified column for splitting the data.
+
+    Args:
+        no_arc (pandas.DataFrame): The input DataFrame containing the data to be downsampled.
+        info_run (str): The name or identifier for the downsampling run.
+        output_dir (str): The directory path where the downsampling status file will be saved.
+        split_bycol (str, optional): The column name to split the data for downsampling. Defaults to "file_identifier".
+
+    Returns:
+        reduced_df (pandas.DataFrame): The downsampled DataFrame.
+    """
     downsampled_dframe = no_arc.copy()
     #Defiine downsampling size (N) per file: at least N cells in all input files
     downsample_size = downsampled_dframe[split_bycol].value_counts().min()
@@ -190,11 +244,30 @@
 
 # Random downsampling of a dataframe to n rows
 def downsample_df(df, n):
+    """
+    Performs downsampling on a DataFrame by randomly selecting a specified number of rows.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame to be downsampled.
+        n (int): The number of rows to be randomly selected.
+
+    Returns:
+        df_downsampled (pandas.DataFrame): The downsampled DataFrame.
+    """
     df_downsampled = df.sample(n)
     return df_downsampled
 
 #Function to read a .csv file of the panel's markers with some to be selected
 def read_marker_csv(input_dir):
+    """
+    Reads a marker CSV file from the input directory and extracts the markers that are flagged for use.
+
+    Args:
+        input_dir (str): The directory path containing the marker CSV file.
+
+    Returns:
+        selected_markers (list): A list of selected markers.
+    """
     marker_files = [f for f in os.listdir(f"{input_dir}") if f.endswith(".csv")]
     if len(marker_files) != 1: #Sanity check
         sys.exit("ERROR: There should be ONE .csv file with the markers to use in the input folder!")
```
```diff
@@ -205,6 +278,16 @@ def read_marker_csv(input_dir):
     return selected_markers
 
 def write_panel_emd(df, input_dir):
+    """
+    Writes a panel markers CSV file based on the unique markers present in a DataFrame of EMD scores.
+
+    Args:
+        df (pandas.DataFrame): The input EMD DataFrame containing marker information.
+        input_dir (str): The directory path where the panel markers CSV file will be written.
+
+    Returns:
+        None
+    """
     all_markers = list(set(df['marker']))
     counter_marker = []
     for i in all_markers:
@@ -213,6 +296,16 @@ def write_panel_emd(df, input_dir):
     markers.to_csv(f"{input_dir}/panel_markers.csv", index=False, header=False)
 
 def write_panel_dremi(df, input_dir):
+    """
+    Writes a panel markers CSV file based on the unique markers present in a DataFrame of DREMI scores.
+
+    Args:
+        df (pandas.DataFrame): The input DREMI DataFrame containing marker information.
+        input_dir (str): The directory path where the panel markers CSV file will be written.
+
+    Returns:
+        None
+    """
     all_markers = list(set(df['marker_x']))
     counter_marker = []
     for i in all_markers:
@@ -222,6 +315,16 @@ def write_panel_dremi(df, input_dir):
 
 #Simple yes or no input function (default NO)
 def yes_or_NO(question, default="NO"):
+    """
+    Prompts the user with a yes or no question and returns a boolean value based on the response.
+
+    Args:
+        question (str): The question to prompt the user.
+        default (str): The default response. Options: "NO" (default) or "YES".
+
+    Returns:
+        bool: True if the user's response is yes, False otherwise.
+    """
     if default.lower() == "no":
         while True:
             reply = str(input(question+' (y/[N]): ')).lower().strip()
```

conda_env.yml (+3 -3)
```diff
@@ -3,9 +3,9 @@ channels:
 #  - bioconda
   - conda-forge
 dependencies:
-  - python=3.8
-  - pip=21
-  - r-base=4.0
+  - python=3
+  - pip
+  - r-base=4
   - natsort
   - numpy #ALLarch
   - pandas=1.5 #ALLarch
```
