|
| 1 | +############################################################################### |
| 2 | +#~~~~~~~~~~~~~~~~~~~~~~~~~~#~Batch rename panel markers~#~~~~~~~~~~~~~~~~~~~~~~~~~~# |
| 3 | +############################################################################### |
#OPTIONAL: This script renames the channel names in a collection of datasets
| 5 | +#so that only the channel isotope (number and element) and the antibody target |
| 6 | +#are kept, ignoring any version numbers and other details. |
| 7 | +# Works with both .txt files and FCS files. |
| 8 | + |
| 9 | +import os |
| 10 | +import re |
| 11 | +import sys # Fix importing from diff. directory |
| 12 | +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) |
| 13 | + |
| 14 | +import fcsparser |
| 15 | +import fcswrite |
| 16 | +import pandas as pd |
| 17 | + |
| 18 | +from aux.aux1_data_preprocess import filter_columns, write_panel_markers |
| 19 | +from aux.aux_functions import yes_or_NO |
| 20 | + |
| 21 | +#Future WIP: Add support for sequential hands off -> if flag use set of seq i/o |
| 22 | +# sequential_mode = vars(sys.modules[__name__])['__package__'] |
| 23 | +# print(sequential_mode) #Will populate if run from superior script |
| 24 | + |
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~I/O~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Resolve the project base directory (three levels above this file) and the
# per-option input/output folders.
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
folder_name = "opt6_renameMIN"

input_dir = f"{base_dir}/Utils_Data/input/{folder_name}"
output_dir = f"{base_dir}/Utils_Data/output/{folder_name}"

# Prepare file list; put the data files to be processed in the 'input' folder
txt_filelist = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
fcs_filelist = [f for f in os.listdir(input_dir) if f.endswith(".fcs")]
filelist = txt_filelist + fcs_filelist

if not filelist:
    sys.exit(f"ERROR: There are no files in {input_dir}!")

# Defensive defaults: previously these flags were only bound inside the
# conditionals below, leaving them undefined when one file type is absent.
# They are only consulted for the matching file type, but defining them
# unconditionally removes the latent NameError.
txt_sopts = False
fcs_sopts = False
if txt_filelist:
    print("Found the following .txt files: ", txt_filelist)
    txt_sopts = yes_or_NO("Would you like to save the processed .txt files also in .fcs format?")
if fcs_filelist:
    print("Found the following .fcs files: ", fcs_filelist)
    fcs_sopts = yes_or_NO("Would you like to save the processed .fcs files also in .txt format?")


# Ask for a run name used as the output subfolder; fall back to UNNAMED.
info_run = input("Write info run (using no spaces!): ")
if not info_run:
    print("No info run given. Saving results in UNNAMED")
    info_run = "UNNAMED"

run_dir = f"{output_dir}/{info_run}"
if not os.path.isdir(run_dir):
    os.makedirs(run_dir)
elif info_run != "UNNAMED":
    # Refuse to clobber a previous named run; only UNNAMED may be reused.
    sys.exit("ERROR: You already used this name for a previous run. \nUse a different name!")
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
| 58 | + |
| 59 | +#~~~~~~~~~~~~~~~~~~~~~~~~~~Minimal renaming function~~~~~~~~~~~~~~~~~~~~~~~~~~# |
| 60 | + |
def renameMIN_columns(df_file_cols):
    """Minimally clean a list of channel/column names.

    Keeps only the channel isotope (number + element) and the antibody
    target, stripping version numbers and other suffix details:
      1. Remove suffix constructs starting with '__'+lowercase letter,
         '__'+digit, '_(' (bracketed endings), '___', or a trailing
         '_v<digit>' antibody version tag (unique to opt6).
      2. Strip a single trailing underscore left by step 1.
      3. Collapse any remaining '__' to '_'.
      4. Rename 'Event #' to 'Cell_Index' (keeping with Xiao's convention).

    Parameters
    ----------
    df_file_cols : iterable
        Original column names; non-string entries are passed through
        unchanged (matching the original try/except behaviour).

    Returns
    -------
    list
        Cleaned column names, same order and length as the input.
    """
    # Raw strings avoid invalid-escape warnings; compiled once, outside the loop.
    reg_rename = re.compile(r"(__[a-z].*$|__\d.*$|_\(.*$|___.*$|_v\d$)")

    cleaned = []
    for name in df_file_cols:
        if not isinstance(name, str):
            # The original bare except fell back to the raw value for
            # anything re.sub could not handle; keep that behaviour explicit.
            cleaned.append(name)
            continue
        name = reg_rename.sub("", name)   # strip suffix constructs
        name = re.sub(r"_$", "", name)    # strip a trailing underscore
        name = re.sub(r"__", "_", name)   # collapse double underscores
        cleaned.append("Cell_Index" if name == "Event #" else name)
    return cleaned
| 94 | + |
| 95 | +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# |
| 96 | + |
| 97 | + |
| 98 | + |
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Pre-processing~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Main loop: read each input file (.txt tab-separated or .fcs), rename and
# filter its channel columns, ensure a Cell_Index column exists, then save
# the result (optionally also in the other format).
cols = []        # per-file lists of retained columns (marker columns when filtered)
no_filter=False  # set True by read_rFCS when filtering must be skipped
                 # NOTE(review): never reset inside the loop, so once True it
                 # disables filtering for ALL subsequent files — confirm intended.

for i in filelist:
    file_path = f"{input_dir}/{i}"
    if i in txt_filelist:
        # if format_filelist=="txt":
        df_file = pd.read_csv(file_path, sep = '\t')
        print(i)
    else:
        try: #Use fcsparser to read the fcs data files
            print (i)
            metafcs,df_file = fcsparser.parse(file_path, meta_data_only=False,
                                                channel_naming='$PnS')
            # nonstandard_FCS = "NO"
            # Despite requesting $PnS names, some files still yield $PnN-style
            # names ending in '<digits>Di'; treat those as non-standard and
            # force the fallback reader below by raising the same exception
            # fcsparser uses for unsupported features.
            reg_pnn = re.compile("(\d+Di$)") #Detect if, despite flag
            pnn_extracted=[] #columns match PnN pattern
            for n in df_file.columns.values.tolist():
                if reg_pnn.search(n):
                    pnn_extracted.append(n)
            if len(pnn_extracted)!=0:
                raise fcsparser.api.ParserFeatureNotImplementedError
            # print(df_file.columns)
        except fcsparser.api.ParserFeatureNotImplementedError:
            print("WARNING: Non-standard .fcs file detected: ", i)
            print("This might take a while. Please take care and check the output")
            from aux.aux_functions import read_rFCS # Import only if needed
                                                    # (pulls in rpy2/R runtime)

            #use rpy2 to read the files and load into python
            df_file, no_filter = read_rFCS(file_path)
            # print(df_file.columns)
            #print ("remove:\n", df_file)
            # nonstandard_FCS ="YES" #Offer to save as .txt by default

    shape_before = df_file.shape
    df_file_cols = list(df_file.columns)

    #for i in df_file_cols: print(i)

    #%% Perform renaming and filtering
    try:
        if no_filter==False:
            # Rename columns, then keep only the subset filter_columns approves.
            renamed_columns = renameMIN_columns(df_file_cols)
            columns_to_keep, filtered_columns = filter_columns(renamed_columns)
            df_file.columns = renamed_columns
            f_reduced = df_file[columns_to_keep].iloc[:].copy()
            print ("Removed the following columns: ", filtered_columns)

            #Store columns present in each of the input files
            # (leading digit => isotope-tagged marker channel)
            cols.append([x for x in f_reduced.columns if x[0].isdigit()])

            shape_after = f_reduced.shape
            print (
                f"file: {i}\n\trows before: {shape_before[0]} - columns before: {shape_before[1]}\n\trows after: {shape_after[0]} - columns after: {shape_after[1]}\n")
        else:
            print("No filtering being performed")
            f_reduced = df_file
            cols.append(df_file_cols)
    except:
        # NOTE(review): bare except — deliberately best-effort (falls back to
        # the unchanged panel), but it also hides genuine errors; consider
        # narrowing to Exception and logging the cause.
        print("Column names processing and filtering failed. Check the format!",
            "Using original unchanged panel")
        f_reduced = df_file
        cols.append(df_file_cols)

    #Add Cell_Index column when missing (row order becomes the cell index).
    if "Cell_Index" not in f_reduced.columns:
        print("MISSING CELL_INDEX")
        f_reduced.reset_index(inplace=True)
        f_reduced.rename({"index": "Cell_Index"}, axis="columns", inplace=True)
    f_reduced["Cell_Index"] = pd.to_numeric(f_reduced["Cell_Index"])
    print(f_reduced) #Print final dataframe

    #Saving files#: .txt inputs -> .txt (plus optional .fcs);
    #               .fcs inputs -> .fcs (plus optional .txt).
    if i in txt_filelist:
        f_reduced.to_csv(f"{output_dir}/{info_run}/renamedMIN_{i}",
            index = False, sep = '\t')
        # index = False to be compatible with Cytobank
        if txt_sopts:
            #SAVE AS FCS
            fcswrite.write_fcs(f"{output_dir}/{info_run}/renamedMIN_{i}.fcs",
                                chn_names=list(f_reduced.columns),
                                compat_chn_names=False,
                                data=f_reduced.to_numpy())

    else:
        fcswrite.write_fcs(f"{output_dir}/{info_run}/renamedMIN_{i}",
                            chn_names=list(f_reduced.columns),
                            compat_chn_names=False,
                            data=f_reduced.to_numpy())
        if fcs_sopts:
            print("Converting .fcs to .txt")
            f_reduced.to_csv(f"{output_dir}/{info_run}/renamedMIN_{i}.txt",
                index = False, sep = '\t') #Changed to index=False
| 193 | + |
| 194 | + |
0 commit comments