Skip to content

Commit f31df85

Browse files
committed
Added opt6 script to further process channel names.
Added new dependency (natsorted) and minor changes to marker_changes in 1-preproc
1 parent b908ede commit f31df85

File tree

9 files changed

+223
-11
lines changed

9 files changed

+223
-11
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,4 @@ Preprocessed_Data/PDO21_UNTrep2/Pro_20200109__EGF__Test__P21_0ng_2.S__Phase.txt
159159
Preprocessed_Data/maria_bothtxtfcs.7z
160160
Analysis/EMD_output/UNNAMED/
161161
Allpackages_includingbaselibs.csv
162+
Utils_Data/output/opt6_renameMIN/UNNAMED/

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Linux distributions (including Ubuntu on [WSL](https://github.com/Microsoft/WSL)
3535
workflow and most computational steps.
3636
* `fcsparser`
3737
* `fcswrite`
38+
* `natsort`
3839
* `numpy`
3940
* `pandas`
4041
* `plotly`

Utils_Data/input/opt6_renameMIN/README.md

Whitespace-only changes.

Utils_Data/output/opt6_renameMIN/README.md

Whitespace-only changes.

code/1-data_preprocess.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
###############################################################################
22
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#~Pre-processing~#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
33
###############################################################################
4-
#FIRST STEP: Data and pranel preprocessing. Marker list generation.
4+
#FIRST STEP: Data and panel preprocessing. Marker list generation.
55
import os
66
import re
77
import sys
@@ -10,6 +10,8 @@
1010
import fcswrite
1111
import pandas as pd
1212

13+
from natsort import natsorted
14+
1315
from aux.aux1_data_preprocess import *
1416
from aux.aux_functions import yes_or_NO
1517

@@ -88,7 +90,7 @@
8890

8991
shape_before = df_file.shape
9092
df_file_cols = list(df_file.columns)
91-
93+
9294
#%% Perform renaming and filtering
9395
try:
9496
if no_filter==False:
@@ -147,7 +149,18 @@
147149

148150
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Panel markers~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
149151
if not all(x==cols[0] for x in cols):
150-
sys.exit("ERROR when generating shared marker panel:\nCheck your input files as THE PANELS DON'T MATCH!")
151-
else:
152-
write_panel_markers(cols, f"{output_dir}/{info_run}", info_run)
152+
print("WARNING when generating shared marker panel:\nCheck your input files as THE PANELS DON'T MATCH!")
153+
print("The panel_markers.csv file will contain only the following matching markers:\n")
154+
shared_cols = set(cols[0])
155+
for s in cols[1:]: #Use set intersection to get shared markers
156+
shared_cols.intersection_update(s)
157+
shared_cols = natsorted(list(shared_cols)) #Convert back to sorted list
158+
for marker in shared_cols:
159+
print(marker)
160+
print("\nIf the resulting panel_markers.csv does not have the desired markers,",
161+
"\nconsider building it manually or changing the files in the input directory.")
162+
163+
cols = [shared_cols,0] #Keep format as nested list
164+
165+
write_panel_markers(cols, f"{output_dir}/{info_run}", info_run)
153166

code/aux/aux1_data_preprocess.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
#Filtering
1313
def filter_columns(renamed_columns):
14-
reg_filter = re.compile("^\d+[A-Za-z]+$")
14+
reg_filter = re.compile("^\d+[A-Za-z]+$") #Removes columns with just isotope
1515
filtered_columns = [] #Stores the columns that where deemed unnecessary
1616
columns_to_keep = [] #Columns that the reduced file should have
1717
for i in renamed_columns:
@@ -25,11 +25,13 @@ def filter_columns(renamed_columns):
2525
#Renaming
2626
def rename_columns(df_file_cols):
2727
reg_rename = re.compile("(__[a-z].*$|__\d.*$|_\(.*$|___.*$)")
28+
#First two options match ending constructs with double underscores
29+
#Third option matches endings within brackets
2830
df_file_cols_processed = []
2931
df_file_cols_renamed = []
3032
df_file_cols_final = []
3133

32-
for i in df_file_cols:
34+
for i in df_file_cols: #First pass to remove most issues
3335
try:
3436
df_file_cols_processed.append(reg_rename.sub("",i))
3537
except:

code/utils/opt6_renameMIN.py

+194
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
###############################################################################
2+
#~~~~~~~~~~~~~~~~~~~~~~~~~~#~Batch rename panel markers~#~~~~~~~~~~~~~~~~~~~~~~~~~~#
3+
###############################################################################
4+
#OPTIONAL: This script renames the channel names in a collection of datasets
5+
#so that only the channel isotope (number and element) and the antibody target
6+
#are kept, ignoring any version numbers and other details.
7+
# Works with both .txt files and FCS files.
8+
9+
import os
10+
import re
11+
import sys # Fix importing from diff. directory
12+
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
13+
14+
import fcsparser
15+
import fcswrite
16+
import pandas as pd
17+
18+
from aux.aux1_data_preprocess import filter_columns, write_panel_markers
19+
from aux.aux_functions import yes_or_NO
20+
21+
#Future WIP: Add support for sequential hands off -> if flag use set of seq i/o
22+
# sequential_mode = vars(sys.modules[__name__])['__package__']
23+
# print(sequential_mode) #Will populate if run from superior script
24+
25+
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~I/O~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
26+
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
27+
folder_name = "opt6_renameMIN"
28+
29+
input_dir = f"{base_dir}/Utils_Data/input/{folder_name}"
30+
output_dir = f"{base_dir}/Utils_Data/output/{folder_name}"
31+
32+
# prepare file list; put the data files to be processed in the 'input' folder
33+
txt_filelist = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
34+
fcs_filelist = [f for f in os.listdir(input_dir) if f.endswith(".fcs")]
35+
filelist = txt_filelist+fcs_filelist
36+
37+
if len(filelist)==0:
38+
sys.exit (f"ERROR: There are no files in {input_dir}!")
39+
if len(txt_filelist)!=0:
40+
print("Found the following .txt files: ", txt_filelist)
41+
txt_sopts = yes_or_NO("Would you like to save the processed .txt files also in .fcs format?")
42+
if len(fcs_filelist)!=0:
43+
print("Found the following .fcs files: ", fcs_filelist)
44+
fcs_sopts = yes_or_NO("Would you like to save the processed .fcs files also in .txt format?")
45+
46+
47+
info_run = input("Write info run (using no spaces!): ")
48+
if len(info_run) == 0:
49+
print("No info run given. Saving results in UNNAMED")
50+
info_run = "UNNAMED"
51+
52+
if os.path.isdir(f"{output_dir}/{info_run}") == False:
53+
os.makedirs(f"{output_dir}/{info_run}")
54+
else:
55+
if info_run !="UNNAMED":
56+
sys.exit("ERROR: You already used this name for a previous run. \nUse a different name!")
57+
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
58+
59+
#~~~~~~~~~~~~~~~~~~~~~~~~~~Minimal renaming function~~~~~~~~~~~~~~~~~~~~~~~~~~#
60+
61+
def renameMIN_columns(df_file_cols):
62+
reg_rename = re.compile("(__[a-z].*$|__\d.*$|_\(.*$|___.*$|_v\d$)")
63+
#First two options match ending constructs with double underscores
64+
#Third option matches endings within brackets
65+
#Fourth option matches antibody version (unique to opt6)
66+
df_file_cols_processed = []
67+
df_file_cols_renamed = []
68+
df_file_cols_final = []
69+
70+
for i in df_file_cols: #First pass to remove most issues
71+
try:
72+
df_file_cols_processed.append(reg_rename.sub("",i))
73+
print(df_file_cols_processed)
74+
except:
75+
df_file_cols_processed.append(i)
76+
#Second pass to remove trailing underscores
77+
for i in df_file_cols_processed:
78+
try:
79+
df_file_cols_renamed.append(re.sub(r"_$","",i))
80+
except:
81+
df_file_cols_renamed.append(i)
82+
#Third pass replace '__' with '_'
83+
for i in df_file_cols_renamed:
84+
try:
85+
df_file_cols_final.append(re.sub(r"__","_",i))
86+
except:
87+
df_file_cols_final.append(i)
88+
# Keeping with Xiao's convention, rename Event # to Cell_Index
89+
for n,i in enumerate(df_file_cols_final):
90+
if i=="Event #":
91+
df_file_cols_final[n] = "Cell_Index"
92+
93+
return df_file_cols_final
94+
95+
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
96+
97+
98+
99+
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Pre-processing~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
100+
cols = []
101+
no_filter=False
102+
103+
for i in filelist:
104+
file_path = f"{input_dir}/{i}"
105+
if i in txt_filelist:
106+
# if format_filelist=="txt":
107+
df_file = pd.read_csv(file_path, sep = '\t')
108+
print(i)
109+
else:
110+
try: #Use fcsparser to read the fcs data files
111+
print (i)
112+
metafcs,df_file = fcsparser.parse(file_path, meta_data_only=False,
113+
channel_naming='$PnS')
114+
# nonstandard_FCS = "NO"
115+
reg_pnn = re.compile("(\d+Di$)") #Detect if, despite flag
116+
pnn_extracted=[] #columns match PnN pattern
117+
for n in df_file.columns.values.tolist():
118+
if reg_pnn.search(n):
119+
pnn_extracted.append(n)
120+
if len(pnn_extracted)!=0:
121+
raise fcsparser.api.ParserFeatureNotImplementedError
122+
# print(df_file.columns)
123+
except fcsparser.api.ParserFeatureNotImplementedError:
124+
print("WARNING: Non-standard .fcs file detected: ", i)
125+
print("This might take a while. Please take care and check the output")
126+
from aux.aux_functions import read_rFCS # Import only if needed
127+
128+
#use rpy2 to read the files and load into python
129+
df_file, no_filter = read_rFCS(file_path)
130+
# print(df_file.columns)
131+
#print ("remove:\n", df_file)
132+
# nonstandard_FCS ="YES" #Offer to save as .txt by default
133+
134+
shape_before = df_file.shape
135+
df_file_cols = list(df_file.columns)
136+
137+
#for i in df_file_cols: print(i)
138+
139+
#%% Perform renaming and filtering
140+
try:
141+
if no_filter==False:
142+
renamed_columns = renameMIN_columns(df_file_cols)
143+
columns_to_keep, filtered_columns = filter_columns(renamed_columns)
144+
df_file.columns = renamed_columns
145+
f_reduced = df_file[columns_to_keep].iloc[:].copy()
146+
print ("Removed the following columns: ", filtered_columns)
147+
148+
#Store columns present in each of the input files
149+
cols.append([x for x in f_reduced.columns if x[0].isdigit()])
150+
151+
shape_after = f_reduced.shape
152+
print (
153+
f"file: {i}\n\trows before: {shape_before[0]} - columns before: {shape_before[1]}\n\trows after: {shape_after[0]} - columns after: {shape_after[1]}\n")
154+
else:
155+
print("No filtering being performed")
156+
f_reduced = df_file
157+
cols.append(df_file_cols)
158+
except:
159+
print("Column names processing and filtering failed. Check the format!",
160+
"Using original unchanged panel")
161+
f_reduced = df_file
162+
cols.append(df_file_cols)
163+
164+
#Add Cell_Index column
165+
if "Cell_Index" not in f_reduced.columns:
166+
print("MISSING CELL_INDEX")
167+
f_reduced.reset_index(inplace=True)
168+
f_reduced.rename({"index": "Cell_Index"}, axis="columns", inplace=True)
169+
f_reduced["Cell_Index"] = pd.to_numeric(f_reduced["Cell_Index"])
170+
print(f_reduced) #Print final dataframe
171+
172+
#Saving files#:
173+
if i in txt_filelist:
174+
f_reduced.to_csv(f"{output_dir}/{info_run}/renamedMIN_{i}",
175+
index = False, sep = '\t')
176+
# index = False to be compatible with Cytobank
177+
if txt_sopts:
178+
#SAVE AS FCS
179+
fcswrite.write_fcs(f"{output_dir}/{info_run}/renamedMIN_{i}.fcs",
180+
chn_names=list(f_reduced.columns),
181+
compat_chn_names=False,
182+
data=f_reduced.to_numpy())
183+
184+
else:
185+
fcswrite.write_fcs(f"{output_dir}/{info_run}/renamedMIN_{i}",
186+
chn_names=list(f_reduced.columns),
187+
compat_chn_names=False,
188+
data=f_reduced.to_numpy())
189+
if fcs_sopts:
190+
print("Converting .fcs to .txt")
191+
f_reduced.to_csv(f"{output_dir}/{info_run}/renamedMIN_{i}.txt",
192+
index = False, sep = '\t') #Changed to index=False
193+
194+

conda_env.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ channels:
55
dependencies:
66
- python=3.8
77
- pip=21
8-
- r-base=4.0
8+
- r-base=4.0
9+
- natsort
910
- numpy #ALLarch
1011
- pandas #ALLarch
1112
- scikit-learn #ALLarch

dependency_troubleshoot.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#Script to test if all necessary dependencies have been installed:
22
import sys, importlib
33

4-
python_packages =["copy","fcsparser","fcswrite","itertools","numpy","pandas",
5-
"plotly","pynndescent","re","rpy2","scprep","sklearn",
6-
"subprocess","umap"]
4+
python_packages =["copy","fcsparser","fcswrite","itertools","natsort","numpy",
5+
"pandas","plotly","pynndescent","re","rpy2","scprep",
6+
"sklearn","subprocess","umap"]
77
#copy, itertools, re, subprocess -> come with Python
88
count=0
99
for i in python_packages:

0 commit comments

Comments
 (0)