forked from LungCellAtlas/scRNAseq_pipelines
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLCA_pipeline_setup.sh
executable file
·340 lines (295 loc) · 12.6 KB
/
LCA_pipeline_setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
#!/bin/bash
# A: Lisa Sikkema, 2020
# D: download files, set up conda environment and build reference genome for Lung Cell Atlas cellranger pipeline
# version of pipeline
pipeline_version="0.1.0"
# parameter defaults:
# Ensembl release
ensrel="99"
# Genome string
genomestring="GRCh38"
# species
species="homo_sapiens"
# Expected cell ranger version
expectedcrv="3.1.0"
# Resources for mkref
# number of cores
nthreads="20"
# Memory in GB
memgb="48"
# Include Sars-cov2 genome:
incl_sarscov2=false
# Working path
#work_path="."
work_path="/share/data2/000-10x-HCA-bams/HCA_test/"
script_path="/home/becavin/scRNAseq_pipelines/"
# script parts to run/skip:
download_files=true
create_env=true
build_ref_genome=true
usage() {
cat <<HELP_USAGE
Lung Cell Atlas pipeline version: v${pipeline_version}
Usage: $(basename "$0") [-htmsegcuDCRS]
Options:
-h show this help text
-t <n_threads> number of threads to be used by cellranger mkref
(default: ${nthreads})
-m <mem_in_Gb> memory in GB to be used by cellranger mkref
(default: ${memgb})
-s <species> species for genome building. Use homo_sapiens
for human. (default: ${species})
-e <ensembl_release> ensembl release of reference genome to build.
(default: ${ensrel})
-g <genome_release> genome release, must match with the
corresponding Ensembl release (see also Ensembls
fa file name for the release), use without the
patch id (e.g. GRCh37; GRCh38, GRCm38)
(default: ${genomestring})
-c <path_to_envs_dir> path to envs directory from your miniconda or
anaconda, with trailing slash, e.g.
/Users/doejohn/miniconda3/envs/
-u <user:pass> user:pass received from Lung Cell Atlas to
acquire access to files for download
-D <true|false> include download of required files. Set this
to false if files were already downloaded and
you want to skip this step. (default: ${create_env})
-C <true|false> include creation of conda environment. Set this to
false if conda environment is already created
and you want to skip this step.
(default: ${download_files})
-R <true|false> include building of reference genome. Set this
to false if reference genome has already been
built and you want to skipt this step.
(default: ${build_ref_genome})
-S <true|false> include Sars-cov2 in the reference genome. Set
this to true if Sars-cov2 should be included.
(default: ${incl_sarscov2})
HELP_USAGE
}
# TAKE IN ARGUMENTS
# go through optional arguments:
# preceding colon after getopts: don't allow for any automated error messages
# colon following flag letter: this flag requires an argument
while getopts ":ht:m:s:e:g:c:u:D:C:R:S:" opt; do
# case is bash version of 'if' that allows for multiple scenarios
case "$opt" in
# if h is added, print usage and exit
h ) usage
exit
;;
t ) nthreads=$OPTARG
;; # continue
m ) memgb=$OPTARG
;;
s ) species=$OPTARG
;;
e ) ensrel=$OPTARG
;;
g ) genomestring=$OPTARG
;;
c ) conda_envs_dir=$OPTARG
;;
u ) user_pass=$OPTARG
;;
D ) download_files=$OPTARG
;;
C ) create_env=$OPTARG
;;
R ) build_ref_genome=$OPTARG
;;
S ) incl_sarscov2=$OPTARG
;;
# if unknown flag, print error message and put it into stderr
\? ) echo "Invalid option: $OPTARG" >&2
usage
exit 1
;;
# if argument is missing from flag that requires argument,
# print error and exit 1.
# the print and echo outputs are sent to stderr file?
# then exit 1
: ) printf "missing argument for -%s\n" "$OPTARG" >&2
echo "$usage" >&2
exit 1
;;
esac
done
# move to next argument, and go through loop again:
shift $((OPTIND -1))
# CHECK PASSED ARGUMENTS/PARAMETERS:
# check if arguments specifying steps to skip/include (-DCR) are set to either true or false
# first convert them to lower case:
download_files="${download_files,,}"
create_env="${create_env,,}"
build_ref_genome="${build_ref_genome,,}"
# check if they're set to either true or false
if [ $download_files != true ] && [ $download_files != false ]; then
echo "-D flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
# do the same for create_env:
if [ $create_env != true ] && [ $create_env != false ]; then
"-C flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
# and for ref genome:
if [ $build_ref_genome != true ] && [ $build_ref_genome != false ]; then
echo "-R flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
# if files have to be downloaded, then check if user_pass argument was provided:
# -z is True if string has length 0
if [ $download_files == true ] && [ -z $user_pass ]; then
echo "user pass argument (-u flag) not provided. Exiting."
exit 1
fi
# check if conda_envs_dir argument was passed:
if [[ ($create_env == true || $build_ref_genome == true) && -z $conda_envs_dir ]]; then
echo "conda environment directory (-c flag) argument not provided. Exiting."
exit 1
fi
# check if conda_envs_dir is a directory:
if [ $create_env == true ] || [ $build_ref_genome == true ]; then
# check if it is a directory (if not: exit), and if directory ends with /envs. (If not, only print a warning.)
if ! [ -d $conda_envs_dir ]; then
echo "Specified conda_envs_dir $conda_envs_dir does not exist. Exiting."
exit 1
elif [[ $conda_envs_dir != */envs ]]; then
echo "Warning: Specified conda envs directory $conda_envs_dir does not end with \"/envs\". Make sure no trailing slash is included. Is this the correct directory?"
fi
fi
# if reference genome build is included, check if $incl_sarscov2 is either true or false:
# first convert to lowercase
incl_sarscov2="${incl_sarscov2,,}"
# now check
if [ $build_ref_genome == true ]; then
if [ $incl_sarscov2 != true ] && [ $incl_sarscov2 != false ]; then
echo "-S flag can only be set to 'true' or 'false'! exiting."
exit 1
fi
fi
# CREATE LOG AND PRINT PARAMETERS
# cd into workpath and store full path
cd $work_path
work_path=`pwd`
# Log filenname
LOGFILE=$work_path/"LOG_LCA_pipeline_setup.log"
# Check if logfile exists
if [ -f ${LOGFILE} ]; then
echo "ERROR: LOG file ${LOGFILE} already exists. please remove. exit."
exit 1
fi
# Create log file and add DATE
echo `date` > ${LOGFILE}
echo "Log file saved in : ${LOGFILE}"
# print pipeline version
echo "Lung Cell Atlas pipeline version: v${pipeline_version}" | tee -a ${LOGFILE}
# Print which steps will be skipped or included:
echo "STEPS TO BE INCLUDED/SKIPPED:" | tee -a ${LOGFILE}
# print for each step if it is included or skipped
if [ $download_files == true ]; then
echo "downloading of required files will be included" | tee -a ${LOGFILE}
elif [ $download_files == false ]; then
echo "downloading of required files will be skipped" | tee -a ${LOGFILE}
fi
# do the same for create_env:
if [ $create_env == true ]; then
echo "creation of conda environment will be included" | tee -a ${LOGFILE}
elif [ $create_env == false ]; then
echo "creation of conda environment will be skipped" | tee -a ${LOGFILE}
fi
# and for build_ref_genome
if [ $build_ref_genome == true ]; then
echo "building of reference genome will be included" | tee -a ${LOGFILE}
if [ $incl_sarscov2 == true ]; then
echo "Sars-cov2 genome will be added to the reference genome." | tee -a ${LOGFILE}
fi
elif [ $build_ref_genome == false ]; then
echo "building of reference genome will be skipped" | tee -a ${LOGFILE}
fi
# print parameters. tee command (t-split) splits output into normal printing and a second target,
# in this case the log file to which it will -a(ppend) the output.
# i.e. parameters are printed and stored in logfile.
echo "Params:" | tee -a ${LOGFILE}
echo "cellranger version expected: ${expectedcrv}, Ensembl release: ${ensrel}, Genome release: ${genomestring}, Species: ${species}" | tee -a ${LOGFILE}
echo "nthreads: ${nthreads}, memgb: ${memgb}" | tee -a ${LOGFILE}
echo "user:pass provided" | tee -a ${LOGFILE}
echo "conda_envs_dir: $conda_envs_dir" | tee -a ${LOGFILE}
# let user confirm parameters:
read -r -p "Are the parameters correct? Continue? [y/N] " response
if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]
then
echo "Parameters confirmed." | tee -a ${LOGFILE}
else
echo "Parameters not confirmed, exit." | tee -a ${LOGFILE}
exit 1
fi
# DOWNLOAD THE REQUIRED FILES, if $download_files == true:
if [ $download_files == true ]; then
# now download the tar file:
echo "We will download the necessary files now, this shouldn't take too long..." | tee -a ${LOGFILE}
curl --user $user_pass https://hmgubox2.helmholtz-muenchen.de/public.php/webdav/LCA_pipeline_downloads.tar.gz --output $work_path/LCA_pipeline_downloads.tar.gz -k
curl --user $user_pass https://hmgubox2.helmholtz-muenchen.de/public.php/webdav/LCA_pipeline_downloads_CHECKSUM --output $work_path/LCA_pipeline_downloads_CHECKSUM -k
echo "Done." | tee -a ${LOGFILE}
# validate that file is intact and not corrupted using checksum:
echo "Checking md5sum of downloaded file..." | tee -a ${LOGFILE}
md5sum -c $work_path/LCA_pipeline_downloads_CHECKSUM | tee -a ${LOGFILE}
# now unpack downloaded file:
echo "Unpacking downloaded tar file now..." | tee -a ${LOGFILE}
tar -xzvf $work_path/LCA_pipeline_downloads.tar.gz | tee -a ${LOGFILE}
echo "Done" | tee -a ${LOGFILE}
# move directories up one folder and remove donwload dir + tar
mv $work_path/LCA_pipeline_downloads/* $work_path/ 2>&1 | tee -a ${LOGFILE}
rmdir $work_path/LCA_pipeline_downloads 2>&1 | tee -a ${LOGFILE}
rm $work_path/LCA_pipeline_downloads.tar.gz 2>&1 | tee -a ${LOGFILE}
fi
# CREATE CONDA ENVIRONMENT, if $create_env == true:
path_to_env="$conda_envs_dir/cr3-velocyto-scanpy"
if [ $create_env == true ]; then
echo "Creating conda environment in ${path_to_env}... NOTE! This can take a few hours..." | tee -a ${LOGFILE}
echo "start time: `date`" | tee -a ${LOGFILE}
conda create --prefix $path_to_env -c ./conda-bld -c conda-forge -c bioconda -y cellranger=3.1.0=0 scanpy=1.4.4.post1=py_3 velocyto.py=0.17.17=py36hc1659b7_0 samtools=1.10=h9402c20_2 conda=4.8.2=py36_0 nextflow=19.10 java-jdk=8.0.112 | tee -a ${LOGFILE}
echo "End time: `date`" | tee -a ${LOGFILE}
fi
# BUILD REFERENCE GENOME, if $build_ref_genome == true:
if [ $build_ref_genome == true ]; then
# activate environment. Since the command conda activate doesn't (always?) work
# in subshell, we first want to use source to make the command available:
path_to_conda_sh=$(conda info --base)/etc/profile.d/conda.sh
source $path_to_conda_sh
# now we can activate environment
echo "Activating environment...." | tee -a ${LOGFILE}
conda activate $path_to_env # this cannot be put into LOGFILE, because then the conda environment is not properly activated for some reason.
# now start building the reference genome!
# cd into the correct folder;
if [ ! -d ${work_path}/refgenomes ]; then
mkdir ${work_path}/refgenomes
fi
cd ${work_path}/refgenomes
pwd
echo "Currently working in folder `pwd`" | tee -a ${LOGFILE}
# now run the script to build the genome:
echo "We will now start building the reference genome, using the script ${script_path}/refgenomes/create_cellranger_ref_from_ensembl.sh" | tee -a ${LOGFILE}
echo "This might take a few hours. Start time: `date`" | tee -a ${LOGFILE}
echo "For a detailed log of the genome building, check out the logfile in your ${work_path}/refgenomes folder!" | tee -a ${LOGFILE}
if [ $incl_sarscov2 == false ]; then
${script_path}/refgenomes/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true | tee -a ${LOGFILE}
# check md5sum if default parameters were used:
if [ ${genomestring} == "GRCh38" ] && [ ${ensrel} == "99" ] && [ ${species} == "homo_sapiens" ] && [ ${expectedcrv} == "3.1.0" ]; then
echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
md5sum -c ${script_path}/refgenomes/md5checks/homo_sapiens_GRCh38_ensrel99_cr3.1.0.md5 | tee -a ${LOGFILE}
fi
elif [ $incl_sarscov2 == true ]; then
echo "Including Sars-cov2 genome into the reference..." | tee -a ${LOGFILE}
${script_path}/refgenomes/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -n sars_cov2 -f ./sars_cov2_genome/sars_cov2_genome.gtf -a ./sars_cov2_genome/sars_cov2.fasta | tee -a ${LOGFILE}
# check md5sum if default parameters were used:
if [ ${genomestring} == "GRCh38" ] && [ ${ensrel} == "99" ] && [ ${species} == "homo_sapiens" ] && [ ${expectedcrv} == "3.1.0" ]; then
echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
md5sum -c ${script_path}/refgenomes/md5checks/homo_sapiens_GRCh38_ensrel99_cr3.1.0_sars_cov2.md5 | tee -a ${LOGFILE}
fi
fi
echo "End time: `date`" | tee -a ${LOGFILE}
cd ../
fi
echo 'End of script.' | tee -a ${LOGFILE}