monitor_finished_pump_studies.sh
#!/usr/bin/env bash
set -exo pipefail
#assumes this environment has the source bucket credentials only (not open data)
#0) starts with a manifest of all studies + runs
#in a loop:
#1) checks S3 for finished studies in the manifest that a) aren't already in the unifier q and b) aren't done unifying
#2) adds the studies from 1) into the unifier q
#3) repeat
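#state flow per study (a sketch inferred from the commands below):
#  pump writes $PUMP_S3_OUTPUT/<org>/DONES/<study>.<run>.<timestamp>.DONE per finished run
#  once all of a study's runs are done, this script moves those sentinels to $PUMP_S3_OUTPUT/<org>/UNIFYING/
#  and enqueues the study's pump output prefix onto $UNIFIER_Q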
export LC_ALL=C
#in seconds
SLEEP_TIME=60
export UNIFIER_Q="https://sqs.us-east-1.amazonaws.com/315553526860/monorail_batch_unify"
export REGION=$(echo "$UNIFIER_Q" | cut -d'.' -f 2)
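#e.g. splitting the queue URL above on '.' gives field 2 => "us-east-1"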
#export PUMP_S3_OUTPUT="s3://monorail-batch/pump-outputs"
export PUMP_S3_OUTPUT="s3://recount-opendata/recount3expansion/pump"
export UNIFIER_S3_OUTPUT="s3://recount-opendata/recount3expansion/unifier"
export date=$(date +%Y%m%d_%s)
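#e.g. 20240807_1723047815, i.e. YYYYMMDD_<epoch seconds>; the same format appears in the .DONE paths below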
export DEBUG=1
#list of all idx<TAB>study<TAB>run rows being processed by Monorail Pump
#this should be a full path, and its directory must be writable by this script (derived files are written alongside the manifest)
manifestF=$1
#human or mouse
org0=$2
if [[ -z $org0 ]]; then
org0="human"
fi
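#example invocation (hypothetical manifest path):
#  ./monitor_finished_pump_studies.sh /data/pump_manifest.tsv human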
#index the header fields by column number, for grepping out column positions below
head -1 $manifestF | tr $'\t' $'\n' | fgrep -n "" > ${manifestF}.fields
study_col=$(egrep -e $':study_id$' ${manifestF}.fields | cut -d':' -f 1)
sample_col=$(egrep -e $':external_id$' ${manifestF}.fields | cut -d':' -f 1)
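#illustration: assuming a manifest header of idx<TAB>study_id<TAB>external_id, ${manifestF}.fields would be:
#  1:idx
#  2:study_id
#  3:external_id
#so study_col=2 and sample_col=3 in that case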
if [[ $sample_col -lt $study_col ]]; then
paste <(cut -f $study_col $manifestF) <(cut -f $sample_col $manifestF) | sed 's#^#\t#' | sed 's#$#\t#' > ${manifestF}.cut.tabs
else
cut -f ${study_col},${sample_col} $manifestF | sed 's#^#\t#' | sed 's#$#\t#' > ${manifestF}.cut.tabs
fi
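#note: cut always emits fields in file order, hence the paste branch above to force study<TAB>run order
#either way, each line of ${manifestF}.cut.tabs is tab-flanked:
#  <TAB>study<TAB>run<TAB>
#the flanking tabs let the later fgreps match whole fields only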
mkdir -p PUMP_DONES
export PUMP_DONES=$PUMP_S3_OUTPUT/$org0/DONES
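#note: $PUMP_DONES is the S3 prefix holding the .DONE sentinels; ./PUMP_DONES/ is its local mirror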
while true; do
aws s3 sync --no-sign-request $PUMP_DONES/ ./PUMP_DONES/
#example of path in the .DONE file:
#ERP001942.ERR188246.20240807_1723047815.DONE
#containing:
#s3://recount-opendata/recount3expansion/pump/human/95/SRP222095/69/SRR10133969.20240807_1723052789
#determine which studies are finished (ignore unfinished studies for now, another script will take care of stragglers):
pushd ./PUMP_DONES/
#get study,run ids for the DONE samples
ls *.DONE | cut -d'.' -f 1-2 | sed 's#\.#\t#' | sort -u > dones
#count number of finished run ids for each study
cut -f 1 dones | sort | uniq -c | sed 's#^# #' | tr -s " " $'\t' | cut -f 2- | sort > dones.counts2study
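#e.g. if 2 runs of ERP001942 are done, dones.counts2study contains the line:
#  2<TAB>ERP001942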
#format the list of studies for grepping
cut -f 2 dones.counts2study | sed 's#$#\t#' | sed 's#^#\t#' > dones.counts2study.tabs
#grep the finished studies' rows out of the manifest and compute the expected run count for each study to compare against
fgrep -f dones.counts2study.tabs ${manifestF}.cut.tabs | cut -f 2 | sort | uniq -c | sed 's#^# #' | tr -s " " $'\t' | cut -f 2- | sort > ${manifestF}.cut.tabs.counts2study
#compare the finished run counts per study with the expected counts; keep the studies whose counts match
comm -1 -2 ${manifestF}.cut.tabs.counts2study dones.counts2study > dones.counts2study.matching
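#e.g. if the manifest expects 2 runs for ERP001942 and both are done, both files contain
#"2<TAB>ERP001942" and comm emits it; a study with stragglers won't match and is skipped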
#get just the study ids of the matching ones
cut -f 2 dones.counts2study.matching > dones.counts2study.matching.studies
#now queue up the finished pump studies for unifying
for study0 in $(cat dones.counts2study.matching.studies); do
lo=${study0: -2}
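#e.g. study0=SRP222095 => lo=95, matching the pump output layout in the .DONE example above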
#the pump output S3 prefix for this study; this is the message body sent to the queue below
studys3="s3://recount-opendata/recount3expansion/pump/$org0/$lo/$study0"
echo "$studys3" >> ../ready2unify.studies
#clear pump .DONE files from S3 for this study
#the idea is to avoid, or at least minimize, multiple overlapping enqueues of the same study
#(only ever want 1!)
for f in ${study0}.*; do
if [[ -n $DEBUG ]]; then
#echo "aws s3 rm $PUMP_S3_OUTPUT/$org0/DONES/$f"
#echo "aws s3 cp $f $PUMP_S3_OUTPUT/$org0/UNIFYING/"
echo "aws s3 mv $PUMP_S3_OUTPUT/$org0/DONES/$f $PUMP_S3_OUTPUT/$org0/UNIFYING/"
else
aws s3 mv $PUMP_S3_OUTPUT/$org0/DONES/$f $PUMP_S3_OUTPUT/$org0/UNIFYING/
#still track these pump files just in case...
#aws s3 cp $f $PUMP_S3_OUTPUT/$org0/UNIFYING/
fi
done
#clear local directory of sample .DONE files for this study as well
if [[ -n $DEBUG ]]; then
echo "rm -rf ${study0}.*"
else
rm -rf ${study0}.*
fi
#finally, when all other sentinel files have been (re)moved, enqueue the unifier
if [[ -n $DEBUG ]]; then
echo "aws sqs send-message --region $REGION --queue-url $UNIFIER_Q --message-body \"$studys3\""
else
aws sqs send-message --region $REGION --queue-url $UNIFIER_Q --message-body "$studys3"
fi
done
popd
sleep $SLEEP_TIME
done