-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdenormalize.sh
executable file
·52 lines (40 loc) · 1.36 KB
/
denormalize.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/bin/sh
# Denormalises the original Yelp data files so that each line of the
# output file contains both the review and its business info.
sort_wrapper() {
  # Sorts a file byte-wise (LC_ALL=C), splitting each line into fields on
  # the '"' character.
  # $1 - file
  # $2 - key position of sorting (handed to sort -k as-is, so the key runs
  #      from that field to the end of the line)
  # $3 - out_file (may be the same path as $1)
  #
  # The result is written with -o rather than a '>' redirection: the shell
  # truncates a redirection target before sort reads its input, which would
  # empty the file whenever $3 and $1 name the same path.
  LC_ALL=C sort -t '"' -k "$2" -o "$3" -- "$1"
}
# Prints the command-line help to stdout.
usage() {
  printf 'Usage: %s in_dir out_file ids_file
  in_dir    path to a directory with original Yelp files
  out_file  output file
  ids_file  file of sorted business ids (id per line)
' "${0##*/}"
}

# -h as the first argument is an explicit request for help: print and succeed.
if [ "${1-}" = "-h" ]; then
  usage
  exit 0
fi

# Anything other than exactly three arguments is a usage error: report it on
# stderr and exit non-zero so that callers can detect the failure.
if [ "$#" -ne 3 ]; then
  usage >&2
  exit 2
fi
# Helper scripts. The paths (and PYTHONPATH=. below) are relative to the
# current working directory.
filter_exec='./denormalization/filter.py'
join_exec='./denormalization/join.py'
extract_ids_exec='./denormalization/extract_ids.py'

# Stop at the first failing step (or unset variable) instead of carrying on
# with incomplete intermediate files and publishing them as the output.
set -eu

business="$1/business_sorted.json"
tmp=
tmp2=

# Removes the intermediate files. Runs on every exit path, so nothing is
# left behind when a step fails or the script is interrupted.
cleanup() {
  rm -f -- "$business"
  if [ -n "$tmp" ]; then rm -f -- "$tmp"; fi
  if [ -n "$tmp2" ]; then rm -f -- "$tmp2"; fi
}
trap cleanup EXIT
# Turn signals into a regular exit so that the EXIT trap fires for them too.
trap 'exit 1' HUP INT TERM

# Scratch files are created in the current directory.
tmp=$(mktemp -p .)
tmp2=$(mktemp -p .)

# sort businesses
sort_wrapper "$1/business.json" 4 "$business"

# filter only a particular time period and filter out german&french
# (the review file is fed on stdin; a missing file now fails the script)
PYTHONPATH=. "$filter_exec" -l 2012-05-01 2012-12-01 < "$1/review.json" > "$tmp"

# join with businesses
# NOTE(review): key position 10 is presumably where business_id sits in the
# filter output - confirm against filter.py.
sort_wrapper "$tmp" 10 "$tmp2"
mv -- "$tmp2" "$tmp"
PYTHONPATH=. "$join_exec" "$tmp" "$business" "$tmp2" "business_id"
mv -- "$tmp2" "$tmp"

# getting linguistics data - GENEEA
# extracting ids - used for subsequent data extraction
sort_wrapper "$tmp" 4 "$tmp2"
mv -- "$tmp2" "$tmp"
PYTHONPATH=. "$extract_ids_exec" "$tmp" "$3"

# the final data are sorted alphabetically with respect to review_id
# OUTPUT FILE
mv -- "$tmp" "$2"

# tmp files (including the sorted business file) are removed by the EXIT trap