-
Notifications
You must be signed in to change notification settings - Fork 1
/
transform.py
executable file
·58 lines (43 loc) · 1.44 KB
/
transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#! /usr/bin/python
import sys, time
import fileinput
import calendar
import collections
from urlparse import urlparse
# Per-user state table used by transform(), keyed by the string from getUid().
# An OrderedDict is used so the oldest entry can be dropped first via
# popitem(last=False). Values are booleans: False is stored on PAGE_ENTER and
# flipped to True when an AD_VIEW arrives for the same uid.
user_dict = collections.OrderedDict()
# Capacity threshold for user_dict: transform() evicts the oldest entry on
# PAGE_ENTER once the table holds this many uids.
maxlen = 10000000
def getTime(rawTime):
    """Convert a "YYYY-MM-DD HH:MM:SS" timestamp into epoch seconds, as a string.

    The parsed fields are handed to calendar.timegm, so they are interpreted
    as UTC rather than local time.

    Raises:
        ValueError: propagated from time.strptime when rawTime does not match
            the expected format.
    """
    parsed = time.strptime(rawTime, "%Y-%m-%d %H:%M:%S")
    epoch_seconds = calendar.timegm(parsed)
    return str(epoch_seconds)
def normalizeUrl(url):
    """Return " <host><path>" for url, with a leading "www." removed from the host.

    The result always starts with a single space, followed by the URL's
    network location (minus a leading "www.") and its path. Scheme, query
    string and fragment are discarded.

    Args:
        url: the URL string to normalize.

    Returns:
        The normalized string, e.g. " example.com/a/b" for
        "http://www.example.com/a/b".
    """
    parts = urlparse(url)
    host = parts.netloc
    # Strip only a leading "www." label. A blanket replace("www.", "") would
    # also delete the substring from the middle of a host name, turning
    # "awww.example.com" into "aexample.com".
    if host.startswith("www."):
        host = host[len("www."):]
    return " " + host + parts.path
def getUid(ip, user_agent, url):
    """Build the key that identifies one visitor on one page.

    The key is the three arguments concatenated in order (ip, user_agent,
    url) with no separator between them.
    """
    client = ip + user_agent
    return client + url
def build(timestamp, host, uri, code, size):
    """Assemble one output record: the five fields joined by "|".

    All arguments must already be strings; they are emitted in the order
    timestamp, host, uri, code, size.
    """
    fields = (timestamp, host, uri, code, size)
    record = "|".join(fields)
    return record
def transform(raw):
    """Fold one tab-separated event record into per-user state.

    Fields read from the record (0-based): 0 = timestamp, 6 = ip,
    7 = user agent, 12 = event type, 13 = url, 16 = host.

    Event handling, keyed by getUid(ip, user_agent, url):
      * "PAGE_ENTER": start tracking the uid with the flag False, evicting the
        oldest tracked uid first if the table is full.
      * "AD_VIEW":    set the flag to True if the uid is being tracked.
      * "PAGE_EXIT":  if the uid is tracked, stop tracking it and emit a
        record whose code is "400" when the flag was set, "200" otherwise.
      * anything else is ignored.

    Args:
        raw: one input line, with or without its trailing line terminator.

    Returns:
        The "|"-joined output record for a PAGE_EXIT of a tracked uid,
        otherwise None. Blank or truncated records also yield None.
    """
    # Drop the line terminator up front so it cannot end up inside the last
    # field (which is the host when the record has exactly 17 columns).
    tokens = raw.rstrip('\r\n').split('\t')

    # Indices 6, 7, 12 and 13 are needed for every event; skip records too
    # short to carry them (e.g. a blank line) instead of raising IndexError.
    if len(tokens) < 14:
        return None

    event = tokens[12]
    uid = getUid(tokens[6], tokens[7], tokens[13])

    if event == "PAGE_ENTER":
        # Re-assigning an existing key does not grow the table, so only make
        # room when this uid is genuinely new; otherwise another user's state
        # would be thrown away for nothing.
        if uid not in user_dict and len(user_dict) >= maxlen:
            user_dict.popitem(last=False)
        user_dict[uid] = False
    elif event == "AD_VIEW":
        if uid in user_dict:
            user_dict[uid] = True
    elif event == "PAGE_EXIT":
        # The output additionally needs the host at index 16; a truncated
        # exit record is ignored and leaves the tracked state untouched.
        if len(tokens) < 17:
            return None
        if uid in user_dict:
            saw_ad = user_dict.pop(uid)
            code = "400" if saw_ad else "200"
            return build(getTime(tokens[0]), tokens[16],
                         normalizeUrl(tokens[13]), code, "50000")
    return None
def main(argv):
    """Stream input records through transform() and print each emitted line.

    Args:
        argv: accepted for the conventional entry-point signature but not read
            here; fileinput.input() takes its file list from sys.argv[1:] on
            its own and falls back to standard input when none is given.
    """
    for rawLine in fileinput.input():
        line = transform(rawLine)
        # None means "nothing to emit for this record"; test by identity, the
        # idiomatic way to compare against the None singleton.
        if line is not None:
            # The parenthesised single-argument form produces the same output
            # whether print is a statement (Python 2) or a function (Python 3).
            print(line)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)