-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathParser.py
executable file
·66 lines (58 loc) · 1.97 KB
/
Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#
# Parser.py
# eAUrnik
#
from lxml import html
def parse_block(block):
    """Extract the title and subtitle of a single timetable block.

    Args:
        block: An element exposing ``xpath()`` and ``get()`` (an lxml HTML
            element when called from ``lessons``).

    Returns:
        A ``(title, subtitle)`` tuple of strings, or ``None`` when the block
        is skipped. A block is skipped when:

        * it has no ``table/tr/td[1]`` cell to take a title from,
        * its ``class`` contains ``ednevnik-seznam_ur_teden-td-odpadlo``, or
        * its ``table/tr/td[2]/img`` icon has the title ``JV`` or ``PB``.

        The subtitle is ``""`` when the block has no direct ``div`` child.
    """
    title_cells = block.xpath("table/tr/td[1]")
    if not title_cells:
        return None

    # ``.text`` is None when the cell has no leading text; treat it as empty
    # instead of failing on ``None.strip()``.
    title = (title_cells[0].text or "").strip()

    # ``get`` returns None for a missing attribute; fall back to "" so the
    # substring checks below cannot raise TypeError.
    class_attribute = block.get("class") or ""
    if "ednevnik-seznam_ur_teden-td-odpadlo" in class_attribute:
        return None

    # NOTE: a " (N)" suffix for "ednevnik-seznam_ur_teden-td-nadomescanje"
    # blocks existed here as commented-out code and remains disabled.
    if "ednevnik-seznam_ur_teden-td-zaposlitev" in class_attribute:
        title += " (Z)"

    icon = block.xpath("table/tr/td[2]/img")
    if icon and icon[0].get("title") in ["JV", "PB"]:
        return None

    details = block.xpath("div")
    if not details:
        return (title, "")

    # The div text is "<first>, <second>[, ...]"; the subtitle swaps the first
    # two parts. Text without a ", " separator is used unchanged rather than
    # raising IndexError.
    components = (details[0].text or "").strip().split(", ")
    if len(components) < 2:
        subtitle = components[0]
    else:
        subtitle = components[1] + ", " + components[0]
    return (title, subtitle)
def lessons(page):
    """Parse a timetable page into durations and per-column lesson lists.

    Args:
        page: HTML source, passed to ``lxml.html.fromstring``.

    Returns:
        ``None`` when the document body has no direct ``table`` child.
        Otherwise a ``(durations, lessons)`` tuple:

        * ``durations`` has one string per table row after the first; each
          is ``"0"`` followed by the text of the second ``div`` in the
          row's first cell.
        * ``lessons`` is the row-major grid transposed: the outer list is
          indexed by column (presumably one per day - confirm against the
          page layout), the inner list by row, and every entry is a list of
          ``(title, subtitle)`` tuples produced by ``parse_block``.
    """
    document = html.fromstring(page)
    tables = document.body.xpath("table")
    if not tables:
        return

    durations = []
    grid = []  # row-major: grid[row][column] -> list of parsed lessons

    # The first <tr> is skipped, as is the first <td> of every row, which
    # only supplies the duration label.
    for table_row in tables[0].xpath("tr")[1:]:
        cells = table_row.xpath("td")
        durations.append("0" + cells[0].xpath("div[2]")[0].text)

        row_entries = []
        for cell in cells[1:]:
            cell_lessons = []
            for position, wrapper in enumerate(cell.xpath("div")):
                # The first div is itself the block; every later div wraps
                # the block in a nested div.
                target = wrapper if position == 0 else wrapper.xpath("div")[0]
                entry = parse_block(target)
                if entry:
                    cell_lessons.append(entry)
            row_entries.append(cell_lessons)
        grid.append(row_entries)

    # Transpose rows x columns into columns x rows; zip stops at the
    # shortest row.
    by_column = [list(column) for column in zip(*grid)]
    return (durations, by_column)