-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatafolha.py
151 lines (122 loc) · 4.42 KB
/
datafolha.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# datafolha.py
# (c) 2020 CincoNoveSeis Jornalismo Ltda.
#
# This program is licensed under the GNU General Public License, version 3.
# See the LICENSE file for details.
# NOTE: This parser is unable to parse poll PDFs with multiple scenarios for a
# first round. Trying it WILL GENERATE UNRELIABLE DATA. At Pindograma, we
# separate these PDFs from the others.
from util import as_dec, is_number
from table import TableParser
from itertools import tee, groupby
from functools import reduce
from operator import itemgetter
import re
class DatafolhaParser(TableParser):
@classmethod
def page_numbers_for_id_search(cls):
return [2, 3, 4, 5, 6]
@classmethod
def page_numbers_for_content_search(cls):
return range(6, 100)
@classmethod
def is_proper_field(cls, text):
return ('Projeto:' not in text and
'Base:' not in text and
'Data do campo:' not in text and
'INTENCAO' not in text and
'PRETENDE' not in text and
not text.startswith('***') and
'SENADORES' not in text and
re.search(r'[pP]\d+', text) is None and
not text.startswith('No dia'))
@classmethod
def field_has_newlines(cls, element):
return len(element.get_text().strip().split('\n')) > 1
@classmethod
def postprocess(cls, oup):
pp = []
grouper = itemgetter('estimulada', 'tse_id', 'position')
for k, grp in groupby(sorted(oup, key = grouper), grouper):
polls = [x['poll_data'] for x in grp]
pp.extend([{
'estimulada': k[0],
'tse_id': k[1],
'position': k[2],
'poll_data': p
} for p in polls])
return [i for n, i in enumerate(pp) if i not in pp[n + 1:]]
class DatafolhaMayorParser(DatafolhaParser):
@classmethod
def is_relevant_page(cls, text):
text = text.lower()
if 'segundo turno' in text:
return 'p.1 ' in text
return (('p.1 ' in text or
'p.2 ' in text or
'p.2a' in text or
'p.3 ' in text or
'p.4 ' in text) and
'votar' in text and
'numero' not in text and
'nao votaria' not in text and
'ficasse apenas' not in text and
'fonte' not in text and
'interesse' not in text and
'mudar seu voto' not in text)
@classmethod
def rule_out_page(cls, text):
text = text.lower()
return ('votos validos' in text or
'voto validos' in text or
'rejeicao' in text)
@classmethod
def get_pste(cls, text, ah):
text = text.lower()
if text.startswith('p.'):
e = 'estimulada' in text
st = 'segundo turno' in text
return ('p', st, e)
return None
@classmethod
def allow_mixed_mode(cls):
return False
class DatafolhaGeneralParser(DatafolhaParser):
@classmethod
def is_proper_field(cls, text):
return (super().is_proper_field(text) and
not 'TOTAL' in text)
@classmethod
def is_relevant_page(cls, text):
text = text.lower()
return (('eleicao para governador' in text or
'eleicao para senador' in text or
'eleicao para presidente' in text or
'eleicoes para governador' in text or
'eleicoes para senador' in text or
'eleicoes para presidente' in text) and
'nao votaria' not in text and
'ficasse apenas' not in text and
'fonte' not in text and
'interesse' not in text)
@classmethod
def rule_out_page(cls, text):
text = text.lower()
return (re.search(r'votos\s+validos', text, re.MULTILINE) or
'rejeicao' in text)
@classmethod
def get_pste(cls, text, ah):
text = text.lower()
if text.startswith('p.'):
e = 'estimulada' in text
st = 'segundo turno' in text
p = None
if 'senador' in text:
p = 's'
elif 'governador' in text:
p = 'g'
elif 'presidente' in text:
p = 'pr'
if p is not None:
return (p, st, e)
return None