-
Notifications
You must be signed in to change notification settings - Fork 0
/
hackernews.py
executable file
·125 lines (110 loc) · 4.45 KB
/
hackernews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
"""Download hackernews data into dataframe."""
import glob
import re
import subprocess
import warnings
from datetime import datetime
from os.path import exists
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from nltk import ngrams
from nltk.corpus import stopwords
# Where raw steampipe CSV exports land before processing.
DOWNLOAD_DIR = f'{Path.home()}/code/predictor/downloads'
# Where the processed pickle and checkpoint timestamp are stored.
STORAGE_DIR = f'{Path.home()}/code/predictor/storage'
# Gzip-compressed pickle holding the accumulated per-day n-gram counts.
STORAGE_FILE = f'{STORAGE_DIR}/hackernews.pkl.gz'
# Checkpoint file: the newest comment timestamp processed so far.
HACKERNEWS_TIMESTAMP = f'{STORAGE_DIR}/hackernews-timestamp.txt'
# Path to the steampipe binary used to query the hackernews plugin.
STEAMPIPE = f'{Path.home()}/bin/steampipe'
# N-gram sizes extracted from each comment (bigrams through 4-grams).
NGRAMS = (2, 3, 4)
# When True, re-imports every downloaded CSV instead of resuming from
# the checkpoint with only today's download.
IMPORT_ALL = False
# English stopwords, lowercased to match the lowercased comment text.
STOPWORDS = [word.lower() for word in stopwords.words('english')]
def download_hackernews():
    """Export today's hackernews comments to a CSV via steampipe.

    Returns the path of the CSV file. The export is skipped when a
    non-empty file for today's date already exists, so repeated runs on
    the same day are cheap.

    Raises subprocess.CalledProcessError if steampipe exits non-zero.
    """
    today_str = datetime.now().strftime('%Y-%m-%d')
    output_file = f'{DOWNLOAD_DIR}/hackernews-{today_str}.csv'
    # Reuse today's export if it already completed (non-empty file).
    if exists(output_file) and Path(output_file).stat().st_size > 0:
        return output_file
    query = ("select time, text from hackernews_item "
             "where type='comment' order by time asc")
    # Argument-list form plus an explicit stdout handle replaces the
    # previous shell=True string with output redirection — no shell
    # involved, so the paths/query are never shell-interpreted.
    with open(output_file, 'w', encoding='utf-8') as csv_out:
        subprocess.run(
            [STEAMPIPE, 'query', query, '--output', 'csv'],
            stdout=csv_out, check=True)
    return output_file
def sum_duplicates(dataframe):
    """Collapse duplicate (date, gram) rows by summing their counts.

    *dataframe* has a date-like index and 'gram'/'count' columns; the
    result has one row per (date, gram) pair, indexed by date and
    sorted by it.
    """
    totals = dataframe.groupby(['gram', dataframe.index]).sum()
    # Pivot dates into columns: each column becomes ('count', <date>).
    wide = totals.unstack(level=1)
    per_day = []
    for value_name, day in wide.columns:
        day_frame = wide[(value_name, day)].reset_index()
        # Drop the date level so the columns read ['gram', 'count'].
        day_frame.columns = day_frame.columns.droplevel(level=1)
        day_frame['date'] = day
        per_day.append(day_frame.set_index('date'))
    # NaNs mark grams absent on a given day; discard them.
    return pd.concat(per_day).dropna().sort_index()
def split_sentence(sentence):
    """Extract n-gram phrases from one comment row.

    *sentence* is a row whose ``.text`` holds HTML comment text
    (already lowercased by the caller — stopword matching assumes
    lowercase input). Returns a list of space-joined n-grams for each
    size in NGRAMS, or [] when the text is not a string (e.g. NaN).
    """
    try:
        text = BeautifulSoup(sentence.text, features='html.parser').get_text()
    except TypeError:
        # Non-string text (NaN from pandas) cannot be parsed.
        return []
    # Keep only alphanumerics and spaces before tokenizing.
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Set membership is O(1) per word versus O(n) scans of the
    # STOPWORDS list; build it once per call.
    stopword_set = set(STOPWORDS)
    split_text = [word for word in text.split()
                  if len(word) > 1 and word not in stopword_set]
    phrases = []
    for num in NGRAMS:
        # ngrams() only yields non-empty tuples, so no emptiness check
        # is needed (the previous `if grams:` guard was dead code).
        phrases.extend(' '.join(grams) for grams in ngrams(split_text, num))
    return phrases
def main():
    """Build per-day n-gram counts from hackernews comments.

    Reads one or more steampipe CSV exports, explodes each comment into
    n-gram phrases, counts them per day, merges with previously stored
    counts, and persists the result plus a resume checkpoint.
    """
    # Either reprocess every historical download, or fetch/reuse just
    # today's export and resume from the checkpoint below.
    if IMPORT_ALL:
        csv_files = sorted(glob.glob(f'{DOWNLOAD_DIR}/hackernews*.csv'))
    else:
        csv_files = [download_hackernews()]
    dfs = []
    for csv_file in csv_files:
        # Column 0 is the comment timestamp; it becomes the index.
        # NOTE(review): infer_datetime_format is deprecated/removed in
        # pandas 2.x — confirm the pinned pandas version supports it.
        dfs.append(pd.read_csv(
            csv_file, index_col=0, parse_dates=True,
            infer_datetime_format=True))
    hackernews_df = pd.concat(dfs)
    # Steampipe emits the literal string '<null>' for missing text.
    hackernews_df = hackernews_df[hackernews_df['text'] != '<null>']
    hackernews_df['text'] = hackernews_df['text'].str.lower()
    # Captured BEFORE checkpoint filtering so the checkpoint always
    # advances to the newest comment seen in this import.
    latest_timestamp_str = str(hackernews_df.index.max())
    # Start from last checkpoint
    if not IMPORT_ALL and exists(HACKERNEWS_TIMESTAMP):
        with open(HACKERNEWS_TIMESTAMP, encoding='utf-8') as input_file:
            previous_timestamp = pd.Timestamp(input_file.read())
        hackernews_df = hackernews_df[hackernews_df.index > previous_timestamp]
    # Nothing new since the last run; leave storage untouched.
    if len(hackernews_df) == 0:
        return
    # Remove time component
    hackernews_df.index = pd.DatetimeIndex(
        hackernews_df.index.strftime('%Y-%m-%d'))
    hackernews_df.index.name = 'date'
    # bs4 warns on odd markup inside comments; not actionable here.
    warnings.filterwarnings('ignore', category=UserWarning, module='bs4')
    # One list of phrases per comment, then one row per phrase.
    hackernews_series = hackernews_df.apply(split_sentence, axis=1)
    hackernews_df = pd.DataFrame({'gram': hackernews_series}).explode('gram')
    dfs = []
    for idx in sorted(hackernews_df.index.unique()):
        # Count occurrences of each gram within this single day.
        day_df = hackernews_df.loc[idx]
        day_df = pd.DataFrame(
            {'count': day_df.groupby(['gram']).size()}).reset_index()
        day_df = day_df.assign(date=idx).set_index('date')
        dfs.append(day_df)
    hackernews_df = pd.concat(dfs)
    # Merge with previously stored counts, then re-sum any (date, gram)
    # pairs that now appear in both old and new data.
    if not IMPORT_ALL and exists(STORAGE_FILE):
        hackernews_df = pd.concat(
            [pd.read_pickle(STORAGE_FILE), hackernews_df])
    hackernews_df = sum_duplicates(hackernews_df)
    # Get rid of single phrases
    hackernews_df = hackernews_df[hackernews_df['count'] > 1]
    hackernews_df.to_pickle(STORAGE_FILE, compression='gzip')
    # Persist the checkpoint only after the pickle succeeded.
    with open(HACKERNEWS_TIMESTAMP, 'w', encoding='utf-8') as output_file:
        output_file.write(latest_timestamp_str)
    # Compress today's raw CSV (single-download mode only) so the next
    # run's download step starts fresh.
    if len(csv_files) == 1:
        subprocess.run(['gzip', csv_files[0]], check=True)
# Script entry point.
if __name__ == '__main__':
    main()