-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rand_Wiki_Text.py
80 lines (71 loc) · 2.56 KB
/
Rand_Wiki_Text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Rand_Wiki_Text.py Version 0.1
# prints out wikipedia featured article text
# The site to query
SITE = "https://en.wikipedia.org/wiki/"
# the page on the site to query
URL = SITE + "Wikipedia:Featured_articles"
# note that if you want to change either of these, you'll probably
# have to update the parsing code in the "except" below
SAVEFILE = 'wiki_pages.txt'
# Version History
# V 0.1 2021-11-29 Worked on it
# V 0.2 2024-05-25 Libraries and better cleaning
import urllib.request
from urllib.request import urlopen
from random import choice
from time import sleep
from SlowPrint import sprint
from HTML_Cleaner import *
DEBUG = False
def prepWiki(rawhtm):
garbage_starts = rawhtm.find('See also</span>')
if garbage_starts > 1000:
rawhtm = rawhtm[:garbage_starts]
pre_excise = ('a class="mw-jump-link"', 'table class="infobox',
'table class="sidebar')
for tg in pre_excise:
rawhtm = excise_content(rawhtm, tg)
return rawhtm
def get_clean_wiki_page(page_name):
with urllib.request.urlopen(SITE + page_name) as response:
wikihtm = response.read()
page = wikihtm.decode("utf-8")
page = prepWiki(page)
page = cln(page)
return page
if __name__ == '__main__':
try:
f = open(SAVEFILE, 'r')
raw = f.readline()
f.close()
rem_pgs = eval(raw)
except: # rebuild the list
rem_pgs = []
from urllib.request import urlopen
with urlopen(URL) as f:
featuredhtm = f.read()
# second place you get 'class="hlist"'
cursor = featuredhtm.find(b'class="hlist"')
cursor = featuredhtm.find(b'class="hlist"', cursor + 1)
# then make a list of all the html links
sstr = b'<a href="'
offset = len(sstr)
cursor = featuredhtm.find(sstr, cursor) + offset
while cursor > offset:
end = featuredhtm.find(b'"', cursor)
page = str(featuredhtm[cursor:end])[2:-1]
if page[:6] != '/wiki/': break
if page.find('User:') > 0: break
if page.find('Help:') > 0: break
if page.find('Category:') > 0: break
page = page[6:]
rem_pgs.append(page)
cursor = featuredhtm.find(sstr, end) + offset
while True:
if len(rem_pgs) == 0:
break
idx = choice(range(len(rem_pgs)))
chosen_page = rem_pgs[idx]
#chosen_page = 'Atlanersa'
print('\n\n')
sprint(get_clean_wiki_page(chosen_page))