Skip to content

Commit 2c35519

Browse files
committed
v1 added
1 parent 53b5e61 commit 2c35519

File tree

10 files changed

+721
-0
lines changed

10 files changed

+721
-0
lines changed

Diff for: .gitignore

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# poetry
98+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99+
# This is especially recommended for binary packages to ensure reproducibility, and is more
100+
# commonly ignored for libraries.
101+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102+
#poetry.lock
103+
104+
# pdm
105+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106+
#pdm.lock
107+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108+
# in version control.
109+
# https://pdm.fming.dev/#use-with-ide
110+
.pdm.toml
111+
112+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113+
__pypackages__/
114+
115+
# Celery stuff
116+
celerybeat-schedule
117+
celerybeat.pid
118+
119+
# SageMath parsed files
120+
*.sage.py
121+
122+
# Environments
123+
.env
124+
.venv
125+
env/
126+
venv/
127+
ENV/
128+
env.bak/
129+
venv.bak/
130+
131+
# Spyder project settings
132+
.spyderproject
133+
.spyproject
134+
135+
# Rope project settings
136+
.ropeproject
137+
138+
# mkdocs documentation
139+
/site
140+
141+
# mypy
142+
.mypy_cache/
143+
.dmypy.json
144+
dmypy.json
145+
146+
# Pyre type checker
147+
.pyre/
148+
149+
# pytype static type analyzer
150+
.pytype/
151+
152+
# Cython debug symbols
153+
cython_debug/
154+
155+
# PyCharm
156+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158+
# and can be added to the global gitignore or merged into this file. For a more nuclear
159+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160+
#.idea/

Diff for: .vscode/settings.json

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"python.formatting.provider": "none",
3+
"[python]": {
4+
"editor.defaultFormatter": "ms-python.black-formatter"
5+
}
6+
}

Diff for: poetry.lock

+424
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: pyproject.toml

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[tool.poetry]
2+
name = "wiki-search"
3+
version = "0.1.0"
4+
description = ""
5+
authors = ["Ilya Savitsky <[email protected]>"]
6+
readme = "README.md"
7+
packages = [{include = "wiki_search"}]
8+
9+
[tool.poetry.dependencies]
10+
python = "^3.10"
11+
beautifulsoup4 = "^4.12.2"
12+
markdownify = "^0.11.6"
13+
rich = "^13.3.5"
14+
requests = "^2.29.0"
15+
16+
17+
[tool.poetry.group.dev.dependencies]
18+
black = "^23.3.0"
19+
20+
[build-system]
21+
requires = ["poetry-core"]
22+
build-backend = "poetry.core.masonry.api"

Diff for: tests/__init__.py

Whitespace-only changes.

Diff for: wiki_search/__init__.py

Whitespace-only changes.

Diff for: wiki_search/__main__.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""Command-line entry point: fetch a Wikipedia article and page it as Markdown."""

import argparse

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from rich.console import Console
from rich.markdown import Markdown

# Script-style sibling imports (package directory must be on sys.path).
from options import Options
from soup_filters import filter_soup
from wikipedia import NoArticleFound, fetch_wiki_data


def main() -> None:
    """Parse CLI args, fetch the article, clean its HTML and render it paged."""
    console = Console()

    parser = argparse.ArgumentParser(
        description="Search a Wikipedia article and print it"
    )
    parser.add_argument("keyword", type=str, help="The keyword to search")
    args = parser.parse_args()

    # Default options: English Wikipedia, tables skipped.
    opts = Options()

    try:
        # BUG FIX: fetch_wiki_data takes the Options instance as its first
        # argument (see wikipedia.fetch_wiki_data); the original omitted it.
        page_info = fetch_wiki_data(opts, args.keyword)
    except NoArticleFound as e:
        console.print(f'No article "{e.fprompt}" found')
        raise SystemExit(1)

    soup = BeautifulSoup(page_info["text"]["*"], "html.parser")

    # BUG FIX: filter_soup's signature is (opts, title, soup); the original
    # omitted opts and would raise TypeError.
    filtered_soup = filter_soup(opts, page_info["title"], soup)

    md = MarkdownConverter(escape_underscores=False, autolinks=False).convert_soup(
        filtered_soup
    )

    markdown = Markdown(md)

    with console.pager():
        console.print(markdown)


if __name__ == "__main__":
    main()

Diff for: wiki_search/options.py

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from dataclasses import dataclass


@dataclass
class Options:
    """Runtime configuration for fetching and filtering Wikipedia pages.

    Plain dataclass fields so callers may override either value
    positionally or by keyword.
    """

    # Wikipedia language subdomain, e.g. "en" -> en.wikipedia.org.
    class_prefix: str = "en"
    # When True, every <table> is dropped from the article body.
    skip_tables: bool = True

Diff for: wiki_search/soup_filters.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from bs4 import BeautifulSoup
from options import Options


def filter_soup(opts: Options, title: str, soup: BeautifulSoup) -> BeautifulSoup:
    """Strip navigation and metadata markup from a parsed Wikipedia page.

    Args:
        opts: rendering options (language prefix, whether to drop tables).
        title: article title, rendered as a leading <h1>.
        soup: parsed HTML of the MediaWiki "parse" API output.

    Returns:
        A new BeautifulSoup document consisting of <h1>title</h1> followed
        by the cleaned article body.
    """
    body = soup.find("div", class_="mw-parser-output")

    # Boxes that carry no prose content.
    for tag in body.find_all("table", class_="infobox"):
        tag.decompose()

    for tag in body.find_all("div", class_="navbox"):
        tag.decompose()

    if opts.skip_tables:
        # Drop every table outright.
        for tag in body.find_all("table"):
            tag.decompose()
    else:
        # Keep content tables but drop sidebars and metadata tables.
        for tag in body.find_all("table", class_="sidebar"):
            tag.decompose()

        for tag in body.find_all("table", class_="metadata"):
            tag.decompose()

    for tag in body.find_all("div", class_="metadata"):
        tag.decompose()

    for tag in body.find_all("style"):
        tag.decompose()

    for tag in body.find_all("span", class_="mw-editsection"):
        tag.decompose()

    for tag in body.find_all("span", class_="mw-cite-backlink"):
        tag.decompose()

    for tag in body.find_all("a"):
        # BUG FIX: anchors without an href (e.g. <a id="...">) raised
        # KeyError, and absolute URLs were double-prefixed; only rewrite
        # relative wiki paths into absolute URLs.
        href = tag.get("href")
        if href and href.startswith("/"):
            tag["href"] = f"https://{opts.class_prefix}.wikipedia.org{href}"

    # Collapse citation superscripts like <sup><a>[1]</a></sup> to plain text.
    for tag in body.find_all("sup", class_="reference"):
        # BUG FIX: the original used `"a" in tag` / `tag["a"]`, which test
        # membership of the *string* "a" in tag.contents and read an HTML
        # attribute named "a"; the intent is the child <a> element.
        link = tag.find("a")
        if link is not None:
            tag.string = link.text

    final_soup = BeautifulSoup()
    header_tag = final_soup.new_tag("h1")
    header_tag.string = title
    final_soup.append(header_tag)
    final_soup.append(body)

    return final_soup

Diff for: wiki_search/wikipedia.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import requests
2+
from options import Options
3+
4+
5+
class NoArticleFound(Exception):
    """Raised when the MediaWiki API reports an error for a search prompt."""

    def __init__(self, failed_prompt: str):
        # BUG FIX: forward the message to Exception.__init__ so that
        # str(exc) and tracebacks show the failing prompt instead of "".
        super().__init__(failed_prompt)
        # Prompt that produced no article; read by the CLI error message.
        self.fprompt = failed_prompt
8+
9+
10+
def fetch_wiki_data(opts: Options, prompt: str) -> dict:
    """Fetch the parsed HTML payload of a Wikipedia article via the API.

    Args:
        opts: options carrying the wiki language prefix (e.g. "en").
        prompt: article title to look up; redirects are followed.

    Returns:
        The "parse" object of the API response; ``result["text"]["*"]``
        holds the article HTML and ``result["title"]`` the resolved title.

    Raises:
        NoArticleFound: if the API response contains an "error" key
            (e.g. the page does not exist).
    """
    url = f"https://{opts.class_prefix}.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": prompt,
        "format": "json",
        "prop": "text",
        "redirects": "",
    }
    # BUG FIX: the return annotation said ``str`` although the function
    # returns the JSON "parse" object (a dict). Also add a timeout so a
    # stalled connection cannot hang the CLI indefinitely.
    data = requests.get(url, params=params, timeout=30).json()
    if "error" in data:
        raise NoArticleFound(prompt)
    return data["parse"]

0 commit comments

Comments
 (0)