Skip to content

Commit

Permalink
Always download entire official database in one go
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinlul committed Aug 12, 2023
1 parent c33346a commit aaf58bd
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 24 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/action-validator.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: Validate workflow file syntax
on:
push:
branches: [master]
paths:
- .github/workflows/*
pull_request:
paths:
- .github/workflows/*
permissions: {}
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
action-validator:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
sparse-checkout: .github/workflows
- run: |
curl -o action_validator -fsSL https://github.com/mpalmer/action-validator/releases/download/v0.5.3/action-validator_linux_amd64
chmod +x action_validator
- run: ./action_validator .github/workflows/*
42 changes: 28 additions & 14 deletions .github/workflows/scrape.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: © 2022–2023 Kevin Lu
# SPDX-License-Identifier: LGPL-3.0-or-later
# SPDX-FileCopyrightText: © 2023 Kevin Lu
# SPDX-License-Identifier: AGPL-3.0-or-later
name: Scrape official database for card text

on:
Expand All @@ -11,24 +11,38 @@ on:
jobs:
scrape:
runs-on: ubuntu-latest
outputs:
status: ${{ steps.commit.outputs.status }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: "3.10"
cache: pip
- run: pip install -r src/requirements.txt
- run: curl -fsSLO https://github.com/DawnbrandBots/yaml-yugi/raw/aggregate/cards.json
- run: python3 src/job.py cards.json official
- name: Commit
if: ${{ always() }}
- run: python3 src/dump.py ko _site/ocg.csv
- id: commit
uses: DawnbrandBots/yaml-yugi/.github/actions/commit-push@master
with:
message: "Scrape official Korean card database: ${{ github.run_number }} (${{ github.run_id }})"
- if: steps.commit.outputs.status > 0
name: Add timestamp to GitHub Pages
run: |
git config user.name GitHub Actions
git config user.email [email protected]
git add .
git commit -m "Scrape official Korean card database"
git pull --rebase
git push
upload:
export DATETIME=$(date -Is)
cat _site/index.html | DATE=$(date -I) envsubst '$DATE$DATETIME' >_site/index.html
echo "$DATETIME" > _site/VERSION
- uses: actions/configure-pages@v2
- uses: actions/upload-pages-artifact@v1
deploy:
needs: scrape
uses: ./.github/workflows/publish.yml
if: needs.scrape.outputs.status > 0 && !cancelled()
runs-on: ubuntu-latest
permissions:
pages: write
id-token: write
environment:
name: github-pages
url: ${{ steps.deploy.outputs.page_url }}
steps:
- id: deploy
uses: actions/deploy-pages@v1
1 change: 0 additions & 1 deletion _site/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,5 @@
<h1>YAML Yugi (한국어)</h1>
<h2>$DATETIME</h2>
<p><a href="ocg.csv" download="ocg-$DATE.csv">CSV</a></p>
<p><a href="ocg.db3" download="ocg-$DATE.db3">SQLite</a></p>
</body>
</html>
65 changes: 65 additions & 0 deletions src/dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# SPDX-FileCopyrightText: © 2023 Kevin Lu
# SPDX-License-Identifier: AGPL-3.0-or-later
from csv import writer
import logging
import sys
from typing import Tuple, TYPE_CHECKING

from bs4 import BeautifulSoup
from httpx import Client


if TYPE_CHECKING:
from bs4 import Tag


def download_all(locale: str) -> str:
    """Return the raw HTML of the full official card-database dump for *locale*.

    The first request (sess=1) primes the server-side search session with an
    unrestricted query; the second (sess=2) then retrieves every result on a
    single page. Raises httpx.HTTPStatusError on any non-success response.
    """
    prime_url = f"https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=1&keyword=&stype=1&ctype=&starfr=&starto=&pscalefr=&pscaleto=&linkmarkerfr=&linkmarkerto=&link_m=2&atkfr=&atkto=&deffr=&defto=&othercon=2&request_locale={locale}"
    dump_url = f"https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=2&sort=1&rp=99999&page=1&stype=1&othercon=2&page=1&request_locale={locale}"
    headers = {"Referer": "https://www.db.yugioh-card.com/"}
    with Client(http2=True, follow_redirects=True, headers=headers) as client:
        priming = client.get(prime_url)
        priming.raise_for_status()
        dump = client.get(dump_url)
        dump.raise_for_status()
        return dump.text


def replace_text_breaks(element: "Tag") -> str:
    """Flatten *element* to plain text, turning each <br> into a newline.

    Mutates the tag tree in place (every <br> is replaced) before extracting
    the whitespace-stripped text.
    """
    line_break = element.find("br")
    while line_break is not None:
        line_break.replace_with("\n")
        line_break = element.find("br")
    return element.get_text().strip()


def parse_card_text(div: "Tag") -> Tuple[str, str, str, str | None]:
    """Extract one card from a search-result row.

    Returns (konami_id, name, text, pendulum); *pendulum* is None for cards
    without a pendulum-effect box.
    """
    konami_id = div.select_one("input.cid")["value"]
    # Alternate source for the name: span.card_name
    name = div.select_one("input.cnm")["value"]
    text = replace_text_breaks(div.select_one("dd.box_card_text"))
    pendulum_span = div.select_one("span.box_card_pen_effect")
    pendulum = None if pendulum_span is None else replace_text_breaks(pendulum_span)
    return konami_id, name, text, pendulum


def transform_all(html: str, output_csv: str) -> None:
    """Parse the database dump *html* and write one CSV row per card to *output_csv*.

    Output columns: konami_id, name, text, pendulum (empty when absent).
    """
    # Bug fix: logging.log(level, msg) requires an int level as the first
    # argument; passing the message string raised TypeError. Use info().
    logging.info("Start HTML parse")
    soup = BeautifulSoup(html, "lxml")
    logging.info("End HTML parse")
    # newline="" as required by the csv module docs; explicit UTF-8 because
    # the card text is Korean and must not depend on the locale encoding.
    with open(output_csv, "w", encoding="utf-8", newline="") as f:
        out = writer(f)  # renamed from `csv` to avoid shadowing the stdlib module name
        out.writerow(["konami_id", "name", "text", "pendulum"])
        # Each matching div is one card row in the search results.
        for div in soup.select("div.t_row.c_normal"):
            out.writerow(parse_card_text(div))


if __name__ == "__main__":
    # CLI entry point: dump.py <locale> <output.csv>
    args = sys.argv
    if len(args) < 3:
        sys.exit(f"Usage: {args[0]} <locale> <output.csv>")
    logging.basicConfig(level=logging.INFO)
    page = download_all(args[1])
    # TODO: compute SHA256 checksum and skip if matching
    transform_all(page, args[2])
1 change: 1 addition & 0 deletions src/requirements.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
beautifulsoup4
httpx[http2]
lxml
ruamel.yaml
22 changes: 13 additions & 9 deletions src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,42 @@
#
# pip-compile
#
anyio==3.6.2
anyio==3.7.1
# via httpcore
beautifulsoup4==4.11.1
beautifulsoup4==4.12.2
# via -r requirements.in
certifi==2023.7.22
# via
# httpcore
# httpx
exceptiongroup==1.1.2
# via anyio
h11==0.14.0
# via httpcore
h2==4.1.0
# via httpx
hpack==4.0.0
# via h2
httpcore==0.16.3
httpcore==0.17.3
# via httpx
httpx[http2]==0.23.3
httpx[http2]==0.24.1
# via -r requirements.in
hyperframe==6.0.1
# via h2
idna==3.4
# via
# anyio
# rfc3986
rfc3986[idna2008]==1.5.0
# via httpx
ruamel-yaml==0.17.21
# httpx
lxml==4.9.3
# via -r requirements.in
ruamel-yaml==0.17.32
# via -r requirements.in
ruamel-yaml-clib==0.2.7
# via ruamel-yaml
sniffio==1.3.0
# via
# anyio
# httpcore
# httpx
soupsieve==2.3.2.post1
soupsieve==2.4.1
# via beautifulsoup4

0 comments on commit aaf58bd

Please sign in to comment.