-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Always download entire official database in one go
- Loading branch information
Showing
6 changed files
with
131 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
name: Validate workflow file syntax | ||
on: | ||
push: | ||
branches: [master] | ||
paths: | ||
- .github/workflows/* | ||
pull_request: | ||
paths: | ||
- .github/workflows/* | ||
permissions: {} | ||
concurrency: | ||
group: ${{ github.workflow }}-${{ github.ref }} | ||
cancel-in-progress: true | ||
jobs: | ||
action-validator: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v3 | ||
with: | ||
sparse-checkout: .github/workflows | ||
- run: | | ||
curl -o action_validator -fsSL https://github.com/mpalmer/action-validator/releases/download/v0.5.3/action-validator_linux_amd64 | ||
chmod +x action_validator | ||
- run: ./action_validator .github/workflows/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# SPDX-FileCopyrightText: © 2022–2023 Kevin Lu | ||
# SPDX-Licence-Identifier: LGPL-3.0-or-later | ||
# SPDX-FileCopyrightText: © 2023 Kevin Lu | ||
# SPDX-Licence-Identifier: AGPL-3.0-or-later | ||
name: Scrape official database for card text | ||
|
||
on: | ||
|
@@ -11,24 +11,38 @@ on: | |
jobs: | ||
scrape: | ||
runs-on: ubuntu-latest | ||
outputs: | ||
status: ${{ steps.commit.outputs.status }} | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- uses: actions/setup-python@v3 | ||
with: | ||
python-version: "3.10" | ||
cache: pip | ||
- run: pip install -r src/requirements.txt | ||
- run: curl -fsSLO https://github.com/DawnbrandBots/yaml-yugi/raw/aggregate/cards.json | ||
- run: python3 src/job.py cards.json official | ||
- name: Commit | ||
if: ${{ always() }} | ||
- run: python3 src/dump.py ko _site/ocg.csv | ||
- id: commit | ||
uses: DawnbrandBots/yaml-yugi/.github/actions/commit-push@master | ||
with: | ||
message: "Scrape official Korean card database: ${{ github.run_number }} (${{ github.run_id }})" | ||
- if: steps.commit.outputs.status > 0 | ||
name: Add timestamp to GitHub Pages | ||
run: | | ||
git config user.name GitHub Actions | ||
git config user.email [email protected] | ||
git add . | ||
git commit -m "Scrape official Korean card database" | ||
git pull --rebase | ||
git push | ||
upload: | ||
export DATETIME=$(date -Is) | ||
cat _site/index.html | DATE=$(date -I) envsubst '$DATE$DATETIME' >_site/index.html | ||
echo "$DATETIME" > _site/VERSION | ||
- uses: actions/configure-pages@v2 | ||
- uses: actions/upload-pages-artifact@v1 | ||
deploy: | ||
needs: scrape | ||
uses: ./.github/workflows/publish.yml | ||
if: needs.scrape.outputs.status > 0 && !cancelled() | ||
runs-on: ubuntu-latest | ||
permissions: | ||
pages: write | ||
id-token: write | ||
environment: | ||
name: github-pages | ||
url: ${{ steps.deploy.outputs.page_url }} | ||
steps: | ||
- id: deploy | ||
uses: actions/deploy-pages@v1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# SPDX-FileCopyrightText: © 2023 Kevin Lu | ||
# SPDX-License-Identifier: AGPL-3.0-or-later | ||
from csv import writer | ||
import logging | ||
import sys | ||
from typing import Tuple, TYPE_CHECKING | ||
|
||
from bs4 import BeautifulSoup | ||
from httpx import Client | ||
|
||
|
||
if TYPE_CHECKING: | ||
from bs4 import Tag | ||
|
||
|
||
def download_all(locale: str) -> str:
    """Fetch the card search results page from the official database.

    Two GET requests are issued over a single HTTP/2 client that follows
    redirects and sends a fixed ``Referer`` header: first a search request
    (``sess=1``), then a results request (``sess=2``) asking for up to
    99999 rows on page 1.

    NOTE(review): the first request presumably primes server-side search
    state that the second one relies on -- confirm against the site.

    :param locale: value substituted into the ``request_locale`` query
        parameter of both URLs.
    :returns: the decoded body of the second response.
    :raises httpx.HTTPStatusError: if either response has an error status.
    """
    search_url = f"https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=1&keyword=&stype=1&ctype=&starfr=&starto=&pscalefr=&pscaleto=&linkmarkerfr=&linkmarkerto=&link_m=2&atkfr=&atkto=&deffr=&defto=&othercon=2&request_locale={locale}"
    listing_url = f"https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=2&sort=1&rp=99999&page=1&stype=1&othercon=2&page=1&request_locale={locale}"
    request_headers = {"Referer": "https://www.db.yugioh-card.com/"}

    with Client(
        http2=True,
        follow_redirects=True,
        headers=request_headers,
    ) as session:
        # The search request's body is discarded; only its status matters.
        search_response = session.get(search_url)
        search_response.raise_for_status()

        listing_response = session.get(listing_url)
        listing_response.raise_for_status()
        page_source = listing_response.text

    return page_source
|
||
|
||
def replace_text_breaks(element: "Tag") -> str:
    """Return the text of *element* with ``<br>`` tags turned into newlines.

    Every ``br`` descendant is replaced in place by a ``"\\n"`` string, so
    the passed element is mutated. The resulting text is returned with
    leading and trailing whitespace stripped.
    """
    line_breaks = element.find_all("br")
    for line_break in line_breaks:
        line_break.replace_with("\n")

    flattened = element.text
    return flattened.strip()
|
||
|
||
def parse_card_text(div: "Tag") -> Tuple[str, str, str, str | None]:
    """Extract one card's fields from its result-row element.

    :param div: element containing an ``input.cid``, an ``input.cnm``, a
        ``dd.box_card_text`` and optionally a ``span.box_card_pen_effect``.
    :returns: ``(konami_id, name, text, pendulum)`` where the first two are
        the ``value`` attributes of the inputs, ``text`` is the card text
        with line breaks converted to newlines, and ``pendulum`` is the
        converted pendulum-effect text or ``None`` when that span is absent.
    """
    id_field = div.select_one("input.cid")
    # Alternate: span.card_name
    name_field = div.select_one("input.cnm")
    text_block = div.select_one("dd.box_card_text")
    pendulum_block = div.select_one("span.box_card_pen_effect")

    konami_id = id_field["value"]
    name = name_field["value"]
    text = replace_text_breaks(text_block)

    # The pendulum span is optional, so fall back to None when it is missing.
    if pendulum_block:
        pendulum = replace_text_breaks(pendulum_block)
    else:
        pendulum = None

    return konami_id, name, text, pendulum
|
||
|
||
def transform_all(html: str, output_csv: str) -> None:
    """Parse the downloaded results page and write its cards to a CSV file.

    The HTML is parsed with the ``lxml`` parser. A header row
    ``konami_id, name, text, pendulum`` is written first, followed by one
    row per element matching ``div.t_row.c_normal``, each produced by
    ``parse_card_text``.

    :param html: full HTML source of the results page.
    :param output_csv: path of the CSV file to create or overwrite.
    """
    # logging.log() requires an explicit level as its first argument;
    # calling it with only a message raises TypeError, so use info().
    logging.info("Start HTML parse")
    soup = BeautifulSoup(html, "lxml")
    logging.info("End HTML parse")
    # newline="" lets the csv module control line endings itself, which
    # matters because text fields contain embedded newlines. The explicit
    # encoding keeps non-ASCII card text independent of the host locale.
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        csv = writer(f)
        csv.writerow(["konami_id", "name", "text", "pendulum"])
        for div in soup.select("div.t_row.c_normal"):
            csv.writerow(parse_card_text(div))
|
||
|
||
if __name__ == "__main__":
    # Both the locale and the destination CSV path are required; any
    # further arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <locale> <output.csv>")

    locale, output_path = sys.argv[1:3]

    logging.basicConfig(level=logging.INFO)
    # TODO: compute SHA256 checksum and skip if matching
    transform_all(download_all(locale), output_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
beautifulsoup4 | ||
httpx[http2] | ||
lxml | ||
ruamel.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters