-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
130 lines (109 loc) · 3.99 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from bs4 import BeautifulSoup
import requests
import csv
from selenium import webdriver
import os
# Lesson to scrape: course level, unit number and lesson number.
CUR_LEVEL = "upper-intermediate"  # must match one of the keys of LEVELS below
UNIT = 2
LESSON = 3

# Yoyochinese naming per level: [MP3 file-name prefix, lesson URL slug]
LEVELS = {
    "beginner": ["CCR", "beginner-conversational"],
    "intermediate": ["ICC", "intermediate-conversational"],
    "upper-intermediate": ["UICC", "upper-intermediate-conversational"],
}

# URL of the lesson's dialogue page.
# NOTE: the generated URL below only works for upper-intermediate lessons.
# For beginner and intermediate lessons set URL manually, following the
# commented-out example (those URLs carry the lesson title as well).
URL = f"https://yoyochinese.com/lesson/{LEVELS[CUR_LEVEL][1]}-unit-{UNIT}-lesson-{LESSON}/dialogue"
# URL = "https://yoyochinese.com/lesson/intermediate-conversational-unit-56-lesson-3-Talking-About-Music-Pop/dialogue"

# Class of the <li> element that holds one dialogue sentence
# (look it up with "Inspect" in your web browser).
HTML_CLASS = "Lynmuf9J_7kHUCdOFb3r"

# Base URL that the MP3 file names are appended to.
AUDIO_URL = "https://cdn.yoyochinese.com/audio/dialoguereplay/"

# The level prefix is already a string, so it is used as-is.
MP3_FILE_PREFIX = LEVELS[CUR_LEVEL][0]

# The audio file-name pattern is not always the same on Yoyochinese
# (for example, the "N" is not always positioned at the end).
# Change the pattern string inside get_audio_file_name() accordingly.
def format_unit_num(unit=None):
    """
    Return the unit number as a string, zero-padded to two digits.

    :param unit: unit number to format; when omitted, the module-level
        UNIT is used.
    :return: always a string, e.g. "02" for 2 and "12" for 12, so callers
        never receive a mix of str and int.
    """
    if unit is None:
        unit = UNIT
    return f"{unit:02d}"
def format_tag():
    """
    Build the Anki tag stored in the CSV file: the current level followed
    by the unit number as formatted by format_unit_num().
    """
    return f"yoyochinese::{CUR_LEVEL}::unit-{format_unit_num()}"
def get_audio_file_name(index, unit):
    """
    Build the MP3 file name of one dialogue sentence from the level prefix,
    unit, lesson number and the sentence's position inside the lesson.

    :param index: zero-based position of the sentence; the file name uses
        the one-based number (index + 1).
    :param unit: unit number as formatted by format_unit_num().
    :return: the file name, e.g. "DR-UICC-002-03-N-001.mp3" for index 0 when
        the prefix is "UICC", unit is "02" and LESSON is 3.
    """
    # Sentence number padded to two digits; the pattern below prepends one more "0".
    n = f"{index + 1:02d}"
    # Patterns seen on Yoyochinese (edit the returned string when it differs):
    # LEVEL-UNIT-LESSON-INDEX-N
    # DR-LEVEL-UNIT-LESSON-N-INDEX
    return f"DR-{MP3_FILE_PREFIX}-0{unit}-0{LESSON}-N-0{n}.mp3"
def save_audio(audio_file_name):
    """
    Download one dialogue MP3 and save it inside the current directory
    with the same file name that is saved inside the CSV.

    Prints the file name on success. When the server does not answer with
    status 200, prints the status code instead and writes no file.

    :param audio_file_name: MP3 file name; it is appended to AUDIO_URL to
        form the download URL and reused as the local file name.
    """
    response = requests.get(f"{AUDIO_URL}{audio_file_name}")
    if response.status_code != 200:
        print(f"Invalid MP3 URL status code: {response.status_code}")
        return
    # The context manager closes the file handle even if the write fails.
    with open(f"{audio_file_name}", "wb") as mp3_file:
        mp3_file.write(response.content)
    print(audio_file_name)
def parser():
    """
    Load the lesson page in Chrome, parse the HTML, and append one row per
    dialogue sentence to output.csv. Save the accompanying audio files.

    The header row is written only when output.csv is still empty. The
    browser and the CSV file are released even when a step raises.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(URL)
        # Page source as currently rendered by the browser.
        # NOTE(review): there is no explicit wait, so content that the page
        # loads late via JavaScript may be missing -- confirm.
        page_source = driver.page_source
        print("URL: ", URL)

        # Parse HTML
        soup = BeautifulSoup(page_source, 'html.parser')
        job_elements = soup.find_all("li", class_=HTML_CLASS)

        # Append mode keeps rows from earlier runs; "with" guarantees the close.
        with open('output.csv', 'a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            if os.path.getsize("output.csv") == 0:
                print("File is empty, adding headers...")
                headers = ['sentence', 'pinyin', 'translation', 'audio', 'tags']
                writer.writerow(headers)
            else:
                print("File is NOT empty")

            unit = format_unit_num()
            tag = format_tag()

            # For each row get the sentence dialogue elements
            for index, job_element in enumerate(job_elements):
                chinese = job_element.find("p", class_="characters").get_text()
                pinyin = job_element.find("p", class_="pinyin").get_text()
                english = job_element.find("p", class_="english").get_text()

                # Get matching file name and save audio file
                audio_file_name = get_audio_file_name(index, unit)
                save_audio(audio_file_name)
                audio_to_anki = f"[sound:{audio_file_name}]"  # Formatted for Anki audio field

                # Append to CSV file
                row = [chinese, pinyin, english, audio_to_anki, tag]
                writer.writerow(row)
    finally:
        # Close the browser, also when scraping failed part-way
        driver.quit()
    print("Done !")
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    parser()