Skip to content

Commit bc08f22

Browse files
committed
Updates scraping in construction of DSing
1 parent: 07214c2 · commit: bc08f22

File tree

3 files changed

+111
-77
lines changed

3 files changed

+111
-77
lines changed

DSing Construction/runme_sing2dsing.sh

+2-5
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ echo "DSing30 = The English spoken recordings from GB plus all the other English
1212

1313
version=
1414

15-
DSing_dest=/media/gerardo/SoloSinging/${version}
15+
DSing_dest=/media/gerardo/SoloSinging/DSing_Task/${version}
1616
SmuleSing_path=/media/gerardo/SoloSinging/DAMP/sing_300x30x2
1717

1818
# A- Prepare the workspace
@@ -25,10 +25,7 @@ python copy_lyrics.py $DSing_dest $SmuleSing_path
2525
python identify_wordlevel_lyrics.py $DSing_dest
2626

2727
# D- Download sentence-level prompt-lyrics from Smule
28-
#
29-
# I need to change this step.
30-
# Smule changes the divs and blocks scrapping
31-
# python scraping_lyrics.py $workspace $db_path
28+
python scraping_lyrics.py $DSing_dest
3229

3330
# E- Transform word to sentence level
3431
python word_to_sentence_level.py $DSing_dest

DSing Construction/scraping_lyrics.py

+79-59
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from os import listdir, makedirs
55
import argparse
66
from user_agent import generate_user_agent
7+
import json
78

89

910
def get_countries(workspace_path):
@@ -29,70 +30,81 @@ def download_lyrics(args):
2930
for country in countries:
3031
print("[English Subset] Recovering from country {}".format(country))
3132
word_level_list = file2list(join(workspace, "data", country, "word_level.txt"))
33+
path_downloaded_lyrics = join(workspace, "DownloadLyric")
34+
create_folder(path_downloaded_lyrics)
35+
recovered_lyrics = [f for f in listdir(path_downloaded_lyrics) if f.endswith('.txt')]
3236

3337
for word_level in word_level_list:
3438
headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
3539
# Arrangement ID
3640
arrangement = word_level.split('.')[0]
37-
# Metadata file path
38-
metadata_path = join(workspace, 'data', country, country + "ArrangementMeta", arrangement + ".txt")
39-
40-
# Read metadata file for current word_level prompt
41-
try:
42-
metadata = dict(map(str, x.split(':', 1)) for x in file2list(metadata_path))
43-
except ValueError:
44-
# Metadata file has format errors
45-
# create empty dict
46-
if arrangement not in metadata_with_errors:
47-
metadata_with_errors.append(arrangement)
48-
metadata = {}
49-
50-
# Catch error if title is not in Metadata
51-
try:
52-
title = format_text(metadata['Arrangement title'].lstrip())
53-
except KeyError:
54-
if arrangement not in metadata_with_errors:
55-
metadata_with_errors.append(arrangement)
56-
title = ""
57-
58-
# Catch error if artist is not in Metadata
59-
try:
60-
artist = format_text(metadata['Arrangement artist'].lstrip())
61-
except KeyError:
62-
if arrangement not in metadata_with_errors:
63-
metadata_with_errors.append(arrangement)
64-
artist = ""
65-
66-
url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement'.\
67-
format(artist, title, arrangement)
68-
69-
# try to get the lyrics several time in case of errors from the network connection
70-
attempts = 50
71-
while attempts > 0:
72-
response = requests.get(url, timeout=5, headers=headers)
73-
html = response.content
74-
soup = BeautifulSoup(html, "html5lib")
75-
mydiv = soup.find_all("div", {"class": "main"})
41+
42+
# Just download the lyrics on the first occurrence, no per country
43+
if arrangement + ".txt" not in recovered_lyrics:
44+
# Metadata file path
45+
metadata_path = join(workspace, 'data', country, country + "ArrangementMeta", arrangement + ".txt")
46+
47+
# Read metadata file for current word_level prompt
48+
try:
49+
metadata = dict(map(str, x.split(':', 1)) for x in file2list(metadata_path))
50+
except ValueError:
51+
# Metadata file has format errors
52+
# create empty dict
53+
if arrangement not in metadata_with_errors:
54+
metadata_with_errors.append(arrangement)
55+
metadata = {}
56+
57+
# Catch error if title is not in Metadata
58+
try:
59+
title = format_text(metadata['Arrangement title'].lstrip())
60+
except KeyError:
61+
if arrangement not in metadata_with_errors:
62+
metadata_with_errors.append(arrangement)
63+
title = ""
64+
65+
# Catch error if artist is not in Metadata
66+
try:
67+
artist = format_text(metadata['Arrangement artist'].lstrip())
68+
except KeyError:
69+
if arrangement not in metadata_with_errors:
70+
metadata_with_errors.append(arrangement)
71+
artist = ""
72+
73+
url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement'.\
74+
format(artist, title, arrangement)
75+
76+
# try to get the lyrics several time in case of errors from the network connection
77+
attempts = 5
78+
while attempts > 0:
79+
response = requests.get(url, timeout=5, headers=headers)
80+
html = response.content
81+
soup = BeautifulSoup(html, "html.parser")
82+
mydiv = soup.find_all("script")#, {"class": "_1frabae"})
83+
if len(mydiv) < 1:
84+
attempts -= 1
85+
else:
86+
attempts = 0
87+
7688
if len(mydiv) < 1:
77-
attempts -= 1
89+
mydiv = soup.find_all("div", {"class": "column error-gone"})
90+
print("[WARNING] can't find {}".format(url))
91+
for div in mydiv:
92+
path_to_error_download = join(workspace, "data", country, "error_download.txt")
93+
with open(path_to_error_download, "a") as error_file:
94+
error_file.write("arrangement: {}\terror: {}\tdetails: {}".format(
95+
arrangement, div.h1.get_text(), div.p.get_text()
96+
))
7897
else:
79-
attempts = 0
80-
81-
if len(mydiv) < 1:
82-
mydiv = soup.find_all("div", {"class": "column error-gone"})
83-
print("[WARNING] can't find {}".format(url))
84-
for div in mydiv:
85-
path_to_error_download = join(workspace, "data", country, "error_download.txt")
86-
with open(path_to_error_download, "a") as error_file:
87-
error_file.write("arrangement: {}\terror: {}\tdetails: {}".format(
88-
arrangement, div.h1.get_text(), div.p.get_text()
89-
))
90-
else:
91-
for div in mydiv:
92-
lyric = div.get_text(strip=True, separator="\n").split("\n")
93-
path_new_lyric = join(workspace, "data", country, country + "DownloadLyric", arrangement + ".txt")
94-
#print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
95-
list2file(path_new_lyric, lyric)
98+
for div in mydiv:
99+
lyric_text = div.get_text()#.replace("\n","")#.split("\n")
100+
if "<p>" in lyric_text:
101+
lyric = lyric_text[lyric_text.find("<p>")+3:lyric_text.find("</p>")-4].split("<br>")
102+
path_new_lyric = join(path_downloaded_lyrics,
103+
arrangement + ".txt")
104+
print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
105+
list2file(path_new_lyric, lyric)
106+
break
107+
96108

97109

98110
def format_text(text):
@@ -114,11 +126,19 @@ def create_folder(fd):
114126
makedirs(fd)
115127

116128

129+
def clean_text(text):
130+
if "&#39;" in text:
131+
text = text.replace("&#39;", "'")
132+
if "&quot;" in text:
133+
text = text.replace("&quot;", '"')
134+
return(text)
135+
136+
117137
def list2file(path, data):
118138
create_folder(dirname(path))
119139
with open(path, "w") as file:
120140
for item in data:
121-
file.write("{}\n".format(item))
141+
file.write("{}\n".format(clean_text(item)))
122142

123143

124144
def file2list(filepath):
@@ -143,7 +163,7 @@ def file2list(filepath):
143163
' to recover the lyrics as appears in the Smule website'
144164
)
145165

146-
parser.add_argument("workspace", type=str, help="Path to Workspece")
166+
parser.add_argument("workspace", type=str, help="Path to Workspace")
147167
parser.add_argument('--version', action='version',
148168
version='%(prog)s 1.0')
149169

DSing Construction/word_to_sentence_level.py

+30-13
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,9 @@ def file2list(filepath):
5353
outlist = []
5454
with open(filepath) as file:
5555
for line in file:
56-
outlist.append(line.replace('\n', ''))
56+
line = line.replace('\n', '')
57+
if line:
58+
outlist.append(line)
5759
return outlist
5860

5961

@@ -69,16 +71,18 @@ def create_original_json(args):
6971

7072
for country in countries:
7173
print("[English Subset] Doing word2sentence lyrics of country {}".format(country))
72-
arrangement_list = [f for f in listdir(join(workspace, "data", country, country + "DownloadLyric"))
73-
if f.endswith('.txt')]
74+
arrangement_list = file2list(join(workspace, "data", country, "word_level.txt"))
75+
7476
for arrangement in arrangement_list:
75-
new_text_path = join(workspace, "data", country, country + 'DownloadLyric', arrangement)
76-
original_annotation_path = join(workspace, "data", country, country + 'Lyrics', arrangement.split(".")[0] + ".json")
77-
reconstructed_annotation = reconstruct_original_lyrics(new_text_path, original_annotation_path)
78-
original_reconstructed_lyrics_path = join(workspace, "data", country, country + 'DownloadLyric', arrangement.split(".")[0] + ".json")
79-
create_folder(dirname(original_reconstructed_lyrics_path))
80-
with open(original_reconstructed_lyrics_path, 'w') as outfile:
81-
json.dump(reconstructed_annotation, outfile, indent=4)
77+
new_text_path = join(workspace, 'DownloadLyric', arrangement.split(".")[0] + ".txt")
78+
if exists(new_text_path):
79+
original_annotation_path = join(workspace, "data", country, country + 'Lyrics', arrangement)
80+
reconstructed_annotation = reconstruct_original_lyrics(new_text_path, original_annotation_path)
81+
original_reconstructed_lyrics_path = join(workspace, "data", country, country + 'DownloadLyric',
82+
arrangement.split(".")[0] + ".json")
83+
create_folder(dirname(original_reconstructed_lyrics_path))
84+
with open(original_reconstructed_lyrics_path, 'w') as outfile:
85+
json.dump(reconstructed_annotation, outfile, indent=4)
8286

8387
one_word_recovered = [f for f in listdir(join(workspace, "data", country, country + 'DownloadLyric'))
8488
if f.endswith(".json")]
@@ -94,7 +98,6 @@ def reconstruct_original_lyrics(text_lyrics_path, json_lyrics_path):
9498
text_lyrics = file2list(text_lyrics_path)
9599
it_text_lyrics = iter(text_lyrics)
96100
current_text = next(it_text_lyrics)
97-
98101
reconstruct = []
99102
element = {'t': 0.0,
100103
'l': ""}
@@ -104,12 +107,26 @@ def reconstruct_original_lyrics(text_lyrics_path, json_lyrics_path):
104107
except json.decoder.JSONDecodeError:
105108
print(data_file)
106109
for item in data:
107-
if item['l'] in current_text:
110+
if item['l'] == current_text[:len(item['l'])]:
108111
if element['t'] == 0:
109112
element['t'] = item['t']
110113
element['l'] = current_text
111114
current_text = current_text[len(item['l']):].lstrip()
112-
if len(current_text) == 0:
115+
116+
# if item['l'] == current_text[1:len(item['l'])]:
117+
# if element['t'] == 0:
118+
# element['t'] = item['t']
119+
# element['l'] = current_text
120+
# current_text = current_text[len(item['l'])+1:].lstrip()
121+
#
122+
# if item['l'][1:] == current_text[:len(item['l'])-1]:
123+
# if element['t'] == 0:
124+
# element['t'] = item['t']
125+
# element['l'] = current_text
126+
# current_text = current_text[len(item['l'])-1:].lstrip()
127+
128+
129+
if len(current_text) == 0 or current_text == ',' or current_text == '.':
113130
reconstruct.append(element)
114131
element = {'t': 0.0,
115132
'l': ""}

0 commit comments

Comments (0)