4
4
from os import listdir , makedirs
5
5
import argparse
6
6
from user_agent import generate_user_agent
7
+ import json
7
8
8
9
9
10
def get_countries (workspace_path ):
@@ -29,70 +30,81 @@ def download_lyrics(args):
29
30
for country in countries :
30
31
print ("[English Subset] Recovering from country {}" .format (country ))
31
32
word_level_list = file2list (join (workspace , "data" , country , "word_level.txt" ))
33
+ path_downloaded_lyrics = join (workspace , "DownloadLyric" )
34
+ create_folder (path_downloaded_lyrics )
35
+ recovered_lyrics = [f for f in listdir (path_downloaded_lyrics ) if f .endswith ('.txt' )]
32
36
33
37
for word_level in word_level_list :
34
38
headers = {'User-Agent' : generate_user_agent (device_type = "desktop" , os = ('mac' , 'linux' ))}
35
39
# Arrangement ID
36
40
arrangement = word_level .split ('.' )[0 ]
37
- # Metadata file path
38
- metadata_path = join (workspace , 'data' , country , country + "ArrangementMeta" , arrangement + ".txt" )
39
-
40
- # Read metadata file for current word_level prompt
41
- try :
42
- metadata = dict (map (str , x .split (':' , 1 )) for x in file2list (metadata_path ))
43
- except ValueError :
44
- # Metadata file has format errors
45
- # create empty dict
46
- if arrangement not in metadata_with_errors :
47
- metadata_with_errors .append (arrangement )
48
- metadata = {}
49
-
50
- # Catch error if title is not in Metadata
51
- try :
52
- title = format_text (metadata ['Arrangement title' ].lstrip ())
53
- except KeyError :
54
- if arrangement not in metadata_with_errors :
55
- metadata_with_errors .append (arrangement )
56
- title = ""
57
-
58
- # Catch error if artist is not in Metadata
59
- try :
60
- artist = format_text (metadata ['Arrangement artist' ].lstrip ())
61
- except KeyError :
62
- if arrangement not in metadata_with_errors :
63
- metadata_with_errors .append (arrangement )
64
- artist = ""
65
-
66
- url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement' .\
67
- format (artist , title , arrangement )
68
-
69
- # try to get the lyrics several time in case of errors from the network connection
70
- attempts = 50
71
- while attempts > 0 :
72
- response = requests .get (url , timeout = 5 , headers = headers )
73
- html = response .content
74
- soup = BeautifulSoup (html , "html5lib" )
75
- mydiv = soup .find_all ("div" , {"class" : "main" })
41
+
42
+ # Download the lyrics only on the first occurrence of an arrangement, not once per country
43
+ if arrangement + ".txt" not in recovered_lyrics :
44
+ # Metadata file path
45
+ metadata_path = join (workspace , 'data' , country , country + "ArrangementMeta" , arrangement + ".txt" )
46
+
47
+ # Read metadata file for current word_level prompt
48
+ try :
49
+ metadata = dict (map (str , x .split (':' , 1 )) for x in file2list (metadata_path ))
50
+ except ValueError :
51
+ # Metadata file has format errors
52
+ # create empty dict
53
+ if arrangement not in metadata_with_errors :
54
+ metadata_with_errors .append (arrangement )
55
+ metadata = {}
56
+
57
+ # Catch error if title is not in Metadata
58
+ try :
59
+ title = format_text (metadata ['Arrangement title' ].lstrip ())
60
+ except KeyError :
61
+ if arrangement not in metadata_with_errors :
62
+ metadata_with_errors .append (arrangement )
63
+ title = ""
64
+
65
+ # Catch error if artist is not in Metadata
66
+ try :
67
+ artist = format_text (metadata ['Arrangement artist' ].lstrip ())
68
+ except KeyError :
69
+ if arrangement not in metadata_with_errors :
70
+ metadata_with_errors .append (arrangement )
71
+ artist = ""
72
+
73
+ url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement' .\
74
+ format (artist , title , arrangement )
75
+
76
+ # Try to get the lyrics several times in case of network connection errors
77
+ attempts = 5
78
+ while attempts > 0 :
79
+ response = requests .get (url , timeout = 5 , headers = headers )
80
+ html = response .content
81
+ soup = BeautifulSoup (html , "html.parser" )
82
+ mydiv = soup .find_all ("script" )#, {"class": "_1frabae"})
83
+ if len (mydiv ) < 1 :
84
+ attempts -= 1
85
+ else :
86
+ attempts = 0
87
+
76
88
if len (mydiv ) < 1 :
77
- attempts -= 1
89
+ mydiv = soup .find_all ("div" , {"class" : "column error-gone" })
90
+ print ("[WARNING] can't find {}" .format (url ))
91
+ for div in mydiv :
92
+ path_to_error_download = join (workspace , "data" , country , "error_download.txt" )
93
+ with open (path_to_error_download , "a" ) as error_file :
94
+ error_file .write ("arrangement: {}\t error: {}\t details: {}" .format (
95
+ arrangement , div .h1 .get_text (), div .p .get_text ()
96
+ ))
78
97
else :
79
- attempts = 0
80
-
81
- if len (mydiv ) < 1 :
82
- mydiv = soup .find_all ("div" , {"class" : "column error-gone" })
83
- print ("[WARNING] can't find {}" .format (url ))
84
- for div in mydiv :
85
- path_to_error_download = join (workspace , "data" , country , "error_download.txt" )
86
- with open (path_to_error_download , "a" ) as error_file :
87
- error_file .write ("arrangement: {}\t error: {}\t details: {}" .format (
88
- arrangement , div .h1 .get_text (), div .p .get_text ()
89
- ))
90
- else :
91
- for div in mydiv :
92
- lyric = div .get_text (strip = True , separator = "\n " ).split ("\n " )
93
- path_new_lyric = join (workspace , "data" , country , country + "DownloadLyric" , arrangement + ".txt" )
94
- #print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
95
- list2file (path_new_lyric , lyric )
98
+ for div in mydiv :
99
+ lyric_text = div .get_text ()#.replace("\n","")#.split("\n")
100
+ if "<p>" in lyric_text :
101
+ lyric = lyric_text [lyric_text .find ("<p>" )+ 3 :lyric_text .find ("</p>" )- 4 ].split ("<br>" )
102
+ path_new_lyric = join (path_downloaded_lyrics ,
103
+ arrangement + ".txt" )
104
+ print ("[Recover lyric] url {} - > save in {}" .format (url , path_new_lyric ))
105
+ list2file (path_new_lyric , lyric )
106
+ break
107
+
96
108
97
109
98
110
def format_text (text ):
@@ -114,11 +126,19 @@ def create_folder(fd):
114
126
makedirs (fd )
115
127
116
128
129
def clean_text(text):
    """Return *text* with HTML-escaped quote characters turned back into quotes.

    Two character references are handled:

    * ``&#39;``  -> ``'`` (apostrophe / single quote)
    * ``&quot;`` -> ``"`` (double quote)

    All other content is returned unchanged.

    :param text: a single string, e.g. one line of a downloaded lyric.
    :return: the string with both entities replaced by literal quotes.
    """
    # str.replace is already a no-op when the pattern is absent, so no
    # membership test is needed before each substitution.
    return text.replace("&#39;", "'").replace("&quot;", '"')
135
+
136
+
117
137
def list2file(path, data):
    """Write every item of *data* to the file at *path*, one item per line.

    The parent directory of *path* is handed to ``create_folder`` before the
    file is opened. The file is opened in ``"w"`` mode, so existing content
    at *path* is replaced. Each item is passed through ``clean_text`` before
    it is written.

    :param path: destination file path.
    :param data: iterable of strings to write.
    """
    create_folder(dirname(path))
    with open(path, "w") as file:
        for item in data:
            # Terminate each entry with a bare newline: any character placed
            # after "\n" would end up at the start of the next written line.
            file.write("{}\n".format(clean_text(item)))
122
142
123
143
124
144
def file2list (filepath ):
@@ -143,7 +163,7 @@ def file2list(filepath):
143
163
' to recover the lyrics as appears in the Smule website'
144
164
)
145
165
146
- parser .add_argument ("workspace" , type = str , help = "Path to Workspece " )
166
+ parser .add_argument ("workspace" , type = str , help = "Path to Workspace " )
147
167
parser .add_argument ('--version' , action = 'version' ,
148
168
version = '%(prog)s 1.0' )
149
169
0 commit comments