Skip to content

Commit bc08f22

Browse files
committed
Updates scraping in construction of DSing
1 parent: 07214c2 · commit: bc08f22

File tree

3 files changed

+111
-77
lines changed

3 files changed

+111
-77
lines changed

DSing Construction/runme_sing2dsing.sh

+2-5
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ echo "DSing30 = The English spoken recordings from GB plus all the other English
1212

1313
version=
1414

15-
DSing_dest=/media/gerardo/SoloSinging/${version}
15+
DSing_dest=/media/gerardo/SoloSinging/DSing_Task/${version}
1616
SmuleSing_path=/media/gerardo/SoloSinging/DAMP/sing_300x30x2
1717

1818
# A- Prepare the workspace
@@ -25,10 +25,7 @@ python copy_lyrics.py $DSing_dest $SmuleSing_path
2525
python identify_wordlevel_lyrics.py $DSing_dest
2626

2727
# D- Download sentence-level prompt-lyrics from Smule
28-
#
29-
# I need to change this step.
30-
# Smule changes the divs and blocks scrapping
31-
# python scraping_lyrics.py $workspace $db_path
28+
python scraping_lyrics.py $DSing_dest
3229

3330
# E- Transform word to sentence level
3431
python word_to_sentence_level.py $DSing_dest

DSing Construction/scraping_lyrics.py

+79-59
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from os import listdir, makedirs
55
import argparse
66
from user_agent import generate_user_agent
7+
import json
78

89

910
def get_countries(workspace_path):
@@ -29,70 +30,81 @@ def download_lyrics(args):
2930
for country in countries:
3031
print("[English Subset] Recovering from country {}".format(country))
3132
word_level_list = file2list(join(workspace, "data", country, "word_level.txt"))
33+
path_downloaded_lyrics = join(workspace, "DownloadLyric")
34+
create_folder(path_downloaded_lyrics)
35+
recovered_lyrics = [f for f in listdir(path_downloaded_lyrics) if f.endswith('.txt')]
3236

3337
for word_level in word_level_list:
3438
headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
3539
# Arrangement ID
3640
arrangement = word_level.split('.')[0]
37-
# Metadata file path
38-
metadata_path = join(workspace, 'data', country, country + "ArrangementMeta", arrangement + ".txt")
39-
40-
# Read metadata file for current word_level prompt
41-
try:
42-
metadata = dict(map(str, x.split(':', 1)) for x in file2list(metadata_path))
43-
except ValueError:
44-
# Metadata file has format errors
45-
# create empty dict
46-
if arrangement not in metadata_with_errors:
47-
metadata_with_errors.append(arrangement)
48-
metadata = {}
49-
50-
# Catch error if title is not in Metadata
51-
try:
52-
title = format_text(metadata['Arrangement title'].lstrip())
53-
except KeyError:
54-
if arrangement not in metadata_with_errors:
55-
metadata_with_errors.append(arrangement)
56-
title = ""
57-
58-
# Catch error if artist is not in Metadata
59-
try:
60-
artist = format_text(metadata['Arrangement artist'].lstrip())
61-
except KeyError:
62-
if arrangement not in metadata_with_errors:
63-
metadata_with_errors.append(arrangement)
64-
artist = ""
65-
66-
url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement'.\
67-
format(artist, title, arrangement)
68-
69-
# try to get the lyrics several time in case of errors from the network connection
70-
attempts = 50
71-
while attempts > 0:
72-
response = requests.get(url, timeout=5, headers=headers)
73-
html = response.content
74-
soup = BeautifulSoup(html, "html5lib")
75-
mydiv = soup.find_all("div", {"class": "main"})
41+
42+
# Just download the lyrics on the first occurrence, no per country
43+
if arrangement + ".txt" not in recovered_lyrics:
44+
# Metadata file path
45+
metadata_path = join(workspace, 'data', country, country + "ArrangementMeta", arrangement + ".txt")
46+
47+
# Read metadata file for current word_level prompt
48+
try:
49+
metadata = dict(map(str, x.split(':', 1)) for x in file2list(metadata_path))
50+
except ValueError:
51+
# Metadata file has format errors
52+
# create empty dict
53+
if arrangement not in metadata_with_errors:
54+
metadata_with_errors.append(arrangement)
55+
metadata = {}
56+
57+
# Catch error if title is not in Metadata
58+
try:
59+
title = format_text(metadata['Arrangement title'].lstrip())
60+
except KeyError:
61+
if arrangement not in metadata_with_errors:
62+
metadata_with_errors.append(arrangement)
63+
title = ""
64+
65+
# Catch error if artist is not in Metadata
66+
try:
67+
artist = format_text(metadata['Arrangement artist'].lstrip())
68+
except KeyError:
69+
if arrangement not in metadata_with_errors:
70+
metadata_with_errors.append(arrangement)
71+
artist = ""
72+
73+
url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement'.\
74+
format(artist, title, arrangement)
75+
76+
# try to get the lyrics several time in case of errors from the network connection
77+
attempts = 5
78+
while attempts > 0:
79+
response = requests.get(url, timeout=5, headers=headers)
80+
html = response.content
81+
soup = BeautifulSoup(html, "html.parser")
82+
mydiv = soup.find_all("script")#, {"class": "_1frabae"})
83+
if len(mydiv) < 1:
84+
attempts -= 1
85+
else:
86+
attempts = 0
87+
7688
if len(mydiv) < 1:
77-
attempts -= 1
89+
mydiv = soup.find_all("div", {"class": "column error-gone"})
90+
print("[WARNING] can't find {}".format(url))
91+
for div in mydiv:
92+
path_to_error_download = join(workspace, "data", country, "error_download.txt")
93+
with open(path_to_error_download, "a") as error_file:
94+
error_file.write("arrangement: {}\terror: {}\tdetails: {}".format(
95+
arrangement, div.h1.get_text(), div.p.get_text()
96+
))
7897
else:
79-
attempts = 0
80-
81-
if len(mydiv) < 1:
82-
mydiv = soup.find_all("div", {"class": "column error-gone"})
83-
print("[WARNING] can't find {}".format(url))
84-
for div in mydiv:
85-
path_to_error_download = join(workspace, "data", country, "error_download.txt")
86-
with open(path_to_error_download, "a") as error_file:
87-
error_file.write("arrangement: {}\terror: {}\tdetails: {}".format(
88-
arrangement, div.h1.get_text(), div.p.get_text()
89-
))
90-
else:
91-
for div in mydiv:
92-
lyric = div.get_text(strip=True, separator="\n").split("\n")
93-
path_new_lyric = join(workspace, "data", country, country + "DownloadLyric", arrangement + ".txt")
94-
#print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
95-
list2file(path_new_lyric, lyric)
98+
for div in mydiv:
99+
lyric_text = div.get_text()#.replace("\n","")#.split("\n")
100+
if "<p>" in lyric_text:
101+
lyric = lyric_text[lyric_text.find("<p>")+3:lyric_text.find("</p>")-4].split("<br>")
102+
path_new_lyric = join(path_downloaded_lyrics,
103+
arrangement + ".txt")
104+
print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
105+
list2file(path_new_lyric, lyric)
106+
break
107+
96108

97109

98110
def format_text(text):
@@ -114,11 +126,19 @@ def create_folder(fd):
114126
makedirs(fd)
115127

116128

129+
def clean_text(text):
130+
if "&#39;" in text:
131+
text = text.replace("&#39;", "'")
132+
if "&quot;" in text:
133+
text = text.replace("&quot;", '"')
134+
return(text)
135+
136+
117137
def list2file(path, data):
118138
create_folder(dirname(path))
119139
with open(path, "w") as file:
120140
for item in data:
121-
file.write("{}\n".format(item))
141+
file.write("{}\n".format(clean_text(item)))
122142

123143

124144
def file2list(filepath):
@@ -143,7 +163,7 @@ def file2list(filepath):
143163
' to recover the lyrics as appears in the Smule website'
144164
)
145165

146-
parser.add_argument("workspace", type=str, help="Path to Workspece")
166+
parser.add_argument("workspace", type=str, help="Path to Workspace")
147167
parser.add_argument('--version', action='version',
148168
version='%(prog)s 1.0')
149169

DSing Construction/word_to_sentence_level.py

+30-13
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,9 @@ def file2list(filepath):
5353
outlist = []
5454
with open(filepath) as file:
5555
for line in file:
56-
outlist.append(line.replace('\n', ''))
56+
line = line.replace('\n', '')
57+
if line:
58+
outlist.append(line)
5759
return outlist
5860

5961

@@ -69,16 +71,18 @@ def create_original_json(args):
6971

7072
for country in countries:
7173
print("[English Subset] Doing word2sentence lyrics of country {}".format(country))
72-
arrangement_list = [f for f in listdir(join(workspace, "data", country, country + "DownloadLyric"))
73-
if f.endswith('.txt')]
74+
arrangement_list = file2list(join(workspace, "data", country, "word_level.txt"))
75+
7476
for arrangement in arrangement_list:
75-
new_text_path = join(workspace, "data", country, country + 'DownloadLyric', arrangement)
76-
original_annotation_path = join(workspace, "data", country, country + 'Lyrics', arrangement.split(".")[0] + ".json")
77-
reconstructed_annotation = reconstruct_original_lyrics(new_text_path, original_annotation_path)
78-
original_reconstructed_lyrics_path = join(workspace, "data", country, country + 'DownloadLyric', arrangement.split(".")[0] + ".json")
79-
create_folder(dirname(original_reconstructed_lyrics_path))
80-
with open(original_reconstructed_lyrics_path, 'w') as outfile:
81-
json.dump(reconstructed_annotation, outfile, indent=4)
77+
new_text_path = join(workspace, 'DownloadLyric', arrangement.split(".")[0] + ".txt")
78+
if exists(new_text_path):
79+
original_annotation_path = join(workspace, "data", country, country + 'Lyrics', arrangement)
80+
reconstructed_annotation = reconstruct_original_lyrics(new_text_path, original_annotation_path)
81+
original_reconstructed_lyrics_path = join(workspace, "data", country, country + 'DownloadLyric',
82+
arrangement.split(".")[0] + ".json")
83+
create_folder(dirname(original_reconstructed_lyrics_path))
84+
with open(original_reconstructed_lyrics_path, 'w') as outfile:
85+
json.dump(reconstructed_annotation, outfile, indent=4)
8286

8387
one_word_recovered = [f for f in listdir(join(workspace, "data", country, country + 'DownloadLyric'))
8488
if f.endswith(".json")]
@@ -94,7 +98,6 @@ def reconstruct_original_lyrics(text_lyrics_path, json_lyrics_path):
9498
text_lyrics = file2list(text_lyrics_path)
9599
it_text_lyrics = iter(text_lyrics)
96100
current_text = next(it_text_lyrics)
97-
98101
reconstruct = []
99102
element = {'t': 0.0,
100103
'l': ""}
@@ -104,12 +107,26 @@ def reconstruct_original_lyrics(text_lyrics_path, json_lyrics_path):
104107
except json.decoder.JSONDecodeError:
105108
print(data_file)
106109
for item in data:
107-
if item['l'] in current_text:
110+
if item['l'] == current_text[:len(item['l'])]:
108111
if element['t'] == 0:
109112
element['t'] = item['t']
110113
element['l'] = current_text
111114
current_text = current_text[len(item['l']):].lstrip()
112-
if len(current_text) == 0:
115+
116+
# if item['l'] == current_text[1:len(item['l'])]:
117+
# if element['t'] == 0:
118+
# element['t'] = item['t']
119+
# element['l'] = current_text
120+
# current_text = current_text[len(item['l'])+1:].lstrip()
121+
#
122+
# if item['l'][1:] == current_text[:len(item['l'])-1]:
123+
# if element['t'] == 0:
124+
# element['t'] = item['t']
125+
# element['l'] = current_text
126+
# current_text = current_text[len(item['l'])-1:].lstrip()
127+
128+
129+
if len(current_text) == 0 or current_text == ',' or current_text == '.':
113130
reconstruct.append(element)
114131
element = {'t': 0.0,
115132
'l': ""}

0 commit comments

Comments (0)