Skip to content

Commit

Permalink
style: Constant case path string variables
Browse files Browse the repository at this point in the history
  • Loading branch information
chuangcaleb committed Apr 27, 2022
1 parent f5dea09 commit b348761
Show file tree
Hide file tree
Showing 9 changed files with 36 additions and 41 deletions.
8 changes: 4 additions & 4 deletions building_model/mtr_utils/import_dataset.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import pandas as pd
from mtr_utils import config as cfg

song_theme_feature_database_path = 'data/features/song_theme_feature_database.csv'
song_theme_label_database_path = 'data/labels/song_theme_label_database.xlsx'
FEATURE_DB_PATH = 'data/features/song_theme_feature_database.csv'
LABEL_DB_PATH = 'data/labels/song_theme_label_database.xlsx'

try:

# Access song_theme_feature_database
raw_feature_df = pd.read_csv(song_theme_feature_database_path)
raw_feature_df = pd.read_csv(FEATURE_DB_PATH)

# Access song_theme_label_database
raw_label_df = pd.read_excel(song_theme_label_database_path)
raw_label_df = pd.read_excel(LABEL_DB_PATH)

# Extract recognizable data from label dataset
recognz_label_df = raw_label_df[raw_label_df.recognizable == 1]
Expand Down
6 changes: 3 additions & 3 deletions calculating_dataset/clean_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import re

# Access song_theme_feature_database
song_theme_feature_database_path = 'data/features/song_theme_feature_database.csv'
features_df = pd.read_csv(song_theme_feature_database_path, na_values=[' NaN'])
FEATURE_DB_PATH = 'data/features/song_theme_feature_database.csv'
features_df = pd.read_csv(FEATURE_DB_PATH, na_values=[' NaN'])

# Clean ids
features_df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
Expand All @@ -14,4 +14,4 @@
# Replace NaN with 0
features_df.fillna(0, inplace=True)

features_df.to_csv(song_theme_feature_database_path, index=False)
features_df.to_csv(FEATURE_DB_PATH, index=False)
4 changes: 2 additions & 2 deletions calculating_dataset/generate_jsymbolic_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def config_write(string):
# Input midi bin's root dir
BIN_ROOT_DIR = DATA_ROOT_DIR + 'bin/'

LABEL_DATABASE_PATH = DATA_ROOT_DIR + 'labels/song_theme_label_database.xlsx'
LABEL_DB_PATH = DATA_ROOT_DIR + 'labels/song_theme_label_database.xlsx'

# Output path
FEAT_ROOT_DIR = DATA_ROOT_DIR + 'features/'
Expand All @@ -25,7 +25,7 @@ def config_write(string):
# * Import Data ----------------------------------------------------------------

# Access song_theme_label_database
label_df = pd.read_excel(LABEL_DATABASE_PATH)
label_df = pd.read_excel(LABEL_DB_PATH)

# Access our custom config file
config_file = open(CONFIG_PATH, 'wb')
Expand Down
8 changes: 4 additions & 4 deletions collecting_data/1_scraping_midi/scrape_bitmidi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
source = 'bitmidi'
domain = "http://www." + source + ".com"

download_path = 'data/bin/' + source
if not os.path.exists(download_path):
os.makedirs(download_path)
OUTPUT_DIR = 'data/bin/' + source
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)

terminate = 0
page_number = 0
Expand Down Expand Up @@ -63,7 +63,7 @@
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
filename = download_header + "_" + source + ".mid"
with open(download_path + '/' + filename, 'wb') as saveMidFile:
with open(OUTPUT_DIR + '/' + filename, 'wb') as saveMidFile:
saveMidFile.write(mid_file.content)
print('Downloaded {} successfully.\n'.format(download_header))

Expand Down
2 changes: 1 addition & 1 deletion collecting_data/1_scraping_midi/scrape_freemidi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
source = 'freemidi'
domain = "http://www." + source + ".com"

path = os.path.realpath(__file__)
CURRENT_PATH = os.path.realpath(__file__)

print(f"\n\nScraping from {domain}")
print("Type Y to scrape; anything else to skip\n")
Expand Down
9 changes: 4 additions & 5 deletions collecting_data/1_scraping_midi/scrape_midiworld.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
domain = "http://www.midiworld.com"
category = 'movie%20themes' # CHANGE THIS

download_path = 'data/bin/' + \
source + "/" + re.sub(r'%20', "-", category)
if not os.path.exists(download_path):
os.makedirs(download_path)
OUTPUT_DIR = 'data/bin/' + source + "/" + re.sub(r'%20', "-", category)
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)


# Extract metadata from the download label
Expand All @@ -35,7 +34,7 @@ def downloadFile(anchor, filename):

link = anchor['href']
mid_file = requests.get(link, stream=True)
with open(download_path + '/' + filename, 'wb') as saveMidFile:
with open(OUTPUT_DIR + '/' + filename, 'wb') as saveMidFile:
saveMidFile.write(mid_file.content)
print('Downloaded \"{}\" successfully.'.format(filename))

Expand Down
13 changes: 6 additions & 7 deletions collecting_data/2_building_dataset/create_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,16 @@
Compares key index(es), compile & sort unique set, then overwrite?
"""

root_path = 'data/bin'
song_theme_database_path = 'data/labels/song_theme_label_database.xlsx'
label_df = pd.DataFrame()
BIN_DIR = 'data/bin'
LABEL_DB_PATH = 'data/labels/song_theme_label_database.xlsx'

# Get list of directories/sources
directory_names = os.listdir(root_path)
directory_names = os.listdir(BIN_DIR)
# Get list of subfiles
directories_data = [x for x in os.walk(root_path) if x[0] != root_path]
directories_data = [x for x in os.walk(BIN_DIR) if x[0] != BIN_DIR]


if not os.path.exists(song_theme_database_path):
if not os.path.exists(LABEL_DB_PATH):

# For each source/directory
for i, directory_data in enumerate(directories_data):
Expand All @@ -38,7 +37,7 @@
print(label_df)

# Write to output Excel file
label_df.to_excel(song_theme_database_path, index=False)
label_df.to_excel(LABEL_DB_PATH, index=False)

else:

Expand Down
20 changes: 8 additions & 12 deletions collecting_data/2_building_dataset/db_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@

from process_db import *

label_root_dir = 'data/labels/'
song_theme_label_database_path = label_root_dir + \
'song_theme_label_database.xlsx'
LABELS_DIR = 'data/labels/'
LABEL_DB_PATH = LABELS_DIR + 'song_theme_label_database.xlsx'
STATS_EXPORT_PATH = LABELS_DIR + 'label_stats_summary.json'

# Convert all p's to 1's
p_to_1_convert(song_theme_label_database_path)
p_to_1_convert(LABEL_DB_PATH)

# Import data
label_df = pd.read_excel(song_theme_label_database_path)
label_df = pd.read_excel(LABEL_DB_PATH)

# * Aux methods

Expand Down Expand Up @@ -99,13 +99,11 @@ def percentage(positive, total):
)

plt.tight_layout()
plt.savefig(label_root_dir + 'label_freq.png')
plt.savefig(LABELS_DIR + 'label_freq.png')

# * EXPORT

sorted_label_stats_dict = sorted_label_stats_df.to_dict()
# with open(label_summary_export_path, 'w') as f:
# f.write(sorted_label_stats_json)

stats_dict = {
'total_count': total_count,
Expand All @@ -117,8 +115,6 @@ def percentage(positive, total):
'recog_procs_perc': perc_recog_procs,
} | sorted_label_stats_dict

label_summary_export_path = label_root_dir + 'label_stats_summary.json'
json.dump(stats_dict, open(STATS_EXPORT_PATH, "w"))

json.dump(stats_dict, open(label_summary_export_path, "w"))

print('Saved label statistics to ' + label_summary_export_path)
print('Saved label statistics to ' + STATS_EXPORT_PATH)
7 changes: 4 additions & 3 deletions collecting_data/2_building_dataset/process_db.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pandas as pd


def p_to_1_convert(song_theme_label_database_path):
def p_to_1_convert(label_db_path):
""" Converts all occurrences of 'p' to '1' """

label_df = pd.read_excel(song_theme_label_database_path)
label_df = pd.read_excel(label_db_path)

# Replace all 'p' labels with '1'
label_df.replace('p', 1, inplace=True)
Expand All @@ -12,5 +13,5 @@ def p_to_1_convert(song_theme_label_database_path):
# main_df.iloc[:, 2:28] = main_df.iloc[:, 2:28].astype("Int64")

# Write back to excel
label_df.to_excel(song_theme_label_database_path, index=False,
label_df.to_excel(label_db_path, index=False,
header=True, freeze_panes=(1, 1))

0 comments on commit b348761

Please sign in to comment.