Skip to content

Commit

Permalink
feat: enhance media title parsing and search fallbacks
Browse files Browse the repository at this point in the history
- Add advanced query cleaning function with:
  * Configurable max word limit
  * Better handling of TV/movie title variations
- Expand show episode pattern matching to support "series.X.YofZ" format
- Enhance movie title cleaning with better technical term filtering
- Fix proper name propagation in movie processing results

Addresses #36
  • Loading branch information
sureshfizzy committed Dec 21, 2024
1 parent fe4cd77 commit 8ff7b08
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 44 deletions.
44 changes: 25 additions & 19 deletions MediaHub/api/tmdb_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import urllib.parse
from utils.logging_utils import log_message
from config.config import get_api_key, is_imdb_folder_id_enabled, is_tvdb_folder_id_enabled, is_tmdb_folder_id_enabled
from utils.file_utils import clean_query, normalize_query, standardize_title, remove_genre_names, extract_title, clean_query_movie
from utils.file_utils import clean_query, normalize_query, standardize_title, remove_genre_names, extract_title, clean_query_movie, advanced_clean_query

_api_cache = {}

Expand Down Expand Up @@ -90,10 +90,15 @@ def search_fallback(query, year=None):
results = search_fallback(query, year)

if not results:
log_message(f"Searching with Cleaned Show Name", "DEBUG", "stdout")
log_message(f"Searching with Cleaned Query", "DEBUG", "stdout")
title = clean_query(file)
results = fetch_results(title, year)

if not results:
log_message(f"Searching with Advanced Query", "DEBUG", "stdout")
title = advanced_clean_query(file)
results = fetch_results(title, year)

if not results and year:
fallback_url = f"https://api.themoviedb.org/3/search/tv?api_key={api_key}&query={year}"
log_message(f"Fallback search URL: {fallback_url}", "DEBUG", "stdout")
Expand Down Expand Up @@ -228,43 +233,43 @@ def fetch_results(query, year=None):
params['primary_release_year'] = year

full_url = f"{url}?{urllib.parse.urlencode(params)}"
log_message(f"Primary search URL (without year): {full_url}", "DEBUG", "stdout")
log_message(f"Fetching results from URL: {full_url}", "DEBUG", "stdout")
response = perform_search(params, url)

if not response and year:
log_message("No results found with year, retrying without year.", "DEBUG", "stdout")
del params['primary_release_year']
full_url_without_year = f"{url}?{urllib.parse.urlencode(params)}"
log_message(f"Secondary search URL (without year): {full_url_without_year}", "DEBUG", "stdout")
response = perform_search(params, url)

return response

def search_with_extracted_title(query, year=None):
    """Search TMDB using only the title portion extracted from *query*.

    Args:
        query: Raw query string (typically a file or folder name).
        year: Optional release year forwarded to fetch_results.

    Returns:
        The result of fetch_results for the extracted title.
    """
    extracted = extract_title(query)
    log_message(f"Searching with extracted title: '{extracted}'", "DEBUG", "stdout")
    results = fetch_results(extracted, year)
    return results

def search_fallback(query, year=None):
    """Last-resort TMDB search after the primary query produced no results.

    Strips everything from the first opening parenthesis to the end of the
    string (e.g. "(2021)" or "(Director's Cut)" suffixes) and retries.

    Args:
        query: Raw search string, possibly carrying a "(...)" suffix.
        year: Optional release year forwarded to fetch_results.

    Returns:
        Whatever fetch_results returns for the trimmed query.
    """
    # Drop "(...)" and anything after it, then trim surrounding whitespace.
    fallback_query = re.sub(r'\s*\(.*$', '', query).strip()
    log_message(f"Primary search failed, attempting with extracted title", "DEBUG", "stdout")
    return fetch_results(fallback_query, year)

results = fetch_results(query, year)

if not results:
log_message(f"Primary search failed, attempting with extracted title", "DEBUG", "stdout")
log_message("Primary search failed. Attempting extracted title search.", "DEBUG", "stdout")
results = search_with_extracted_title(query, year)

if not results and file:
log_message(f"Searching with Cleaned Movie Name", "DEBUG", "stdout")
cleaned_title = clean_query_movie(file)
log_message("Attempting search with cleaned movie name.", "DEBUG", "stdout")
cleaned_title = clean_query_movie(file)[0]
results = fetch_results(cleaned_title, year)
return file, cleaned_title

if not results:
log_message(f"Extracted title search failed, attempting web scraping fallback", "DEBUG", "stdout")
results = perform_fallback_movie_search(query, year)

if not results and year:
log_message("Performing additional fallback search without query.", "DEBUG", "stdout")
results = search_fallback(query, year)

if not results and year:
Expand All @@ -291,7 +296,7 @@ def search_fallback(query, year=None):
results = fetch_results(cleaned_dir_query, year or dir_year)

if not results:
log_message(f"No results found for query '{query}' with year '{year}'.", level="WARNING")
log_message(f"No results found for query '{query}' with year '{year}'.", "WARNING", "stdout")
_api_cache[cache_key] = f"{query}"
return f"{query}"

Expand All @@ -301,13 +306,13 @@ def search_fallback(query, year=None):
if len(results) == 1:
chosen_movie = results[0]
else:
log_message(f"Multiple movies found for query '{query}':", level="INFO")
log_message(f"Multiple movies found for query '{query}':", "INFO", "stdout")
for idx, movie in enumerate(results[:3]):
movie_name = movie.get('title')
movie_id = movie.get('id')
release_date = movie.get('release_date')
movie_year = release_date.split('-')[0] if release_date else "Unknown Year"
log_message(f"{idx + 1}: {movie_name} ({movie_year}) [tmdb-{movie_id}]", level="INFO")
log_message(f"{idx + 1}: {movie_name} ({movie_year}) [tmdb-{movie_id}]", "INFO", "stdout")

choice = input("Choose a movie (1-3) or press Enter to skip: ").strip()
if choice.isdigit() and 1 <= int(choice) <= 3:
Expand All @@ -320,6 +325,7 @@ def search_fallback(query, year=None):
release_date = chosen_movie.get('release_date')
movie_year = release_date.split('-')[0] if release_date else "Unknown Year"
tmdb_id = chosen_movie.get('id')

external_ids = get_external_ids(tmdb_id, 'movie')
imdb_id = external_ids.get('imdb_id', '')

Expand All @@ -332,10 +338,10 @@ def search_fallback(query, year=None):

_api_cache[cache_key] = proper_name
return tmdb_id, imdb_id, movie_name
else:
log_message(f"No valid selection made for query '{query}', skipping.", level="WARNING")
_api_cache[cache_key] = f"{query}"
return f"{query}"

log_message(f"No valid movie selected or found for query '{query}'.", "WARNING", "stdout")
_api_cache[cache_key] = f"{query}"
return f"{query}"

def present_movie_choices(results, query):
log_message(f"Multiple movies found for query '{query}':", level="INFO")
Expand Down
1 change: 1 addition & 0 deletions MediaHub/processors/anime_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def extract_anime_episode_info(filename):
ordinal_season_patterns = [
r'^(.+?)\s+(\d+)(?:st|nd|rd|th)\s+Season[-_\s]*(?:-\s*)?(\d+)(?:\s|$)',
r'^(.+?)\s+(\d+)(?:st|nd|rd|th)\s+Season.*?[-_](\d+)(?:\s|$)',
r'^(.+?)\s*S(\d+)\s*(\d+)(?:\s|$)'
]

for pattern in ordinal_season_patterns:
Expand Down
6 changes: 3 additions & 3 deletions MediaHub/processors/movie_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def process_movie(src_file, root, file, dest_dir, actual_dir, tmdb_folder_id_ena
proper_movie_name = f"{movie_name} ({year})"
elif api_key:
result = search_movie(movie_name, year, auto_select=auto_select, file=file)
if isinstance(result, tuple):
if isinstance(result, tuple) and len(result) == 3:
tmdb_id, imdb_id, proper_name = result
proper_movie_name = f"{movie_name} ({year})"
proper_movie_name = f"{proper_name} ({year})"
if is_tmdb_folder_id_enabled() and tmdb_id:
proper_movie_name += f" {{tmdb-{tmdb_id}}}"
if is_imdb_folder_id_enabled() and imdb_id:
Expand All @@ -109,7 +109,7 @@ def process_movie(src_file, root, file, dest_dir, actual_dir, tmdb_folder_id_ena
elif is_tmdb_folder_id_enabled():
proper_movie_name += f" {{tmdb-{result['id']}}}"
else:
proper_movie_name = f"{movie_name} ({year})"
proper_movie_name = f"{proper_name} ({year})"
else:
proper_movie_name = f"{movie_name} ({year})"

Expand Down
10 changes: 9 additions & 1 deletion MediaHub/processors/show_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,15 @@ def process_show(src_file, root, file, dest_dir, actual_dir, tmdb_folder_id_enab
if not anime_result or episode_match:
if episode_match:
episode_identifier = episode_match.group(2)
if re.match(r'S\d{2}[eE]\d{2}', episode_identifier, re.IGNORECASE):
series_pattern = re.search(r'series\.(\d+)\.(\d+)of\d+', file, re.IGNORECASE)
if series_pattern:
season_number = series_pattern.group(1).zfill(2)
episode_number = series_pattern.group(2).zfill(2)
episode_identifier = f"S{season_number}E{episode_number}"
show_name = re.sub(r'\.series\.\d+\.\d+of\d+.*$', '', clean_folder_name, flags=re.IGNORECASE)
show_name = show_name.replace('.', ' ').strip()
create_season_folder = True
elif re.match(r'S\d{2}[eE]\d{2}', episode_identifier, re.IGNORECASE):
show_name = re.sub(r'\s*(S\d{2}.*|Season \d+).*', '', clean_folder_name).replace('-', ' ').replace('.', ' ').strip()
create_season_folder = True
elif re.match(r'[0-9]+x[0-9]+', episode_identifier, re.IGNORECASE):
Expand Down
4 changes: 2 additions & 2 deletions MediaHub/processors/symlink_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ def process_file(args, processed_files_log):
return

# Enhanced Regex Patterns to Identify Shows or Mini-Series
episode_match = re.search(r'(.*?)(S\d{2}\.E\d{2}|S\d{2}E\d{2}|S\d{2}e\d{2}|[0-9]+x[0-9]+|S\d{2}[0-9]+|[0-9]+e[0-9]+|\bep\.?\s*\d{1,2}\b|\bEp\.?\s*\d{1,2}\b|\bEP\.?\s*\d{1,2}\b|S\d{2}\sE\d{2}|MINI[- ]SERIES|MINISERIES|\s-\s\d{2,3}|\s-\d{2,3}|\s-\s*\d{2,3}|[Ee]pisode\s*\d{2}|[Ee]p\s*\d{2}|Season_-\d{2}|\bSeason\d+\b|\bE\d+\b)', file, re.IGNORECASE)
episode_match = re.search(r'(.*?)(S\d{1,2}\.?E\d{2}|S\d{1,2}\s*\d{2}|S\d{2}E\d{2}|S\d{2}e\d{2}|[0-9]+x[0-9]+|[0-9]+e[0-9]+|\bep\.?\s*\d{1,2}\b|\bEp\.?\s*\d{1,2}\b|\bEP\.?\s*\d{1,2}\b|S\d{2}\sE\d{2}|MINI[- ]SERIES|MINISERIES|\s-\s\d{2,3}|\s-\d{2,3}|\s-\s*\d{2,3}|[Ee]pisode\s*\d{2}|[Ee]p\s*\d{2}|Season_-\d{2}|\bSeason\d+\b|\bE\d+\b|series\.\d+\.\d+of\d+)', file, re.IGNORECASE)

mini_series_match = re.search(r'(MINI[- ]SERIES|MINISERIES)', file, re.IGNORECASE)
anime_episode_pattern = re.compile(r'\s-\s\d{2,3}\s', re.IGNORECASE)

# Get additional anime patterns
other_anime_patterns = get_anime_patterns()
anime_patterns = get_anime_patterns()

# Check if the file should be considered an extra based on size
if skip_extras_folder and is_file_extra(file, src_file):
Expand Down
142 changes: 125 additions & 17 deletions MediaHub/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import requests
from utils.logging_utils import log_message
from config.config import *
from typing import Tuple, Optional

def fetch_json(url):
"""Fetch JSON data from the provided URL."""
Expand Down Expand Up @@ -287,29 +288,136 @@ def is_file_extra(file, file_path):
else:
return False

def clean_query_movie(query: str, keywords_file: str = 'keywords.json') -> tuple[str, Optional[int]]:
    """Clean a raw movie file name into a searchable title plus release year.

    Strips site prefixes, release-group suffixes, bracketed audio/subtitle
    markers and common technical tokens (resolution, codec, source, size),
    then tries to isolate a Latin-script title run.

    Args:
        query: Raw file or folder name.
        keywords_file: Path to the configurable keyword list (JSON).

    Returns:
        (cleaned_title, year) where year is None when no 19xx/20xx run is
        found; ("", None) when *query* is not a string.
    """
    if not isinstance(query, str):
        log_message(f"Invalid query type: {type(query)}. Expected string.", "ERROR", "stderr")
        return "", None

    log_message(f"Original query: '{query}'", "DEBUG", "stdout")

    # Load configurable keywords to remove
    # NOTE(review): loaded but not referenced below — confirm whether the
    # keyword pass was intentionally dropped before removing this call.
    remove_keywords = load_keywords(keywords_file)

    query = re.sub(r'www\.[^\s]+\s+-\s+', '', query)
    # First 19xx/20xx run is taken as the release year.
    year_match = re.search(r'(?:19|20)\d{2}', query)
    year = int(year_match.group(0)) if year_match else None

    query = re.sub(r'^\[[^\]]+\]', '', query)                         # leading [group] tag
    query = re.sub(r'-[\w\.]+-?$', '', query)                          # trailing -GROUP suffix
    query = re.sub(r'\[[\w\.]+\.(?:com|org|net)[^\]]*\]', '', query)   # [site.com] tags

    # Bracketed audio / subtitle markers (including CJK labels)
    query = re.sub(r'\[[^\]]*(?:Audio|字幕|双语|音轨)[^\]]*\]', '', query)

    tech_patterns = [
        r'\b\d{3,4}[pi]\b',      # 720p / 1080i resolutions
        r'\bWEB-?DL\b',
        r'\b(?:H|x)(?:264|265)\b',
        r'\bBlu-?Ray\b',
        r'\bHDR\d*\b',
        r'\bDDP?\d\.?\d?\b',     # DD5.1 / DDP5.1 audio tags
        r'\b(?:\d+)?Audio\b',
        r'\b\d+bit\b',
        r'\[\d+\.\d+GB\]',
        r'\b(?:AAC|AC3)\b',
        r'\.\w+$',               # file extension
    ]
    for pattern in tech_patterns:
        query = re.sub(pattern, '', query, flags=re.IGNORECASE)

    # Prefer a run of Latin-script text as the title candidate, unless it is
    # itself just leftover technical noise.
    english_match = re.search(r'([A-Za-z][A-Za-z\s\.]+(?:Gone[A-Za-z\s\.]+)?)', query)
    if english_match:
        potential_title = english_match.group(1)
        if not re.search(r'\b(?:WEB|DL|HDR|DDP|AAC)\b', potential_title, re.IGNORECASE):
            final_title = potential_title
        else:
            final_title = query
    else:
        # Fall back to the first bracket-delimited segment free of tech tokens.
        parts = re.split(r'[\[\]\(\)]', query)
        final_title = next((part for part in parts if part and not re.search(r'\b(?:WEB|DL|HDR|DDP|AAC)\b', part, re.IGNORECASE)), parts[0])

    final_title = re.sub(r'\s*\b\d{4}\b\s*', '', final_title)   # stray year tokens
    final_title = re.sub(r'\s*\[.*?\]\s*', '', final_title)
    final_title = re.sub(r'\s*\(.*?\)\s*', '', final_title)
    final_title = re.sub(r'(?<=\w)\.(?=\w)', ' ', final_title)  # dots between words
    final_title = re.sub(r'^[\W_]+|[\W_]+$', '', final_title)   # edge punctuation
    final_title = re.sub(r'\s+', ' ', final_title)
    final_title = final_title.strip()

    log_message(f"Cleaned movie title: '{final_title}'", "DEBUG", "stdout")
    return final_title, year

def advanced_clean_query(query: str, max_words: int = 4, keywords_file: str = 'keywords.json') -> Tuple[str, Optional[str]]:
    """
    Enhanced query cleaning that strips channel prefixes, technical release
    tokens and episode markers from TV/movie titles, keeping at most
    *max_words* words.

    Args:
        query (str): The input query string to clean.
        max_words (int): Maximum number of words to keep in the final output.
        keywords_file (str): Path to keywords JSON file.
            NOTE(review): currently unused — kept for signature parity with
            clean_query_movie; confirm before removing.
    Returns:
        Tuple[str, Optional[str]]: Cleaned query and episode info if present.
    """
    if not isinstance(query, str):
        return "", None

    episode_patterns = [
        r'(?:\d+of\d+)',
        r'(?:S\d{1,2}E\d{1,2})',
        r'(?:Season\s*\d+)',
        r'(?:Series\s*\d+)',
        r'(?:\d{1,2}x\d{1,2})',
        r'(?:E\d{1,2})',
        r'(?:\d{1,2}\s*-\s*\d{1,2})',
        r'\bS\d{1,2}\b',
        r'\bSeason\s*\d{1,2}\b',
        r'\[S\d{1,2}\]',
        r'\(S\d{1,2}\)',
        r'S\d{1,2}$'
    ]

    # Capture episode info BEFORE stripping technical tokens: several
    # technical patterns (e.g. S\d{1,2}E\d{1,2}) would otherwise erase the
    # marker first, making the documented episode-info return value
    # unreachable. The substitution is case-insensitive to match the search.
    episode_info = None
    for pattern in episode_patterns:
        match = re.search(pattern, query, re.IGNORECASE)
        if match:
            episode_info = match.group(0)
            query = re.sub(pattern, '', query, flags=re.IGNORECASE)
            break

    technical_patterns = [
        r'\d{3,4}p',
        r'(?:WEB-DL|HDTV|BluRay|BDRip)',
        r'(?:x264|x265|h264|h265)',
        r'(?:AAC|AC3|MP3)',
        r'(?:HEVC|10bit)',
        r'\[.*?\]',
        r'\(.*?\)',
        r'(?:MVGroup|Forum)',
        r'\b\d{4}\b',
        r'(?:mkv|mp4|avi)',
        r'S\d{1,2}E\d{1,2}',
        r'-\s*S\d+E\d+E\d+',
        r'\[\d+\]'
    ]

    # Strip leading broadcaster/channel prefixes such as "BBC." or "ITV2.".
    channel_pattern = r'^(?:Ch\d+|BBC\d*|ITV\d*|NBC|CBS|ABC|Fox|A&C)\.'
    query = re.sub(channel_pattern, '', query, flags=re.IGNORECASE)

    for pattern in technical_patterns:
        query = re.sub(pattern, '', query, flags=re.IGNORECASE)

    # Normalise separators to spaces, then collapse runs of whitespace.
    query = query.replace('.', ' ')
    query = query.replace('_', ' ')
    query = query.replace('-', ' ')
    query = re.sub(r'\[.*?\]', '', query)
    query = re.sub(r'-+', ' ', query)
    query = re.sub(r'\s+', ' ', query).strip()

    query = re.sub(r'\(.*?\)', '', query)
    query = re.sub(r'[^\w\s]', '', query)
    query = re.sub(r'\s+', ' ', query)

    # Drop filler words, then cap the title at max_words tokens.
    common_words = {'complete', 'series', 'season', 'episode', 'part'}
    query_words = [word for word in query.split() if word.lower() not in common_words]
    query = ' '.join(query_words[:max_words])

    return query.strip(), episode_info
4 changes: 2 additions & 2 deletions MediaHub/utils/keywords.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"WebDl", "Rip", "4K", "HDR", "DV", "2160p", "BDRip", "AC3", "5.1", "Sub", "NAHOM", "mkv", "Complete",
"www", "Torrenting", "com", "TamilMV", "Mp4", "Series", "HorribleRips", "mp4", "[Anime Time]", "[Dual Audio]", "[1080p]", "[HEVC 10bit]", "[AAC]", "[Batch]",
"[Erai-raws]", "[720p]", "[Multiple Subtitle]", "[ENG]", "[POR-BR]", "[SPA-LA]", "[SPA]", "[ARA]", "[FRE]", "[GER]", "[ITA]", "[RUS]",
"iobop", " DTS", "x2ua", "bluray"
"iobop", " DTS", "x2ua", "bluray", "cz", "Sample"
],
"skip_patterns": [
"^FC2-PPV-\\d+",
Expand All @@ -25,6 +25,6 @@
"SubsPlease", "Erai-raws", "HorribleSubs", "HorribleRips", "Judas", "EMBER", "ASW", "Commie", "GJM",
"SSA", "Mezashite", "Underwater", "Anime Time", "Seregorn", "Memento", "Sakura-Subs", "Chronos",
"R1", "DmonHiro", "Frostii", "Otakusensei", "AYZ", "Live-eviL", "ShindenSubs", "Vivid", "Kawaii-Subs",
"NyaaSi", "Hikari-Subs", "Yumeko", "Tenshi", "Gokudera", "TokyoTosho"
"NyaaSi", "Hikari-Subs", "Yumeko", "Tenshi", "Gokudera", "TokyoTosho", "NoobSubs"
]
}

0 comments on commit 8ff7b08

Please sign in to comment.