From 8ff7b08f2e884e6141fb4b2f0bf064fdc10b6c31 Mon Sep 17 00:00:00 2001 From: Suresh S Date: Sat, 21 Dec 2024 10:30:30 +0000 Subject: [PATCH] feat: enhance media title parsing and search fallbacks - Add advanced query cleaning function with: * Configurable max word limit * Better handling of TV/movie title variations - Expand show episode pattern matching to support "series.X.YofZ" format - Enhance movie title cleaning with better technical term filtering - Fix proper name propagation in movie processing results Addresses #36 --- MediaHub/api/tmdb_api.py | 44 ++++---- MediaHub/processors/anime_processor.py | 1 + MediaHub/processors/movie_processor.py | 6 +- MediaHub/processors/show_processor.py | 10 +- MediaHub/processors/symlink_creator.py | 4 +- MediaHub/utils/file_utils.py | 142 ++++++++++++++++++++++--- MediaHub/utils/keywords.json | 4 +- 7 files changed, 167 insertions(+), 44 deletions(-) diff --git a/MediaHub/api/tmdb_api.py b/MediaHub/api/tmdb_api.py index 2edbb73..8c1cc0b 100644 --- a/MediaHub/api/tmdb_api.py +++ b/MediaHub/api/tmdb_api.py @@ -6,7 +6,7 @@ import urllib.parse from utils.logging_utils import log_message from config.config import get_api_key, is_imdb_folder_id_enabled, is_tvdb_folder_id_enabled, is_tmdb_folder_id_enabled -from utils.file_utils import clean_query, normalize_query, standardize_title, remove_genre_names, extract_title, clean_query_movie +from utils.file_utils import clean_query, normalize_query, standardize_title, remove_genre_names, extract_title, clean_query_movie, advanced_clean_query _api_cache = {} @@ -90,10 +90,15 @@ def search_fallback(query, year=None): results = search_fallback(query, year) if not results: - log_message(f"Searching with Cleaned Show Name", "DEBUG", "stdout") + log_message(f"Searching with Cleaned Query", "DEBUG", "stdout") title = clean_query(file) results = fetch_results(title, year) + if not results: + log_message(f"Searching with Advanced Query", "DEBUG", "stdout") + title = advanced_clean_query(file) 
+        results = fetch_results(title[0], year) + if not results and year: fallback_url = f"https://api.themoviedb.org/3/search/tv?api_key={api_key}&query={year}" log_message(f"Fallback search URL: {fallback_url}", "DEBUG", "stdout") @@ -228,43 +233,43 @@ def fetch_results(query, year=None): params['primary_release_year'] = year full_url = f"{url}?{urllib.parse.urlencode(params)}" - log_message(f"Primary search URL (without year): {full_url}", "DEBUG", "stdout") + log_message(f"Fetching results from URL: {full_url}", "DEBUG", "stdout") response = perform_search(params, url) if not response and year: + log_message("No results found with year, retrying without year.", "DEBUG", "stdout") del params['primary_release_year'] - full_url_without_year = f"{url}?{urllib.parse.urlencode(params)}" - log_message(f"Secondary search URL (without year): {full_url_without_year}", "DEBUG", "stdout") response = perform_search(params, url) return response def search_with_extracted_title(query, year=None): title = extract_title(query) + log_message(f"Searching with extracted title: '{title}'", "DEBUG", "stdout") return fetch_results(title, year) def search_fallback(query, year=None): - query = re.sub(r'\s*\(.*$', '', query).strip() - log_message(f"Fallback search query: '{query}'", "DEBUG", "stdout") - return fetch_results(query, year) + fallback_query = re.sub(r'\s*\(.*$', '', query).strip() + log_message(f"Fallback search query: '{fallback_query}'", "DEBUG", "stdout") + return fetch_results(fallback_query, year) results = fetch_results(query, year) if not results: - log_message(f"Primary search failed, attempting with extracted title", "DEBUG", "stdout") + log_message("Primary search failed. 
Attempting extracted title search.", "DEBUG", "stdout") results = search_with_extracted_title(query, year) if not results and file: - log_message(f"Searching with Cleaned Movie Name", "DEBUG", "stdout") - cleaned_title = clean_query_movie(file) + log_message("Attempting search with cleaned movie name.", "DEBUG", "stdout") + cleaned_title = clean_query_movie(file)[0] results = fetch_results(cleaned_title, year) - return file, cleaned_title if not results: log_message(f"Extracted title search failed, attempting web scraping fallback", "DEBUG", "stdout") results = perform_fallback_movie_search(query, year) if not results and year: + log_message("Performing additional fallback search without query.", "DEBUG", "stdout") results = search_fallback(query, year) if not results and year: @@ -291,7 +296,7 @@ def search_fallback(query, year=None): results = fetch_results(cleaned_dir_query, year or dir_year) if not results: - log_message(f"No results found for query '{query}' with year '{year}'.", level="WARNING") + log_message(f"No results found for query '{query}' with year '{year}'.", "WARNING", "stdout") _api_cache[cache_key] = f"{query}" return f"{query}" @@ -301,13 +306,13 @@ def search_fallback(query, year=None): if len(results) == 1: chosen_movie = results[0] else: - log_message(f"Multiple movies found for query '{query}':", level="INFO") + log_message(f"Multiple movies found for query '{query}':", "INFO", "stdout") for idx, movie in enumerate(results[:3]): movie_name = movie.get('title') movie_id = movie.get('id') release_date = movie.get('release_date') movie_year = release_date.split('-')[0] if release_date else "Unknown Year" - log_message(f"{idx + 1}: {movie_name} ({movie_year}) [tmdb-{movie_id}]", level="INFO") + log_message(f"{idx + 1}: {movie_name} ({movie_year}) [tmdb-{movie_id}]", "INFO", "stdout") choice = input("Choose a movie (1-3) or press Enter to skip: ").strip() if choice.isdigit() and 1 <= int(choice) <= 3: @@ -320,6 +325,7 @@ def 
search_fallback(query, year=None): release_date = chosen_movie.get('release_date') movie_year = release_date.split('-')[0] if release_date else "Unknown Year" tmdb_id = chosen_movie.get('id') + external_ids = get_external_ids(tmdb_id, 'movie') imdb_id = external_ids.get('imdb_id', '') @@ -332,10 +338,10 @@ def search_fallback(query, year=None): _api_cache[cache_key] = proper_name return tmdb_id, imdb_id, movie_name - else: - log_message(f"No valid selection made for query '{query}', skipping.", level="WARNING") - _api_cache[cache_key] = f"{query}" - return f"{query}" + + log_message(f"No valid movie selected or found for query '{query}'.", "WARNING", "stdout") + _api_cache[cache_key] = f"{query}" + return f"{query}" def present_movie_choices(results, query): log_message(f"Multiple movies found for query '{query}':", level="INFO") diff --git a/MediaHub/processors/anime_processor.py b/MediaHub/processors/anime_processor.py index c0064b7..fbe387d 100644 --- a/MediaHub/processors/anime_processor.py +++ b/MediaHub/processors/anime_processor.py @@ -49,6 +49,7 @@ def extract_anime_episode_info(filename): ordinal_season_patterns = [ r'^(.+?)\s+(\d+)(?:st|nd|rd|th)\s+Season[-_\s]*(?:-\s*)?(\d+)(?:\s|$)', r'^(.+?)\s+(\d+)(?:st|nd|rd|th)\s+Season.*?[-_](\d+)(?:\s|$)', + r'^(.+?)\s*S(\d+)\s*(\d+)(?:\s|$)' ] for pattern in ordinal_season_patterns: diff --git a/MediaHub/processors/movie_processor.py b/MediaHub/processors/movie_processor.py index 721c963..a0b594a 100644 --- a/MediaHub/processors/movie_processor.py +++ b/MediaHub/processors/movie_processor.py @@ -95,9 +95,9 @@ def process_movie(src_file, root, file, dest_dir, actual_dir, tmdb_folder_id_ena proper_movie_name = f"{movie_name} ({year})" elif api_key: result = search_movie(movie_name, year, auto_select=auto_select, file=file) - if isinstance(result, tuple): + if isinstance(result, tuple) and len(result) == 3: tmdb_id, imdb_id, proper_name = result - proper_movie_name = f"{movie_name} ({year})" + proper_movie_name = 
f"{proper_name} ({year})" if is_tmdb_folder_id_enabled() and tmdb_id: proper_movie_name += f" {{tmdb-{tmdb_id}}}" if is_imdb_folder_id_enabled() and imdb_id: @@ -109,7 +109,7 @@ def process_movie(src_file, root, file, dest_dir, actual_dir, tmdb_folder_id_ena elif is_tmdb_folder_id_enabled(): proper_movie_name += f" {{tmdb-{result['id']}}}" else: - proper_movie_name = f"{movie_name} ({year})" + proper_movie_name = f"{movie_name} ({year})" else: proper_movie_name = f"{movie_name} ({year})" diff --git a/MediaHub/processors/show_processor.py b/MediaHub/processors/show_processor.py index 3259cc0..9270662 100644 --- a/MediaHub/processors/show_processor.py +++ b/MediaHub/processors/show_processor.py @@ -77,7 +77,15 @@ def process_show(src_file, root, file, dest_dir, actual_dir, tmdb_folder_id_enab if not anime_result or episode_match: if episode_match: episode_identifier = episode_match.group(2) - if re.match(r'S\d{2}[eE]\d{2}', episode_identifier, re.IGNORECASE): + series_pattern = re.search(r'series\.(\d+)\.(\d+)of\d+', file, re.IGNORECASE) + if series_pattern: + season_number = series_pattern.group(1).zfill(2) + episode_number = series_pattern.group(2).zfill(2) + episode_identifier = f"S{season_number}E{episode_number}" + show_name = re.sub(r'\.series\.\d+\.\d+of\d+.*$', '', clean_folder_name, flags=re.IGNORECASE) + show_name = show_name.replace('.', ' ').strip() + create_season_folder = True + elif re.match(r'S\d{2}[eE]\d{2}', episode_identifier, re.IGNORECASE): show_name = re.sub(r'\s*(S\d{2}.*|Season \d+).*', '', clean_folder_name).replace('-', ' ').replace('.', ' ').strip() create_season_folder = True elif re.match(r'[0-9]+x[0-9]+', episode_identifier, re.IGNORECASE): diff --git a/MediaHub/processors/symlink_creator.py b/MediaHub/processors/symlink_creator.py index 5d82976..a03a8c7 100644 --- a/MediaHub/processors/symlink_creator.py +++ b/MediaHub/processors/symlink_creator.py @@ -87,13 +87,13 @@ def process_file(args, processed_files_log): return # Enhanced Regex 
Patterns to Identify Shows or Mini-Series - episode_match = re.search(r'(.*?)(S\d{2}\.E\d{2}|S\d{2}E\d{2}|S\d{2}e\d{2}|[0-9]+x[0-9]+|S\d{2}[0-9]+|[0-9]+e[0-9]+|\bep\.?\s*\d{1,2}\b|\bEp\.?\s*\d{1,2}\b|\bEP\.?\s*\d{1,2}\b|S\d{2}\sE\d{2}|MINI[- ]SERIES|MINISERIES|\s-\s\d{2,3}|\s-\d{2,3}|\s-\s*\d{2,3}|[Ee]pisode\s*\d{2}|[Ee]p\s*\d{2}|Season_-\d{2}|\bSeason\d+\b|\bE\d+\b)', file, re.IGNORECASE) + episode_match = re.search(r'(.*?)(S\d{1,2}\.?E\d{2}|S\d{1,2}\s*\d{2}|S\d{2}E\d{2}|S\d{2}e\d{2}|[0-9]+x[0-9]+|[0-9]+e[0-9]+|\bep\.?\s*\d{1,2}\b|\bEp\.?\s*\d{1,2}\b|\bEP\.?\s*\d{1,2}\b|S\d{2}\sE\d{2}|MINI[- ]SERIES|MINISERIES|\s-\s\d{2,3}|\s-\d{2,3}|\s-\s*\d{2,3}|[Ee]pisode\s*\d{2}|[Ee]p\s*\d{2}|Season_-\d{2}|\bSeason\d+\b|\bE\d+\b|series\.\d+\.\d+of\d+)', file, re.IGNORECASE) mini_series_match = re.search(r'(MINI[- ]SERIES|MINISERIES)', file, re.IGNORECASE) anime_episode_pattern = re.compile(r'\s-\s\d{2,3}\s', re.IGNORECASE) # Get additional anime patterns - other_anime_patterns = get_anime_patterns() + anime_patterns = get_anime_patterns() # Check if the file should be considered an extra based on size if skip_extras_folder and is_file_extra(file, src_file): diff --git a/MediaHub/utils/file_utils.py b/MediaHub/utils/file_utils.py index 3e6c82f..5b2b1ab 100644 --- a/MediaHub/utils/file_utils.py +++ b/MediaHub/utils/file_utils.py @@ -5,6 +5,7 @@ import requests from utils.logging_utils import log_message from config.config import * +from typing import Tuple, Optional def fetch_json(url): """Fetch JSON data from the provided URL.""" @@ -287,29 +288,136 @@ def is_file_extra(file, file_path): else: return False -def clean_query_movie(query, keywords_file='keywords.json'): +def clean_query_movie(query: str, keywords_file: str = 'keywords.json') -> tuple[str, Optional[int]]: if not isinstance(query, str): log_message(f"Invalid query type: {type(query)}. 
Expected string.", "ERROR", "stderr") - return "" + return "", None log_message(f"Original query: '{query}'", "DEBUG", "stdout") - # Load keywords to remove + # Load configurable keywords to remove remove_keywords = load_keywords(keywords_file) - query = re.sub(r'www\.[^\s]+\s+-\s+', '', query) + year_match = re.search(r'(?:19|20)\d{2}', query) + year = int(year_match.group(0)) if year_match else None + + query = re.sub(r'^\[[^\]]+\]', '', query) + query = re.sub(r'-[\w\.]+-?$', '', query) + query = re.sub(r'\[[\w\.]+\.(?:com|org|net)[^\]]*\]', '', query) + + query = re.sub(r'\[[^\]]*(?:Audio|字幕|双语|音轨)[^\]]*\]', '', query) + + tech_patterns = [ + r'\b\d{3,4}[pi]\b', + r'\bWEB-?DL\b', + r'\b(?:H|x)(?:264|265)\b', + r'\bBlu-?Ray\b', + r'\bHDR\d*\b', + r'\bDDP?\d\.?\d?\b', + r'\b(?:\d+)?Audio\b', + r'\b\d+bit\b', + r'\[\d+\.\d+GB\]', + r'\b(?:AAC|AC3)\b', + r'\.\w+$' + ] + for pattern in tech_patterns: + query = re.sub(pattern, '', query, flags=re.IGNORECASE) + + english_match = re.search(r'([A-Za-z][A-Za-z\s\.]+(?:Gone[A-Za-z\s\.]+)?)', query) + if english_match: + potential_title = english_match.group(1) + if not re.search(r'\b(?:WEB|DL|HDR|DDP|AAC)\b', potential_title, re.IGNORECASE): + final_title = potential_title + else: + final_title = query + else: + parts = re.split(r'[\[\]\(\)]', query) + final_title = next((part for part in parts if part and not re.search(r'\b(?:WEB|DL|HDR|DDP|AAC)\b', part, re.IGNORECASE)), parts[0]) + + final_title = re.sub(r'\s*\b\d{4}\b\s*', '', final_title) + final_title = re.sub(r'\s*\[.*?\]\s*', '', final_title) + final_title = re.sub(r'\s*\(.*?\)\s*', '', final_title) + final_title = re.sub(r'(?<=\w)\.(?=\w)', ' ', final_title) + final_title = re.sub(r'^[\W_]+|[\W_]+$', '', final_title) + final_title = re.sub(r'\s+', ' ', final_title) + final_title = final_title.strip() + + log_message(f"Cleaned movie title: '{final_title}'", "DEBUG", "stdout") + return final_title, year + +def advanced_clean_query(query: str, max_words: int = 4, 
keywords_file: str = 'keywords.json') -> Tuple[str, Optional[str]]: + """ + Enhanced query cleaning function that uses advanced pattern recognition + to clean TV show and movie titles, limiting to specified number of words. + + Args: + query (str): The input query string to clean + max_words (int): Maximum number of words to keep in the final output + keywords_file (str): Path to keywords JSON file + + Returns: + Tuple[str, Optional[str]]: Cleaned query and episode info if present + """ + if not isinstance(query, str): + return "", None + + episode_patterns = [ + r'(?:\d+of\d+)', + r'(?:S\d{1,2}E\d{1,2})', + r'(?:Season\s*\d+)', + r'(?:Series\s*\d+)', + r'(?:\d{1,2}x\d{1,2})', + r'(?:E\d{1,2})', + r'(?:\d{1,2}\s*-\s*\d{1,2})', + r'\bS\d{1,2}\b', + r'\bSeason\s*\d{1,2}\b', + r'\[S\d{1,2}\]', + r'\(S\d{1,2}\)', + r'S\d{1,2}$' + ] + + technical_patterns = [ + r'\d{3,4}p', + r'(?:WEB-DL|HDTV|BluRay|BDRip)', + r'(?:x264|x265|h264|h265)', + r'(?:AAC|AC3|MP3)', + r'(?:HEVC|10bit)', + r'\[.*?\]', + r'\(.*?\)', + r'(?:MVGroup|Forum)', + r'\b\d{4}\b', + r'(?:mkv|mp4|avi)', + r'S\d{1,2}E\d{1,2}', + r'-\s*S\d+E\d+E\d+', + r'\[\d+\]' + ] + + channel_pattern = r'^(?:Ch\d+|BBC\d*|ITV\d*|NBC|CBS|ABC|Fox|A&C)\.' 
+ query = re.sub(channel_pattern, '', query, flags=re.IGNORECASE) + + for pattern in technical_patterns: + query = re.sub(pattern, '', query, flags=re.IGNORECASE) + + episode_info = None + for pattern in episode_patterns: + match = re.search(pattern, query, re.IGNORECASE) + if match: + episode_info = match.group(0) + query = re.sub(pattern, '', query) + break + query = query.replace('.', ' ') - keywords_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, remove_keywords)) + r')\b', re.IGNORECASE) - query = keywords_pattern.sub('', query) - query = re.sub(r'\b(?:\d{3,4}p|WEB-DL|HDRIP|BLURAY|DVDRIP|UNTOUCHED|AVC|AAC|ESub)\b', '', query, flags=re.IGNORECASE) - query = re.sub(r'\b\d+(?:\.\d+)?\s*(?:GB|MB)\b', '', query, flags=re.IGNORECASE) - query = re.sub(r'\(\d{4}\)', '', query) + query = query.replace('_', ' ') + query = query.replace('-', ' ') query = re.sub(r'\[.*?\]', '', query) - query = re.sub(r'-+', ' ', query) - query = re.sub(r'\s+', ' ', query).strip() - query = re.sub(r'\b\d+\b', '', query).strip() - query = re.sub(r'\b(?:Telugu|Hindi|Tamil|Malayalam|Kannada|Bengali|Punjabi|Marathi|Gujarati|English)\b', '', query, flags=re.IGNORECASE).strip() - query = re.sub(r'\b(?:mkv|mp4|avi)\b', '', query, flags=re.IGNORECASE).strip() - - log_message(f"Cleaned movie query: '{query}'", "DEBUG", "stdout") - return query + query = re.sub(r'\(.*?\)', '', query) + query = re.sub(r'[^\w\s]', '', query) + query = re.sub(r'\s+', ' ', query) + common_words = {'complete', 'series', 'season', 'episode', 'part'} + query_words = query.split() + query_words = [word for word in query_words if word.lower() not in common_words] + query_words = query_words[:max_words] + query = ' '.join(query_words) + + query = query.strip() + return query, episode_info diff --git a/MediaHub/utils/keywords.json b/MediaHub/utils/keywords.json index 37044d5..eff6ed4 100644 --- a/MediaHub/utils/keywords.json +++ b/MediaHub/utils/keywords.json @@ -4,7 +4,7 @@ "WebDl", "Rip", "4K", "HDR", "DV", "2160p", 
"BDRip", "AC3", "5.1", "Sub", "NAHOM", "mkv", "Complete", "www", "Torrenting", "com", "TamilMV", "Mp4", "Series", "HorribleRips", "mp4", "[Anime Time]", "[Dual Audio]", "[1080p]", "[HEVC 10bit]", "[AAC]", "[Batch]", "[Erai-raws]", "[720p]", "[Multiple Subtitle]", "[ENG]", "[POR-BR]", "[SPA-LA]", "[SPA]", "[ARA]", "[FRE]", "[GER]", "[ITA]", "[RUS]", - "iobop", " DTS", "x2ua", "bluray" + "iobop", " DTS", "x2ua", "bluray", "cz", "Sample" ], "skip_patterns": [ "^FC2-PPV-\\d+", @@ -25,6 +25,6 @@ "SubsPlease", "Erai-raws", "HorribleSubs", "HorribleRips", "Judas", "EMBER", "ASW", "Commie", "GJM", "SSA", "Mezashite", "Underwater", "Anime Time", "Seregorn", "Memento", "Sakura-Subs", "Chronos", "R1", "DmonHiro", "Frostii", "Otakusensei", "AYZ", "Live-eviL", "ShindenSubs", "Vivid", "Kawaii-Subs", - "NyaaSi", "Hikari-Subs", "Yumeko", "Tenshi", "Gokudera", "TokyoTosho" + "NyaaSi", "Hikari-Subs", "Yumeko", "Tenshi", "Gokudera", "TokyoTosho", "NoobSubs" ] }