New Extractor! #33
Changes from all commits: ea70c73, 3fb7d60, e3fb8de, d53f043, 3c79e2e
@@ -0,0 +1,71 @@
import re
from typing import Dict, Any
from bs4 import BeautifulSoup, SoupStrainer
from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
import json
from urllib.parse import urlparse, parse_qs


class VixCloudExtractor(BaseExtractor):
    """VixCloud URL extractor."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mediaflow_endpoint = "hls_manifest_proxy"

    async def version(self, domain: str) -> str:
        """Get version of VixCloud Parent Site."""
        DOMAIN = domain
        base_url = f"https://streamingcommunity.{DOMAIN}/richiedi-un-titolo"
        response = await self._make_request(
            base_url,
            headers={
                "Referer": f"https://streamingcommunity.{DOMAIN}/",
                "Origin": f"https://streamingcommunity.{DOMAIN}",
            },
        )
        if response.status_code != 200:
            raise ExtractorError("Outdated Domain")
        # Soup the response
        soup = BeautifulSoup(response.text, "lxml", parse_only=SoupStrainer("div", {"id": "app"}))
        if soup:
            # Extract version
            try:
                data = json.loads(soup.find("div", {"id": "app"}).get("data-page"))
                return data["version"]
            except (KeyError, json.JSONDecodeError, AttributeError) as e:
                raise ExtractorError(f"Failed to parse version: {e}")

    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
        """Extract Vixcloud URL."""
        domain = url.split("://")[1].split("/")[0].split(".")[1]
        version = await self.version(domain)
        response = await self._make_request(url, headers={"x-inertia": "true", "x-inertia-version": version})
        soup = BeautifulSoup(response.text, "lxml", parse_only=SoupStrainer("iframe"))
        iframe = soup.find("iframe").get("src")
Add error handling for iframe extraction. The iframe extraction could fail if no iframe is found, leading to an AttributeError.

Suggested change:
-        iframe = soup.find("iframe").get("src")
+        iframe_element = soup.find("iframe")
+        if not iframe_element:
+            raise ExtractorError("No iframe found in the response")
+        iframe = iframe_element.get("src")
+        if not iframe:
+            raise ExtractorError("Iframe source URL not found")
        parsed_url = urlparse(iframe)
        query_params = parse_qs(parsed_url.query)
        response = await self._make_request(iframe, headers={"x-inertia": "true", "x-inertia-version": version})

        if response.status_code != 200:
            raise ExtractorError("Failed to extract URL components, Invalid Request")
        soup = BeautifulSoup(response.text, "lxml", parse_only=SoupStrainer("body"))
        if soup:
            script = soup.find("body").find("script").text
            token = re.search(r"'token':\s*'(\w+)'", script).group(1)
            expires = re.search(r"'expires':\s*'(\d+)'", script).group(1)
            vixid = iframe.split("/embed/")[1].split("?")[0]
            base_url = iframe.split("://")[1].split("/")[0]
            final_url = f"https://{base_url}/playlist/{vixid}.m3u8?token={token}&expires={expires}"
Comment on lines +57 to +59

🛠️ Refactor suggestion: Make URL parsing more robust. The URL parsing for vixid and base_url is fragile. Use urlparse consistently for more reliable URL handling.

Suggested change:
-            vixid = iframe.split("/embed/")[1].split("?")[0]
-            base_url = iframe.split("://")[1].split("/")[0]
+            iframe_parsed = urlparse(iframe)
+            vixid = iframe_parsed.path.split("/embed/")[1]
+            base_url = iframe_parsed.netloc
if "canPlayFHD" in query_params: | ||||||||||||||||
# canPlayFHD = "h=1" | ||||||||||||||||
final_url += "&h=1" | ||||||||||||||||
if "b" in query_params: | ||||||||||||||||
# b = "b=1" | ||||||||||||||||
final_url += "&b=1" | ||||||||||||||||
self.base_headers["referer"] = url | ||||||||||||||||
return { | ||||||||||||||||
"destination_url": final_url, | ||||||||||||||||
"request_headers": self.base_headers, | ||||||||||||||||
"mediaflow_endpoint": self.mediaflow_endpoint, | ||||||||||||||||
} |
🛠️ Refactor suggestion: Make domain extraction more robust. The current domain extraction is fragile and assumes a specific URL format. Consider using urlparse for more reliable domain extraction.
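
This comment ships without a committable suggestion, so the following is only a minimal sketch of what a urlparse-based version might look like, assuming the intent is to keep the original behaviour (taking the second dot-separated label of the hostname, e.g. "co" from "vixcloud.co"); the helper name extract_domain_suffix and the ValueError are illustrative, not part of the PR:

    from urllib.parse import urlparse

    def extract_domain_suffix(url: str) -> str:
        """Sketch of the reviewer's suggestion (hypothetical helper, not PR code).

        Mirrors url.split("://")[1].split("/")[0].split(".")[1] but goes through
        urlparse, so malformed URLs fail with a clear error instead of IndexError.
        Example: extract_domain_suffix("https://vixcloud.co/embed/123?x=1") -> "co"
        """
        hostname = urlparse(url).hostname
        if not hostname or "." not in hostname:
            raise ValueError(f"Cannot determine domain from URL: {url}")
        # Second dot-separated label of the hostname, matching the original split chain.
        return hostname.split(".")[1]

Inside the extractor this would presumably raise ExtractorError instead of ValueError, and the check could be tightened further if the expected hostname shape is known.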