From 2f56caf0835786fa5af92da428ef90a530d90f26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 11:58:52 +0100 Subject: [PATCH 1/3] [yandexvideo] fix extraction for Python 3.4 --- youtube_dl/extractor/yandexvideo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index ab8c84c9394..6a166ec9b92 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -66,7 +66,7 @@ def _real_extract(self, url): video_id = self._match_id(url) player = try_get((self._download_json( - 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ player(content_id: "%s") { computed_title content_url @@ -86,7 +86,7 @@ def _real_extract(self, url): title views_count } -}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) +}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) if not player or player.get('error'): player = self._download_json( 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, From c35bc8260665e071d5088f5be3bb37f7664ef06e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 12:04:35 +0100 Subject: [PATCH 2/3] [vvvvid] skip unplayable episodes and extract akamai formats(closes #27599) --- youtube_dl/extractor/vvvvid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 014a67e53c5..145805492e5 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -152,7 +152,6 @@ def f(m): embed_code = ds(embed_code) video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): - embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') if video_type == 'video/kenc': kenc = self._download_json( 'https://www.vvvvid.it/kenc', video_id, query={ @@ -163,9 +162,7 @@ def f(m): kenc_message = kenc.get('message') if kenc_message: embed_code += '?' + ds(kenc_message) - formats.extend(self._extract_m3u8_formats( - embed_code, video_id, 'mp4', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_akamai_formats(embed_code, video_id)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) @@ -217,6 +214,8 @@ def _real_extract(self, url): season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: + if episode.get('playable') is False: + continue season_id = str_or_none(episode.get('season_id')) video_id = str_or_none(episode.get('video_id')) if not (season_id and video_id): From 4d7d056909665f68e2aaa95fe42a000c287265b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 12:17:55 +0100 Subject: [PATCH 3/3] [sky] add support for Sports News articles and Brighcove videos(closes #13054) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sky.py | 101 ++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 254206a0825..54cbbe9d236 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1052,6 +1052,7 @@ from .sky import ( SkyNewsIE, SkySportsIE, + SkySportsNewsIE, ) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE diff --git a/youtube_dl/extractor/sky.py b/youtube_dl/extractor/sky.py index 681691004fe..ff2c977a02b 100644 --- a/youtube_dl/extractor/sky.py +++ b/youtube_dl/extractor/sky.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( extract_attributes, @@ -11,36 +13,59 @@ class SkyBaseIE(InfoExtractor): - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_data = extract_attributes(self._search_regex( - r'(]+>)', - webpage, 'video data')) - - video_url = 'ooyala:%s' % video_data['data-video-id'] - if video_data.get('data-token-required') == 'true': - token_fetch_options = self._parse_json(video_data.get( - 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} - token_fetch_url = token_fetch_options.get('url') - if token_fetch_url: - embed_token = self._download_webpage(urljoin( - url, token_fetch_url), video_id, fatal=False) - if embed_token: - video_url = smuggle_url( - video_url, {'embed_token': embed_token.strip('"')}) + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + _SDC_EL_REGEX = r'(?s)(]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)' + + def _process_ooyala_element(self, webpage, sdc_el, url): + sdc = extract_attributes(sdc_el) + provider = sdc.get('data-provider') + if provider == 'ooyala': + video_id = sdc['data-sdc-video-id'] + video_url = 'ooyala:%s' % video_id + ie_key = 'Ooyala' + ooyala_el = self._search_regex( + r'(]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id, + webpage, 'video data', fatal=False) + if ooyala_el: + ooyala_attrs = extract_attributes(ooyala_el) or {} + if ooyala_attrs.get('data-token-required') == 'true': + token_fetch_url = (self._parse_json(ooyala_attrs.get( + 'data-token-fetch-options', '{}'), + video_id, fatal=False) or {}).get('url') + if token_fetch_url: + embed_token = self._download_json(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token}) + elif provider == 'brightcove': + video_id = sdc['data-video-id'] + account_id = sdc.get('data-account-id') or '6058004172001' + player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' + video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id) + ie_key = 'BrightcoveNew' return { '_type': 'url_transparent', 'id': video_id, 'url': video_url, + 'ie_key': ie_key, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._process_ooyala_element(webpage, self._search_regex( + self._SDC_EL_REGEX, webpage, 'sdc element'), url) + info.update({ 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), - 'ie_key': 'Ooyala', - } + }) + return info class SkySportsIE(SkyBaseIE): + IE_NAME = 'sky:sports' _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', @@ -62,15 +87,45 @@ class SkySportsIE(SkyBaseIE): class SkyNewsIE(SkyBaseIE): + IE_NAME = 'sky:news' _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P[0-9]+)' _TEST = { 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', - 'md5': 'd6327e581473cea9976a3236ded370cd', + 'md5': '411e8893fd216c75eaf7e4c65d364115', 'info_dict': { - 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', 'ext': 'mp4', 'title': 'Russian plane inspected after deadly fire', 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + 'uploader_id': '6058004172001', + 'timestamp': 1567112345, + 'upload_date': '20190829', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], } + + +class SkySportsNewsIE(SkyBaseIE): + IE_NAME = 'sky:sports:news' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P\d+)' + _TEST = { + 'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass', + 'info_dict': { + 'id': '10871916', + 'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass', + 'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [] + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): + entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage))