From c4ad62274d29c30b5e59b5d27c05e8905a4a3770 Mon Sep 17 00:00:00 2001 From: Ondrej Zara Date: Tue, 11 Apr 2017 20:42:37 +0200 Subject: [PATCH 1/4] added support for decko.ceskatelevize.cz to CeskaTelevize IE --- youtube_dl/extractor/ceskatelevize.py | 85 ++++++++++++++++++++++----- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index e250de18c..2041a44f9 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_urlparse, + compat_urllib_parse_urlencode ) from ..utils import ( ExtractorError, @@ -19,7 +20,14 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)* + |decko.ceskatelevize.cz/video/ + ) + (?P[^/#?]+) + ''' _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { @@ -65,6 +73,14 @@ class CeskaTelevizeIE(InfoExtractor): }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + 'url': 'http://decko.ceskatelevize.cz/video/213543116070004', + 'info_dict': { + 'id': '61924494877085121', + 'ext': 'mp4', + 'description': 'Internetové hřiště České televize pro malé i velké děti.', + 'title': 'Déčko' + } }] def _real_extract(self, url): @@ -78,23 +94,29 @@ class CeskaTelevizeIE(InfoExtractor): type_ = None episode_id = None + is_decko = "decko.ceskatelevize.cz" in url - playlist = self._parse_json( - self._search_regex( - r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', - default='{}'), playlist_id) - if playlist: - type_ = playlist.get('type') - episode_id = playlist.get('id') + if is_decko: + type_ = "episode" + episode_id = compat_urllib_parse_unquote(playlist_id) + episode_id = episode_id.replace(" ", "").replace("_", "") + else: + playlist = self._parse_json( + self._search_regex( + r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', + default='{}'), playlist_id) + if playlist: + type_ = playlist.get('type') + episode_id = playlist.get('id') - if not type_: - type_ = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', - webpage, 'type') - if not episode_id: - episode_id = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', - webpage, 'episode_id') + if not type_: + type_ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', + webpage, 'type') + if not episode_id: + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', + webpage, 'episode_id') data = { 'playlist[0][type]': type_, @@ -277,3 +299,34 @@ class CeskaTelevizePoradyIE(InfoExtractor): webpage, 'iframe player url', group='url')) return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) + + +class CeskaTelevizeDeckoIE(InfoExtractor): + _VALID_URL = r'https?://decko.ceskatelevize.cz/(?P[a-z-]+)$' + _TEST = { + 'url': 'http://decko.ceskatelevize.cz/nejmensi-slon-na-svete', + 'playlist_count': 13 + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + idec = self._html_search_regex(r'var\s+IDEC\s+=\s+\'(.+?)\'', webpage, 'IDEC') + + args = compat_urllib_parse_urlencode({"IDEC":idec}) + url = "http://decko.ceskatelevize.cz/rest/Programme/relatedVideosForEpisode?" + args + json = self._download_json(url, video_id) + episodes = json.get("episodes", []) + + entries = [] + for episode in episodes: + idec = episode.get("episode", {}).get("IDEC") + idec = idec.replace(" ", "").replace("/", "") + url = "http://decko.ceskatelevize.cz/video/" + idec + entries.append(self.url_result(url)) + + return { + '_type': 'playlist', + 'entries': entries + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 68e1a5cfc..4f14f4a15 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -168,6 +168,7 @@ from .cda import CDAIE from .ceskatelevize import ( CeskaTelevizeIE, CeskaTelevizePoradyIE, + CeskaTelevizeDeckoIE ) from .channel9 import Channel9IE from .charlierose import CharlieRoseIE From 611c5af425f313f37c97ed5f8606679a25acbe57 Mon Sep 17 00:00:00 2001 From: Ondrej Zara Date: Tue, 18 Jul 2017 16:04:02 +0200 Subject: [PATCH 2/4] several adjustments according to the code review --- youtube_dl/extractor/ceskatelevize.py | 32 +++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 2041a44f9..8ad5055d0 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, float_or_none, + str_or_none, sanitized_Request, unescapeHTML, urlencode_postdata, @@ -94,10 +95,10 @@ class CeskaTelevizeIE(InfoExtractor): type_ = None episode_id = None - is_decko = "decko.ceskatelevize.cz" in url + is_decko = 'decko.ceskatelevize.cz' in url if is_decko: - type_ = "episode" + type_ = 'episode' episode_id = compat_urllib_parse_unquote(playlist_id) episode_id = episode_id.replace(" ", "").replace("_", "") else: @@ -302,28 +303,35 @@ class CeskaTelevizePoradyIE(InfoExtractor): class CeskaTelevizeDeckoIE(InfoExtractor): - _VALID_URL = r'https?://decko.ceskatelevize.cz/(?P[a-z-]+)$' - _TEST = { + _VALID_URL = r'https?://decko.ceskatelevize.cz/(?P[a-z-]+)(\?.*)?$' + _TESTS = [{ 'url': 'http://decko.ceskatelevize.cz/nejmensi-slon-na-svete', 'playlist_count': 13 - } + }, { + 'url': 'http://decko.ceskatelevize.cz/nejmensi-slon-na-svete?foo=bar', + 'playlist_count': 13 + + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - idec = self._html_search_regex(r'var\s+IDEC\s+=\s+\'(.+?)\'', webpage, 'IDEC') + idec = self._html_search_regex(r'var\s+IDEC\s*=\s*\'(.+?)\'', webpage, 'IDEC') - args = compat_urllib_parse_urlencode({"IDEC":idec}) - url = "http://decko.ceskatelevize.cz/rest/Programme/relatedVideosForEpisode?" + args + args = compat_urllib_parse_urlencode({'IDEC':idec}) + url = 'http://decko.ceskatelevize.cz/rest/Programme/relatedVideosForEpisode?' + args json = self._download_json(url, video_id) - episodes = json.get("episodes", []) + episodes = json.get('episodes', []) entries = [] for episode in episodes: - idec = episode.get("episode", {}).get("IDEC") - idec = idec.replace(" ", "").replace("/", "") - url = "http://decko.ceskatelevize.cz/video/" + idec + idec = str_or_none(episode.get('episode', {}).get('IDEC')) + if idec is None: + continue + + idec = idec.replace(' ', '').replace('/', '') + url = 'http://decko.ceskatelevize.cz/video/' + idec entries.append(self.url_result(url)) return { From c3b29a31beeb61b5fd30bb0cd4a851969946674e Mon Sep 17 00:00:00 2001 From: Ondrej Zara Date: Wed, 19 Jul 2017 09:15:14 +0200 Subject: [PATCH 3/4] inlined local variables --- youtube_dl/extractor/ceskatelevize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 8ad5055d0..6e8aa5e83 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -318,10 +318,10 @@ class CeskaTelevizeDeckoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) idec = self._html_search_regex(r'var\s+IDEC\s*=\s*\'(.+?)\'', webpage, 'IDEC') - - args = compat_urllib_parse_urlencode({'IDEC':idec}) - url = 'http://decko.ceskatelevize.cz/rest/Programme/relatedVideosForEpisode?' + args - json = self._download_json(url, video_id) + json = self._download_json( + 'http://decko.ceskatelevize.cz/rest/Programme/relatedVideosForEpisode?' + + compat_urllib_parse_urlencode({'IDEC':idec}), + video_id) episodes = json.get('episodes', []) entries = [] From a769c2aa09a2b49eaec98795d3b3bbe38329afc8 Mon Sep 17 00:00:00 2001 From: Lubos Vondra Date: Fri, 3 Nov 2017 23:12:12 +0100 Subject: [PATCH 4/4] seems that they use " instead of ' in czech tv now. --- youtube_dl/extractor/ceskatelevize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6e8aa5e83..b2b63c3d3 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -317,7 +317,7 @@ class CeskaTelevizeDeckoIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - idec = self._html_search_regex(r'var\s+IDEC\s*=\s*\'(.+?)\'', webpage, 'IDEC') + idec = self._html_search_regex(r'var\s+IDEC\s*=\s*[\'\"](.+?)[\'\"]', webpage, 'IDEC') json = self._download_json( 'http://decko.ceskatelevize.cz/rest/Programme/relatedVideosForEpisode?' + compat_urllib_parse_urlencode({'IDEC':idec}),