From ce7daceed69e4e097cffb15135d4e7eb197bbced Mon Sep 17 00:00:00 2001 From: Mario Refolo Date: Sat, 29 Jun 2019 19:10:17 +0200 Subject: [PATCH 1/2] [rtvs] Fixed extractor --- youtube_dl/extractor/rtvs.py | 41 +++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtvs.py b/youtube_dl/extractor/rtvs.py index 6573b260d..9cd15036f 100644 --- a/youtube_dl/extractor/rtvs.py +++ b/youtube_dl/extractor/rtvs.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import determine_ext class RTVSIE(InfoExtractor): @@ -11,7 +12,7 @@ class RTVSIE(InfoExtractor): 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', 'info_dict': { - 'id': '414872', + 'id': '135320', 'ext': 'mp3', 'title': 'Ostrov pokladov 1 časť.mp3' }, @@ -23,7 +24,7 @@ class RTVSIE(InfoExtractor): 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', 'md5': '85e2c55cf988403b70cac24f5c086dc6', 'info_dict': { - 'id': '63118', + 'id': '17189', 'ext': 'mp4', 'title': 'Amaro Džives - Náš deň', 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.' @@ -39,9 +40,39 @@ class RTVSIE(InfoExtractor): webpage = self._download_webpage(url, video_id) playlist_url = self._search_regex( - r'playlist["\']?\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'url = (["\'])(?:https?:)?(?://)(?P(?:(?!\1).)+)\1', webpage, 'playlist url', group='url') + if not playlist_url.startswith("http"): + playlist_url = "http://" + playlist_url + data = self._download_json( - playlist_url, video_id, 'Downloading playlist')[0] - return self._parse_jwplayer_data(data, video_id=video_id) + playlist_url, video_id, 'Downloading playlist') + + try: + data_media = data['clip'] + except KeyError: + data_media = data['playlist'][0] + + media_id = data_media['mediaid'] + title = data_media['title'] + description = data_media.get('description') + thumbnail = data_media.get('image') + + info = { + 'id': media_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } + + url = data_media['sources'][0]['src'] + + if determine_ext(url) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + info['url'] = url + + return info From 9d36667658fdb2257d7221350f7388c8486de333 Mon Sep 17 00:00:00 2001 From: Mario Refolo Date: Sun, 7 Jul 2019 23:04:09 +0200 Subject: [PATCH 2/2] Separates tv and radio in 2 classes --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/rtvs.py | 87 +++++++++++++++++++----------- 2 files changed, 60 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 530474f3f..9302dcb6a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -979,7 +979,10 @@ from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE -from .rtvs import RTVSIE +from .rtvs import ( + RTVSTVIE, + RTVSRADIOIE, +) from .rudo import RudoIE from .ruhd import RUHDIE from .rutube import ( diff --git a/youtube_dl/extractor/rtvs.py b/youtube_dl/extractor/rtvs.py index 9cd15036f..46733ec0d 100644 --- a/youtube_dl/extractor/rtvs.py +++ b/youtube_dl/extractor/rtvs.py @@ -2,25 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext -class RTVSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P\d+)' - _TESTS = [{ - # radio archive - 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', - 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', - 'info_dict': { - 'id': '135320', - 'ext': 'mp3', - 'title': 'Ostrov pokladov 1 časť.mp3' - }, - 'params': { - 'skip_download': True, - } - }, { - # tv archive +class RTVSTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:televizia)/archiv/\d+/(?P\d+)' + _TEST = { 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', 'md5': '85e2c55cf988403b70cac24f5c086dc6', 'info_dict': { @@ -32,7 +18,7 @@ class RTVSIE(InfoExtractor): 'params': { 'skip_download': True, } - }] + } def _real_extract(self, url): video_id = self._match_id(url) @@ -49,30 +35,69 @@ class RTVSIE(InfoExtractor): data = self._download_json( playlist_url, video_id, 'Downloading playlist') - try: - data_media = data['clip'] - except KeyError: - data_media = data['playlist'][0] + data_media = data['clip'] media_id = data_media['mediaid'] title = data_media['title'] description = data_media.get('description') thumbnail = data_media.get('image') + urldl = data_media['sources'][0]['src'] - info = { + formats = self._extract_m3u8_formats( + urldl, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { 'id': media_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'formats': formats } - url = data_media['sources'][0]['src'] - if determine_ext(url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - else: - info['url'] = url +class RTVSRADIOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio)/archiv/\d+/(?P\d+)' + _TEST = { + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '135320', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3' + }, + 'params': { + 'skip_download': True, + } + } - return info + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r'url = (["\'])(?:https?:)?(?://)(?P(?:(?!\1).)+)\1', webpage, + 'playlist url', group='url') + + if not playlist_url.startswith("http"): + playlist_url = "http://" + playlist_url + + data = self._download_json( + playlist_url, video_id, 'Downloading playlist') + + data_media = data['playlist'][0] + + media_id = data_media['mediaid'] + title = data_media['title'] + description = data_media.get('description') + thumbnail = data_media.get('image') + urldl = data_media['sources'][0]['src'] + + return { + 'id': media_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'url': urldl + }