From 31f169129914a6855c71b0d076cc69ca4206121e Mon Sep 17 00:00:00 2001 From: Arthur Bols Date: Sun, 31 Mar 2019 17:57:41 +0200 Subject: [PATCH 1/3] removed redactie, added vrt.be fixed downloader --- youtube_dl/extractor/vrt.py | 62 +++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 444295d68..72f20e4e9 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -6,12 +6,14 @@ import re from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, + try_get, ) class VRTIE(InfoExtractor): - IE_DESC = 'deredactie.be, sporza.be, cobra.be and cobra.canvas.be' - _VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*' + IE_DESC = 'vrt.be, sporza.be, cobra.be and cobra.canvas.be' + _VALID_URL = r'https?://(?:www\.)?(?:vrt|sporza|cobra(?:\.canvas)?)\.be/(?:[^/]+/)+(?P<id>[^/]+)/*' _TESTS = [ # deredactie.be { @@ -96,39 +98,39 @@ class VRTIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_id = self._search_regex( - r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False) + r'data-videoid="([^"]+)"', webpage, 'video id') - src = self._search_regex( - r'data-video-src="([^"]+)"', webpage, 'video src', default=None) + publication_id = self._search_regex( + r'data-publicationid="([^"]+)"', webpage, 'publication id') - video_type = self._search_regex( - r'data-video-type="([^"]+)"', webpage, 'video type', default=None) + media_url = self._search_regex( + r'data-mediaapiurl="([^"]+)"', webpage, 'media url') - if video_type == 'YouTubeVideo': - return self.url_result(src, 'Youtube') + client = self._search_regex( + r'data-client="([^"]+)"', webpage, 'client') + + headers = {'Content-Type': 'application/json'} + result = self._download_json( + '%s/tokens' % (media_url), video_id, + 'Downloading player token', + headers=headers, data={}) + + vrtPlayerToken = 
result['vrtPlayerToken'] + print(vrtPlayerToken) formats = [] - mobj = re.search( - r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"', - webpage) - if mobj: - formats.extend(self._extract_m3u8_formats( - '%s/%s' % (mobj.group('server'), mobj.group('path')), - video_id, 'mp4', m3u8_id='hls', fatal=False)) + targetUrls = self._download_json( + '%s/videos/%s$%s?vrtPlayerToken=%s&client=%s' % (media_url, publication_id, video_id, vrtPlayerToken, client), + video_id, + 'Downloading target url data', + headers=headers) - if src: - formats = self._extract_wowza_formats(src, video_id) - if 'data-video-geoblocking="true"' not in webpage: - for f in formats: - if f['url'].startswith('rtsp://'): - http_format = f.copy() - http_format.update({ - 'url': f['url'].replace('rtsp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), - 'format_id': f['format_id'].replace('rtsp', 'http'), - 'protocol': 'http', - }) - formats.append(http_format) + for t in targetUrls['targetUrls']: + if t['url'].endswith('m3u8'): + formats.extend(self._extract_m3u8_formats(t['url'], video_id)) + elif t['url'].endswith('mpd'): + formats.extend(self._extract_mpd_formats(t['url'], video_id)) if not formats and 'data-video-geoblocking="true"' in webpage: self.raise_geo_restricted('This video is only available in Belgium') @@ -141,8 +143,8 @@ class VRTIE(InfoExtractor): timestamp = float_or_none(self._search_regex( r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000) duration = float_or_none(self._search_regex( - r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) - + r'data-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) + return { 'id': video_id, 'title': title, From f2963782bcc004c0528a925ce3ade2580464688f Mon Sep 17 00:00:00 2001 From: Arthur Bols Date: Sun, 31 Mar 2019 18:44:57 +0200 Subject: [PATCH 2/3] fixed extension, cleanup and added tests --- docs/supportedsites.md | 2 
+- youtube_dl/extractor/vrt.py | 91 ++++++++----------------------------- 2 files changed, 20 insertions(+), 73 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a3d4447a8..f495b28b5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1058,7 +1058,7 @@ - **VoxMediaVolume** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VRT**: vrt.be, sporza.be - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 72f20e4e9..f8f87bcae 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -12,83 +12,34 @@ from ..utils import ( class VRTIE(InfoExtractor): - IE_DESC = 'vrt.be, sporza.be, cobra.be and cobra.canvas.be' - _VALID_URL = r'https?://(?:www\.)?(?:vrt|sporza|cobra(?:\.canvas)?)\.be/(?:[^/]+/)+(?P<id>[^/]+)/*' + IE_DESC = 'vrt.be, sporza.be' + _VALID_URL = r'https?://(?:www\.)?(?:vrt|sporza)\.be/(?!(?:vrtnu))(?:[^/]+/)+(?P<id>[^/]+)/*' _TESTS = [ - # deredactie.be + # vrt.be { - 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL', - 'md5': '4cebde1eb60a53782d4f3992cbd46ec8', + 'url': 'https://www.vrt.be/vrtnws/nl/2019/03/29/cyberbeveiliging-164-studenten-nemen-deel-aan-wedstrijd-die-oo/', + 'md5': 'b965693d0cb2c7ca5c0acbecd15d9442', 'info_dict': { - 'id': '2129880', - 'ext': 'flv', - 'title': 'Het journaal L - 25/10/14', - 'description': None, - 'timestamp': 1414271750.949, - 'upload_date': '20141025', - 'duration': 929, + 'id': 'vid-c65417a1-c725-47b2-8692-4c77234119cd', + 'ext': 'mp4', + 'title': 'Cyberbeveiliging - 164 studenten nemen deel aan wedstrijd, die ook een soort jobbeurs is', + 'description': 'Het tekort aan computerwetenschappers is een oud zeer. 
Voor hen zijn er zo maar eventjes 16.000 vacatures.', + 'duration': 88.19, }, 'skip': 'HTTP Error 404: Not Found', }, # sporza.be { - 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time', - 'md5': '11f53088da9bf8e7cfc42456697953ff', + 'url': 'https://sporza.be/nl/2019/03/31/sterke-alexander-kristoff-wint-gent-wevelgem-in-de-sprint/', + 'md5': 'fb5eb1716e2d451d5f3abcf3c9fcab58', 'info_dict': { - 'id': '2124639', - 'ext': 'flv', - 'title': 'Bekijk Extra Time van 20 oktober', - 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426', - 'timestamp': 1413835980.560, - 'upload_date': '20141020', - 'duration': 3238, + 'id': 'vid-0eb67979-227a-42b0-ab6d-1a5836779d7e', + 'ext': 'mp4', + 'title': 'Sterke Alexander Kristoff wint Gent-Wevelgem in de sprint', + 'description': '...', + 'duration': 334.05, }, 'skip': 'HTTP Error 404: Not Found', - }, - # cobra.be - { - 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari', - 'md5': '78a2b060a5083c4f055449a72477409d', - 'info_dict': { - 'id': '2126050', - 'ext': 'flv', - 'title': 'Bret Easton Ellis in Café Corsari', - 'description': 'md5:f699986e823f32fd6036c1855a724ee9', - 'timestamp': 1413967500.494, - 'upload_date': '20141022', - 'duration': 661, - }, - 'skip': 'HTTP Error 404: Not Found', - }, - { - # YouTube video - 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', - 'md5': 'b8b93da1df1cea6c8556255a796b7d61', - 'info_dict': { - 'id': 'Wji-BZ0oCwg', - 'ext': 'mp4', - 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', - 'description': 'md5:8e468944dce15567a786a67f74262583', - 'uploader': 'Star Wars', - 'uploader_id': 'starwars', - 'upload_date': '20160407', - }, - 'add_ie': ['Youtube'], - }, - { - 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'info_dict': { - 'id': '2377055', - 'ext': 'mp4', - 'title': 'Cafe Derby', - 'description': 'Lenny Van Wesemael debuteert met 
de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.', - 'upload_date': '20150626', - 'timestamp': 1435305240.769, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } } ] @@ -116,7 +67,6 @@ class VRTIE(InfoExtractor): headers=headers, data={}) vrtPlayerToken = result['vrtPlayerToken'] - print(vrtPlayerToken) formats = [] @@ -127,14 +77,11 @@ class VRTIE(InfoExtractor): headers=headers) for t in targetUrls['targetUrls']: - if t['url'].endswith('m3u8'): + if '.m3u8' in t['url']: formats.extend(self._extract_m3u8_formats(t['url'], video_id)) - elif t['url'].endswith('mpd'): + elif '.mpd' in t['url']: formats.extend(self._extract_mpd_formats(t['url'], video_id)) - if not formats and 'data-video-geoblocking="true"' in webpage: - self.raise_geo_restricted('This video is only available in Belgium') - self._sort_formats(formats) title = self._og_search_title(webpage) From b7c9583fd31ea564483f8d0e8bae283cd72aae4e Mon Sep 17 00:00:00 2001 From: Arthur Bols Date: Mon, 1 Apr 2019 16:16:49 +0200 Subject: [PATCH 3/3] flake8 compliance --- youtube_dl/extractor/vrt.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index f8f87bcae..8a62725bd 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -1,13 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( float_or_none, - int_or_none, - try_get, ) @@ -91,7 +86,7 @@ class VRTIE(InfoExtractor): r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000) duration = float_or_none(self._search_regex( r'data-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) - + return { 'id': video_id, 'title': title,