From 5333bea24f70d141da33b1e6290323538b66ee7a Mon Sep 17 00:00:00 2001 From: Frederic Bournival Date: Sun, 19 Apr 2020 17:13:54 -0400 Subject: [PATCH 1/5] First implementation for the TV5UnisCa extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tv5unisca.py | 51 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 youtube_dl/extractor/tv5unisca.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e407ab3d9..8a24e8c95 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1181,6 +1181,7 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE +from .tv5unisca import TV5UnisCaIE from .tva import TVAIE from .tvanouvelles import ( TVANouvellesIE, diff --git a/youtube_dl/extractor/tv5unisca.py b/youtube_dl/extractor/tv5unisca.py new file mode 100644 index 000000000..4f126d25e --- /dev/null +++ b/youtube_dl/extractor/tv5unisca.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + get_element_by_id +) + +import re + + +class TV5UnisCaIE(InfoExtractor): + IE_DESC = 'TV5UNISCA' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^?#]+)' + _TESTS = [] + _GEO_BYPASS = False + + def _real_extract(self, format_url): + + display_id = self._match_id(format_url) + webpage = self._download_webpage(format_url, display_id) + + next_data_dict = self._parse_json( + get_element_by_id('__NEXT_DATA__', webpage), display_id)\ + .get('props').get('apolloState') + + info_dict = self._json_ld( + next_data_dict['$ArtisanBlocksPageMetaData:50.blockConfiguration.pageMetaDataConfiguration']['jsonLd'], + display_id + ) + + formats = [] + for key in filter(lambda k: re.match(r'\$Video:\d+\.encodings\.', k), next_data_dict.keys()): + format_ul = next_data_dict[key].get('url') + if not format_ul: + continue + if 
format_ul.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(format_ul, display_id)) + if format_ul.endswith('.ism/manifest'): + formats.extend(self._extract_ism_formats(format_ul, display_id, ism_id='mss', fatal=False)) + if format_ul.endswith('.mp4'): + formats.append({ + 'url': format_ul, + 'format_id': 'http' + }) + + info_dict['id'] = info_dict['display_id'] = display_id + info_dict['formats'] = formats + + return info_dict From 9d1e43453a633de8b097baa4cb02f4db79c0bbb2 Mon Sep 17 00:00:00 2001 From: Frederic Bournival Date: Sun, 19 Apr 2020 17:17:45 -0400 Subject: [PATCH 2/5] Using another variable name for specific playlist/format urls --- youtube_dl/extractor/tv5unisca.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tv5unisca.py b/youtube_dl/extractor/tv5unisca.py index 4f126d25e..519b3cfaa 100644 --- a/youtube_dl/extractor/tv5unisca.py +++ b/youtube_dl/extractor/tv5unisca.py @@ -16,10 +16,10 @@ class TV5UnisCaIE(InfoExtractor): _TESTS = [] _GEO_BYPASS = False - def _real_extract(self, format_url): + def _real_extract(self, url): - display_id = self._match_id(format_url) - webpage = self._download_webpage(format_url, display_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) next_data_dict = self._parse_json( get_element_by_id('__NEXT_DATA__', webpage), display_id)\ @@ -32,16 +32,16 @@ class TV5UnisCaIE(InfoExtractor): formats = [] for key in filter(lambda k: re.match(r'\$Video:\d+\.encodings\.', k), next_data_dict.keys()): - format_ul = next_data_dict[key].get('url') - if not format_ul: + url = next_data_dict[key].get('url') + if not url: continue - if format_ul.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(format_ul, display_id)) - if format_ul.endswith('.ism/manifest'): - formats.extend(self._extract_ism_formats(format_ul, display_id, ism_id='mss', fatal=False)) - if format_ul.endswith('.mp4'): + if url.endswith('.m3u8'): 
+ formats.extend(self._extract_m3u8_formats(url, display_id)) + if url.endswith('.ism/manifest'): + formats.extend(self._extract_ism_formats(url, display_id, ism_id='mss', fatal=False)) + if url.endswith('.mp4'): formats.append({ - 'url': format_ul, + 'url': url, 'format_id': 'http' }) From 2a48d8cc936cfeb575078efb29e964b613415e74 Mon Sep 17 00:00:00 2001 From: Frederic Bournival Date: Sun, 19 Apr 2020 17:29:45 -0400 Subject: [PATCH 3/5] Geo countries --- youtube_dl/extractor/tv5unisca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv5unisca.py b/youtube_dl/extractor/tv5unisca.py index 519b3cfaa..04bb981a4 100644 --- a/youtube_dl/extractor/tv5unisca.py +++ b/youtube_dl/extractor/tv5unisca.py @@ -14,7 +14,7 @@ class TV5UnisCaIE(InfoExtractor): IE_DESC = 'TV5UNISCA' _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^?#]+)' _TESTS = [] - _GEO_BYPASS = False + _GEO_COUNTRIES = ['CA', 'FR'] def _real_extract(self, url): From 8f91c383bf7f45c5482c4c45b8f08e8d14051b34 Mon Sep 17 00:00:00 2001 From: Frederic Bournival Date: Sun, 19 Apr 2020 21:03:38 -0400 Subject: [PATCH 4/5] Setting GEO bypass Removing ISM since failing on many old videos Sorting formats Adding tests --- youtube_dl/extractor/tv5unisca.py | 40 ++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/tv5unisca.py b/youtube_dl/extractor/tv5unisca.py index 04bb981a4..2897c4124 100644 --- a/youtube_dl/extractor/tv5unisca.py +++ b/youtube_dl/extractor/tv5unisca.py @@ -13,8 +13,33 @@ import re class TV5UnisCaIE(InfoExtractor): IE_DESC = 'TV5UNISCA' _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^?#]+)' - _TESTS = [] - _GEO_COUNTRIES = ['CA', 'FR'] + _TESTS = [{ + 'url': 'https://www.tv5unis.ca/videos/expedition-kayak/saisons/1/episodes/2', + 'info_dict': { + 'id': 'expedition-kayak/saisons/1/episodes/2', + 'episode_number': 2, + 'season_number': 1, + 'ext': 'm3u8', + 'title': 'Expédition kayak - 
Gaspésie 2', + 'description': 'md5:aecf01897141d3997f10685b3f2662ef', + 'upload_date': '20200417', + 'timestamp': 1587085203, + } + }, { + 'url': 'https://www.tv5unis.ca/videos/la-bataille-de-notre-dame', + 'info_dict': { + 'id': 'la-bataille-de-notre-dame', + 'ext': 'm3u8', + 'title': 'La bataille de Notre-Dame', + 'description': 'md5:b69a25dbe9b1880eadad219af7372a7c', + 'upload_date': '20200414', + 'timestamp': 1586824384, + }, + 'params': { + 'skip_download': True, + } + }] + _GEO_BYPASS = False def _real_extract(self, url): @@ -22,7 +47,7 @@ class TV5UnisCaIE(InfoExtractor): webpage = self._download_webpage(url, display_id) next_data_dict = self._parse_json( - get_element_by_id('__NEXT_DATA__', webpage), display_id)\ + get_element_by_id('__NEXT_DATA__', webpage), display_id) \ .get('props').get('apolloState') info_dict = self._json_ld( @@ -30,6 +55,11 @@ class TV5UnisCaIE(InfoExtractor): display_id ) + if info_dict.get('season', ''): + info_dict['title'] = ' - '.join((info_dict.get('season', ''), info_dict.get('episode', ''))) + + info_dict['id'] = info_dict['display_id'] = display_id + formats = [] for key in filter(lambda k: re.match(r'\$Video:\d+\.encodings\.', k), next_data_dict.keys()): url = next_data_dict[key].get('url') @@ -37,15 +67,13 @@ class TV5UnisCaIE(InfoExtractor): continue if url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats(url, display_id)) - if url.endswith('.ism/manifest'): - formats.extend(self._extract_ism_formats(url, display_id, ism_id='mss', fatal=False)) if url.endswith('.mp4'): formats.append({ 'url': url, 'format_id': 'http' }) - info_dict['id'] = info_dict['display_id'] = display_id + self._sort_formats(formats) info_dict['formats'] = formats return info_dict From 5b8203f184748843ad1b1bc2c1acb02d23b9625f Mon Sep 17 00:00:00 2001 From: Frederic Bournival Date: Sun, 19 Apr 2020 21:41:26 -0400 Subject: [PATCH 5/5] Handling cases where the site removed a video but do not 404 --- youtube_dl/extractor/tv5unisca.py | 11 
++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv5unisca.py b/youtube_dl/extractor/tv5unisca.py index 2897c4124..21a7f4108 100644 --- a/youtube_dl/extractor/tv5unisca.py +++ b/youtube_dl/extractor/tv5unisca.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, get_element_by_id ) @@ -50,11 +51,15 @@ class TV5UnisCaIE(InfoExtractor): get_element_by_id('__NEXT_DATA__', webpage), display_id) \ .get('props').get('apolloState') - info_dict = self._json_ld( - next_data_dict['$ArtisanBlocksPageMetaData:50.blockConfiguration.pageMetaDataConfiguration']['jsonLd'], - display_id + metadata = next_data_dict.get( + '$ArtisanBlocksPageMetaData:50.blockConfiguration.pageMetaDataConfiguration', None ) + if not metadata: + raise ExtractorError('Video removed or not found.', expected=True) + + info_dict = self._json_ld(metadata.get('jsonLd'), display_id) + if info_dict.get('season', ''): info_dict['title'] = ' - '.join((info_dict.get('season', ''), info_dict.get('episode', '')))