From f226880c6d44098b5e99b05a83ed739e18d15690 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 14 Mar 2018 01:28:40 +0100 Subject: [PATCH 001/148] [tennistv] Add support for tennistv.com --- test/test_utils.py | 1 + youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tennistv.py | 113 +++++++++++++++++++++++++++++ youtube_dl/utils.py | 5 ++ 4 files changed, 120 insertions(+) create mode 100644 youtube_dl/extractor/tennistv.py diff --git a/test/test_utils.py b/test/test_utils.py index f92c65b59..a1fe6fdb2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -352,6 +352,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361) self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) + self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6d6ae89f8..3bde40eb3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1062,6 +1062,7 @@ from .telequebec import ( ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE +from .tennistv import TennisTVIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py new file mode 100644 index 000000000..601a17e57 --- /dev/null +++ b/youtube_dl/extractor/tennistv.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + unified_timestamp, +) + + +class TennisTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P[-a-z0-9]+)' + _TEST = { + 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', + 'info_dict': { + 'id': 'indian-wells-2018-verdasco-fritz', + 'ext': 'mp4', + 'title': 'Fernando Verdasco v Taylor Fritz', + 'description': 're:^After his stunning victory.{174}$', + 'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0', + 'timestamp': 1521017381, + 'upload_date': '20180314', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires email and password of a subscribed account', + } + _NETRC_MACHINE = 'tennistv' + + def _login(self): + (username, password) = self._get_login_info() + if not username or not password: + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) + + login_form = { + 'Email': username, + 'Password': password, + } + login_json = json.dumps(login_form) + headers = { + 'content-type': 'application/json', + 'Referer': 'https://www.tennistv.com/login', + 'Origin': 'https://www.tennistv.com', + } + + login_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/login', None, + note='Logging in', + errnote='Login failed (wrong password?)', + headers=headers, + data=login_json) + + if login_result['error']['errorCode']: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + + if login_result['entitlement'] != 'SUBSCRIBED': + self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME)) + + self._session_token = login_result['sessionToken'] + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id') + + headers = { + 'Origin': 'https://www.tennistv.com', + 'authorization': 'ATP %s' % self._session_token, + 'content-type': 'application/json', + 'Referer': url, + } + check_data = { + 'videoID': internal_id, + 'VideoUrlType': 'HLSV3', + } + check_json = json.dumps(check_data) + check_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', + video_id, note='Checking video authorization', headers=headers, data=check_json) + formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') + + vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id + vdata = self._download_json(vdata_url, video_id) + + timestamp = unified_timestamp(vdata['timestamp']) + thumbnail = vdata['video']['thumbnailUrl'] + description = vdata['displayText']['description'] + title = vdata['video']['title'] + + series = vdata['tour'] + venue = vdata['displayText']['venue'] + round_str = vdata['seo']['round'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'timestamp': timestamp, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'series': series, + 'season': venue, + 'episode': round_str, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a21455f70..027d12785 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1211,6 +1211,11 @@ def unified_timestamp(date_str, day_first=True): if m: date_str = date_str[:-len(m.group('tz'))] + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + for expression in date_formats(day_first): try: dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) From f3672ac522e85b3eae339a95d34f46e92d8ebaa3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 14 Mar 2018 09:55:46 +0100 Subject: [PATCH 002/148] [line] lint (remove space on empty line) --- youtube_dl/extractor/line.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py index 8414312fc..7f5fa446e 100644 --- a/youtube_dl/extractor/line.py +++ b/youtube_dl/extractor/line.py @@ -77,7 +77,7 @@ class LineTVIE(InfoExtractor): title = self._og_search_title(webpage) # like_count requires an additional API request https://tv.line.me/api/likeit/getCount - + return { 'id': video_id, 'title': title, From e6e68069f6fe25fe4a2b72487be840ba2ec3c5c6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 14 Mar 2018 11:23:09 +0100 Subject: [PATCH 003/148] [tennistv] Correctly encode POST parameters In python 3.x, the POST parameters must be bytes, not str. --- ChangeLog | 2 +- youtube_dl/extractor/tennistv.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index f2f0d6143..ad639c805 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,7 +2,7 @@ version Extractors + [line] Add support for tv.line.me (#9427) - ++ [tennistv] Add support for tennistv.com version 2018.03.10 diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index 601a17e57..def29b6fa 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -40,7 +40,7 @@ class TennisTVIE(InfoExtractor): 'Email': username, 'Password': password, } - login_json = json.dumps(login_form) + login_json = json.dumps(login_form).encode('utf-8') headers = { 'content-type': 'application/json', 'Referer': 'https://www.tennistv.com/login', @@ -81,7 +81,7 @@ class TennisTVIE(InfoExtractor): 'videoID': internal_id, 'VideoUrlType': 'HLSV3', } - check_json = json.dumps(check_data) + check_json = json.dumps(check_data).encode('utf-8') check_result = self._download_json( 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', video_id, note='Checking video authorization', headers=headers, data=check_json) From b848a4ca1a4bb5b2f64eb551d1bbd73ddcd2e9b1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 14 Mar 2018 11:48:20 +0100 Subject: [PATCH 004/148] [tennistv] Remove duplicate key in dictionary --- youtube_dl/extractor/tennistv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index def29b6fa..0c6f70784 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -104,7 +104,6 @@ class TennisTVIE(InfoExtractor): 'title': title, 'description': description, 'formats': formats, - 'timestamp': timestamp, 'thumbnail': thumbnail, 'timestamp': timestamp, 'series': series, From b8c6badc96fa52e1851d2c5803cb9a1563bf9de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Mar 2018 22:38:30 +0700 Subject: [PATCH 005/148] [soundcloud] Update client id (closes #15866) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 1ca310b90..46332e5c2 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -157,7 +157,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'DQskPX1pntALRzMp4HSxya3Mc0AO66Ro' + _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' @staticmethod def _extract_urls(webpage): From c95dfb050942e353fa39f83d02bf08dedb13963a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Mar 2018 22:45:05 +0700 Subject: [PATCH 006/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index ad639c805..1b6e62135 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,12 @@ version Extractors -+ [line] Add support for tv.line.me (#9427) +* [soundcloud] Update client id (#15866) + [tennistv] Add support for tennistv.com ++ [line] Add support for tv.line.me (#9427) +* [xnxx] Fix extraction (#15817) +* [njpwworld] Fix authentication (#15815) + version 2018.03.10 From 46c6742d4f1b1afa2d6dc787e8b0b119f9c5ee98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Mar 2018 22:49:22 +0700 Subject: [PATCH 007/148] release 2018.03.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index bc0c5ef18..481e2ed74 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.10 +[debug] youtube-dl version 2018.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1b6e62135..47736e076 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.14 Extractors * [soundcloud] Update client id (#15866) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index cb11f1b42..80358bb14 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -427,6 +427,7 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineTV** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -829,6 +830,7 @@ - **TeleQuebecLive** - **TeleTask** - **Telewebion** + - **TennisTV** - **TF1** - **TFO** - **TheIntercept** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a35d10818..6ce11c39b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.10' +__version__ = '2018.03.14' From 27b1c73f14617eec4286cc85c68c87b6635cfff3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 15 Mar 2018 14:33:36 +0100 Subject: [PATCH 008/148] [instagram] fix user videos extraction(fixes #15858) --- youtube_dl/extractor/instagram.py | 106 +++++++++++++----------------- 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index a77f619d2..ac9d92a8d 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals -import itertools +import json import re from .common import InfoExtractor @@ -238,70 +238,58 @@ class InstagramUserIE(InfoExtractor): } def _entries(self, uploader_id): - query = { - '__a': 1, - } - - def get_count(kind): + def get_count(suffix): return int_or_none(try_get( - node, lambda x: x['%ss' % kind]['count'])) + node, lambda x: x['edge_media_' + suffix]['count'])) - for page_num in itertools.count(1): - page = self._download_json( - 'https://instagram.com/%s/' % uploader_id, uploader_id, - note='Downloading page %d' % page_num, - fatal=False, query=query) - if not page: - break - - nodes = try_get(page, lambda x: x['user']['media']['nodes'], list) - if not nodes: - break - - max_id = None - - for node in nodes: - node_id = node.get('id') - if node_id: - max_id = node_id - - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('code') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, [lambda x: x['caption'], lambda x: x['text']['id']], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('date')) - - comment_count = get_count('comment') - like_count = get_count('like') - view_count = int_or_none(node.get('video_views')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, + edges = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, query={ + 'query_hash': '472f257a40c653c64c666ce877d59d2b', + 'variables': json.dumps({ + 'id': uploader_id, + 'first': 999999999, }) + })['data']['user']['edge_owner_to_timeline_media']['edges'] - yield info + for edge in edges: + node = edge['node'] - if not max_id: - break + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('shortcode') + if not video_id: + continue - query['max_id'] = max_id + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) + + description = try_get( + node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('taken_at_timestamp')) + + comment_count = get_count('to_comment') + like_count = get_count('preview_like') + view_count = int_or_none(node.get('video_view_count')) + + info.update({ + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, + }) + + yield info def _real_extract(self, url): - uploader_id = self._match_id(url) + username = self._match_id(url) + uploader_id = self._download_json( + 'https://instagram.com/%s/' % username, username, query={ + '__a': 1, + })['graphql']['user']['id'] return self.playlist_result( - self._entries(uploader_id), uploader_id, uploader_id) + self._entries(uploader_id), username, username) From 8e70c1bfac98b3d0d304b66ff1d616dd26522acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Mar 2018 22:37:41 +0700 Subject: [PATCH 009/148] [heise] Improve extraction (closes #15496, closes #15784, closes #15026) --- youtube_dl/extractor/heise.py | 76 +++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 8f49f52ef..5c03780a3 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -7,6 +7,7 @@ from .youtube import YoutubeIE from ..utils import ( determine_ext, int_or_none, + NO_DEFAULT, parse_iso8601, smuggle_url, xpath_text, @@ -16,18 +17,19 @@ from ..utils import ( class HeiseIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P[0-9]+)\.html' _TESTS = [{ + # kaltura embed 'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html', - 'md5': 'ffed432483e922e88545ad9f2f15d30e', 'info_dict': { - 'id': '2404147', + 'id': '1_kkrq94sm', 'ext': 'mp4', 'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone", - 'format_id': 'mp4_720p', - 'timestamp': 1411812600, - 'upload_date': '20140927', + 'timestamp': 1512734959, + 'upload_date': '20171208', 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', - 'thumbnail': r're:^https?://.*/gallery/$', - } + }, + 'params': { + 'skip_download': True, + }, }, { # YouTube embed 'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', @@ -46,13 +48,26 @@ class HeiseIE(InfoExtractor): }, }, { 'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html', - 'md5': '4b58058b46625bdbd841fc2804df95fc', 'info_dict': { 'id': '1_ntrmio2s', + 'ext': 'mp4', + 'title': "nachgehakt: Wie sichert das c't-Tool Restric'tor Windows 10 ab?", + 'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 'timestamp': 1512470717, 'upload_date': '20171205', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', + 'info_dict': { + 'id': '1_59mk80sf', 'ext': 'mp4', - 'title': 'ct10 nachgehakt hos restrictor', + 'title': "c't uplink 20.8: Staubsaugerroboter Xiaomi Vacuum 2, AR-Brille Meta 2 und Android rooten", + 'description': 'md5:f50fe044d3371ec73a8f79fcebd74afc', + 'timestamp': 1517567237, + 'upload_date': '20180202', }, 'params': { 'skip_download': True, @@ -72,19 +87,40 @@ class HeiseIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('fulltitle', webpage, default=None) - if not title or title == "c't": - title = self._search_regex( - r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') + def extract_title(default=NO_DEFAULT): + title = self._html_search_meta( + ('fulltitle', 'title'), webpage, default=None) + if not title or title == "c't": + title = self._search_regex( + r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title', default=None) + if not title: + title = self._html_search_regex( + r']+\bclass=["\']article_page_title[^>]+>(.+?)<', + webpage, 'title', default=default) + return title - yt_urls = YoutubeIE._extract_urls(webpage) - if yt_urls: - return self.playlist_from_matches(yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + title = extract_title(default=None) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage) kaltura_url = KalturaIE._extract_url(webpage) if kaltura_url: - return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) + return { + '_type': 'url_transparent', + 'url': smuggle_url(kaltura_url, {'source_url': url}), + 'ie_key': KalturaIE.ie_key(), + 'title': title, + 'description': description, + } + + yt_urls = YoutubeIE._extract_urls(webpage) + if yt_urls: + return self.playlist_from_matches( + yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + + title = extract_title() container_id = self._search_regex( r'
]+data-container="([0-9]+)"', @@ -115,10 +151,6 @@ class HeiseIE(InfoExtractor): }) self._sort_formats(formats) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage) - return { 'id': video_id, 'title': title, From 3526c3043b5d6ce64d9cf0ccab20ef0b7a1e6a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 16 Mar 2018 00:19:17 +0700 Subject: [PATCH 010/148] [bilibili] Fix and improve extraction (closes #15048, closes #15430, closes #15622, closes #15863) --- youtube_dl/extractor/bilibili.py | 145 ++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index beffcecd0..b898223e3 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -27,14 +27,14 @@ class BiliBiliIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', + 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { 'id': '1074402', - 'ext': 'mp4', + 'ext': 'flv', 'title': '【金坷垃】金泡沫', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.315, - 'timestamp': 1398012660, + 'duration': 308.067, + 'timestamp': 1398012678, 'upload_date': '20140420', 'thumbnail': r're:^https?://.+\.jpg', 'uploader': '菊子桑', @@ -59,17 +59,38 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://www.bilibili.com/video/av8903802/', 'info_dict': { 'id': '8903802', - 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382620, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, # Test metadata only }, + 'playlist': [{ + 'info_dict': { + 'id': '8903802_part1', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, + }, { + 'info_dict': { + 'id': '8903802_part2', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, + }] }] _APP_KEY = '84956560bc028eb7' @@ -92,9 +113,13 @@ class BiliBiliIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if 'anime/' not in url: - cid = compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + cid = self._search_regex( + r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + default=None + ) or compat_parse_qs(self._search_regex( + [r'1EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'1EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', + r'1]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters'))['cid'][0] else: if 'no_bangumi_tip' not in smuggled_data: @@ -114,53 +139,66 @@ class BiliBiliIE(InfoExtractor): self._report_error(js) cid = js['result']['cid'] - payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) - sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - headers = { 'Referer': url } headers.update(self.geo_verification_headers()) - video_info = self._download_json( - 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers) - - if 'durl' not in video_info: - self._report_error(video_info) - entries = [] - for idx, durl in enumerate(video_info['durl']): - formats = [{ - 'url': durl['url'], - 'filesize': int_or_none(durl['size']), - }] - for backup_url in durl.get('backup_url', []): - formats.append({ - 'url': backup_url, - # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url else -3, + RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') + for num, rendition in enumerate(RENDITIONS, start=1): + payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + + video_info = self._download_json( + 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + + if not video_info: + continue + + if 'durl' not in video_info: + if num < len(RENDITIONS): + continue + self._report_error(video_info) + + for idx, durl in enumerate(video_info['durl']): + formats = [{ + 'url': durl['url'], + 'filesize': int_or_none(durl['size']), + }] + for backup_url in durl.get('backup_url', []): + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'preference': -2 if 'hd.mp4' in backup_url else -3, + }) + + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, }) + break - for a_format in formats: - a_format.setdefault('http_headers', {}).update({ - 'Referer': url, - }) - - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (video_id, idx), - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - }) - - title = self._html_search_regex(']*>([^<]+)', webpage, 'title') + title = self._html_search_regex( + (']+\btitle=(["\'])(?P(?:(?!\1).)+)\1', + '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + group='title') description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None)) + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', + default=None) or self._html_search_meta( + 'uploadDate', webpage, 'timestamp', default=None)) thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript @@ -174,13 +212,16 @@ class BiliBiliIE(InfoExtractor): } uploader_mobj = re.search( - r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', + r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)', webpage) if uploader_mobj: info.update({ 'uploader': uploader_mobj.group('name'), 'uploader_id': uploader_mobj.group('id'), }) + if not info.get('uploader'): + info['uploader'] = self._html_search_meta( + 'author', webpage, 'uploader', default=None) for entry in entries: entry.update(info) From d12396085754a597c2c5e621e4a68471871e2cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 16 Mar 2018 03:18:53 +0700 Subject: [PATCH 011/148] [bilibili] Switch to v2 playurl API --- youtube_dl/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index b898223e3..90697c4a7 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -152,7 +152,7 @@ class BiliBiliIE(InfoExtractor): sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() video_info = self._download_json( - 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), video_id, note='Downloading video info page', headers=headers, fatal=num == len(RENDITIONS)) From 178ee88319a384b66d9b2da27a819f32ba870425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 17 Mar 2018 23:57:07 +0700 Subject: [PATCH 012/148] [generic] Add support for xfileshare embeds (closes #15879) --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/xfileshare.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a98f3636a..dbd565066 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -104,6 +104,7 @@ from .mediasite import MediasiteIE from .springboardplatform import SpringboardPlatformIE from .yapfiles import YapFilesIE from .vice import ViceIE +from .xfileshare import XFileShareIE class GenericIE(InfoExtractor): @@ -2971,6 +2972,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( vice_urls, video_id, video_title, ie=ViceIE.ie_key()) + xfileshare_urls = XFileShareIE._extract_urls(webpage) + if xfileshare_urls: + return self.playlist_from_matches( + xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index ad747978d..bc3239f68 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -118,6 +118,15 @@ class XFileShareIE(InfoExtractor): 'only_matching': True }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' + % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), + webpage)] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') From 96b8b9abaecb7518d901dc9d6a617f19c3161236 Mon Sep 17 00:00:00 2001 From: Ricardo Constantino <wiiaboo@gmail.com> Date: Wed, 7 Mar 2018 21:31:53 +0000 Subject: [PATCH 013/148] [extractor/generic] Support relative URIs in _parse_xspf <location> can have relative URIs, not just absolute. --- test/test_InfoExtractor.py | 42 ++++++++++++++++++++++++++++++++ test/testdata/xspf/foo_xspf.xspf | 34 ++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 6 ++--- youtube_dl/extractor/generic.py | 4 ++- 4 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 test/testdata/xspf/foo_xspf.xspf diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 7b31d5198..a695ce64b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -694,6 +694,48 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_xspf(self): + _TEST_CASES = [ + ( + 'foo_xspf', + 'https://example.org/src/', + [{ + 'description': 'Visit http://bigbrother404.bandcamp.com', + 'duration': 202.416, + 'formats': [{'url': 'https://example.org/src/cd1/track%201.mp3'}], + 'id': 'foo_xspf', + 'title': 'Pandemonium' + }, + { + 'description': 'Visit http://bigbrother404.bandcamp.com', + 'duration': 255.857, + 'formats': [{'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3'}], + 'id': 'foo_xspf', + 'title': 'Final Cartridge (Nichico Twelve Remix)' + }, + { + 'description': 'Visit http://bigbrother404.bandcamp.com', + 'duration': 287.915, + 'formats': [ + {'url': 'https://example.org/src/track3.mp3'}, + {'url': 'https://example.com/track3.mp3'} + ], + 'id': 'foo_xspf', + 'title': 'Rebuilding Nightingale' + }] + ), + ] + + for xspf_file, xspf_base_url, expected_entries in _TEST_CASES: + with io.open('./test/testdata/xspf/%s.xspf' % xspf_file, + mode='r', encoding='utf-8') as f: + entries = self.ie._parse_xspf( + compat_etree_fromstring(f.read().encode('utf-8')), + xspf_file, xspf_base_url) + expect_value(self, entries, expected_entries, None) + for i in range(len(entries)): + expect_dict(self, entries[i], expected_entries[i]) + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/xspf/foo_xspf.xspf b/test/testdata/xspf/foo_xspf.xspf new file mode 100644 index 000000000..b7f0086b3 --- /dev/null +++ b/test/testdata/xspf/foo_xspf.xspf @@ -0,0 +1,34 @@ +<?xml version="1.0" encoding="UTF-8"?> +<playlist version="1" xmlns="http://xspf.org/ns/0/"> + <date>2018-03-09T18:01:43Z</date> + <trackList> + <track> + <location>cd1/track%201.mp3</location> + <title>Pandemonium + Foilverb + Visit http://bigbrother404.bandcamp.com + Pandemonium EP + 1 + 202416 + + + ../%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3 + Final Cartridge (Nichico Twelve Remix) + Visit http://bigbrother404.bandcamp.com + Foilverb + Pandemonium EP + 2 + 255857 + + + track3.mp3 + https://example.com/track3.mp3 + Rebuilding Nightingale + Visit http://bigbrother404.bandcamp.com + Foilverb + Pandemonium EP + 3 + 287915 + + + diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fcdd0fd14..c1e1012e7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1700,9 +1700,9 @@ class InfoExtractor(object): 'Unable to download xspf manifest', fatal=fatal) if xspf is False: return [] - return self._parse_xspf(xspf, playlist_id) + return self._parse_xspf(xspf, playlist_id, base_url(playlist_url)) - def _parse_xspf(self, playlist, playlist_id): + def _parse_xspf(self, playlist, playlist_id, playlist_base_url=''): NS_MAP = { 'xspf': 'http://xspf.org/ns/0/', 's1': 'http://static.streamone.nl/player/ns/0', @@ -1720,7 +1720,7 @@ class InfoExtractor(object): xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) formats = [{ - 'url': location.text, + 'url': urljoin(playlist_base_url, location.text), 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dbd565066..023ccbc9b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2232,7 +2232,9 @@ class GenericIE(InfoExtractor): self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': - return self.playlist_result(self._parse_xspf(doc, video_id), video_id) + return self.playlist_result( + self._parse_xspf(doc, video_id, compat_str(full_response.geturl())), + video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, From e0d198c18d4a5f191adbfb43259c104d16e30596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Mar 2018 02:17:34 +0700 Subject: [PATCH 014/148] [extractor/common] Add _download_xml_handle --- youtube_dl/extractor/common.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c1e1012e7..a50778509 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -644,19 +644,31 @@ class InfoExtractor(object): content, _ = res return content + def _download_xml_handle( + self, url_or_request, video_id, note='Downloading XML', + errnote='Unable to download XML', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) + if res is False: + return res + xml_string, urlh = res + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" - xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) - if xml_string is False: - return xml_string - return self._parse_xml( - xml_string, video_id, transform_source=transform_source, - fatal=fatal) + res = self._download_xml_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query) + return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: From 47a5cb77344536ca79d81a04904ac9ef9b02050f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Mar 2018 02:46:50 +0700 Subject: [PATCH 015/148] Generalize XML manifest processing code and improve XSPF parsing (closes #15794) --- test/test_InfoExtractor.py | 41 ++++++++++++++++++------------- youtube_dl/extractor/common.py | 43 +++++++++++++++++++-------------- youtube_dl/extractor/generic.py | 4 ++- 3 files changed, 52 insertions(+), 36 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a695ce64b..4833396a5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -698,40 +698,47 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ _TEST_CASES = [ ( 'foo_xspf', - 'https://example.org/src/', + 'https://example.org/src/foo_xspf.xspf', [{ + 'id': 'foo_xspf', + 'title': 'Pandemonium', 'description': 'Visit http://bigbrother404.bandcamp.com', 'duration': 202.416, - 'formats': [{'url': 'https://example.org/src/cd1/track%201.mp3'}], + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/src/cd1/track%201.mp3', + }], + }, { 'id': 'foo_xspf', - 'title': 'Pandemonium' - }, - { + 'title': 'Final Cartridge (Nichico Twelve Remix)', 'description': 'Visit http://bigbrother404.bandcamp.com', 'duration': 255.857, - 'formats': [{'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3'}], + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3', + }], + }, { 'id': 'foo_xspf', - 'title': 'Final Cartridge (Nichico Twelve Remix)' - }, - { + 'title': 'Rebuilding Nightingale', 'description': 'Visit http://bigbrother404.bandcamp.com', 'duration': 287.915, - 'formats': [ - {'url': 'https://example.org/src/track3.mp3'}, - {'url': 'https://example.com/track3.mp3'} - ], - 'id': 'foo_xspf', - 'title': 'Rebuilding Nightingale' + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/src/track3.mp3', + }, { + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.com/track3.mp3', + }] }] ), ] - for xspf_file, xspf_base_url, expected_entries in _TEST_CASES: + for xspf_file, xspf_url, expected_entries in _TEST_CASES: with io.open('./test/testdata/xspf/%s.xspf' % xspf_file, mode='r', encoding='utf-8') as f: entries = self.ie._parse_xspf( compat_etree_fromstring(f.read().encode('utf-8')), - xspf_file, xspf_base_url) + xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url) expect_value(self, entries, expected_entries, None) for i in range(len(entries)): expect_dict(self, entries[i], expected_entries[i]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a50778509..2e2a02948 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1706,22 +1706,24 @@ class InfoExtractor(object): }) return subtitles - def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): xspf = self._download_xml( - playlist_url, playlist_id, 'Downloading xpsf playlist', + xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) if xspf is False: return [] - return self._parse_xspf(xspf, playlist_id, base_url(playlist_url)) + return self._parse_xspf( + xspf, playlist_id, xspf_url=xspf_url, + xspf_base_url=base_url(xspf_url)) - def _parse_xspf(self, playlist, playlist_id, playlist_base_url=''): + def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): NS_MAP = { 'xspf': 'http://xspf.org/ns/0/', 's1': 'http://static.streamone.nl/player/ns/0', } entries = [] - for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): title = xpath_text( track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) description = xpath_text( @@ -1731,12 +1733,18 @@ class InfoExtractor(object): duration = float_or_none( xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) - formats = [{ - 'url': urljoin(playlist_base_url, location.text), - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + formats = [] + for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): + format_url = urljoin(xspf_base_url, location.text) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'manifest_url': xspf_url, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + }) self._sort_formats(formats) entries.append({ @@ -1750,18 +1758,18 @@ class InfoExtractor(object): return entries def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): - res = self._download_webpage_handle( + res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', fatal=fatal) if res is False: return [] - mpd, urlh = res + mpd_doc, urlh = res mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, formats_dict=formats_dict, mpd_url=mpd_url) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): @@ -2035,17 +2043,16 @@ class InfoExtractor(object): return formats def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): - res = self._download_webpage_handle( + res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', fatal=fatal) if res is False: return [] - ism, urlh = res + ism_doc, urlh = res - return self._parse_ism_formats( - compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): """ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 023ccbc9b..1cc491b19 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2233,7 +2233,9 @@ class GenericIE(InfoExtractor): return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result( - self._parse_xspf(doc, video_id, compat_str(full_response.geturl())), + self._parse_xspf( + doc, video_id, xspf_url=url, + xspf_base_url=compat_str(full_response.geturl())), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( From 6e3f23d912ae2b7018a13f87ff89572dfac10d02 Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Sun, 18 Mar 2018 21:14:33 +0000 Subject: [PATCH 016/148] [prosiebensat1] Add support for galileo.tv (closes #15894) --- youtube_dl/extractor/prosiebensat1.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7efff4566..d0955d079 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -133,7 +133,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): (?: prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia )\.(?:de|at|ch)| - ran\.de|fem\.com|advopedia\.de + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video ) /(?P.+) ''' @@ -326,6 +326,11 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', 'only_matching': True, }, + { + # geo restricted to Germany + 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', + 'only_matching': True, + }, { 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', 'only_matching': True, @@ -343,7 +348,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', r'clip[iI]d=(\d+)', - r'clip[iI]d\s*=\s*["\'](\d+)', + r'clip[iI][dD]\s*=\s*["\'](\d+)', r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", r'proMamsId"\s*:\s*"(\d+)', r'proMamsId"\s*:\s*"(\d+)', From 9a054fcbbadf06101b081f8be0594b38b654364f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Mar 2018 23:28:37 +0700 Subject: [PATCH 017/148] [ceskatelevize] Add support for iframe embeds (closes #15918) --- youtube_dl/extractor/ceskatelevize.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index e250de18c..6bad90859 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -13,6 +13,7 @@ from ..utils import ( float_or_none, sanitized_Request, unescapeHTML, + update_url_query, urlencode_postdata, USER_AGENTS, ) @@ -265,6 +266,10 @@ class CeskaTelevizePoradyIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): @@ -272,8 +277,11 @@ class CeskaTelevizePoradyIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - data_url = unescapeHTML(self._search_regex( - r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'iframe player url', group='url')) + data_url = update_url_query(unescapeHTML(self._search_regex( + (r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe player url', group='url')), query={ + 'autoStart': 'true', + }) return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) From 38f59e2793dfcb5f493977857304ab50b784e6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Mar 2018 23:40:19 +0700 Subject: [PATCH 018/148] [canalc2] Add support for HTML5 videos (closes #15916, closes #15919) --- youtube_dl/extractor/canalc2.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index acd87e371..407cc8084 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -31,6 +31,10 @@ class Canalc2IE(InfoExtractor): webpage = self._download_webpage( 'http://www.canalc2.tv/video/%s' % video_id, video_id) + title = self._html_search_regex( + r'(?s)class="[^"]*col_description[^"]*">.*?

(.+?)

', + webpage, 'title') + formats = [] for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage): if video_url.startswith('rtmp://'): @@ -49,17 +53,21 @@ class Canalc2IE(InfoExtractor): 'url': video_url, 'format_id': 'http', }) - self._sort_formats(formats) - title = self._html_search_regex( - r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') - duration = parse_duration(self._search_regex( - r'id=["\']video_duree["\'][^>]*>([^<]+)', - webpage, 'duration', fatal=False)) + if formats: + info = { + 'formats': formats, + } + else: + info = self._parse_html5_media_entries(url, webpage, url)[0] - return { + self._sort_formats(info['formats']) + + info.update({ 'id': video_id, 'title': title, - 'duration': duration, - 'formats': formats, - } + 'duration': parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)), + }) + return info From 6780154e6bcdabdb35a24d2b1c5049c94fbe27a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Mar 2018 23:43:53 +0700 Subject: [PATCH 019/148] [extractor/common] Improve thumbnail extraction for HTML5 entries --- youtube_dl/extractor/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2e2a02948..890232586 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2150,8 +2150,8 @@ class InfoExtractor(object): return formats def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): - def absolute_url(video_url): - return compat_urlparse.urljoin(base_url, video_url) + def absolute_url(item_url): + return urljoin(base_url, item_url) def parse_content_type(content_type): if not content_type: @@ -2208,7 +2208,7 @@ class InfoExtractor(object): if src: _, formats = _media_formats(src, media_type) media_info['formats'].extend(formats) - media_info['thumbnail'] = media_attributes.get('poster') + media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: for source_tag in re.findall(r']+>', media_content): source_attributes = extract_attributes(source_tag) From 21dedcb5804b070bea143e4670df3b6f2951a078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Mar 2018 00:27:39 +0700 Subject: [PATCH 020/148] [cbs] Skip unavailable assets (closes #13490, closes #13506, closes #15776) --- youtube_dl/extractor/cbs.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1268e38ef..f425562ab 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .theplatform import ThePlatformFeedIE from ..utils import ( + ExtractorError, int_or_none, find_xpath_attr, xpath_element, @@ -61,6 +62,7 @@ class CBSIE(CBSBaseIE): asset_types = [] subtitles = {} formats = [] + last_e = None for item in items_data.findall('.//item'): asset_type = xpath_text(item, 'assetType') if not asset_type or asset_type in asset_types: @@ -74,11 +76,17 @@ class CBSIE(CBSBaseIE): query['formats'] = 'MPEG4,M3U' elif asset_type in ('RTMP', 'WIFI', '3G'): query['formats'] = 'MPEG4,FLV' - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query(tp_release_url, query), content_id, - 'Downloading %s SMIL data' % asset_type) + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data' % asset_type) + except ExtractorError as e: + last_e = e + continue formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e self._sort_formats(formats) info = self._extract_theplatform_metadata(tp_path, content_id) From 832f9d5258ac53e916515ad0b6b1490c872d6174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Mar 2018 01:06:58 +0700 Subject: [PATCH 021/148] [9now] Bypass geo restriction (closes #15920) --- youtube_dl/extractor/ninenow.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py index 351bea7ba..f32f530f7 100644 --- a/youtube_dl/extractor/ninenow.py +++ b/youtube_dl/extractor/ninenow.py @@ -4,15 +4,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + ExtractorError, int_or_none, float_or_none, - ExtractorError, + smuggle_url, ) class NineNowIE(InfoExtractor): IE_NAME = '9now.com.au' _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P[^/?#]+)' + _GEO_COUNTRIES = ['AU'] _TESTS = [{ # clip 'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc', @@ -75,7 +77,9 @@ class NineNowIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': self._GEO_COUNTRIES}), 'id': video_id, 'title': title, 'description': common_data.get('description'), From d9e2240f7c5d6b1a8ecd133625827f2c806dc9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Mar 2018 01:40:53 +0700 Subject: [PATCH 022/148] [7plus] Extract series metadata (closes #15862, closes #15906) --- youtube_dl/extractor/sevenplus.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py index 9792f820a..84568ac69 100644 --- a/youtube_dl/extractor/sevenplus.py +++ b/youtube_dl/extractor/sevenplus.py @@ -4,22 +4,30 @@ from __future__ import unicode_literals import re from .brightcove import BrightcoveNewIE -from ..utils import update_url_query +from ..compat import compat_str +from ..utils import ( + try_get, + update_url_query, +) class SevenPlusIE(BrightcoveNewIE): IE_NAME = '7plus' _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P[^?]+\?.*?\bepisode-id=(?P[^&#]+))' _TESTS = [{ - 'url': 'https://7plus.com.au/BEAT?episode-id=BEAT-001', + 'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003', 'info_dict': { - 'id': 'BEAT-001', + 'id': 'MTYS7-003', 'ext': 'mp4', - 'title': 'S1 E1 - Help / Lucy In The Sky With Diamonds', - 'description': 'md5:37718bea20a8eedaca7f7361af566131', + 'title': 'S7 E3 - Wind Surf', + 'description': 'md5:29c6a69f21accda7601278f81b46483d', 'uploader_id': '5303576322001', - 'upload_date': '20171031', - 'timestamp': 1509440068, + 'upload_date': '20171201', + 'timestamp': 1512106377, + 'series': 'Mighty Ships', + 'season_number': 7, + 'episode_number': 3, + 'episode': 'Wind Surf', }, 'params': { 'format': 'bestvideo', @@ -63,5 +71,14 @@ class SevenPlusIE(BrightcoveNewIE): value = item.get(src_key) if value: info[dst_key] = value + info['series'] = try_get( + item, lambda x: x['seriesLogo']['name'], compat_str) + mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) + if mobj: + info.update({ + 'season_number': int(mobj.group(1)), + 'episode_number': int(mobj.group(2)), + 'episode': mobj.group(3), + }) return info From c651de39d51cddf5ddefb446a89a62a6a424c39c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Mar 2018 01:49:22 +0700 Subject: [PATCH 023/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 47736e076..f3a1ca60d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version + +Core +* [extractor/common] Improve thumbnail extraction for HTML5 entries +* Generalize XML manifest processing code and improve XSPF parsing ++ [extractor/common] Add _download_xml_handle ++ [extractor/common] Add support for relative URIs in _parse_xspf (#15794) + +Extractors ++ [7plus] Extract series metadata (#15862, #15906) +* [9now] Bypass geo restriction (#15920) +* [cbs] Skip unavailable assets (#13490, #13506, #15776) ++ [canalc2] Add support for HTML5 videos (#15916, #15919) ++ [ceskatelevize] Add support for iframe embeds (#15918) ++ [prosiebensat1] Add support for galileo.tv (#15894) ++ [generic] Add support for xfileshare embeds (#15879) +* [bilibili] Switch to v2 playurl API +* [bilibili] Fix and improve extraction (#15048, #15430, #15622, #15863) +* [heise] Improve extraction (#15496, #15784, #15026) +* [instagram] Fix user videos extraction (#15858) + + version 2018.03.14 Extractors From a66d1d079a3c2f2791b0a67c97cc9cec8c2faffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Mar 2018 01:55:48 +0700 Subject: [PATCH 024/148] release 2018.03.20 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 481e2ed74..75c5b2226 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.20*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.20** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.14 +[debug] youtube-dl version 2018.03.20 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f3a1ca60d..0d748316e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.20 Core * [extractor/common] Improve thumbnail extraction for HTML5 entries diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6ce11c39b..c686714f0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.14' +__version__ = '2018.03.20' From 3395958d2befc710181bbde872074ce81eee9158 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 20 Mar 2018 23:07:11 +0100 Subject: [PATCH 025/148] libsyn: adapt to new page structure and replace testcase --- youtube_dl/extractor/libsyn.py | 52 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 4750b03a3..f7311f483 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,24 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + parse_duration, + unified_strdate, +) class LibsynIE(InfoExtractor): _VALID_URL = r'(?Phttps?://html5-player\.libsyn\.com/embed/episode/id/(?P[0-9]+))' _TESTS = [{ - 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', - 'md5': '443360ee1b58007bc3dcf09b41d093bb', + 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', + 'md5': '2a55e75496c790cdeb058e7e6c087746', 'info_dict': { - 'id': '3377616', + 'id': '6385796', 'ext': 'mp3', - 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': 'md5:601cb790edd05908957dae8aaa866465', - 'upload_date': '20150220', + 'title': "Champion Minded - Developing a Growth Mindset", + 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + 'upload_date': '20180320', 'thumbnail': 're:^https?://.*', }, }, { @@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor): url = m.group('mainurl') webpage = self._download_webpage(url, video_id) - formats = [{ - 'url': media_url, - } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] - podcast_title = self._search_regex( - r'

([^<]+)

', webpage, 'podcast title', default=None) + r'

([^<]+)

', webpage, 'podcast title', default=None) + if podcast_title: + podcast_title = podcast_title.strip() episode_title = self._search_regex( - r'(?:
|

)([^<]+)|

)([^<]+)(.+?)

', webpage, + r'(.+?)

', webpage, 'description', default=None) - thumbnail = self._search_regex( - r']+class="info-show-icon"[^>]+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + if description: + # Strip non-breaking and normal spaces + description = description.replace('\u00A0', ' ').strip() release_date = unified_strdate(self._search_regex( r'
Released: ([^<]+)<', webpage, 'release date', fatal=False)) + data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') + data = json.loads(data_json) + + formats = [{ + 'url': data['media_url'], + 'format_id': 'main', + }, { + 'url': data['media_url_libsyn'], + 'format_id': 'libsyn', + }] + thumbnail = data.get('thumbnail_url') + duration = parse_duration(data.get('duration')) + return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': release_date, + 'duration': duration, 'formats': formats, } From 328ddf56a151830ae002842b7088464e4e391b5d Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Wed, 21 Mar 2018 12:13:31 +0530 Subject: [PATCH 026/148] [Youku] Update ccode --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 5b0b248cd..2f5a7b023 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0507', + 'ccode': '0590', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From cba5d1b6b36d79fcafe0600d9805e6b82ed5388f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Mar 2018 23:43:03 +0700 Subject: [PATCH 027/148] [instagram:user] Add pagination (closes #15934) --- youtube_dl/extractor/instagram.py | 96 +++++++++++++++++++------------ 1 file changed, 59 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ac9d92a8d..f9cd11b8e 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import itertools import json import re @@ -242,48 +243,69 @@ class InstagramUserIE(InfoExtractor): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) - edges = self._download_json( - 'https://www.instagram.com/graphql/query/', uploader_id, query={ - 'query_hash': '472f257a40c653c64c666ce877d59d2b', - 'variables': json.dumps({ - 'id': uploader_id, - 'first': 999999999, + cursor = '' + for page_num in itertools.count(1): + media = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, + 'Downloading JSON page %d' % page_num, query={ + 'query_hash': '472f257a40c653c64c666ce877d59d2b', + 'variables': json.dumps({ + 'id': uploader_id, + 'first': 100, + 'after': cursor, + }) + })['data']['user']['edge_owner_to_timeline_media'] + + edges = media.get('edges') + if not edges or not isinstance(edges, list): + break + + for edge in edges: + node = edge.get('node') + if not node or not isinstance(node, dict): + continue + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('shortcode') + if not video_id: + continue + + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) + + description = try_get( + node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('taken_at_timestamp')) + + comment_count = get_count('to_comment') + like_count = get_count('preview_like') + view_count = int_or_none(node.get('video_view_count')) + + info.update({ + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, }) - })['data']['user']['edge_owner_to_timeline_media']['edges'] - for edge in edges: - node = edge['node'] + yield info - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue + page_info = media.get('page_info') + if not page_info or not isinstance(page_info, dict): + break - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) + has_next_page = page_info.get('has_next_page') + if not has_next_page: + break - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info + cursor = page_info.get('end_cursor') + if not cursor or not isinstance(cursor, compat_str): + break def _real_extract(self, url): username = self._match_id(url) From 8b7340a45eb0e3aeaa996896ff8690b6c3a32af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 22 Mar 2018 22:55:28 +0700 Subject: [PATCH 028/148] [lenta] Add extractor (closes #15953) --- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 18 ---------- youtube_dl/extractor/lenta.py | 53 ++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 20 deletions(-) create mode 100644 youtube_dl/extractor/lenta.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3bde40eb3..de48a37ad 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -532,13 +532,14 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE -from .lego import LEGOIE -from .lemonde import LemondeIE from .leeco import ( LeIE, LePlaylistIE, LetvCloudIE, ) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE from .libraryofcongress import LibraryOfCongressIE from .libsyn import LibsynIE from .lifenews import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1cc491b19..cf64398e3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1270,24 +1270,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # EaglePlatform embed (generic URL) - { - 'url': 'http://lenta.ru/news/2015/03/06/navalny/', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used - 'info_dict': { - 'id': '227304', - 'ext': 'mp4', - 'title': 'Навальный вышел на свободу', - 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 87, - 'view_count': int, - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - }, # referrer protected EaglePlatform embed { 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', diff --git a/youtube_dl/extractor/lenta.py b/youtube_dl/extractor/lenta.py new file mode 100644 index 000000000..2ebd4e577 --- /dev/null +++ b/youtube_dl/extractor/lenta.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LentaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/', + 'info_dict': { + 'id': '964400', + 'ext': 'mp4', + 'title': 'Надежду Савченко задержали', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 61, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + # EaglePlatform iframe embed + 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id', + default=None) + if video_id: + return self.url_result( + 'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id, + ie='EaglePlatform', video_id=video_id) + + return self.url_result(url, ie='Generic') From b9f5a41207bc704cca9e9e357f79d525828a39b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Mar 2018 23:53:18 +0700 Subject: [PATCH 029/148] [crackle] Fix extraction (closes #15969) --- youtube_dl/extractor/crackle.py | 207 ++++++++++++++++---------------- 1 file changed, 102 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 13f425b2b..57d84241a 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,31 +1,41 @@ # coding: utf-8 from __future__ import unicode_literals, division +import re + from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + parse_age_limit, + parse_duration, +) class CrackleIE(InfoExtractor): _GEO_COUNTRIES = ['US'] _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' _TEST = { - 'url': 'http://www.crackle.com/comedians-in-cars-getting-coffee/2498934', + 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { - 'id': '2498934', + 'id': '2502343', 'ext': 'mp4', - 'title': 'Everybody Respects A Bloody Nose', - 'description': 'Jerry is kaffeeklatsching in L.A. with funnyman J.B. Smoove (Saturday Night Live, Real Husbands of Hollywood). They’re headed for brew at 10 Speed Coffee in a 1964 Studebaker Avanti.', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 906, - 'series': 'Comedians In Cars Getting Coffee', - 'season_number': 8, - 'episode_number': 4, - 'subtitles': { - 'en-US': [ - {'ext': 'vtt'}, - {'ext': 'tt'}, - ] - }, + 'title': 'Under The Night', + 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', + 'duration': 2583, + 'view_count': int, + 'average_rating': 0, + 'age_limit': 14, + 'genre': 'Action, Sci-Fi', + 'creator': 'Allan Kroeker', + 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', + 'release_year': 2000, + 'series': 'Andromeda', + 'episode': 'Under The Night', + 'season_number': 1, + 'episode_number': 1, }, 'params': { # m3u8 download @@ -33,108 +43,95 @@ class CrackleIE(InfoExtractor): } } - _THUMBNAIL_RES = [ - (120, 90), - (208, 156), - (220, 124), - (220, 220), - (240, 180), - (250, 141), - (315, 236), - (320, 180), - (360, 203), - (400, 300), - (421, 316), - (460, 330), - (460, 460), - (462, 260), - (480, 270), - (587, 330), - (640, 480), - (700, 330), - (700, 394), - (854, 480), - (1024, 1024), - (1920, 1080), - ] - - # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx - _MEDIA_FILE_SLOTS = { - 'c544.flv': { - 'width': 544, - 'height': 306, - }, - '360p.mp4': { - 'width': 640, - 'height': 360, - }, - '480p.mp4': { - 'width': 852, - 'height': 478, - }, - '480p_1mbps.mp4': { - 'width': 852, - 'height': 478, - }, - } - def _real_extract(self, url): video_id = self._match_id(url) - config_doc = self._download_xml( - 'http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx?site=16', - video_id, 'Downloading config') + media = self._download_json( + 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s' + % (video_id, self._GEO_COUNTRIES[0]), video_id, query={ + 'disableProtocols': 'true', + 'format': 'json' + }) - item = self._download_xml( - 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, - video_id, headers=self.geo_verification_headers()).find('i') - title = item.attrib['t'] + title = media['Title'] + + formats = [] + for e in media['MediaURLs']: + if e.get('UseDRM') is True: + continue + format_url = e.get('Path') + if not format_url or not isinstance(format_url, compat_str): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + description = media.get('Description') + duration = int_or_none(media.get( + 'DurationInSeconds')) or parse_duration(media.get('Duration')) + view_count = int_or_none(media.get('CountViews')) + average_rating = float_or_none(media.get('UserRating')) + age_limit = parse_age_limit(media.get('Rating')) + genre = media.get('Genre') + release_year = int_or_none(media.get('ReleaseYear')) + creator = media.get('Directors') + artist = media.get('Cast') + + if media.get('MediaTypeDisplayValue') == 'Full Episode': + series = media.get('ShowName') + episode = title + season_number = int_or_none(media.get('Season')) + episode_number = int_or_none(media.get('Episode')) + else: + series = episode = season_number = episode_number = None subtitles = {} - formats = self._extract_m3u8_formats( - 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), - video_id, 'mp4', m3u8_id='hls', fatal=None) + cc_files = media.get('ClosedCaptionFiles') + if isinstance(cc_files, list): + for cc_file in cc_files: + if not isinstance(cc_file, dict): + continue + cc_url = cc_file.get('Path') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = cc_file.get('Locale') or 'en' + subtitles.setdefault(lang, []).append({'url': cc_url}) + thumbnails = [] - path = item.attrib.get('p') - if path: - for width, height in self._THUMBNAIL_RES: - res = '%dx%d' % (width, height) + images = media.get('Images') + if isinstance(images, list): + for image_key, image_url in images.items(): + mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key) + if not mobj: + continue thumbnails.append({ - 'id': res, - 'url': 'http://images-us-am.crackle.com/%stnl_%s.jpg' % (path, res), - 'width': width, - 'height': height, - 'resolution': res, + 'url': image_url, + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), }) - http_base_url = 'http://ahttp.crackle.com/' + path - for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): - formats.append({ - 'url': http_base_url + mfs_path, - 'format_id': 'http-' + mfs_path.split('.')[0], - 'width': mfs_info['width'], - 'height': mfs_info['height'], - }) - for cc in item.findall('cc'): - locale = cc.attrib.get('l') - v = cc.attrib.get('v') - if locale and v: - if locale not in subtitles: - subtitles[locale] = [] - for url_ext, ext in (('vtt', 'vtt'), ('xml', 'tt')): - subtitles.setdefault(locale, []).append({ - 'url': '%s/%s%s_%s.%s' % (config_doc.attrib['strSubtitleServer'], path, locale, v, url_ext), - 'ext': ext, - }) - self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) return { 'id': video_id, 'title': title, - 'description': item.attrib.get('d'), - 'duration': int(item.attrib.get('r'), 16) / 1000 if item.attrib.get('r') else None, - 'series': item.attrib.get('sn'), - 'season_number': int_or_none(item.attrib.get('se')), - 'episode_number': int_or_none(item.attrib.get('ep')), + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'genre': genre, + 'creator': creator, + 'artist': artist, + 'release_year': release_year, + 'series': series, + 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, From 7d34016fb0c7ef489f382bd106dcfedf401b617f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Mar 2018 01:49:50 +0700 Subject: [PATCH 030/148] [crackle] Bypass geo restriction --- youtube_dl/extractor/crackle.py | 192 ++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 83 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 57d84241a..fc014f8b5 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -4,20 +4,24 @@ from __future__ import unicode_literals, division import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( determine_ext, float_or_none, int_or_none, parse_age_limit, parse_duration, + ExtractorError ) class CrackleIE(InfoExtractor): - _GEO_COUNTRIES = ['US'] _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' _TEST = { + # geo restricted to CA 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { 'id': '2502343', @@ -46,93 +50,115 @@ class CrackleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - media = self._download_json( - 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s' - % (video_id, self._GEO_COUNTRIES[0]), video_id, query={ - 'disableProtocols': 'true', - 'format': 'json' - }) + country_code = self._downloader.params.get('geo_bypass_country', None) + countries = [country_code] if country_code else ( + 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI') - title = media['Title'] + last_e = None - formats = [] - for e in media['MediaURLs']: - if e.get('UseDRM') is True: + for country in countries: + try: + media = self._download_json( + 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s' + % (video_id, country), video_id, + 'Downloading media JSON as %s' % country, + 'Unable to download media JSON', query={ + 'disableProtocols': 'true', + 'format': 'json' + }) + except ExtractorError as e: + # 401 means geo restriction, trying next country + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + last_e = e + continue + raise + + media_urls = media.get('MediaURLs') + if not media_urls or not isinstance(media_urls, list): continue - format_url = e.get('Path') - if not format_url or not isinstance(format_url, compat_str): - continue - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - description = media.get('Description') - duration = int_or_none(media.get( - 'DurationInSeconds')) or parse_duration(media.get('Duration')) - view_count = int_or_none(media.get('CountViews')) - average_rating = float_or_none(media.get('UserRating')) - age_limit = parse_age_limit(media.get('Rating')) - genre = media.get('Genre') - release_year = int_or_none(media.get('ReleaseYear')) - creator = media.get('Directors') - artist = media.get('Cast') + title = media['Title'] - if media.get('MediaTypeDisplayValue') == 'Full Episode': - series = media.get('ShowName') - episode = title - season_number = int_or_none(media.get('Season')) - episode_number = int_or_none(media.get('Episode')) - else: - series = episode = season_number = episode_number = None - - subtitles = {} - cc_files = media.get('ClosedCaptionFiles') - if isinstance(cc_files, list): - for cc_file in cc_files: - if not isinstance(cc_file, dict): + formats = [] + for e in media['MediaURLs']: + if e.get('UseDRM') is True: continue - cc_url = cc_file.get('Path') - if not cc_url or not isinstance(cc_url, compat_str): + format_url = e.get('Path') + if not format_url or not isinstance(format_url, compat_str): continue - lang = cc_file.get('Locale') or 'en' - subtitles.setdefault(lang, []).append({'url': cc_url}) + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) - thumbnails = [] - images = media.get('Images') - if isinstance(images, list): - for image_key, image_url in images.items(): - mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key) - if not mobj: - continue - thumbnails.append({ - 'url': image_url, - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - }) + description = media.get('Description') + duration = int_or_none(media.get( + 'DurationInSeconds')) or parse_duration(media.get('Duration')) + view_count = int_or_none(media.get('CountViews')) + average_rating = float_or_none(media.get('UserRating')) + age_limit = parse_age_limit(media.get('Rating')) + genre = media.get('Genre') + release_year = int_or_none(media.get('ReleaseYear')) + creator = media.get('Directors') + artist = media.get('Cast') - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'average_rating': average_rating, - 'age_limit': age_limit, - 'genre': genre, - 'creator': creator, - 'artist': artist, - 'release_year': release_year, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'formats': formats, - } + if media.get('MediaTypeDisplayValue') == 'Full Episode': + series = media.get('ShowName') + episode = title + season_number = int_or_none(media.get('Season')) + episode_number = int_or_none(media.get('Episode')) + else: + series = episode = season_number = episode_number = None + + subtitles = {} + cc_files = media.get('ClosedCaptionFiles') + if isinstance(cc_files, list): + for cc_file in cc_files: + if not isinstance(cc_file, dict): + continue + cc_url = cc_file.get('Path') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = cc_file.get('Locale') or 'en' + subtitles.setdefault(lang, []).append({'url': cc_url}) + + thumbnails = [] + images = media.get('Images') + if isinstance(images, list): + for image_key, image_url in images.items(): + mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key) + if not mobj: + continue + thumbnails.append({ + 'url': image_url, + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'genre': genre, + 'creator': creator, + 'artist': artist, + 'release_year': release_year, + 'series': series, + 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } + + raise last_e From b015cb1af3453c6f27c1b8ebd1916c1fc4f94923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Mar 2018 14:11:27 +0700 Subject: [PATCH 031/148] [24video] Add support for 24video.sexy (closes #15973) --- youtube_dl/extractor/twentyfourvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 96e0b96e3..4b3b3e705 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sexy?|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', From 86e1958944952afbe208101802c90f9a096adea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Mar 2018 14:21:08 +0700 Subject: [PATCH 032/148] [afreecatv] Update referrer (closes #15947) --- youtube_dl/extractor/afreecatv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index df2a3fc4a..0f4535804 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -187,11 +187,11 @@ class AfreecaTVIE(InfoExtractor): r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') video_id = self._search_regex( r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - + print(video_id, station_id, bbs_id) video_xml = self._download_xml( 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', video_id, headers={ - 'Referer': 'http://vod.afreecatv.com/embed.php', + 'Referer': url, }, query={ 'nTitleNo': video_id, 'nStationNo': station_id, From 16132cff7231d591bc4e6e3a12c02f9110d54e11 Mon Sep 17 00:00:00 2001 From: Joseph Spiros Date: Sat, 24 Mar 2018 03:57:34 -0400 Subject: [PATCH 033/148] [vrv] Fix extraction on python2 (closes #15928) --- youtube_dl/extractor/vrv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 9959627c0..64b13f0ed 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -12,7 +12,7 @@ import time from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, - compat_urlparse, + compat_urllib_parse, ) from ..utils import ( float_or_none, @@ -39,11 +39,11 @@ class VRVBaseIE(InfoExtractor): data = json.dumps(data).encode() headers['Content-Type'] = 'application/json' method = 'POST' if data else 'GET' - base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')]) + base_string = '&'.join([method, compat_urllib_parse.quote(base_url, ''), compat_urllib_parse.quote(encoded_query, '')]) oauth_signature = base64.b64encode(hmac.new( (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), base_string.encode(), hashlib.sha1).digest()).decode() - encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '') + encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') return self._download_json( '?'.join([base_url, encoded_query]), video_id, note='Downloading %s JSON metadata' % note, headers=headers, data=data) From 0ff2c1ecb67b61e1410e1d0fe1966a7680e18947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Mar 2018 15:59:48 +0700 Subject: [PATCH 034/148] [downloader/fragment] Fix download finalization when writing file to stdout (closes #15799) --- youtube_dl/downloader/fragment.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index ea5e3a4b5..927c7e491 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -241,12 +241,16 @@ class FragmentFD(FileDownloader): if os.path.isfile(ytdl_filename): os.remove(ytdl_filename) elapsed = time.time() - ctx['started'] - self.try_rename(ctx['tmpfilename'], ctx['filename']) - fsize = os.path.getsize(encodeFilename(ctx['filename'])) + + if ctx['tmpfilename'] == '-': + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + else: + self.try_rename(ctx['tmpfilename'], ctx['filename']) + downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, + 'downloaded_bytes': downloaded_bytes, + 'total_bytes': downloaded_bytes, 'filename': ctx['filename'], 'status': 'finished', 'elapsed': elapsed, From 80aa24609415af36ac30caa392e85f8c20349535 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 8 Oct 2016 09:27:24 -0400 Subject: [PATCH 035/148] [downloader/external] Fix download finalization when writing file to stdout (closes #10809) An OSError or IOError generally indicates something a little more wrong than a "simple" UnavailableVideoError, so print the actual traceback that leads to the exception. Otherwise meaningful postmortem debugging a bug report is essentially infeasible. --- youtube_dl/downloader/common.py | 10 ++++++---- youtube_dl/downloader/external.py | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index cc16bbb83..7062eee8b 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -249,12 +249,14 @@ class FileDownloader(object): if self.params.get('noprogress', False): self.to_screen('[download] Download completed') else: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) + if s.get('total_bytes') is not None: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template = '100%% of %(_total_bytes_str)s' + else: + msg_template = 'Completed' if s.get('elapsed') is not None: s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '100%% of %(_total_bytes_str)s in %(_elapsed_str)s' - else: - msg_template = '100%% of %(_total_bytes_str)s' + msg_template += ' in %(_elapsed_str)s' self._report_progress_status( msg_template % s, is_last_line=True) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index db018fa89..48c255ddc 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -41,15 +41,21 @@ class ExternalFD(FileDownloader): self.to_screen('[%s] Interrupted by user' % self.get_basename()) if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) + if filename == '-': + self._hook_progress({ + 'filename': filename, + 'status': 'finished', + }) + else: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) return True else: self.to_stderr('\n') From 2ea212628e0ffc0d66858817841643c4579c5d9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Mar 2018 16:27:36 +0700 Subject: [PATCH 036/148] [downloader/common] Improve progress reporting when no total bytes available --- youtube_dl/downloader/common.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 7062eee8b..edd125ee2 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -249,11 +249,10 @@ class FileDownloader(object): if self.params.get('noprogress', False): self.to_screen('[download] Download completed') else: + msg_template = '100%%' if s.get('total_bytes') is not None: s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template = '100%% of %(_total_bytes_str)s' - else: - msg_template = 'Completed' + msg_template += ' of %(_total_bytes_str)s' if s.get('elapsed') is not None: s['_elapsed_str'] = self.format_seconds(s['elapsed']) msg_template += ' in %(_elapsed_str)s' From f0298f653e2199d6e4488882e40eea8e31140d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Mar 2018 16:29:03 +0700 Subject: [PATCH 037/148] [downloader/external] Simplify finished progress hook reporting and add elapsed time (closes #10876) --- youtube_dl/downloader/external.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 48c255ddc..958d00aac 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals import os.path +import re import subprocess import sys -import re +import time from .common import FileDownloader from ..compat import ( @@ -30,6 +31,7 @@ class ExternalFD(FileDownloader): tmpfilename = self.temp_name(filename) try: + started = time.time() retval = self._call_downloader(tmpfilename, info_dict) except KeyboardInterrupt: if not info_dict.get('is_live'): @@ -41,21 +43,20 @@ class ExternalFD(FileDownloader): self.to_screen('[%s] Interrupted by user' % self.get_basename()) if retval == 0: - if filename == '-': - self._hook_progress({ - 'filename': filename, - 'status': 'finished', - }) - else: + status = { + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + } + if filename != '-': fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) self.try_rename(tmpfilename, filename) - self._hook_progress({ + status.update({ 'downloaded_bytes': fsize, 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', }) + self._hook_progress(status) return True else: self.to_stderr('\n') From 29d9594561fd92b07d1c2cff04ae5a4c144946b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 22:11:01 +0700 Subject: [PATCH 038/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 0d748316e..d4f442421 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core ++ [downloader/external] Add elapsed time to progress hook (#10876) +* [downloader/external,fragment] Fix download finalization when writing file + to stdout (#10809, #10876, #15799) + +Extractors +* [vrv] Fix extraction on python2 (#15928) +* [afreecatv] Update referrer (#15947) ++ [24video] Add support for 24video.sexy (#15973) +* [crackle] Bypass geo restriction +* [crackle] Fix extraction (#15969) ++ [lenta] Add support for lenta.ru (#15953) ++ [instagram:user] Add pagination (#15934) +* [youku] Update ccode (#15939) +* [libsyn] Adapt to new page structure + + version 2018.03.20 Core From 671e241bfbf5d1954ff07c98e0ba2c3d7c2405c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 05:03:47 +0700 Subject: [PATCH 039/148] release 2018.03.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 75c5b2226..86912f5e7 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.20*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.20** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.20 +[debug] youtube-dl version 2018.03.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d4f442421..0d43b580f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.26 Core + [downloader/external] Add elapsed time to progress hook (#10876) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 80358bb14..0d7d7fbb3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -419,6 +419,7 @@ - **Lecture2Go** - **LEGO** - **Lemonde** + - **Lenta** - **LePlaylist** - **LetvCloud**: 乐视云 - **Libsyn** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c686714f0..d6d87ad74 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.20' +__version__ = '2018.03.26' From c3cfc71a0c822c86a01ad9c150415724d0b2b045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 22:30:11 +0700 Subject: [PATCH 040/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 0d43b580f..d4f442421 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version 2018.03.26 +version Core + [downloader/external] Add elapsed time to progress hook (#10876) From bbd9d8c17075055ddfd9873092a29a3e21566805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Mar 2018 22:32:03 +0700 Subject: [PATCH 041/148] release 2018.03.26.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 86912f5e7..0cd090e40 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.26 +[debug] youtube-dl version 2018.03.26.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d4f442421..f9d04ffd9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.03.26.1 Core + [downloader/external] Add elapsed time to progress hook (#10876) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d6d87ad74..d38fde039 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.26' +__version__ = '2018.03.26.1' From 99c3091850118d08c14c78f5cc6ab5ce73f4196a Mon Sep 17 00:00:00 2001 From: Attila-Mihaly Balazs Date: Tue, 27 Mar 2018 18:02:04 +0300 Subject: [PATCH 042/148] [videa] Extend _VALID_URL --- youtube_dl/extractor/videa.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index 311df58f4..d0e34c819 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -16,7 +16,7 @@ from ..utils import ( class VideaIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - videa\.hu/ + videa(?:kid)?\.hu/ (?: videok/(?:[^/]+/)*[^?#&]+-| player\?.*?\bv=| @@ -31,7 +31,7 @@ class VideaIE(InfoExtractor): 'id': '8YfIAjxwWGwT8HVQ', 'ext': 'mp4', 'title': 'Az őrült kígyász 285 kígyót enged szabadon', - 'thumbnail': 'http://videa.hu/static/still/1.4.1.1007274.1204470.3', + 'thumbnail': r're:^https?://.*', 'duration': 21, }, }, { @@ -43,6 +43,15 @@ class VideaIE(InfoExtractor): }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, + }, { + 'url': 'https://videakid.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player?v=8YfIAjxwWGwT8HVQ', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, }] @staticmethod From 9e6a4180158026e78f65563d0586923fef8ccece Mon Sep 17 00:00:00 2001 From: xofe <22776566+xofe@users.noreply.github.com> Date: Tue, 27 Mar 2018 15:08:40 +0000 Subject: [PATCH 043/148] [abc:iview] Unescape title and series meta fields --- youtube_dl/extractor/abc.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 87017ed39..512f04684 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -13,6 +13,7 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + unescapeHTML, update_url_query, ) @@ -109,16 +110,17 @@ class ABCIViewIE(InfoExtractor): # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'http://iview.abc.net.au/programs/call-the-midwife/ZW0898A003S00', + 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZW0898A003S00', + 'id': 'ZY9247A021S00', 'ext': 'mp4', - 'title': 'Series 5 Ep 3', - 'description': 'md5:e0ef7d4f92055b86c4f33611f180ed79', - 'upload_date': '20171228', - 'uploader_id': 'abc1', - 'timestamp': 1514499187, + 'title': "Gaston's Visit", + 'series': "Ben And Holly's Little Kingdom", + 'description': 'md5:18db170ad71cf161e006a4c688e33155', + 'upload_date': '20180318', + 'uploader_id': 'abc4kids', + 'timestamp': 1521400959, }, 'params': { 'skip_download': True, @@ -169,12 +171,12 @@ class ABCIViewIE(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': unescapeHTML(title), 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), - 'series': video_params.get('seriesTitle'), + 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), 'episode': self._html_search_meta('episode_title', webpage, default=None), From 5d60b9971784289acd4325a8ed7b5afd7bea05ca Mon Sep 17 00:00:00 2001 From: "Arend v. Reinersdorff" Date: Tue, 27 Mar 2018 17:25:29 +0200 Subject: [PATCH 044/148] [options] Mention comments support in --batch-file --- youtube_dl/options.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 7d1bbc021..3e4ac03a2 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -676,7 +676,8 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '-a', '--batch-file', dest='batchfile', metavar='FILE', - help='File containing URLs to download (\'-\' for stdin)') + help="File containing URLs to download ('-' for stdin), one URL per line. " + "Lines starting with '#', ';' or ']' are considered as comments and ignored.") filesystem.add_option( '--id', default=False, action='store_true', dest='useid', help='Use only video ID in file name') From 02f6ccbce3a50d8db3eac06a5820347cf674ca86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 29 Mar 2018 23:06:13 +0700 Subject: [PATCH 045/148] [dramafever] Partially switch to API v5 (closes #16026) --- youtube_dl/extractor/dramafever.py | 156 +++++++++++++++++++---------- 1 file changed, 102 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 6b60e542b..c7a048f9d 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,25 +3,26 @@ from __future__ import unicode_literals import itertools -from .amp import AMPIE +from .common import InfoExtractor from ..compat import ( - compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( - ExtractorError, clean_html, + ExtractorError, int_or_none, - remove_end, + parse_age_limit, + parse_duration, sanitized_Request, + unified_timestamp, urlencode_postdata ) -class DramaFeverBaseIE(AMPIE): +class DramaFeverBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' - _GEO_COUNTRIES = ['US', 'CA'] _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -70,18 +71,20 @@ class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P[0-9]+/[0-9]+)(?:/|$)' _TESTS = [{ - 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', + 'url': 'https://www.dramafever.com/drama/4274/1/Heirs/', 'info_dict': { - 'id': '4512.1', - 'ext': 'flv', - 'title': 'Cooking with Shin', - 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'id': '4274.1', + 'ext': 'wvm', + 'title': 'Heirs - Episode 1', + 'description': 'md5:362a24ba18209f6276e032a651c50bc2', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3783, + 'timestamp': 1381354993, + 'upload_date': '20131009', + 'series': 'Heirs', + 'season_number': 1, 'episode': 'Episode 1', 'episode_number': 1, - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1404336058, - 'upload_date': '20140702', - 'duration': 344, }, 'params': { # m3u8 download @@ -110,50 +113,95 @@ class DramaFeverIE(DramaFeverBaseIE): 'only_matching': True, }] + def _call_api(self, path, video_id, note, fatal=False): + return self._download_json( + 'https://www.dramafever.com/api/5/' + path, + video_id, note=note, headers={ + 'x-consumer-key': self._consumer_secret, + }, fatal=fatal) + + def _get_subtitles(self, video_id): + subtitles = {} + subs = self._call_api( + 'video/%s/subtitles/webvtt/' % video_id, video_id, + 'Downloading subtitles JSON', fatal=False) + if not subs or not isinstance(subs, list): + return subtitles + for sub in subs: + if not isinstance(sub, dict): + continue + sub_url = sub.get('url') + if not sub_url or not isinstance(sub_url, compat_str): + continue + subtitles.setdefault( + sub.get('code') or sub.get('language') or 'en', []).append({ + 'url': sub_url + }) + return subtitles + def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') - try: - info = self._extract_feed_info( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - self.raise_geo_restricted( - msg='Currently unavailable in your country', - countries=self._GEO_COUNTRIES) - raise - - # title is postfixed with video id for some reason, removing - if info.get('title'): - info['title'] = remove_end(info['title'], video_id).strip() - series_id, episode_number = video_id.split('.') - episode_info = self._download_json( - # We only need a single episode info, so restricting page size to one episode - # and dealing with page number as with episode number - r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1' - % (self._consumer_secret, series_id, episode_number), - video_id, 'Downloading episode info JSON', fatal=False) - if episode_info: - value = episode_info.get('value') - if isinstance(value, list): - for v in value: - if v.get('type') == 'Episode': - subfile = v.get('subfile') or v.get('new_subfile') - if subfile and subfile != 'http://www.dramafever.com/st/': - info.setdefault('subtitles', {}).setdefault('English', []).append({ - 'ext': 'srt', - 'url': subfile, - }) - episode_number = int_or_none(v.get('number')) - episode_fallback = 'Episode' - if episode_number: - episode_fallback += ' %d' % episode_number - info['episode'] = v.get('title') or episode_fallback - info['episode_number'] = episode_number - break - return info + video = self._call_api( + 'series/%s/episodes/%s/' % (series_id, episode_number), video_id, + 'Downloading video JSON') + + formats = [] + download_assets = video.get('download_assets') + if download_assets and isinstance(download_assets, dict): + for format_id, format_dict in download_assets.items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url or not isinstance(format_url, compat_str): + continue + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'filesize': int_or_none(video.get('filesize')), + }) + + stream = self._call_api( + 'video/%s/stream/' % video_id, video_id, 'Downloading stream JSON', + fatal=False) + if stream: + stream_url = stream.get('stream_url') + if stream_url: + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + title = video.get('title') or 'Episode %s' % episode_number + description = video.get('description') + thumbnail = video.get('thumbnail') + timestamp = unified_timestamp(video.get('release_date')) + duration = parse_duration(video.get('duration')) + age_limit = parse_age_limit(video.get('tv_rating')) + series = video.get('series_title') + season_number = int_or_none(video.get('season')) + + if series: + title = '%s - %s' % (series, title) + + subtitles = self.extract_subtitles(video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode_number': int_or_none(episode_number), + 'formats': formats, + 'subtitles': subtitles, + } class DramaFeverSeriesIE(DramaFeverBaseIE): From 190f6c936be0ec03ed999cbf34e73f38c9beb022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 29 Mar 2018 23:49:09 +0700 Subject: [PATCH 046/148] [naver] Fix extraction (closes #16029) --- youtube_dl/extractor/naver.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 2047d4402..bb3d94413 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -43,9 +41,14 @@ class NaverIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m_id = re.search(r'var rmcPlayer = new nhn\.rmcnmv\.RMCVideoPlayer\("(.+?)", "(.+?)"', - webpage) - if m_id is None: + vid = self._search_regex( + r'videoId["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'video id', fatal=None, group='value') + in_key = self._search_regex( + r'inKey["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'key', default=None, group='value') + + if not vid or not in_key: error = self._html_search_regex( r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', webpage, 'error', default=None) @@ -53,9 +56,9 @@ class NaverIE(InfoExtractor): raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') video_data = self._download_json( - 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, video_id, query={ - 'key': m_id.group(2), + 'key': in_key, }) meta = video_data['meta'] title = meta['subject'] From 3e78d23b5783d01f60bcb515febd5a590a734ee4 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Fri, 30 Mar 2018 18:25:43 +0200 Subject: [PATCH 047/148] [openload] Add support for oload.site --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index eaaaf8a08..af7db6e12 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', From 0b4bbcdcb6f62e080e70c026eb28a5e92f46dfc8 Mon Sep 17 00:00:00 2001 From: kenavera Date: Sat, 31 Mar 2018 17:14:49 +0200 Subject: [PATCH 048/148] [medialaan] Fix vod id --- youtube_dl/extractor/medialaan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index f8c30052f..50d5db802 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -141,6 +141,7 @@ class MedialaanIE(GigyaBaseIE): vod_id = config.get('vodId') or self._search_regex( (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', + r'"vodId"\s*:\s*"(.+?)"', r'<[^>]+id=["\']vod-(\d+)'), webpage, 'video_id', default=None) From 0669f8fd8f19fbe0783974654fc2a6925d6162b0 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 31 Mar 2018 11:46:08 -0500 Subject: [PATCH 049/148] [xvideos] Fix thumbnail extraction (closes #15978) --- youtube_dl/extractor/xvideos.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 085c8d4f3..efee95651 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -58,7 +58,9 @@ class XVideosIE(InfoExtractor): group='title') or self._og_search_title(webpage) thumbnail = self._search_regex( - r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + (r'setThumbUrl\(\s*(["\'])(?P(?:(?!\1).)+)\1', + r'url_bigthumb=(?P.+?)&'), + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = int_or_none(self._og_search_property( 'duration', webpage, default=None)) or parse_duration( self._search_regex( From 95a1322bc10687efac0b00fb3fd55708e556baf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Apr 2018 02:06:14 +0700 Subject: [PATCH 050/148] [bilibili] Remove debug from player params regexes --- youtube_dl/extractor/bilibili.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 90697c4a7..3e3348ef5 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -117,9 +117,9 @@ class BiliBiliIE(InfoExtractor): r'cid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( - [r'1EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r'1EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', - r'1]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', + r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters'))['cid'][0] else: if 'no_bangumi_tip' not in smuggled_data: From 03fcde10ced29291268f39cb8ccf7ee5dd40f676 Mon Sep 17 00:00:00 2001 From: kenavera Date: Sun, 1 Apr 2018 16:22:51 +0200 Subject: [PATCH 051/148] [nationalgeographic] Add support for new URL schema (closes #16001) --- youtube_dl/extractor/nationalgeographic.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 246f6795a..4d2ee6408 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -68,11 +68,11 @@ class NationalGeographicVideoIE(InfoExtractor): class NationalGeographicIE(ThePlatformIE, AdobePassIE): IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:wild/)?[^/]+/)?(?:videos|episodes)/(?P[^/?]+)' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P[^/?]+)' _TESTS = [ { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/', 'md5': '518c9aa655686cf81493af5cc21e2a04', 'info_dict': { 'id': 'vKInpacll2pC', @@ -86,7 +86,7 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): 'add_ie': ['ThePlatform'], }, { - 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/', 'md5': 'c4912f656b4cbe58f3e000c489360989', 'info_dict': { 'id': 'Pok5lWCkiEFA', @@ -106,6 +106,14 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): { 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'only_matching': True, } ] From e51762be19289da50977fd6f2d0ee2a1722765a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Apr 2018 22:47:39 +0700 Subject: [PATCH 052/148] [afreecatv] Add support for authentication (#14450) --- youtube_dl/extractor/afreecatv.py | 47 +++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 0f4535804..bb3728bb0 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + urlencode_postdata, xpath_text, ) @@ -28,6 +29,7 @@ class AfreecaTVIE(InfoExtractor): ) (?P\d+) ''' + _NETRC_MACHINE = 'afreecatv' _TESTS = [{ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -172,6 +174,51 @@ class AfreecaTVIE(InfoExtractor): video_key['part'] = int(m.group('part')) return video_key + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_form = { + 'szWork': 'login', + 'szType': 'json', + 'szUid': username, + 'szPassword': password, + 'isSaveId': 'false', + 'szScriptVar': 'oLoginRet', + 'szAction': '', + } + + response = self._download_json( + 'https://login.afreecatv.com/app/LoginAction.php', None, + 'Logging in', data=urlencode_postdata(login_form)) + + _ERRORS = { + -4: 'Your account has been suspended due to a violation of our terms and policies.', + -5: 'https://member.afreecatv.com/app/user_delete_progress.php', + -6: 'https://login.afreecatv.com/membership/changeMember.php', + -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.afreecatv.com/app/pop_login_block.php', + -11: 'https://login.afreecatv.com/afreeca/second_login.php', + -12: 'https://member.afreecatv.com/app/user_security.php', + 0: 'The username does not exist or you have entered the wrong password.', + -1: 'The username does not exist or you have entered the wrong password.', + -3: 'You have entered your username/password incorrectly.', + -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', + -32008: 'You have failed to log in. Please contact our Help Center.', + } + + result = int_or_none(response.get('RESULT')) + if result != 1: + error = _ERRORS.get(result, 'You have failed to log in.') + raise ExtractorError( + 'Unable to login: %s said: %s' % (self.IE_NAME, error), + expected=True) + def _real_extract(self, url): video_id = self._match_id(url) From d563fb32ba5ef4b1a8061fca27edf3b1ad7eb8fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Apr 2018 23:07:54 +0700 Subject: [PATCH 053/148] [afreecatv] Remove debug output --- youtube_dl/extractor/afreecatv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index bb3728bb0..095e6204f 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -234,7 +234,7 @@ class AfreecaTVIE(InfoExtractor): r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') video_id = self._search_regex( r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - print(video_id, station_id, bbs_id) + video_xml = self._download_xml( 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', video_id, headers={ From 86693c4930b98e8df33736d87361400422b1adab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Apr 2018 00:00:45 +0700 Subject: [PATCH 054/148] [afreecatv] Use partial view only when necessary (closes #14450) --- youtube_dl/extractor/afreecatv.py | 56 +++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 095e6204f..4b3d97136 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -141,22 +141,22 @@ class AfreecaTVIE(InfoExtractor): 'skip_download': True, }, }, { - # adult video - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/26542731', + # PARTIAL_ADULT + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', 'info_dict': { - 'id': '20171001_F1AE1711_196617479_1', + 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', - 'title': '[생]서아 초심 찾기 방송 (part 1)', + 'title': '[생]빨개요♥ (part 1)', 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'BJ서아', + 'uploader': '[SA]서아', 'uploader_id': 'bjdyrksu', - 'upload_date': '20171001', - 'duration': 3600, - 'age_limit': 18, + 'upload_date': '20180327', + 'duration': 3601, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['adult content'], }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, @@ -235,21 +235,41 @@ class AfreecaTVIE(InfoExtractor): video_id = self._search_regex( r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - video_xml = self._download_xml( - 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, headers={ - 'Referer': url, - }, query={ + partial_view = False + for _ in range(2): + query = { 'nTitleNo': video_id, 'nStationNo': station_id, 'nBbsNo': bbs_id, - 'partialView': 'SKIP_ADULT', - }) + } + if partial_view: + query['partialView'] = 'SKIP_ADULT' + video_xml = self._download_xml( + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, 'Downloading video info XML%s' + % (' (skipping adult)' if partial_view else ''), + video_id, headers={ + 'Referer': url, + }, query=query) - flag = xpath_text(video_xml, './track/flag', 'flag', default=None) - if flag and flag != 'SUCCEED': + flag = xpath_text(video_xml, './track/flag', 'flag', default=None) + if flag and flag == 'SUCCEED': + break + if flag == 'PARTIAL_ADULT': + self._downloader.report_warning( + 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' + 'Only content suitable for all ages will be downloaded. ' + 'Provide account credentials if you wish to download restricted content.') + partial_view = True + continue + elif flag == 'ADULT': + error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' + else: + error = flag raise ExtractorError( - '%s said: %s' % (self.IE_NAME, flag), expected=True) + '%s said: %s' % (self.IE_NAME, error), expected=True) + else: + raise ExtractorError('Unable to download video info') video_element = video_xml.findall(compat_xpath('./track/video'))[-1] if video_element is None or video_element.text is None: From 8bd1df3c316970f15662831c28311560884356a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Apr 2018 22:19:42 +0700 Subject: [PATCH 055/148] [dramafever] Fix authentication (closes #16067) --- youtube_dl/extractor/dramafever.py | 41 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index c7a048f9d..ffbd2623d 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals import itertools +import json from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_str, compat_urlparse, ) @@ -14,14 +16,11 @@ from ..utils import ( int_or_none, parse_age_limit, parse_duration, - sanitized_Request, unified_timestamp, - urlencode_postdata ) class DramaFeverBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -39,8 +38,8 @@ class DramaFeverBaseIE(InfoExtractor): 'consumer secret', default=self._CONSUMER_SECRET) def _real_initialize(self): - self._login() self._consumer_secret = self._get_consumer_secret() + self._login() def _login(self): (username, password) = self._get_login_info() @@ -52,19 +51,29 @@ class DramaFeverBaseIE(InfoExtractor): 'password': password, } - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - response = self._download_webpage( - request, None, 'Logging in') + try: + response = self._download_json( + 'https://www.dramafever.com/api/users/login', None, 'Logging in', + data=json.dumps(login_form).encode('utf-8'), headers={ + 'x-consumer-key': self._consumer_secret, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (403, 404): + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + else: + raise - if all(logout_pattern not in response - for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): - error = self._html_search_regex( - r'(?s)]+\bclass="hidden-xs prompt"[^>]*>(.+?) Date: Mon, 12 Mar 2018 08:57:41 +0100 Subject: [PATCH 056/148] [tvnow] Add support for shows --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tvnow.py | 73 +++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de48a37ad..e3a67cc5b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1136,6 +1136,7 @@ from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, TVNowListIE, + TVNowListChannelIE, ) from .tvp import ( TVPEmbedIE, diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 1bf472444..8e0ac6be5 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -19,7 +19,7 @@ class TVNowBaseIE(InfoExtractor): 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'format.defaultImage169Logo', 'replaceMovieInformation') def _call_api(self, path, video_id, query): return self._download_json( @@ -58,7 +58,7 @@ class TVNowBaseIE(InfoExtractor): duration = parse_duration(info.get('duration')) f = info.get('format', {}) - thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + thumbnail = ('https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % info.get('replaceMovieInformation')) or f.get('defaultImage169Format') or f.get('defaultImage169Logo') return { 'id': video_id, @@ -133,7 +133,27 @@ class TVNowIE(TVNowBaseIE): return self._extract_video(info, display_id) -class TVNowListIE(TVNowBaseIE): +class TVNowListBaseIE(TVNowBaseIE): + def _extend_query(self, show, season, video=None): + fields = [] + fields.extend(show) + fields.extend('formatTabs.%s' % field for field in season) + if video: + fields.extend( + 'formatTabs.formatTabPages.container.movies.%s' % field + for field in video) + + return fields + + def _tvnow_list_info(self, list_id, show_id, fields): + return self._call_api( + 'formats/seo', list_id, query={ + 'fields': ','.join(fields), + 'name': show_id + '.php' + }) + + +class TVNowListIE(TVNowListBaseIE): _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/)list/(?P[^?/#&]+)$' _SHOW_FIELDS = ('title', ) @@ -152,18 +172,7 @@ class TVNowListIE(TVNowBaseIE): def _real_extract(self, url): base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() - fields = [] - fields.extend(self._SHOW_FIELDS) - fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in self._VIDEO_FIELDS) - - list_info = self._call_api( - 'formats/seo', season_id, query={ - 'fields': ','.join(fields), - 'name': show_id + '.php' - }) + list_info = self._tvnow_list_info(season_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS, self._VIDEO_FIELDS)) season = next( season for season in list_info['formatTabs']['items'] @@ -177,8 +186,40 @@ class TVNowListIE(TVNowBaseIE): seo_url = info.get('seoUrl') if not seo_url: continue + entries.append(self.url_result( - base_url + seo_url + '/player', 'TVNow', info.get('id'))) + base_url + seo_url + '/player', 'TVNow', str(info.get('id', seo_url)))) return self.playlist_result( entries, compat_str(season.get('id') or season_id), title) + + +class TVNowListChannelIE(TVNowListBaseIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+))' + + _SHOW_FIELDS = ('id', 'title', ) + _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + + _TESTS = [{ + 'url': 'https://www.tvnow.at/vox/ab-ins-beet', + 'only_matching': 'True', + }] + + @classmethod + def suitable(cls, url): + return False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) else super(TVNowListChannelIE, cls).suitable(url) + + def _real_extract(self, url): + base_url, show_id = re.match(self._VALID_URL, url).groups() + + list_info = self._tvnow_list_info(show_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS)) + + entries = [] + for season_info in list_info['formatTabs']['items']: + season_url = season_info.get('seoheadline') + if not season_url: + continue + entries.append(self.url_result( + base_url + "/list/" + season_url, 'TVNowList', compat_str(season_info.get('id')), season_info.get('headline'))) + + return self.playlist_result(entries) From ea6679fbeb1fb91131022886a0a8697e4c75f07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Apr 2018 00:08:22 +0700 Subject: [PATCH 057/148] [tvnow] Fix issues, simplify and improve (closes #15837) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tvnow.py | 124 +++++++++++++++++++++-------- 2 files changed, 90 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e3a67cc5b..bded6e144 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1136,7 +1136,7 @@ from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, TVNowListIE, - TVNowListChannelIE, + TVNowShowIE, ) from .tvp import ( TVPEmbedIE, diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 8e0ac6be5..808571ece 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, parse_iso8601, parse_duration, + try_get, update_url_query, ) @@ -19,7 +20,7 @@ class TVNowBaseIE(InfoExtractor): 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo', 'replaceMovieInformation') + 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -58,14 +59,22 @@ class TVNowBaseIE(InfoExtractor): duration = parse_duration(info.get('duration')) f = info.get('format', {}) - thumbnail = ('https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % info.get('replaceMovieInformation')) or f.get('defaultImage169Format') or f.get('defaultImage169Logo') + + thumbnails = [{ + 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, + }] + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'timestamp': timestamp, 'duration': duration, 'series': f.get('title'), @@ -77,7 +86,12 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): - _VALID_URL = r'https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P[^/]+)/(?:player|preview)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?P[^/]+)/ + (?!(?:list|jahr)(?:/|$))(?P[^/?\#&]+) + ''' _TESTS = [{ 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', @@ -99,27 +113,30 @@ class TVNowIE(TVNowBaseIE): }, { # rtl2 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', - 'only_matching': 'True', + 'only_matching': True, }, { # rtlnitro 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', - 'only_matching': 'True', + 'only_matching': True, }, { # superrtl 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', - 'only_matching': 'True', + 'only_matching': True, }, { # ntv 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', - 'only_matching': 'True', + 'only_matching': True, }, { # vox 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', - 'only_matching': 'True', + 'only_matching': True, }, { # rtlplus 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', - 'only_matching': 'True', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'only_matching': True, }] def _real_extract(self, url): @@ -134,27 +151,29 @@ class TVNowIE(TVNowBaseIE): class TVNowListBaseIE(TVNowBaseIE): - def _extend_query(self, show, season, video=None): - fields = [] - fields.extend(show) - fields.extend('formatTabs.%s' % field for field in season) - if video: - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in video) + _SHOW_VALID_URL = r'''(?x) + (?P + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?P[^/]+) + ) + ''' - return fields - - def _tvnow_list_info(self, list_id, show_id, fields): + def _extract_list_info(self, display_id, show_id): + fields = list(self._SHOW_FIELDS) + fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) + fields.extend( + 'formatTabs.formatTabPages.container.movies.%s' % field + for field in self._VIDEO_FIELDS) return self._call_api( - 'formats/seo', list_id, query={ + 'formats/seo', display_id, query={ 'fields': ','.join(fields), 'name': show_id + '.php' }) class TVNowListIE(TVNowListBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/)list/(?P[^?/#&]+)$' + _VALID_URL = r'%s/(?:list|jahr)/(?P[^?\#&]+)' % TVNowListBaseIE._SHOW_VALID_URL _SHOW_FIELDS = ('title', ) _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) @@ -167,59 +186,94 @@ class TVNowListIE(TVNowListBaseIE): 'title': '30 Minuten Deutschland - Aktuell', }, 'playlist_mincount': 1, + }, { + 'url': 'https://www.tvnow.de/vox/ab-ins-beet/list/staffel-14', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/2018/3', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if TVNowIE.suitable(url) + else super(TVNowListIE, cls).suitable(url)) + def _real_extract(self, url): base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() - list_info = self._tvnow_list_info(season_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS, self._VIDEO_FIELDS)) + list_info = self._extract_list_info(season_id, show_id) season = next( season for season in list_info['formatTabs']['items'] if season.get('seoheadline') == season_id) - title = '%s - %s' % (list_info['title'], season['headline']) + title = list_info.get('title') + headline = season.get('headline') + if title and headline: + title = '%s - %s' % (title, headline) + else: + title = headline or title entries = [] for container in season['formatTabPages']['items']: - for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: + items = try_get( + container, lambda x: x['container']['movies']['items'], + list) or [] + for info in items: seo_url = info.get('seoUrl') if not seo_url: continue - + video_id = info.get('id') entries.append(self.url_result( - base_url + seo_url + '/player', 'TVNow', str(info.get('id', seo_url)))) + '%s/%s/player' % (base_url, seo_url), TVNowIE.ie_key(), + compat_str(video_id) if video_id else None)) return self.playlist_result( entries, compat_str(season.get('id') or season_id), title) -class TVNowListChannelIE(TVNowListBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+))' +class TVNowShowIE(TVNowListBaseIE): + _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL _SHOW_FIELDS = ('id', 'title', ) _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + _VIDEO_FIELDS = () _TESTS = [{ 'url': 'https://www.tvnow.at/vox/ab-ins-beet', - 'only_matching': 'True', + 'info_dict': { + 'id': 'ab-ins-beet', + 'title': 'Ab ins Beet!', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.tvnow.at/vox/ab-ins-beet/list', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) else super(TVNowListChannelIE, cls).suitable(url) + return (False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) + else super(TVNowShowIE, cls).suitable(url)) def _real_extract(self, url): base_url, show_id = re.match(self._VALID_URL, url).groups() - list_info = self._tvnow_list_info(show_id, show_id, self._extend_query(self._SHOW_FIELDS, self._SEASON_FIELDS)) + list_info = self._extract_list_info(show_id, show_id) entries = [] for season_info in list_info['formatTabs']['items']: season_url = season_info.get('seoheadline') if not season_url: continue + season_id = season_info.get('id') entries.append(self.url_result( - base_url + "/list/" + season_url, 'TVNowList', compat_str(season_info.get('id')), season_info.get('headline'))) + '%s/list/%s' % (base_url, season_url), TVNowListIE.ie_key(), + compat_str(season_id) if season_id else None, + season_info.get('headline'))) - return self.playlist_result(entries) + return self.playlist_result(entries, show_id, list_info.get('title')) From 10f9caec048ca0c7c85a568d1dab12d7d7f7b45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Apr 2018 00:23:03 +0700 Subject: [PATCH 058/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index f9d04ffd9..89c58aba2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version + +Extractors ++ [tvnow] Add support for shows (#15837) +* [dramafever] Fix authentication (#16067) +* [afreecatv] Use partial view only when necessary (#14450) ++ [afreecatv] Add support for authentication (#14450) ++ [nationalgeographic] Add support for new URL schema (#16001, #16054) +* [xvideos] Fix thumbnail extraction (#15978, #15979) +* [medialaan] Fix vod id (#16038) ++ [openload] Add support for oload.site (#16039) +* [naver] Fix extraction (#16029) +* [dramafever] Partially switch to API v5 (#16026) +* [abc:iview] Unescape title and series meta fields (#15994) +* [videa] Extend URL regular expression (#16003) + + version 2018.03.26.1 Core From e8dfecb3842ba54a4260af81e859e487e36eba41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Apr 2018 00:26:11 +0700 Subject: [PATCH 059/148] release 2018.04.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 4 +++- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0cd090e40..99e8acd33 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.03.26.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.03.26.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.03.26.1 +[debug] youtube-dl version 2018.04.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 89c58aba2..89dfbd8b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.04.03 Extractors + [tvnow] Add support for shows (#15837) diff --git a/README.md b/README.md index 7dba5775d..5af0f387b 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Filesystem Options: -a, --batch-file FILE File containing URLs to download ('-' for - stdin) + stdin), one URL per line. Lines starting + with '#', ';' or ']' are considered as + comments and ignored. --id Use only video ID in file name -o, --output TEMPLATE Output filename template, see the "OUTPUT TEMPLATE" for all the info diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0d7d7fbb3..17baac5ab 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -887,6 +887,7 @@ - **TVNoe** - **TVNow** - **TVNowList** + - **TVNowShow** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d38fde039..a3163509c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.03.26.1' +__version__ = '2018.04.03' From fd97fa7bfc59983d315892c26f861842820a9579 Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Fri, 30 Mar 2018 20:02:09 +0200 Subject: [PATCH 060/148] [svtplay:series] Add extractor Related to #11130 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/svt.py | 57 ++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bded6e144..b46a304ac 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1031,6 +1031,7 @@ from .sunporno import SunPornoIE from .svt import ( SVTIE, SVTPlayIE, + SVTPlaylistIE, ) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 48bc4529e..d02fd9450 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -9,6 +9,8 @@ from ..utils import ( dict_get, int_or_none, try_get, + urljoin, + compat_str, ) @@ -189,3 +191,58 @@ class SVTPlayIE(SVTBaseIE): r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) return info_dict + + +class SVTPlaylistIE(InfoExtractor): + IE_DESC = 'SVT Play serie' + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' + IE_NAME = 'svtplay:serie' + _TESTS = [{ + 'url': 'https://www.svtplay.se/rederiet', + 'info_dict': { + 'id': 'rederiet', + 'title': 'Rederiet', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_mincount': 318, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + page = self._download_webpage( + url, video_id, + note='Downloading serie page', + errnote='unable to fetch serie page') + + root_json = self._search_regex( + r'root\[\'__svtplay\'\]\s*=(.+);\n', + page, 'root') + root = self._parse_json(root_json, video_id) + + metadata = root.get('metaData', {}) + related_videos_accordion = root['relatedVideoContent']['relatedVideosAccordion'] + + entries = [] + for season in related_videos_accordion: + videos = season.get('videos') + if not isinstance(videos, list): + continue + + for video in videos: + content_url = video.get('contentUrl') + if not isinstance(content_url, compat_str): + continue + entries.append( + self.url_result( + urljoin(url, content_url), + ie=SVTPlayIE.ie_key(), + video_title=video.get('title') + )) + + return self.playlist_result( + entries, video_id, metadata.get('title'), metadata.get('description')) From b71bb3ba8be711abab4c05527d28c4b5e4552401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 Apr 2018 23:52:00 +0700 Subject: [PATCH 061/148] [svtplay:series] Improve extraction (closes #16059) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/svt.py | 36 ++++++++++++++---------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b46a304ac..c9f60114d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1031,7 +1031,7 @@ from .sunporno import SunPornoIE from .svt import ( SVTIE, SVTPlayIE, - SVTPlaylistIE, + SVTSeriesIE, ) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index d02fd9450..45b4b8bf7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -193,10 +193,8 @@ class SVTPlayIE(SVTBaseIE): return info_dict -class SVTPlaylistIE(InfoExtractor): - IE_DESC = 'SVT Play serie' +class SVTSeriesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' - IE_NAME = 'svtplay:serie' _TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', 'info_dict': { @@ -209,33 +207,28 @@ class SVTPlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPlaylistIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage( - url, video_id, - note='Downloading serie page', - errnote='unable to fetch serie page') + webpage = self._download_webpage( + url, video_id, 'Downloading serie page') - root_json = self._search_regex( - r'root\[\'__svtplay\'\]\s*=(.+);\n', - page, 'root') - root = self._parse_json(root_json, video_id) - - metadata = root.get('metaData', {}) - related_videos_accordion = root['relatedVideoContent']['relatedVideosAccordion'] + root = self._parse_json( + self._search_regex( + r'root\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n', + webpage, 'content', group='json'), + video_id) entries = [] - for season in related_videos_accordion: + for season in root['relatedVideoContent']['relatedVideosAccordion']: videos = season.get('videos') if not isinstance(videos, list): continue - for video in videos: content_url = video.get('contentUrl') - if not isinstance(content_url, compat_str): + if not content_url or not isinstance(content_url, compat_str): continue entries.append( self.url_result( @@ -244,5 +237,10 @@ class SVTPlaylistIE(InfoExtractor): video_title=video.get('title') )) + metadata = root.get('metaData') + if not isinstance(metadata, dict): + metadata = {} + return self.playlist_result( - entries, video_id, metadata.get('title'), metadata.get('description')) + entries, video_id, metadata.get('title'), + metadata.get('description')) From df146eb2827a97da507833c08169d84d708dfb02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 Apr 2018 00:05:09 +0700 Subject: [PATCH 062/148] [svtplay:series] Add support for season URLs --- youtube_dl/extractor/svt.py | 43 ++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 45b4b8bf7..d1d601b1f 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, dict_get, @@ -203,6 +207,14 @@ class SVTSeriesIE(InfoExtractor): 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', }, 'playlist_mincount': 318, + }, { + 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'info_dict': { + 'id': 'rederiet-sasong2', + 'title': 'Rederiet - Säsong 2', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_count': 12, }] @classmethod @@ -210,19 +222,33 @@ class SVTSeriesIE(InfoExtractor): return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): - video_id = self._match_id(url) + series_id = self._match_id(url) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + season_slug = qs.get('tab', [None])[0] + + if season_slug: + series_id += '-%s' % season_slug webpage = self._download_webpage( - url, video_id, 'Downloading serie page') + url, series_id, 'Downloading series page') root = self._parse_json( self._search_regex( r'root\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n', webpage, 'content', group='json'), - video_id) + series_id) + + season_name = None entries = [] for season in root['relatedVideoContent']['relatedVideosAccordion']: + if not isinstance(season, dict): + continue + if season_slug: + if season.get('slug') != season_slug: + continue + season_name = season.get('name') videos = season.get('videos') if not isinstance(videos, list): continue @@ -241,6 +267,13 @@ class SVTSeriesIE(InfoExtractor): if not isinstance(metadata, dict): metadata = {} + title = metadata.get('title') + season_name = season_name or season_slug + + if title and season_name: + title = '%s - %s' % (title, season_name) + elif season_slug: + title = season_slug + return self.playlist_result( - entries, video_id, metadata.get('title'), - metadata.get('description')) + entries, series_id, title, metadata.get('description')) From 1236ac6b0bc5ef49e4065ddfc310d15651633093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 Apr 2018 00:28:36 +0700 Subject: [PATCH 063/148] [svtplay] Share svtplay regex --- youtube_dl/extractor/svt.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index d1d601b1f..b544da414 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -128,7 +128,11 @@ class SVTIE(SVTBaseIE): return info_dict -class SVTPlayIE(SVTBaseIE): +class SVTPlayBaseIE(SVTBaseIE): + _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n' + + +class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P[0-9]+)' _TESTS = [{ @@ -163,8 +167,8 @@ class SVTPlayIE(SVTBaseIE): data = self._parse_json( self._search_regex( - r'root\["__svtplay"\]\s*=\s*([^;]+);', - webpage, 'embedded data', default='{}'), + self._SVTPLAY_RE, webpage, 'embedded data', default='{}', + group='json'), video_id, fatal=False) thumbnail = self._og_search_thumbnail(webpage) @@ -197,7 +201,7 @@ class SVTPlayIE(SVTBaseIE): return info_dict -class SVTSeriesIE(InfoExtractor): +class SVTSeriesIE(SVTPlayBaseIE): _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', @@ -235,8 +239,7 @@ class SVTSeriesIE(InfoExtractor): root = self._parse_json( self._search_regex( - r'root\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P{.+?})\s*;\s*\n', - webpage, 'content', group='json'), + self._SVTPLAY_RE, webpage, 'content', group='json'), series_id) season_name = None From 235d828b7b113f22309a0f30048678baea210620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 Apr 2018 23:49:15 +0700 Subject: [PATCH 064/148] [openload] Fix extraction (closes #16099) --- youtube_dl/extractor/openload.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index af7db6e12..3e0a7a9a2 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -334,10 +334,11 @@ class OpenloadIE(InfoExtractor): decoded_id = (get_element_by_id('streamurl', webpage) or get_element_by_id('streamuri', webpage) or - get_element_by_id('streamurj', webpage)) - - if not decoded_id: - raise ExtractorError('Can\'t find stream URL', video_id=video_id) + get_element_by_id('streamurj', webpage) or + self._search_regex( + (r'>\s*([\da-zA-Z]+~\d{10,}~\d+\.\d+\.0\.0~[\da-zA-Z]+)\s*<', + r'>\s*([\w~]+~\d+\.\d+\.\d+\.\d+~[\w~]+)'), webpage, + 'stream URL')) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From fdfb32a0dd80de4be67b0fcf93764bfa2a4ce7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 Apr 2018 00:15:22 +0700 Subject: [PATCH 065/148] [openload] Relax stream URL regex --- youtube_dl/extractor/openload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 3e0a7a9a2..9f5bebe40 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -336,8 +336,8 @@ class OpenloadIE(InfoExtractor): get_element_by_id('streamuri', webpage) or get_element_by_id('streamurj', webpage) or self._search_regex( - (r'>\s*([\da-zA-Z]+~\d{10,}~\d+\.\d+\.0\.0~[\da-zA-Z]+)\s*<', - r'>\s*([\w~]+~\d+\.\d+\.\d+\.\d+~[\w~]+)'), webpage, + (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', + r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)'), webpage, 'stream URL')) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From e944737c597a2f5e8e6ade93a25fc812119d4eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 Apr 2018 23:40:15 +0700 Subject: [PATCH 066/148] [openload] Add support for oload.xyz --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 9f5bebe40..650f95656 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -298,6 +298,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.stream/f/KnG-kKZdcfY', 'only_matching': True, + }, { + 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From e2750e1437497925c5a058947b850ddadd7ee7d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 Apr 2018 20:55:01 +0700 Subject: [PATCH 067/148] [liveleak] Extend _VALID_URL (closes #16117) --- youtube_dl/extractor/liveleak.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 246aac576..26671753c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -7,7 +7,7 @@ from ..utils import int_or_none class LiveLeakIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P[\w_]+)(?:.*)' + _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P[\w_]+)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', 'md5': '0813c2430bea7a46bf13acf3406992f4', @@ -79,6 +79,9 @@ class LiveLeakIE(InfoExtractor): 'title': 'Fuel Depot in China Explosion caught on video', }, 'playlist_count': 3, + }, { + 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', + 'only_matching': True, }] @staticmethod From 9d15be3a5b1f0764d8493ccfc312fc0d0a2df164 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 7 Apr 2018 09:39:21 -0500 Subject: [PATCH 068/148] [drtuber] Fix title extraction (closes #16107) --- youtube_dl/extractor/drtuber.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index c88b3126b..5c41c8022 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -66,7 +66,9 @@ class DrTuberIE(InfoExtractor): self._sort_formats(formats) title = self._html_search_regex( - (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', + (r']+class=["\']title[^>]+>([^<]+)', + r'([^<]+)\s*@\s+DrTuber', + r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'), webpage, 'title') From ff826177cc154ba8c67b8162a25e067783dc4caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Apr 2018 23:57:32 +0700 Subject: [PATCH 069/148] [instagram:user] Fix extraction (closes #16119) --- youtube_dl/extractor/instagram.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index f9cd11b8e..9f570249f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -243,6 +243,8 @@ class InstagramUserIE(InfoExtractor): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) + self._set_cookie('instagram.com', 'ig_pr', '1') + cursor = '' for page_num in itertools.count(1): media = self._download_json( From 1c9b1a449430bd8b267c9c43ce7ed7cb73ac4433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 00:08:45 +0700 Subject: [PATCH 070/148] [acast] Fix extraction (closes #16118) --- youtube_dl/extractor/acast.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 5871e72dc..4ad549c92 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -7,7 +7,7 @@ import functools from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - int_or_none, + float_or_none, unified_timestamp, OnDemandPagedList, ) @@ -46,18 +46,22 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() + s = self._download_json( + 'https://play-api.acast.com/stitch/%s/%s' % (channel, display_id), + display_id)['result'] + media_url = s['url'] cast_data = self._download_json( 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id) e = cast_data['result']['episode'] return { 'id': compat_str(e['id']), 'display_id': display_id, - 'url': e['mediaUrl'], + 'url': media_url, 'title': e['name'], 'description': e.get('description'), 'thumbnail': e.get('image'), 'timestamp': unified_timestamp(e.get('publishingDate')), - 'duration': int_or_none(e.get('duration')), + 'duration': float_or_none(s.get('duration') or e.get('duration')), } From cae5d9705c28ffc0bf5e149a5f92d31a48208e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 00:21:55 +0700 Subject: [PATCH 071/148] [acast] Extract more metadata --- youtube_dl/extractor/acast.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 4ad549c92..6d846ea7a 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,6 +8,8 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, + int_or_none, + try_get, unified_timestamp, OnDemandPagedList, ) @@ -24,23 +26,29 @@ class ACastIE(InfoExtractor): 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', + 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'timestamp': 1196172000, 'upload_date': '20071127', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, + 'creator': 'Concierge', + 'series': 'Condé Nast Traveler Podcast', + 'episode': '"Where Are You?": Taipei 101, Taiwan', } }, { # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'e87d5b8516cd04c0d81b6ee1caca28d0', + 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', + 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', 'timestamp': 1477346700, 'upload_date': '20161024', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', - 'duration': 2766, + 'duration': 2766.602563, + 'creator': 'Anton Berg & Martin Johnson', + 'series': 'Spår', + 'episode': '2. Raggarmordet - Röster ur det förflutna', } }] @@ -51,17 +59,25 @@ class ACastIE(InfoExtractor): display_id)['result'] media_url = s['url'] cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id) - e = cast_data['result']['episode'] + 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), + display_id)['result'] + e = cast_data['episode'] + title = e['name'] return { 'id': compat_str(e['id']), 'display_id': display_id, 'url': media_url, - 'title': e['name'], - 'description': e.get('description'), + 'title': title, + 'description': e.get('description') or e.get('summary'), 'thumbnail': e.get('image'), 'timestamp': unified_timestamp(e.get('publishingDate')), 'duration': float_or_none(s.get('duration') or e.get('duration')), + 'filesize': int_or_none(e.get('contentLength')), + 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), + 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), + 'season_number': int_or_none(e.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(e.get('episodeNumber')), } From 717ea4e14e59bded0c2fb20e84b6513d82644b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 00:29:43 +0700 Subject: [PATCH 072/148] [steam] Bypass mature content check (closes #16113) --- youtube_dl/extractor/steam.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index e5ac586a7..a6a191ceb 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -75,6 +75,9 @@ class SteamIE(InfoExtractor): gameID = m.group('gameID') playlist_id = gameID videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + + self._set_cookie('steampowered.com', 'mature_content', '1') + webpage = self._download_webpage(videourl, playlist_id) if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: From 66b686727b198a6b14ddcbcfdcbaadd5b203362f Mon Sep 17 00:00:00 2001 From: aeph6Ee0 <aeph6Ee0@users.noreply.github.com> Date: Sat, 7 Apr 2018 22:09:42 +0200 Subject: [PATCH 073/148] [extractor/common] Relax JSON-LD context check (closes #16006) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 890232586..59b9d3739 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1025,7 +1025,7 @@ class InfoExtractor(object): }) for e in json_ld: - if e.get('@context') == 'http://schema.org': + if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: return info From 608c738c7d8e6be21f0cc0bb7a844bad9d841964 Mon Sep 17 00:00:00 2001 From: GDR! <gdr@gdr.name> Date: Sun, 8 Apr 2018 17:13:00 +0200 Subject: [PATCH 074/148] [odnoklassniki] Extend _VALID_URL (closes #16081) --- youtube_dl/extractor/odnoklassniki.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 5c8b37e18..d87d0960f 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,7 +19,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer|live)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?/|web-api/video/moviePlayer/|live/|dk\?.*?st\.mvId=)(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -101,6 +101,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'https://www.ok.ru/live/484531969818', 'only_matching': True, + }, { + 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', + 'only_matching': True, }] def _real_extract(self, url): From d04ca9761615e2ed3fdf89d8d87a4b9adfffacc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Apr 2018 22:21:21 +0700 Subject: [PATCH 075/148] [odnoklassniki] Improve _VALID_URL readability --- youtube_dl/extractor/odnoklassniki.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index d87d0960f..190d8af4d 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,7 +19,18 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?/|web-api/video/moviePlayer/|live/|dk\?.*?st\.mvId=)(?P<id>[\d-]+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m|mobile)\.)? + (?:odnoklassniki|ok)\.ru/ + (?: + video(?:embed)?/| + web-api/video/moviePlayer/| + live/| + dk\?.*?st\.mvId= + ) + (?P<id>[\d-]+) + ''' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', From 1fc37ca3f181159c98bccf081766abb73b9d344f Mon Sep 17 00:00:00 2001 From: Surya Oktafendri <f2face@f2face.com> Date: Mon, 9 Apr 2018 00:19:23 +0700 Subject: [PATCH 076/148] [generic] Add support for share-videos.se embeds (closes #16089) --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cf64398e3..4b210da72 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1967,6 +1967,16 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, + { + 'url': 'http://share-videos.se/auto/video/83645793?uid=13', + 'md5': 'b68d276de422ab07ee1d49388103f457', + 'info_dict': { + 'id': '83645793', + 'title': 'Lock up and get excited', + 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', + 'ext': 'mp4' + } } # { # # TODO: find another test @@ -2978,6 +2988,14 @@ class GenericIE(InfoExtractor): merged[k] = v return merged + # Look for Share-Videos.se embeds + sharevideosse_urls = [m.group('url') for m in re.finditer( + r'<iframe[^>]+?src\s*=\s*(["\'])(?P<url>https?://embed\.share-videos\.se/auto/embed/\d+.+?)\1', + webpage)] + if sharevideosse_urls: + return self.playlist_from_matches( + sharevideosse_urls, video_id, video_title) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: From d3431dcb90ea72fed502ecfd8f34e7499009a53a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 00:25:44 +0700 Subject: [PATCH 077/148] [generic] Restrict share-videos.se embeds regex to filter bogus URLs (#16115) --- youtube_dl/extractor/generic.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4b210da72..8922d1914 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1974,10 +1974,10 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '83645793', 'title': 'Lock up and get excited', - 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', 'ext': 'mp4' - } - } + }, + 'skip': 'TODO: fix nested playlists processing in tests', + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2973,6 +2973,13 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', + webpage)] + if sharevideos_urls: + return self.playlist_from_matches( + sharevideos_urls, video_id, video_title) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): @@ -2988,14 +2995,6 @@ class GenericIE(InfoExtractor): merged[k] = v return merged - # Look for Share-Videos.se embeds - sharevideosse_urls = [m.group('url') for m in re.finditer( - r'<iframe[^>]+?src\s*=\s*(["\'])(?P<url>https?://embed\.share-videos\.se/auto/embed/\d+.+?)\1', - webpage)] - if sharevideosse_urls: - return self.playlist_from_matches( - sharevideosse_urls, video_id, video_title) - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: From 069937151e429a2127569910d204c03eec167f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 00:37:15 +0700 Subject: [PATCH 078/148] [generic] Add support for tube8 embeds --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/tube8.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8922d1914..e3cb5c5ce 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -58,6 +58,7 @@ from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE +from .tube8 import Tube8IE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2556,6 +2557,11 @@ class GenericIE(InfoExtractor): if redtube_urls: return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) + # Look for embedded Tube8 player + tube8_urls = Tube8IE._extract_urls(webpage) + if tube8_urls: + return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1853a1104..368c45729 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -31,6 +31,12 @@ class Tube8IE(KeezMoviesIE): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', + webpage) + def _real_extract(self, url): webpage, info = self._extract_info(url) From 94c3442e6ae176b01b3b5eae0a3adc355319b569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 01:03:55 +0700 Subject: [PATCH 079/148] [YoutubeDL] Do not save/restore console title while simulate (closes #16103) --- youtube_dl/YoutubeDL.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 523dd1f7d..fca4999eb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -532,6 +532,8 @@ class YoutubeDL(object): def save_console_title(self): if not self.params.get('consoletitle', False): return + if self.params.get('simulate', False): + return if compat_os_name != 'nt' and 'TERM' in os.environ: # Save the title on stack self._write_string('\033[22;0t', self._screen_file) @@ -539,6 +541,8 @@ class YoutubeDL(object): def restore_console_title(self): if not self.params.get('consoletitle', False): return + if self.params.get('simulate', False): + return if compat_os_name != 'nt' and 'TERM' in os.environ: # Restore the title from stack self._write_string('\033[23;0t', self._screen_file) From 880ed89d491af9d85680777422c49e07f747e095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 01:14:47 +0700 Subject: [PATCH 080/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 89dfbd8b8..9b01a8062 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version <unreleased> + +Core +* [YoutubeDL] Do not save/restore console title while simulate (#16103) +* [extractor/common] Relax JSON-LD context check (#16006) + +Extractors ++ [generic] Add support for tube8 embeds ++ [generic] Add support for share-videos.se embeds (#16089, #16115) +* [odnoklassniki] Extend URL regular expression (#16081) +* [steam] Bypass mature content check (#16113) ++ [acast] Extract more metadata +* [acast] Fix extraction (#16118) +* [instagram:user] Fix extraction (#16119) +* [drtuber] Fix title extraction (#16107, #16108) +* [liveleak] Extend URL regular expression (#16117) ++ [openload] Add support for oload.xyz +* [openload] Relax stream URL regular expression +* [openload] Fix extraction (#16099) ++ [svtplay:series] Add support for season URLs ++ [svtplay:series] Add support for series (#11130, #16059) + + version 2018.04.03 Extractors From f7f9757efcd4f5eaaa31e16ff14fc6627f515393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 Apr 2018 01:19:27 +0700 Subject: [PATCH 081/148] release 2018.04.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 99e8acd33..ed622afd1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.03** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.03 +[debug] youtube-dl version 2018.04.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 9b01a8062..4385c4091 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.04.09 Core * [YoutubeDL] Do not save/restore console title while simulate (#16103) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 17baac5ab..1c13199d4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -804,6 +804,7 @@ - **SunPorno** - **SVT** - **SVTPlay**: SVT Play and Öppet arkiv + - **SVTSeries** - **SWRMediathek** - **Syfy** - **SztvHu** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3163509c..307d6041a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.03' +__version__ = '2018.04.09' From fce7962691a0f5874753cad431a8bb6ed31efc69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Apr 2018 23:07:37 +0700 Subject: [PATCH 082/148] [twitch] Add support for mobile URLs (closes #16146) --- youtube_dl/extractor/twitch.py | 47 ++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 1981b4d4a..f736283e9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import ( class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go|m)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' @@ -226,7 +226,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -279,6 +279,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://www.twitch.tv/videos/6528877', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', + 'only_matching': True, }] def _real_extract(self, url): @@ -390,14 +393,17 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _PLAYLIST_TYPE = 'profile' - _TEST = { + _TESTS = [{ 'url': 'http://www.twitch.tv/vanillatv/profile', 'info_dict': { 'id': 'vanillatv', 'title': 'VanillaTV', }, 'playlist_mincount': 412, - } + }, { + 'url': 'http://m.twitch.tv/vanillatv/profile', + 'only_matching': True, + }] class TwitchVideosBaseIE(TwitchPlaylistBaseIE): @@ -411,14 +417,17 @@ class TwitchAllVideosIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight' _PLAYLIST_TYPE = 'all videos' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 869, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/all', + 'only_matching': True, + }] class TwitchUploadsIE(TwitchVideosBaseIE): @@ -427,14 +436,17 @@ class TwitchUploadsIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload' _PLAYLIST_TYPE = 'uploads' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/uploads', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 0, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/uploads', + 'only_matching': True, + }] class TwitchPastBroadcastsIE(TwitchVideosBaseIE): @@ -443,14 +455,17 @@ class TwitchPastBroadcastsIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive' _PLAYLIST_TYPE = 'past broadcasts' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 0, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/past-broadcasts', + 'only_matching': True, + }] class TwitchHighlightsIE(TwitchVideosBaseIE): @@ -459,14 +474,17 @@ class TwitchHighlightsIE(TwitchVideosBaseIE): _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight' _PLAYLIST_TYPE = 'highlights' - _TEST = { + _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/highlights', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, 'playlist_mincount': 805, - } + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/highlights', + 'only_matching': True, + }] class TwitchStreamIE(TwitchBaseIE): @@ -474,7 +492,7 @@ class TwitchStreamIE(TwitchBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go)\.)?twitch\.tv/| + (?:(?:www|go|m)\.)?twitch\.tv/| player\.twitch\.tv/\?.*?\bchannel= ) (?P<id>[^/#?]+) @@ -508,6 +526,9 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'https://go.twitch.tv/food', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/food', + 'only_matching': True, }] @classmethod From dd9aea8cbdabc7622446d387ed6ed59e47b79de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 Apr 2018 01:25:41 +0700 Subject: [PATCH 083/148] [instagram:user] Add request signing (closes #16119) --- youtube_dl/extractor/instagram.py | 161 +++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 9f570249f..1c917bc95 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -2,14 +2,20 @@ from __future__ import unicode_literals import itertools import json +import os import re +import subprocess +import tempfile from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + check_executable, + ExtractorError, get_element_by_attribute, int_or_none, lowercase_escape, + std_headers, try_get, ) @@ -238,24 +244,140 @@ class InstagramUserIE(InfoExtractor): } } - def _entries(self, uploader_id): + _SIGN_CODE = ''' +"use strict"; +function i(e, t) { + var r = (65535 & e) + (65535 & t); + return (e >> 16) + (t >> 16) + (r >> 16) << 16 | 65535 & r +} +function a(e, t, r, n, o, a) { + return i((s = i(i(t, e), i(n, a))) << (c = o) | s >>> 32 - c, r); + var s, c +} +function s(e, t, r, n, o, i, s) { + return a(t & r | ~t & n, e, t, o, i, s) +} +function c(e, t, r, n, o, i, s) { + return a(t & n | r & ~n, e, t, o, i, s) +} +function u(e, t, r, n, o, i, s) { + return a(t ^ r ^ n, e, t, o, i, s) +} +function l(e, t, r, n, o, i, s) { + return a(r ^ (t | ~n), e, t, o, i, s) +} +function p(e, t) { + var r, n, o, a, p; + e[t >> 5] |= 128 << t % 32, + e[14 + (t + 64 >>> 9 << 4)] = t; + var d = 1732584193 + , f = -271733879 + , h = -1732584194 + , g = 271733878; + for (r = 0; r < e.length; r += 16) + n = d, + o = f, + a = h, + p = g, + f = l(f = l(f = l(f = l(f = u(f = u(f = u(f = u(f = c(f = c(f = c(f = c(f = s(f = s(f = s(f = s(f, h = s(h, g = s(g, d = s(d, f, h, g, e[r], 7, -680876936), f, h, e[r + 1], 12, -389564586), d, f, e[r + 2], 17, 606105819), g, d, e[r + 3], 22, -1044525330), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 4], 7, -176418897), f, h, e[r + 5], 12, 1200080426), d, f, e[r + 6], 17, -1473231341), g, d, e[r + 7], 22, -45705983), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 8], 7, 1770035416), f, h, e[r + 9], 12, -1958414417), d, f, e[r + 10], 17, -42063), g, d, e[r + 11], 22, -1990404162), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 12], 7, 1804603682), f, h, e[r + 13], 12, -40341101), d, f, e[r + 14], 17, -1502002290), g, d, e[r + 15], 22, 1236535329), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 1], 5, -165796510), f, h, e[r + 6], 9, -1069501632), d, f, e[r + 11], 14, 643717713), g, d, e[r], 20, -373897302), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 5], 5, -701558691), f, h, e[r + 10], 9, 38016083), d, f, e[r + 15], 14, -660478335), g, d, e[r + 4], 20, -405537848), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 9], 5, 568446438), f, h, e[r + 14], 9, -1019803690), d, f, e[r + 3], 14, -187363961), g, d, e[r + 8], 20, 1163531501), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 13], 5, -1444681467), f, h, e[r + 2], 9, -51403784), d, f, e[r + 7], 14, 1735328473), g, d, e[r + 12], 20, -1926607734), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 5], 4, -378558), f, h, e[r + 8], 11, -2022574463), d, f, e[r + 11], 16, 1839030562), g, d, e[r + 14], 23, -35309556), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 1], 4, -1530992060), f, h, e[r + 4], 11, 1272893353), d, f, e[r + 7], 16, -155497632), g, d, e[r + 10], 23, -1094730640), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 13], 4, 681279174), f, h, e[r], 11, -358537222), d, f, e[r + 3], 16, -722521979), g, d, e[r + 6], 23, 76029189), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 9], 4, -640364487), f, h, e[r + 12], 11, -421815835), d, f, e[r + 15], 16, 530742520), g, d, e[r + 2], 23, -995338651), h = l(h, g = l(g, d = l(d, f, h, g, e[r], 6, -198630844), f, h, e[r + 7], 10, 1126891415), d, f, e[r + 14], 15, -1416354905), g, d, e[r + 5], 21, -57434055), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 12], 6, 1700485571), f, h, e[r + 3], 10, -1894986606), d, f, e[r + 10], 15, -1051523), g, d, e[r + 1], 21, -2054922799), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 8], 6, 1873313359), f, h, e[r + 15], 10, -30611744), d, f, e[r + 6], 15, -1560198380), g, d, e[r + 13], 21, 1309151649), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 4], 6, -145523070), f, h, e[r + 11], 10, -1120210379), d, f, e[r + 2], 15, 718787259), g, d, e[r + 9], 21, -343485551), + d = i(d, n), + f = i(f, o), + h = i(h, a), + g = i(g, p); + return [d, f, h, g] +} +function d(e) { + var t, r = "", n = 32 * e.length; + for (t = 0; t < n; t += 8) + r += String.fromCharCode(e[t >> 5] >>> t % 32 & 255); + return r +} +function f(e) { + var t, r = []; + for (r[(e.length >> 2) - 1] = void 0, + t = 0; t < r.length; t += 1) + r[t] = 0; + var n = 8 * e.length; + for (t = 0; t < n; t += 8) + r[t >> 5] |= (255 & e.charCodeAt(t / 8)) << t % 32; + return r +} +function h(e) { + var t, r, n = ""; + for (r = 0; r < e.length; r += 1) + t = e.charCodeAt(r), + n += "0123456789abcdef".charAt(t >>> 4 & 15) + "0123456789abcdef".charAt(15 & t); + return n +} +function g(e) { + return unescape(encodeURIComponent(e)) +} +function b(e) { + return function(e) { + return d(p(f(e), 8 * e.length)) + }(g(e)) +} +function m(e, t) { + return function(e, t) { + var r, n, o = f(e), i = [], a = []; + for (i[15] = a[15] = void 0, + o.length > 16 && (o = p(o, 8 * e.length)), + r = 0; r < 16; r += 1) + i[r] = 909522486 ^ o[r], + a[r] = 1549556828 ^ o[r]; + return n = p(i.concat(f(t)), 512 + 8 * t.length), + d(p(a.concat(n), 640)) + }(g(e), g(t)) +} +function v(e, t, r) { + return t ? r ? m(t, e) : h(m(t, e)) : r ? b(e) : h(b(e)) +} +function sign(s) { + return v(s); +} +''' + + def _entries(self, data): def get_count(suffix): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) + uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + csrf_token = data['config']['csrf_token'] + rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' + self._set_cookie('instagram.com', 'ig_pr', '1') + def sign(s): + js_code = self._SIGN_CODE + "console.log(sign('%s')); phantom.exit();" % s + with open(self._phantomjs_script.name, 'w') as f: + f.write(js_code) + p = subprocess.Popen( + ['phantomjs', '--ssl-protocol=any', f.name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + gis, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Failed to sign request\n:' + err.decode('utf-8')) + return gis.decode('utf-8').strip() + cursor = '' for page_num in itertools.count(1): + variables = json.dumps({ + 'id': uploader_id, + 'first': 100, + 'after': cursor, + }) + gis = sign( + '%s:%s:%s:%s' + % (rhx_gis, csrf_token, std_headers['User-Agent'], variables)) media = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, - 'Downloading JSON page %d' % page_num, query={ + 'Downloading JSON page %d' % page_num, headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-Instagram-GIS': gis, + }, query={ 'query_hash': '472f257a40c653c64c666ce877d59d2b', - 'variables': json.dumps({ - 'id': uploader_id, - 'first': 100, - 'after': cursor, - }) + 'variables': variables, })['data']['user']['edge_owner_to_timeline_media'] edges = media.get('edges') @@ -309,11 +431,26 @@ class InstagramUserIE(InfoExtractor): if not cursor or not isinstance(cursor, compat_str): break + def _real_initialize(self): + if not check_executable('phantomjs', ['-v']): + raise ExtractorError( + 'PhantomJS executable not found in PATH, download it from http://phantomjs.org', + expected=True) + self._phantomjs_script = tempfile.NamedTemporaryFile(delete=False) + self._phantomjs_script.close() + + def __del__(self): + os.unlink(self._phantomjs_script.name) + def _real_extract(self, url): username = self._match_id(url) - uploader_id = self._download_json( - 'https://instagram.com/%s/' % username, username, query={ - '__a': 1, - })['graphql']['user']['id'] + + webpage = self._download_webpage(url, username) + + data = self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + username) + return self.playlist_result( - self._entries(uploader_id), username, username) + self._entries(data), username, username) From 315ab3d500964f1d8442135889e1886ca6d90100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 Apr 2018 01:51:57 +0700 Subject: [PATCH 084/148] [instagram:user] Simplify signing (#16119) --- youtube_dl/extractor/instagram.py | 128 +----------------------------- 1 file changed, 3 insertions(+), 125 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1c917bc95..76452a6a1 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,17 +1,13 @@ from __future__ import unicode_literals import itertools +import hashlib import json -import os import re -import subprocess -import tempfile from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - check_executable, - ExtractorError, get_element_by_attribute, int_or_none, lowercase_escape, @@ -244,99 +240,6 @@ class InstagramUserIE(InfoExtractor): } } - _SIGN_CODE = ''' -"use strict"; -function i(e, t) { - var r = (65535 & e) + (65535 & t); - return (e >> 16) + (t >> 16) + (r >> 16) << 16 | 65535 & r -} -function a(e, t, r, n, o, a) { - return i((s = i(i(t, e), i(n, a))) << (c = o) | s >>> 32 - c, r); - var s, c -} -function s(e, t, r, n, o, i, s) { - return a(t & r | ~t & n, e, t, o, i, s) -} -function c(e, t, r, n, o, i, s) { - return a(t & n | r & ~n, e, t, o, i, s) -} -function u(e, t, r, n, o, i, s) { - return a(t ^ r ^ n, e, t, o, i, s) -} -function l(e, t, r, n, o, i, s) { - return a(r ^ (t | ~n), e, t, o, i, s) -} -function p(e, t) { - var r, n, o, a, p; - e[t >> 5] |= 128 << t % 32, - e[14 + (t + 64 >>> 9 << 4)] = t; - var d = 1732584193 - , f = -271733879 - , h = -1732584194 - , g = 271733878; - for (r = 0; r < e.length; r += 16) - n = d, - o = f, - a = h, - p = g, - f = l(f = l(f = l(f = l(f = u(f = u(f = u(f = u(f = c(f = c(f = c(f = c(f = s(f = s(f = s(f = s(f, h = s(h, g = s(g, d = s(d, f, h, g, e[r], 7, -680876936), f, h, e[r + 1], 12, -389564586), d, f, e[r + 2], 17, 606105819), g, d, e[r + 3], 22, -1044525330), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 4], 7, -176418897), f, h, e[r + 5], 12, 1200080426), d, f, e[r + 6], 17, -1473231341), g, d, e[r + 7], 22, -45705983), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 8], 7, 1770035416), f, h, e[r + 9], 12, -1958414417), d, f, e[r + 10], 17, -42063), g, d, e[r + 11], 22, -1990404162), h = s(h, g = s(g, d = s(d, f, h, g, e[r + 12], 7, 1804603682), f, h, e[r + 13], 12, -40341101), d, f, e[r + 14], 17, -1502002290), g, d, e[r + 15], 22, 1236535329), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 1], 5, -165796510), f, h, e[r + 6], 9, -1069501632), d, f, e[r + 11], 14, 643717713), g, d, e[r], 20, -373897302), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 5], 5, -701558691), f, h, e[r + 10], 9, 38016083), d, f, e[r + 15], 14, -660478335), g, d, e[r + 4], 20, -405537848), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 9], 5, 568446438), f, h, e[r + 14], 9, -1019803690), d, f, e[r + 3], 14, -187363961), g, d, e[r + 8], 20, 1163531501), h = c(h, g = c(g, d = c(d, f, h, g, e[r + 13], 5, -1444681467), f, h, e[r + 2], 9, -51403784), d, f, e[r + 7], 14, 1735328473), g, d, e[r + 12], 20, -1926607734), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 5], 4, -378558), f, h, e[r + 8], 11, -2022574463), d, f, e[r + 11], 16, 1839030562), g, d, e[r + 14], 23, -35309556), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 1], 4, -1530992060), f, h, e[r + 4], 11, 1272893353), d, f, e[r + 7], 16, -155497632), g, d, e[r + 10], 23, -1094730640), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 13], 4, 681279174), f, h, e[r], 11, -358537222), d, f, e[r + 3], 16, -722521979), g, d, e[r + 6], 23, 76029189), h = u(h, g = u(g, d = u(d, f, h, g, e[r + 9], 4, -640364487), f, h, e[r + 12], 11, -421815835), d, f, e[r + 15], 16, 530742520), g, d, e[r + 2], 23, -995338651), h = l(h, g = l(g, d = l(d, f, h, g, e[r], 6, -198630844), f, h, e[r + 7], 10, 1126891415), d, f, e[r + 14], 15, -1416354905), g, d, e[r + 5], 21, -57434055), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 12], 6, 1700485571), f, h, e[r + 3], 10, -1894986606), d, f, e[r + 10], 15, -1051523), g, d, e[r + 1], 21, -2054922799), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 8], 6, 1873313359), f, h, e[r + 15], 10, -30611744), d, f, e[r + 6], 15, -1560198380), g, d, e[r + 13], 21, 1309151649), h = l(h, g = l(g, d = l(d, f, h, g, e[r + 4], 6, -145523070), f, h, e[r + 11], 10, -1120210379), d, f, e[r + 2], 15, 718787259), g, d, e[r + 9], 21, -343485551), - d = i(d, n), - f = i(f, o), - h = i(h, a), - g = i(g, p); - return [d, f, h, g] -} -function d(e) { - var t, r = "", n = 32 * e.length; - for (t = 0; t < n; t += 8) - r += String.fromCharCode(e[t >> 5] >>> t % 32 & 255); - return r -} -function f(e) { - var t, r = []; - for (r[(e.length >> 2) - 1] = void 0, - t = 0; t < r.length; t += 1) - r[t] = 0; - var n = 8 * e.length; - for (t = 0; t < n; t += 8) - r[t >> 5] |= (255 & e.charCodeAt(t / 8)) << t % 32; - return r -} -function h(e) { - var t, r, n = ""; - for (r = 0; r < e.length; r += 1) - t = e.charCodeAt(r), - n += "0123456789abcdef".charAt(t >>> 4 & 15) + "0123456789abcdef".charAt(15 & t); - return n -} -function g(e) { - return unescape(encodeURIComponent(e)) -} -function b(e) { - return function(e) { - return d(p(f(e), 8 * e.length)) - }(g(e)) -} -function m(e, t) { - return function(e, t) { - var r, n, o = f(e), i = [], a = []; - for (i[15] = a[15] = void 0, - o.length > 16 && (o = p(o, 8 * e.length)), - r = 0; r < 16; r += 1) - i[r] = 909522486 ^ o[r], - a[r] = 1549556828 ^ o[r]; - return n = p(i.concat(f(t)), 512 + 8 * t.length), - d(p(a.concat(n), 640)) - }(g(e), g(t)) -} -function v(e, t, r) { - return t ? r ? m(t, e) : h(m(t, e)) : r ? b(e) : h(b(e)) -} -function sign(s) { - return v(s); -} -''' - def _entries(self, data): def get_count(suffix): return int_or_none(try_get( @@ -348,18 +251,6 @@ function sign(s) { self._set_cookie('instagram.com', 'ig_pr', '1') - def sign(s): - js_code = self._SIGN_CODE + "console.log(sign('%s')); phantom.exit();" % s - with open(self._phantomjs_script.name, 'w') as f: - f.write(js_code) - p = subprocess.Popen( - ['phantomjs', '--ssl-protocol=any', f.name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - gis, err = p.communicate() - if p.returncode != 0: - raise ExtractorError('Failed to sign request\n:' + err.decode('utf-8')) - return gis.decode('utf-8').strip() - cursor = '' for page_num in itertools.count(1): variables = json.dumps({ @@ -367,14 +258,12 @@ function sign(s) { 'first': 100, 'after': cursor, }) - gis = sign( - '%s:%s:%s:%s' - % (rhx_gis, csrf_token, std_headers['User-Agent'], variables)) + s = '%s:%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent'], variables) media = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ 'X-Requested-With': 'XMLHttpRequest', - 'X-Instagram-GIS': gis, + 'X-Instagram-GIS': hashlib.md5(s.encode('utf-8')).hexdigest(), }, query={ 'query_hash': '472f257a40c653c64c666ce877d59d2b', 'variables': variables, @@ -431,17 +320,6 @@ function sign(s) { if not cursor or not isinstance(cursor, compat_str): break - def _real_initialize(self): - if not check_executable('phantomjs', ['-v']): - raise ExtractorError( - 'PhantomJS executable not found in PATH, download it from http://phantomjs.org', - expected=True) - self._phantomjs_script = tempfile.NamedTemporaryFile(delete=False) - self._phantomjs_script.close() - - def __del__(self): - os.unlink(self._phantomjs_script.name) - def _real_extract(self, url): username = self._match_id(url) From d783aee56a720ce15cf2775afc330b2ed5d53baf Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Wed, 11 Apr 2018 09:11:24 -0400 Subject: [PATCH 085/148] [fxnetworks] Add support for https theplatform URLs (closes #16125) --- youtube_dl/extractor/fxnetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 37549fb01..00e67426b 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -41,7 +41,7 @@ class FXNetworksIE(AdobePassIE): if 'The content you are trying to access is not available in your region.' in webpage: self.raise_geo_restricted() video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) + r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) release_url = video_data['rel'] title = video_data['data-title'] From 64f03e5b4c86f7c7e6d660267d77e02da621a94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 Apr 2018 23:28:55 +0700 Subject: [PATCH 086/148] [cbc:watch] Re-acquire device token when expired (closes #16160) --- youtube_dl/extractor/cbc.py | 59 +++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 3be0c646b..54b4b9be9 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -5,7 +5,10 @@ import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( js_to_json, smuggle_url, @@ -206,30 +209,48 @@ class CBCWatchBaseIE(InfoExtractor): def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path - result = self._download_xml(url, video_id, headers={ - 'X-Clearleap-DeviceId': self._device_id, - 'X-Clearleap-DeviceToken': self._device_token, - }) + for _ in range(2): + try: + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + # Device token has expired, re-acquiring device token + self._register_device() + continue + raise error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') if error_message: raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) return result def _real_initialize(self): - if not self._device_id or not self._device_token: - device = self._downloader.cache.load('cbcwatch', 'device') or {} - self._device_id, self._device_token = device.get('id'), device.get('token') - if not self._device_id or not self._device_token: - result = self._download_xml( - self._API_BASE_URL + 'device/register', - None, data=b'<device><type>web</type></device>') - self._device_id = xpath_text(result, 'deviceId', fatal=True) - self._device_token = xpath_text(result, 'deviceToken', fatal=True) - self._downloader.cache.store( - 'cbcwatch', 'device', { - 'id': self._device_id, - 'token': self._device_token, - }) + if self._valid_device_token(): + return + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if self._valid_device_token(): + return + self._register_device() + + def _valid_device_token(self): + return self._device_id and self._device_token + + def _register_device(self): + self._device_id = self._device_token = None + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, 'Acquiring device token', + data=b'<device><type>web</type></device>') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + }) def _parse_rss_feed(self, rss): channel = xpath_element(rss, 'channel', fatal=True) From 92ded33a05e0760d0ae0e804a1dca0d8a84f3d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 12 Apr 2018 04:53:45 +0700 Subject: [PATCH 087/148] [pornhub] Relax _VALID_URLs (closes #16165) --- youtube_dl/extractor/pornhub.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 9ce513aeb..23e24d216 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -264,7 +264,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -272,11 +272,14 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): 'title': 'Nataly Hot', }, 'playlist_mincount': 2, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, }] class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -305,6 +308,9 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): # Most Viewed Videos 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, }] def _real_extract(self, url): From 68ddba20ae4e0ab28146e80d3e112a5a2661c386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 13 Apr 2018 22:27:52 +0700 Subject: [PATCH 088/148] [instagram:user] Remove User-Agent from signature (closes #16119) --- youtube_dl/extractor/instagram.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 76452a6a1..8da1d5f2f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -11,7 +11,6 @@ from ..utils import ( get_element_by_attribute, int_or_none, lowercase_escape, - std_headers, try_get, ) @@ -258,7 +257,7 @@ class InstagramUserIE(InfoExtractor): 'first': 100, 'after': cursor, }) - s = '%s:%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent'], variables) + s = '%s:%s:%s' % (rhx_gis, csrf_token, variables) media = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ From 9b5aead6aa8ad82a5eecd2bc26c0e94399e92ca7 Mon Sep 17 00:00:00 2001 From: Timmy <hello@timmy.ws> Date: Sat, 14 Apr 2018 17:04:42 +0200 Subject: [PATCH 089/148] [vine:user] Fix extraction (closes #15514) --- youtube_dl/extractor/vine.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 46950d3a1..08ddffa66 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( @@ -116,14 +115,14 @@ class VineUserIE(InfoExtractor): _VINE_BASE_URL = 'https://vine.co/' _TESTS = [ { - 'url': 'https://vine.co/Visa', + 'url': 'https://vine.co/itsruthb', 'info_dict': { - 'id': 'Visa', + 'id': 'itsruthb', }, - 'playlist_mincount': 46, + 'playlist_mincount': 611, }, { - 'url': 'https://vine.co/u/941705360593584128', + 'url': 'https://vine.co/u/942914934646415360', 'only_matching': True, }, ] @@ -139,16 +138,10 @@ class VineUserIE(InfoExtractor): profile_url, user, note='Downloading user profile data') user_id = profile_data['data']['userId'] - timeline_data = [] - for pagenum in itertools.count(1): - timeline_url = '%sapi/timelines/users/%s?page=%s&size=100' % ( - self._VINE_BASE_URL, user_id, pagenum) - timeline_page = self._download_json( - timeline_url, user, note='Downloading page %d' % pagenum) - timeline_data.extend(timeline_page['data']['records']) - if timeline_page['data']['nextPage'] is None: - break - + user_archive = self._download_json( + 'https://archive.vine.co/profiles/%s.json' % user_id, user_id) + posts = user_archive['posts'] entries = [ - self.url_result(e['permalinkUrl'], 'Vine') for e in timeline_data] + self.url_result('https://vine.co/v/%s' % post_id, 'Vine') + for post_id in posts] return self.playlist_result(entries, user) From 8e41c9ad01b6deda96c29f685c4d8861b8759ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 15 Apr 2018 22:43:25 +0700 Subject: [PATCH 090/148] [vine:user] Improve extraction (closes #16190) --- youtube_dl/extractor/vine.py | 45 +++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 08ddffa66..80b896b56 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, @@ -111,21 +112,24 @@ class VineIE(InfoExtractor): class VineUserIE(InfoExtractor): IE_NAME = 'vine:user' - _VALID_URL = r'(?:https?://)?vine\.co/(?P<u>u/)?(?P<user>[^/]+)/?(\?.*)?$' + _VALID_URL = r'https?://vine\.co/(?P<u>u/)?(?P<user>[^/]+)' _VINE_BASE_URL = 'https://vine.co/' - _TESTS = [ - { - 'url': 'https://vine.co/itsruthb', - 'info_dict': { - 'id': 'itsruthb', - }, - 'playlist_mincount': 611, + _TESTS = [{ + 'url': 'https://vine.co/itsruthb', + 'info_dict': { + 'id': 'itsruthb', + 'title': 'Ruth B', + 'description': '| Instagram/Twitter: itsruthb | still a lost boy from neverland', }, - { - 'url': 'https://vine.co/u/942914934646415360', - 'only_matching': True, - }, - ] + 'playlist_mincount': 611, + }, { + 'url': 'https://vine.co/u/942914934646415360', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if VineIE.suitable(url) else super(VineUserIE, cls).suitable(url) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -137,11 +141,14 @@ class VineUserIE(InfoExtractor): profile_data = self._download_json( profile_url, user, note='Downloading user profile data') - user_id = profile_data['data']['userId'] - user_archive = self._download_json( + data = profile_data['data'] + user_id = data.get('userId') or data['userIdStr'] + profile = self._download_json( 'https://archive.vine.co/profiles/%s.json' % user_id, user_id) - posts = user_archive['posts'] entries = [ - self.url_result('https://vine.co/v/%s' % post_id, 'Vine') - for post_id in posts] - return self.playlist_result(entries, user) + self.url_result( + 'https://vine.co/v/%s' % post_id, ie='Vine', video_id=post_id) + for post_id in profile['posts'] + if post_id and isinstance(post_id, compat_str)] + return self.playlist_result( + entries, user, profile.get('username'), profile.get('description')) From d6166a7602f5b78a4bb552ba0f4b176cbc0a4a03 Mon Sep 17 00:00:00 2001 From: Patrick Griffis <tingping@tingping.se> Date: Tue, 21 Mar 2017 00:49:31 +0200 Subject: [PATCH 091/148] [picarto] Add extractor --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/picarto.py | 87 ++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100755 youtube_dl/extractor/picarto.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9f60114d..d83e93dec 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -815,6 +815,10 @@ from .periscope import ( from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .picarto import ( + PicartoVodIE, + PicartoIE, +) from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py new file mode 100755 index 000000000..1d6f714ed --- /dev/null +++ b/youtube_dl/extractor/picarto.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, js_to_json, urlencode_postdata + + +class PicartoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)[^/]*$' + _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'params': { + 'skip_download': True + } + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + stream_page = self._download_webpage(url, channel_id) + + if 'This channel does not exist.' in stream_page: + raise ExtractorError('Channel does not exist', expected=True) + + player_settings_js = self._html_search_regex( + r'(?s)playerSettings\[1\]\s*=\s*(\{.+?\}\n)', stream_page, 'player-settings') + player_settings = self._parse_json(player_settings_js, channel_id, + transform_source=js_to_json) + if not player_settings.get('online'): + raise ExtractorError('Stream is offline', expected=True) + + cdn_data = self._download_json('https://picarto.tv/process/channel', channel_id, + data=urlencode_postdata({'loadbalancinginfo': channel_id}), + note='Fetching load balancer info') + edge = [edge['ep'] for edge in cdn_data['edges'] if edge['id'] == cdn_data['preferedEdge']][0] + + formats = self._extract_m3u8_formats('https://%s/hls/%s/index.m3u8' % (edge, channel_id), + channel_id, 'mp4') + formats.append({'url': 'https://%s/mp4/%s.mp4' % (edge, channel_id)}) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'formats': formats, + 'ext': 'mp4', + 'title': self._live_title(channel_id), + 'is_live': True, + 'thumbnail': player_settings.get('vodThumb'), + 'age_limit': 18 if player_settings.get('mature') else None, + } + + +class PicartoVodIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[a-zA-Z0-9_\-\.]+).flv' + _TEST = { + 'url': 'https://picarto.tv/videopopout/Carrot_2018.01.11.07.55.12.flv', + 'md5': '80765b67813053ff31d4df2bd5e900ce', + 'info_dict': { + 'id': 'Carrot_2018.01.11.07.55.12', + 'ext': 'mp4', + 'title': 'Carrot_2018.01.11.07.55.12', + 'thumbnail': r're:^https?://.*\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + vod_info_js = self._html_search_regex(r'(?s)"#vod-player",\s*(\{.+?\})\)', + webpage, video_id) + vod_info = self._parse_json(vod_info_js, video_id, transform_source=js_to_json) + + return { + 'id': video_id, + 'title': video_id, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'url': vod_info['vod'], + 'thumbnail': vod_info.get('vodThumb'), + } From a42839e548d81ae20e5164ae690075d2c423477e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 00:31:25 +0700 Subject: [PATCH 092/148] [picarto] Improve extraction (closes #6205, closes #12514, closes #15276, closes #15551) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/picarto.py | 152 ++++++++++++++++++++++------- 2 files changed, 116 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d83e93dec..3570fa165 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -816,8 +816,8 @@ from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .picarto import ( - PicartoVodIE, PicartoIE, + PicartoVodIE, ) from .piksel import PikselIE from .pinkbike import PinkbikeIE diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 1d6f714ed..2366dfb34 100755 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,12 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import time + from .common import InfoExtractor -from ..utils import ExtractorError, js_to_json, urlencode_postdata +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + try_get, + update_url_query, + urlencode_postdata, +) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)[^/]*$' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -16,72 +25,141 @@ class PicartoIE(InfoExtractor): 'timestamp': int, 'is_live': True }, - 'params': { - 'skip_download': True - } + 'skip': 'Stream is offline', } + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + def _real_extract(self, url): channel_id = self._match_id(url) stream_page = self._download_webpage(url, channel_id) - if 'This channel does not exist.' in stream_page: - raise ExtractorError('Channel does not exist', expected=True) + if '>This channel does not exist' in stream_page: + raise ExtractorError( + 'Channel %s does not exist' % channel_id, expected=True) - player_settings_js = self._html_search_regex( - r'(?s)playerSettings\[1\]\s*=\s*(\{.+?\}\n)', stream_page, 'player-settings') - player_settings = self._parse_json(player_settings_js, channel_id, - transform_source=js_to_json) - if not player_settings.get('online'): + player = self._parse_json( + self._search_regex( + r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, + 'player settings'), + channel_id, transform_source=js_to_json) + + if player.get('online') is False: raise ExtractorError('Stream is offline', expected=True) - cdn_data = self._download_json('https://picarto.tv/process/channel', channel_id, + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Fetching load balancer info') - edge = [edge['ep'] for edge in cdn_data['edges'] if edge['id'] == cdn_data['preferedEdge']][0] + note='Downloading load balancing info') - formats = self._extract_m3u8_formats('https://%s/hls/%s/index.m3u8' % (edge, channel_id), - channel_id, 'mp4') - formats.append({'url': 'https://%s/mp4/%s.mp4' % (edge, channel_id)}) + def get_event(key): + return try_get(player, lambda x: x['event'][key], compat_str) or '' + + params = { + 'token': player.get('token') or '', + 'ticket': get_event('ticket'), + 'con': int(time.time() * 1000), + 'type': get_event('ticket'), + 'scope': get_event('scope'), + } + + prefered_edge = cdn_data.get('preferedEdge') + default_tech = player.get('defaultTech') + + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + if tech_type == default_tech: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 'preference': preference, + }) + else: + # rtmp format does not seem to work + continue self._sort_formats(formats) + mature = player.get('mature') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + return { 'id': channel_id, - 'formats': formats, - 'ext': 'mp4', 'title': self._live_title(channel_id), 'is_live': True, - 'thumbnail': player_settings.get('vodThumb'), - 'age_limit': 18 if player_settings.get('mature') else None, + 'thumbnail': player.get('vodThumb'), + 'age_limit': age_limit, + 'formats': formats, } class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[a-zA-Z0-9_\-\.]+).flv' - _TEST = { - 'url': 'https://picarto.tv/videopopout/Carrot_2018.01.11.07.55.12.flv', - 'md5': '80765b67813053ff31d4df2bd5e900ce', + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', 'info_dict': { - 'id': 'Carrot_2018.01.11.07.55.12', + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', 'ext': 'mp4', - 'title': 'Carrot_2018.01.11.07.55.12', - 'thumbnail': r're:^https?://.*\.jpg$' - } - } + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - vod_info_js = self._html_search_regex(r'(?s)"#vod-player",\s*(\{.+?\})\)', - webpage, video_id) - vod_info = self._parse_json(vod_info_js, video_id, transform_source=js_to_json) + vod_info = self._parse_json( + self._search_regex( + r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, + video_id), + video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats( + vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) return { 'id': video_id, 'title': video_id, - 'ext': 'mp4', - 'protocol': 'm3u8', - 'url': vod_info['vod'], 'thumbnail': vod_info.get('vodThumb'), + 'formats': formats, } From c07cb68e7974a2ecd94f4101e6f094414df16e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 00:54:21 +0700 Subject: [PATCH 093/148] [smotri:broadcast] Fix extraction (closes #16180) --- youtube_dl/extractor/smotri.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 370fa8879..45995f30f 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -310,6 +310,7 @@ class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' + _NETRC_MACHINE = 'smotri' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -352,17 +353,18 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", - broadcast_page, 'broadcast ticket') + (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"), + broadcast_page, 'broadcast ticket', group='ticket') - url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket + broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket broadcast_password = self._downloader.params.get('videopassword') if broadcast_password: - url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() broadcast_json_page = self._download_webpage( - url, broadcast_id, 'Downloading broadcast JSON') + broadcast_url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) From 0e6ccb3905cb86c53a91af4c9119e2fd102019d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 00:56:05 +0700 Subject: [PATCH 094/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4385c4091..12bda4951 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version <unreleased> + +Extractors +* [smotri:broadcast] Fix extraction (#16180) ++ [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) +* [vine:user] Fix extraction (#15514, #16190) +* [pornhub] Relax URL regular expression (#16165) +* [cbc:watch] Re-acquire device token when expired (#16160) ++ [fxnetworks] Add support for https theplatform URLs (#16125, #16157) ++ [instagram:user] Add request signing (#16119) ++ [twitch] Add support for mobile URLs (#16146) + + version 2018.04.09 Core From bdf7ba6f3a626b4c873257091d0771e54bd02dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 01:07:21 +0700 Subject: [PATCH 095/148] Set chmod 644 for all extractors --- youtube_dl/extractor/americastestkitchen.py | 0 youtube_dl/extractor/cda.py | 0 youtube_dl/extractor/joj.py | 0 youtube_dl/extractor/picarto.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 youtube_dl/extractor/americastestkitchen.py mode change 100755 => 100644 youtube_dl/extractor/cda.py mode change 100755 => 100644 youtube_dl/extractor/joj.py mode change 100755 => 100644 youtube_dl/extractor/picarto.py diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py old mode 100755 new mode 100644 From 3c92fd1cd5b5ced11f03ebe64104457c21cd69ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Apr 2018 01:09:18 +0700 Subject: [PATCH 096/148] release 2018.04.16 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ed622afd1..69f996179 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.16*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.16** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.09 +[debug] youtube-dl version 2018.04.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 12bda4951..185fa1753 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.04.16 Extractors * [smotri:broadcast] Fix extraction (#16180) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1c13199d4..715d16cfe 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -628,6 +628,8 @@ - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** + - **Picarto** + - **PicartoVod** - **Piksel** - **Pinkbike** - **Pladform** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 307d6041a..5aefdd0a2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.09' +__version__ = '2018.04.16' From 522d6b5c961f584055463f8c69de864ec075083b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Apr 2018 07:48:36 +0100 Subject: [PATCH 097/148] [cbs] skip DRM asset types(fixes #16104) --- youtube_dl/extractor/cbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index f425562ab..1799d63ea 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -65,7 +65,7 @@ class CBSIE(CBSBaseIE): last_e = None for item in items_data.findall('.//item'): asset_type = xpath_text(item, 'assetType') - if not asset_type or asset_type in asset_types: + if not asset_type or asset_type in asset_types or asset_type in ('HLS_FPS', 'DASH_CENC'): continue asset_types.append(asset_type) query = { From 238d42cf5d4b1a95ba42bf56dcb1bf559ac11c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Apr 2018 22:37:50 +0700 Subject: [PATCH 098/148] [instagram:user] Fix extraction (closes #16119) --- youtube_dl/extractor/instagram.py | 49 ++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 8da1d5f2f..5cea37d92 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -6,11 +6,16 @@ import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( + ExtractorError, get_element_by_attribute, int_or_none, lowercase_escape, + std_headers, try_get, ) @@ -239,6 +244,8 @@ class InstagramUserIE(InfoExtractor): } } + _gis_tmpl = None + def _entries(self, data): def get_count(suffix): return int_or_none(try_get( @@ -257,16 +264,36 @@ class InstagramUserIE(InfoExtractor): 'first': 100, 'after': cursor, }) - s = '%s:%s:%s' % (rhx_gis, csrf_token, variables) - media = self._download_json( - 'https://www.instagram.com/graphql/query/', uploader_id, - 'Downloading JSON page %d' % page_num, headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-Instagram-GIS': hashlib.md5(s.encode('utf-8')).hexdigest(), - }, query={ - 'query_hash': '472f257a40c653c64c666ce877d59d2b', - 'variables': variables, - })['data']['user']['edge_owner_to_timeline_media'] + + if self._gis_tmpl: + gis_tmpls = [self._gis_tmpl] + else: + gis_tmpls = [ + '%s' % rhx_gis, + '', + '%s:%s' % (rhx_gis, csrf_token), + '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), + ] + + for gis_tmpl in gis_tmpls: + try: + media = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, + 'Downloading JSON page %d' % page_num, headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-Instagram-GIS': hashlib.md5( + ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), + }, query={ + 'query_hash': '42323d64886122307be10013ad2dcc44', + 'variables': variables, + })['data']['user']['edge_owner_to_timeline_media'] + self._gis_tmpl = gis_tmpl + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if gis_tmpl != gis_tmpls[-1]: + continue + raise edges = media.get('edges') if not edges or not isinstance(edges, list): From 518d5ba5191e3cc26c81e346ba5117e94db51469 Mon Sep 17 00:00:00 2001 From: Dan Salmon <sa7mon@users.noreply.github.com> Date: Tue, 17 Apr 2018 12:10:02 -0500 Subject: [PATCH 099/148] Fix some tests --- test/test_subtitles.py | 4 ++-- test/test_youtube_lists.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 1b8de822a..7d57a628e 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -232,7 +232,7 @@ class TestNPOSubtitles(BaseTestSubtitles): class TestMTVSubtitles(BaseTestSubtitles): - url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE def getInfoDict(self): @@ -243,7 +243,7 @@ class TestMTVSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + self.assertEqual(md5(subtitles['en']), '78206b8d8a0cfa9da64dc026eea48961') class TestNRKSubtitles(BaseTestSubtitles): diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 7a33dbf88..c4f0abbea 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -61,7 +61,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() dl.params['extract_flat'] = True ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') self.assertIsPlaylist(result) for entry in result['entries']: self.assertTrue(entry.get('title')) From e30991f9206f98605ba6c4880ed40ad5556fa0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Apr 2018 01:24:02 +0700 Subject: [PATCH 100/148] [kaltura] Improve embeds detection (closes #16201) --- youtube_dl/extractor/generic.py | 18 +++++++++++++++++- youtube_dl/extractor/kaltura.py | 6 +++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e3cb5c5ce..af1322e00 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1220,7 +1220,7 @@ class GenericIE(InfoExtractor): 'title': '35871', 'timestamp': 1355743100, 'upload_date': '20121217', - 'uploader_id': 'batchUser', + 'uploader_id': 'cplapp@learn360.com', }, 'add_ie': ['Kaltura'], }, @@ -1271,6 +1271,22 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # meta twitter:player + 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', + 'info_dict': { + 'id': '0_01b42zps', + 'ext': 'mp4', + 'title': 'Main Twerk (Video)', + 'upload_date': '20171208', + 'uploader_id': 'sebastian.salinas@thechive.com', + 'timestamp': 1512713057, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, # referrer protected EaglePlatform embed { 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 562e25f6d..0ea89e4d6 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -135,10 +135,10 @@ class KalturaIE(InfoExtractor): ''', webpage) or re.search( r'''(?xs) - <iframe[^>]+src=(?P<q1>["']) - (?:https?:)?//(?:www\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["']) + (?:https?:)?//(?:(?:www|cdnapi)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) (?:(?!(?P=q1)).)* - [?&]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) + [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) (?P=q1) ''', webpage) ) From 9b3036bd2e431ed4b037a3df21528a6e9bcb05b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Apr 2018 10:12:24 +0700 Subject: [PATCH 101/148] [instagram:user] Fix extraction (closes #16119) --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 5cea37d92..0c13f54ee 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -261,7 +261,7 @@ class InstagramUserIE(InfoExtractor): for page_num in itertools.count(1): variables = json.dumps({ 'id': uploader_id, - 'first': 100, + 'first': 12, 'after': cursor, }) From b004d9bbf18ee2b6a9b916657c4d6734ff0d0adb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Apr 2018 15:07:50 +0100 Subject: [PATCH 102/148] [cbssports] fix extraction(fixes #16217) --- youtube_dl/extractor/cbssports.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 3a62c840b..27a243d08 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -4,28 +4,33 @@ from .cbs import CBSBaseIE class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/video/player/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', + 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', 'info_dict': { - 'id': '708337219968', + 'id': '1214315075735', 'ext': 'mp4', - 'title': 'Ben Simmons the next LeBron? Not so fast', - 'description': 'md5:854294f627921baba1f4b9a990d87197', - 'timestamp': 1466293740, - 'upload_date': '20160618', + 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', + 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', + 'timestamp': 1524111457, + 'upload_date': '20180419', 'uploader': 'CBSI-NEW', }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'only_matching': True, }] def _extract_video_info(self, filter_query, video_id): return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id= self._search_regex([r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], webpage, 'video id') return self._extract_video_info('byId=%s' % video_id, video_id) From d86c5167ae5a1c33451e98d7e05d5b32b6fa3156 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Apr 2018 15:48:03 +0100 Subject: [PATCH 103/148] [nexx] extract new azure urls(closes #16223) --- youtube_dl/extractor/nexx.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index c7029d29e..5e46a75c0 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -230,15 +230,18 @@ class NexxIE(InfoExtractor): azure_locator = stream_data['azureLocator'] - AZURE_URL = 'http://nx%s%02d.akamaized.net/' - - def get_cdn_shield_base(shield_type='', prefix='-p'): + def get_cdn_shield_base(shield_type='', static=False): for secure in ('', 's'): cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) if cdn_shield: return 'http%s://%s' % (secure, cdn_shield) else: - return AZURE_URL % (prefix, int(stream_data['azureAccount'].replace('nexxplayplus', ''))) + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) azure_stream_base = get_cdn_shield_base() is_ml = ',' in language @@ -260,7 +263,7 @@ class NexxIE(InfoExtractor): formats.extend(self._extract_ism_formats( azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - azure_progressive_base = get_cdn_shield_base('Prog', '-d') + azure_progressive_base = get_cdn_shield_base('Prog', True) azure_file_distribution = stream_data.get('azureFileDistribution') if azure_file_distribution: fds = azure_file_distribution.split(',') From 5a19d231ca8e15d07c2a5ebd3cd6cc46b7596edc Mon Sep 17 00:00:00 2001 From: Douglas Su <d0u9.su@outlook.com> Date: Thu, 19 Apr 2018 23:21:50 +0800 Subject: [PATCH 104/148] [YoutubeDL] Fix typo in media extension compatibility checker --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fca4999eb..ad3598805 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1853,7 +1853,7 @@ class YoutubeDL(object): def compatible_formats(formats): video, audio = formats # Check extension - video_ext, audio_ext = audio.get('ext'), video.get('ext') + video_ext, audio_ext = video.get('ext'), audio.get('ext') if video_ext and audio_ext: COMPATIBLE_EXTS = ( ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), From 1792bc3a06dbdb788d12a1e6a4a8d7072be70edb Mon Sep 17 00:00:00 2001 From: Parmjit Virk <pvirk@mts.net> Date: Thu, 19 Apr 2018 10:25:51 -0500 Subject: [PATCH 105/148] [keezmovies] Add support for generic embeds (closes #16134) --- youtube_dl/extractor/keezmovies.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index e83115e2a..d4e6f7ac1 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -20,23 +20,23 @@ from ..utils import ( class KeezMoviesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', - 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', + 'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681', + 'md5': '2ac69cdb882055f71d82db4311732a1a', 'info_dict': { - 'id': '1214711', - 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', + 'id': '18070681', + 'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money', 'ext': 'mp4', - 'title': 'Petite Asian Lady Mai Playing In Bathtub', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Arab wife want it so bad I see she thirsty and has tiny money.', + 'thumbnail': None, 'view_count': int, 'age_limit': 18, } }, { - 'url': 'http://www.keezmovies.com/video/1214711', + 'url': 'http://www.keezmovies.com/video/18070681', 'only_matching': True, }] - def _extract_info(self, url): + def _extract_info(self, url, fatal=True): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = (mobj.group('display_id') @@ -55,7 +55,7 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): return if format_url in format_urls: return @@ -105,7 +105,11 @@ class KeezMoviesIE(InfoExtractor): raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) - self._sort_formats(formats) + try: + self._sort_formats(formats) + except ExtractorError: + if fatal: + raise if not title: title = self._html_search_regex( @@ -122,7 +126,9 @@ class KeezMoviesIE(InfoExtractor): } def _real_extract(self, url): - webpage, info = self._extract_info(url) + webpage, info = self._extract_info(url, fatal=False) + if not info['formats']: + return self.url_result(url, 'Generic') info['view_count'] = str_to_int(self._search_regex( r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) return info From d317973284f6d9886bda0bf8215ffb4f060af41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Apr 2018 22:36:33 +0700 Subject: [PATCH 106/148] [extremetube] Fix metadata extraction --- youtube_dl/extractor/extremetube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 445f9438d..acd4090fa 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -8,12 +8,12 @@ class ExtremeTubeIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', + 'md5': '92feaafa4b58e82f261e5419f39c60cb', 'info_dict': { 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', - 'uploader': 'unknown', + 'uploader': 'anonim', 'view_count': int, 'age_limit': 18, } @@ -36,10 +36,10 @@ class ExtremeTubeIE(KeezMoviesIE): r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( - r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', + r'Uploaded by:\s*</[^>]+>\s*<a[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) view_count = str_to_int(self._search_regex( - r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', + r'Views:\s*</[^>]+>\s*<[^>]+>([\d,\.]+)</', webpage, 'view count', fatal=False)) info.update({ From c19420027768c80a6a375f5a517fde9f65c1fc31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Apr 2018 22:38:31 +0700 Subject: [PATCH 107/148] [mofosex] Fix test --- youtube_dl/extractor/mofosex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 54716f5c7..1c652813a 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -12,7 +12,7 @@ class MofosexIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' _TESTS = [{ 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', - 'md5': '39a15853632b7b2e5679f92f69b78e91', + 'md5': '558fcdafbb63a87c019218d6e49daf8a', 'info_dict': { 'id': '318131', 'display_id': 'amateur-teen-playing-and-masturbating-318131', From d65a48a0efd2184f7b2fdc823433f568bae56d86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Apr 2018 23:12:13 +0700 Subject: [PATCH 108/148] [nick] Add support for nickjr.nl (closes #16230) --- youtube_dl/extractor/nick.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 090f1acee..256a24d86 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -81,13 +81,23 @@ class NickIE(MTVServicesInfoExtractor): class NickBrIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeon:br' - _VALID_URL = r'https?://(?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br/(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?#.]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br| + (?:www\.)?nickjr\.nl + ) + /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?\#.]+) + ''' _TESTS = [{ 'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', 'only_matching': True, }, { 'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', 'only_matching': True, + }, { + 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', + 'only_matching': True, }] def _real_extract(self, url): From 4b8588fe0215fb5ea75d4f37402ec51014cb8c53 Mon Sep 17 00:00:00 2001 From: einstein95 <einstein95@users.noreply.github.com> Date: Fri, 12 Jan 2018 07:01:02 +1300 Subject: [PATCH 109/148] [rentv] Fix extraction --- youtube_dl/extractor/rentv.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index d338b3a93..df528b09e 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -26,9 +26,20 @@ class RENTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) - jw_config = self._parse_json(self._search_regex( - r'config\s*=\s*({.+});', webpage, 'jw config'), video_id) - return self._parse_jwplayer_data(jw_config, video_id, m3u8_id='hls') + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+});', webpage, 'config'), video_id) + formats = [] + for video in config.get('src', ''): + formats.append({ + 'url': video.get('src', '') + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'title': config.get('title', ''), + 'thumbnail': config.get('image', '') + } class RENTVArticleIE(InfoExtractor): From a693386df1957ba03cbf5156a65dd18b2c37ac42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Apr 2018 23:22:10 +0700 Subject: [PATCH 110/148] [rentv] Improve extraction (closes #15227) --- youtube_dl/extractor/rentv.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index df528b09e..8bcf87126 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, +) class RENTVIE(InfoExtractor): @@ -13,7 +17,9 @@ class RENTVIE(InfoExtractor): 'info_dict': { 'id': '118577', 'ext': 'mp4', - 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"' + 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', } }, { 'url': 'http://ren.tv/player/118577', @@ -27,18 +33,31 @@ class RENTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) config = self._parse_json(self._search_regex( - r'config\s*=\s*({.+});', webpage, 'config'), video_id) + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] formats = [] - for video in config.get('src', ''): - formats.append({ - 'url': video.get('src', '') - }) + for video in config['src']: + src = video.get('src') + if not src or not isinstance(src, compat_str): + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) self._sort_formats(formats) return { 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), 'formats': formats, - 'title': config.get('title', ''), - 'thumbnail': config.get('image', '') } From 040c6296bb9da495d37ba134baee996a3a97b64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 04:55:35 +0700 Subject: [PATCH 111/148] [ccma] Fix video extraction (closes #15931) --- youtube_dl/extractor/ccma.py | 50 +++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index bec0a825a..07f5206c1 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + clean_html, int_or_none, parse_duration, parse_iso8601, - clean_html, + parse_resolution, ) @@ -40,34 +42,42 @@ class CCMAIE(InfoExtractor): def _real_extract(self, url): media_type, media_id = re.match(self._VALID_URL, url).groups() - media_data = {} - formats = [] - profiles = ['pc'] if media_type == 'audio' else ['mobil', 'pc'] - for i, profile in enumerate(profiles): - md = self._download_json('http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + + media = self._download_json( + 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, - 'profile': profile, - }, fatal=False) - if md: - media_data = md - media_url = media_data.get('media', {}).get('url') - if media_url: - formats.append({ - 'format_id': profile, - 'url': media_url, - 'quality': i, - }) + }) + + formats = [] + media_url = media['media']['url'] + if isinstance(media_url, list): + for format_ in media_url: + format_url = format_.get('file') + if not format_url or not isinstance(format_url, compat_str): + continue + label = format_.get('label') + f = parse_resolution(label) + f.update({ + 'url': format_url, + 'format_id': label, + }) + formats.append(f) + else: + formats.append({ + 'url': media_url, + 'vcodec': 'none' if media_type == 'audio' else None, + }) self._sort_formats(formats) - informacio = media_data['informacio'] + informacio = media['informacio'] title = informacio['titol'] durada = informacio.get('durada', {}) duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) subtitles = {} - subtitols = media_data.get('subtitols', {}) + subtitols = media.get('subtitols', {}) if subtitols: sub_url = subtitols.get('url') if sub_url: @@ -77,7 +87,7 @@ class CCMAIE(InfoExtractor): }) thumbnails = [] - imatges = media_data.get('imatges', {}) + imatges = media.get('imatges', {}) if imatges: thumbnail_url = imatges.get('url') if thumbnail_url: From 353f0bde78eb4dd9432c092f244bb30b2abd7f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 04:57:22 +0700 Subject: [PATCH 112/148] [cbssports] PEP 8 --- youtube_dl/extractor/cbssports.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 27a243d08..83b764762 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -32,5 +32,7 @@ class CBSSportsIE(CBSBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id= self._search_regex([r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], webpage, 'video id') + video_id = self._search_regex( + [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], + webpage, 'video id') return self._extract_video_info('byId=%s' % video_id, video_id) From 488ff2dd3a193544a9912776d1c1b9d9fffc8fe7 Mon Sep 17 00:00:00 2001 From: 0x9fff00 <0x9fff00+git@protonmail.ch> Date: Sat, 17 Mar 2018 16:14:20 +0100 Subject: [PATCH 113/148] [svt] Add support for TV channel live streams (Closes #15279) --- youtube_dl/extractor/svt.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index b544da414..d01f85422 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -22,6 +22,8 @@ class SVTBaseIE(InfoExtractor): _GEO_COUNTRIES = ['SE'] def _extract_video(self, video_info, video_id): + is_live = dict_get(video_info, ('live', 'simulcast'), default=False) + m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] for vr in video_info['videoReferences']: player_type = vr.get('playerType') or vr.get('format') @@ -30,7 +32,7 @@ class SVTBaseIE(InfoExtractor): if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, - ext='mp4', entry_protocol='m3u8_native', + ext='mp4', entry_protocol=m3u8_protocol, m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( @@ -90,6 +92,7 @@ class SVTBaseIE(InfoExtractor): 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, + 'is_live': is_live, } @@ -134,7 +137,7 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>\w+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -158,6 +161,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/kanaler/svt1', + 'only_matching': True, }] def _real_extract(self, url): @@ -183,6 +189,8 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) + if info_dict['is_live']: + info_dict['title'] = self._live_title(info_dict['title']) return info_dict video_id = self._search_regex( @@ -198,6 +206,8 @@ class SVTPlayIE(SVTPlayBaseIE): info_dict['title'] = re.sub( r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) + if info_dict['is_live']: + info_dict['title'] = self._live_title(info_dict['title']) return info_dict From 6cdaaf703149f1d6f1d24cfdb5a538ca41d08a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 05:33:08 +0700 Subject: [PATCH 114/148] [svt] Improve (closes #15809) --- youtube_dl/extractor/svt.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index d01f85422..f71eab8b2 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -137,7 +137,7 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -179,6 +179,10 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) + def adjust_title(info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -189,8 +193,7 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - if info_dict['is_live']: - info_dict['title'] = self._live_title(info_dict['title']) + adjust_title(info_dict) return info_dict video_id = self._search_regex( @@ -206,8 +209,7 @@ class SVTPlayIE(SVTPlayBaseIE): info_dict['title'] = re.sub( r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) - if info_dict['is_live']: - info_dict['title'] = self._live_title(info_dict['title']) + adjust_title(info_dict) return info_dict From 3853309fe238bb709b7c5db261724c33b48a8693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 06:07:32 +0700 Subject: [PATCH 115/148] [youtube:feed] Implement lazy playlist extraction (closes #10184) --- youtube_dl/extractor/youtube.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 617be8e96..e9965509c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2699,10 +2699,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - + def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index ids = [] @@ -2713,12 +2710,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # 'recommended' feed has infinite 'load more' and each new portion spins # the same videos in (sometimes) slightly different order, so we'll check # for unicity and break when portion has no new videos - new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) if not new_ids: break ids.extend(new_ids) + for entry in self._ids_to_results(new_ids): + yield entry + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break @@ -2730,8 +2730,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] + def _real_extract(self, url): + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + self._PLAYLIST_TITLE) return self.playlist_result( - self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + self._entries(page), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): From 70d35d166c1cfb14af20fb6d45ed820b6249f941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Apr 2018 06:08:05 +0700 Subject: [PATCH 116/148] [youtube] Add ability to authenticate with cookies --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e9965509c..e7bd1f18f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -87,7 +87,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (username, password) = self._get_login_info() # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: + if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True From 2441c1aab152cd81d53f0a6fca982af9f8c8de10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 00:16:52 +0700 Subject: [PATCH 117/148] [breakcom] Fix extraction (closes #16254) --- youtube_dl/extractor/breakcom.py | 148 ++++++++++--------------------- 1 file changed, 47 insertions(+), 101 deletions(-) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 5a87c2661..70d16767f 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_age_limit, -) +from ..utils import int_or_none class BreakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<site>break|screenjunkies)\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { @@ -19,125 +17,73 @@ class BreakIE(InfoExtractor): 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', 'age_limit': 13, + }, + }, { + # youtube embed + 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', + 'info_dict': { + 'id': 'RrrDLdeL2HQ', + 'ext': 'mp4', + 'title': 'Whale Watching Boat Crashing Into San Diego Dock', + 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', + 'upload_date': '20160331', + 'uploader': 'Steve Holden', + 'uploader_id': 'sdholden07', + }, + 'params': { + 'skip_download': True, } - }, { - 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', - 'md5': '5c2b686bec3d43de42bde9ec047536b0', - 'info_dict': { - 'id': '2841915', - 'display_id': 'best-quentin-tarantino-movie', - 'ext': 'mp4', - 'title': 'Best Quentin Tarantino Movie', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3671, - 'age_limit': 13, - 'tags': list, - }, - }, { - 'url': 'http://www.screenjunkies.com/video/honest-trailers-the-dark-knight', - 'info_dict': { - 'id': '2348808', - 'display_id': 'honest-trailers-the-dark-knight', - 'ext': 'mp4', - 'title': 'Honest Trailers - The Dark Knight', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'age_limit': 10, - 'tags': list, - }, - }, { - # requires subscription but worked around - 'url': 'http://www.screenjunkies.com/video/knocking-dead-ep-1-the-show-so-far-3003285', - 'info_dict': { - 'id': '3003285', - 'display_id': 'knocking-dead-ep-1-the-show-so-far', - 'ext': 'mp4', - 'title': 'State of The Dead Recap: Knocking Dead Pilot', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3307, - 'age_limit': 13, - 'tags': list, - }, }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', 'only_matching': True, }] - _DEFAULT_BITRATES = (48, 150, 320, 496, 864, 2240, 3264) - def _real_extract(self, url): - site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() - if not video_id: - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'), - webpage, 'video id') + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage( - 'http://www.%s.com/embed/%s' % (site, video_id), - display_id, 'Downloading video embed page') - embed_vars = self._parse_json( + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + content = self._parse_json( self._search_regex( - r'(?s)embedVars\s*=\s*({.+?})\s*</script>', webpage, 'embed vars'), + r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, + 'content'), display_id) - youtube_id = embed_vars.get('youtubeId') - if youtube_id: - return self.url_result(youtube_id, 'Youtube') - - title = embed_vars['contentName'] - formats = [] - bitrates = [] - for f in embed_vars.get('media', []): - if not f.get('uri') or f.get('mediaPurpose') != 'play': + for video in content: + video_url = video.get('url') + if not video_url or not isinstance(video_url, compat_str): continue - bitrate = int_or_none(f.get('bitRate')) - if bitrate: - bitrates.append(bitrate) + bitrate = int_or_none(self._search_regex( + r'(\d+)_kbps', video_url, 'tbr', default=None)) formats.append({ - 'url': f['uri'], + 'url': video_url, 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), 'tbr': bitrate, - 'format': 'mp4', }) - - if not bitrates: - # When subscriptionLevel > 0, i.e. plus subscription is required - # media list will be empty. However, hds and hls uris are still - # available. We can grab them assuming bitrates to be default. - bitrates = self._DEFAULT_BITRATES - - auth_token = embed_vars.get('AuthToken') - - def construct_manifest_url(base_url, ext): - pieces = [base_url] - pieces.extend([compat_str(b) for b in bitrates]) - pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) - return ','.join(pieces) - - if bitrates and auth_token: - hds_url = embed_vars.get('hdsUri') - if hds_url: - formats.extend(self._extract_f4m_formats( - construct_manifest_url(hds_url, 'f4m'), - display_id, f4m_id='hds', fatal=False)) - hls_url = embed_vars.get('hlsUri') - if hls_url: - formats.extend(self._extract_m3u8_formats( - construct_manifest_url(hls_url, 'm3u8'), - display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) + title = self._search_regex( + (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') + + def get(key, name): + return int_or_none(self._search_regex( + r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, + default=None)) + + age_limit = get('ratings', 'age limit') + video_id = video_id or get('pid', 'video id') or display_id + return { 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': embed_vars.get('thumbUri'), - 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, - 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), - 'tags': embed_vars.get('tags', '').split(','), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': age_limit, 'formats': formats, } From af751350e8651f665333554fa13b335b073fa736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 02:50:11 +0700 Subject: [PATCH 118/148] [Makefile] Add support for pandoc 2 and disable smart extension (closes #16251) smart extension rewrites straight quotes as curly quotes, -- as en-dashes and so on that is unwanted behavior. --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fe247810f..4a62f44bc 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,9 @@ PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) +# set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2 +MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi) + install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) install -m 755 youtube-dl $(DESTDIR)$(BINDIR) @@ -82,11 +85,11 @@ supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md - pandoc -f markdown -t plain README.md -o README.txt + pandoc -f $(MARKDOWN) -t plain README.md -o README.txt youtube-dl.1: README.md $(PYTHON) devscripts/prepare_manpage.py youtube-dl.1.temp.md - pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 + pandoc -s -f $(MARKDOWN) -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in From 171625469ab1b2a4dc99ed173a10be45e7fc13d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 03:17:34 +0700 Subject: [PATCH 119/148] [etonline] Remove extractor (closes #16256) Covered by generic extractor --- youtube_dl/extractor/etonline.py | 39 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 40 deletions(-) delete mode 100644 youtube_dl/extractor/etonline.py diff --git a/youtube_dl/extractor/etonline.py b/youtube_dl/extractor/etonline.py deleted file mode 100644 index 17d7cfec6..000000000 --- a/youtube_dl/extractor/etonline.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class ETOnlineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?etonline\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.etonline.com/tv/211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale/', - 'info_dict': { - 'id': '211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale', - 'title': 'md5:a21ec7d3872ed98335cbd2a046f34ee6', - 'description': 'md5:8b94484063f463cca709617c79618ccd', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.etonline.com/media/video/here_are_the_stars_who_love_bringing_their_moms_as_dates_to_the_oscars-211359/', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911076001/default_default/index.html?videoId=ref:%s' - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % video_id, 'BrightcoveNew', video_id) - for video_id in re.findall( - r'site\.brightcove\s*\([^,]+,\s*["\'](title_\d+)', webpage)] - - return self.playlist_result( - entries, playlist_id, - self._og_search_title(webpage, fatal=False), - self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3570fa165..6fb65e4fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -326,7 +326,6 @@ from .espn import ( FiveThirtyEightIE, ) from .esri import EsriVideoIE -from .etonline import ETOnlineIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE From 99036a1298089068dcf80c0985bfcc3f8c24f281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Apr 2018 04:03:11 +0700 Subject: [PATCH 120/148] [pornflip] Relax _VALID_URL (closes #16258) --- youtube_dl/extractor/pornflip.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py index ee04936e1..025985fbc 100644 --- a/youtube_dl/extractor/pornflip.py +++ b/youtube_dl/extractor/pornflip.py @@ -14,7 +14,7 @@ from ..utils import ( class PornFlipIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[0-9A-Za-z-]{11})' + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', 'md5': '98c46639849145ae1fd77af532a9278c', @@ -40,6 +40,9 @@ class PornFlipIE(InfoExtractor): }, { 'url': 'https://www.pornflip.com/embed/EkRD6-vS2-s', 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/v/NG9q6Pb_iK8', + 'only_matching': True, }] def _real_extract(self, url): From 1cc47c667419e0eadc0a6989256ab7b276852adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Apr 2018 23:49:30 +0700 Subject: [PATCH 121/148] [utils] Fix match_str for boolean meta fields --- test/test_utils.py | 12 ++++++++++++ youtube_dl/utils.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a1fe6fdb2..253a7fe17 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1072,6 +1072,18 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str( 'like_count > 100 & dislike_count <? 50 & description', {'like_count': 190, 'dislike_count': 10})) + self.assertTrue(match_str('is_live', {'is_live': True})) + self.assertFalse(match_str('is_live', {'is_live': False})) + self.assertFalse(match_str('is_live', {'is_live': None})) + self.assertFalse(match_str('is_live', {})) + self.assertFalse(match_str('!is_live', {'is_live': True})) + self.assertTrue(match_str('!is_live', {'is_live': False})) + self.assertTrue(match_str('!is_live', {'is_live': None})) + self.assertTrue(match_str('!is_live', {})) + self.assertTrue(match_str('title', {'title': 'abc'})) + self.assertTrue(match_str('title', {'title': ''})) + self.assertFalse(match_str('!title', {'title': 'abc'})) + self.assertFalse(match_str('!title', {'title': ''})) def test_parse_dfxp_time_expr(self): self.assertEqual(parse_dfxp_time_expr(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 027d12785..574284e94 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2574,8 +2574,8 @@ def _match_one(filter_part, dct): return op(actual_value, comparison_value) UNARY_OPERATORS = { - '': lambda v: v is not None, - '!': lambda v: v is None, + '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), + '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), } operator_rex = re.compile(r'''(?x)\s* (?P<op>%s)\s*(?P<key>[a-z_]+) From 0ff51adae6feab7386874eddc0d61dbeaf063bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Apr 2018 23:53:01 +0700 Subject: [PATCH 122/148] [twitch] Extract is_live according to status (closes #16259) --- youtube_dl/extractor/twitch.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index f736283e9..4c11fd3c3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -168,6 +168,13 @@ class TwitchItemBaseIE(TwitchBaseIE): return self.playlist_result(entries, info['id'], info['title']) def _extract_info(self, info): + status = info.get('status') + if status == 'recording': + is_live = True + elif status == 'recorded': + is_live = False + else: + is_live = None return { 'id': info['_id'], 'title': info.get('title') or 'Untitled Broadcast', @@ -178,6 +185,7 @@ class TwitchItemBaseIE(TwitchBaseIE): 'uploader_id': info.get('channel', {}).get('name'), 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), + 'is_live': is_live, } def _real_extract(self, url): From 76030543cd5e2214c47aa82f03b3e2cec97e7bc1 Mon Sep 17 00:00:00 2001 From: Alexandre Macabies <Zopieux@users.noreply.github.com> Date: Tue, 24 Apr 2018 19:49:30 +0200 Subject: [PATCH 123/148] [openload] Recognize IPv6 stream URLs (closes #16137) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 650f95656..d0bdd60b8 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -340,7 +340,10 @@ class OpenloadIE(InfoExtractor): get_element_by_id('streamurj', webpage) or self._search_regex( (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)'), webpage, + r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', + r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, 'stream URL')) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From 5d0fe6d23e4407bee3caec33955d4cb410bebb5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 00:56:16 +0700 Subject: [PATCH 124/148] Credit @Zopieux for #16250 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 6223212aa..880e0abee 100644 --- a/AUTHORS +++ b/AUTHORS @@ -236,3 +236,4 @@ Lei Wang Petr Novák Leonardo Taccari Martin Weinelt +Alexandre Macabies From 95284bc281d8aa3b1d6863ccb536da9d4cf6433c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:01:06 +0700 Subject: [PATCH 125/148] Credit @TingPing for picarto (#15551) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 880e0abee..812051796 100644 --- a/AUTHORS +++ b/AUTHORS @@ -236,4 +236,5 @@ Lei Wang Petr Novák Leonardo Taccari Martin Weinelt +TingPing Alexandre Macabies From ecb24f7c081b764dd669cb4b277d8c14e55b2a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:02:28 +0700 Subject: [PATCH 126/148] Credit @f2face for #16115 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 812051796..eaf96d79d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -236,5 +236,6 @@ Lei Wang Petr Novák Leonardo Taccari Martin Weinelt +Surya Oktafendri TingPing Alexandre Macabies From e028d4f506562a1febf76277795305e296823ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:03:42 +0700 Subject: [PATCH 127/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ChangeLog b/ChangeLog index 185fa1753..a731fde29 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version <unreleased> + +Core +* [utils] Fix match_str for boolean meta fields ++ [Makefile] Add support for pandoc 2 and disable smart extension (#16251) +* [YoutubeDL] Fix typo in media extension compatibility checker (#16215) + +Extractors ++ [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, + #16250) ++ [twitch] Extract is_live according to status (#16259) +* [pornflip] Relax URL regular expression (#16258) +- [etonline] Remove extractor (#16256) +* [breakcom] Fix extraction (#16254) ++ [youtube] Add ability to authenticate with cookies +* [youtube:feed] Implement lazy playlist extraction (#10184) ++ [svt] Add support for TV channel live streams (#15279, #15809) +* [ccma] Fix video extraction (#15931) +* [rentv] Fix extraction (#15227) ++ [nick] Add support for nickjr.nl (#16230) +* [extremetube] Fix metadata extraction ++ [keezmovies] Add support for generic embeds (#16134, #16154) +* [nexx] Extract new azure URLs (#16223) +* [cbssports] Fix extraction (#16217) +* [kaltura] Improve embeds detection (#16201) +* [instagram:user] Fix extraction (#16119) +* [cbs] Skip DRM asset types (#16104) + + version 2018.04.16 Extractors From b5802d69f511481a87d8604fa1577bca8370cab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 01:12:40 +0700 Subject: [PATCH 128/148] release 2018.04.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 69f996179..252fa0adf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.16*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.16 +[debug] youtube-dl version 2018.04.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a731fde29..4a3df67df 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.04.25 Core * [utils] Fix match_str for boolean meta fields diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 715d16cfe..a110f687b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -257,7 +257,6 @@ - **ESPN** - **ESPNArticle** - **EsriVideo** - - **ETOnline** - **Europa** - **EveryonesMixtape** - **ExpoTV** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5aefdd0a2..4e3cb39c6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.16' +__version__ = '2018.04.25' From d3711b00502d9104a3697aba5d210a25066ca756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Apr 2018 02:14:27 +0700 Subject: [PATCH 129/148] [devscripts/gh-pages/generate-download.py] Use program checksum from versions.json --- devscripts/gh-pages/generate-download.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/devscripts/gh-pages/generate-download.py b/devscripts/gh-pages/generate-download.py index fcd7e1dff..a873d32ee 100755 --- a/devscripts/gh-pages/generate-download.py +++ b/devscripts/gh-pages/generate-download.py @@ -1,27 +1,22 @@ #!/usr/bin/env python3 from __future__ import unicode_literals -import hashlib -import urllib.request import json versions_info = json.load(open('update/versions.json')) version = versions_info['latest'] -URL = versions_info['versions'][version]['bin'][0] - -data = urllib.request.urlopen(URL).read() +version_dict = versions_info['versions'][version] # Read template page with open('download.html.in', 'r', encoding='utf-8') as tmplf: template = tmplf.read() -sha256sum = hashlib.sha256(data).hexdigest() template = template.replace('@PROGRAM_VERSION@', version) -template = template.replace('@PROGRAM_URL@', URL) -template = template.replace('@PROGRAM_SHA256SUM@', sha256sum) -template = template.replace('@EXE_URL@', versions_info['versions'][version]['exe'][0]) -template = template.replace('@EXE_SHA256SUM@', versions_info['versions'][version]['exe'][1]) -template = template.replace('@TAR_URL@', versions_info['versions'][version]['tar'][0]) -template = template.replace('@TAR_SHA256SUM@', versions_info['versions'][version]['tar'][1]) +template = template.replace('@PROGRAM_URL@', version_dict['bin'][0]) +template = template.replace('@PROGRAM_SHA256SUM@', version_dict['bin'][1]) +template = template.replace('@EXE_URL@', version_dict['exe'][0]) +template = template.replace('@EXE_SHA256SUM@', version_dict['exe'][1]) +template = template.replace('@TAR_URL@', version_dict['tar'][0]) +template = template.replace('@TAR_SHA256SUM@', version_dict['tar'][1]) with open('download.html', 'w', encoding='utf-8') as dlf: dlf.write(template) From c84eae4f66be8a22c14b852bdb01773bb3807239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 27 Apr 2018 03:45:52 +0700 Subject: [PATCH 130/148] [funk:channel] Improve extraction (closes #16285) --- youtube_dl/extractor/funk.py | 51 ++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index faea6576f..0ff058619 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -5,7 +5,10 @@ import re from .common import InfoExtractor from .nexx import NexxIE -from ..utils import int_or_none +from ..utils import ( + int_or_none, + try_get, +) class FunkBaseIE(InfoExtractor): @@ -77,6 +80,20 @@ class FunkChannelIE(FunkBaseIE): 'params': { 'skip_download': True, }, + }, { + # only available via byIdList API + 'url': 'https://www.funk.net/channel/informr/martin-sonneborn-erklaert-die-eu', + 'info_dict': { + 'id': '205067', + 'ext': 'mp4', + 'title': 'Martin Sonneborn erklärt die EU', + 'description': 'md5:050f74626e4ed87edf4626d2024210c0', + 'timestamp': 1494424042, + 'upload_date': '20170510', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.funk.net/channel/59d5149841dca100012511e3/mein-erster-job-lovemilla-folge-1/lovemilla/', 'only_matching': True, @@ -87,16 +104,28 @@ class FunkChannelIE(FunkBaseIE): channel_id = mobj.group('id') alias = mobj.group('alias') - results = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, - headers={ - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', - 'Referer': url, - }, query={ - 'channelId': channel_id, - 'size': 100, - })['result'] + headers = { + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', + 'Referer': url, + } - video = next(r for r in results if r.get('alias') == alias) + video = None + + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, + headers=headers, query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) + + if not video: + results = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, + headers=headers, query={ + 'channelId': channel_id, + 'size': 100, + })['result'] + video = next(r for r in results if r.get('alias') == alias) return self._make_url_result(video) From 0fe7783eced5c62dbd95780c2150fd1080bd3927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 01:59:15 +0700 Subject: [PATCH 131/148] [extractor/common] Add _download_json_handle --- youtube_dl/extractor/common.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 59b9d3739..e0c3c8eb0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -682,18 +682,30 @@ class InfoExtractor(object): else: self.report_warning(errmsg + str(ve)) - def _download_json(self, url_or_request, video_id, - note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', - transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - json_string = self._download_webpage( + def _download_json_handle( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + """Return a tuple (JSON object, URL handle)""" + res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) - if (not fatal) and json_string is False: - return None + if res is False: + return res + json_string, urlh = res return self._parse_json( - json_string, video_id, transform_source=transform_source, fatal=fatal) + json_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_json( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + res = self._download_json_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query) + return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: From 6cc622327ff8289f94894f3695ed31014c61cf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 02:47:17 +0700 Subject: [PATCH 132/148] [utils] Introduce merge_dicts --- test/test_utils.py | 12 ++++++++++++ youtube_dl/extractor/generic.py | 16 +--------------- youtube_dl/utils.py | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 253a7fe17..14503ab53 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -42,6 +42,7 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + merge_dicts, mimetype2ext, month_by_name, multipart_encode, @@ -669,6 +670,17 @@ class TestUtil(unittest.TestCase): self.assertEqual(dict_get(d, ('b', 'c', key, )), None) self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value) + def test_merge_dicts(self): + self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2}) + self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {'a': None}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {'a': ''}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {}), {'a': 1}) + self.assertEqual(merge_dicts({'a': None}, {'a': 1}), {'a': 1}) + self.assertEqual(merge_dicts({'a': ''}, {'a': 1}), {'a': ''}) + self.assertEqual(merge_dicts({'a': ''}, {'a': 'abc'}), {'a': 'abc'}) + self.assertEqual(merge_dicts({'a': None}, {'a': ''}, {'a': 'abc'}), {'a': 'abc'}) + def test_encode_compat_str(self): self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест') self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index af1322e00..d48914495 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -23,6 +23,7 @@ from ..utils import ( is_html, js_to_json, KNOWN_EXTENSIONS, + merge_dicts, mimetype2ext, orderedSet, sanitized_Request, @@ -3002,21 +3003,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( sharevideos_urls, video_id, video_title) - def merge_dicts(dict1, dict2): - merged = {} - for k, v in dict1.items(): - if v is not None: - merged[k] = v - for k, v in dict2.items(): - if v is None: - continue - if (k not in merged or - (isinstance(v, compat_str) and v and - isinstance(merged[k], compat_str) and - not merged[k])): - merged[k] = v - return merged - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 574284e94..b460393bf 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2225,6 +2225,20 @@ def try_get(src, getter, expected_type=None): return v +def merge_dicts(*dicts): + merged = {} + for a_dict in dicts: + for k, v in a_dict.items(): + if v is None: + continue + if (k not in merged or + (isinstance(v, compat_str) and v and + isinstance(merged[k], compat_str) and + not merged[k])): + merged[k] = v + return merged + + def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) From e7e4a6e0f9166cee82c165ca69a6a3c94ddc5f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 02:48:03 +0700 Subject: [PATCH 133/148] [extractor/common] Extract interaction statistic --- youtube_dl/extractor/common.py | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e0c3c8eb0..a9939b0fd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1020,6 +1020,40 @@ class InfoExtractor(object): if isinstance(json_ld, dict): json_ld = [json_ld] + INTERACTION_TYPE_MAP = { + 'CommentAction': 'comment', + 'AgreeAction': 'like', + 'DisagreeAction': 'dislike', + 'LikeAction': 'like', + 'DislikeAction': 'dislike', + 'ListenAction': 'view', + 'WatchAction': 'view', + 'ViewAction': 'view', + } + + def extract_interaction_statistic(e): + interaction_statistic = e.get('interactionStatistic') + if not isinstance(interaction_statistic, list): + return + for is_e in interaction_statistic: + if not isinstance(is_e, dict): + continue + if is_e.get('@type') != 'InteractionCounter': + continue + interaction_type = is_e.get('interactionType') + if not isinstance(interaction_type, compat_str): + continue + interaction_count = int_or_none(is_e.get('userInteractionCount')) + if interaction_count is None: + continue + count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) + if not count_kind: + continue + count_key = '%s_count' % count_kind + if info.get(count_key) is not None: + continue + info[count_key] = interaction_count + def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ @@ -1035,6 +1069,7 @@ class InfoExtractor(object): 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), }) + extract_interaction_statistic(e) for e in json_ld: if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): From ae1c585cee3eb183cddf7c30a09b75d887307dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 02:48:20 +0700 Subject: [PATCH 134/148] [vimeo] Extract JSON LD (closes #16295) --- youtube_dl/extractor/vimeo.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 08257147e..a026526b2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,6 +16,7 @@ from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, + merge_dicts, NO_DEFAULT, RegexNotFoundError, sanitized_Request, @@ -639,16 +640,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'preference': 1, }) - info_dict = self._parse_config(config, video_id) - formats.extend(info_dict['formats']) + info_dict_config = self._parse_config(config, video_id) + formats.extend(info_dict_config['formats']) self._vimeo_sort_formats(formats) + json_ld = self._search_json_ld(webpage, video_id, default={}) + if not cc_license: cc_license = self._search_regex( r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - info_dict.update({ + info_dict = { 'id': video_id, 'formats': formats, 'timestamp': unified_timestamp(timestamp), @@ -658,7 +661,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': like_count, 'comment_count': comment_count, 'license': cc_license, - }) + } + + info_dict = merge_dicts(info_dict, info_dict_config, json_ld) return info_dict From 7dd6ab4a47b08beafe45befa29c44df2db00547e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Apr 2018 04:51:39 +0700 Subject: [PATCH 135/148] [imdb] Extract all formats (closes #16249) --- youtube_dl/extractor/imdb.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 3ff672a89..425421968 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, mimetype2ext, qualities, remove_end, @@ -73,19 +75,25 @@ class ImdbIE(InfoExtractor): video_info_list = format_info.get('videoInfoList') if not video_info_list or not isinstance(video_info_list, list): continue - video_info = video_info_list[0] - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url: - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + for video_info in video_info_list: + if not video_info or not isinstance(video_info, dict): + continue + video_url = video_info.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if (video_info.get('videoMimeType') == 'application/x-mpegURL' or + determine_ext(video_url) == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + format_id = format_info.get('ffname') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': mimetype2ext(video_info.get('videoMimeType')), + 'quality': quality(format_id), + }) self._sort_formats(formats) return { From 500a86a52ee46a3a1acc864b602b74d141afdc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 00:33:31 +0700 Subject: [PATCH 136/148] [downloader/fragment] Restart download if .ytdl file is corrupt (closes #16312) --- youtube_dl/downloader/fragment.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 927c7e491..917f6dc01 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -74,9 +74,14 @@ class FragmentFD(FileDownloader): return not ctx['live'] and not ctx['tmpfilename'] == '-' def _read_ytdl_file(self, ctx): + assert 'ytdl_corrupt' not in ctx stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') - ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] - stream.close() + try: + ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] + except Exception: + ctx['ytdl_corrupt'] = True + finally: + stream.close() def _write_ytdl_file(self, ctx): frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') @@ -158,11 +163,17 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) - if ctx['fragment_index'] > 0 and resume_len == 0: + is_corrupt = ctx.get('ytdl_corrupt') is True + is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 + if is_corrupt or is_inconsistent: + message = ( + '.ytdl file is corrupt' if is_corrupt else + 'Inconsistent state of incomplete fragment download') self.report_warning( - 'Inconsistent state of incomplete fragment download. ' - 'Restarting from the beginning...') + '%s. Restarting from the beginning...' % message) ctx['fragment_index'] = resume_len = 0 + if 'ytdl_corrupt' in ctx: + del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) From 106c8c3edbc5b7e95cfba79ddc6252fad0adb859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 19:04:40 +0700 Subject: [PATCH 137/148] [nrktv] Update API host (closes #16324) --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 18ead9426..3b4f51f61 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-ne.nrk.no' + _API_HOST = 'psapi-we.nrk.no' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', From 12b0d4e0e1df6d6a8b9ce10b9a69013497adc2b0 Mon Sep 17 00:00:00 2001 From: Meneth32 <meneth@hotmail.com> Date: Sun, 29 Apr 2018 16:59:40 +0200 Subject: [PATCH 138/148] [redditr] Add support for old.reddit.com URLs --- youtube_dl/extractor/reddit.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 53b1c967e..8372925be 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -47,7 +47,7 @@ class RedditIE(InfoExtractor): class RedditRIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' + _VALID_URL = r'(?P<url>https?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -74,6 +74,10 @@ class RedditRIE(InfoExtractor): # imgur 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, }, { # streamable 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', From 01aec8488084e62aa188b5167e57d01ef66cd256 Mon Sep 17 00:00:00 2001 From: Bastian de Groot <bastiandg@users.noreply.github.com> Date: Sun, 29 Apr 2018 17:14:37 +0200 Subject: [PATCH 139/148] [generic] Prefer enclosures over links in RSS feeds --- youtube_dl/extractor/generic.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d48914495..252f97c26 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -191,6 +191,16 @@ class GenericIE(InfoExtractor): 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, + # RSS feed with enclosures and unsupported link URLs + { + 'url': 'http://www.hellointernet.fm/podcast?format=rss', + 'info_dict': { + 'id': 'http://www.hellointernet.fm/podcast?format=rss', + 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', + 'title': 'Hello Internet', + }, + 'playlist_mincount': 100, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -2026,13 +2036,15 @@ class GenericIE(InfoExtractor): entries = [] for it in doc.findall('./channel/item'): - next_url = xpath_text(it, 'link', fatal=False) + next_url = None + enclosure_nodes = it.findall('./enclosure') + for e in enclosure_nodes: + next_url = e.attrib.get('url') + if next_url: + break + if not next_url: - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break + next_url = xpath_text(it, 'link', fatal=False) if not next_url: continue From 30226342ab346263b684170c4ce7d5266fec212e Mon Sep 17 00:00:00 2001 From: Niklas Haas <git@haasn.xyz> Date: Sun, 29 Apr 2018 11:23:23 +0200 Subject: [PATCH 140/148] [youtube] Correctly disable polymer on all requests Rather than just the one that use the _download_webpage helper. The need for this was made apparent by 0fe7783e, which refactored _download_json in a way that completely avoids the use of _download_webpage, thus breaking youtube. Fixes #16323 --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e7bd1f18f..04aeb91af 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -246,9 +246,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True - def _download_webpage(self, *args, **kwargs): + def _download_webpage_handle(self, *args, **kwargs): kwargs.setdefault('query', {})['disable_polymer'] = 'true' - return super(YoutubeBaseInfoExtractor, self)._download_webpage( + return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) def _real_initialize(self): From e5eadfa82f10bda43294d1da85024eec29c7973f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 22:49:47 +0700 Subject: [PATCH 141/148] [udemy,xiami,yandexmusic] Override _download_webpage_handle instead of _download_webpage --- youtube_dl/extractor/udemy.py | 4 ++-- youtube_dl/extractor/xiami.py | 4 ++-- youtube_dl/extractor/yandexmusic.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 6d6c0a98f..439ed2a89 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -115,9 +115,9 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) - def _download_webpage(self, *args, **kwargs): + def _download_webpage_handle(self, *args, **kwargs): kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' - return super(UdemyIE, self)._download_webpage( + return super(UdemyIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) def _download_json(self, url_or_request, *args, **kwargs): diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index 7f871c8ec..8333fb534 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -9,8 +9,8 @@ from ..utils import int_or_none class XiamiBaseIE(InfoExtractor): _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' - def _download_webpage(self, *args, **kwargs): - webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') return webpage diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index eb1062142..e85eca073 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -34,8 +34,8 @@ class YandexMusicBaseIE(InfoExtractor): 'youtube-dl with --cookies', expected=True) - def _download_webpage(self, *args, **kwargs): - webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: self._raise_captcha() return webpage From 796bf9de45d6f01bf2d34ae22e1eacdc1a649fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Apr 2018 22:56:07 +0700 Subject: [PATCH 142/148] [yandexmusic] Convert release_year to int --- youtube_dl/extractor/yandexmusic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index e85eca073..009203851 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -57,14 +57,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1', + 'title': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, 'track': 'Gypsy Eyes 1', 'album': 'Gypsy Soul', 'album_artist': 'Carlo Ambrosio', - 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', - 'release_year': '2009', + 'artist': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari', + 'release_year': 2009, }, 'skip': 'Travis CI servers blocked by YandexMusic', } @@ -120,7 +120,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): track_info.update({ 'album': album.get('title'), 'album_artist': extract_artist(album.get('artists')), - 'release_year': compat_str(year) if year else None, + 'release_year': int_or_none(year), }) track_artist = extract_artist(track.get('artists')) From 4a733545867a014eb786348f8fb9e6ae95850742 Mon Sep 17 00:00:00 2001 From: Alex Seiler <seileralex@gmail.com> Date: Sun, 5 Nov 2017 18:07:35 +0100 Subject: [PATCH 143/148] [zattoo] Add extractor (closes #14668) --- youtube_dl/extractor/extractors.py | 6 + youtube_dl/extractor/zattoo.py | 234 +++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 youtube_dl/extractor/zattoo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6fb65e4fe..9fe3f649d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1418,5 +1418,11 @@ from .youtube import ( ) from .zapiks import ZapiksIE from .zaq1 import Zaq1IE +from .zattoo import ( + QuicklineIE, + QuicklineLiveIE, + ZattooIE, + ZattooLiveIE, +) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py new file mode 100644 index 000000000..928f22566 --- /dev/null +++ b/youtube_dl/extractor/zattoo.py @@ -0,0 +1,234 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from uuid import uuid4 +import re + +from .common import InfoExtractor +from ..utils import ( + compat_str, + ExtractorError, + sanitized_Request, + urlencode_postdata, +) + + +class ZattooBaseIE(InfoExtractor): + + _NETRC_MACHINE = 'zattoo' + _HOST_URL = 'https://zattoo.com' + + _power_guide_hash = None + + def _login(self, uuid, session_id): + (username, password) = self._get_login_info() + if not username or not password: + raise ExtractorError( + 'A valid %s account is needed to access this media.' % self._NETRC_MACHINE, + expected=True) + login_form = { + 'login': username, + 'password': password, + 'remember': True, + } + request = sanitized_Request( + '%s/zapi/v2/account/login' % self._HOST_URL, + urlencode_postdata(login_form)) + request.add_header( + 'Referer', '%s/login' % self._HOST_URL) + request.add_header( + 'Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') + request.add_header( + 'Cookie', 'uuid=%s; beaker.session.id=%s' % (uuid, session_id)) + response = self._request_webpage( + request, None, 'Logging in') + data = self._parse_json(response.read(), None) + return data['session']['power_guide_hash'] + + def _get_app_token_and_version(self): + host_webpage = self._download_webpage( + self._HOST_URL, None, 'Downloading %s' % self._HOST_URL) + app_token = self._html_search_regex( + r'<script.+window\.appToken\s*=\s*\'(.+)\'', host_webpage, 'app token') + app_version = self._html_search_regex( + r'<!--\w+-(.+?)-', host_webpage, 'app version', default='2.8.2') + return app_token, app_version + + def _say_hello(self, uuid, app_token, app_version): + postdata = { + 'client_app_token': app_token, + 'uuid': uuid, + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + } + request = sanitized_Request( + '%s/zapi/v2/session/hello' % self._HOST_URL, + urlencode_postdata(postdata)) + response = self._request_webpage( + request, None, 'Say hello') + + cookie = response.headers.get('Set-Cookie') + session_id = self._search_regex( + r'beaker\.session\.id\s*=\s*(.+?);', cookie, 'session id') + return session_id + + def _extract_cid(self, video_id, channel_name): + channel_groups = self._download_json( + '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, + self._power_guide_hash), + video_id, + 'Downloading available channel list', + query={'details': False})['channel_groups'] + channel_list = [] + for chgrp in channel_groups: + channel_list.extend(chgrp['channels']) + try: + return next( + chan['cid'] for chan in channel_list + if chan['display_alias'] == channel_name or chan['cid'] == channel_name) + except StopIteration: + raise ExtractorError('Could not extract channel id') + + def _extract_cid_and_video_info(self, video_id): + data = self._download_json( + '%s/zapi/program/details' % self._HOST_URL, + video_id, + 'Downloading video information', + query={ + 'program_id': video_id, + 'complete': True + }) + + info_dict = { + 'id': video_id, + 'title': data['program']['title'], + 'description': data['program'].get('description'), + 'thumbnail': data['program'].get('image_url') + } + cid = data['program']['cid'] + return cid, info_dict + + def _extract_formats(self, cid, video_id, record_id=None, is_live=False): + postdata = { + 'stream_type': 'dash', + 'https_watch_urls': True, + } + if record_id: + url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) + else: + url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) + + if is_live: + postdata.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + + data = self._download_json( + sanitized_Request(url, urlencode_postdata(postdata)), + video_id, 'Downloading dash formats') + + formats = [] + for elem in data['stream']['watch_urls']: + audio_channel = elem.get('audio_channel') + maxrate = elem.get('maxrate') + formats.extend( + self._extract_mpd_formats( + elem['url'], video_id, + mpd_id='dash-maxrate-%s-channel-%s' % (maxrate, audio_channel), fatal=False)) + + postdata.update({'stream_type': 'hls'}) + request = sanitized_Request( + url, urlencode_postdata(postdata)) + data = self._download_json( + request, video_id, 'Downloading hls formats') + for elem in data['stream']['watch_urls']: + audio_channel = elem.get('audio_channel') + preference = None + + # Prefer audio channel A: + if audio_channel == 'A': + preference = 1 + + maxrate = elem.get('maxrate') + formats.extend( + self._extract_m3u8_formats( + elem['url'], video_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, + m3u8_id='hls-maxrate-%s-channel-%s' % (maxrate, audio_channel), + fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_initialize(self): + uuid = compat_str(uuid4()) + app_token, app_version = self._get_app_token_and_version() + session_id = self._say_hello(uuid, app_token, app_version) + self._power_guide_hash = self._login(uuid, session_id) + + def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): + if is_live: + cid = self._extract_cid(video_id, channel_name) + info_dict = { + 'id': channel_name, + 'title': self._live_title(channel_name), + 'is_live': True, + } + else: + cid, info_dict = self._extract_cid_and_video_info(video_id) + formats = self._extract_formats( + cid, video_id, record_id=record_id, is_live=is_live) + info_dict['formats'] = formats + return info_dict + + +class QuicklineBaseIE(ZattooBaseIE): + _NETRC_MACHINE = 'quickline' + _HOST_URL = 'https://mobiltv.quickline.com' + + +class QuicklineIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + + def _real_extract(self, url): + channel_name, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id) + + +class QuicklineLiveIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)$' + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class ZattooIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + + # Since regular videos are only available for 7 days and recorded videos + # are only available for a specific user, we cannot have detailed tests. + _TESTS = [{ + 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id, record_id) + + +class ZattooLiveIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)$' + + _TEST = { + 'url': 'https://zattoo.com/watch/srf1', + 'only_matching': True, + } + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) From 67ca1a8ef7ea6094e1e34518b93cdb5ba59f31b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 01:48:21 +0700 Subject: [PATCH 144/148] [zattoo] Improve and simplify (closes #14676) --- youtube_dl/extractor/zattoo.py | 238 +++++++++++++++++++-------------- 1 file changed, 137 insertions(+), 101 deletions(-) diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 928f22566..773073d85 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -1,84 +1,82 @@ # coding: utf-8 from __future__ import unicode_literals -from uuid import uuid4 import re +from uuid import uuid4 from .common import InfoExtractor -from ..utils import ( +from ..compat import ( + compat_HTTPError, compat_str, +) +from ..utils import ( ExtractorError, - sanitized_Request, + int_or_none, + try_get, urlencode_postdata, ) class ZattooBaseIE(InfoExtractor): - _NETRC_MACHINE = 'zattoo' _HOST_URL = 'https://zattoo.com' _power_guide_hash = None - def _login(self, uuid, session_id): + def _login(self): (username, password) = self._get_login_info() if not username or not password: - raise ExtractorError( - 'A valid %s account is needed to access this media.' % self._NETRC_MACHINE, - expected=True) - login_form = { - 'login': username, - 'password': password, - 'remember': True, - } - request = sanitized_Request( - '%s/zapi/v2/account/login' % self._HOST_URL, - urlencode_postdata(login_form)) - request.add_header( - 'Referer', '%s/login' % self._HOST_URL) - request.add_header( - 'Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') - request.add_header( - 'Cookie', 'uuid=%s; beaker.session.id=%s' % (uuid, session_id)) - response = self._request_webpage( - request, None, 'Logging in') - data = self._parse_json(response.read(), None) - return data['session']['power_guide_hash'] + self.raise_login_required( + 'A valid %s account is needed to access this media.' + % self._NETRC_MACHINE) - def _get_app_token_and_version(self): - host_webpage = self._download_webpage( - self._HOST_URL, None, 'Downloading %s' % self._HOST_URL) + try: + data = self._download_json( + '%s/zapi/v2/account/login' % self._HOST_URL, None, 'Logging in', + data=urlencode_postdata({ + 'login': username, + 'password': password, + 'remember': 'true', + }), headers={ + 'Referer': '%s/login' % self._HOST_URL, + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError( + 'Unable to login: incorrect username and/or password', + expected=True) + raise + + self._power_guide_hash = data['session']['power_guide_hash'] + + def _real_initialize(self): + webpage = self._download_webpage( + self._HOST_URL, None, 'Downloading app token') app_token = self._html_search_regex( - r'<script.+window\.appToken\s*=\s*\'(.+)\'', host_webpage, 'app token') + r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', + webpage, 'app token', group='token') app_version = self._html_search_regex( - r'<!--\w+-(.+?)-', host_webpage, 'app version', default='2.8.2') - return app_token, app_version + r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') - def _say_hello(self, uuid, app_token, app_version): - postdata = { - 'client_app_token': app_token, - 'uuid': uuid, - 'lang': 'en', - 'app_version': app_version, - 'format': 'json', - } - request = sanitized_Request( - '%s/zapi/v2/session/hello' % self._HOST_URL, - urlencode_postdata(postdata)) - response = self._request_webpage( - request, None, 'Say hello') + # Will setup appropriate cookies + self._request_webpage( + '%s/zapi/v2/session/hello' % self._HOST_URL, None, + 'Opening session', data=urlencode_postdata({ + 'client_app_token': app_token, + 'uuid': compat_str(uuid4()), + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + })) - cookie = response.headers.get('Set-Cookie') - session_id = self._search_regex( - r'beaker\.session\.id\s*=\s*(.+?);', cookie, 'session id') - return session_id + self._login() def _extract_cid(self, video_id, channel_name): channel_groups = self._download_json( '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, self._power_guide_hash), - video_id, - 'Downloading available channel list', + video_id, 'Downloading channel list', query={'details': False})['channel_groups'] channel_list = [] for chgrp in channel_groups: @@ -86,7 +84,9 @@ class ZattooBaseIE(InfoExtractor): try: return next( chan['cid'] for chan in channel_list - if chan['display_alias'] == channel_name or chan['cid'] == channel_name) + if chan.get('cid') and ( + chan.get('display_alias') == channel_name or + chan.get('cid') == channel_name)) except StopIteration: raise ExtractorError('Could not extract channel id') @@ -100,72 +100,90 @@ class ZattooBaseIE(InfoExtractor): 'complete': True }) + p = data['program'] + cid = p['cid'] + info_dict = { 'id': video_id, - 'title': data['program']['title'], - 'description': data['program'].get('description'), - 'thumbnail': data['program'].get('image_url') + 'title': p.get('title') or p['episode_title'], + 'description': p.get('description'), + 'thumbnail': p.get('image_url'), + 'creator': p.get('channel_name'), + 'episode': p.get('episode_title'), + 'episode_number': int_or_none(p.get('episode_number')), + 'season_number': int_or_none(p.get('season_number')), + 'release_year': int_or_none(p.get('year')), + 'categories': try_get(p, lambda x: x['categories'], list), } - cid = data['program']['cid'] + return cid, info_dict def _extract_formats(self, cid, video_id, record_id=None, is_live=False): - postdata = { - 'stream_type': 'dash', + postdata_common = { 'https_watch_urls': True, } - if record_id: + + if is_live: + postdata_common.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + elif record_id: url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) else: url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) - if is_live: - postdata.update({'timeshift': 10800}) - url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) - - data = self._download_json( - sanitized_Request(url, urlencode_postdata(postdata)), - video_id, 'Downloading dash formats') - formats = [] - for elem in data['stream']['watch_urls']: - audio_channel = elem.get('audio_channel') - maxrate = elem.get('maxrate') - formats.extend( - self._extract_mpd_formats( - elem['url'], video_id, - mpd_id='dash-maxrate-%s-channel-%s' % (maxrate, audio_channel), fatal=False)) + for stream_type in ('dash', 'hls', 'hls5', 'hds'): + postdata = postdata_common.copy() + postdata['stream_type'] = stream_type - postdata.update({'stream_type': 'hls'}) - request = sanitized_Request( - url, urlencode_postdata(postdata)) - data = self._download_json( - request, video_id, 'Downloading hls formats') - for elem in data['stream']['watch_urls']: - audio_channel = elem.get('audio_channel') - preference = None + data = self._download_json( + url, video_id, 'Downloading %s formats' % stream_type.upper(), + data=urlencode_postdata(postdata), fatal=False) + if not data: + continue - # Prefer audio channel A: - if audio_channel == 'A': - preference = 1 - - maxrate = elem.get('maxrate') - formats.extend( - self._extract_m3u8_formats( - elem['url'], video_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, - m3u8_id='hls-maxrate-%s-channel-%s' % (maxrate, audio_channel), - fatal=False)) + watch_urls = try_get( + data, lambda x: x['stream']['watch_urls'], list) + if not watch_urls: + continue + for watch in watch_urls: + if not isinstance(watch, dict): + continue + watch_url = watch.get('url') + if not watch_url or not isinstance(watch_url, compat_str): + continue + format_id_list = [stream_type] + maxrate = watch.get('maxrate') + if maxrate: + format_id_list.append(compat_str(maxrate)) + audio_channel = watch.get('audio_channel') + if audio_channel: + format_id_list.append(compat_str(audio_channel)) + preference = 1 if audio_channel == 'A' else None + format_id = '-'.join(format_id_list) + if stream_type in ('dash', 'dash_widevine', 'dash_playready'): + this_formats = self._extract_mpd_formats( + watch_url, video_id, mpd_id=format_id, fatal=False) + elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): + this_formats = self._extract_m3u8_formats( + watch_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False) + elif stream_type == 'hds': + this_formats = self._extract_f4m_formats( + watch_url, video_id, f4m_id=format_id, fatal=False) + elif stream_type == 'smooth_playready': + this_formats = self._extract_ism_formats( + watch_url, video_id, ism_id=format_id, fatal=False) + else: + assert False + for this_format in this_formats: + this_format['preference'] = preference + formats.extend(this_formats) self._sort_formats(formats) return formats - def _real_initialize(self): - uuid = compat_str(uuid4()) - app_token, app_version = self._get_app_token_and_version() - session_id = self._say_hello(uuid, app_token, app_version) - self._power_guide_hash = self._login(uuid, session_id) - def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): if is_live: cid = self._extract_cid(video_id, channel_name) @@ -190,13 +208,27 @@ class QuicklineBaseIE(ZattooBaseIE): class QuicklineIE(QuicklineBaseIE): _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + } + def _real_extract(self, url): channel_name, video_id = re.match(self._VALID_URL, url).groups() return self._extract_video(channel_name, video_id) class QuicklineLiveIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) def _real_extract(self, url): channel_name = video_id = self._match_id(url) @@ -222,13 +254,17 @@ class ZattooIE(ZattooBaseIE): class ZattooLiveIE(ZattooBaseIE): - _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' _TEST = { 'url': 'https://zattoo.com/watch/srf1', 'only_matching': True, } + @classmethod + def suitable(cls, url): + return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) + def _real_extract(self, url): channel_name = video_id = self._match_id(url) return self._extract_video(channel_name, video_id, is_live=True) From 851396346803f77ab9573af56cae056aa904cf93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 02:15:43 +0700 Subject: [PATCH 145/148] [udemy] Extract outputs renditions (closes #16289, closes #16291, closes #16320, closes #16321, closes #16334, closes #16335) --- youtube_dl/extractor/udemy.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 439ed2a89..bf1134e3f 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -58,6 +58,10 @@ class UdemyIE(InfoExtractor): # no url in outputs format entry 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', 'only_matching': True, + }, { + # only outputs rendition + 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): @@ -357,6 +361,12 @@ class UdemyIE(InfoExtractor): fatal=False) extract_subtitles(text_tracks) + if not formats and outputs: + for format_id, output in outputs.items(): + f = extract_output_format(output, format_id) + if f.get('url'): + formats.append(f) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { From c21692fa94df49ef925c06c00e5db1d8bb0f770d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 03:09:04 +0700 Subject: [PATCH 146/148] [kaltura] Improve iframe embeds detection (closes #16337) --- youtube_dl/extractor/generic.py | 17 +++++++++++++++++ youtube_dl/extractor/kaltura.py | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 252f97c26..73980601c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1282,6 +1282,23 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura iframe embed, more sophisticated + 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', + 'info_dict': { + 'id': '1_9gzouybz', + 'ext': 'mp4', + 'title': 'lecture-05sep2017', + 'description': 'md5:40f347d91fd4ba047e511c5321064b49', + 'upload_date': '20170913', + 'uploader_id': 'eps2', + 'timestamp': 1505340777, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, { # meta twitter:player 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 0ea89e4d6..04f68fce4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -136,9 +136,10 @@ class KalturaIE(InfoExtractor): re.search( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["']) - (?:https?:)?//(?:(?:www|cdnapi)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) + (?:(?!(?P=q1)).)* (?P=q1) ''', webpage) ) From cc5772c4f0bcb7dfdfb0575787ff124dd7376de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 03:30:23 +0700 Subject: [PATCH 147/148] [ChangeLog] Actualize [ci skip] --- ChangeLog | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4a3df67df..7841ee765 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,29 @@ +version <unreleased> + +Core +* [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) ++ [extractor/common] Extract interaction statistic ++ [utils] Add merge_dicts ++ [extractor/common] Add _download_json_handle + +Extractors +* [kaltura] Improve iframe embeds detection (#16337) ++ [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, + #16335) ++ [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) +* [yandexmusic] Convert release_year to int +* [udemy] Override _download_webpage_handle instead of _download_webpage +* [xiami] Override _download_webpage_handle instead of _download_webpage +* [yandexmusic] Override _download_webpage_handle instead of _download_webpage +* [youtube] Correctly disable polymer on all requests (#16323, #16326) +* [generic] Prefer enclosures over links in RSS feeds (#16189) ++ [redditr] Add support for old.reddit.com URLs (#16274) +* [nrktv] Update API host (#16324) ++ [imdb] Extract all formats (#16249) ++ [vimeo] Extract JSON-LD (#16295) +* [funk:channel] Improve extraction (#16285) + + version 2018.04.25 Core From cc42941390b547ba950b4e76f4950be801f96134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 1 May 2018 03:38:57 +0700 Subject: [PATCH 148/148] release 2018.05.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 ++++ youtube_dl/version.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 252fa0adf..c2bd5d8ae 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.04.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.04.25** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.04.25 +[debug] youtube-dl version 2018.05.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7841ee765..916b8edb8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.05.01 Core * [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a110f687b..c5a48002b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -667,6 +667,8 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **Quickline** + - **QuicklineLive** - **R7** - **R7Article** - **radio.de** @@ -1092,6 +1094,8 @@ - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** - **Zaq1** + - **Zattoo** + - **ZattooLive** - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4e3cb39c6..04896efc8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.04.25' +__version__ = '2018.05.01'