From 748a462fbecc9c006d8e9ed6b3f596ff1893cf39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Nov 2016 01:49:13 +0700 Subject: [PATCH 1/4] [twitter:card] Relax _VALID_URL (closes #11225) --- youtube_dl/extractor/twitter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 3411fcf7e..ac0b221b4 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -25,7 +25,7 @@ class TwitterBaseIE(InfoExtractor): class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -84,6 +84,9 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Twitter web player', 'thumbnail': 're:^https?://.*\.jpg', }, + }, { + 'url': 'https://twitter.com/i/videos/752274308186120192', + 'only_matching': True, }, ] From df46b19cb82b90807693d0d25ac5d817546dd63b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Nov 2016 01:56:31 +0700 Subject: [PATCH 2/4] [toutv] Fix login form regex (closes #11223) --- youtube_dl/extractor/toutv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 573f2ff6b..26d770992 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -56,7 +56,7 @@ class TouTvIE(InfoExtractor): 'state': state, }) login_form = self._search_regex( - r'(?s)(]+id="Form-login".+?)', login_webpage, 'login form') + r'(?s)(]+(?:id|name)="Form-login".+?)', login_webpage, 'login form') form_data = self._hidden_inputs(login_form) form_data.update({ 'login-email': email, From 08ec95a6dba54aeec398c99f422abb2a5b59a7e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Nov 2016 03:10:20 +0700 Subject: [PATCH 3/4] [ChangeLog] Actualize --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 874230f42..15129419c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,7 @@ version 2016.11.18 Extractors -* [youtube:live] Relax _VALID_URL (#11164) +* [youtube:live] Relax URL regular expression (#11164) * [openload] Fix extraction (#10408, #11122) * [vlive] Prefer locale over language for subtitles id (#11203) From 0aacd2deb1075e0d4d4b8b23b9a65b3967a1d658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Nov 2016 04:18:21 +0700 Subject: [PATCH 4/4] [bandcamp] Fix free downloads extraction and extract all formats (closes #11067) --- youtube_dl/extractor/bandcamp.py | 86 +++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 249c3d956..88c590e98 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,7 +1,9 @@ from __future__ import unicode_literals import json +import random import re +import time from .common import InfoExtractor from ..compat import ( @@ -12,6 +14,9 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + parse_filesize, + unescapeHTML, + update_url_query, ) @@ -81,35 +86,68 @@ class BandcampIE(InfoExtractor): r'(?ms)var TralbumData = .*?[{,]\s*id: (?P\d+),?$', webpage, 'video id') - download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') - # We get the dictionary of the track from some javascript code - all_info = self._parse_json(self._search_regex( - r'(?sm)items: (.*?),$', download_webpage, 'items'), video_id) - info = all_info[0] - # We pick mp3-320 for now, until format selection can be easily implemented. - mp3_info = info['downloads']['mp3-320'] - # If we try to use this url it says the link has expired - initial_url = mp3_info['url'] - m_url = re.match( - r'(?Phttp://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P.*?)&id=(?P.*?)&ts=(?P.*)$', - initial_url) - # We build the url we will use to get the final track url - # This url is build in Bandcamp in the script download_bunde_*.js - request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts')) - final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url') - # If we could correctly generate the .rand field the url would be - # in the "download_url" key - final_url = self._proto_relative_url(self._search_regex( - r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:') + download_webpage = self._download_webpage( + download_link, video_id, 'Downloading free downloads page') + + blob = self._parse_json( + self._search_regex( + r'data-blob=(["\'])(?P{.+?})\1', download_webpage, + 'blob', group='blob'), + video_id, transform_source=unescapeHTML) + + info = blob['digital_items'][0] + + downloads = info['downloads'] + track = info['title'] + + artist = info.get('artist') + title = '%s - %s' % (artist, track) if artist else track + + download_formats = {} + for f in blob['download_formats']: + name, ext = f.get('name'), f.get('file_extension') + if all(isinstance(x, compat_str) for x in (name, ext)): + download_formats[name] = ext.strip('.') + + formats = [] + for format_id, f in downloads.items(): + format_url = f.get('url') + if not format_url: + continue + # Stat URL generation algorithm is reverse engineered from + # download_*_bundle_*.js + stat_url = update_url_query( + format_url.replace('/download/', '/statdownload/'), { + '.rand': int(time.time() * 1000 * random.random()), + }) + format_id = f.get('encoding_name') or format_id + stat = self._download_json( + stat_url, video_id, 'Downloading %s JSON' % format_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], + fatal=False) + if not stat: + continue + retry_url = stat.get('retry_url') + if not isinstance(retry_url, compat_str): + continue + formats.append({ + 'url': self._proto_relative_url(retry_url, 'http:'), + 'ext': download_formats.get(format_id), + 'format_id': format_id, + 'format_note': f.get('description'), + 'filesize': parse_filesize(f.get('size_mb')), + 'vcodec': 'none', + }) + self._sort_formats(formats) return { 'id': video_id, - 'title': info['title'], - 'ext': 'mp3', - 'vcodec': 'none', - 'url': final_url, + 'title': title, 'thumbnail': info.get('thumb_url'), 'uploader': info.get('artist'), + 'artist': artist, + 'track': track, + 'formats': formats, }