1
0
mirror of https://github.com/l1ving/youtube-dl synced 2020-11-18 19:53:54 -08:00

Add files via upload

Feature added to search for youtube playlists and getting correct youtube playlist id results.
Also added duration fetching for youtube videos, which already works, but the yield self.url_result(video_id, 'Youtube', video_id, video_title) return function has still to get changed to yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) which I did not found where to do. Otherwise it returns missmatching count of arguments.
This commit is contained in:
Crypto90 2020-06-30 21:25:11 +02:00 committed by GitHub
parent e942cfd1a7
commit c12c50913f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -29,6 +29,7 @@ from ..compat import (
from ..utils import ( from ..utils import (
bool_or_none, bool_or_none,
clean_html, clean_html,
dict_get,
error_to_compat_str, error_to_compat_str,
extract_attributes, extract_attributes,
ExtractorError, ExtractorError,
@ -70,14 +71,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
_YOUTUBE_CLIENT_HEADERS = {
'x-youtube-client-name': '1',
'x-youtube-client-version': '1.20200609.04.02',
}
def _set_language(self): def _set_language(self):
self._set_cookie( self._set_cookie(
'.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', '.youtube.com', 'PREF', 'f1=50000000&hl=en',
# YouTube sets the expire time to about two months # YouTube sets the expire time to about two months
expire_time=time.time() + 2 * 30 * 24 * 3600) expire_time=time.time() + 2 * 30 * 24 * 3600)
@ -306,8 +302,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
'Downloading page #%s%s' 'Downloading page #%s%s'
% (page_num, ' (retry #%d)' % count if count else ''), % (page_num, ' (retry #%d)' % count if count else ''),
transform_source=uppercase_escape, transform_source=uppercase_escape)
headers=self._YOUTUBE_CLIENT_HEADERS)
break break
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
@ -326,35 +321,52 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
def _process_page(self, content): def _process_page(self, content):
for video_id, video_title in self.extract_videos_from_page(content): for video_id, video_title, video_duration in self.extract_videos_from_page(content):
if len(video_id) == 11:
#youtube video id found
yield self.url_result(video_id, 'Youtube', video_id, video_title) yield self.url_result(video_id, 'Youtube', video_id, video_title)
elif len(video_id) > 11:
#youtube playlist id found
yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title)
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page):
for mobj in re.finditer(video_re, page): for mobj in re.finditer(video_re, page):
# The link with index 0 is not the first video of the playlist (not sure if still actual) # The link with index 0 is not the first video of the playlist (not sure if still actual)
if 'index' in mobj.groupdict() and mobj.group('id') == '0': if 'index' in mobj.groupdict() and mobj.group('id') == '0':
continue continue
video_id = mobj.group('id') video_id = mobj.group('id')
video_title = unescapeHTML( playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None
mobj.group('title')) if 'title' in mobj.groupdict() else None if playlist_id != None:
video_id = playlist_id
video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title: if video_title:
video_title = video_title.strip() video_title = video_title.strip()
if video_title == '► Play all': if video_title == '► Play all':
video_title = None video_title = None
video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None
if video_duration:
video_duration = video_duration.strip()
try: try:
idx = ids_in_page.index(video_id) idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]: if video_title and not titles_in_page[idx]:
titles_in_page[idx] = video_title titles_in_page[idx] = video_title
if video_duration and not durations_in_page[idx]:
durations_in_page[idx] = video_duration
except ValueError: except ValueError:
ids_in_page.append(video_id) ids_in_page.append(video_id)
titles_in_page.append(video_title) titles_in_page.append(video_title)
durations_in_page.append(video_duration)
def extract_videos_from_page(self, page): def extract_videos_from_page(self, page):
ids_in_page = [] ids_in_page = []
titles_in_page = [] titles_in_page = []
self.extract_videos_from_page_impl( durations_in_page = []
self._VIDEO_RE, page, ids_in_page, titles_in_page) self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page)
return zip(ids_in_page, titles_in_page) return zip(ids_in_page, titles_in_page, durations_in_page)
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
@ -394,15 +406,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?invidious\.drycat\.fr/| (?:www\.)?invidious\.drycat\.fr/|
(?:www\.)?tube\.poal\.co/| (?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/| (?:www\.)?vid\.wxzm\.sx/|
(?:www\.)?yewtu\.be/|
(?:www\.)?yt\.elukerio\.org/| (?:www\.)?yt\.elukerio\.org/|
(?:www\.)?yt\.lelux\.fi/| (?:www\.)?yt\.lelux\.fi/|
(?:www\.)?invidious\.ggc-project\.de/|
(?:www\.)?yt\.maisputain\.ovh/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?invidious\.toot\.koeln/|
(?:www\.)?invidious\.fdn\.fr/|
(?:www\.)?watch\.nettohikari\.com/|
(?:www\.)?kgg2m7yk5aybusll\.onion/| (?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/| (?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
@ -410,7 +415,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls (?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID: (?: # the various things that can precede the ID:
@ -440,10 +444,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?(1).+)? # if we found the ID, everything can follow (?(1).+)? # if we found the ID, everything can follow
$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_PLAYER_INFO_RE = (
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
)
_formats = { _formats = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
@ -587,7 +587,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20120506', 'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
'alt_title': 'I Love It (feat. Charli XCX)', 'alt_title': 'I Love It (feat. Charli XCX)',
'description': 'md5:19a2f98d9032b9311e686ed039564f63', 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
'iconic ep', 'iconic', 'love', 'it'], 'iconic ep', 'iconic', 'love', 'it'],
@ -702,11 +702,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'nfWlot6h_JM', 'id': 'nfWlot6h_JM',
'ext': 'm4a', 'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off', 'title': 'Taylor Swift - Shake It Off',
'description': 'md5:307195cd21ff7fa352270fe884570ef0', 'description': 'md5:bec2185232c05479482cb5a9b82719bf',
'duration': 242, 'duration': 242,
'uploader': 'TaylorSwiftVEVO', 'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818', 'upload_date': '20140818',
'creator': 'Taylor Swift',
}, },
'params': { 'params': {
'youtube_include_dash_manifest': True, 'youtube_include_dash_manifest': True,
@ -771,11 +772,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20100430', 'upload_date': '20100430',
'uploader_id': 'deadmau5', 'uploader_id': 'deadmau5',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
'creator': 'Dada Life, deadmau5', 'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360', 'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5', 'uploader': 'deadmau5',
'title': 'Deadmau5 - Some Chords (HD)', 'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'This Machine Kills Some Chords', 'alt_title': 'Some Chords',
}, },
'expected_warnings': [ 'expected_warnings': [
'DASH manifest missing', 'DASH manifest missing',
@ -1151,7 +1152,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True, 'skip_download': True,
'youtube_include_dash_manifest': False, 'youtube_include_dash_manifest': False,
}, },
'skip': 'not actual anymore',
}, },
{ {
# Youtube Music Auto-generated description # Youtube Music Auto-generated description
@ -1162,8 +1162,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Voyeur Girl', 'title': 'Voyeur Girl',
'description': 'md5:7ae382a65843d6df2685993e90a8628f', 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
'upload_date': '20190312', 'upload_date': '20190312',
'uploader': 'Stephen - Topic', 'uploader': 'Various Artists - Topic',
'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',
'artist': 'Stephen', 'artist': 'Stephen',
'track': 'Voyeur Girl', 'track': 'Voyeur Girl',
'album': 'it\'s too much love to know my dear', 'album': 'it\'s too much love to know my dear',
@ -1227,7 +1227,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': '-hcAI0g-f5M', 'id': '-hcAI0g-f5M',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Put It On Me', 'title': 'Put It On Me',
'description': 'md5:f6422397c07c4c907c6638e1fee380a5', 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',
'upload_date': '20180426', 'upload_date': '20180426',
'uploader': 'Matt Maeson - Topic', 'uploader': 'Matt Maeson - Topic',
'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
@ -1245,26 +1245,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
'only_matching': True, 'only_matching': True,
}, },
{
# invalid -> valid video id redirection
'url': 'DJztXj2GPfl',
'info_dict': {
'id': 'DJztXj2GPfk',
'ext': 'mp4',
'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
'description': 'md5:bf577a41da97918e94fa9798d9228825',
'upload_date': '20090125',
'uploader': 'Prochorowka',
'uploader_id': 'Prochorowka',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
'artist': 'Panjabi MC',
'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
'album': 'Beware of the Boys (Mundian To Bach Ke)',
},
'params': {
'skip_download': True,
},
}
] ]
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -1291,18 +1271,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
""" Return a string representation of a signature """ """ Return a string representation of a signature """
return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@classmethod
def _extract_player_info(cls, player_url):
for player_re in cls._PLAYER_INFO_RE:
id_m = re.search(player_re, player_url)
if id_m:
break
else:
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('ext'), id_m.group('id')
def _extract_signature_function(self, video_id, player_url, example_sig): def _extract_signature_function(self, video_id, player_url, example_sig):
player_type, player_id = self._extract_player_info(player_url) id_m = re.match(
r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
player_type = id_m.group('ext')
player_id = id_m.group('id')
# Read from filesystem cache # Read from filesystem cache
func_id = '%s_%s_%s' % ( func_id = '%s_%s_%s' % (
@ -1384,7 +1360,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
funcname = self._search_regex( funcname = self._search_regex(
(r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns # Obsolete patterns
r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
@ -1658,63 +1633,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2) video_id = mobj.group(2)
return video_id return video_id
def _extract_chapters_from_json(self, webpage, video_id, duration):
if not webpage:
return
player = self._parse_json(
self._search_regex(
r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
'player args', default='{}'),
video_id, fatal=False)
if not player or not isinstance(player, dict):
return
watch_next_response = player.get('watch_next_response')
if not isinstance(watch_next_response, compat_str):
return
response = self._parse_json(watch_next_response, video_id, fatal=False)
if not response or not isinstance(response, dict):
return
chapters_list = try_get(
response,
lambda x: x['playerOverlays']
['playerOverlayRenderer']
['decoratedPlayerBarRenderer']
['decoratedPlayerBarRenderer']
['playerBar']
['chapteredPlayerBarRenderer']
['chapters'],
list)
if not chapters_list:
return
def chapter_time(chapter):
return float_or_none(
try_get(
chapter,
lambda x: x['chapterRenderer']['timeRangeStartMillis'],
int),
scale=1000)
chapters = []
for next_num, chapter in enumerate(chapters_list, start=1):
start_time = chapter_time(chapter)
if start_time is None:
continue
end_time = (chapter_time(chapters_list[next_num])
if next_num < len(chapters_list) else duration)
if end_time is None:
continue
title = try_get(
chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
compat_str)
chapters.append({
'start_time': start_time,
'end_time': end_time,
'title': title,
})
return chapters
@staticmethod @staticmethod
def _extract_chapters_from_description(description, duration): def _extract_chapters(description, duration):
if not description: if not description:
return None return None
chapter_lines = re.findall( chapter_lines = re.findall(
@ -1748,10 +1668,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}) })
return chapters return chapters
def _extract_chapters(self, webpage, description, video_id, duration):
return (self._extract_chapters_from_json(webpage, video_id, duration)
or self._extract_chapters_from_description(description, duration))
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
@ -1779,10 +1695,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Get video webpage # Get video webpage
url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
video_webpage, urlh = self._download_webpage_handle(url, video_id) video_webpage = self._download_webpage(url, video_id)
qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
video_id = qs.get('v', [None])[0] or video_id
# Attempt to extract SWF player URL # Attempt to extract SWF player URL
mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
@ -1811,6 +1724,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_view_count(v_info): def extract_view_count(v_info):
return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
def extract_token(v_info):
return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
def extract_player_response(player_response, video_id): def extract_player_response(player_response, video_id):
pl_response = str_or_none(player_response) pl_response = str_or_none(player_response)
if not pl_response: if not pl_response:
@ -1823,7 +1739,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response = {} player_response = {}
# Get video info # Get video info
video_info = {}
embed_webpage = None embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None: if re.search(r'player-age-gate-content">', video_webpage) is not None:
age_gate = True age_gate = True
@ -1838,14 +1753,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
}) })
video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_url = proto + '://www.youtube.com/get_video_info?' + data
try:
video_info_webpage = self._download_webpage( video_info_webpage = self._download_webpage(
video_info_url, video_id, video_info_url, video_id,
note='Refetching age-gated info webpage', note='Refetching age-gated info webpage',
errnote='unable to download video info webpage') errnote='unable to download video info webpage')
except ExtractorError:
video_info_webpage = None
if video_info_webpage:
video_info = compat_parse_qs(video_info_webpage) video_info = compat_parse_qs(video_info_webpage)
pl_response = video_info.get('player_response', [None])[0] pl_response = video_info.get('player_response', [None])[0]
player_response = extract_player_response(pl_response, video_id) player_response = extract_player_response(pl_response, video_id)
@ -1853,6 +1764,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
view_count = extract_view_count(video_info) view_count = extract_view_count(video_info)
else: else:
age_gate = False age_gate = False
video_info = None
sts = None
# Try looking directly into the video webpage # Try looking directly into the video webpage
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
if ytplayer_config: if ytplayer_config:
@ -1869,10 +1782,61 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
if args.get('livestream') == '1' or args.get('live_playback') == 1: if args.get('livestream') == '1' or args.get('live_playback') == 1:
is_live = True is_live = True
sts = ytplayer_config.get('sts')
if not player_response: if not player_response:
player_response = extract_player_response(args.get('player_response'), video_id) player_response = extract_player_response(args.get('player_response'), video_id)
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response) add_dash_mpd_pr(player_response)
# We also try looking in get_video_info since it may contain different dashmpd
# URL that points to a DASH manifest with possibly different itag set (some itags
# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
# manifest pointed by get_video_info's dashmpd).
# The general idea is to take a union of itags of both DASH manifests (for example
# video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id)
for el in ('embedded', 'detailpage', 'vevo', ''):
query = {
'video_id': video_id,
'ps': 'default',
'eurl': '',
'gl': 'US',
'hl': 'en',
}
if el:
query['el'] = el
if sts:
query['sts'] = sts
video_info_webpage = self._download_webpage(
'%s://www.youtube.com/get_video_info' % proto,
video_id, note=False,
errnote='unable to download video info webpage',
fatal=False, query=query)
if not video_info_webpage:
continue
get_video_info = compat_parse_qs(video_info_webpage)
if not player_response:
pl_response = get_video_info.get('player_response', [None])[0]
player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
if not video_info:
video_info = get_video_info
get_token = extract_token(get_video_info)
if get_token:
# Different get_video_info requests may report different results, e.g.
# some may report video unavailability, but some may serve it without
# any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,
# the original webpage as well as el=info and el=embedded get_video_info
# requests report video unavailability due to geo restriction while
# el=detailpage succeeds and returns valid data). This is probably
# due to YouTube measures against IP ranges of hosting providers.
# Working around by preferring the first succeeded video_info containing
# the token if no such video_info yet was found.
token = extract_token(video_info)
if not token:
video_info = get_video_info
break
def extract_unavailable_message(): def extract_unavailable_message():
messages = [] messages = []
@ -1885,22 +1849,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if messages: if messages:
return '\n'.join(messages) return '\n'.join(messages)
if not video_info and not player_response: if not video_info:
unavailable_message = extract_unavailable_message() unavailable_message = extract_unavailable_message()
if not unavailable_message: if not unavailable_message:
unavailable_message = 'Unable to extract video data' unavailable_message = 'Unable to extract video data'
raise ExtractorError( raise ExtractorError(
'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
if not isinstance(video_info, dict):
video_info = {}
video_details = try_get( video_details = try_get(
player_response, lambda x: x['videoDetails'], dict) or {} player_response, lambda x: x['videoDetails'], dict) or {}
microformat = try_get(
player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
video_title = video_info.get('title', [None])[0] or video_details.get('title') video_title = video_info.get('title', [None])[0] or video_details.get('title')
if not video_title: if not video_title:
self._downloader.report_warning('Unable to extract video title') self._downloader.report_warning('Unable to extract video title')
@ -1947,26 +1905,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# fields may contain comma as well (see # fields may contain comma as well (see
# https://github.com/ytdl-org/youtube-dl/issues/8536) # https://github.com/ytdl-org/youtube-dl/issues/8536)
feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
def feed_entry(name):
return try_get(feed_data, lambda x: x[name][0], compat_str)
feed_id = feed_entry('id')
if not feed_id:
continue
feed_title = feed_entry('title')
title = video_title
if feed_title:
title += ' (%s)' % feed_title
entries.append({ entries.append({
'_type': 'url_transparent', '_type': 'url_transparent',
'ie_key': 'Youtube', 'ie_key': 'Youtube',
'url': smuggle_url( 'url': smuggle_url(
'%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
{'force_singlefeed': True}), {'force_singlefeed': True}),
'title': title, 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
}) })
feed_ids.append(feed_id) feed_ids.append(feed_data['id'][0])
self.to_screen( self.to_screen(
'Downloading multifeed video (%s) - add --no-playlist to just download video %s' 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
% (', '.join(feed_ids), video_id)) % (', '.join(feed_ids), video_id))
@ -1978,8 +1925,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
view_count = extract_view_count(video_info) view_count = extract_view_count(video_info)
if view_count is None and video_details: if view_count is None and video_details:
view_count = int_or_none(video_details.get('viewCount')) view_count = int_or_none(video_details.get('viewCount'))
if view_count is None and microformat:
view_count = int_or_none(microformat.get('viewCount'))
if is_live is None: if is_live is None:
is_live = bool_or_none(video_details.get('isLive')) is_live = bool_or_none(video_details.get('isLive'))
@ -2039,12 +1984,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
} }
for fmt in streaming_formats: for fmt in streaming_formats:
if fmt.get('drmFamilies') or fmt.get('drm_families'): if fmt.get('drm_families'):
continue continue
url = url_or_none(fmt.get('url')) url = url_or_none(fmt.get('url'))
if not url: if not url:
cipher = fmt.get('cipher') or fmt.get('signatureCipher') cipher = fmt.get('cipher')
if not cipher: if not cipher:
continue continue
url_data = compat_parse_qs(cipher) url_data = compat_parse_qs(cipher)
@ -2095,10 +2040,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if self._downloader.params.get('verbose'): if self._downloader.params.get('verbose'):
if player_url is None: if player_url is None:
player_version = 'unknown'
player_desc = 'unknown' player_desc = 'unknown'
else: else:
player_type, player_version = self._extract_player_info(player_url) if player_url.endswith('swf'):
player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
parts_sizes = self._signature_cache_id(encrypted_sig) parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' % self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc)) (format_id, parts_sizes, player_desc))
@ -2231,12 +2188,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_uploader_id = mobj.group('uploader_id') video_uploader_id = mobj.group('uploader_id')
video_uploader_url = mobj.group('uploader_url') video_uploader_url = mobj.group('uploader_url')
else: else:
owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) self._downloader.report_warning('unable to extract uploader nickname')
if owner_profile_url:
video_uploader_id = self._search_regex(
r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
default=None)
video_uploader_url = owner_profile_url
channel_id = ( channel_id = (
str_or_none(video_details.get('channelId')) str_or_none(video_details.get('channelId'))
@ -2247,33 +2199,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_webpage, 'channel id', default=None, group='id')) video_webpage, 'channel id', default=None, group='id'))
channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
thumbnails = [] # thumbnail image
thumbnails_list = try_get(
video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
for t in thumbnails_list:
if not isinstance(t, dict):
continue
thumbnail_url = url_or_none(t.get('url'))
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
'width': int_or_none(t.get('width')),
'height': int_or_none(t.get('height')),
})
if not thumbnails:
video_thumbnail = None
# We try first to get a high quality image: # We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
video_webpage, re.DOTALL) video_webpage, re.DOTALL)
if m_thumb is not None: if m_thumb is not None:
video_thumbnail = m_thumb.group(1) video_thumbnail = m_thumb.group(1)
thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) elif 'thumbnail_url' not in video_info:
if thumbnail_url: self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) video_thumbnail = None
if video_thumbnail: else: # don't panic if we can't find it
thumbnails.append({'url': video_thumbnail}) video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date # upload date
upload_date = self._html_search_meta( upload_date = self._html_search_meta(
@ -2283,8 +2219,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
[r'(?s)id="eow-date.*?>(.*?)</span>', [r'(?s)id="eow-date.*?>(.*?)</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None) video_webpage, 'upload date', default=None)
if not upload_date:
upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
upload_date = unified_strdate(upload_date) upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex( video_license = self._html_search_regex(
@ -2356,21 +2290,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
m_cat_container = self._search_regex( m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
video_webpage, 'categories', default=None) video_webpage, 'categories', default=None)
category = None
if m_cat_container: if m_cat_container:
category = self._html_search_regex( category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
default=None) default=None)
if not category:
category = try_get(
microformat, lambda x: x['category'], compat_str)
video_categories = None if category is None else [category] video_categories = None if category is None else [category]
else:
video_categories = None
video_tags = [ video_tags = [
unescapeHTML(m.group('content')) unescapeHTML(m.group('content'))
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
if not video_tags:
video_tags = try_get(video_details, lambda x: x['keywords'], list)
def _extract_count(count_name): def _extract_count(count_name):
return str_to_int(self._search_regex( return str_to_int(self._search_regex(
@ -2421,7 +2351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
errnote='Unable to download video annotations', fatal=False, errnote='Unable to download video annotations', fatal=False,
data=urlencode_postdata({xsrf_field_name: xsrf_token})) data=urlencode_postdata({xsrf_field_name: xsrf_token}))
chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) chapters = self._extract_chapters(description_original, video_duration)
# Look for the DASH manifest # Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True): if self._downloader.params.get('youtube_include_dash_manifest', True):
@ -2478,6 +2408,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f['stretched_ratio'] = ratio f['stretched_ratio'] = ratio
if not formats: if not formats:
token = extract_token(video_info)
if not token:
if 'reason' in video_info: if 'reason' in video_info:
if 'The uploader has not made this video available in your country.' in video_info['reason']: if 'The uploader has not made this video available in your country.' in video_info['reason']:
regions_allowed = self._html_search_meta( regions_allowed = self._html_search_meta(
@ -2493,7 +2425,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError( raise ExtractorError(
'YouTube said: %s' % reason, 'YouTube said: %s' % reason,
expected=True, video_id=video_id) expected=True, video_id=video_id)
if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): else:
raise ExtractorError(
'"token" parameter not in video info for unknown reason',
video_id=video_id)
if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])):
raise ExtractorError('This video is DRM protected.', expected=True) raise ExtractorError('This video is DRM protected.', expected=True)
self._sort_formats(formats) self._sort_formats(formats)
@ -2512,7 +2449,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'creator': video_creator or artist, 'creator': video_creator or artist,
'title': video_title, 'title': video_title,
'alt_title': video_alt_title or track, 'alt_title': video_alt_title or track,
'thumbnails': thumbnails, 'thumbnail': video_thumbnail,
'description': video_description, 'description': video_description,
'categories': video_categories, 'categories': video_categories,
'tags': video_tags, 'tags': video_tags,
@ -2574,23 +2511,20 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist' IE_NAME = 'youtube:playlist'
_TESTS = [{ _TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
'info_dict': { 'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'title': 'ytdl test PL',
'uploader': 'Sergey M.', 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'title': 'youtube-dl public playlist',
}, },
'playlist_count': 1, 'playlist_count': 3,
}, { }, {
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
'info_dict': { 'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
'uploader': 'Sergey M.', 'title': 'YDL_Empty_List',
'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'title': 'youtube-dl empty playlist',
}, },
'playlist_count': 0, 'playlist_count': 0,
'skip': 'This playlist is private',
}, { }, {
'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
@ -2600,7 +2534,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'uploader': 'Christiaan008', 'uploader': 'Christiaan008',
'uploader_id': 'ChRiStIaAn008', 'uploader_id': 'ChRiStIaAn008',
}, },
'playlist_count': 96, 'playlist_count': 95,
}, { }, {
'note': 'issue #673', 'note': 'issue #673',
'url': 'PLBB231211A4F62143', 'url': 'PLBB231211A4F62143',
@ -3116,7 +3050,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor):
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
IE_DESC = 'YouTube.com user/channel playlists' IE_DESC = 'YouTube.com user/channel playlists'
_VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
IE_NAME = 'youtube:playlists' IE_NAME = 'youtube:playlists'
_TESTS = [{ _TESTS = [{
@ -3142,14 +3076,11 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
'title': 'Chem Player', 'title': 'Chem Player',
}, },
'skip': 'Blocked', 'skip': 'Blocked',
}, {
'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
'only_matching': True,
}] }]
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&amp;list=(?P<plid>[0-9A-Za-z_-]+))?(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?(.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?'
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
@ -3291,8 +3222,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
more = self._download_json( more = self._download_json(
'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num, 'Downloading page #%s' % page_num,
transform_source=uppercase_escape, transform_source=uppercase_escape)
headers=self._YOUTUBE_CLIENT_HEADERS)
content_html = more['content_html'] content_html = more['content_html']
more_widget_html = more['load_more_widget_html'] more_widget_html = more['load_more_widget_html']