From c12c50913f5fff8d104c5e913b0b698f56499dd4 Mon Sep 17 00:00:00 2001 From: Crypto90 Date: Tue, 30 Jun 2020 21:25:11 +0200 Subject: [PATCH] Add files via upload Feature added to search for youtube playlists and getting correct youtube playlist id results. Also added duration fetching for youtube videos, which already works, but the yield self.url_result(video_id, 'Youtube', video_id, video_title) return function has still to get changed to yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) which I did not found where to do. Otherwise it returns missmatching count of arguments. --- youtube_dl/extractor/youtube.py | 436 ++++++++++++++------------------ 1 file changed, 183 insertions(+), 253 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 43c5eff1f..7a4f3ded4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,6 +29,7 @@ from ..compat import ( from ..utils import ( bool_or_none, clean_html, + dict_get, error_to_compat_str, extract_attributes, ExtractorError, @@ -70,14 +71,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' - _YOUTUBE_CLIENT_HEADERS = { - 'x-youtube-client-name': '1', - 'x-youtube-client-version': '1.20200609.04.02', - } - def _set_language(self): self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', + '.youtube.com', 'PREF', 'f1=50000000&hl=en', # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) @@ -306,8 +302,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) + transform_source=uppercase_escape) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -326,35 +321,52 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) + for video_id, video_title, video_duration in self.extract_videos_from_page(content): + if len(video_id) == 11: + #youtube video id found + yield self.url_result(video_id, 'Youtube', video_id, video_title) + elif len(video_id) > 11: + #youtube playlist id found + yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title) - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): + def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): for mobj in re.finditer(video_re, page): # The link with index 0 is not the first video of the playlist (not sure if still actual) if 'index' in mobj.groupdict() and mobj.group('id') == '0': continue video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None + playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None + if playlist_id != None: + video_id = playlist_id + video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None if video_title: video_title = video_title.strip() if video_title == '► Play all': video_title = None + video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None + if video_duration: + video_duration = video_duration.strip() try: idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: titles_in_page[idx] = video_title + + + if video_duration and not durations_in_page[idx]: + durations_in_page[idx] = video_duration + except ValueError: ids_in_page.append(video_id) titles_in_page.append(video_title) - + durations_in_page.append(video_duration) + def extract_videos_from_page(self, page): ids_in_page = [] titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + durations_in_page = [] + self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page) + return zip(ids_in_page, titles_in_page, durations_in_page) class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): @@ -394,15 +406,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?invidious\.drycat\.fr/| (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| - (?:www\.)?yewtu\.be/| (?:www\.)?yt\.elukerio\.org/| (?:www\.)?yt\.lelux\.fi/| - (?:www\.)?invidious\.ggc-project\.de/| - (?:www\.)?yt\.maisputain\.ovh/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.toot\.koeln/| - (?:www\.)?invidious\.fdn\.fr/| - (?:www\.)?watch\.nettohikari\.com/| (?:www\.)?kgg2m7yk5aybusll\.onion/| (?:www\.)?qklhadlycap4cnod\.onion/| (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| @@ -410,7 +415,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| - (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -440,10 +444,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _PLAYER_INFO_RE = ( - r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', - r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', - ) _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -587,7 +587,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:19a2f98d9032b9311e686ed039564f63', + 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], @@ -702,11 +702,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:307195cd21ff7fa352270fe884570ef0', + 'description': 'md5:bec2185232c05479482cb5a9b82719bf', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', + 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -771,11 +772,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'Dada Life, deadmau5', + 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'This Machine Kills Some Chords', + 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -1151,7 +1152,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, 'youtube_include_dash_manifest': False, }, - 'skip': 'not actual anymore', }, { # Youtube Music Auto-generated description @@ -1162,8 +1162,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Voyeur Girl', 'description': 'md5:7ae382a65843d6df2685993e90a8628f', 'upload_date': '20190312', - 'uploader': 'Stephen - Topic', - 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', + 'uploader': 'Various Artists - Topic', + 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', 'artist': 'Stephen', 'track': 'Voyeur Girl', 'album': 'it\'s too much love to know my dear', @@ -1227,7 +1227,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '-hcAI0g-f5M', 'ext': 'mp4', 'title': 'Put It On Me', - 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', + 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', 'upload_date': '20180426', 'uploader': 'Matt Maeson - Topic', 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', @@ -1245,26 +1245,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'only_matching': True, }, - { - # invalid -> valid video id redirection - 'url': 'DJztXj2GPfl', - 'info_dict': { - 'id': 'DJztXj2GPfk', - 'ext': 'mp4', - 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', - 'description': 'md5:bf577a41da97918e94fa9798d9228825', - 'upload_date': '20090125', - 'uploader': 'Prochorowka', - 'uploader_id': 'Prochorowka', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', - 'artist': 'Panjabi MC', - 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', - 'album': 'Beware of the Boys (Mundian To Bach Ke)', - }, - 'params': { - 'skip_download': True, - }, - } ] def __init__(self, *args, **kwargs): @@ -1291,18 +1271,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - @classmethod - def _extract_player_info(cls, player_url): - for player_re in cls._PLAYER_INFO_RE: - id_m = re.search(player_re, player_url) - if id_m: - break - else: - raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('ext'), id_m.group('id') - def _extract_signature_function(self, video_id, player_url, example_sig): - player_type, player_id = self._extract_player_info(player_url) + id_m = re.match( + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', + player_url) + if not id_m: + raise ExtractorError('Cannot identify player %r' % player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') # Read from filesystem cache func_id = '%s_%s_%s' % ( @@ -1384,7 +1360,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\b(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', @@ -1658,63 +1633,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_chapters_from_json(self, webpage, video_id, duration): - if not webpage: - return - player = self._parse_json( - self._search_regex( - r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, - 'player args', default='{}'), - video_id, fatal=False) - if not player or not isinstance(player, dict): - return - watch_next_response = player.get('watch_next_response') - if not isinstance(watch_next_response, compat_str): - return - response = self._parse_json(watch_next_response, video_id, fatal=False) - if not response or not isinstance(response, dict): - return - chapters_list = try_get( - response, - lambda x: x['playerOverlays'] - ['playerOverlayRenderer'] - ['decoratedPlayerBarRenderer'] - ['decoratedPlayerBarRenderer'] - ['playerBar'] - ['chapteredPlayerBarRenderer'] - ['chapters'], - list) - if not chapters_list: - return - - def chapter_time(chapter): - return float_or_none( - try_get( - chapter, - lambda x: x['chapterRenderer']['timeRangeStartMillis'], - int), - scale=1000) - chapters = [] - for next_num, chapter in enumerate(chapters_list, start=1): - start_time = chapter_time(chapter) - if start_time is None: - continue - end_time = (chapter_time(chapters_list[next_num]) - if next_num < len(chapters_list) else duration) - if end_time is None: - continue - title = try_get( - chapter, lambda x: x['chapterRenderer']['title']['simpleText'], - compat_str) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': title, - }) - return chapters - @staticmethod - def _extract_chapters_from_description(description, duration): + def _extract_chapters(description, duration): if not description: return None chapter_lines = re.findall( @@ -1748,10 +1668,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) return chapters - def _extract_chapters(self, webpage, description, video_id, duration): - return (self._extract_chapters_from_json(webpage, video_id, duration) - or self._extract_chapters_from_description(description, duration)) - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1779,10 +1695,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage, urlh = self._download_webpage_handle(url, video_id) - - qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) - video_id = qs.get('v', [None])[0] or video_id + video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1811,6 +1724,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + def extract_token(v_info): + return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) + def extract_player_response(player_response, video_id): pl_response = str_or_none(player_response) if not pl_response: @@ -1823,7 +1739,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = {} # Get video info - video_info = {} embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True @@ -1838,21 +1753,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False + video_info = None + sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1869,10 +1782,61 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True + sts = ytplayer_config.get('sts') if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) + self.report_video_info_webpage_download(video_id) + for el in ('embedded', 'detailpage', 'vevo', ''): + query = { + 'video_id': video_id, + 'ps': 'default', + 'eurl': '', + 'gl': 'US', + 'hl': 'en', + } + if el: + query['el'] = el + if sts: + query['sts'] = sts + video_info_webpage = self._download_webpage( + '%s://www.youtube.com/get_video_info' % proto, + video_id, note=False, + errnote='unable to download video info webpage', + fatal=False, query=query) + if not video_info_webpage: + continue + get_video_info = compat_parse_qs(video_info_webpage) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) + if not video_info: + video_info = get_video_info + get_token = extract_token(get_video_info) + if get_token: + # Different get_video_info requests may report different results, e.g. + # some may report video unavailability, but some may serve it without + # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, + # the original webpage as well as el=info and el=embedded get_video_info + # requests report video unavailability due to geo restriction while + # el=detailpage succeeds and returns valid data). This is probably + # due to YouTube measures against IP ranges of hosting providers. + # Working around by preferring the first succeeded video_info containing + # the token if no such video_info yet was found. + token = extract_token(video_info) + if not token: + video_info = get_video_info + break def extract_unavailable_message(): messages = [] @@ -1885,22 +1849,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if messages: return '\n'.join(messages) - if not video_info and not player_response: + if not video_info: unavailable_message = extract_unavailable_message() if not unavailable_message: unavailable_message = 'Unable to extract video data' raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - if not isinstance(video_info, dict): - video_info = {} - video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} - microformat = try_get( - player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} - video_title = video_info.get('title', [None])[0] or video_details.get('title') if not video_title: self._downloader.report_warning('Unable to extract video title') @@ -1947,26 +1905,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - - def feed_entry(name): - return try_get(feed_data, lambda x: x[name][0], compat_str) - - feed_id = feed_entry('id') - if not feed_id: - continue - feed_title = feed_entry('title') - title = video_title - if feed_title: - title += ' (%s)' % feed_title entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), {'force_singlefeed': True}), - 'title': title, + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), }) - feed_ids.append(feed_id) + feed_ids.append(feed_data['id'][0]) self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) @@ -1978,8 +1925,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): view_count = extract_view_count(video_info) if view_count is None and video_details: view_count = int_or_none(video_details.get('viewCount')) - if view_count is None and microformat: - view_count = int_or_none(microformat.get('viewCount')) if is_live is None: is_live = bool_or_none(video_details.get('isLive')) @@ -2039,12 +1984,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } for fmt in streaming_formats: - if fmt.get('drmFamilies') or fmt.get('drm_families'): + if fmt.get('drm_families'): continue url = url_or_none(fmt.get('url')) if not url: - cipher = fmt.get('cipher') or fmt.get('signatureCipher') + cipher = fmt.get('cipher') if not cipher: continue url_data = compat_parse_qs(cipher) @@ -2095,10 +2040,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('verbose'): if player_url is None: + player_version = 'unknown' player_desc = 'unknown' else: - player_type, player_version = self._extract_player_info(player_url) - player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) + if player_url.endswith('swf'): + player_version = self._search_regex( + r'-(.+?)(?:/watch_as3)?\.swf$', player_url, + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + player_url, + 'html5 player', fatal=False) + player_desc = 'html5 player %s' % player_version + parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc)) @@ -2231,12 +2188,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_uploader_id = mobj.group('uploader_id') video_uploader_url = mobj.group('uploader_url') else: - owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) - if owner_profile_url: - video_uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', - default=None) - video_uploader_url = owner_profile_url + self._downloader.report_warning('unable to extract uploader nickname') channel_id = ( str_or_none(video_details.get('channelId')) @@ -2247,33 +2199,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'channel id', default=None, group='id')) channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None - thumbnails = [] - thumbnails_list = try_get( - video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) - - if not thumbnails: + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') video_thumbnail = None - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) - if thumbnail_url: - video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) - if video_thumbnail: - thumbnails.append({'url': video_thumbnail}) + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) # upload date upload_date = self._html_search_meta( @@ -2283,8 +2219,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): [r'(?s)id="eow-date.*?>(.*?)', r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = microformat.get('publishDate') or microformat.get('uploadDate') upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -2356,21 +2290,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): m_cat_container = self._search_regex( r'(?s)]*>\s*Category\s*\s*]*>(.*?)', video_webpage, 'categories', default=None) - category = None if m_cat_container: category = self._html_search_regex( r'(?s)(.*?)', m_cat_container, 'category', default=None) - if not category: - category = try_get( - microformat, lambda x: x['category'], compat_str) - video_categories = None if category is None else [category] + video_categories = None if category is None else [category] + else: + video_categories = None video_tags = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - if not video_tags: - video_tags = try_get(video_details, lambda x: x['keywords'], list) def _extract_count(count_name): return str_to_int(self._search_regex( @@ -2421,7 +2351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): errnote='Unable to download video annotations', fatal=False, data=urlencode_postdata({xsrf_field_name: xsrf_token})) - chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) + chapters = self._extract_chapters(description_original, video_duration) # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): @@ -2478,23 +2408,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['stretched_ratio'] = ratio if not formats: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): - raise ExtractorError('This video is DRM protected.', expected=True) + token = extract_token(video_info) + if not token: + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) + else: + raise ExtractorError( + '"token" parameter not in video info for unknown reason', + video_id=video_id) + + if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) @@ -2512,7 +2449,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'creator': video_creator or artist, 'title': video_title, 'alt_title': video_alt_title or track, - 'thumbnails': thumbnails, + 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, 'tags': video_tags, @@ -2574,23 +2511,20 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE = _VIDEO_RE_TPL % r'(?P[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', + 'title': 'ytdl test PL', + 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', }, - 'playlist_count': 1, + 'playlist_count': 3, }, { - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', + 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'title': 'YDL_Empty_List', }, 'playlist_count': 0, + 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2600,7 +2534,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Christiaan008', 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 96, + 'playlist_count': 95, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', @@ -3116,7 +3050,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P[^/]+)/playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ @@ -3142,14 +3076,11 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'title': 'Chem Player', }, 'skip': 'Blocked', - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, }] class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?(.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): @@ -3181,7 +3112,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): errnote='Unable to download API page', query={'spf': 'navigate'}) html_content = data[1]['body']['content'] - + if 'class="search-message' in html_content: raise ExtractorError( '[youtube] No video results', expected=True) @@ -3291,8 +3222,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): more = self._download_json( 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) + transform_source=uppercase_escape) content_html = more['content_html'] more_widget_html = more['load_more_widget_html']