From d6ae092fc9439d15065630c69ad8d70fea6477b7 Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Sun, 24 May 2020 19:00:32 +0300 Subject: [PATCH 01/33] missing metadate fix (#319) * missing metadate fix * timestamp fix Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 7ead5e58f..ed05dfa7b 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime import re import socket @@ -25,10 +26,12 @@ from ..utils import ( try_get, urlencode_postdata, update_url_query, - lowercase_escape + lowercase_escape, + parse_iso8601 ) + class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: @@ -451,14 +454,17 @@ class FacebookIE(InfoExtractor): self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) + if webpage.find('Paid Partnership'): + timestamp = self._search_regex( + r'datePublished":"(.+?)"', webpage, + 'timestamp', default=None) + timestamp = parse_iso8601(timestamp) + else: + timestamp = int_or_none( + self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary,'timestamp', default=None) + or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) + ) or int_or_none(self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None)) - timestamp = int_or_none(self._search_regex( - r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, - 'timestamp', default=None) or self._search_regex( - r']+data-utime=["\'](\d+)', webpage, - 'timestamp', default=None)) or int_or_none(self._search_regex( - r'publish_time":([\d]+)', webpage, - 'timestamp', default=None)) uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, @@ -631,7 +637,6 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id return video_title - class FacebookTahoeData: def __init__(self, extractor, page, video_id): self._page = page From fcedf5eed12fc6608a3686f3ebaa897891bf488f Mon Sep 17 00:00:00 2001 From: Avichai Date: Sun, 24 May 2020 21:02:29 +0300 Subject: [PATCH 02/33] adding a backup for missing timestamp for paid partnership --- youtube_dl/extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ed05dfa7b..3b99383fa 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -454,12 +454,13 @@ class FacebookIE(InfoExtractor): self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) + timestamp = None if webpage.find('Paid Partnership'): timestamp = self._search_regex( r'datePublished":"(.+?)"', webpage, 'timestamp', default=None) timestamp = parse_iso8601(timestamp) - else: + if timestamp is None: timestamp = int_or_none( self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary,'timestamp', default=None) or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) From 1f1919d47273d6a85f56eecbd7087a59284d8aa6 Mon Sep 17 00:00:00 2001 From: Avichai Date: Sun, 24 May 2020 21:38:45 +0300 Subject: [PATCH 03/33] reversing the change --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 3b99383fa..0ccb3c1cf 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -460,7 +460,7 @@ class FacebookIE(InfoExtractor): r'datePublished":"(.+?)"', webpage, 'timestamp', default=None) timestamp = parse_iso8601(timestamp) - if timestamp is None: + else: timestamp = int_or_none( self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary,'timestamp', default=None) or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) From 5fada70cbf825508edb85f3a0df8fb9b9a1fc622 Mon Sep 17 00:00:00 2001 From: Avichai Date: Sun, 24 May 2020 21:40:33 +0300 Subject: [PATCH 04/33] more reverse --- youtube_dl/extractor/facebook.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 0ccb3c1cf..ed05dfa7b 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -454,7 +454,6 @@ class FacebookIE(InfoExtractor): self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) - timestamp = None if webpage.find('Paid Partnership'): timestamp = self._search_regex( r'datePublished":"(.+?)"', webpage, From 846b0921306bf57095a777236590cd63f8e6c888 Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Mon, 25 May 2020 16:17:52 +0300 Subject: [PATCH 05/33] Facebook timestamp fix (#320) * missing metadate fix * timestamp fix * timestamp conditions fix * timestamp conditions fix * timestamp conditions fix Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 34 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ed05dfa7b..f1aa02913 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -454,16 +454,30 @@ class FacebookIE(InfoExtractor): self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) - if webpage.find('Paid Partnership'): - timestamp = self._search_regex( - r'datePublished":"(.+?)"', webpage, - 'timestamp', default=None) - timestamp = parse_iso8601(timestamp) - else: - timestamp = int_or_none( - self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary,'timestamp', default=None) - or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) - ) or int_or_none(self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None)) + timestamp = self._search_regex( + r'datePublished":"(.+?)"', webpage,'timestamp', default=None)\ + or self._search_regex(r'datePublished":"(.+?)"', tahoe_data.secondary, 'timestamp', default=None)\ + or self._search_regex(r'datePublished":"(.+?)"', tahoe_data.primary, 'timestamp', default=None) + timestamp = parse_iso8601(timestamp) + + if timestamp == None and webpage.find('Paid Partnership') == -1 or\ + (timestamp == None and webpage.find('Paid Partnership') > -1 and + 'cookiefile' in self._downloader.params): + + regex_search_result_date_time = self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None)\ + or self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.primary, 'timestamp', default=None)\ + or self._search_regex(r'data-utime=\\\"(\d+)\\\"', webpage,'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.secondary, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.primary, 'timestamp', default=None) + + regex_search_result_publish_time = self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None)\ + or self._search_regex(r'publish_time":([\d]+)', tahoe_data.primary, 'timestamp', default=None)\ + or self._search_regex(r'publish_time":([\d]+)', tahoe_data.secondary, 'timestamp', default=None) + + timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) + + uploader_id = self._search_regex( From a3736e799bb6514a8b982a64f74cc1c70ec56ce0 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 27 May 2020 15:33:51 +0300 Subject: [PATCH 06/33] fix_view_count_facebook --- youtube_dl/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f1aa02913..b11162a07 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -595,6 +595,10 @@ class FacebookIE(InfoExtractor): if value: return value + values = re.findall(r'(\d.\d+\w?) Views', tahoe_data.secondary) + if values: + return values[-1] + values = re.findall(r'(\d+\w?) Views', tahoe_data.secondary) if values: return values[-1] From a18d8b0a25b5a292fd874574b6e56c970fd889a2 Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Sun, 31 May 2020 11:42:38 +0300 Subject: [PATCH 07/33] Facebook new ui (#323) * fix_view_count_facebook * facebook_new_ui_metadata * facebook_new_ui_live_info_fix Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 131 ++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b11162a07..8f7a2b5c4 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,7 +1,5 @@ # coding: utf-8 from __future__ import unicode_literals - -import datetime import re import socket @@ -11,7 +9,7 @@ from ..compat import ( compat_http_client, compat_urllib_error, compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, + compat_urllib_parse_unquote_plus ) from ..utils import ( clean_html, @@ -27,11 +25,10 @@ from ..utils import ( urlencode_postdata, update_url_query, lowercase_escape, - parse_iso8601 + parse_iso8601, + unescapeHTML, ) - - class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: @@ -385,7 +382,12 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() - if not video_data : + if not video_data: + info_dict = self.get_from_new_ui(webpage, tahoe_data, url) + if info_dict: + return webpage, info_dict + + if not video_data: if self._search_regex(r'newsFeedStream.*?

(.*?)<\/span><\/h1>', webpage, "video_title") is not None: self.raise_login_required() raise ExtractorError('Cannot parse data') @@ -394,15 +396,8 @@ class FacebookIE(InfoExtractor): is_live_stream = video_data[0].get('is_live_stream', False) is_broadcast = video_data[0].get('is_broadcast', False) - live_status = 'not_live' - if is_broadcast: - live_status = 'completed' - if is_live_stream: - live_status = 'live' - if is_scheduled: - live_status = 'upcoming' + is_live, live_status = self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) - is_live = live_status == 'live' subtitles = {} formats = [] @@ -478,8 +473,6 @@ class FacebookIE(InfoExtractor): timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) - - uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, 'uploader_id', default=None) or self._search_regex( @@ -503,6 +496,50 @@ class FacebookIE(InfoExtractor): comment_count = parse_count(self._extract_comments_count(webpage, tahoe_data)) uploader_handle = self._resolve_uploader_handle(tahoe_data, uploader_id) + + info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, + thumbnail, view_count, uploader_id, is_live, live_status, likes_count, + shares_count, subtitles, comment_count, other_posts_view_count, uploader_handle) + + return webpage, info_dict + + + + def get_from_new_ui(self, webpage, tahoe_data, url): + + video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title') + comments_count = parse_count(self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count')) + subtitles = self._search_regex(r'"about":"(.+?")', webpage, 'subtitles') + likes = parse_count(self._extract_likes(webpage, tahoe_data)) + + timestamp = self._search_regex(r'"datePublished":"(.+?)"', webpage, 'timestamp') + timestamp = parse_iso8601(timestamp) + + uploader_json = self._search_regex(r'"author":{(.+?)}', webpage, 'uploader') + uploader_handle, uploader = self._extract_uploader_info_new_ui(uploader_json) + + ids_json = self._search_regex(r'data-video-channel-id="(.+?)"', webpage, 'ids') + channel_id, video_id = self._extract_ids_info_new_ui(ids_json) + + post_view_counts = parse_count(self._search_regex(r'"postViewCount":(.+?),', tahoe_data.secondary, 'views')) + other_post_view_counts = parse_count(self._search_regex(r'"otherPostsViewCount":(.+?),', tahoe_data.secondary, 'other_views')) + + share_counts = parse_count(self._search_regex(r'"sharecount":(.+?),', tahoe_data.secondary, 'other_views')) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', webpage, 'thumbnail') + is_live, live_status = self.resolve_new_ui_live_info(webpage, tahoe_data) + + formats = self.resolve_new_ui_format(webpage) + info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, + thumbnail, post_view_counts, channel_id, is_live, live_status, likes, + share_counts, {}, comments_count, other_post_view_counts, + uploader_handle) + + return info_dict + + def build_info_dict(self,webpage, tahoe_data, video_id, video_title=None, formats=None, uploader=None, + timestamp=None, thumbnail=None, view_count=None, uploader_id=None, is_live=None, live_status=None, + likes_count=None, shares_count=None, subtitles=None, comment_count=None, other_posts_view_count=None, + uploader_handle=None): info_dict = { 'id': video_id, 'title': video_title, @@ -528,7 +565,7 @@ class FacebookIE(InfoExtractor): if uploader_id: info_dict['uploader_like_count'] = FacebookAjax(self, webpage, uploader_id).page_likes - return webpage, info_dict + return info_dict def _resolve_uploader_handle(self, tahoe_data, uploader_id): uploader_handle = self._search_regex(r'"video_path":"\\\/([^\/]+)\\\/', tahoe_data.primary, 'uploader_handle', @@ -655,6 +692,64 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id return video_title + def _extract_uploader_info_new_ui(self, uploader_json): + uploader_handle = self._search_regex(r'"name":"(.+?")', uploader_json, 'uploader') + uploader_url = self._search_regex(r'"url":"(.+?")', uploader_json, 'uploader_url') + uploader_url_str = uploader_url.decode("utf-8") + uploader = uploader_url_str.split('\\/')[-2] + return uploader_handle, uploader + + def _extract_ids_info_new_ui(self, ids_json): + ids_json_str = ids_json.decode("utf-8") + ids = ids_json_str.split(':') + channel_id = ids[0] + video_id = ids[1] + return channel_id, video_id + + def resolve_new_ui_live_info(self, webpage, tahoe_data): + + is_scheduled = '"isScheduledLive":true' in tahoe_data.secondary + is_live_stream = self._search_regex(r'"isLiveVOD":(.+?),', tahoe_data.secondary, "vod_live") + is_broadcast = '"isLiveBroadcast":true' in webpage + + return self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) + + + def extract_live_info(self, is_scheduled, is_live_stream, is_broadcast): + live_status = 'not_live' + if is_broadcast: + live_status = 'completed' + if is_live_stream: + live_status = 'live' + if is_scheduled: + live_status = 'upcoming' + + is_live = live_status == 'live' + + return is_live, live_status + + + def resolve_new_ui_format(self, webpage): + format_url = self.build_format_url(webpage) + width = parse_count(self._search_regex(r' Date: Sun, 7 Jun 2020 12:47:42 +0300 Subject: [PATCH 08/33] Twitch - remove duplicate graph call --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 9692053d3..1d8992eaf 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -220,6 +220,7 @@ class TwitchItemBaseIE(TwitchBaseIE): 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), 'is_live': is_live, + 'uploader_like_count': info['channel'].get('followers') } def _real_extract(self, url): @@ -372,17 +373,19 @@ class TwitchVodIE(TwitchItemBaseIE): 'ext': 'json', }], } + """ channel_id = info['uploader_id'] channel = self._call_api( 'kraken/channels/%s' % channel_id, channel_id, 'Downloading channel info JSON') info['uploader_like_count'] = channel.get('followers') - + description = info['description'] if description is None: description = channel.get('status') info['description'] = description + """ return info From c9ffdf7b52bcd2d1e9141c6208b5c86586505be6 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Sun, 7 Jun 2020 15:03:48 +0300 Subject: [PATCH 09/33] Add missing handle and followers. --- youtube_dl/extractor/twitch.py | 64 ++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 1d8992eaf..9455c00d9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -731,36 +731,38 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_json( + response = self._download_json( 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ 'query': '''{ - clip(slug: "%s") { - broadcaster { - displayName - id - } - createdAt - curator { - displayName - id - } - durationSeconds - id - tiny: thumbnailURL(width: 86, height: 45) - small: thumbnailURL(width: 260, height: 147) - medium: thumbnailURL(width: 480, height: 272) - title - videoQualities { - frameRate - quality - sourceURL - } - viewCount - } -}''' % video_id, + clip(slug: "%s") { + broadcaster { + displayName + id + } + createdAt + curator { + displayName + id + } + durationSeconds + id + tiny: thumbnailURL(width: 86, height: 45) + small: thumbnailURL(width: 260, height: 147) + medium: thumbnailURL(width: 480, height: 272) + title + videoQualities { + frameRate + quality + sourceURL + } + viewCount + } + }''' % video_id, }).encode(), headers={ 'Client-ID': self._CLIENT_ID, - })['data']['clip'] + }) + + clip = response['data']['clip'] if not clip: raise ExtractorError( @@ -798,6 +800,13 @@ class TwitchClipsIE(TwitchBaseIE): }) thumbnails.append(thumb) + channel_id = clip['broadcaster']['id'] + channel = self._call_api( + 'kraken/channels/%s' % channel_id, + channel_id, 'Downloading channel info JSON') + + uploader_like_count = channel.get('followers') + creator_handle = channel.get('name') return { 'id': clip.get('id') or video_id, 'title': clip.get('title') or video_id, @@ -808,6 +817,9 @@ class TwitchClipsIE(TwitchBaseIE): 'thumbnails': thumbnails, 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), 'creator_id': try_get(clip, lambda x: x['broadcaster']['id'], compat_str), + 'creator_handle': creator_handle, 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + 'uploader_handle': try_get(clip, lambda x: x['curator']['name'], compat_str), + 'uploader_like_count': uploader_like_count } From 7924604f0dc395dbe71b3f3f7f79f47401d53bab Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Sun, 7 Jun 2020 15:07:56 +0300 Subject: [PATCH 10/33] . --- youtube_dl/extractor/twitch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 9455c00d9..63d8f675a 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -217,6 +217,7 @@ class TwitchItemBaseIE(TwitchBaseIE): 'thumbnails': thumbnails, 'uploader': info.get('channel', {}).get('display_name'), 'uploader_id': info.get('channel', {}).get('name'), + 'uploader_handle': info.get('channel', {}).get('name'), 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), 'is_live': is_live, @@ -677,6 +678,7 @@ class TwitchStreamIE(TwitchBaseIE): 'thumbnails': thumbnails, 'uploader': channel.get('display_name'), 'uploader_id': channel.get('name'), + 'uploader_handle': channel.get('name'), 'timestamp': timestamp, 'view_count': view_count, 'formats': formats, From 02aaba1c06f22bec632c9a54f8c3dd6294d4aedb Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 7 Jun 2020 15:18:18 +0300 Subject: [PATCH 11/33] fix title, subscription, comments, live --- youtube_dl/extractor/facebook.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8f7a2b5c4..912dc9907 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -507,9 +507,10 @@ class FacebookIE(InfoExtractor): def get_from_new_ui(self, webpage, tahoe_data, url): - video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title') - comments_count = parse_count(self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count')) - subtitles = self._search_regex(r'"about":"(.+?")', webpage, 'subtitles') + video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title', fatal=False) + if not video_title: + video_title = self._search_regex(r'"pageTitle">(.+?)<', webpage, 'title') + comments_count = parse_count(self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count', fatal=False)) likes = parse_count(self._extract_likes(webpage, tahoe_data)) timestamp = self._search_regex(r'"datePublished":"(.+?)"', webpage, 'timestamp') @@ -719,10 +720,10 @@ class FacebookIE(InfoExtractor): live_status = 'not_live' if is_broadcast: live_status = 'completed' - if is_live_stream: - live_status = 'live' - if is_scheduled: - live_status = 'upcoming' + if is_live_stream: + live_status = 'live' + if is_scheduled: + live_status = 'upcoming' is_live = live_status == 'live' From 53b65d05347292dd6e1c3752d9e5219ba9899fb3 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 7 Jun 2020 17:00:16 +0300 Subject: [PATCH 12/33] fix uploader id --- youtube_dl/extractor/facebook.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 912dc9907..e3f227460 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -472,14 +472,7 @@ class FacebookIE(InfoExtractor): timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) - - uploader_id = self._search_regex( - r'ownerid:"([\d]+)', webpage, - 'uploader_id', default=None) or self._search_regex( - r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]',tahoe_data.secondary, - 'uploader_id', default=None) or \ - self._search_regex(r'\\\"page_id\\\"\s*:\s*\\\"(\d+)\\\"', tahoe_data.secondary, 'uploader_id', fatal=False) or \ - self._search_regex(r'content_owner_id_new\\":\\"(\d+)\\"', tahoe_data.secondary, 'uploader_id', fatal=False) + uploader_id = self._resolve_uploader_id(webpage, tahoe_data) thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) if is_live: @@ -509,7 +502,9 @@ class FacebookIE(InfoExtractor): video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title', fatal=False) if not video_title: - video_title = self._search_regex(r'"pageTitle">(.+?)<', webpage, 'title') + video_title = self._search_regex(r'"pageTitle">(.+?)<', webpage, 'title', fatal=False) + if not video_title: + video_title = self._extract_video_title(webpage, tahoe_data, video_id) comments_count = parse_count(self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count', fatal=False)) likes = parse_count(self._extract_likes(webpage, tahoe_data)) @@ -520,7 +515,8 @@ class FacebookIE(InfoExtractor): uploader_handle, uploader = self._extract_uploader_info_new_ui(uploader_json) ids_json = self._search_regex(r'data-video-channel-id="(.+?)"', webpage, 'ids') - channel_id, video_id = self._extract_ids_info_new_ui(ids_json) + uploader_id = self._resolve_uploader_id(webpage, tahoe_data) + video_id = self._extract_ids_info_new_ui(ids_json) post_view_counts = parse_count(self._search_regex(r'"postViewCount":(.+?),', tahoe_data.secondary, 'views')) other_post_view_counts = parse_count(self._search_regex(r'"otherPostsViewCount":(.+?),', tahoe_data.secondary, 'other_views')) @@ -531,7 +527,7 @@ class FacebookIE(InfoExtractor): formats = self.resolve_new_ui_format(webpage) info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, - thumbnail, post_view_counts, channel_id, is_live, live_status, likes, + thumbnail, post_view_counts, uploader_id, is_live, live_status, likes, share_counts, {}, comments_count, other_post_view_counts, uploader_handle) @@ -703,9 +699,8 @@ class FacebookIE(InfoExtractor): def _extract_ids_info_new_ui(self, ids_json): ids_json_str = ids_json.decode("utf-8") ids = ids_json_str.split(':') - channel_id = ids[0] video_id = ids[1] - return channel_id, video_id + return video_id def resolve_new_ui_live_info(self, webpage, tahoe_data): @@ -750,6 +745,17 @@ class FacebookIE(InfoExtractor): format_url = unescapeHTML(format_url) return format_url + def _resolve_uploader_id(self, webpage, tahoe_data): + uploader_id = self._search_regex( + r'ownerid:"([\d]+)', webpage, + 'uploader_id', default=None) or self._search_regex( + r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary, + 'uploader_id', default=None) or \ + self._search_regex(r'\\\"page_id\\\"\s*:\s*\\\"(\d+)\\\"', tahoe_data.secondary, 'uploader_id', + fatal=False) or \ + self._search_regex(r'content_owner_id_new\\":\\"(\d+)\\"', tahoe_data.secondary, 'uploader_id', + fatal=False) + return uploader_id class FacebookTahoeData: def __init__(self, extractor, page, video_id): From 1d4345ec78b6792506e426e62271779a3b9d805d Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Mon, 8 Jun 2020 10:38:11 +0300 Subject: [PATCH 13/33] fix new ui bugs (#333) * fix new ui bugs * fix new ui bugs- pr comments Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 61 +++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e3f227460..615371c64 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -383,7 +383,7 @@ class FacebookIE(InfoExtractor): self.raise_login_required() if not video_data: - info_dict = self.get_from_new_ui(webpage, tahoe_data, url) + info_dict = self.get_from_new_ui(webpage, tahoe_data, video_id) if info_dict: return webpage, info_dict @@ -448,11 +448,7 @@ class FacebookIE(InfoExtractor): _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False)) or \ self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) - - timestamp = self._search_regex( - r'datePublished":"(.+?)"', webpage,'timestamp', default=None)\ - or self._search_regex(r'datePublished":"(.+?)"', tahoe_data.secondary, 'timestamp', default=None)\ - or self._search_regex(r'datePublished":"(.+?)"', tahoe_data.primary, 'timestamp', default=None) + timestamp = self._resolve_timestamp(webpage, tahoe_data) timestamp = parse_iso8601(timestamp) if timestamp == None and webpage.find('Paid Partnership') == -1 or\ @@ -498,34 +494,32 @@ class FacebookIE(InfoExtractor): - def get_from_new_ui(self, webpage, tahoe_data, url): + def get_from_new_ui(self, webpage, tahoe_data, video_id): + + video_title = self._resolve_new_ui_title(webpage, tahoe_data, video_id) + + comments_count = self._resolve_new_ui_comments_count(webpage, tahoe_data) - video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title', fatal=False) - if not video_title: - video_title = self._search_regex(r'"pageTitle">(.+?)<', webpage, 'title', fatal=False) - if not video_title: - video_title = self._extract_video_title(webpage, tahoe_data, video_id) - comments_count = parse_count(self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count', fatal=False)) likes = parse_count(self._extract_likes(webpage, tahoe_data)) - timestamp = self._search_regex(r'"datePublished":"(.+?)"', webpage, 'timestamp') - timestamp = parse_iso8601(timestamp) + timestamp = self._resolve_new_ui_timestamp(webpage, tahoe_data) uploader_json = self._search_regex(r'"author":{(.+?)}', webpage, 'uploader') uploader_handle, uploader = self._extract_uploader_info_new_ui(uploader_json) - ids_json = self._search_regex(r'data-video-channel-id="(.+?)"', webpage, 'ids') uploader_id = self._resolve_uploader_id(webpage, tahoe_data) - video_id = self._extract_ids_info_new_ui(ids_json) post_view_counts = parse_count(self._search_regex(r'"postViewCount":(.+?),', tahoe_data.secondary, 'views')) other_post_view_counts = parse_count(self._search_regex(r'"otherPostsViewCount":(.+?),', tahoe_data.secondary, 'other_views')) share_counts = parse_count(self._search_regex(r'"sharecount":(.+?),', tahoe_data.secondary, 'other_views')) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', webpage, 'thumbnail') + is_live, live_status = self.resolve_new_ui_live_info(webpage, tahoe_data) formats = self.resolve_new_ui_format(webpage) + info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, thumbnail, post_view_counts, uploader_id, is_live, live_status, likes, share_counts, {}, comments_count, other_post_view_counts, @@ -757,6 +751,39 @@ class FacebookIE(InfoExtractor): fatal=False) return uploader_id + def _resolve_timestamp(self, webpage, tahoe_data): + timestamp = self._search_regex( + r'datePublished":"(.+?)"', webpage, 'timestamp', default=None) \ + or self._search_regex(r'datePublished":"(.+?)"', tahoe_data.secondary, 'timestamp', default=None) \ + or self._search_regex(r'datePublished":"(.+?)"', tahoe_data.primary, 'timestamp', default=None) + return timestamp + + def _resolve_new_ui_title(self, webpage, tahoe_data, video_id): + video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title', fatal=False) + if not video_title: + video_title = self._search_regex(r'"pageTitle">(.+?)<', webpage, 'title', fatal=False) + if not video_title: + video_title = self._extract_video_title(webpage, tahoe_data, video_id) + return video_title + + def _resolve_new_ui_comments_count(self, webpage, tahoe_data): + comments_count = parse_count( + self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count', fatal=False)) + if comments_count is None: + comments_count = parse_count( + self._search_regex(r'"commentcount":(.+?,)', tahoe_data.secondary, 'comments_count', fatal=False)) + if comments_count is None: + comments_count = parse_count(self._extract_comments_count(webpage, tahoe_data)) + return comments_count + + def _resolve_new_ui_timestamp(self, webpage, tahoe_data): + timestamp = self._search_regex(r'"datePublished":"(.+?)"', webpage, 'timestamp', fatal=False) + if not timestamp: + timestamp = self._resolve_timestamp(webpage, tahoe_data) + timestamp = parse_iso8601(timestamp) + return timestamp + + class FacebookTahoeData: def __init__(self, extractor, page, video_id): self._page = page From 953b641944b3eeb9b8d54a6a4e12b6bfa7807384 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Tue, 9 Jun 2020 09:41:10 +0300 Subject: [PATCH 14/33] fix twitch view count. --- youtube_dl/extractor/twitch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 63d8f675a..9c286e0a8 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -814,7 +814,7 @@ class TwitchClipsIE(TwitchBaseIE): 'title': clip.get('title') or video_id, 'formats': formats, 'duration': int_or_none(clip.get('durationSeconds')), - 'views': int_or_none(clip.get('viewCount')), + 'view_count': int_or_none(clip.get('viewCount')), 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), From e8d147618b17b2a7def26eec8bbb93edb7d7e614 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Wed, 10 Jun 2020 16:16:36 +0300 Subject: [PATCH 15/33] facebook fix reactions as likes. --- youtube_dl/extractor/facebook.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 615371c64..e5e73633a 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -587,18 +587,24 @@ class FacebookIE(InfoExtractor): return value + @staticmethod + def _extract_first_pattern(pairs): + for pattern, data_list in pairs: + if not isinstance(data_list, list): + data_list = [data_list] + for data in data_list: + values = re.findall(pattern, data) + if values: + return values[-1] + def _extract_likes(self, webpage, tahoe_data): - values = re.findall(r'\blikecount\s*:\s*["\']([\d,.]+)', webpage) - if values: - return values[-1] - - values = re.findall(r'[\'\"]\blikecount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary) - if values: - return values[-1] - - values = re.findall(r'"reaction_count"\s*:\s*{\s*"count"\s*:\s*(\d+)', tahoe_data.secondary) - if values: - return values[-1] + pairs = ( + (r'"reaction_count"\s*:\s*{\s*"count"\s*:\s*(\d+)', [tahoe_data.secondary, webpage]), + (r'reaction_count:{count:([\d]+)}', webpage), + (r'\blikecount\s*:\s*["\']([\d,.]+)', webpage), + (r'[\'\"]\blikecount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary) + ) + return self._extract_first_pattern(pairs) def _extract_shares(self, webpage, tahoe_data): value = self._extract_meta_count(['sharecount'], webpage, tahoe_data, 'shares') From 2f3c50226a15d91f9266be51b0bf4175add351d3 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Fri, 12 Jun 2020 18:38:29 +0300 Subject: [PATCH 16/33] fix clips. --- youtube_dl/extractor/twitch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 9c286e0a8..cbe823a5a 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -808,7 +808,7 @@ class TwitchClipsIE(TwitchBaseIE): channel_id, 'Downloading channel info JSON') uploader_like_count = channel.get('followers') - creator_handle = channel.get('name') + broadcaster_handle = channel.get('name') return { 'id': clip.get('id') or video_id, 'title': clip.get('title') or video_id, @@ -819,9 +819,9 @@ class TwitchClipsIE(TwitchBaseIE): 'thumbnails': thumbnails, 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), 'creator_id': try_get(clip, lambda x: x['broadcaster']['id'], compat_str), - 'creator_handle': creator_handle, + 'broadcaster_handle': broadcaster_handle, 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), - 'uploader_handle': try_get(clip, lambda x: x['curator']['name'], compat_str), + 'uploader_handle': try_get(clip, lambda x: x['curator']['id'], compat_str), 'uploader_like_count': uploader_like_count } From fcb08be29cc14f44ababbaba419750707a0f39fa Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Tue, 16 Jun 2020 16:20:45 +0300 Subject: [PATCH 17/33] Wrong title facebook crawled videos before fresh (#337) * timestamp conditions fix * title and thumbnail fix * pr fix * title and thumbnail fix * timestamp order Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 50 ++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b13bf8e51..eba2edfef 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -442,35 +442,31 @@ class FacebookIE(InfoExtractor): if s: return lowercase_escape(s) - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or \ self._search_regex(r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False)) or \ self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) + timestamp = self._resolve_timestamp(webpage, tahoe_data) timestamp = parse_iso8601(timestamp) - - if timestamp == None and webpage.find('Paid Partnership') == -1 or\ - (timestamp == None and webpage.find('Paid Partnership') > -1 and - 'cookiefile' in self._downloader.params): - - regex_search_result_date_time = self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None)\ - or self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.primary, 'timestamp', default=None)\ - or self._search_regex(r'data-utime=\\\"(\d+)\\\"', webpage,'timestamp', default=None)\ - or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)\ - or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.secondary, 'timestamp', default=None)\ - or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.primary, 'timestamp', default=None) - - regex_search_result_publish_time = self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None)\ - or self._search_regex(r'publish_time":([\d]+)', tahoe_data.primary, 'timestamp', default=None)\ - or self._search_regex(r'publish_time":([\d]+)', tahoe_data.secondary, 'timestamp', default=None) - + if timestamp is None and webpage.find('Paid Partnership') == -1 or \ + (timestamp is None and webpage.find('Paid Partnership') > -1 and 'cookiefile' in self._downloader.params): + regex_search_result_date_time = self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None) \ + or self._search_regex(r'data-utime=\\\"(\d+)\\\"', tahoe_data.primary, 'timestamp', default=None)\ + or self._search_regex(r'data-utime=\\\"(\d+)\\\"', webpage, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.secondary, 'timestamp', default=None)\ + or self._search_regex(r']+data-utime=["\'](\d+)', tahoe_data.primary, 'timestamp', default=None) + regex_search_result_publish_time = self._search_regex(r'publish_time":([\d]+)', webpage, 'timestamp', default=None) \ + or self._search_regex(r'publish_time":([\d]+)', tahoe_data.primary, 'timestamp', default=None) \ + or self._search_regex(r'publish_time":([\d]+)', tahoe_data.secondary, 'timestamp', default=None) timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) uploader_id = self._resolve_uploader_id(webpage, tahoe_data) - thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) + thumbnail = self._resolve_thumbnail(webpage, tahoe_data) + if is_live: view_count = parse_count( self._search_regex(r'viewerCount:([\d]+)', webpage, 'views', fatal=False) or \ @@ -673,16 +669,16 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r']*class="uiHeaderTitle"[^>]*>([^<]*)

', webpage, 'title', default=None) - if not video_title: + if not self._valid_video_title(video_title): video_title = self._html_search_regex( r'(?s)(.*?)', webpage, 'alternative title', default=None) - if not video_title: + if not self._valid_video_title(video_title): video_title = self._og_search_title(webpage, default=None) - if not video_title: + if not self._valid_video_title(video_title): video_title = self._html_search_meta( 'description', webpage, 'title', default=None) - if not video_title: + if not self._valid_video_title(video_title): values = re.findall(r'videoTitle"\s*:\s*"(.*?)"', tahoe_data.secondary) if values: video_title = values[-1] @@ -792,6 +788,16 @@ class FacebookIE(InfoExtractor): timestamp = parse_iso8601(timestamp) return timestamp + def _resolve_thumbnail(self, webpage, tahoe_data): + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) + if not thumbnail: + thumbnail = self._search_regex(r'"subtitles_src":"(.+?")', tahoe_data.primary, 'thumbnail', fatal=False) + return thumbnail + + def _valid_video_title(self, video_title): + return video_title and not u'Log In or Sign Up to View' in video_title + + class FacebookTahoeData: def __init__(self, extractor, page, video_id): From e7a5a1e20f4fb926dd8799f7d402b5c68f9329d2 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 17 Jun 2020 10:57:24 +0300 Subject: [PATCH 18/33] separate reactions from likes --- youtube_dl/extractor/facebook.py | 41 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index eba2edfef..f6ac251f5 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -398,7 +398,6 @@ class FacebookIE(InfoExtractor): is_live, live_status = self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) - subtitles = {} formats = [] for f in video_data: @@ -443,8 +442,9 @@ class FacebookIE(InfoExtractor): return lowercase_escape(s) uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or \ - self._search_regex(r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ - _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False)) or \ + self._search_regex(r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ + _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', + fatal=False)) or \ self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) @@ -476,6 +476,7 @@ class FacebookIE(InfoExtractor): view_count = parse_count(self._extract_views(webpage, tahoe_data)) other_posts_view_count = parse_count(self._extract_meta_count(['otherPostsViewCount'], webpage, tahoe_data, 'other_post_views')) + reactions_count = parse_count(self._extract_reactions(webpage, tahoe_data)) likes_count = parse_count(self._extract_likes(webpage, tahoe_data)) shares_count = parse_count(self._extract_shares(webpage, tahoe_data)) comment_count = parse_count(self._extract_comments_count(webpage, tahoe_data)) @@ -484,12 +485,11 @@ class FacebookIE(InfoExtractor): info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, thumbnail, view_count, uploader_id, is_live, live_status, likes_count, - shares_count, subtitles, comment_count, other_posts_view_count, uploader_handle) + reactions_count, shares_count, subtitles, comment_count, other_posts_view_count, + uploader_handle) return webpage, info_dict - - def get_from_new_ui(self, webpage, tahoe_data, video_id): video_title = self._resolve_new_ui_title(webpage, tahoe_data, video_id) @@ -498,6 +498,8 @@ class FacebookIE(InfoExtractor): likes = parse_count(self._extract_likes(webpage, tahoe_data)) + reactions = parse_count(self._extract_reactions(webpage, tahoe_data)) + timestamp = self._resolve_new_ui_timestamp(webpage, tahoe_data) uploader_json = self._search_regex(r'"author":{(.+?)}', webpage, 'uploader') @@ -517,15 +519,17 @@ class FacebookIE(InfoExtractor): formats = self.resolve_new_ui_format(webpage) info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, - thumbnail, post_view_counts, uploader_id, is_live, live_status, likes, + thumbnail, post_view_counts, uploader_id, is_live, live_status, likes, reactions, share_counts, {}, comments_count, other_post_view_counts, uploader_handle) return info_dict - def build_info_dict(self,webpage, tahoe_data, video_id, video_title=None, formats=None, uploader=None, - timestamp=None, thumbnail=None, view_count=None, uploader_id=None, is_live=None, live_status=None, - likes_count=None, shares_count=None, subtitles=None, comment_count=None, other_posts_view_count=None, + def build_info_dict(self, webpage, tahoe_data, video_id, video_title=None, formats=None, uploader=None, + timestamp=None, thumbnail=None, view_count=None, uploader_id=None, is_live=None, + live_status=None, + likes_count=None, reactions_count=None, shares_count=None, subtitles=None, comment_count=None, + other_posts_view_count=None, uploader_handle=None): info_dict = { 'id': video_id, @@ -539,6 +543,7 @@ class FacebookIE(InfoExtractor): 'is_live': is_live, 'live_status': live_status, 'like_count': likes_count, + 'reactions_count': reactions_count, 'share_count': shares_count, 'subtitles': subtitles, 'comment_count': comment_count, @@ -571,8 +576,8 @@ class FacebookIE(InfoExtractor): if value: break value = self._search_regex( - r'\b%s\s*:\s*["\']([\d,.]+)' % f, webpage, name, - default=None + r'\b%s\s*:\s*["\']([\d,.]+)' % f, webpage, name, + default=None ) if value: break @@ -593,10 +598,15 @@ class FacebookIE(InfoExtractor): if values: return values[-1] - def _extract_likes(self, webpage, tahoe_data): + def _extract_reactions(self, webpage, tahoe_data): pairs = ( (r'"reaction_count"\s*:\s*{\s*"count"\s*:\s*(\d+)', [tahoe_data.secondary, webpage]), - (r'reaction_count:{count:([\d]+)}', webpage), + (r'reaction_count:{count:([\d]+)}', webpage) + ) + return self._extract_first_pattern(pairs) + + def _extract_likes(self, webpage, tahoe_data): + pairs = ( (r'\blikecount\s*:\s*["\']([\d,.]+)', webpage), (r'[\'\"]\blikecount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary) ) @@ -709,7 +719,6 @@ class FacebookIE(InfoExtractor): return self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) - def extract_live_info(self, is_scheduled, is_live_stream, is_broadcast): live_status = 'not_live' if is_broadcast: @@ -723,7 +732,6 @@ class FacebookIE(InfoExtractor): return is_live, live_status - def resolve_new_ui_format(self, webpage): format_url = self.build_format_url(webpage) width = parse_count(self._search_regex(r' Date: Wed, 17 Jun 2020 16:25:11 +0300 Subject: [PATCH 19/33] add facebook test --- test/ci/test_facebook.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 test/ci/test_facebook.py diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py new file mode 100644 index 000000000..540a89508 --- /dev/null +++ b/test/ci/test_facebook.py @@ -0,0 +1,31 @@ +import unittest +import youtube_dl + + +class facebookMetaData(unittest.TestCase): + def test_metadata_fetch(self): + params = {} + url = "https://www.facebook.com/iihfhockey/videos/2742345396033296/" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertTrue(info.has_key('like_count')) + self.assertTrue(info.has_key('reactions_count')) + + def _test_metadata_fetch_with_log_in(self): + url = "https://www.facebook.com/iihfhockey/videos/2742345396033296/" + params = {} + with open("cookie_file") as file: + proxy = "ec2-35-175-164-238.compute-1.amazonaws.com:3128" + params['cookiefile'] = file.name + params['proxy'] = proxy + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertTrue(info.get('timestamp')) + self.assertTrue(info.get('view_count')) + self.assertTrue(info.get('width')) + self.assertTrue(info.get('uploader_id')) + self.assertTrue(info.get('thumbnail')) + + +if __name__ == '__main__': + unittest.main() From ed06b54607d330bfcba1174cb0118a11e2cc1e5d Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Wed, 17 Jun 2020 17:53:43 +0300 Subject: [PATCH 20/33] fix facebook thumbnail (#340) Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index eba2edfef..ea93173ce 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -791,7 +791,8 @@ class FacebookIE(InfoExtractor): def _resolve_thumbnail(self, webpage, tahoe_data): thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) if not thumbnail: - thumbnail = self._search_regex(r'"subtitles_src":"(.+?")', tahoe_data.primary, 'thumbnail', fatal=False) + thumbnail = self._search_regex(r'"thumbSrc":"(.+?)"', tahoe_data.secondary, 'thumbnail', fatal=False) + thumbnail = str(thumbnail).replace('\\', "") return thumbnail def _valid_video_title(self, video_title): From ff7e543821cd463c9989d480d216427d32593469 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 24 Jun 2020 09:42:00 +0300 Subject: [PATCH 21/33] fix facebook comments count --- test/ci/test_facebook.py | 27 +++++++++++++++++++++------ youtube_dl/extractor/facebook.py | 8 ++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 540a89508..8533a81f5 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -3,25 +3,40 @@ import youtube_dl class facebookMetaData(unittest.TestCase): - def test_metadata_fetch(self): + def test_likes_metadata(self): params = {} url = "https://www.facebook.com/iihfhockey/videos/2742345396033296/" ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) - self.assertTrue(info.has_key('like_count')) - self.assertTrue(info.has_key('reactions_count')) + self.assertGreater(info.get('like_count'), 200) - def _test_metadata_fetch_with_log_in(self): - url = "https://www.facebook.com/iihfhockey/videos/2742345396033296/" + def test_reactions_metadata(self): + params = {} + url = "https://www.facebook.com/supercarblondie/videos/519426815548240/" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertGreater(info.get('reactions_count'), 1000000) + self.assertGreater(info.get('like_count'), 800000) + + def test_comments_live_video(self): + params = {} + url = "https://www.facebook.com/Medianetlive/videos/676754012901513/" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertGreater(info.get('comment_count'), 0) + + def test_metadata_fetch_with_log_in(self): + url = "https://www.facebook.com/SerieA/videos/282581803097269" params = {} with open("cookie_file") as file: - proxy = "ec2-35-175-164-238.compute-1.amazonaws.com:3128" + proxy = "ec2-3-221-82-67.compute-1.amazonaws.com:3128" params['cookiefile'] = file.name params['proxy'] = proxy ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) self.assertTrue(info.get('timestamp')) self.assertTrue(info.get('view_count')) + self.assertTrue(info.get('comment_count')) self.assertTrue(info.get('width')) self.assertTrue(info.get('uploader_id')) self.assertTrue(info.get('thumbnail')) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 108d385a1..cf60a2c21 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -630,6 +630,14 @@ class FacebookIE(InfoExtractor): if values: return values[-1] + value = self._search_regex(r'"commentCount":(.+?),', webpage, 'comment_count', fatal=False) + if value: + return value + + value = self._search_regex(r'"commentcount":(.+?),', tahoe_data.secondary, 'comment_count', fatal=False) + if value: + return value + def _extract_views(self, webpage, tahoe_data): value = self._extract_meta_count(['postViewCount', 'viewCount'], webpage, tahoe_data, 'likes') if value: From b84d66e6264ba2b1fbff49417204808747c84765 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Wed, 24 Jun 2020 14:06:26 +0300 Subject: [PATCH 22/33] fix twitch --- youtube_dl/extractor/twitch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index cbe823a5a..ad039745e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -817,11 +817,10 @@ class TwitchClipsIE(TwitchBaseIE): 'view_count': int_or_none(clip.get('viewCount')), 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, - 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), - 'creator_id': try_get(clip, lambda x: x['broadcaster']['id'], compat_str), + 'broadcaster': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), + 'broadcaster_id': try_get(clip, lambda x: x['broadcaster']['id'], compat_str), 'broadcaster_handle': broadcaster_handle, 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), - 'uploader_handle': try_get(clip, lambda x: x['curator']['id'], compat_str), 'uploader_like_count': uploader_like_count } From a1986b5bf25a0c17bee78897b6945bcd4f983077 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 28 Jun 2020 15:59:47 +0300 Subject: [PATCH 23/33] fix facebook thumbnail --- test/ci/test_facebook.py | 25 ++++++++++++++++++- youtube_dl/extractor/facebook.py | 41 +++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 8533a81f5..383017c09 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -1,5 +1,6 @@ import unittest import youtube_dl +from youtube_dl.utils import DownloadError class facebookMetaData(unittest.TestCase): @@ -25,8 +26,15 @@ class facebookMetaData(unittest.TestCase): info = ydl.extract_info(url, download=False) self.assertGreater(info.get('comment_count'), 0) + def test_meta_data(self): + params = {} + url = "https://www.facebook.com/watch?v=177407933624543/" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertGreater(info.get('comment_count'), 0) + def test_metadata_fetch_with_log_in(self): - url = "https://www.facebook.com/SerieA/videos/282581803097269" + url = "https://www.facebook.com/oristandup/videos/675360549895283" params = {} with open("cookie_file") as file: proxy = "ec2-3-221-82-67.compute-1.amazonaws.com:3128" @@ -34,6 +42,8 @@ class facebookMetaData(unittest.TestCase): params['proxy'] = proxy ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) + print (info.get('title')) + print (info.get('timestamp')) self.assertTrue(info.get('timestamp')) self.assertTrue(info.get('view_count')) self.assertTrue(info.get('comment_count')) @@ -41,6 +51,19 @@ class facebookMetaData(unittest.TestCase): self.assertTrue(info.get('uploader_id')) self.assertTrue(info.get('thumbnail')) + def test_unavailable_video(self): + url = "https://www.facebook.com/101457238278830/videos/287839102599521/" + params = {} + with open("cookie_file") as file: + try: + proxy = "ec2-3-221-82-67.compute-1.amazonaws.com:3128" + params['cookiefile'] = file.name + params['proxy'] = proxy + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + except DownloadError: + self.assertRaises(DownloadError) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cf60a2c21..f18eb872b 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -374,13 +374,7 @@ class FacebookIE(InfoExtractor): if not video_data: if not fatal_if_no_video: return webpage, False - m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) - if m_msg is not None: - raise ExtractorError( - 'The video is not available, Facebook said: "%s"' % m_msg.group(1), - expected=True) - elif '>You must log in to continue' in webpage: - self.raise_login_required() + self.validate_webpage(webpage) if not video_data: info_dict = self.get_from_new_ui(webpage, tahoe_data, video_id) @@ -806,13 +800,42 @@ class FacebookIE(InfoExtractor): def _resolve_thumbnail(self, webpage, tahoe_data): thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) + if not thumbnail: - thumbnail = self._search_regex(r'"thumbSrc":"(.+?)"', tahoe_data.secondary, 'thumbnail', fatal=False) + page = self.resolve_full_webpage(tahoe_data) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', page, 'thumbnail', fatal=False) thumbnail = str(thumbnail).replace('\\', "") return thumbnail def _valid_video_title(self, video_title): - return video_title and not u'Log In or Sign Up to View' in video_title + if video_title: + video_title = video_title.lower() + return video_title and not u'log in or sign up to view' in video_title + + def validate_webpage(self, webpage): + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) + if m_msg is not None: + raise ExtractorError( + 'The video is not available, Facebook said: "%s"' % m_msg.group(1), + expected=True) + if 'Your Request Couldn\'t be Processed' in webpage: + raise ExtractorError( + 'The video is not available, Facebook said: this content is not available', + expected=True) + elif '>You must log in to continue' in webpage: + self.raise_login_required() + + def resolve_full_webpage(self, tahoe_data): + import urllib2 + user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3' + headers = {'User-Agent': user_agent} + full_url = self._search_regex(r'"permalinkURL":"(.+?)"', tahoe_data.primary, 'video_url', fatal=False) + full_url = str(full_url).replace('\\', "") + req = urllib2.Request(full_url, None, headers) + response = urllib2.urlopen(req) + page = response.read() + response.close() + return page class FacebookTahoeData: From 0f45a3da23690f9138f7b2c3083b72951c40e392 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Mon, 29 Jun 2020 09:28:58 +0300 Subject: [PATCH 24/33] fix facebook thumbnail --- test/ci/test_facebook.py | 2 +- youtube_dl/extractor/facebook.py | 15 +-------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 383017c09..8910b3261 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -28,7 +28,7 @@ class facebookMetaData(unittest.TestCase): def test_meta_data(self): params = {} - url = "https://www.facebook.com/watch?v=177407933624543/" + url = "https://www.facebook.com/parapsychological.centr/videos/177407933624543/" ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) self.assertGreater(info.get('comment_count'), 0) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f18eb872b..8db50e89f 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -802,8 +802,7 @@ class FacebookIE(InfoExtractor): thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) if not thumbnail: - page = self.resolve_full_webpage(tahoe_data) - thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', page, 'thumbnail', fatal=False) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', webpage, 'thumbnail', fatal=False) thumbnail = str(thumbnail).replace('\\', "") return thumbnail @@ -825,18 +824,6 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() - def resolve_full_webpage(self, tahoe_data): - import urllib2 - user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3' - headers = {'User-Agent': user_agent} - full_url = self._search_regex(r'"permalinkURL":"(.+?)"', tahoe_data.primary, 'video_url', fatal=False) - full_url = str(full_url).replace('\\', "") - req = urllib2.Request(full_url, None, headers) - response = urllib2.urlopen(req) - page = response.read() - response.close() - return page - class FacebookTahoeData: def __init__(self, extractor, page, video_id): From 0ee48f6ffc3a429373d4b0cf8d26ec6676d13409 Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Thu, 23 Jul 2020 11:30:22 +0300 Subject: [PATCH 25/33] get timestamp for paid videos (#348) Co-authored-by: bhodaya --- test/ci/test_facebook.py | 9 +++++++++ youtube_dl/extractor/facebook.py | 3 +++ 2 files changed, 12 insertions(+) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 8910b3261..b53969c96 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -64,6 +64,15 @@ class facebookMetaData(unittest.TestCase): except DownloadError: self.assertRaises(DownloadError) + def test_paid_videos_timestamp(self): + params = {} + url = "https://www.facebook.com/148456285190063/videos/307226959975478" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + print (info.get('timestamp')) + self.assertTrue(info.get('timestamp')) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8db50e89f..222202658 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -457,6 +457,9 @@ class FacebookIE(InfoExtractor): or self._search_regex(r'publish_time":([\d]+)', tahoe_data.secondary, 'timestamp', default=None) timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) + if timestamp is None and webpage.find('Paid Partnership') > -1: + timestamp = self._search_regex(r'"publish_time\\":(.+?),', webpage, 'timestamp', default=None, fatal=False) + uploader_id = self._resolve_uploader_id(webpage, tahoe_data) thumbnail = self._resolve_thumbnail(webpage, tahoe_data) From 9b7ad50b3f4d0525417a82d07ec9ab31d45cd330 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 2 Aug 2020 17:36:59 +0300 Subject: [PATCH 26/33] facebook fixes --- youtube_dl/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 222202658..3f37282d9 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -440,6 +440,7 @@ class FacebookIE(InfoExtractor): _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False)) or \ self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ + self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', tahoe_data.primary, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) timestamp = self._resolve_timestamp(webpage, tahoe_data) @@ -806,6 +807,9 @@ class FacebookIE(InfoExtractor): if not thumbnail: thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', webpage, 'thumbnail', fatal=False) + if not thumbnail: + thumbnail = self._search_regex(r'"preferredThumbnailURI":"(.+?)"', tahoe_data.primary, 'thumbnail', + fatal=False) thumbnail = str(thumbnail).replace('\\', "") return thumbnail From e3e82f7d20d834e69cec513cb40419431ef46f89 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Tue, 4 Aug 2020 22:07:08 +0300 Subject: [PATCH 27/33] update facebook user agent. --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 3f37282d9..8beffb22c 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor): _NETRC_MACHINE = 'facebook' IE_NAME = 'facebook' - _CHROME_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36' + _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux armv7l) AppleWebKit/537.36 (KHTML, like Gecko) Raspbian Chromium/78.0.3904.108 Chrome/78.0.3904.108 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=%s' From 8ebd1ff488558c71a372d39be50c09b40ea380b5 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Mon, 10 Aug 2020 14:14:20 +0300 Subject: [PATCH 28/33] generic title fix --- youtube_dl/extractor/facebook.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8beffb22c..4088f6f84 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -692,8 +692,7 @@ class FacebookIE(InfoExtractor): if not self._valid_video_title(video_title): video_title = self._og_search_title(webpage, default=None) if not self._valid_video_title(video_title): - video_title = self._html_search_meta( - 'description', webpage, 'title', default=None) + video_title = self._resolve_description(webpage, tahoe_data) if not self._valid_video_title(video_title): values = re.findall(r'videoTitle"\s*:\s*"(.*?)"', tahoe_data.secondary) if values: @@ -816,7 +815,7 @@ class FacebookIE(InfoExtractor): def _valid_video_title(self, video_title): if video_title: video_title = video_title.lower() - return video_title and not u'log in or sign up to view' in video_title + return video_title and not u'log in or sign up to view' in video_title and not u'on facebook watch' in video_title def validate_webpage(self, webpage): m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) @@ -831,6 +830,11 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() + def _resolve_description(self, webpage, tahoe_data): + description = self._html_search_meta( + 'description', webpage, 'title', default=None) + return description + class FacebookTahoeData: def __init__(self, extractor, page, video_id): From 832b943193a44c60cbf103e18600526791d07663 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 16 Aug 2020 16:49:51 +0300 Subject: [PATCH 29/33] fix uploader handle fb --- test/ci/test_facebook.py | 1 + youtube_dl/extractor/facebook.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index b53969c96..1cc564518 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -32,6 +32,7 @@ class facebookMetaData(unittest.TestCase): ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) self.assertGreater(info.get('comment_count'), 0) + self.assertTrue(info.get('uploader_handle'), 0) def test_metadata_fetch_with_log_in(self): url = "https://www.facebook.com/oristandup/videos/675360549895283" diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4088f6f84..1abcf8d62 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -561,7 +561,7 @@ class FacebookIE(InfoExtractor): uploader_handle = self._search_regex(r'"video_path":"\\\/([^\/]+)\\\/', tahoe_data.primary, 'uploader_handle', fatal=False) if uploader_handle == uploader_id: - uploader_handle = self._search_regex(r'href=\\"https:\\\/\\\/www.facebook.com\\\/(.+?)\\\/\\', tahoe_data.secondary, + uploader_handle = self._search_regex(r'href=\\"https:\\\/\\\/www.facebook.com\\\/(.+?)\\\/', tahoe_data.secondary, 'uploader_handle', fatal=False) From f90e0c8de1a3515e09d901ba8c44c528c2448c5e Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Wed, 19 Aug 2020 14:49:56 +0300 Subject: [PATCH 30/33] fix unicode characters (#354) * fix unicode characters * fix unicode characters Co-authored-by: bhodaya --- test/ci/test_facebook.py | 4 +++- youtube_dl/extractor/facebook.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 1cc564518..a6750ce43 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -28,11 +28,13 @@ class facebookMetaData(unittest.TestCase): def test_meta_data(self): params = {} - url = "https://www.facebook.com/parapsychological.centr/videos/177407933624543/" + url = "https://www.facebook.com/watch?v=925616657920281" ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) self.assertGreater(info.get('comment_count'), 0) self.assertTrue(info.get('uploader_handle'), 0) + self.assertGreater(len(info.get('title')), 0) + def test_metadata_fetch_with_log_in(self): url = "https://www.facebook.com/oristandup/videos/675360549895283" diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 1abcf8d62..141a4f3e4 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -693,6 +693,9 @@ class FacebookIE(InfoExtractor): video_title = self._og_search_title(webpage, default=None) if not self._valid_video_title(video_title): video_title = self._resolve_description(webpage, tahoe_data) + if not self._valid_video_title(video_title): + video_title = self._html_search_regex(r'"videoTitle":"(.+?")', tahoe_data.secondary, 'title from secondary', default=None) + video_title = video_title.decode('unicode_escape') if not self._valid_video_title(video_title): values = re.findall(r'videoTitle"\s*:\s*"(.*?)"', tahoe_data.secondary) if values: From d710012d4ce2168f10a2086079e0dc437eb96528 Mon Sep 17 00:00:00 2001 From: Avichai Date: Wed, 9 Sep 2020 19:32:28 +0300 Subject: [PATCH 31/33] adding "," as an invalid title --- youtube_dl/extractor/facebook.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 141a4f3e4..07cc349a2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -816,9 +816,19 @@ class FacebookIE(InfoExtractor): return thumbnail def _valid_video_title(self, video_title): - if video_title: - video_title = video_title.lower() - return video_title and not u'log in or sign up to view' in video_title and not u'on facebook watch' in video_title + if not video_title: + return False + + video_title = video_title.lower() + invalid_terms = [ + u'log in or sign up to view', + u'on facebook watch', + u'","' + ] + for term in invalid_terms: + if term in video_title: + return False + return True def validate_webpage(self, webpage): m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) From 2229c2b3e73b4bfa0b7bd669aefec7ce4e70881c Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Thu, 10 Sep 2020 13:05:10 +0300 Subject: [PATCH 32/33] fix unicode characters (#357) Co-authored-by: bhodaya --- test/ci/test_facebook.py | 7 +++++++ youtube_dl/extractor/facebook.py | 1 + 2 files changed, 8 insertions(+) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index a6750ce43..f2454fc79 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -35,6 +35,13 @@ class facebookMetaData(unittest.TestCase): self.assertTrue(info.get('uploader_handle'), 0) self.assertGreater(len(info.get('title')), 0) + def test_new_ui(self): + params = {} + url = "https://www.facebook.com/115232383642471/videos/444314619837387" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertGreater(len(info.get('title')), 0) + def test_metadata_fetch_with_log_in(self): url = "https://www.facebook.com/oristandup/videos/675360549895283" diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 07cc349a2..ae9bdc28b 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -781,6 +781,7 @@ class FacebookIE(InfoExtractor): def _resolve_new_ui_title(self, webpage, tahoe_data, video_id): video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title', fatal=False) + video_title = video_title.decode('unicode_escape') if not video_title: video_title = self._search_regex(r'"pageTitle">(.+?)<', webpage, 'title', fatal=False) if not video_title: From 9cf8cac10d8287dd76799bf19107e6c3bfb6fe75 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 13 Sep 2020 13:05:55 +0300 Subject: [PATCH 33/33] fix new ui formats and views --- youtube_dl/extractor/facebook.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ae9bdc28b..411e014ac 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -419,6 +419,8 @@ class FacebookIE(InfoExtractor): subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) + if not formats: + formats = self.resolve_new_ui_format(webpage) if not formats: raise ExtractorError('Cannot find video formats') @@ -505,7 +507,9 @@ class FacebookIE(InfoExtractor): uploader_id = self._resolve_uploader_id(webpage, tahoe_data) - post_view_counts = parse_count(self._search_regex(r'"postViewCount":(.+?),', tahoe_data.secondary, 'views')) + post_view_counts = self._extract_views(webpage, tahoe_data) + if not post_view_counts: + post_view_counts = parse_count(self._search_regex(r'"postViewCount":(.+?),', tahoe_data.secondary, 'views')) other_post_view_counts = parse_count(self._search_regex(r'"otherPostsViewCount":(.+?),', tahoe_data.secondary, 'other_views')) share_counts = parse_count(self._search_regex(r'"sharecount":(.+?),', tahoe_data.secondary, 'other_views'))