From 4303495ee80833ad7e3d2389a94e2db317ba44e6 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 13:29:24 +0200 Subject: [PATCH 01/14] Facebook - get timestamp from tahoe if missing. --- youtube_dl/extractor/facebook.py | 37 +++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 74954049d..05ea8a473 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -57,7 +57,8 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' - _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=%s' + _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -222,6 +223,10 @@ class FacebookIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # no timestamp + 'url': 'https://www.facebook.com/ChickenShow1996/videos/2289288568020072/', + 'only_matching': True, }] @staticmethod @@ -339,6 +344,7 @@ class FacebookIE(InfoExtractor): video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) + tahoe_secondary_data = '' if not video_data: if not fatal_if_no_video: return webpage, False @@ -352,9 +358,7 @@ class FacebookIE(InfoExtractor): # Video info not in first request, do a secondary request using # tahoe player specific URL - tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, - data=urlencode_postdata({ + tahoe_request_data = urlencode_postdata({ '__a': 1, '__pc': self._search_regex( r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, @@ -365,15 +369,29 @@ class FacebookIE(InfoExtractor): 'fb_dtsg': self._search_regex( r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', webpage, 'dtsg token', default=''), - }), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', }) + tahoe_request_headers = { + 'Content-Type': 'application/x-www-form-urlencoded', + } + + tahoe_primary_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % (video_id, 'primary'), video_id, + data=tahoe_request_data, + headers=tahoe_request_headers + ) + + tahoe_secondary_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % (video_id, 'secondary'), video_id, + data=tahoe_request_data, + headers=tahoe_request_headers + ) + tahoe_js_data = self._parse_json( self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_primary_data, 'tahoe js data', default='{}'), video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) if not video_data: @@ -427,7 +445,10 @@ class FacebookIE(InfoExtractor): fatal=False) or self._og_search_title(webpage, fatal=False) timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None) or self._search_regex( + r'data-utime=\\\"(\d+)\\\"', tahoe_secondary_data, 'timestamp', default=None)) + thumbnail = self._og_search_thumbnail(webpage) view_count = parse_count(self._search_regex( From e921ad845d29ed5a192dc6910b2457372ee27ec1 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 13:37:15 +0200 Subject: [PATCH 02/14] flake8 fixes --- youtube_dl/extractor/facebook.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 05ea8a473..b1847dd21 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -59,7 +59,6 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=%s' - _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', @@ -358,7 +357,8 @@ class FacebookIE(InfoExtractor): # Video info not in first request, do a secondary request using # tahoe player specific URL - tahoe_request_data = urlencode_postdata({ + tahoe_request_data = urlencode_postdata( + { '__a': 1, '__pc': self._search_regex( r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, @@ -371,8 +371,8 @@ class FacebookIE(InfoExtractor): webpage, 'dtsg token', default=''), }) tahoe_request_headers = { - 'Content-Type': 'application/x-www-form-urlencoded', - } + 'Content-Type': 'application/x-www-form-urlencoded', + } tahoe_primary_data = self._download_webpage( self._VIDEO_PAGE_TAHOE_TEMPLATE % (video_id, 'primary'), video_id, From 5916a2fc38a505c06c8dbb585a06b6bd5fe324df Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 16:05:59 +0200 Subject: [PATCH 03/14] read uploader id --- youtube_dl/extractor/facebook.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b1847dd21..d70837402 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -449,6 +449,12 @@ class FacebookIE(InfoExtractor): r'data-utime=\\\"(\d+)\\\"', tahoe_secondary_data, 'timestamp', default=None)) + uploader_id = self._search_regex( + r'ownerid:"([\d]+)', webpage, + 'uploader_id', default=None) or self._search_regex( + r'\"ownerid\":"(\d+)"', tahoe_secondary_data, + 'uploader_id', default=None) + thumbnail = self._og_search_thumbnail(webpage) view_count = parse_count(self._search_regex( @@ -463,6 +469,7 @@ class FacebookIE(InfoExtractor): 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, + 'uploader_id': uploader_id } return webpage, info_dict From ba3b2c535f653f42e6c4d2d86ea172673d46f105 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 16:51:44 +0200 Subject: [PATCH 04/14] read viewcount properly. --- youtube_dl/extractor/facebook.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index d70837402..f7b65a2ae 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -458,8 +458,11 @@ class FacebookIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) view_count = parse_count(self._search_regex( - r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', - default=None)) + r'\bpostViewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None) or self._search_regex( + r'\"postViewCount\"\s*:\s*(\d+)', tahoe_secondary_data, 'view count', + default=None) + ) info_dict = { 'id': video_id, From 8b50d432121ce659460715c97b8d4cc8d0b76e3f Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 16:56:04 +0200 Subject: [PATCH 05/14] read viewCount when postViewCount is not available --- youtube_dl/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f7b65a2ae..028182280 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -461,6 +461,10 @@ class FacebookIE(InfoExtractor): r'\bpostViewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None) or self._search_regex( r'\"postViewCount\"\s*:\s*(\d+)', tahoe_secondary_data, 'view count', + default=None) or self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None) or self._search_regex( + r'\"viewCount\"\s*:\s*(\d+)', tahoe_secondary_data, 'view count', default=None) ) From 8811f87a76dc9c7a397bbe5ef8480fdd55b570db Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 17:26:07 +0200 Subject: [PATCH 06/14] get uploader for tahoe data --- youtube_dl/extractor/facebook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 028182280..4fdb75080 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -442,7 +442,9 @@ class FacebookIE(InfoExtractor): uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - fatal=False) or self._og_search_title(webpage, fatal=False) + fatal=False) or self._og_search_title(webpage, fatal=False, default=None) or self._search_regex( + r'\"ownerName\":"(.*?)"', tahoe_secondary_data, + 'uploader_id') timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) or self._search_regex( From f851a409e05a8260a53c8e9a1f2f11d76df62ba3 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 14 Jan 2019 17:37:09 +0200 Subject: [PATCH 07/14] hide unable to extract uploader warning --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4fdb75080..d12948b06 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -442,7 +442,7 @@ class FacebookIE(InfoExtractor): uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - fatal=False) or self._og_search_title(webpage, fatal=False, default=None) or self._search_regex( + fatal=False, default=None) or self._og_search_title(webpage, fatal=False, default=None) or self._search_regex( r'\"ownerName\":"(.*?)"', tahoe_secondary_data, 'uploader_id') timestamp = int_or_none(self._search_regex( From f8ae9b34da3e71f2b15f3b1e3b86616789cc5b59 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Wed, 16 Jan 2019 10:41:57 +0200 Subject: [PATCH 08/14] apply comments --- youtube_dl/extractor/facebook.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index d12948b06..9e306d0d8 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -224,8 +224,19 @@ class FacebookIE(InfoExtractor): }, }, { # no timestamp - 'url': 'https://www.facebook.com/ChickenShow1996/videos/2289288568020072/', - 'only_matching': True, + 'url': 'https://www.facebook.com/SuperNewsGames/videos/642255722780473/', + 'info_dict': { + 'timestamp': 1521221400, + 'uploader': 'Super News Games', + 'uploader_id': '229550157384367', + 'id': '642255722780473', + 'ext': 'mp4', + 'upload_date': '20180316', + 'title': 'The Voice of Nick is trying Fortnite after 100 hours of PLAYERUNKNOWN\'S BATTL...', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -383,7 +394,7 @@ class FacebookIE(InfoExtractor): tahoe_secondary_data = self._download_webpage( self._VIDEO_PAGE_TAHOE_TEMPLATE % (video_id, 'secondary'), video_id, data=tahoe_request_data, - headers=tahoe_request_headers + headers=tahoe_request_headers, fatal=False ) tahoe_js_data = self._parse_json( @@ -441,10 +452,11 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - fatal=False, default=None) or self._og_search_title(webpage, fatal=False, default=None) or self._search_regex( - r'\"ownerName\":"(.*?)"', tahoe_secondary_data, - 'uploader_id') + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ + self._og_search_title(webpage, default=None) or self._search_regex( + r'\"ownerName\":"(.+?)"', tahoe_secondary_data, + 'uploader_id', fatal=False) + timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) or self._search_regex( @@ -455,7 +467,7 @@ class FacebookIE(InfoExtractor): r'ownerid:"([\d]+)', webpage, 'uploader_id', default=None) or self._search_regex( r'\"ownerid\":"(\d+)"', tahoe_secondary_data, - 'uploader_id', default=None) + 'uploader_id', fatal=False) thumbnail = self._og_search_thumbnail(webpage) From 906224637111a00e2b8012b79a8bdb965b285130 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Sun, 20 Jan 2019 10:27:20 +0200 Subject: [PATCH 09/14] Relax regex. --- youtube_dl/extractor/facebook.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 9e306d0d8..ef195a891 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -466,7 +466,7 @@ class FacebookIE(InfoExtractor): uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, 'uploader_id', default=None) or self._search_regex( - r'\"ownerid\":"(\d+)"', tahoe_secondary_data, + r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_secondary_data, 'uploader_id', fatal=False) thumbnail = self._og_search_thumbnail(webpage) @@ -474,11 +474,11 @@ class FacebookIE(InfoExtractor): view_count = parse_count(self._search_regex( r'\bpostViewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None) or self._search_regex( - r'\"postViewCount\"\s*:\s*(\d+)', tahoe_secondary_data, 'view count', + r'[\'\"]postViewCount[\'\"]\s*:\s*(\d+)', tahoe_secondary_data, 'view count', default=None) or self._search_regex( r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None) or self._search_regex( - r'\"viewCount\"\s*:\s*(\d+)', tahoe_secondary_data, 'view count', + r'[\'\"]viewCount[\'\"]\s*:\s*(\d+)', tahoe_secondary_data, 'view count', default=None) ) From 32fe6908c973a7977c208919385b3588b41935db Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Sun, 20 Jan 2019 10:33:16 +0200 Subject: [PATCH 10/14] Do not fail in case secondary data not exists. --- youtube_dl/extractor/facebook.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ef195a891..2318cf4c7 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -354,7 +354,7 @@ class FacebookIE(InfoExtractor): video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) - tahoe_secondary_data = '' + if not video_data: if not fatal_if_no_video: return webpage, False @@ -396,7 +396,9 @@ class FacebookIE(InfoExtractor): data=tahoe_request_data, headers=tahoe_request_headers, fatal=False ) - + if not tahoe_secondary_data: + tahoe_secondary_data = '' + tahoe_js_data = self._parse_json( self._search_regex( r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_primary_data, From cd805c064c4251856adc4e1c47379dcfee74dc88 Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Sun, 20 Jan 2019 10:33:40 +0200 Subject: [PATCH 11/14] remove whitespace. --- youtube_dl/extractor/facebook.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 2318cf4c7..ec2a6a1be 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -354,7 +354,6 @@ class FacebookIE(InfoExtractor): video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) - if not video_data: if not fatal_if_no_video: return webpage, False @@ -398,7 +397,7 @@ class FacebookIE(InfoExtractor): ) if not tahoe_secondary_data: tahoe_secondary_data = '' - + tahoe_js_data = self._parse_json( self._search_regex( r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_primary_data, From 65f64270cdcb14c1699c4091a33ce1270579bf5c Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Mon, 28 Jan 2019 12:28:22 +0200 Subject: [PATCH 12/14] set default value for tahoe secondary data. --- youtube_dl/extractor/facebook.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ec2a6a1be..e4d7ec235 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -354,6 +354,7 @@ class FacebookIE(InfoExtractor): video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) + tahoe_secondary_data = '' if not video_data: if not fatal_if_no_video: return webpage, False From 2809c0e622a0dc2ae7cac1969b016ad7ae9dc00a Mon Sep 17 00:00:00 2001 From: Avi Peretz Date: Sun, 3 Feb 2019 12:00:09 +0200 Subject: [PATCH 13/14] refactor tahoe data --- youtube_dl/extractor/facebook.py | 94 +++++++++++++++++++------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e4d7ec235..66e99fdf1 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -354,7 +354,7 @@ class FacebookIE(InfoExtractor): video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) - tahoe_secondary_data = '' + tahoe_data = FacebookTahoeData(self, webpage, video_id) if not video_data: if not fatal_if_no_video: return webpage, False @@ -365,43 +365,11 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - # Video info not in first request, do a secondary request using # tahoe player specific URL - tahoe_request_data = urlencode_postdata( - { - '__a': 1, - '__pc': self._search_regex( - r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, - 'pkg cohort', default='PHASED:DEFAULT'), - '__rev': self._search_regex( - r'client_revision["\']\s*:\s*(\d+),', webpage, - 'client revision', default='3944515'), - 'fb_dtsg': self._search_regex( - r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', - webpage, 'dtsg token', default=''), - }) - tahoe_request_headers = { - 'Content-Type': 'application/x-www-form-urlencoded', - } - - tahoe_primary_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % (video_id, 'primary'), video_id, - data=tahoe_request_data, - headers=tahoe_request_headers - ) - - tahoe_secondary_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % (video_id, 'secondary'), video_id, - data=tahoe_request_data, - headers=tahoe_request_headers, fatal=False - ) - if not tahoe_secondary_data: - tahoe_secondary_data = '' - tahoe_js_data = self._parse_json( self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_primary_data, + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data.primary, 'tahoe js data', default='{}'), video_id, fatal=False) @@ -456,19 +424,19 @@ class FacebookIE(InfoExtractor): 'fbPhotoPageAuthorName', webpage)) or self._search_regex( r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ self._og_search_title(webpage, default=None) or self._search_regex( - r'\"ownerName\":"(.+?)"', tahoe_secondary_data, + r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False) timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None) or self._search_regex( - r'data-utime=\\\"(\d+)\\\"', tahoe_secondary_data, + r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None)) uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, 'uploader_id', default=None) or self._search_regex( - r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_secondary_data, + r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary, 'uploader_id', fatal=False) thumbnail = self._og_search_thumbnail(webpage) @@ -476,11 +444,11 @@ class FacebookIE(InfoExtractor): view_count = parse_count(self._search_regex( r'\bpostViewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None) or self._search_regex( - r'[\'\"]postViewCount[\'\"]\s*:\s*(\d+)', tahoe_secondary_data, 'view count', + r'[\'\"]postViewCount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary, 'view count', default=None) or self._search_regex( r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None) or self._search_regex( - r'[\'\"]viewCount[\'\"]\s*:\s*(\d+)', tahoe_secondary_data, 'view count', + r'[\'\"]viewCount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary, 'view count', default=None) ) @@ -523,6 +491,54 @@ class FacebookIE(InfoExtractor): return info_dict +class FacebookTahoeData: + def __init__(self, extractor, page, video_id): + self._page = page + self._video_id = video_id + self._extractor = extractor + self._data = {} + + def _get_data(self, data_type): + if data_type in self._data: + data = self._data[data_type] + else: + req_data, headers = self._get_request_data_and_headers() + data = self._extractor._download_webpage( + self._extractor._VIDEO_PAGE_TAHOE_TEMPLATE % (self._video_id, data_type), self._video_id, + data=req_data, + headers=headers + ) + return '' if not data else data + + @property + def primary(self): + return self._get_data('primary') + + @property + def secondary(self): + return self._get_data('secondary') + + def _get_request_data_and_headers(self): + tahoe_request_data = urlencode_postdata( + { + '__a': 1, + '__pc': self._extractor._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', self._page, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._extractor._search_regex( + r'client_revision["\']\s*:\s*(\d+),', self._page, + 'client revision', default='3944515'), + 'fb_dtsg': self._extractor._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + self._page, 'dtsg token', default=''), + }) + tahoe_request_headers = { + 'Content-Type': 'application/x-www-form-urlencoded', + } + + return tahoe_request_data, tahoe_request_headers + + class FacebookPluginsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)' From 343c86fa0bff0a4ea22039cc703a9a1a222470c8 Mon Sep 17 00:00:00 2001 From: Avichai Cohen Date: Tue, 2 Apr 2019 16:22:54 +0300 Subject: [PATCH 14/14] Adding is_live to info dictionary of facebook videos --- youtube_dl/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 789dd79d5..464ffd47f 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -218,6 +218,7 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', 'uploader': 'ESL One Dota 2', + 'is_live': False }, 'params': { 'skip_download': True, @@ -379,6 +380,8 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') + is_live = video_data[0].get('is_broadcast', False) and video_data[0].get('is_live_stream', False) + formats = [] for f in video_data: format_id = f['stream_type'] @@ -442,6 +445,7 @@ class FacebookIE(InfoExtractor): 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, + 'is_live': is_live } return webpage, info_dict