From c1b2396dd7d6410d3ebc812a7e23a69ecdbd48f9 Mon Sep 17 00:00:00 2001 From: yurganov Date: Tue, 10 Jul 2018 00:54:10 +0300 Subject: [PATCH 1/5] [facebook] Add count of views --- youtube_dl/extractor/facebook.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8a9ed96c2..43c73e5e8 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -307,7 +307,6 @@ class FacebookIE(InfoExtractor): req = sanitized_Request(url) req.add_header('User-Agent', self._CHROME_USER_AGENT) webpage = self._download_webpage(req, video_id) - video_data = None def extract_video_data(instances): @@ -426,6 +425,10 @@ class FacebookIE(InfoExtractor): 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) + view_count = re.sub("[^0-9]", "", self._search_regex( + r'viewCount\s*:\s*"([^"]+)"', webpage, 'view_count1', + fatal=False)) + info_dict = { 'id': video_id, 'title': video_title, @@ -433,6 +436,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, + 'view_count': int(view_count), } return webpage, info_dict From ba07e77d53834559a2f9914b5193785dc1588c70 Mon Sep 17 00:00:00 2001 From: yurganov Date: Tue, 10 Jul 2018 01:00:36 +0300 Subject: [PATCH 2/5] [facebook] Add another rule for regexp views --- youtube_dl/extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 43c73e5e8..f46f977f7 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -307,6 +307,7 @@ class FacebookIE(InfoExtractor): req = sanitized_Request(url) req.add_header('User-Agent', self._CHROME_USER_AGENT) webpage = self._download_webpage(req, video_id) + video_data = None def extract_video_data(instances): @@ -426,7 +427,7 @@ class FacebookIE(InfoExtractor): thumbnail = 
self._og_search_thumbnail(webpage) view_count = re.sub("[^0-9]", "", self._search_regex( - r'viewCount\s*:\s*"([^"]+)"', webpage, 'view_count1', + r'viewCount\s*:\s*"([^"]+)"\s*,\s*viewCountReduced', webpage, 'view_count', fatal=False)) info_dict = { From 0c3614bc7624d320de599a27f328e7b273917779 Mon Sep 17 00:00:00 2001 From: yurganov Date: Tue, 10 Jul 2018 01:02:22 +0300 Subject: [PATCH 3/5] [facebook] Remove empty line --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f46f977f7..4b6dab780 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -307,7 +307,7 @@ class FacebookIE(InfoExtractor): req = sanitized_Request(url) req.add_header('User-Agent', self._CHROME_USER_AGENT) webpage = self._download_webpage(req, video_id) - + video_data = None def extract_video_data(instances): From 715f6cff3e45402313b0d9020179c4051905e6af Mon Sep 17 00:00:00 2001 From: yurganov Date: Tue, 10 Jul 2018 02:09:08 +0300 Subject: [PATCH 4/5] Regexp has been changed to be more readable. Use internal method int_or_none for string to int conversion. Add test #11. --- youtube_dl/extractor/facebook.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4b6dab780..5a2c49088 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -220,6 +220,21 @@ class FacebookIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # view_count exists + 'url': 'https://www.facebook.com/yandex.health/videos/2259860594030760/', + 'info_dict': { + 'id': '2259860594030760', + 'ext': 'mp4', + 'timestamp': 1530025716, + 'uploader': 'Яндекс.Здоровье', + 'title': unicode, + 'upload_date': '20180626', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -426,9 +441,10 @@ 
class FacebookIE(InfoExtractor): 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) - view_count = re.sub("[^0-9]", "", self._search_regex( - r'viewCount\s*:\s*"([^"]+)"\s*,\s*viewCountReduced', webpage, 'view_count', - fatal=False)) + view_count = int_or_none(re.sub("[^0-9]", "", self._search_regex( + r',viewCount\s*:\s*"([^"]+)"', + webpage, 'view_count', + default=None))) info_dict = { 'id': video_id, @@ -437,7 +453,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, - 'view_count': int(view_count), + 'view_count': view_count, } return webpage, info_dict From ae38a83bfda794332d6651717a9d5c85de01e006 Mon Sep 17 00:00:00 2001 From: yurganov Date: Tue, 10 Jul 2018 02:22:43 +0300 Subject: [PATCH 5/5] Change title test to md5 method --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 5a2c49088..f4f2f014d 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -228,7 +228,7 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1530025716, 'uploader': 'Яндекс.Здоровье', - 'title': unicode, + 'title': 'md5:1cdf2e730316e05a89f60fc48cb4a9bf', 'upload_date': '20180626', 'view_count': int, },