From c3f75fe42319f90f9a8b60e7aa708f653ef889cd Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 14 Jan 2020 21:00:56 +0000 Subject: [PATCH] [facebook] Add user video playlist extraction --- youtube_dl/extractor/facebook.py | 103 +++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ce64e2683..6ce6bea55 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import re import socket +import json +import itertools from .common import InfoExtractor from ..compat import ( @@ -509,3 +511,104 @@ class FacebookPluginsVideoIE(InfoExtractor): return self.url_result( compat_urllib_parse_unquote(self._match_id(url)), FacebookIE.ie_key()) + +class FacebookUserIE(InfoExtractor): + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/\d)' + + _TESTS = [{ + # page + 'url': 'https://www.facebook.com/uniladmag/videos' + }, { + # page + 'url': 'https://www.facebook.com/Coca-Cola/videos/?ref=page_internal' + }, { + # profile + 'url': 'https://www.facebook.com/zuck/videos' + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('id') + + page = self._download_webpage( + url, user_id, 'Downloading user webpage') + fb_url = self._html_search_meta( + 'al:android:url', page, default=None) + fb_url_re = re.match(r'fb://(?Ppage|profile)/(?P\d+)', fb_url) + if not fb_url_re: + raise ExtractorError('Could not extract page ID', expected=False) + page_id = fb_url_re.group('id') + fb_dtsg_ag_re = re.search(r'"async_get_token":"([\w\-:]+)"', page) + pagelet_token_re = re.search(r'pagelet_token:"([\w\-]+)"', page) + collection_token_re = re.search(r'pagelet_timeline_app_collection_([\d:]+)', page) + cursor = None + entries = [] + + if fb_url_re.group('type') == 'page': + endpoint = 'PagesVideoHubVideoContainerPagelet' + a_class = '_5asm' + data = { + 'page': page_id + } + elif fb_url_re.group('type') == 'profile': + if not (fb_dtsg_ag_re and pagelet_token_re and collection_token_re): + raise ExtractorError('You must be logged in to extract profile videos', expected=True) + endpoint = 'VideosByUserAppCollectionPagelet' + a_class = '_400z' + data = { + 'collection_token': collection_token_re[1], + 'disablepager': False, + 'overview': False, + 'profile_id': page_id, + 'pagelet_token': pagelet_token_re[1], + 'order': None, + 'sk': 'videos' + } + + for page_num in itertools.count(1): + js_data_page = self._download_webpage( + 'https://www.facebook.com/ajax/pagelet/generic.php/%s' % endpoint, + user_id, 'Downloading page %d' % page_num, + query={ + 'fb_dtsg_ag': fb_dtsg_ag_re[1] if fb_dtsg_ag_re else None, + 'data': json.dumps( + {**data, 'cursor': cursor}, + separators=(',', ':')), + '__a': 1 + }) + + js_data = self._parse_json(self._search_regex( + r'({.+})', js_data_page, + 'js data', default='{}'), user_id, fatal=True) + + for video in re.findall( + r'href="(?P[^"]+)"[^>]+%s' % a_class, + js_data['payload']): + entries.append( + self.url_result('https://www.facebook.com%s' % video, FacebookIE.ie_key()) + ) + + cursor = None + if fb_url_re.group('type') == 'page': + if not 'instances' in js_data['jsmods']: + break + for parent in js_data['jsmods']['instances']: + if type(parent) is list: + for child in parent: + if type(child) is list: + for subchild in child: + if type(subchild) is dict and 'cursor' in subchild: + cursor = subchild['cursor'] + break + elif fb_url_re.group('type') == 'profile': + for parent in js_data['jsmods']['require']: + if type(parent) is list: + for child in parent: + if type(child) is list: + if len(child) == 3: + cursor = child[2] + break + if not cursor: + break + + return self.playlist_result(entries, user_id)