From 8e8652bbf6a86e61ff9e198a8fefb911c87d163f Mon Sep 17 00:00:00 2001
From: ripe4spite
Date: Sat, 8 Dec 2018 14:54:07 +0000
Subject: [PATCH 1/2] [xhamster] Add XHamsterUserIE

---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/xhamster.py   | 58 ++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e5f18a75d..737edda36 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1406,6 +1406,7 @@ from .xfileshare import XFileShareIE
 from .xhamster import (
     XHamsterIE,
     XHamsterEmbedIE,
+    XHamsterUserIE,
 )
 from .xiami import (
     XiamiSongIE,
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 68a48034e..2c3fa93b9 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 
+import itertools
 import re
 
 from .common import InfoExtractor
@@ -319,3 +320,60 @@ class XHamsterEmbedIE(InfoExtractor):
         video_url = dict_get(vars, ('downloadLink', 'homepageLink',
                                     'commentsLink', 'shareUrl'))
         return self.url_result(video_url, 'XHamster')
+
+
+class XHamsterUserIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/users/(?P<id>[^/]+)/videos/?'
+    _TESTS = [
+        {
+            'note': 'Paginated user profile',
+            'url': 'https://xhamster.com/users/netvideogirls/videos',
+            'playlist_mincount': 267,
+            'info_dict': {
+                'id': 'netvideogirls',
+                'title': 'netvideogirls'
+            }
+        },
+        {
+            'note': 'Non-paginated user profile',
+            'url': 'https://xhamster.com/users/firatkaan/videos',
+            'playlist_mincount': 1,
+            'info_dict': {
+                'id': 'firatkaan',
+                'title': 'firatkaan'
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        videos = []
+
+        for page in itertools.count(1):
+            address = 'https://xhamster.com/users/%s/videos/%d' % (user_id, page)
+            webpage = self._download_webpage(address, user_id, note="Downloading page %d" % page)
+            video_url_matches = re.finditer(r'<a class="thumb-image-container" href="(https://xhamster.com/videos/[^"/]+)"', webpage)
+            for video_url_match in video_url_matches:
+                video_url = video_url_match.group(1)
+                videos += [self.url_result(video_url, 'XHamster', video_url.split('-')[-1])]
+            if re.search(r'<div class="pager-container"[^>]*>\s*<ul class="pager">\s*</ul>\s*</div>', webpage):
+                # The pager is empty; there is only a single page of results.
+                break
+            next_page_matcher = re.search(r'<a data-page="next" href="([^"]+)">', webpage)
+            if next_page_matcher:
+                # There is a next page.
+                address = next_page_matcher.group(1)
+                continue
+            # Check we can find the previous page button as a sanity check.
+            prev_page_matcher = re.search(r'<a data-page="prev" href="([^"]+)">', webpage)
+            if prev_page_matcher:
+                # No more pages.
+                break
+            raise ExtractorError("Could not correctly parse pagination buttons.")
+
+        return {
+            "_type": "playlist",
+            "entries": videos,
+            "id": user_id,
+            "title": user_id
+        }
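The loop in _real_extract keeps requesting numbered /videos/ pages and decides when to stop from three pager cues: an empty pager means a single page of results, a "next" button means another page follows, and a "prev" button without a "next" means the last page was reached. Below is a minimal, self-contained sketch of that stopping logic; the stand-in markup and the collect() helper are invented for illustration and are not part of the patch:

    import itertools
    import re

    # Invented pages modelling only what the loop inspects: video links
    # plus the "next"/"prev" pager buttons.
    PAGES = {
        1: '<a class="thumb-image-container" href="https://xhamster.com/videos/first-clip-1">'
           '<a data-page="next" href="https://xhamster.com/users/demo/videos/2">',
        2: '<a class="thumb-image-container" href="https://xhamster.com/videos/second-clip-2">'
           '<a data-page="prev" href="https://xhamster.com/users/demo/videos/1">',
    }

    def collect(pages):
        videos = []
        for page in itertools.count(1):
            webpage = pages[page]
            # Gather the video links on this page.
            for match in re.finditer(r'href="(https://xhamster.com/videos/[^"/]+)"', webpage):
                videos.append(match.group(1))
            if re.search(r'<ul class="pager">\s*</ul>', webpage):
                break  # empty pager: only one page of results
            if re.search(r'data-page="next"', webpage):
                continue  # another page follows
            if re.search(r'data-page="prev"', webpage):
                break  # "prev" but no "next": last page reached
            raise ValueError('Could not parse pagination buttons.')
        return videos

    print(collect(PAGES))
    # ['https://xhamster.com/videos/first-clip-1',
    #  'https://xhamster.com/videos/second-clip-2']

Once the class is registered in extractors.py, a URL such as https://xhamster.com/users/netvideogirls/videos resolves to a playlist whose entries are handed back to the existing XHamsterIE for extraction.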
From 984a76242de2f4a6525425700f3784db19f3ecf0 Mon Sep 17 00:00:00 2001
From: ripe4spite
Date: Sat, 8 Dec 2018 19:30:06 +0000
Subject: [PATCH 2/2] [xhamster] Attempt to make regular expressions more tolerant of page changes.

---
 youtube_dl/extractor/xhamster.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 2c3fa93b9..9ce24172f 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -352,20 +352,20 @@ class XHamsterUserIE(InfoExtractor):
         for page in itertools.count(1):
             address = 'https://xhamster.com/users/%s/videos/%d' % (user_id, page)
             webpage = self._download_webpage(address, user_id, note="Downloading page %d" % page)
-            video_url_matches = re.finditer(r'<a class="thumb-image-container" href="(https://xhamster.com/videos/[^"/]+)"', webpage)
+            video_url_matches = re.finditer(r'<a[^>]+class="[^"]*thumb-image-container[^"]*"[^>]+href="(https://xhamster.com/videos/[^"/]+)"', webpage)
             for video_url_match in video_url_matches:
                 video_url = video_url_match.group(1)
                 videos += [self.url_result(video_url, 'XHamster', video_url.split('-')[-1])]
-            if re.search(r'<div class="pager-container"[^>]*>\s*<ul class="pager">\s*</ul>\s*</div>', webpage):
+            if re.search(r'<div class="pager-container"[^>]*>\s*<ul[^>]*>\s*</ul>\s*</div>', webpage):
                 # The pager is empty; there is only a single page of results.
                 break
-            next_page_matcher = re.search(r'<a data-page="next" href="([^"]+)">', webpage)
+            next_page_matcher = re.search(r'<a[^>]+data-page="next"[^>]+href="([^"]+)"[^>]*>', webpage)
             if next_page_matcher:
                 # There is a next page.
                 address = next_page_matcher.group(1)
                 continue
             # Check we can find the previous page button as a sanity check.
-            prev_page_matcher = re.search(r'<a data-page="prev" href="([^"]+)">', webpage)
+            prev_page_matcher = re.search(r'<a[^>]+data-page="prev"[^>]+href="([^"]+)"[^>]*>', webpage)
             if prev_page_matcher:
                 # No more pages.
                 break
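The reworked patterns anchor on the attributes that identify each element (the thumb class, the data-page attribute) rather than on one exact attribute layout. A quick before/after check; the markup imagines a hypothetical redesign that adds attributes and classes, and "old" is the exact-layout pattern from the first patch:

    import re

    # Hypothetical post-redesign markup: an extra attribute and class.
    snippet = ('<a rel="nofollow" class="thumb thumb-image-container" '
               'href="https://xhamster.com/videos/demo-clip-123">')

    old = r'<a class="thumb-image-container" href="(https://xhamster.com/videos/[^"/]+)"'
    new = r'<a[^>]+class="[^"]*thumb-image-container[^"]*"[^>]+href="(https://xhamster.com/videos/[^"/]+)"'

    print(re.search(old, snippet))           # None: the exact layout no longer matches
    print(re.search(new, snippet).group(1))  # https://xhamster.com/videos/demo-clip-123

The pager patterns get the same treatment: [^>]+ and [^>]* between attributes tolerate reordered or additional attributes that the exact patterns would reject.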