From 7d55014e2f03016b478fbd732febb97f5890974c Mon Sep 17 00:00:00 2001 From: chazardsquair <48913100+chazardsquair@users.noreply.github.com> Date: Sun, 28 Apr 2019 06:07:33 -0500 Subject: [PATCH 1/5] [XVideos] Add support for untracked properties performers, performer_channels, uploader, uploader_channel, related_categories, tags, and sponsor properties are now exposed --- youtube_dl/extractor/xvideos.py | 132 +++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 166bcf443..549bdd8ca 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -3,13 +3,20 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_str +) from ..utils import ( clean_html, determine_ext, ExtractorError, int_or_none, parse_duration, + get_element_by_class, + get_elements_by_class, + js_to_json, + try_get ) @@ -32,7 +39,80 @@ class XVideosIE(InfoExtractor): 'title': 'Biker Takes his Girl', 'duration': 108, 'age_limit': 18, - } + 'uploader': "Kandys Kisses" + }, + }, { + 'url': 'https://www.xvideos.com/video43548989/petite_brooke_haze_is_so_cute', + 'md5': 'b629ee68705da901dbd60c3b3a7c16bc', + 'info_dict': { + 'id': '43548989', + 'ext': 'mp4', + 'title': 'Petite Brooke Haze is so cute', + 'duration': 521, + 'uploader': 'Amkempire', + 'uploader_channel': 'amkempire', + 'performers': [ + 'Brooke Haze' + ], + 'performer_channels': [ + 'brooke-haze' + ], + 'related_categories': list, + 'tags': list, + 'sponsor': [{ + 'name': 'AMKEmpire', + 'desc': ('AMKingdom caters to a plethora of niches, while offering high-end, quality content. ' + 'ATK/AMK features Hairy, Natural, Mature, and other Fetish categories, since 1996!') + } + ], + 'age_limit': 18, + }, + }, { + # multiple performers + 'url': 'https://www.xvideos.com/video721515/simon_says_fuck_4', + 'md5': '68f64996a6a53ab834e14a36f2357038', + 'info_dict': { + 'id': '721515', + 'ext': 'mp4', + 'title': 'Simon says fuck! 4', + 'duration': 310, + 'uploader': 'Latgpxxx', + 'uploader_channel': 'latgpxxx', + 'performers': [ + 'Jenna Haze', + 'Billy Glide' + ], + 'performer_channels': [ + 'jenna-haze', + 'billy-glide-1' + ], + 'related_categories': list, + 'tags': list, + 'sponsor': list, + 'age_limit': 18, + }, + }, { + # anonymous / unavailable uploader + 'url': 'https://www.xvideos.com/video91/3_young_school_girls', + 'md5': '86f6a54c5f3ad45a01c5daac512c59a6', + 'info_dict': { + 'id': '91', + 'ext': 'mp4', + 'title': '3 young school girls', + 'duration': 35, + 'uploader': None, + 'uploader_channel': None, + 'performers': [ + 'Alisha Klass' + ], + 'performer_channels': [ + 'alisha-klass' + ], + 'related_categories': list, + 'tags': list, + 'sponsor': list, + 'age_limit': 18, + }, }, { 'url': 'https://flashservice.xvideos.com/embedframe/4588838', 'only_matching': True, @@ -100,11 +180,59 @@ class XVideosIE(InfoExtractor): self._sort_formats(formats) + metadata_node = get_element_by_class("video-metadata", webpage) + uploader_node = get_element_by_class("uploader-tag", metadata_node) + performer_nodes = get_elements_by_class("profile", metadata_node) + base_pattern = r']+class=["\']name["\'][^>]*>.*?(?P[^<]+)' + + uploader = None + if uploader_node is not None: + uploader = self._search_regex( + base_pattern, uploader_node, 'name', default=None, group='name', fatal=False) + + performers = [] + if performer_nodes is not None: + for node in performer_nodes: + performer = self._search_regex(base_pattern, node, 'name', default=None, group='name', fatal=False) + if performer is not None: + performers.append(performer) + + uploader_channel = self._search_regex( + r']+href=["\']/?(?:profiles|channels)/(?P[^"]+)', metadata_node, + 'channel', default=None, group='channel', fatal=False) + + performer_channels = re.findall( + r']+href=["\']/?(?:pornstar-channels|models|pornstars)/(?P[^"]+)', metadata_node) + + tags = [item.replace("-", " ") for item in re.findall(r']+href=["\']/?tags/(?P[^"]+)', + metadata_node)] + + raw_conf = self._search_regex( + r'conf\s?=\s?(?P.+);', webpage, 'json', default=None, group='json', fatal=False) + + parsed_conf = self._parse_json( + raw_conf, video_id, transform_source=js_to_json, fatal=False) + + rc_list = try_get(parsed_conf, lambda x: x['data']['related_keywords']) or [] + rc_list_alt = try_get(parsed_conf, lambda x: x['dyn']['ads']['categories'], compat_str) or None + related_categories = rc_list or [item.replace('_', ' ') for item in rc_list_alt.split(',')] + + sponsor_dict = try_get(parsed_conf, lambda x: x['data']['sponsors']) or {} + sponsor = [{'name': sponsor.get('n'), 'desc': sponsor.get('d')} + for sponsor in sponsor_dict] + return { 'id': video_id, 'formats': formats, 'title': title, 'duration': duration, 'thumbnails': thumbnails, + 'uploader': uploader, + 'performers': performers, + 'uploader_channel': uploader_channel, + 'performer_channels': performer_channels, + 'related_categories': related_categories, + 'tags': tags, + 'sponsor': sponsor, 'age_limit': 18, } From f53dfc6b1446fc4537dc4c9d8efe7db98a6d9ea6 Mon Sep 17 00:00:00 2001 From: chazardsquair <48913100+chazardsquair@users.noreply.github.com> Date: Sun, 28 Apr 2019 14:09:57 -0500 Subject: [PATCH 2/5] [XVideos] Remove or rename meta fields Fields validated with common.py uploader_channel => renamed to uploader_id related_categories => renamed to categories sponsor => renamed to creator performers => removed performer_channels => removed --- youtube_dl/extractor/xvideos.py | 90 +++++++-------------------------- 1 file changed, 19 insertions(+), 71 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 549bdd8ca..b680042d9 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -39,7 +39,10 @@ class XVideosIE(InfoExtractor): 'title': 'Biker Takes his Girl', 'duration': 108, 'age_limit': 18, - 'uploader': "Kandys Kisses" + 'uploader': 'Kandys Kisses', + 'uploader_id': 'kandyskisses', + 'categories': list, + 'tags': list }, }, { 'url': 'https://www.xvideos.com/video43548989/petite_brooke_haze_is_so_cute', @@ -50,45 +53,10 @@ class XVideosIE(InfoExtractor): 'title': 'Petite Brooke Haze is so cute', 'duration': 521, 'uploader': 'Amkempire', - 'uploader_channel': 'amkempire', - 'performers': [ - 'Brooke Haze' - ], - 'performer_channels': [ - 'brooke-haze' - ], - 'related_categories': list, + 'uploader_id': 'amkempire', + 'categories': list, 'tags': list, - 'sponsor': [{ - 'name': 'AMKEmpire', - 'desc': ('AMKingdom caters to a plethora of niches, while offering high-end, quality content. ' - 'ATK/AMK features Hairy, Natural, Mature, and other Fetish categories, since 1996!') - } - ], - 'age_limit': 18, - }, - }, { - # multiple performers - 'url': 'https://www.xvideos.com/video721515/simon_says_fuck_4', - 'md5': '68f64996a6a53ab834e14a36f2357038', - 'info_dict': { - 'id': '721515', - 'ext': 'mp4', - 'title': 'Simon says fuck! 4', - 'duration': 310, - 'uploader': 'Latgpxxx', - 'uploader_channel': 'latgpxxx', - 'performers': [ - 'Jenna Haze', - 'Billy Glide' - ], - 'performer_channels': [ - 'jenna-haze', - 'billy-glide-1' - ], - 'related_categories': list, - 'tags': list, - 'sponsor': list, + 'creator': 'AMKEmpire', 'age_limit': 18, }, }, { @@ -101,16 +69,9 @@ class XVideosIE(InfoExtractor): 'title': '3 young school girls', 'duration': 35, 'uploader': None, - 'uploader_channel': None, - 'performers': [ - 'Alisha Klass' - ], - 'performer_channels': [ - 'alisha-klass' - ], - 'related_categories': list, + 'uploader_id': None, + 'categories': list, 'tags': list, - 'sponsor': list, 'age_limit': 18, }, }, { @@ -182,28 +143,17 @@ class XVideosIE(InfoExtractor): metadata_node = get_element_by_class("video-metadata", webpage) uploader_node = get_element_by_class("uploader-tag", metadata_node) - performer_nodes = get_elements_by_class("profile", metadata_node) - base_pattern = r']+class=["\']name["\'][^>]*>.*?(?P[^<]+)' uploader = None if uploader_node is not None: uploader = self._search_regex( - base_pattern, uploader_node, 'name', default=None, group='name', fatal=False) + r']+class=["\']name["\'][^>]*>.*?(?P[^<]+)', uploader_node, + 'name', default=None, group='name', fatal=False) - performers = [] - if performer_nodes is not None: - for node in performer_nodes: - performer = self._search_regex(base_pattern, node, 'name', default=None, group='name', fatal=False) - if performer is not None: - performers.append(performer) - - uploader_channel = self._search_regex( + uploader_id = self._search_regex( r']+href=["\']/?(?:profiles|channels)/(?P[^"]+)', metadata_node, 'channel', default=None, group='channel', fatal=False) - performer_channels = re.findall( - r']+href=["\']/?(?:pornstar-channels|models|pornstars)/(?P[^"]+)', metadata_node) - tags = [item.replace("-", " ") for item in re.findall(r']+href=["\']/?tags/(?P[^"]+)', metadata_node)] @@ -215,11 +165,10 @@ class XVideosIE(InfoExtractor): rc_list = try_get(parsed_conf, lambda x: x['data']['related_keywords']) or [] rc_list_alt = try_get(parsed_conf, lambda x: x['dyn']['ads']['categories'], compat_str) or None - related_categories = rc_list or [item.replace('_', ' ') for item in rc_list_alt.split(',')] + categories = rc_list or [item.replace('_', ' ') for item in rc_list_alt.split(',')] - sponsor_dict = try_get(parsed_conf, lambda x: x['data']['sponsors']) or {} - sponsor = [{'name': sponsor.get('n'), 'desc': sponsor.get('d')} - for sponsor in sponsor_dict] + sponsor_dict = try_get(parsed_conf, lambda x: x['data']['sponsors'][0]) or {} + creator = sponsor_dict.get('n') return { 'id': video_id, @@ -228,11 +177,10 @@ class XVideosIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'uploader': uploader, - 'performers': performers, - 'uploader_channel': uploader_channel, - 'performer_channels': performer_channels, - 'related_categories': related_categories, + 'uploader_id': uploader_id, + 'categories': categories, 'tags': tags, - 'sponsor': sponsor, + 'creator': creator, 'age_limit': 18, } + From e8c1120f6a7310c0103a0944b59e4d851bfd7019 Mon Sep 17 00:00:00 2001 From: chazardsquair <48913100+chazardsquair@users.noreply.github.com> Date: Sun, 28 Apr 2019 14:20:25 -0500 Subject: [PATCH 3/5] [XVideos] Remove unused import Flake8 found unused import, removed --- youtube_dl/extractor/xvideos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index b680042d9..fea1ace94 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -14,7 +14,6 @@ from ..utils import ( int_or_none, parse_duration, get_element_by_class, - get_elements_by_class, js_to_json, try_get ) From 75493fcc0e106db7e6b1e781eb431eb203e594f8 Mon Sep 17 00:00:00 2001 From: chazardsquair <48913100+chazardsquair@users.noreply.github.com> Date: Mon, 29 Apr 2019 06:37:12 -0500 Subject: [PATCH 4/5] Handle 404's Display error message when attempting to download deleted or removed video, or when the page cannot be found --- youtube_dl/extractor/xvideos.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index fea1ace94..15107e81b 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -85,8 +85,20 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/' % video_id, video_id, expected_status=404) + status_404 = get_element_by_class("status-404", webpage) or get_element_by_class("http-error-page", webpage) + if status_404: + reg_not_found = r']+id=["\']content["\']>[\r\n]*?]*>(?P[^<]*)' + deleted = get_element_by_class("text-danger", status_404) + not_found = self._search_regex(reg_not_found, status_404, 'reason', default=None, group='reason') + reason = deleted or not_found or None + if reason: + raise ExtractorError('%s said: %s' % (self.IE_NAME, reason), expected=True, video_id=video_id) + + mobj = re.search(r'

(.+?)

', webpage) + if mobj: + raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) mobj = re.search(r'

(.+?)

', webpage) if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) From 9b1df67ae274f98745516cc2651d57e22ed6e475 Mon Sep 17 00:00:00 2001 From: chazardsquair <48913100+chazardsquair@users.noreply.github.com> Date: Thu, 2 May 2019 22:14:29 -0500 Subject: [PATCH 5/5] Add support for additional meta fields Add support for view_count, like_count, dislike_count, and comment_count. Simplify and remove redundant code. --- youtube_dl/extractor/xvideos.py | 43 +++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 15107e81b..c371c6de3 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -14,8 +14,10 @@ from ..utils import ( int_or_none, parse_duration, get_element_by_class, + get_element_by_id, js_to_json, - try_get + try_get, + str_to_int ) @@ -37,11 +39,15 @@ class XVideosIE(InfoExtractor): 'ext': 'mp4', 'title': 'Biker Takes his Girl', 'duration': 108, - 'age_limit': 18, 'uploader': 'Kandys Kisses', 'uploader_id': 'kandyskisses', 'categories': list, - 'tags': list + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, }, }, { 'url': 'https://www.xvideos.com/video43548989/petite_brooke_haze_is_so_cute', @@ -56,6 +62,10 @@ class XVideosIE(InfoExtractor): 'categories': list, 'tags': list, 'creator': 'AMKEmpire', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, 'age_limit': 18, }, }, { @@ -71,6 +81,10 @@ class XVideosIE(InfoExtractor): 'uploader_id': None, 'categories': list, 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, 'age_limit': 18, }, }, { @@ -87,7 +101,7 @@ class XVideosIE(InfoExtractor): webpage = self._download_webpage( 'https://www.xvideos.com/video%s/' % video_id, video_id, expected_status=404) - status_404 = get_element_by_class("status-404", webpage) or get_element_by_class("http-error-page", webpage) + status_404 = get_element_by_class("status-404", webpage) if status_404: reg_not_found = r']+id=["\']content["\']>[\r\n]*?]*>(?P[^<]*)' deleted = get_element_by_class("text-danger", status_404) @@ -97,9 +111,6 @@ class XVideosIE(InfoExtractor): raise ExtractorError('%s said: %s' % (self.IE_NAME, reason), expected=True, video_id=video_id) mobj = re.search(r'

(.+?)

', webpage) - if mobj: - raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) - mobj = re.search(r'

(.+?)

', webpage) if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) @@ -178,8 +189,17 @@ class XVideosIE(InfoExtractor): rc_list_alt = try_get(parsed_conf, lambda x: x['dyn']['ads']['categories'], compat_str) or None categories = rc_list or [item.replace('_', ' ') for item in rc_list_alt.split(',')] - sponsor_dict = try_get(parsed_conf, lambda x: x['data']['sponsors'][0]) or {} - creator = sponsor_dict.get('n') + creator = try_get(parsed_conf, lambda x: x['data']['sponsors'][0]['n']) or {} + + like_wk = get_element_by_class('rating-inbtn', get_element_by_class('vote-action-good', webpage)) + like_count = (int(float(like_wk[:-1]) * 1000) if 'k' in like_wk else int(like_wk)) + + dislike_wk = get_element_by_class('rating-inbtn', get_element_by_class('vote-action-bad', webpage)) + dislike_count = (int(float(dislike_wk[:-1]) * 1000) if 'k' in dislike_wk else int(dislike_wk)) + + view_count = get_element_by_id('nb-views-number', webpage).replace(',', '') + + comment_count = get_element_by_class('navbadge', get_element_by_id('tabComments_btn', webpage)) return { 'id': video_id, @@ -192,6 +212,9 @@ class XVideosIE(InfoExtractor): 'categories': categories, 'tags': tags, 'creator': creator, + 'view_count': str_to_int(view_count), + 'comment_count': str_to_int(comment_count), + 'like_count': like_count, + 'dislike_count': dislike_count, 'age_limit': 18, } -