Split Thumbzilla out into its own extractor.

This simplifies the pornhub extractor and makes it easier to maintain in the future.
2020-11-18 19:53:54 -08:00 · 2020-03-07 15:20:41 -08:00 · 2020-03-07 15:20:41 -08:00 · 87f50e3feb
commit 87f50e3feb
parent d332ec725d
3 changed files with 63 additions and 7 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1129,6 +1129,7 @@ from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
 from .threeqsdn import ThreeQSDNIE
 from .thumbzilla import ThumbzillaIE
 from .tiktok import (
    TikTokIE,
    TikTokUserIE,
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@ -51,10 +51,7 @@ class PornHubIE(PornHubBaseIE):
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
-                        (?:
+                        (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)
                            (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
@ -140,9 +137,6 @@ class PornHubIE(PornHubBaseIE):
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
--- a/youtube_dl/extractor/thumbzilla.py
+++ b/youtube_dl/extractor/thumbzilla.py
@ -0,0 +1,61 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import compat_urllib_request
 from .openload import PhantomJSwrapper
 from .pornhub import PornHubIE
 from ..utils import ExtractorError
 class ThumbzillaIE(InfoExtractor):
    """
    ThumbzillaIE is a frontend for other 'Tube' sites (mostly PornHub).

    It does not extract any media itself: _real_extract() takes the video id
    from the Thumbzilla URL and delegates to the appropriate extractor via a
    url_result. Only ids starting with 'ph' (handed to PornHubIE) are
    supported; anything else raises ExtractorError.
    """
    IE_NAME = 'thumbzilla'
    IE_DESC = 'Thumbzilla'
    # Captures the host (with optional 'www.') and the alphanumeric video id;
    # any trailing slug after the id is ignored.
    _VALID_URL = r'https?://(?P<host>(?:www\.)?thumbzilla\.com)/video/(?P<id>[\da-z]+)'
    _TEST = {
        'url': 'https://www.thumbzilla.com/video/ph5c8e8f15b40ff/hot-skinny-girl-gives-you',
        'info_dict': {
            'id': 'ph5c8e8f15b40ff',
            'ext': 'mp4',
            'upload_date': '20190317',
            'age_limit': 18,
            'uploader': 'lizashultz',
            'title': 'Hot skinny girl gives you.',
        }
    }

    def _download_webpage_handle(self, *args, **kwargs):
        """
        Download a page through the parent implementation, retrying once via
        PhantomJS when the response matches one of the known reload/cookie
        script patterns.

        Arguments are forwarded unchanged to the parent method; the first
        positional argument must be the URL or a Request object.

        Returns whatever the parent returns: normally a (webpage, urlh)
        tuple. A falsy result is passed through untouched instead of being
        unpacked.
        """
        def dl(*args, **kwargs):
            return super(ThumbzillaIE, self)._download_webpage_handle(*args, **kwargs)

        ret = dl(*args, **kwargs)

        # NOTE(review): the parent is expected to return a falsy value rather
        # than a tuple on a non-fatal download failure (fatal=False) --
        # confirm against InfoExtractor. Unpacking it would raise TypeError.
        if not ret:
            return ret

        webpage, urlh = ret

        if any(re.search(p, webpage) for p in (
                r'<body\b[^>]+\bonload=["\']go\(\)',
                r'document\.cookie\s*=\s*["\']RNKEY=',
                r'document\.location\.reload\(true\)')):
            url_or_request = args[0]
            url = (url_or_request.get_full_url()
                   if isinstance(url_or_request, compat_urllib_request.Request)
                   else url_or_request)
            # Presumably lets PhantomJS run the page script so the cookie it
            # sets is available for the retry -- verify against PhantomJSwrapper.
            phantom = PhantomJSwrapper(self, required_version='2.0')
            phantom.get(url, html=webpage)
            # Return the retry result as-is so a falsy value is not unpacked.
            ret = dl(*args, **kwargs)

        return ret

    def _real_extract(self, url):
        """
        Delegate a Thumbzilla video URL to the PornHub extractor.

        Returns a url_result pointing at the PornHub view page for the same
        video id.

        Raises ExtractorError when the id does not start with 'ph'.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        if not video_id.startswith('ph'):
            raise ExtractorError('Unsupported video')

        return self.url_result(
            'https://pornhub.com/view_video.php?viewkey=%s' % video_id,
            video_id=video_id, ie=PornHubIE.ie_key())