[slutload] Fix extraction for desktop and mobile site

Added a separate extractor for the mobile site because its page structure differs from that of the desktop site.
2020-11-18 19:53:54 -08:00 · 2018-07-18 12:33:06 +03:00 · 2018-07-18 12:33:06 +03:00 · dbd71aa288
commit dbd71aa288
parent 79367a9820
2 changed files with 54 additions and 18 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -988,7 +988,10 @@ from .skynewsarabia import (
 from .skysports import SkySportsIE
 from .slideshare import SlideshareIE
 from .slideslive import SlidesLiveIE
-from .slutload import SlutloadIE
+from .slutload import (
    SlutloadIE,
    SlutloadMobileIE
 )
 from .smotri import (
    SmotriIE,
    SmotriCommunityIE,
--- a/youtube_dl/extractor/slutload.py
+++ b/youtube_dl/extractor/slutload.py
@ -1,13 +1,11 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 class SlutloadIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
+    _VALID_URL = r'^https?://(?:www\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
-    _TESTS = [{
+    _TEST = {
        'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
        'md5': '868309628ba00fd488cf516a113fd717',
        'info_dict': {
@ -17,27 +15,62 @@ class SlutloadIE(InfoExtractor):
            'age_limit': 18,
            'thumbnail': r're:https?://.*?\.jpg'
        }
-    }, {
+    }
-        # mobile site
-        'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
-        'only_matching': True,
-    }]
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url)
+        webpage = self._download_webpage(url, video_id)
-        webpage = self._download_webpage(desktop_url, video_id)
-        video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
-                                              webpage, 'title').strip()
+        video_title = self._html_search_regex(
+            r'<h1><strong>([^<]+)</strong>',
+            webpage, 'title').strip()
        video_url = self._html_search_regex(
-            r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
+            r'(?s)<video id=["\']desktop-player["\'].+?<source src=["\']([^"\']+)["\']',
            webpage, 'video URL')
+        # fatal=False must stay a keyword argument (not part of the field
+        # name string) so that a missing poster does not abort extraction.
        thumbnail = self._html_search_regex(
-            r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"',
+            r'(?s)<video id=["\']desktop-player["\'].+?poster=["\']([^"\']+)["\']',
            webpage, 'thumbnail', fatal=False)
        return {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'age_limit': 18
        }
 class SlutloadMobileIE(InfoExtractor):
    _VALID_URL = r'^https?://mobile\.slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
    _TEST = {
        'url': 'http://mobile.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
        'md5': '868309628ba00fd488cf516a113fd717',
        'info_dict': {
            'id': 'TD73btpBqSxc',
            'ext': 'mp4',
            'age_limit': 18,
            'title': 'virginie baisee en cam',
            'thumbnail': r're:https?://.*?\.jpg'
        }
    }
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class=["\']videoHd[^"\']+["\']>([^<]+)</div>',
            webpage, 'title').strip()
        video_url = self._html_search_regex(
            r'(?s)<video id=["\']html5video["\'].+?src=["\']([^"\']+)["\']',
            webpage, 'video URL')
        thumbnail = self._html_search_regex(
            r'(?s)<video id=["\']html5video["\'].+?poster=["\']([^"\']+)["\']',
            webpage, 'thumbnail, fatal=False'
        )
        return {
            'id': video_id,