From 7cdeaca34b452531d3d8c0e30246c96a4ca6e5bb Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 10:47:14 -0500 Subject: [PATCH 01/10] generic: dynamically find extractor for iframes/embeds/etc if static methods fail --- youtube_dl/extractor/generic.py | 62 +++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 392ad3648..4ec7d9c2a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -970,6 +970,57 @@ class GenericIE(InfoExtractor): 'title': title, } + def _extract_plugin_embeds(self, webpage, url): + match = re.findall( + r'<(?:[^>]+?data-video-url|meta[^>]+?content|(?:embed|iframe)[^>]+?src)\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage) + + # In addition to 'generic', ignore matches from these plugins + # ..however _extract_plugin_embeds should run last + notbefore_blacklist = { + # test 37 (Wistia) http://thoughtworks.wistia.com/medias/uxjb0lwrcz + # duplicate embed causes test failure + 'Wistia': True, + # test 46 for rtl.nl (http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen) + # has a broken youtube embed, download & test failure + 'youtube': True, + } + elist = [] + if not match: + return elist + # eliminate duplicate checks + checked = {url: True} + for m in match: + u=unescapeHTML(m[1]) + + if checked.get(u,False) == True: + continue + checked[u] = True + + for ie in self._downloader._ies: + found = False + if ie.IE_NAME == self.IE_NAME: + continue + if not ie.working(): + continue + if notbefore_blacklist.get(ie.IE_NAME,False) == True: + continue + if ie.suitable(u): + print (' EMBED ['+ie.IE_NAME+'] '+u) + found = True + elist.append({ + '_type': 'url', + 'url': u, + 'ie_key': ie.ie_key(), + }) + break + if not found: + #self._downloader.params.get('verbose', False): + print (' EMBED [?!] 
'+u) + if elist: + print(''+str(len(elist))+' embeds') + return elist + + def _real_extract(self, url): if url.startswith('//'): return { @@ -1603,6 +1654,17 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Last-ditch attempt to find matching plugin for embeds + # (this can potentially replace alot of code above) + elist = self._extract_plugin_embeds(webpage, url) + if elist: + return { + '_type': 'playlist', + 'title': video_title, + 'id': video_id, + 'entries': elist, + } + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From e88295882cddb9f14f7babab8a12088cb0981cc7 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 10:55:32 -0500 Subject: [PATCH 02/10] < AGSPhoenix> AtomicDryad, your use of 'alot' fills me with unreasonable fury and consternation. --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4ec7d9c2a..7ccedef6c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1655,7 +1655,7 @@ class GenericIE(InfoExtractor): 'AdobeTVVideo') # Last-ditch attempt to find matching plugin for embeds - # (this can potentially replace alot of code above) + # (this can potentially replace many lines of code above) elist = self._extract_plugin_embeds(webpage, url) if elist: return { From 899420bc743300be234e7c231fc2bfcf583b01d8 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 11:53:57 -0500 Subject: [PATCH 03/10] handle "//toolazy.for/protocol" urls --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7ccedef6c..ac94dcd39 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -990,7 +990,7 @@ class GenericIE(InfoExtractor): # eliminate duplicate checks checked = {url: True} for 
m in match: - u=unescapeHTML(m[1]) + u=self._proto_relative_url(unescapeHTML(m[1])) if checked.get(u,False) == True: continue From 4555a88fd71693b70241e6e43875316542484060 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 12:02:34 -0500 Subject: [PATCH 04/10] Generic: Eliminate redundant iframe/embed src regexp matching and extractor._VALID_URL duplication --- youtube_dl/extractor/generic.py | 213 -------------------------------- 1 file changed, 213 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ac94dcd39..0649ec1fe 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1217,17 +1217,6 @@ class GenericIE(InfoExtractor): 'entries': entries, } - # Look for embedded rtl.nl player - matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', - webpage) - if matches: - return _playlist_from_matches(matches, ie='RtlNl') - - vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) - if vimeo_url is not None: - return self.url_result(vimeo_url) - # Look for embedded YouTube player matches = re.findall(r'''(?x) (?: @@ -1251,13 +1240,6 @@ class GenericIE(InfoExtractor): if matches: return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) - # Look for embedded Dailymotion player - matches = re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) - if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) - # Look for embedded Dailymotion playlist player (#3822) m = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) @@ -1299,11 +1281,6 @@ class GenericIE(InfoExtractor): if bliptv_url: return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - # Look for embedded condenast player 
matches = re.findall( r']+?src=(["\'])(?P(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - # Look for embedded Viddler player mobj = re.search( r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', @@ -1340,19 +1311,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - # Look for NYTimes player - mobj = re.search( - r']+src=(["\'])(?P(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - r']+src=(["\'])(?P(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) or @@ -1369,16 +1327,6 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') - # Look for Aparat videos - mobj = re.search(r'