From 7f69c7d44b7c4ec43ecef9bc7492fa9466b9f29c Mon Sep 17 00:00:00 2001 From: Kerruba Date: Mon, 30 Apr 2018 22:05:00 +0100 Subject: [PATCH] Cleans code, add captions download --- youtube_dl/extractor/frontendmaster.py | 49 +++++++++++++++++++------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 479852f69..a3a342f9b 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -8,18 +8,19 @@ import re from .common import InfoExtractor from ..compat import ( compat_urlparse, + compat_basestring, compat_str) from ..utils import ( ExtractorError, urlencode_postdata, - qualities -) + qualities, unescapeHTML) class FrontEndMasterBaseIE(InfoExtractor): _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' _VIDEO_BASE = 'http://www.frontendmasters.com/courses' + _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts' _COOKIES_BASE = 'https://api.frontendmasters.com' _LOGIN_URL = 'https://frontendmasters.com/login/' @@ -59,17 +60,15 @@ class FrontEndMasterBaseIE(InfoExtractor): response, 'error message', default=None) if error: - raise ExtractorError('Unable to login: check username and password', + raise ExtractorError('Unable to login: %s' % unescapeHTML(error), expected=True) def _match_course_id(self, url): - if '_VALID_URL_RE' not in self.__dict__: - self._VALID_URL_RE = re.compile(self._VALID_URL) - m = self._VALID_URL_RE.match(url) + m = re.match(self._VALID_URL, url) assert m return compat_str(m.group('courseid')) - def _download_course(self, course_id, url, display_id): + def _download_course(self, course_id, url): response = self._download_json( '%s/%s' % (self._API_BASE, course_id), course_id, 'Downloading course JSON', @@ -79,7 +78,8 @@ class FrontEndMasterBaseIE(InfoExtractor): }) return response - def _pair_section_with_video_elemen_index(self, lesson_elements): + @staticmethod + def 
_pair_section_with_video_elemen_index(lesson_elements): sections = {} current_section = None current_section_number = 0 @@ -100,7 +100,7 @@ class FrontEndMasterBaseIE(InfoExtractor): class FrontEndMasterIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/(?P<id>[a-z\-]+)/?' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/(?P<id>[a-z\-]+)' _NETRC_MACHINE = 'frontend-masters' @@ -117,6 +117,29 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'skip': 'Requires FrontendMasters account credentials', } + @staticmethod + def _convert_subtitles(captions): + if captions and isinstance(captions, compat_basestring): + if captions.startswith('WEBVTT'): + # Assumes captions are in WEBVTT format + captions = captions.replace('WEBVTT', '') + captions = captions.replace('.', ',') + return captions + + def _get_subtitles(self, video_hash, video_id): + captions = self._download_webpage( + '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, + fatal=False) + srt_captions = FrontEndMasterIE._convert_subtitles(captions) + + if srt_captions: + return { + 'en': [{ + 'ext': 'srt', + 'data': srt_captions + }] + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -151,7 +174,6 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_section = None lesson_section_number = None - QUALITIES_PREFERENCE = ('low', 'medium', 'high') quality_key = qualities(QUALITIES_PREFERENCE) QUALITIES = { @@ -220,6 +242,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): self._sort_formats(formats) + subtitles = self.extract_subtitles(lesson_hash, video_id) + return { 'id': video_id, 'display_id': lesson_slug, @@ -228,7 +252,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'chapter': lesson_section, 'chapter_number': lesson_section_number, 'thumbnail': lesson_thumbnail_url, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles } @@ -274,4 
+299,4 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): 'ie_key': FrontEndMasterIE.ie_key() }) - return self.playlist_result(entries, course_id, title, description) \ No newline at end of file + return self.playlist_result(entries, course_id, title, description)