From 45cdca687c7f9ab77228a47cf5dcf34cad6ce3a3 Mon Sep 17 00:00:00 2001 From: shaileshaanand Date: Thu, 12 Mar 2020 12:17:51 +0530 Subject: [PATCH 1/5] [skillshare:class] Add support for skillshare.com classes --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/skillshare.py | 61 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/skillshare.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64d1fa251..5586c928c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -991,6 +991,7 @@ from .shared import ( from .showroomlive import ShowRoomLiveIE from .sina import SinaIE from .sixplay import SixPlayIE +from .skillshare import SkillshareClassIE from .skylinewebcams import SkylineWebcamsIE from .skynewsarabia import ( SkyNewsArabiaIE, diff --git a/youtube_dl/extractor/skillshare.py b/youtube_dl/extractor/skillshare.py new file mode 100644 index 000000000..c63b52ab5 --- /dev/null +++ b/youtube_dl/extractor/skillshare.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals +import json +from .common import InfoExtractor + + +class SkillshareClassIE(InfoExtractor): + IE_NAME = 'skillshare:class' + _VALID_URL = r'https?://(?:www\.)?skillshare\.com/classes/[^/]+/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.skillshare.com/classes/SEO-Today-Strategies-to-Earn-Trust-Rank-High-and-Stand-Out/423483018', + 'only_matching': True, + 'info_dict': { + 'id': '5463396146001', + 'ext': 'mp4', + 'title': 'Introduction', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + search_term = r'(?P{"userData":{.+});\n' + lesson_info_api_url_format = "https://www.skillshare.com/sessions/{}/video" + video_api_url_format = "https://edge.api.brightcove.com/playback/v1/accounts/{}/videos/{}" + headers = {"Accept": "application/json;pk=BCpkADawqM2OOcM6njnM7hf9EaK6lIFlqiXB0iWjqGWU QjU7R8965xUvIQNqdQbnDTLz0IAO7E6Ir2rIbXJtFdzrGtitoee0n1XXRliD-RH9A-svuvNW 9qgo3Bh34HEZjXjG4Nml4iyz3KqF"} + class_id = self._match_id(url) + class_page = self._download_webpage(url, class_id) + class_json_data = json.loads(self._search_regex(search_term, class_page, 'class_json_data')) + account_id = str(class_json_data.get('pageData').get('videoPlayerData').get('brightcoveAccountId')) + class_title = class_json_data.get('pageData').get('headerData').get('title') + lessons = class_json_data.get('pageData').get('videoPlayerData').get('units')[0].get('sessions') + videos = [] + for lesson in lessons: + lesson_id = str(lesson.get('id')) + lesson_info_api_url = lesson_info_api_url_format.format(lesson_id) + lesson_info_api_response = self._download_json(lesson_info_api_url, lesson_id) + print(lesson_info_api_response) + if 'video_hashed_id' not in lesson_info_api_response: + break + video_hashed_id = lesson_info_api_response.get('video_hashed_id')[3:] + video_api_url = video_api_url_format.format(account_id, video_hashed_id) + video_api_response = self._download_json(video_api_url, video_hashed_id, headers=headers) + lesson_title = lesson.get('title') + lesson_url = video_api_response.get('sources')[-1].get('src') + video = { + 'id': video_hashed_id, + 'title': lesson_title, + 'url': lesson_url, + 'ext': 'mp4', + } + videos.append(video) + return { + 'id': class_id, + 'title': class_title, + '_type': 'playlist', + 'entries': videos + } From 39eb11694ca64bef676343f7f8e706b31e84184e Mon Sep 17 00:00:00 2001 From: shaileshaanand Date: Fri, 13 Mar 2020 13:10:36 +0530 Subject: [PATCH 2/5] [skillshare:class] use brightcove extractor --- youtube_dl/extractor/skillshare.py | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/skillshare.py b/youtube_dl/extractor/skillshare.py index c63b52ab5..43a6113c2 100644 --- a/youtube_dl/extractor/skillshare.py +++ b/youtube_dl/extractor/skillshare.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor +from .brightcove import BrightcoveNewIE class SkillshareClassIE(InfoExtractor): @@ -10,52 +11,26 @@ class SkillshareClassIE(InfoExtractor): _TEST = { 'url': 'https://www.skillshare.com/classes/SEO-Today-Strategies-to-Earn-Trust-Rank-High-and-Stand-Out/423483018', 'only_matching': True, - 'info_dict': { - 'id': '5463396146001', - 'ext': 'mp4', - 'title': 'Introduction', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) - } } def _real_extract(self, url): - search_term = r'(?P{"userData":{.+});\n' - lesson_info_api_url_format = "https://www.skillshare.com/sessions/{}/video" - video_api_url_format = "https://edge.api.brightcove.com/playback/v1/accounts/{}/videos/{}" - headers = {"Accept": "application/json;pk=BCpkADawqM2OOcM6njnM7hf9EaK6lIFlqiXB0iWjqGWU QjU7R8965xUvIQNqdQbnDTLz0IAO7E6Ir2rIbXJtFdzrGtitoee0n1XXRliD-RH9A-svuvNW 9qgo3Bh34HEZjXjG4Nml4iyz3KqF"} class_id = self._match_id(url) - class_page = self._download_webpage(url, class_id) - class_json_data = json.loads(self._search_regex(search_term, class_page, 'class_json_data')) + class_json_data = json.loads(self._search_regex(r'(?P{"userData":{.+});\n', self._download_webpage(url, class_id), 'class_json_data')) account_id = str(class_json_data.get('pageData').get('videoPlayerData').get('brightcoveAccountId')) - class_title = class_json_data.get('pageData').get('headerData').get('title') lessons = class_json_data.get('pageData').get('videoPlayerData').get('units')[0].get('sessions') - videos = [] + entries = [] for lesson in lessons: lesson_id = str(lesson.get('id')) - lesson_info_api_url = lesson_info_api_url_format.format(lesson_id) - lesson_info_api_response = self._download_json(lesson_info_api_url, lesson_id) - print(lesson_info_api_response) + lesson_info_api_response = self._download_json("https://www.skillshare.com/sessions/{}/video".format(lesson_id), lesson_id) if 'video_hashed_id' not in lesson_info_api_response: break video_hashed_id = lesson_info_api_response.get('video_hashed_id')[3:] - video_api_url = video_api_url_format.format(account_id, video_hashed_id) - video_api_response = self._download_json(video_api_url, video_hashed_id, headers=headers) - lesson_title = lesson.get('title') - lesson_url = video_api_response.get('sources')[-1].get('src') - video = { + entry = { + '_type': 'url_transparent', 'id': video_hashed_id, - 'title': lesson_title, - 'url': lesson_url, - 'ext': 'mp4', + 'title': lesson.get('title'), + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': 'https://players.brightcove.net/{}/default_default/index.html?videoId={}'.format(account_id, video_hashed_id), } - videos.append(video) - return { - 'id': class_id, - 'title': class_title, - '_type': 'playlist', - 'entries': videos - } + entries.append(entry) + return self.playlist_result(entries, class_id, class_json_data.get('pageData').get('headerData').get('title'), class_json_data.get("pageData").get('sectionData').get('description')) From 2df32f9697b45a82e92198d95088cc8e13d862b8 Mon Sep 17 00:00:00 2001 From: shaileshaanand Date: Sun, 15 Mar 2020 14:53:02 +0530 Subject: [PATCH 3/5] change get() to subscripts for mandatory data --- youtube_dl/extractor/skillshare.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/skillshare.py b/youtube_dl/extractor/skillshare.py index 43a6113c2..39d15c862 100644 --- a/youtube_dl/extractor/skillshare.py +++ b/youtube_dl/extractor/skillshare.py @@ -1,6 +1,5 @@ # coding: utf-8 from __future__ import unicode_literals -import json from .common import InfoExtractor from .brightcove import BrightcoveNewIE @@ -15,22 +14,22 @@ class SkillshareClassIE(InfoExtractor): def _real_extract(self, url): class_id = self._match_id(url) - class_json_data = json.loads(self._search_regex(r'(?P{"userData":{.+});\n', self._download_webpage(url, class_id), 'class_json_data')) - account_id = str(class_json_data.get('pageData').get('videoPlayerData').get('brightcoveAccountId')) - lessons = class_json_data.get('pageData').get('videoPlayerData').get('units')[0].get('sessions') + class_json_data = self._parse_json(self._search_regex(r'(?P{"userData":{.+});\n', self._download_webpage(url, class_id), 'class_json_data'), class_id) + account_id = str(class_json_data['pageData']['videoPlayerData']['brightcoveAccountId']) + lessons = class_json_data['pageData']['videoPlayerData']['units'][0]['sessions'] entries = [] for lesson in lessons: - lesson_id = str(lesson.get('id')) + lesson_id = str(lesson['id']) lesson_info_api_response = self._download_json("https://www.skillshare.com/sessions/{}/video".format(lesson_id), lesson_id) if 'video_hashed_id' not in lesson_info_api_response: break - video_hashed_id = lesson_info_api_response.get('video_hashed_id')[3:] + video_hashed_id = lesson_info_api_response['video_hashed_id'][3:] entry = { '_type': 'url_transparent', 'id': video_hashed_id, - 'title': lesson.get('title'), + 'title': lesson['title'], 'ie_key': BrightcoveNewIE.ie_key(), 'url': 'https://players.brightcove.net/{}/default_default/index.html?videoId={}'.format(account_id, video_hashed_id), } entries.append(entry) - return self.playlist_result(entries, class_id, class_json_data.get('pageData').get('headerData').get('title'), class_json_data.get("pageData").get('sectionData').get('description')) + return self.playlist_result(entries, class_id, class_json_data['pageData']['headerData']['title'], class_json_data["pageData"]['sectionData']['description']) From ad0a7f648070875c23133493092f461113debe32 Mon Sep 17 00:00:00 2001 From: shaileshaanand Date: Sun, 15 Mar 2020 15:07:04 +0530 Subject: [PATCH 4/5] use get() for optional description field --- youtube_dl/extractor/skillshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/skillshare.py b/youtube_dl/extractor/skillshare.py index 39d15c862..c787fe750 100644 --- a/youtube_dl/extractor/skillshare.py +++ b/youtube_dl/extractor/skillshare.py @@ -32,4 +32,4 @@ class SkillshareClassIE(InfoExtractor): 'url': 'https://players.brightcove.net/{}/default_default/index.html?videoId={}'.format(account_id, video_hashed_id), } entries.append(entry) - return self.playlist_result(entries, class_id, class_json_data['pageData']['headerData']['title'], class_json_data["pageData"]['sectionData']['description']) + return self.playlist_result(entries, class_id, class_json_data['pageData']['headerData']['title'], class_json_data.get("pageData").get('sectionData').get('description')) From cd3a7c64ea63fefe345312242f3a66298d9cea1f Mon Sep 17 00:00:00 2001 From: shaileshaanand Date: Mon, 16 Mar 2020 13:32:26 +0530 Subject: [PATCH 5/5] Made all requested changes --- youtube_dl/extractor/skillshare.py | 33 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/skillshare.py b/youtube_dl/extractor/skillshare.py index c787fe750..44d8680fb 100644 --- a/youtube_dl/extractor/skillshare.py +++ b/youtube_dl/extractor/skillshare.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from .brightcove import BrightcoveNewIE +from ..utils import ( + try_get, + compat_str +) class SkillshareClassIE(InfoExtractor): @@ -14,22 +18,33 @@ class SkillshareClassIE(InfoExtractor): def _real_extract(self, url): class_id = self._match_id(url) - class_json_data = self._parse_json(self._search_regex(r'(?P{"userData":{.+});\n', self._download_webpage(url, class_id), 'class_json_data'), class_id) - account_id = str(class_json_data['pageData']['videoPlayerData']['brightcoveAccountId']) + class_json_data = self._parse_json(self._search_regex( + r'(?P{.+pageData.+});\n', + self._download_webpage(url, class_id), 'class_json_data'), class_id) + account_id = class_json_data['pageData']['videoPlayerData']['brightcoveAccountId'] lessons = class_json_data['pageData']['videoPlayerData']['units'][0]['sessions'] entries = [] for lesson in lessons: - lesson_id = str(lesson['id']) - lesson_info_api_response = self._download_json("https://www.skillshare.com/sessions/{}/video".format(lesson_id), lesson_id) + lesson_id = lesson.get('id') + lesson_info_api_response = self._download_json( + "https://www.skillshare.com/sessions/%s/video" % lesson_id, + lesson_id) if 'video_hashed_id' not in lesson_info_api_response: break - video_hashed_id = lesson_info_api_response['video_hashed_id'][3:] + video_hashed_id = self._search_regex( + r'(\d+)', lesson_info_api_response.get('video_hashed_id'), + 'video_hashed_id') entry = { + # the brightcove extractor extracts the title and id '_type': 'url_transparent', - 'id': video_hashed_id, - 'title': lesson['title'], 'ie_key': BrightcoveNewIE.ie_key(), - 'url': 'https://players.brightcove.net/{}/default_default/index.html?videoId={}'.format(account_id, video_hashed_id), + 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (account_id, video_hashed_id), } entries.append(entry) - return self.playlist_result(entries, class_id, class_json_data['pageData']['headerData']['title'], class_json_data.get("pageData").get('sectionData').get('description')) + return self.playlist_result( + entries, class_id, try_get( + class_json_data, lambda x: x['pageData']['headerData']['title'], + compat_str), + try_get( + class_json_data, lambda x: x['pageData']['sectionData']['description'], + compat_str))