From b402f3de828aec21db3c49ee361791a981ca7562 Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Fri, 9 Nov 2018 16:49:20 -0500 Subject: [PATCH 1/7] [tiktok] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tiktok.py | 41 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/tiktok.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5488cce4..184f8e3fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1120,6 +1120,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tinypic import TinyPicIE from .tmz import ( TMZIE, diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py new file mode 100644 index 000000000..39ab561fa --- /dev/null +++ b/youtube_dl/extractor/tiktok.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import urlhandle_detect_ext + + +class TikTokIE(InfoExtractor): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://m.tiktok.com/v/6606727368545406213.html', + 'md5': '163ceff303bb52de60e6887fe399e6cd', + 'info_dict': { + 'id': '6606727368545406213', + 'ext': 'mp4', + 'title': 'Zureeal|TikTok|Global Video Community', + 'thumbnail': 'http://m-p16.akamaized.net/img/tos-maliva-p-0068/5e7a4ec40fb146888fa27aa8d78f86fd~noop.image', + 'description': 'Zureeal has just created an awesome short video with ♬ original sound - joogieboy1596', + 'uploader': 'Zureeal', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_string = self._search_regex(r'var data = ({.*});', webpage, 'json_string') + json_data = self._parse_json(json_string, video_id) + 
title = self._og_search_title(webpage) + video_url = json_data.get("video").get("play_addr").get("url_list")[0] + uploader = json_data.get("author").get("nickname") + thumbnail = json_data.get("video").get("cover").get("url_list")[0] + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': uploader, + 'url': video_url, + 'ext': urlhandle_detect_ext(self._download_webpage_handle(video_url, video_id)[1]), + 'thumbnail': thumbnail, + } From 5a79ced2a9136675b353172f247996697dd06346 Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Sat, 10 Nov 2018 06:56:24 -0500 Subject: [PATCH 2/7] [tiktok] Ensure optional fields aren't fatal --- youtube_dl/extractor/tiktok.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 39ab561fa..082ee908a 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -26,16 +26,21 @@ class TikTokIE(InfoExtractor): json_string = self._search_regex(r'var data = ({.*});', webpage, 'json_string') json_data = self._parse_json(json_string, video_id) title = self._og_search_title(webpage) + description = self._og_search_description(webpage) video_url = json_data.get("video").get("play_addr").get("url_list")[0] uploader = json_data.get("author").get("nickname") - thumbnail = json_data.get("video").get("cover").get("url_list")[0] + thumbnail_list = json_data.get("video").get("cover").get("url_list") + thumbnail = thumbnail_list[0] if len(thumbnail_list) > 0 else None + handle = self._download_webpage_handle(video_url, video_id, fatal=False) + URLHandle = handle[1] if handle is not False else None + ext = urlhandle_detect_ext(URLHandle) return { 'id': video_id, 'title': title, - 'description': self._og_search_description(webpage), + 'description': description, 'uploader': uploader, 'url': video_url, - 'ext': urlhandle_detect_ext(self._download_webpage_handle(video_url, 
video_id)[1]), + 'ext': ext, 'thumbnail': thumbnail, } From efe608880320c05a8948cbfa056e828490762276 Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Sat, 10 Nov 2018 09:42:57 -0500 Subject: [PATCH 3/7] [tiktok] Rewrite using safe conversions --- youtube_dl/extractor/tiktok.py | 36 ++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 082ee908a..26ff86e50 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -2,7 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import urlhandle_detect_ext +from ..utils import ( + urlhandle_detect_ext, + try_get, + compat_str, + url_or_none, + str_or_none, +) class TikTokIE(InfoExtractor): @@ -23,17 +29,27 @@ class TikTokIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - json_string = self._search_regex(r'var data = ({.*});', webpage, 'json_string') - json_data = self._parse_json(json_string, video_id) + + data = self._parse_json( + self._search_regex( + r'var data = ({.*});', webpage, 'json_string', webpage, 'data' + ), video_id) + title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = json_data.get("video").get("play_addr").get("url_list")[0] - uploader = json_data.get("author").get("nickname") - thumbnail_list = json_data.get("video").get("cover").get("url_list") - thumbnail = thumbnail_list[0] if len(thumbnail_list) > 0 else None - handle = self._download_webpage_handle(video_url, video_id, fatal=False) - URLHandle = handle[1] if handle is not False else None - ext = urlhandle_detect_ext(URLHandle) + + video_url = url_or_none( + try_get(data, lambda x: x['video']['play_addr']['url_list'][0], compat_str)) + + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + + thumbnail = url_or_none( + 
try_get( + data, lambda x: x['video']['cover']['url_list'][0], compat_str)) + + ext = str_or_none( + urlhandle_detect_ext( + self._request_webpage(video_url, video_id, fatal=False))) return { 'id': video_id, From 48113833c8853c43cf5091acb772c46ca95fd1ef Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Sat, 10 Nov 2018 14:43:24 -0500 Subject: [PATCH 4/7] [tiktok] Requested changes and all formats --- youtube_dl/extractor/tiktok.py | 78 ++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 26ff86e50..c47adf509 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - urlhandle_detect_ext, - try_get, compat_str, + determine_ext, + int_or_none, + try_get, url_or_none, - str_or_none, + urlhandle_detect_ext ) @@ -15,7 +16,7 @@ class TikTokIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' _TEST = { 'url': 'https://m.tiktok.com/v/6606727368545406213.html', - 'md5': '163ceff303bb52de60e6887fe399e6cd', + 'md5': 'd584b572e92fcd48888051f238022420', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', @@ -23,6 +24,8 @@ class TikTokIE(InfoExtractor): 'thumbnail': 'http://m-p16.akamaized.net/img/tos-maliva-p-0068/5e7a4ec40fb146888fa27aa8d78f86fd~noop.image', 'description': 'Zureeal has just created an awesome short video with ♬ original sound - joogieboy1596', 'uploader': 'Zureeal', + 'width': 540, + 'height': 960, } } @@ -32,14 +35,66 @@ class TikTokIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'var data = ({.*});', webpage, 'json_string', webpage, 'data' + r'var data = ({.+?});', webpage, 'json_string', webpage, 'data' ), video_id) title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = url_or_none( - try_get(data, lambda x: 
x['video']['play_addr']['url_list'][0], compat_str)) + width = int_or_none(try_get(data, lambda x: x['video']['width'], int)) + height = int_or_none(try_get(data, lambda x: x['video']['height'], int)) + + formats = [] + + for url in data['video']['play_addr']['url_list']: + ext = determine_ext(url) + if ext == 'unknown_video': + urlh = self._request_webpage( + url, video_id, note='Determining extension' + ) + ext = urlhandle_detect_ext(urlh) + formats.append({ + 'url': url, + 'ext': ext, + 'height': height, + 'width': width, + 'quality': -2, + 'format_note': "Normal quality", + }) + + for url in data['video']['download_addr']['url_list']: + ext = determine_ext(url) + if ext == 'unknown_video': + urlh = self._request_webpage( + url, video_id, note='Determining extension' + ) + ext = urlhandle_detect_ext(urlh) + formats.append({ + 'url': url, + 'ext': ext, + 'height': height, + 'width': width, + 'quality': 1, + 'format_note': "Download quality", + }) + + for url in data['video']['play_addr_lowbr']['url_list']: + ext = determine_ext(url) + if ext == 'unknown_video': + urlh = self._request_webpage( + url, video_id, note='Determining extension' + ) + ext = urlhandle_detect_ext(urlh) + formats.append({ + 'url': url, + 'ext': ext, + 'height': height, + 'width': width, + 'quality': -3, + 'format_note': "Low bitrate", + }) + + self._sort_formats(formats) uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) @@ -47,16 +102,13 @@ class TikTokIE(InfoExtractor): try_get( data, lambda x: x['video']['cover']['url_list'][0], compat_str)) - ext = str_or_none( - urlhandle_detect_ext( - self._request_webpage(video_url, video_id, fatal=False))) - return { 'id': video_id, 'title': title, 'description': description, 'uploader': uploader, - 'url': video_url, - 'ext': ext, + 'formats': formats, 'thumbnail': thumbnail, + 'width': width, + 'height': height, } From 398624aeb10f1f8ecb2a9a7dbad04e7eb303c0fa Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Fri, 16 Nov 2018 
15:15:31 -0500 Subject: [PATCH 5/7] [tiktok] Dedupe code and requested changes --- youtube_dl/extractor/tiktok.py | 65 ++++++++-------------------------- 1 file changed, 15 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index c47adf509..9dbd92237 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -4,11 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( compat_str, - determine_ext, int_or_none, + str_or_none, try_get, url_or_none, - urlhandle_detect_ext ) @@ -22,7 +21,7 @@ class TikTokIE(InfoExtractor): 'ext': 'mp4', 'title': 'Zureeal|TikTok|Global Video Community', 'thumbnail': 'http://m-p16.akamaized.net/img/tos-maliva-p-0068/5e7a4ec40fb146888fa27aa8d78f86fd~noop.image', - 'description': 'Zureeal has just created an awesome short video with ♬ original sound - joogieboy1596', + 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'uploader': 'Zureeal', 'width': 540, 'height': 960, @@ -35,66 +34,32 @@ class TikTokIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'var data = ({.+?});', webpage, 'json_string', webpage, 'data' + r'var data = ({.+?});', webpage, 'data' ), video_id) title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - width = int_or_none(try_get(data, lambda x: x['video']['width'], int)) - height = int_or_none(try_get(data, lambda x: x['video']['height'], int)) + description = str_or_none(try_get(data, lambda x: x['desc'])) + width = int_or_none(try_get(data, lambda x: x['video']['width'])) + height = int_or_none(try_get(data, lambda x: x['video']['height'])) formats = [] - for url in data['video']['play_addr']['url_list']: - ext = determine_ext(url) - if ext == 'unknown_video': - urlh = self._request_webpage( - url, video_id, note='Determining extension' - ) - ext = urlhandle_detect_ext(urlh) + def extract_formats(url_list): 
+ if url_list[0] is None: + return + for url in url_list[0]: formats.append({ 'url': url, - 'ext': ext, + 'ext': 'mp4', 'height': height, 'width': width, - 'quality': -2, - 'format_note': "Normal quality", + 'format_note': url_list[1] }) - for url in data['video']['download_addr']['url_list']: - ext = determine_ext(url) - if ext == 'unknown_video': - urlh = self._request_webpage( - url, video_id, note='Determining extension' - ) - ext = urlhandle_detect_ext(urlh) - formats.append({ - 'url': url, - 'ext': ext, - 'height': height, - 'width': width, - 'quality': 1, - 'format_note': "Download quality", - }) - - for url in data['video']['play_addr_lowbr']['url_list']: - ext = determine_ext(url) - if ext == 'unknown_video': - urlh = self._request_webpage( - url, video_id, note='Determining extension' - ) - ext = urlhandle_detect_ext(urlh) - formats.append({ - 'url': url, - 'ext': ext, - 'height': height, - 'width': width, - 'quality': -3, - 'format_note': "Low bitrate", - }) - - self._sort_formats(formats) + extract_formats((try_get(data, lambda x: x['video']['play_addr_lowbr']['url_list']), 'Low quality')) + extract_formats((try_get(data, lambda x: x['video']['play_addr']['url_list']), 'Normal quality')) + extract_formats((try_get(data, lambda x: x['video']['download_addr']['url_list']), 'Download quality')) uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) From 1673afd9f2df6797614b62051092c3db356a1d0a Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Thu, 29 Nov 2018 18:29:01 -0500 Subject: [PATCH 6/7] Accept all whitespace in regex. Include quality for each format. 
--- youtube_dl/extractor/tiktok.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 9dbd92237..8601862c2 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -15,11 +15,10 @@ class TikTokIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' _TEST = { 'url': 'https://m.tiktok.com/v/6606727368545406213.html', - 'md5': 'd584b572e92fcd48888051f238022420', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', - 'title': 'Zureeal|TikTok|Global Video Community', + 'title': 'Zureeal on TikTok', 'thumbnail': 'http://m-p16.akamaized.net/img/tos-maliva-p-0068/5e7a4ec40fb146888fa27aa8d78f86fd~noop.image', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'uploader': 'Zureeal', @@ -34,7 +33,7 @@ class TikTokIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'var data = ({.+?});', webpage, 'data' + r'var\s*data\s*=\s*({.+?});', webpage, 'data' ), video_id) title = self._og_search_title(webpage) @@ -45,21 +44,21 @@ class TikTokIE(InfoExtractor): formats = [] - def extract_formats(url_list): - if url_list[0] is None: - return - for url in url_list[0]: + for key, label in (('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')): + for format in try_get(data, lambda x: x['video'][key]['url_list']): + format_url = url_or_none(format) + if not format_url: + continue formats.append({ 'url': url, 'ext': 'mp4', 'height': height, 'width': width, - 'format_note': url_list[1] + 'format_note': label, + 'quality': -2 if label == 'Low' else (1 if label == 'Download' else 0) }) - extract_formats((try_get(data, lambda x: x['video']['play_addr_lowbr']['url_list']), 'Low quality')) - extract_formats((try_get(data, lambda x: x['video']['play_addr']['url_list']), 'Normal quality')) - extract_formats((try_get(data, lambda x: 
x['video']['download_addr']['url_list']), 'Download quality')) + self._sort_formats(formats) uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) From 916e0c422d603663ee97c71652968197a5cbdfc0 Mon Sep 17 00:00:00 2001 From: Ken Swenson Date: Sat, 1 Dec 2018 12:33:30 -0500 Subject: [PATCH 7/7] Update regex, use regex for thumbnail, use enumerate for quality --- youtube_dl/extractor/tiktok.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 8601862c2..d71b09c66 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -15,11 +15,12 @@ class TikTokIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' _TEST = { 'url': 'https://m.tiktok.com/v/6606727368545406213.html', + 'md5': 'd584b572e92fcd48888051f238022420', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', 'title': 'Zureeal on TikTok', - 'thumbnail': 'http://m-p16.akamaized.net/img/tos-maliva-p-0068/5e7a4ec40fb146888fa27aa8d78f86fd~noop.image', + 'thumbnail': r're:^https?://.*~noop.image', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'uploader': 'Zureeal', 'width': 540, @@ -33,7 +34,7 @@ class TikTokIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'var\s*data\s*=\s*({.+?});', webpage, 'data' + r'var\s+data\s*=\s*({.+?});', webpage, 'data' ), video_id) title = self._og_search_title(webpage) @@ -44,18 +45,18 @@ class TikTokIE(InfoExtractor): formats = [] - for key, label in (('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')): + for count, (key, label) in enumerate((('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')), -2): for format in try_get(data, lambda x: x['video'][key]['url_list']): format_url = url_or_none(format) if not format_url: continue formats.append({ - 'url': url, + 'url': format_url, 'ext': 'mp4', 'height': height, 
'width': width, 'format_note': label, - 'quality': -2 if label == 'Low' else (1 if label == 'Download' else 0) + 'quality': count }) self._sort_formats(formats)