From a030fdabcf5db93d563a6bbf0c5cccff7e6fe9e2 Mon Sep 17 00:00:00 2001 From: Qijiang Fan Date: Sun, 30 Aug 2015 14:33:12 +0800 Subject: [PATCH 1/3] [test] recursively check dict and list in expect_info_dict This allows md5:, re:, etc. to be used within strings nested inside a list or dict. --- test/helper.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index cb6eec8d9..6612970ef 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,7 +89,7 @@ def gettestcases(include_onlymatching=False): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -def expect_info_dict(self, got_dict, expected_dict): +def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): if isinstance(expected, compat_str) and expected.startswith('re:'): got = got_dict.get(info_field) @@ -127,6 +127,22 @@ def expect_info_dict(self, got_dict, expected_dict): got = got_dict.get(info_field) self.assertTrue(isinstance(got, expected), 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) + elif isinstance(expected, dict) and isinstance(got_dict.get(info_field, None), dict): + expect_dict(self, got_dict.get(info_field), expected) + elif isinstance(expected, list) and isinstance(got_dict.get(info_field, None), list): + got = got_dict.get(info_field, None) + self.assertEqual(len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d' % ( + len(expected), len(got))) + _id = 0 + for i, j in zip(got, expected): + _type_i = type(i) + _type_j = type(j) + self.assertEqual(_type_j, _type_i, + 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( + _id, info_field, _type_j, _type_i)) + expect_dict(self, {'_': i}, {'_': j}) + _id += 1 else: if isinstance(expected, compat_str) and expected.startswith('md5:'): got = 'md5:' + md5(got_dict.get(info_field)) @@ -149,6 +165,9 @@ def expect_info_dict(self, got_dict, 
expected_dict): self.assertEqual(expected, got, 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + +def expect_info_dict(self, got_dict, expected_dict): + expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields if got_dict.get('_type') not in ('playlist', 'multi_video'): for key in ('id', 'url', 'title', 'ext'): From 825be94d9612767d63ff13dbd92efe7eab574ca3 Mon Sep 17 00:00:00 2001 From: Qijiang Fan Date: Mon, 24 Aug 2015 00:31:30 +0800 Subject: [PATCH 2/3] [qqmusic] Add subtitles for QQMusic Use .lrc lyrics as subtitles if lyrics in lrc format exist. --- youtube_dl/extractor/qqmusic.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 1654a641f..8ead04606 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -58,6 +58,11 @@ class QQMusicIE(InfoExtractor): curMs = int(time.time() * 1000) % 1000 return int(round(random.random() * 2147483647) * curMs % 1E10) + def _filter_lrc(self, data): + lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\][^\n]*|\[[^\]]*\])' + texts = re.findall(lyrics_expr, data) + return ''.join(i + "\n" for i in texts) + def _real_extract(self, url): mid = self._match_id(url) @@ -112,15 +117,25 @@ class QQMusicIE(InfoExtractor): self._check_formats(formats, mid) self._sort_formats(formats) - return { + actual_lrc_lyrics = self._filter_lrc(lrc_content) + + info_dict = { 'id': mid, 'formats': formats, 'title': song_name, 'upload_date': publish_time, 'creator': singer, 'description': lrc_content, - 'thumbnail': thumbnail_url, + 'thumbnail': thumbnail_url } + if actual_lrc_lyrics: + info_dict['subtitles'] = { + 'origin': [{ + 'ext': 'lrc', + 'data': actual_lrc_lyrics, + }] + } + return info_dict class QQPlaylistBaseIE(InfoExtractor): From 97de55d4acd897a6fdc36f253ebd4b949a5210b9 Mon Sep 17 00:00:00 2001 From: Qijiang Fan Date: Sat, 26 Sep 2015 
21:51:08 +0800 Subject: [PATCH 3/3] [neteasemusic] Add subtitles for NetEaseMusic 1. Use .lrc format lyrics as subtitles if available. 2. Remove the 3rd digit after the dot in NetEase's time tags to fit the LRC format standard for time tags. 3. Update lyrics_expr to match an empty string after a time tag as a valid lrc line (music players use this to hide the previous item's text). 4. Add new regular expressions to match only the text of a line with multiple time tags for translation, to avoid the time tags being treated as part of the lyrics. This will return at most 2 lyrics: one for the original text, the other for the translated text if it exists. Also add two extra tests to test: 1. multiple time tags in one line, 2. multiple time tags in one line with times that need to be fixed to hundredths of a second rather than milliseconds. --- youtube_dl/extractor/neteasemusic.py | 108 +++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index a8e0a64ed..e061978de 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -66,6 +66,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): IE_DESC = '网易云音乐' _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P[0-9]+)' _TESTS = [{ + 'note': 'origin + translated lyrics, with time tag need to be fixed', 'url': 'http://music.163.com/#/song?id=32102397', 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', 'info_dict': { @@ -75,7 +76,10 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'creator': 'Taylor Swift / Kendrick Lamar', 'upload_date': '20150517', 'timestamp': 1431878400, - 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', + 'subtitles': { + 'origin': [{'ext': 'lrc', 'data': 'md5:eb9ae90502b435de7d9e99fc7602adb4'}], + 'translated': [{'ext': 'lrc', 'data': 'md5:cadca69fdfb7b679d273cc01a518f7dd'}], + } }, }, { 'note': 'No lyrics translation.', @@ -87,7 +91,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'creator': '周杰伦', 'upload_date': 
'20141225', 'timestamp': 1419523200, - 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', + 'subtitles': { + 'origin': [{'ext': 'lrc', 'data': 'md5:a1766edaa6dbc85357f0ae9feabc867b'}], + } }, }, { 'note': 'No lyrics.', @@ -99,6 +105,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', 'timestamp': 1202745600, + 'subtitles': {} }, }, { 'note': 'Has translated name.', @@ -108,30 +115,87 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'ext': 'mp3', 'title': '소원을 말해봐 (Genie)', 'creator': '少女时代', - 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', 'upload_date': '20100127', 'timestamp': 1264608000, 'alt_title': '说出愿望吧(Genie)', + 'subtitles': { + 'origin': [{'ext': 'lrc', 'data': 'md5:8d5782f92bb275b9a6acd01e9ffd12b9'}], + 'translated': [{'ext': 'lrc', 'data': 'md5:d8270e3375fd305f92b18ad78585cabb'}], + } + } + }, { + 'note': 'some lines with multiple time tag', + 'url': 'http://music.163.com/#/song?id=4926366', + 'info_dict': { + 'id': '4926366', + 'ext': 'mp3', + 'title': 'sweet&sweet holiday', + 'timestamp': 1306252800, + 'upload_date': '20110524', + 'subtitles': { + 'origin': [{'ext': 'lrc', 'data': 'md5:9971db3f0361b0b66d47ba5c95bbff35'}], + 'translated': [{'ext': 'lrc', 'data': 'md5:ce5f9eef13ae4948b01bbb015fc507e1'}], + } + } + }, { + 'note': 'some lines with multiple time tag and time need to be fixed', + 'url': 'http://music.163.com/#/song?id=22826396', + 'info_dict': { + 'id': '22826396', + 'ext': 'mp3', + 'title': 'God knows...', + 'timestamp': 1306252800, + 'upload_date': '20110524', + 'subtitles': { + 'origin': [{'ext': 'lrc', 'data': 'md5:5ca2952ed8974f2c28beb1c0f89e2ab5'}], + 'translated': [{'ext': 'lrc', 'data': 'md5:5ed0d4adb337594f927674d351ba6626'}], + } } }] + def _fix_timestamp(self, timestamp): + # Netease returns timestamp use 2 or 3 digits for less than + # 1 second metrics + # While standard LRC requires exact 2 digits + # remvoe the 3rd digit if the there're three + match_expr = 
r'\[([0-9]{2}:[0-9]{2}\.[0-9]{2})(.*)\]' + match_result = re.match(match_expr, timestamp) + if match_result: + # This must match as match_expr is exact the first group of lyrics_expr + return "[" + match_result.group(1) + "]" + else: + # return a valid timestamp to avoid other applications' error + return "[00:00.00]" + + def _fix_lyric_timestamp(self, line): + text = line + corrected_times = "" + lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]*)' + # Handle time tag with one or more time in one line. + while True: + m = re.match(lyrics_expr, text) + if m: + corrected_times += self._fix_timestamp(m.group(1)) + text = m.group(2) + else: + break + return corrected_times + text + def _process_lyrics(self, lyrics_info): original = lyrics_info.get('lrc', {}).get('lyric') translated = lyrics_info.get('tlyric', {}).get('lyric') if not translated: - return original + translated = "" + if not original: + original = "" - lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' + lyrics_expr = r'\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\][^\n]*' original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = dict( - (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) - ) - lyrics = '\n'.join([ - '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) - for time_stamp, text in original_ts_texts - ]) - return lyrics + translated_ts_texts = re.findall(lyrics_expr, translated) + lyrics_original = '\n'.join(map(self._fix_lyric_timestamp, original_ts_texts)) + '\n' if original_ts_texts else '' + lyrics_translated = '\n'.join(map(self._fix_lyric_timestamp, translated_ts_texts)) + '\n' if translated_ts_texts else '' + return lyrics_original, lyrics_translated def _real_extract(self, url): song_id = self._match_id(url) @@ -150,13 +214,13 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): lyrics_info = self.query_api( 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, song_id, 'Downloading lyrics data') - lyrics = 
self._process_lyrics(lyrics_info) + lyrics_original, lyrics_translated = self._process_lyrics(lyrics_info) alt_title = None if info.get('transNames'): alt_title = '/'.join(info.get('transNames')) - return { + ret = { 'id': song_id, 'title': info['name'], 'alt_title': alt_title, @@ -164,9 +228,21 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), 'thumbnail': info.get('album', {}).get('picUrl'), 'duration': self.convert_milliseconds(info.get('duration', 0)), - 'description': lyrics, + 'description': '', + 'subtitles': {}, 'formats': formats, } + if lyrics_original: + ret['subtitles']['origin'] = [{ + 'ext': 'lrc', + 'data': lyrics_original + }] + if lyrics_translated: + ret['subtitles']['translated'] = [{ + 'ext': 'lrc', + 'data': lyrics_translated + }] + return ret class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):