From 6a4d1f367240db447be71842006ed0bfe12f8abe Mon Sep 17 00:00:00 2001 From: user706 <39215612+user706@users.noreply.github.com> Date: Fri, 28 Dec 2018 23:50:22 +0100 Subject: [PATCH 1/3] fix extractor for national jukebox --- youtube_dl/extractor/libraryofcongress.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 03f205144..44e37664e 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -16,7 +16,7 @@ from ..utils import ( class LibraryOfCongressIE(InfoExtractor): IE_NAME = 'loc' IE_DESC = 'Library of Congress' - _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|jukebox/recordings/detail/id/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)' _TESTS = [{ # embedded via
.+?)\1', r']+id=(["\'])uuid-(?P<id>.+?)\1', r']+data-uuid=(["\'])(?P<id>.+?)\1', - r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1', + r'"?mediaObjectId"?\s*:\s*(["\'])(?P<id>.+?)\1', r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'), webpage, 'media id', group='id') From 40e2cbd86c5fef790281bcc9b018625463d0053a Mon Sep 17 00:00:00 2001 From: user706 <39215612+user706@users.noreply.github.com> Date: Sat, 29 Dec 2018 00:35:04 +0100 Subject: [PATCH 2/3] [libraryofcongress] better title if national jukebox --- youtube_dl/extractor/libraryofcongress.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 44e37664e..85a75db46 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -83,8 +83,9 @@ class LibraryOfCongressIE(InfoExtractor): derivative = data['derivatives'][0] media_url = derivative['derivativeUrl'] - title = derivative.get('shortName') or data.get('shortName') or self._og_search_title( - webpage) + title = derivative.get('shortName') + if (not title) or (title == "128 Bit Derivative"): + title = data.get('shortName') or self._og_search_title(webpage) # Following algorithm was extracted from setAVSource js function # found in webpage From ed3ffda40b426e2afbedab75e15eb87f0c99cd76 Mon Sep 17 00:00:00 2001 From: user706 <39215612+user706@users.noreply.github.com> Date: Sat, 29 Dec 2018 00:41:34 +0100 Subject: [PATCH 3/3] [libraryofcongress] test for national jukebox --- youtube_dl/extractor/libraryofcongress.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 85a75db46..b1de82a9d 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -18,6 +18,15 @@ class LibraryOfCongressIE(InfoExtractor): IE_DESC = 'Library of Congress' _VALID_URL = 
r'https?://(?:www\.)?loc\.gov/(?:item/|jukebox/recordings/detail/id/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)' _TESTS = [{ + # national jukebox + 'url': 'https://www.loc.gov/jukebox/recordings/detail/id/824', + 'md5': '0263fed2c1cd62d733037c20c81e9e0e', + 'info_dict': { + 'id': '824', + 'ext': 'mp3', + 'title': 'Bring back my Bonnie to me' + }, + }, { # embedded via