diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index a10c9fd83..64a2eb736 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.12 + [debug] youtube-dl version 2020.11.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9cc120d3e..c0c789673 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 29bd5f5ac..c90dbf30c 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index cc33a993f..4c9d295de 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.12 + [debug] youtube-dl version 2020.11.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index cd577ecef..a6040daee 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1ef7ea7b6..4d404a56e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,43 @@ +version 2020.11.18 + +Extractors +* [spiegel] Fix extraction (#24206, #24767) +* [youtube] Improve extraction + + Add support for --no-playlist (#27009) + * Improve playlist and mix extraction (#26390, #26509, #26534, #27011) + + Extract playlist uploader data +* [youtube:tab] Fix view count extraction (#27051) +* [malltv] Fix extraction (#27035) ++ [bandcamp] Extract playlist description (#22684) +* [urplay] Fix extraction (#26828) +* [youtube:tab] Fix playlist title extraction (#27015) +* [youtube] Fix chapters extraction (#26005) + + +version 2020.11.17 + +Core +* [utils] Skip ! prefixed code in js_to_json + +Extractors +* [youtube:tab] Fix extraction with cookies provided (#27005) +* [lrt] Fix extraction with empty tags (#20264) ++ [ndr:embed:base] Extract subtitles (#25447, #26106) ++ [servus] Add support for pm-wissen.com (#25869) +* [servus] Fix extraction (#26872, #26967, #26983, #27000) +* [xtube] Fix extraction (#26996) +* [lrt] Fix extraction ++ [lbry] Add support for lbry.tv ++ [condenast] Extract subtitles +* [condenast] Fix extraction +* [bandcamp] Fix extraction (#26681, #26684) +* [rai] Fix RaiPlay extraction (#26064, #26096) +* [vlive] Fix extraction +* [usanetwork] Fix extraction +* [nbc] Fix NBCNews/Today/MSNBC extraction +* [cnbc] Fix extraction + + version 2020.11.12 Extractors diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 0a1762dbc..878ae72b1 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -61,7 +61,7 @@ def build_lazy_ie(ie, name): return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses # can be correctly created classes = _ALL_CLASSES[:-1] ordered_cls = [] diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0c77d017e..9f0cd6ff6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -417,6 +417,7 @@ - **la7.it** - **laola1tv** - **laola1tv:embed** + - **lbry.tv** - **LCI** - **Lcp** - **LcpPlay** @@ -823,8 +824,6 @@ - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - - **Spiegel:Article**: Articles on spiegel.de - - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - **SportBox** @@ -1042,7 +1041,6 @@ - **vk:wallpost** - **vlive** - **vlive:channel** - - **vlive:playlist** - **Vodlocker** - **VODPl** - **VODPlatform** diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 348744028..56a08bed8 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -31,16 +31,17 @@ class TestAllURLsMatching(unittest.TestCase): def test_youtube_playlist_matching(self): assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) + assertTab = lambda url: self.assertMatch(url, ['youtube:tab']) assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') - # assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - # assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') - assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 + assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) # Top tracks - # assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') + assertTab('https://www.youtube.com/playlist?list=MCUS.20142101') def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) diff --git a/test/test_utils.py b/test/test_utils.py index c2d1e4fb1..925a21d34 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc..69e673a26 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import re import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,30 +15,32 @@ from ..utils import ( parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", 'duration': 9.8485, + 'uploader': 'youtube-dl "\'/\\ä↭', + 'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor): 'abr': int_or_none(abr_str), }) track = track_info.get('title') - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, - webpage, key, default=None, group='value') - - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -274,6 +270,7 @@ class BandcampAlbumIE(InfoExtractor): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -283,6 +280,7 @@ class BandcampAlbumIE(InfoExtractor): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -294,41 +292,34 @@ class BandcampAlbumIE(InfoExtractor): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] + + current = tralbum.get('current') or {} - title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', - webpage, 'title', fatal=False) - if title: - title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -343,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. - show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index 6889b0f40..7b9f4536a 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import smuggle_url @@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor): class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { @@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, - 'video id') + path, display_id = re.match(self._VALID_URL, url).groups() + video_id = self._download_json( + 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ + 'query': '''{ + page(path: "%s") { + vcpsId + } +}''' % path, + })['data']['page']['vcpsId'] return self.url_result( - 'http://video.cnbc.com/gallery/?video=%s' % video_id, + 'http://video.cnbc.com/gallery/?video=%d' % video_id, CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,6 +16,8 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, { # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) info.update(self._extract_video(params)) return info diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9d7fecfe8..11ef47261 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -529,6 +529,7 @@ from .laola1tv import ( EHFTVIE, ITTFIE, ) +from .lbry import LBRYIE from .lci import LCIIE from .lcp import ( LcpPlayIE, @@ -1053,8 +1054,7 @@ from .spankbang import ( SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, @@ -1357,7 +1357,6 @@ from .vk import ( from .vlive import ( VLiveIE, VLiveChannelIE, - VLivePlaylistIE ) from .vodlocker import VodlockerIE from .vodpl import VODPlIE diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py new file mode 100644 index 000000000..587deac90 --- /dev/null +++ b/youtube_dl/extractor/lbry.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + try_get, +) + + +class LBRYIE(InfoExtractor): + IE_NAME = 'lbry.tv' + _VALID_URL = r'https?://(?:www\.)?lbry\.tv/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' + _TESTS = [{ + # Video + 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', + 'info_dict': { + 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', + 'ext': 'mp4', + 'title': 'First day in LBRY? Start HERE!', + 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', + 'timestamp': 1595694354, + 'upload_date': '20200725', + } + }, { + # Audio + 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', + 'md5': 'c94017d3eba9b49ce085a8fad6b98d00', + 'info_dict': { + 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'ext': 'mp3', + 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', + 'description': 'md5:661ac4f1db09f31728931d7b88807a61', + 'timestamp': 1591312601, + 'upload_date': '20200604', + } + }] + + def _call_api_proxy(self, method, display_id, params): + return self._download_json( + 'https://api.lbry.tv/api/v1/proxy', display_id, + headers={'Content-Type': 'application/json-rpc'}, + data=json.dumps({ + 'method': method, + 'params': params, + }).encode())['result'] + + def _real_extract(self, url): + display_id = self._match_id(url).replace(':', '#') + uri = 'lbry://' + display_id + result = self._call_api_proxy( + 'resolve', display_id, {'urls': [uri]})[uri] + result_value = result['value'] + if result_value.get('stream_type') not in ('video', 'audio'): + raise ExtractorError('Unsupported URL', expected=True) + streaming_url = self._call_api_proxy( + 'get', display_id, {'uri': uri})['streaming_url'] + source = result_value.get('source') or {} + media = result_value.get('video') or result_value.get('audio') or {} + signing_channel = result_value.get('signing_channel') or {} + + return { + 'id': result['claim_id'], + 'title': result_value['title'], + 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), + 'description': result_value.get('description'), + 'license': result_value.get('license'), + 'timestamp': int_or_none(result.get('timestamp')), + 'tags': result_value.get('tags'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + 'duration': int_or_none(media.get('duration')), + 'channel': signing_channel.get('name'), + 'channel_id': signing_channel.get('claim_id'), + 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), + 'filesize': int_or_none(source.get('size')), + 'url': streaming_url, + } diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index f5c997ef4..89d549858 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -5,28 +5,26 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in (media.get('tags') or []): + tag_name = tag.get('name') + if not tag_name: continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + tags.append(tag_name) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) - - view_count = int_or_none(self._html_search_regex( - r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', - webpage, 'like count', fatal=False, group='count')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/youtube_dl/extractor/malltv.py +++ b/youtube_dl/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} + + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6f3cb3003..ea5f5a315 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - js_to_json, parse_duration, smuggle_url, try_get, @@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE): webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id, js_to_json) + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'bootstrap json'), video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 2447c812e..ddd828d92 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 207a6c247..bee2d53f5 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -16,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -141,6 +141,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'La Casa Bianca', 'season': '2016', }, + 'skip': 'This content is not available', }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -148,14 +149,12 @@ class RaiPlayIE(RaiBaseIE): 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', - 'season_number': 5, 'season': '2013/14', }, 'params': { @@ -167,48 +166,51 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + url, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + url.replace('.html', '.json'), video_id, 'Downloading video JSON') title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { 'id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -300,7 +302,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -316,7 +319,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '11959b4e44fa74de47011b5799490adf', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -338,6 +341,7 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20141221', }, + 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -360,6 +364,7 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index 9401bf2cf..1610ddc2c 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -12,20 +18,29 @@ class ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P<id>[aA]{2}-\w+|\d+-\d+) ''' _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -40,30 +55,94 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c342..000000000 --- a/youtube_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 6030b7cb5..10b817760 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class URPlayIE(InfoExtractor): @@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -35,37 +39,58 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['currentProduct'] + episode = urplayer_data['title'] + raw_streaming_info = urplayer_data['streamingInfo']['raw'] + host = self._download_json( + 'http://streaming-loadbalancer.ur.se/loadbalancer.json', + video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + for k, v in raw_streaming_info.items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) - subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, - }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': self._og_search_description(webpage), - 'thumbnail': urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), - 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py index 54c7495cc..e3784e55f 100644 --- a/youtube_dl/extractor/usanetwork.py +++ b/youtube_dl/extractor/usanetwork.py @@ -1,74 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePassIE -from ..utils import ( - NO_DEFAULT, - smuggle_url, - update_url_query, -) +from .nbc import NBCIE -class USANetworkIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', - 'md5': '33c0d2ba381571b414024440d08d57fd', +class USANetworkIE(NBCIE): + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', 'info_dict': { - 'id': '3086229', + 'id': '4185302', 'ext': 'mp4', - 'title': 'HPE Cybersecurity', - 'description': 'The more we digitize our world, the more vulnerable we are.', - 'upload_date': '20160818', - 'timestamp': 1471535460, - 'uploader': 'NBCU-USA', + 'title': 'Intelligence (Trailer)', + 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', + 'upload_date': '20200715', + 'timestamp': 1594785600, + 'uploader': 'NBCU-MPAT', }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def _x(name, default=NO_DEFAULT): - return self._search_regex( - r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name, - webpage, name, default=default, group='value') - - video_id = _x('mpx-guid') - title = _x('episode-title') - mpx_account_id = _x('mpx-account-id', '2304992029') - - query = { - 'mbr': 'true', - } - if _x('is-full-episode', None) == '1': - query['manifest'] = 'm3u' - - if _x('is-entitlement', None) == '1': - adobe_pass = {} - drupal_settings = self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', fatal=False) - if drupal_settings: - drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) - if drupal_settings: - adobe_pass = drupal_settings.get('adobePass', {}) - resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'usa'), - title, video_id, _x('episode-rating', 'TV-14')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) - - info = self._search_json_ld(webpage, video_id, default={}) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), - query), {'force_smil_url': True}), - 'id': video_id, - 'title': title, - 'series': _x('show-title', None), - 'episode': title, - 'ie_key': 'ThePlatform', - }) - return info + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index f79531e6f..df1dc78dd 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,25 +1,30 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time import itertools +import json -from .common import InfoExtractor from .naver import NaverBaseIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, + int_or_none, merge_dicts, - remove_start, try_get, urlencode_postdata, ) -class VLiveIE(NaverBaseIE): +class VLiveBaseIE(NaverBaseIE): + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ 'url': 'http://www.vlive.tv/video/1326', @@ -27,7 +32,7 @@ class VLiveIE(NaverBaseIE): 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "[V LIVE] Girl's Day's Broadcast", + 'title': "Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', @@ -37,7 +42,7 @@ class VLiveIE(NaverBaseIE): 'info_dict': { 'id': '16937', 'ext': 'mp4', - 'title': '[V LIVE] 첸백시 걍방', + 'title': '첸백시 걍방', 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', @@ -58,12 +63,11 @@ class VLiveIE(NaverBaseIE): 'subtitles': 'mincount:10', }, 'skip': 'This video is only available for CH+ subscribers', + }, { + 'url': 'https://www.vlive.tv/embed/1326', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) - def _real_initialize(self): self._login() @@ -95,173 +99,122 @@ class VLiveIE(NaverBaseIE): if not is_logged_in(): raise ExtractorError('Unable to log in', expected=True) + def _call_api(self, path_template, video_id, fields=None): + query = {'appId': self._APP_ID} + if fields: + query['fields'] = fields + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.vlive.tv/video/%s' % video_id, video_id) + try: + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode())['message']) + raise - VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' - VIDEO_PARAMS_FIELD = 'video params' + video = post['officialVideo'] - params = self._parse_json(self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, - transform_source=lambda s: '[' + s + ']', fatal=False) + def get_common_fields(): + channel = post.get('channel') or {} + return { + 'title': video.get('title'), + 'creator': post.get('author', {}).get('nickname'), + 'channel': channel.get('channelName'), + 'channel_id': channel.get('channelCode'), + 'duration': int_or_none(video.get('playTime')), + 'view_count': int_or_none(video.get('playCount')), + 'like_count': int_or_none(video.get('likeCount')), + 'comment_count': int_or_none(video.get('commentCount')), + } - if not params or len(params) < 7: - params = self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) - params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] - - status, long_video_id, key = params[2], params[5], params[6] - status = remove_start(status, 'PRODUCT_') - - if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): - return self._live(video_id, webpage) - elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): - return self._replay(video_id, webpage, long_video_id, key) - - if status == 'LIVE_END': - raise ExtractorError('Uploading for replay. Please wait...', - expected=True) - elif status == 'COMING_SOON': - raise ExtractorError('Coming soon!', expected=True) - elif status == 'CANCELED': - raise ExtractorError('We are sorry, ' - 'but the live broadcast has been canceled.', - expected=True) - elif status == 'ONLY_APP': - raise ExtractorError('Unsupported video type', expected=True) - else: - raise ExtractorError('Unknown status %s' % status) - - def _get_common_fields(self, webpage): - title = self._og_search_title(webpage) - creator = self._html_search_regex( - r'<div[^>]+class="info_area"[^>]*>\s*(?:<em[^>]*>.*?</em\s*>\s*)?<a\s+[^>]*>([^<]+)', - webpage, 'creator', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - return { - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - } - - def _live(self, video_id, webpage): - init_page = self._download_init_page(video_id) - - live_params = self._search_regex( - r'"liveStreamInfo"\s*:\s*(".*"),', - init_page, 'live stream info') - live_params = self._parse_json(live_params, video_id) - live_params = self._parse_json(live_params, video_id) - - formats = [] - for vid in live_params.get('resolutions', []): - formats.extend(self._extract_m3u8_formats( - vid['cdnUrl'], video_id, 'mp4', - m3u8_id=vid.get('name'), - fatal=False, live=True)) - self._sort_formats(formats) - - info = self._get_common_fields(webpage) - info.update({ - 'title': self._live_title(info['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - - def _replay(self, video_id, webpage, long_video_id, key): - if '' in (long_video_id, key): - init_page = self._download_init_page(video_id) - video_info = self._parse_json(self._search_regex( - (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script', - r'(?s)oVideoStatus\s*=\s*({.+})'), init_page, 'video info'), - video_id) - if video_info.get('status') == 'NEED_CHANNEL_PLUS': - self.raise_login_required( - 'This video is only available for CH+ subscribers') - long_video_id, key = video_info['vid'], video_info['inkey'] - - return merge_dicts( - self._get_common_fields(webpage), - self._extract_video_info(video_id, long_video_id, key)) - - def _download_init_page(self, video_id): - return self._download_webpage( - 'https://www.vlive.tv/video/init/view', - video_id, note='Downloading live webpage', - data=urlencode_postdata({'videoSeq': video_id}), - headers={ - 'Referer': 'https://www.vlive.tv/video/%s' % video_id, - 'Content-Type': 'application/x-www-form-urlencoded' - }) + video_type = video.get('type') + if video_type == 'VOD': + inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] + vod_id = video['vodId'] + return merge_dicts( + get_common_fields(), + self._extract_video_info(video_id, vod_id, inkey)) + elif video_type == 'LIVE': + status = video.get('status') + if status == 'ON_AIR': + stream_url = self._call_api( + 'old/v3/live/%s/playInfo', + video_id)['result']['adaptiveStreamUrl'] + formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') + info = get_common_fields() + info.update({ + 'title': self._live_title(video['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + elif status == 'ENDED': + raise ExtractorError( + 'Uploading for replay. Please wait...', expected=True) + elif status == 'RESERVED': + raise ExtractorError('Coming soon!', expected=True) + elif video.get('exposeStatus') == 'CANCEL': + raise ExtractorError( + 'We are sorry, but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status ' + status) -class VLiveChannelIE(InfoExtractor): +class VLiveChannelIE(VLiveBaseIE): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' + _TESTS = [{ 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', }, 'playlist_mincount': 110 - } - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B', + 'only_matching': True, + }] + + def _call_api(self, path, channel_key_suffix, channel_value, note, query): + q = { + 'app_id': self._APP_ID, + 'channel' + channel_key_suffix: channel_value, + } + q.update(query) + return self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, + channel_value, note='Downloading ' + note, query=q)['result'] def _real_extract(self, url): channel_code = self._match_id(url) - webpage = self._download_webpage( - 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + channel_seq = self._call_api( + 'decodeChannelCode', 'Code', channel_code, + 'decode channel code', {})['channelSeq'] - app_id = None - - app_js_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1', - webpage, 'app js', default=None, group='url') - - if app_js_url: - app_js = self._download_webpage( - app_js_url, channel_code, 'Downloading app JS', fatal=False) - if app_js: - app_id = self._search_regex( - r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', - app_js, 'app id', default=None) - - app_id = app_id or self._APP_ID - - channel_info = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', - channel_code, note='Downloading decode channel code', - query={ - 'app_id': app_id, - 'channelCode': channel_code, - '_': int(time.time()) - }) - - channel_seq = channel_info['result']['channelSeq'] channel_name = None entries = [] for page_num in itertools.count(1): - video_list = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', - channel_code, note='Downloading channel list page #%d' % page_num, - query={ - 'app_id': app_id, - 'channelSeq': channel_seq, + video_list = self._call_api( + 'getChannelVideoList', 'Seq', channel_seq, + 'channel list page #%d' % page_num, { # Large values of maxNumOfRows (~300 or above) may cause # empty responses (see [1]), e.g. this happens for [2] that # has more than 300 videos. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 # 2. http://channels.vlive.tv/EDBF. 'maxNumOfRows': 100, - '_': int(time.time()), 'pageNo': page_num } ) @@ -269,11 +222,11 @@ class VLiveChannelIE(InfoExtractor): if not channel_name: channel_name = try_get( video_list, - lambda x: x['result']['channelInfo']['channelName'], + lambda x: x['channelInfo']['channelName'], compat_str) videos = try_get( - video_list, lambda x: x['result']['videoList'], list) + video_list, lambda x: x['videoList'], list) if not videos: break @@ -289,79 +242,3 @@ class VLiveChannelIE(InfoExtractor): return self.playlist_result( entries, channel_code, channel_name) - - -class VLivePlaylistIE(InfoExtractor): - IE_NAME = 'vlive:playlist' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' - _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' - _TESTS = [{ - # regular working playlist - 'url': 'https://www.vlive.tv/video/117956/playlist/117963', - 'info_dict': { - 'id': '117963', - 'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들' - }, - 'playlist_mincount': 10 - }, { - # playlist with no playlistVideoSeqs - 'url': 'http://www.vlive.tv/video/22867/playlist/22912', - 'info_dict': { - 'id': '22867', - 'ext': 'mp4', - 'title': '[V LIVE] Valentine Day Message from MINA', - 'creator': 'TWICE', - 'view_count': int - }, - 'params': { - 'skip_download': True, - } - }] - - def _build_video_result(self, video_id, message): - self.to_screen(message) - return self.url_result( - self._VIDEO_URL_TEMPLATE % video_id, - ie=VLiveIE.ie_key(), video_id=video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, playlist_id = mobj.group('video_id', 'id') - - if self._downloader.params.get('noplaylist'): - return self._build_video_result( - video_id, - 'Downloading just video %s because of --no-playlist' - % video_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % playlist_id) - - webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s/playlist/%s' - % (video_id, playlist_id), playlist_id) - - raw_item_ids = self._search_regex( - r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, - 'playlist video seqs', default=None, fatal=False) - - if not raw_item_ids: - return self._build_video_result( - video_id, - 'Downloading just video %s because no playlist was found' - % video_id) - - item_ids = self._parse_json(raw_item_ids, playlist_id) - - entries = [ - self.url_result( - self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), - video_id=compat_str(item_id)) - for item_id in item_ids] - - playlist_name = self._html_search_regex( - r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', - webpage, 'playlist title', fatal=False) - - return self.playlist_result(entries, playlist_id, playlist_name) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 01b253dcb..18969058f 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -90,7 +90,7 @@ class XTubeIE(InfoExtractor): title, thumbnail, duration = [None] * 3 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 696dec2c1..79f87aa85 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -45,6 +45,7 @@ from ..utils import ( unescapeHTML, unified_strdate, unsmuggle_url, + update_url_query, uppercase_escape, url_or_none, urlencode_postdata, @@ -65,7 +66,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -509,48 +510,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'end_time': 9, } }, - { - 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', - 'note': 'Test generic use_cipher_signature video (#897)', - 'info_dict': { - 'id': 'UxxajLWwzqY', - 'ext': 'mp4', - 'upload_date': '20120506', - 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:19a2f98d9032b9311e686ed039564f63', - 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', - 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', - 'iconic ep', 'iconic', 'love', 'it'], - 'duration': 180, - 'uploader': 'Icona Pop', - 'uploader_id': 'IconaPop', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'creator': 'Icona Pop', - 'track': 'I Love It (feat. Charli XCX)', - 'artist': 'Icona Pop', - } - }, - { - 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', - 'note': 'Test VEVO video with age protection (#956)', - 'info_dict': { - 'id': '07FYdnEawAQ', - 'ext': 'mp4', - 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', - 'alt_title': 'Tunnel Vision', - 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', - 'duration': 419, - 'uploader': 'justintimberlakeVEVO', - 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision', - 'artist': 'Justin Timberlake', - 'age_limit': 18, - } - }, { 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', 'note': 'Embed-only video (#1746)', @@ -567,7 +526,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } }, { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -626,24 +585,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format': '141/bestaudio[ext=m4a]', }, }, - # JS player signature function name containing $ - { - 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', - 'info_dict': { - 'id': 'nfWlot6h_JM', - 'ext': 'm4a', - 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:307195cd21ff7fa352270fe884570ef0', - 'duration': 242, - 'uploader': 'TaylorSwiftVEVO', - 'uploader_id': 'TaylorSwiftVEVO', - 'upload_date': '20140818', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -675,22 +616,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 18, }, }, - # Age-gate video with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', - 'info_dict': { - 'id': '6kLq3WMV1nU', - 'ext': 'mp4', - 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', - 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 246, - 'uploader': 'LloydVEVO', - 'uploader_id': 'LloydVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', - 'upload_date': '20110629', - 'age_limit': 18, - }, - }, # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # YouTube Red ad is not captured for creator { @@ -1050,10 +975,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'sJL6WA-aGkQ', 'only_matching': True, }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, @@ -1105,73 +1026,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'only_matching': True, @@ -1608,21 +1462,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_chapters_from_json(self, webpage, video_id, duration): if not webpage: return - player = self._parse_json( - self._search_regex( - r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, - 'player args', default='{}'), - video_id, fatal=False) - if not player or not isinstance(player, dict): - return - watch_next_response = player.get('watch_next_response') - if not isinstance(watch_next_response, compat_str): - return - response = self._parse_json(watch_next_response, video_id, fatal=False) - if not response or not isinstance(response, dict): + data = self._extract_yt_initial_data(video_id, webpage) + if not data or not isinstance(data, dict): return chapters_list = try_get( - response, + data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] @@ -2504,7 +2348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|playlist\?.*?\blist=)(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)' IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2514,6 +2358,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, }, { # playlists, multipage, different order @@ -2522,14 +2367,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', 'playlist_mincount': 4, 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', } }, { 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', @@ -2560,6 +2407,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 2, }, { @@ -2568,6 +2416,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 975, }, { @@ -2576,6 +2425,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 199, }, { @@ -2584,6 +2434,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 17, }, { @@ -2592,6 +2443,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 18, }, { @@ -2600,6 +2452,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 138, }, { @@ -2618,7 +2471,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', }, 'playlist_count': 96, }, { @@ -2628,7 +2481,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 1123, }, { @@ -2642,7 +2495,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, }, { @@ -2651,13 +2504,43 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'Data Analysis with Dr Mike Pound', 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'uploader': 'Computerphile', }, 'playlist_mincount': 11, }, { 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', 'only_matching': True, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, }] @classmethod @@ -2688,7 +2571,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _extract_video(self, renderer): video_id = renderer.get('videoId') title = try_get( - renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, + (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) description = try_get( renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) @@ -2696,8 +2581,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): renderer, lambda x: x['lengthText']['simpleText'], compat_str)) view_count_text = try_get( renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = int_or_none(self._search_regex( - r'^(\d+)', re.sub(r'\s', '', view_count_text), + view_count = str_to_int(self._search_regex( + r'^([\d,]+)', re.sub(r'\s', '', view_count_text), 'view count', default=None)) uploader = try_get( renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) @@ -2768,7 +2653,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for content in video_list_renderer['contents']: if not isinstance(content, dict): continue - renderer = content.get('playlistVideoRenderer') + renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') if not isinstance(renderer, dict): continue video_id = renderer.get('videoId') @@ -2866,9 +2751,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'itct': ctp, } - def _entries(self, tab): + def _entries(self, tab, identity_token): continuation = None - slr_contents = tab['sectionListRenderer']['contents'] + slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] for slr_content in slr_contents: if not isinstance(slr_content, dict): continue @@ -2911,16 +2796,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not continuation: continuation = self._extract_continuation(is_renderer) + headers = { + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '2.20201112.04.01', + } + if identity_token: + headers['x-youtube-identity-token'] = identity_token + for page_num in itertools.count(1): if not continuation: break browse = self._download_json( 'https://www.youtube.com/browse_ajax', None, 'Downloading page %d' % page_num, - headers={ - 'x-youtube-client-name': '1', - 'x-youtube-client-version': '2.20201030.01.00', - }, query=continuation, fatal=False) + headers=headers, query=continuation, fatal=False) if not browse: break response = try_get(browse, lambda x: x[1]['response'], dict) @@ -2973,57 +2862,110 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): else: raise ExtractorError('Unable to find selected tab') + @staticmethod + def _extract_uploader(data): + uploader = {} + sidebar_renderer = try_get( + data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) + if sidebar_renderer: + for item in sidebar_renderer: + if not isinstance(item, dict): + continue + renderer = item.get('playlistSidebarSecondaryInfoRenderer') + if not isinstance(renderer, dict): + continue + owner = try_get( + renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) + if owner: + uploader['uploader'] = owner.get('text') + uploader['uploader_id'] = try_get( + owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) + uploader['uploader_url'] = urljoin( + 'https://www.youtube.com/', + try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + return uploader + + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + selected_tab = self._extract_selected_tab(tabs) + renderer = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + if renderer: + channel_title = renderer.get('title') or item_id + tab_title = selected_tab.get('title') + title = channel_title or item_id + if tab_title: + title += ' - %s' % tab_title + description = renderer.get('description') + playlist_id = renderer.get('externalId') + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + description = None + playlist_id = item_id + playlist = self.playlist_result( + self._entries(selected_tab['content'], identity_token), + playlist_id=playlist_id, playlist_title=title, + playlist_description=description) + playlist.update(self._extract_uploader(data)) + return playlist + + def _extract_from_playlist(self, item_id, data, playlist): + title = playlist.get('title') or try_get( + data, lambda x: x['titleText']['simpleText'], compat_str) + playlist_id = playlist.get('playlistId') or item_id + return self.playlist_result( + self._playlist_entries(playlist), playlist_id=playlist_id, + playlist_title=title) + def _real_extract(self, url): - channel_id = self._match_id(url) + item_id = self._match_id(url) url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) - webpage = self._download_webpage(url, channel_id) - data = self._extract_yt_initial_data(channel_id, webpage) - tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs'] - selected_tab = self._extract_selected_tab(tabs) - channel_title = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer']['title'], - compat_str) - channel_external_id = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer']['externalId'], - compat_str) - tab_title = selected_tab.get('title') - title = channel_title or channel_id - if tab_title: - title += ' - %s' % tab_title - return self.playlist_result( - self._entries(selected_tab['content']), - playlist_id=channel_external_id or channel_id, - playlist_title=title) + # Handle both video/playlist URLs + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('v', [None])[0] + playlist_id = qs.get('list', [None])[0] + if video_id and playlist_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + webpage = self._download_webpage(url, item_id) + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) + data = self._extract_yt_initial_data(item_id, webpage) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + playlist = try_get( + data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + if playlist: + return self._extract_from_playlist(item_id, data, playlist) + # Fallback to video extraction if no playlist alike page is recognized + if video_id: + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + # Failed to recognize + raise ExtractorError('Unable to recognize tab page') class YoutubePlaylistIE(InfoExtractor): IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: + _VALID_URL = r'''(?x)(?: (?:https?://)? (?:\w+\.)? (?: (?: youtube(?:kids)?\.com| - invidio\.us + invidio\.us| + youtu\.be ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + /.*?\?.*?\blist= + )? + (?P<id>%(playlist_id)s) + )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} IE_NAME = 'youtube:playlist' _TESTS = [{ 'note': 'issue #673', @@ -3032,7 +2974,7 @@ class YoutubePlaylistIE(InfoExtractor): 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', 'uploader': 'Wickydoo', - 'uploader_id': 'Wickydoo', + 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', }, 'playlist_mincount': 29, }, { @@ -3060,41 +3002,8 @@ class YoutubePlaylistIE(InfoExtractor): 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'uploader': 'LBK', - 'uploader_id': 'sdragonfang', + 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - }, - 'skip': 'This playlist does not exist', - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', 'info_dict': { @@ -3125,9 +3034,6 @@ class YoutubePlaylistIE(InfoExtractor): # music album playlist 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, }] @classmethod @@ -3136,13 +3042,12 @@ class YoutubePlaylistIE(InfoExtractor): YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) + playlist_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if not qs: + qs = {'list': playlist_id} return self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, + update_url_query('https://www.youtube.com/playlist', qs), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) @@ -3390,13 +3295,13 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'url': 'https://www.youtube.com/feed/watch_later', 'only_matching': True, }, { - 'url': 'https://www.youtube.com/feed/watch_later', + 'url': ':ytwatchlater', 'only_matching': True, }] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 77e3f046d..73b6710dc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4078,7 +4078,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4103,7 +4103,8 @@ def js_to_json(code): {comment}|,(?={skip}[\]}}])| (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:) + [0-9]+(?={skip}:)| + !+ '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04cd207ab..d4c6b936f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.12' +__version__ = '2020.11.18'