diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md
index a10c9fd83..64a2eb736 100644
--- a/.github/ISSUE_TEMPLATE/1_broken_site.md
+++ b/.github/ISSUE_TEMPLATE/1_broken_site.md
@@ -18,7 +18,7 @@ title: ''
- [ ] I'm reporting a broken site support
-- [ ] I've verified that I'm running youtube-dl version **2020.11.12**
+- [ ] I've verified that I'm running youtube-dl version **2020.11.18**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped
- [ ] I've searched the bugtracker for similar issues including closed ones
@@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v <
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
- [debug] youtube-dl version 2020.11.12
+ [debug] youtube-dl version 2020.11.18
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md
index 9cc120d3e..c0c789673 100644
--- a/.github/ISSUE_TEMPLATE/2_site_support_request.md
+++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md
@@ -19,7 +19,7 @@ labels: 'site-support-request'
- [ ] I'm reporting a new site support request
-- [ ] I've verified that I'm running youtube-dl version **2020.11.12**
+- [ ] I've verified that I'm running youtube-dl version **2020.11.18**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that none of provided URLs violate any copyrights
- [ ] I've searched the bugtracker for similar site support requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md
index 29bd5f5ac..c90dbf30c 100644
--- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md
@@ -18,13 +18,13 @@ title: ''
- [ ] I'm reporting a site feature request
-- [ ] I've verified that I'm running youtube-dl version **2020.11.12**
+- [ ] I've verified that I'm running youtube-dl version **2020.11.18**
- [ ] I've searched the bugtracker for similar site feature requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md
index cc33a993f..4c9d295de 100644
--- a/.github/ISSUE_TEMPLATE/4_bug_report.md
+++ b/.github/ISSUE_TEMPLATE/4_bug_report.md
@@ -18,7 +18,7 @@ title: ''
- [ ] I'm reporting a broken site support issue
-- [ ] I've verified that I'm running youtube-dl version **2020.11.12**
+- [ ] I've verified that I'm running youtube-dl version **2020.11.18**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped
- [ ] I've searched the bugtracker for similar bug reports including closed ones
@@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v <
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
- [debug] youtube-dl version 2020.11.12
+ [debug] youtube-dl version 2020.11.18
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md
index cd577ecef..a6040daee 100644
--- a/.github/ISSUE_TEMPLATE/5_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/5_feature_request.md
@@ -19,13 +19,13 @@ labels: 'request'
- [ ] I'm reporting a feature request
-- [ ] I've verified that I'm running youtube-dl version **2020.11.12**
+- [ ] I've verified that I'm running youtube-dl version **2020.11.18**
- [ ] I've searched the bugtracker for similar feature requests including closed ones
diff --git a/ChangeLog b/ChangeLog
index 1ef7ea7b6..4d404a56e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,43 @@
+version 2020.11.18
+
+Extractors
+* [spiegel] Fix extraction (#24206, #24767)
+* [youtube] Improve extraction
+ + Add support for --no-playlist (#27009)
+ * Improve playlist and mix extraction (#26390, #26509, #26534, #27011)
+ + Extract playlist uploader data
+* [youtube:tab] Fix view count extraction (#27051)
+* [malltv] Fix extraction (#27035)
++ [bandcamp] Extract playlist description (#22684)
+* [urplay] Fix extraction (#26828)
+* [youtube:tab] Fix playlist title extraction (#27015)
+* [youtube] Fix chapters extraction (#26005)
+
+
+version 2020.11.17
+
+Core
+* [utils] Skip ! prefixed code in js_to_json
+
+Extractors
+* [youtube:tab] Fix extraction with cookies provided (#27005)
+* [lrt] Fix extraction with empty tags (#20264)
++ [ndr:embed:base] Extract subtitles (#25447, #26106)
++ [servus] Add support for pm-wissen.com (#25869)
+* [servus] Fix extraction (#26872, #26967, #26983, #27000)
+* [xtube] Fix extraction (#26996)
+* [lrt] Fix extraction
++ [lbry] Add support for lbry.tv
++ [condenast] Extract subtitles
+* [condenast] Fix extraction
+* [bandcamp] Fix extraction (#26681, #26684)
+* [rai] Fix RaiPlay extraction (#26064, #26096)
+* [vlive] Fix extraction
+* [usanetwork] Fix extraction
+* [nbc] Fix NBCNews/Today/MSNBC extraction
+* [cnbc] Fix extraction
+
+
version 2020.11.12
Extractors
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 0a1762dbc..878ae72b1 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
return s
-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 0c77d017e..9f0cd6ff6 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -417,6 +417,7 @@
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
+ - **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
@@ -823,8 +824,6 @@
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- - **Spiegel:Article**: Articles on spiegel.de
- - **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
@@ -1042,7 +1041,6 @@
- **vk:wallpost**
- **vlive**
- **vlive:channel**
- - **vlive:playlist**
- **Vodlocker**
- **VODPl**
- **VODPlatform**
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 348744028..56a08bed8 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -31,16 +31,17 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
+ assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
assertPlaylist('PL63F0C78739B09958')
- # assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
+ assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- # assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
- assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
+ assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+ assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
- # assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
+ assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
diff --git a/test/test_utils.py b/test/test_utils.py
index c2d1e4fb1..925a21d34 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')
+ # Just drop ! prefix for now though this results in a wrong value
+ on = js_to_json('''{
+ a: !0,
+ b: !1,
+ c: !!0,
+ d: !!42.42,
+ e: !!![],
+ f: !"abc",
+ g: !"",
+ !42: 42
+ }''')
+ self.assertEqual(json.loads(on), {
+ 'a': 0,
+ 'b': 1,
+ 'c': 0,
+ 'd': 42.42,
+ 'e': [],
+ 'f': "abc",
+ 'g': "",
+ '42': 42
+ })
+
on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index f14b407dc..69e673a26 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import random
@@ -5,10 +6,7 @@ import re
import time
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@@ -17,30 +15,32 @@ from ..utils import (
parse_filesize,
str_or_none,
try_get,
- unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
+ urljoin,
)
class BandcampIE(InfoExtractor):
- _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+ _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
- 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+ 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
+ 'uploader': 'youtube-dl "\'/\\ä↭',
+ 'upload_date': '20121129',
+ 'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '853e35bf34aa1d6fe2615ae612564b36',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
@@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor):
},
}]
+ def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
+ return self._parse_json(self._html_search_regex(
+ r'data-%s=(["\'])({.+?})\1' % attr, webpage,
+ attr + ' data', group=2), video_id, fatal=fatal)
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
+ title = self._match_id(url)
webpage = self._download_webpage(url, title)
- thumbnail = self._html_search_meta('og:image', webpage, default=None)
+ tralbum = self._extract_data_attr(webpage, title)
+ thumbnail = self._og_search_thumbnail(webpage)
track_id = None
track = None
@@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor):
duration = None
formats = []
- track_info = self._parse_json(
- self._search_regex(
- r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
- webpage, 'track info', default='{}'), title)
+ track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
@@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor):
'abr': int_or_none(abr_str),
})
track = track_info.get('title')
- track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
+ track_id = str_or_none(
+ track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
- def extract(key):
- return self._search_regex(
- r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
- webpage, key, default=None, group='value')
-
- artist = extract('artist')
- album = extract('album_title')
+ embed = self._extract_data_attr(webpage, title, 'embed', False)
+ current = tralbum.get('current') or {}
+ artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
timestamp = unified_timestamp(
- extract('publish_date') or extract('album_publish_date'))
- release_date = unified_strdate(extract('album_release_date'))
+ current.get('publish_date') or tralbum.get('album_publish_date'))
- download_link = self._search_regex(
- r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
- 'download link', default=None, group='url')
+ download_link = tralbum.get('freeDownloadPage')
if download_link:
- track_id = self._search_regex(
- r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
- webpage, 'track id')
+ track_id = compat_str(tralbum['id'])
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
- 'blob', group='blob'),
- track_id, transform_source=unescapeHTML)
+ blob = self._extract_data_attr(download_webpage, track_id, 'blob')
info = try_get(
blob, (lambda x: x['digital_items'][0],
@@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor):
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
- 'release_date': release_date,
+ 'release_date': unified_strdate(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
- 'album': album,
+ 'album': embed.get('album_title'),
'formats': formats,
}
-class BandcampAlbumIE(InfoExtractor):
+class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor):
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
- 'title': 'Intro',
+ 'title': 'Blazo - Intro',
+ 'timestamp': 1311756226,
+ 'upload_date': '20110727',
+ 'uploader': 'Blazo',
}
},
{
@@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor):
'info_dict': {
'id': '38097443',
'ext': 'mp3',
- 'title': 'Kero One - Keep It Alive (Blazo remix)',
+ 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
+ 'timestamp': 1311757238,
+ 'upload_date': '20110727',
+ 'uploader': 'Blazo',
}
},
],
@@ -274,6 +270,7 @@ class BandcampAlbumIE(InfoExtractor):
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
+ 'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
@@ -283,6 +280,7 @@ class BandcampAlbumIE(InfoExtractor):
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
+ 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
@@ -294,41 +292,34 @@ class BandcampAlbumIE(InfoExtractor):
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- uploader_id = mobj.group('subdomain')
- album_id = mobj.group('album_id')
+ uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
- track_elements = re.findall(
- r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
- if not track_elements:
+ tralbum = self._extract_data_attr(webpage, playlist_id)
+ track_info = tralbum.get('trackinfo')
+ if not track_info:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [
self.url_result(
- compat_urlparse.urljoin(url, t_path),
- ie=BandcampIE.ie_key(),
- video_title=self._search_regex(
- r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
- elem_content, 'track title', fatal=False))
- for elem_content, t_path in track_elements
- if self._html_search_meta('duration', elem_content, default=None)]
+ urljoin(url, t['title_link']), BandcampIE.ie_key(),
+ str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
+ for t in track_info
+ if t.get('duration')]
+
+ current = tralbum.get('current') or {}
- title = self._html_search_regex(
- r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
- webpage, 'title', fatal=False)
- if title:
- title = title.replace(r'\"', '"')
return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
- 'title': title,
+ 'title': current.get('title'),
+ 'description': current.get('about'),
'entries': entries,
}
-class BandcampWeeklyIE(InfoExtractor):
+class BandcampWeeklyIE(BandcampIE):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
@@ -343,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
- 'episode_number': 208,
'episode_id': '224',
- }
+ },
+ 'params': {
+ 'format': 'opus-lo',
+ },
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
- 'blob', group='blob'),
- video_id, transform_source=unescapeHTML)
+ blob = self._extract_data_attr(webpage, show_id, 'blob')
- show = blob['bcw_show']
-
- # This is desired because any invalid show id redirects to `bandcamp.com`
- # which happens to expose the latest Bandcamp Weekly episode.
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+ show = blob['bcw_data'][show_id]
formats = []
for format_id, format_url in show['audio_stream'].items():
@@ -390,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor):
if subtitle:
title += ' - %s' % subtitle
- episode_number = None
- seq = blob.get('bcw_seq')
-
- if seq and isinstance(seq, list):
- try:
- episode_number = next(
- int_or_none(e.get('episode_number'))
- for e in seq
- if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
- except StopIteration:
- pass
-
return {
- 'id': video_id,
+ 'id': show_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
@@ -411,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
- 'episode_number': episode_number,
- 'episode_id': compat_str(video_id),
+ 'episode_id': show_id,
'formats': formats
}
diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py
index 6889b0f40..7b9f4536a 100644
--- a/youtube_dl/extractor/cnbc.py
+++ b/youtube_dl/extractor/cnbc.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..utils import smuggle_url
@@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):
class CNBCVideoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
'info_dict': {
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
}
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
- 'video id')
+ path, display_id = re.match(self._VALID_URL, url).groups()
+ video_id = self._download_json(
+ 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
+ 'query': '''{
+ page(path: "%s") {
+ vcpsId
+ }
+}''' % path,
+ })['data']['page']['vcpsId']
return self.url_result(
- 'http://video.cnbc.com/gallery/?video=%s' % video_id,
+ 'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index ed278fefc..d5e77af32 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -16,6 +16,8 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
+ strip_or_none,
+ try_get,
)
@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
+ 'description': 'How much grimmer would things be if these people were competent?',
},
}, {
# JS embed
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
'uploader': 'arstechnica',
'upload_date': '20150916',
- 'timestamp': 1442434955,
+ 'timestamp': 1442434920,
}
}, {
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)
+ subtitles = {}
+ for t, caption in video_info.get('captions', {}).items():
+ caption_url = caption.get('src')
+ if not (t in ('vtt', 'srt', 'tml') and caption_url):
+ continue
+ subtitles.setdefault('en', []).append({'url': caption_url})
+
return {
'id': video_id,
'formats': formats,
@@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
'categories': video_info.get('categories'),
+ 'subtitles': subtitles,
}
def _real_extract(self, url):
@@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- params = self._extract_video_params(webpage, display_id)
- info = self._search_json_ld(
- webpage, display_id, fatal=False)
+ video = try_get(self._parse_json(self._search_regex(
+ r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+ 'preload state', '{}'), display_id),
+ lambda x: x['transformed']['video'])
+ if video:
+ params = {'videoId': video['id']}
+ info = {'description': strip_or_none(video.get('description'))}
+ else:
+ params = self._extract_video_params(webpage, display_id)
+ info = self._search_json_ld(
+ webpage, display_id, fatal=False)
info.update(self._extract_video(params))
return info
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 9d7fecfe8..11ef47261 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -529,6 +529,7 @@ from .laola1tv import (
EHFTVIE,
ITTFIE,
)
+from .lbry import LBRYIE
from .lci import LCIIE
from .lcp import (
LcpPlayIE,
@@ -1053,8 +1054,7 @@ from .spankbang import (
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
-from .spiegel import SpiegelIE, SpiegelArticleIE
-from .spiegeltv import SpiegeltvIE
+from .spiegel import SpiegelIE
from .spike import (
BellatorIE,
ParamountNetworkIE,
@@ -1357,7 +1357,6 @@ from .vk import (
from .vlive import (
VLiveIE,
VLiveChannelIE,
- VLivePlaylistIE
)
from .vodlocker import VodlockerIE
from .vodpl import VODPlIE
diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py
new file mode 100644
index 000000000..587deac90
--- /dev/null
+++ b/youtube_dl/extractor/lbry.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ mimetype2ext,
+ try_get,
+)
+
+
+class LBRYIE(InfoExtractor):
+ IE_NAME = 'lbry.tv'
+ _VALID_URL = r'https?://(?:www\.)?lbry\.tv/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])'
+ _TESTS = [{
+ # Video
+ 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
+ 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
+ 'info_dict': {
+ 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
+ 'ext': 'mp4',
+ 'title': 'First day in LBRY? Start HERE!',
+ 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
+ 'timestamp': 1595694354,
+ 'upload_date': '20200725',
+ }
+ }, {
+ # Audio
+ 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
+ 'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
+ 'info_dict': {
+ 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+ 'ext': 'mp3',
+ 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
+ 'description': 'md5:661ac4f1db09f31728931d7b88807a61',
+ 'timestamp': 1591312601,
+ 'upload_date': '20200604',
+ }
+ }]
+
+ def _call_api_proxy(self, method, display_id, params):
+ return self._download_json(
+ 'https://api.lbry.tv/api/v1/proxy', display_id,
+ headers={'Content-Type': 'application/json-rpc'},
+ data=json.dumps({
+ 'method': method,
+ 'params': params,
+ }).encode())['result']
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url).replace(':', '#')
+ uri = 'lbry://' + display_id
+ result = self._call_api_proxy(
+ 'resolve', display_id, {'urls': [uri]})[uri]
+ result_value = result['value']
+ if result_value.get('stream_type') not in ('video', 'audio'):
+ raise ExtractorError('Unsupported URL', expected=True)
+ streaming_url = self._call_api_proxy(
+ 'get', display_id, {'uri': uri})['streaming_url']
+ source = result_value.get('source') or {}
+ media = result_value.get('video') or result_value.get('audio') or {}
+ signing_channel = result_value.get('signing_channel') or {}
+
+ return {
+ 'id': result['claim_id'],
+ 'title': result_value['title'],
+ 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
+ 'description': result_value.get('description'),
+ 'license': result_value.get('license'),
+ 'timestamp': int_or_none(result.get('timestamp')),
+ 'tags': result_value.get('tags'),
+ 'width': int_or_none(media.get('width')),
+ 'height': int_or_none(media.get('height')),
+ 'duration': int_or_none(media.get('duration')),
+ 'channel': signing_channel.get('name'),
+ 'channel_id': signing_channel.get('claim_id'),
+ 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
+ 'filesize': int_or_none(source.get('size')),
+ 'url': streaming_url,
+ }
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
index f5c997ef4..89d549858 100644
--- a/youtube_dl/extractor/lrt.py
+++ b/youtube_dl/extractor/lrt.py
@@ -5,28 +5,26 @@ import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- int_or_none,
- parse_duration,
- remove_end,
+ clean_html,
+ merge_dicts,
)
class LRTIE(InfoExtractor):
IE_NAME = 'lrt.lt'
- _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'
_TESTS = [{
# m3u8 download
- 'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
- 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
+ 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene',
+ 'md5': '85cb2bb530f31d91a9c65b479516ade4',
'info_dict': {
- 'id': '54391',
+ 'id': '2000127261',
'ext': 'mp4',
- 'title': 'Septynios Kauno dienos',
- 'description': 'md5:24d84534c7dc76581e59f5689462411a',
- 'duration': 1783,
- 'view_count': int,
- 'like_count': int,
+ 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė',
+ 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa',
+ 'duration': 3035,
+ 'timestamp': 1604079000,
+ 'upload_date': '20201030',
},
}, {
# direct mp3 download
@@ -43,52 +41,35 @@ class LRTIE(InfoExtractor):
},
}]
+ def _extract_js_var(self, webpage, var_name, default):
+ return self._search_regex(
+ r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name,
+ webpage, var_name.replace('_', ' '), default, group=2)
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ path, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._og_search_title(webpage), ' - LRT')
+ media_url = self._extract_js_var(webpage, 'main_url', path)
+ media = self._download_json(self._extract_js_var(
+ webpage, 'media_info_url',
+ 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'),
+ video_id, query={'url': media_url})
+ jw_data = self._parse_jwplayer_data(
+ media['playlist_item'], video_id, base_url=url)
- formats = []
- for _, file_url in re.findall(
- r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
- ext = determine_ext(file_url)
- if ext not in ('m3u8', 'mp3'):
+ json_ld_data = self._search_json_ld(webpage, video_id)
+
+ tags = []
+ for tag in (media.get('tags') or []):
+ tag_name = tag.get('name')
+ if not tag_name:
continue
- # mp3 served as m3u8 produces stuttered media file
- if ext == 'm3u8' and '.mp3' in file_url:
- continue
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- file_url, video_id, 'mp4', entry_protocol='m3u8_native',
- fatal=False))
- elif ext == 'mp3':
- formats.append({
- 'url': file_url,
- 'vcodec': 'none',
- })
- self._sort_formats(formats)
+ tags.append(tag_name)
- thumbnail = self._og_search_thumbnail(webpage)
- description = self._og_search_description(webpage)
- duration = parse_duration(self._search_regex(
- r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
- webpage, 'duration', default=None, group='duration'))
-
- view_count = int_or_none(self._html_search_regex(
- r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
- webpage, 'view count', fatal=False, group='count'))
- like_count = int_or_none(self._search_regex(
- r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
- webpage, 'like count', fatal=False, group='count'))
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
- 'duration': duration,
- 'view_count': view_count,
- 'like_count': like_count,
+ clean_info = {
+ 'description': clean_html(media.get('content')),
+ 'tags': tags,
}
+
+ return merge_dicts(clean_info, jw_data, json_ld_data)
diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py
index 6f4fd927f..fadfd9338 100644
--- a/youtube_dl/extractor/malltv.py
+++ b/youtube_dl/extractor/malltv.py
@@ -1,10 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import merge_dicts
+from ..utils import (
+ clean_html,
+ dict_get,
+ float_or_none,
+ int_or_none,
+ merge_dicts,
+ parse_duration,
+ try_get,
+)
class MallTVIE(InfoExtractor):
@@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor):
'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'ext': 'mp4',
'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
- 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
+ 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
'duration': 216,
'timestamp': 1538870400,
'upload_date': '20181007',
@@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor):
webpage = self._download_webpage(
url, display_id, headers=self.geo_verification_headers())
- SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
+ video = self._parse_json(self._search_regex(
+ r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
+ webpage, 'video object'), display_id)
+ video_source = video['VideoSource']
video_id = self._search_regex(
- SOURCE_RE, webpage, 'video id', group='id')
+ r'/([\da-z]+)/index\b', video_source, 'video id')
- media = self._parse_html5_media_entries(
- url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
- m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
+ formats = self._extract_m3u8_formats(
+ video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for s in (video.get('Subtitles') or {}):
+ s_url = s.get('Url')
+ if not s_url:
+ continue
+ subtitles.setdefault(s.get('Language') or 'cz', []).append({
+ 'url': s_url,
+ })
+
+ entity_counts = video.get('EntityCounts') or {}
+
+ def get_count(k):
+ v = entity_counts.get(k + 's') or {}
+ return int_or_none(dict_get(v, ('Count', 'StrCount')))
info = self._search_json_ld(webpage, video_id, default={})
- return merge_dicts(media, info, {
+ return merge_dicts({
'id': video_id,
'display_id': display_id,
- 'title': self._og_search_title(webpage, default=None) or display_id,
- 'description': self._og_search_description(webpage, default=None),
- 'thumbnail': self._og_search_thumbnail(webpage, default=None),
- })
+ 'title': video.get('Title'),
+ 'description': clean_html(video.get('Description')),
+ 'thumbnail': video.get('ThumbnailUrl'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
+ 'view_count': get_count('View'),
+ 'like_count': get_count('Like'),
+ 'dislike_count': get_count('Dislike'),
+ 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
+ 'comment_count': get_count('Comment'),
+ }, info)
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 6f3cb3003..ea5f5a315 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -10,7 +10,6 @@ from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
- js_to_json,
parse_duration,
smuggle_url,
try_get,
@@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE):
webpage = self._download_webpage(url, video_id)
data = self._parse_json(self._search_regex(
- r'window\.__data\s*=\s*({.+});', webpage,
- 'bootstrap json'), video_id, js_to_json)
+ r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+ webpage, 'bootstrap json'), video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index 2447c812e..ddd828d92 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE):
'params': {
'skip_download': True,
},
+ }, {
+ # with subtitles
+ 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
+ 'info_dict': {
+ 'id': 'extra18674',
+ 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
+ 'ext': 'mp4',
+ 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
+ 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
+ 'uploader': 'ndrtv',
+ 'upload_date': '20201113',
+ 'duration': 1749,
+ 'subtitles': {
+ 'de': [{
+ 'ext': 'ttml',
+ 'url': r're:^https://www\.ndr\.de.+',
+ }],
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
'only_matching': True,
@@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor):
'preference': quality_key(thumbnail.get('quality')),
})
+ subtitles = {}
+ tracks = config.get('tracks')
+ if tracks and isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ track_url = urljoin(url, track.get('src'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('srclang') or 'de', []).append({
+ 'url': track_url,
+ 'ext': 'ttml',
+ })
+
return {
'id': video_id,
'title': title,
@@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor):
'duration': duration,
'thumbnails': thumbnails,
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index 207a6c247..bee2d53f5 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -16,7 +17,6 @@ from ..utils import (
int_or_none,
parse_duration,
strip_or_none,
- try_get,
unescapeHTML,
unified_strdate,
unified_timestamp,
@@ -141,6 +141,7 @@ class RaiPlayIE(RaiBaseIE):
'series': 'La Casa Bianca',
'season': '2016',
},
+ 'skip': 'This content is not available',
}, {
'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
@@ -148,14 +149,12 @@ class RaiPlayIE(RaiBaseIE):
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
'ext': 'mp4',
'title': 'Report del 07/04/2014',
- 'alt_title': 'S2013/14 - Puntata del 07/04/2014',
- 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
+ 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
+ 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'Rai 5',
- 'creator': 'Rai 5',
+ 'uploader': 'Rai Gulp',
'duration': 6160,
'series': 'Report',
- 'season_number': 5,
'season': '2013/14',
},
'params': {
@@ -167,48 +166,51 @@ class RaiPlayIE(RaiBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- url, video_id = mobj.group('url', 'id')
+ url, video_id = re.match(self._VALID_URL, url).groups()
media = self._download_json(
- '%s?json' % url, video_id, 'Downloading video JSON')
+ url.replace('.html', '.json'), video_id, 'Downloading video JSON')
title = media['name']
video = media['video']
- relinker_info = self._extract_relinker_info(video['contentUrl'], video_id)
+ relinker_info = self._extract_relinker_info(video['content_url'], video_id)
self._sort_formats(relinker_info['formats'])
thumbnails = []
- if 'images' in media:
- for _, value in media.get('images').items():
- if value:
- thumbnails.append({
- 'url': value.replace('[RESOLUTION]', '600x400')
- })
+ for _, value in media.get('images', {}).items():
+ if value:
+ thumbnails.append({
+ 'url': urljoin(url, value),
+ })
- timestamp = unified_timestamp(try_get(
- media, lambda x: x['availabilities'][0]['start'], compat_str))
+ date_published = media.get('date_published')
+ time_published = media.get('time_published')
+ if date_published and time_published:
+ date_published += ' ' + time_published
subtitles = self._extract_subtitles(url, video.get('subtitles'))
+ program_info = media.get('program_info') or {}
+ season = media.get('season')
+
info = {
'id': video_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
- 'alt_title': media.get('subtitle'),
+ 'alt_title': strip_or_none(media.get('subtitle')),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
- 'creator': strip_or_none(media.get('editor')),
+ 'creator': strip_or_none(media.get('editor') or None),
'duration': parse_duration(video.get('duration')),
- 'timestamp': timestamp,
+ 'timestamp': unified_timestamp(date_published),
'thumbnails': thumbnails,
- 'series': try_get(
- media, lambda x: x['isPartOf']['name'], compat_str),
- 'season_number': int_or_none(try_get(
- media, lambda x: x['isPartOf']['numeroStagioni'])),
- 'season': media.get('stagione') or None,
+ 'series': program_info.get('name'),
+ 'season_number': int_or_none(season),
+ 'season': season if (season and not season.isdigit()) else None,
+ 'episode': media.get('episode_title'),
+ 'episode_number': int_or_none(media.get('episode')),
'subtitles': subtitles,
}
@@ -300,7 +302,8 @@ class RaiIE(RaiBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1758,
'upload_date': '20140612',
- }
+ },
+ 'skip': 'This content is available only in Italy',
}, {
# with ContentItem in many metas
'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@@ -316,7 +319,7 @@ class RaiIE(RaiBaseIE):
}, {
# with ContentItem in og:url
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
- 'md5': '11959b4e44fa74de47011b5799490adf',
+ 'md5': '6865dd00cf0bbf5772fdd89d59bd768a',
'info_dict': {
'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
'ext': 'mp4',
@@ -338,6 +341,7 @@ class RaiIE(RaiBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20141221',
},
+ 'skip': 'This content is not available',
}, {
# initEdizione('ContentItem-...'
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
@@ -360,6 +364,7 @@ class RaiIE(RaiBaseIE):
'params': {
'skip_download': True,
},
+ 'skip': 'This content is available only in Italy',
}, {
# HLS live stream with ContentItem in og:url
'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py
index 9401bf2cf..1610ddc2c 100644
--- a/youtube_dl/extractor/servus.py
+++ b/youtube_dl/extractor/servus.py
@@ -1,9 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ unified_timestamp,
+ urlencode_postdata,
+ url_or_none,
+)
class ServusIE(InfoExtractor):
@@ -12,20 +18,29 @@ class ServusIE(InfoExtractor):
(?:www\.)?
(?:
servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
- servustv\.com/videos
+ (?:servustv|pm-wissen)\.com/videos
)
/(?P<id>[aA]{2}-\w+|\d+-\d+)
'''
_TESTS = [{
# new URL schema
'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
- 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
+ 'md5': '60474d4c21f3eb148838f215c37f02b9',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Sicht des Volkes',
+ 'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 62.442,
+ 'timestamp': 1605193976,
+ 'upload_date': '20201112',
+ 'series': 'Talk im Hangar-7',
+ 'season': 'Season 9',
+ 'season_number': 9,
+ 'episode': 'Episode 31 - September 14',
+ 'episode_number': 31,
}
}, {
# old URL schema
@@ -40,30 +55,94 @@ class ServusIE(InfoExtractor):
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url).upper()
- webpage = self._download_webpage(url, video_id)
- title = self._search_regex(
- (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
- webpage, 'title', default=None,
- group='title') or self._og_search_title(webpage)
- title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+ token = self._download_json(
+ 'https://auth.redbullmediahouse.com/token', video_id,
+ 'Downloading token', data=urlencode_postdata({
+ 'grant_type': 'client_credentials',
+ }), headers={
+ 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
+ })
+ access_token = token['access_token']
+ token_type = token.get('token_type', 'Bearer')
- formats = self._extract_m3u8_formats(
- 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
- video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+ video = self._download_json(
+ 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
+ video_id, 'Downloading video JSON', headers={
+ 'Authorization': '%s %s' % (token_type, access_token),
+ })
+
+ formats = []
+ thumbnail = None
+ for resource in video['resources']:
+ if not isinstance(resource, dict):
+ continue
+ format_url = url_or_none(resource.get('url'))
+ if not format_url:
+ continue
+ extension = resource.get('extension')
+ type_ = resource.get('type')
+ if extension == 'jpg' or type_ == 'reference_keyframe':
+ thumbnail = format_url
+ continue
+ ext = determine_ext(format_url)
+ if type_ == 'dash' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False))
+ elif type_ == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif extension == 'mp4' or ext == 'mp4':
+ formats.append({
+ 'url': format_url,
+ 'format_id': type_,
+ 'width': int_or_none(resource.get('width')),
+ 'height': int_or_none(resource.get('height')),
+ })
self._sort_formats(formats)
+ attrs = {}
+ for attribute in video['attributes']:
+ if not isinstance(attribute, dict):
+ continue
+ key = attribute.get('fieldKey')
+ value = attribute.get('fieldValue')
+ if not key or not value:
+ continue
+ attrs[key] = value
+
+ title = attrs.get('title_stv') or video_id
+ alt_title = attrs.get('title')
+ description = attrs.get('long_description') or attrs.get('short_description')
+ series = attrs.get('label')
+ season = attrs.get('season')
+ episode = attrs.get('chapter')
+ duration = float_or_none(attrs.get('duration'), scale=1000)
+ season_number = int_or_none(self._search_regex(
+ r'Season (\d+)', season or '', 'season number', default=None))
+ episode_number = int_or_none(self._search_regex(
+ r'Episode (\d+)', episode or '', 'episode number', default=None))
+
return {
'id': video_id,
'title': title,
+ 'alt_title': alt_title,
'description': description,
'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': unified_timestamp(video.get('lastPublished')),
+ 'series': series,
+ 'season': season,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
'formats': formats,
}
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 4df7f4ddc..2da32b9b2 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -1,159 +1,54 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .nexx import (
- NexxIE,
- NexxEmbedIE,
-)
-from .spiegeltv import SpiegeltvIE
-from ..compat import compat_urlparse
-from ..utils import (
- parse_duration,
- strip_or_none,
- unified_timestamp,
-)
+from .jwplatform import JWPlatformIE
class SpiegelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
- 'md5': 'b57399839d055fccfeb9a0455c439868',
+ 'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
'info_dict': {
- 'id': '563747',
+ 'id': 'II0BUyxY',
+ 'display_id': '1259285',
'ext': 'mp4',
- 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
+ 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft',
'description': 'md5:8029d8310232196eb235d27575a8b9f4',
- 'duration': 49,
+ 'duration': 48.0,
'upload_date': '20130311',
- 'timestamp': 1362994320,
+ 'timestamp': 1362997920,
},
}, {
'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
- 'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
- 'info_dict': {
- 'id': '580988',
- 'ext': 'mp4',
- 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
- 'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
- 'duration': 983,
- 'upload_date': '20131115',
- 'timestamp': 1384546642,
- },
- }, {
- 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
- 'md5': '97b91083a672d72976faa8433430afb9',
- 'info_dict': {
- 'id': '601883',
- 'ext': 'mp4',
- 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
- 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
- 'upload_date': '20140904',
- 'timestamp': 1409834160,
- }
- }, {
- 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
'only_matching': True,
}, {
- # nexx video
+ 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7',
+ 'only_matching': True,
+ }, {
'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',
'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
- handle = self._request_webpage(metadata_url, video_id)
-
- # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
- if SpiegeltvIE.suitable(handle.geturl()):
- return self.url_result(handle.geturl(), 'Spiegeltv')
-
- video_data = self._parse_json(self._webpage_read_content(
- handle, metadata_url, video_id), video_id)
- title = video_data['title']
- nexx_id = video_data['nexxOmniaId']
- domain_id = video_data.get('nexxOmniaDomain') or '748'
-
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'url': 'nexx:%s:%s' % (domain_id, nexx_id),
- 'title': title,
- 'description': strip_or_none(video_data.get('teaser')),
- 'duration': parse_duration(video_data.get('duration')),
- 'timestamp': unified_timestamp(video_data.get('datum')),
- 'ie_key': NexxIE.ie_key(),
- }
-
-
-class SpiegelArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
- IE_NAME = 'Spiegel:Article'
- IE_DESC = 'Articles on spiegel.de'
- _TESTS = [{
+ }, {
'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
- 'info_dict': {
- 'id': '1516455',
- 'ext': 'mp4',
- 'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
- 'description': 're:^Patrick Kämnitz gehört.{100,}',
- 'upload_date': '20140825',
- },
- }, {
- 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
- 'info_dict': {
-
- },
- 'playlist_count': 6,
- }, {
- # Nexx iFrame embed
- 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
- 'info_dict': {
- 'id': '161464',
- 'ext': 'mp4',
- 'title': 'Nervenkitzel Achterbahn',
- 'alt_title': 'Karussellbauer in Deutschland',
- 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
- 'release_year': 2005,
- 'creator': 'SPIEGEL TV',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 2761,
- 'timestamp': 1394021479,
- 'upload_date': '20140305',
- },
- 'params': {
- 'format': 'bestvideo',
- 'skip_download': True,
- },
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
-
- # Single video on top of the page
- video_link = self._search_regex(
- r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
- 'video page URL', default=None)
- if video_link:
- video_url = compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', video_link)
- return self.url_result(video_url)
-
- # Multiple embedded videos
- embeds = re.findall(
- r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
- webpage)
- entries = [
- self.url_result(compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', embed_path))
- for embed_path in embeds]
- if embeds:
- return self.playlist_result(entries)
-
- return self.playlist_from_matches(
- NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
+ media_id = self._html_search_regex(
+ r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P(?:(?!\2).)+)\2',
+ webpage, 'media id', group='id')
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'display_id': video_id,
+ 'url': 'jwplatform:%s' % media_id,
+ 'title': self._og_search_title(webpage, default=None),
+ 'ie_key': JWPlatformIE.ie_key(),
+ }
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
deleted file mode 100644
index 6ccf4c342..000000000
--- a/youtube_dl/extractor/spiegeltv.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from .nexx import NexxIE
-
-
-class SpiegeltvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
- _TEST = {
- 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
- 'only_matching': True,
- }
-
- def _real_extract(self, url):
- return self.url_result(
- 'https://api.nexx.cloud/v3/748/videos/byid/%s'
- % self._match_id(url), ie=NexxIE.ie_key())
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
index 6030b7cb5..10b817760 100644
--- a/youtube_dl/extractor/urplay.py
+++ b/youtube_dl/extractor/urplay.py
@@ -2,7 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import unified_timestamp
+from ..utils import (
+ dict_get,
+ int_or_none,
+ unified_timestamp,
+)
class URPlayIE(InfoExtractor):
@@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor):
'ext': 'mp4',
'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
- 'timestamp': 1513512768,
- 'upload_date': '20171217',
+ 'timestamp': 1513292400,
+ 'upload_date': '20171214',
},
}, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor):
'ext': 'mp4',
'title': 'Tripp, Trapp, Träd : Sovkudde',
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
- 'timestamp': 1440093600,
+ 'timestamp': 1440086400,
'upload_date': '20150820',
},
}, {
@@ -35,37 +39,58 @@ class URPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
+ url = url.replace('skola.se/Produkter', 'play.se/program')
webpage = self._download_webpage(url, video_id)
- urplayer_data = self._parse_json(self._search_regex(
- r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
- host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
+ urplayer_data = self._parse_json(self._html_search_regex(
+ r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"',
+ webpage, 'urplayer data'), video_id)['currentProduct']
+ episode = urplayer_data['title']
+ raw_streaming_info = urplayer_data['streamingInfo']['raw']
+ host = self._download_json(
+ 'http://streaming-loadbalancer.ur.se/loadbalancer.json',
+ video_id)['redirect']
formats = []
- for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
- file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
+ for k, v in raw_streaming_info.items():
+ if not (k in ('sd', 'hd') and isinstance(v, dict)):
+ continue
+ file_http = v.get('location')
if file_http:
formats.extend(self._extract_wowza_formats(
- 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp']))
+ 'http://%s/%splaylist.m3u8' % (host, file_http),
+ video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))
self._sort_formats(formats)
- subtitles = {}
- for subtitle in urplayer_data.get('subtitles', []):
- subtitle_url = subtitle.get('file')
- kind = subtitle.get('kind')
- if not subtitle_url or (kind and kind != 'captions'):
- continue
- subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
- 'url': subtitle_url,
- })
+ image = urplayer_data.get('image') or {}
+ thumbnails = []
+ for k, v in image.items():
+ t = {
+ 'id': k,
+ 'url': v,
+ }
+ wh = k.split('x')
+ if len(wh) == 2:
+ t.update({
+ 'width': int_or_none(wh[0]),
+ 'height': int_or_none(wh[1]),
+ })
+ thumbnails.append(t)
+
+ series = urplayer_data.get('series') or {}
+ series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle'))
return {
'id': video_id,
- 'title': urplayer_data['title'],
- 'description': self._og_search_description(webpage),
- 'thumbnail': urplayer_data.get('image'),
- 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')),
- 'series': urplayer_data.get('series_title'),
- 'subtitles': subtitles,
+ 'title': '%s : %s' % (series_title, episode) if series_title else episode,
+ 'description': urplayer_data.get('description'),
+ 'thumbnails': thumbnails,
+ 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')),
+ 'series': series_title,
'formats': formats,
+ 'duration': int_or_none(urplayer_data.get('duration')),
+ 'categories': urplayer_data.get('categories'),
+ 'tags': urplayer_data.get('keywords'),
+ 'season': series.get('label'),
+ 'episode': episode,
+ 'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
}
diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py
index 54c7495cc..e3784e55f 100644
--- a/youtube_dl/extractor/usanetwork.py
+++ b/youtube_dl/extractor/usanetwork.py
@@ -1,74 +1,24 @@
# coding: utf-8
from __future__ import unicode_literals
-from .adobepass import AdobePassIE
-from ..utils import (
- NO_DEFAULT,
- smuggle_url,
- update_url_query,
-)
+from .nbc import NBCIE
-class USANetworkIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
- _TEST = {
- 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
- 'md5': '33c0d2ba381571b414024440d08d57fd',
+class USANetworkIE(NBCIE):
+ _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))'
+ _TESTS = [{
+ 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
'info_dict': {
- 'id': '3086229',
+ 'id': '4185302',
'ext': 'mp4',
- 'title': 'HPE Cybersecurity',
- 'description': 'The more we digitize our world, the more vulnerable we are.',
- 'upload_date': '20160818',
- 'timestamp': 1471535460,
- 'uploader': 'NBCU-USA',
+ 'title': 'Intelligence (Trailer)',
+ 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
+ 'upload_date': '20200715',
+ 'timestamp': 1594785600,
+ 'uploader': 'NBCU-MPAT',
},
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- def _x(name, default=NO_DEFAULT):
- return self._search_regex(
- r'data-%s\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' % name,
- webpage, name, default=default, group='value')
-
- video_id = _x('mpx-guid')
- title = _x('episode-title')
- mpx_account_id = _x('mpx-account-id', '2304992029')
-
- query = {
- 'mbr': 'true',
- }
- if _x('is-full-episode', None) == '1':
- query['manifest'] = 'm3u'
-
- if _x('is-entitlement', None) == '1':
- adobe_pass = {}
- drupal_settings = self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings', fatal=False)
- if drupal_settings:
- drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
- if drupal_settings:
- adobe_pass = drupal_settings.get('adobePass', {})
- resource = self._get_mvpd_resource(
- adobe_pass.get('adobePassResourceId', 'usa'),
- title, video_id, _x('episode-rating', 'TV-14'))
- query['auth'] = self._extract_mvpd_auth(
- url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
-
- info = self._search_json_ld(webpage, video_id, default={})
- info.update({
- '_type': 'url_transparent',
- 'url': smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
- query), {'force_smil_url': True}),
- 'id': video_id,
- 'title': title,
- 'series': _x('show-title', None),
- 'episode': title,
- 'ie_key': 'ThePlatform',
- })
- return info
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index f79531e6f..df1dc78dd 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -1,25 +1,30 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-import time
import itertools
+import json
-from .common import InfoExtractor
from .naver import NaverBaseIE
-from ..compat import compat_str
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
ExtractorError,
+ int_or_none,
merge_dicts,
- remove_start,
try_get,
urlencode_postdata,
)
-class VLiveIE(NaverBaseIE):
+class VLiveBaseIE(NaverBaseIE):
+ _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+
+
+class VLiveIE(VLiveBaseIE):
IE_NAME = 'vlive'
- _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
_NETRC_MACHINE = 'vlive'
_TESTS = [{
'url': 'http://www.vlive.tv/video/1326',
@@ -27,7 +32,7 @@ class VLiveIE(NaverBaseIE):
'info_dict': {
'id': '1326',
'ext': 'mp4',
- 'title': "[V LIVE] Girl's Day's Broadcast",
+ 'title': "Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
@@ -37,7 +42,7 @@ class VLiveIE(NaverBaseIE):
'info_dict': {
'id': '16937',
'ext': 'mp4',
- 'title': '[V LIVE] 첸백시 걍방',
+ 'title': '첸백시 걍방',
'creator': 'EXO',
'view_count': int,
'subtitles': 'mincount:12',
@@ -58,12 +63,11 @@ class VLiveIE(NaverBaseIE):
'subtitles': 'mincount:10',
},
'skip': 'This video is only available for CH+ subscribers',
+ }, {
+ 'url': 'https://www.vlive.tv/embed/1326',
+ 'only_matching': True,
}]
- @classmethod
- def suitable(cls, url):
- return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
-
def _real_initialize(self):
self._login()
@@ -95,173 +99,122 @@ class VLiveIE(NaverBaseIE):
if not is_logged_in():
raise ExtractorError('Unable to log in', expected=True)
+ def _call_api(self, path_template, video_id, fields=None):
+ query = {'appId': self._APP_ID}
+ if fields:
+ query['fields'] = fields
+ return self._download_json(
+ 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+ 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
+ headers={'Referer': 'https://www.vlive.tv/'}, query=query)
+
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'https://www.vlive.tv/video/%s' % video_id, video_id)
+ try:
+ post = self._call_api(
+ 'post/v1.0/officialVideoPost-%s', video_id,
+ 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}')
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_login_required(json.loads(e.cause.read().decode())['message'])
+ raise
- VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
- VIDEO_PARAMS_FIELD = 'video params'
+ video = post['officialVideo']
- params = self._parse_json(self._search_regex(
- VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
- transform_source=lambda s: '[' + s + ']', fatal=False)
+ def get_common_fields():
+ channel = post.get('channel') or {}
+ return {
+ 'title': video.get('title'),
+ 'creator': post.get('author', {}).get('nickname'),
+ 'channel': channel.get('channelName'),
+ 'channel_id': channel.get('channelCode'),
+ 'duration': int_or_none(video.get('playTime')),
+ 'view_count': int_or_none(video.get('playCount')),
+ 'like_count': int_or_none(video.get('likeCount')),
+ 'comment_count': int_or_none(video.get('commentCount')),
+ }
- if not params or len(params) < 7:
- params = self._search_regex(
- VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
- params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]
-
- status, long_video_id, key = params[2], params[5], params[6]
- status = remove_start(status, 'PRODUCT_')
-
- if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
- return self._live(video_id, webpage)
- elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
- return self._replay(video_id, webpage, long_video_id, key)
-
- if status == 'LIVE_END':
- raise ExtractorError('Uploading for replay. Please wait...',
- expected=True)
- elif status == 'COMING_SOON':
- raise ExtractorError('Coming soon!', expected=True)
- elif status == 'CANCELED':
- raise ExtractorError('We are sorry, '
- 'but the live broadcast has been canceled.',
- expected=True)
- elif status == 'ONLY_APP':
- raise ExtractorError('Unsupported video type', expected=True)
- else:
- raise ExtractorError('Unknown status %s' % status)
-
- def _get_common_fields(self, webpage):
- title = self._og_search_title(webpage)
- creator = self._html_search_regex(
- r']+class="info_area"[^>]*>\s*(?:
]*>.*?\s*)?
]*>([^<]+)',
- webpage, 'creator', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
- return {
- 'title': title,
- 'creator': creator,
- 'thumbnail': thumbnail,
- }
-
- def _live(self, video_id, webpage):
- init_page = self._download_init_page(video_id)
-
- live_params = self._search_regex(
- r'"liveStreamInfo"\s*:\s*(".*"),',
- init_page, 'live stream info')
- live_params = self._parse_json(live_params, video_id)
- live_params = self._parse_json(live_params, video_id)
-
- formats = []
- for vid in live_params.get('resolutions', []):
- formats.extend(self._extract_m3u8_formats(
- vid['cdnUrl'], video_id, 'mp4',
- m3u8_id=vid.get('name'),
- fatal=False, live=True))
- self._sort_formats(formats)
-
- info = self._get_common_fields(webpage)
- info.update({
- 'title': self._live_title(info['title']),
- 'id': video_id,
- 'formats': formats,
- 'is_live': True,
- })
- return info
-
- def _replay(self, video_id, webpage, long_video_id, key):
- if '' in (long_video_id, key):
- init_page = self._download_init_page(video_id)
- video_info = self._parse_json(self._search_regex(
- (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script',
- _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
+ _TESTS = [{
'url': 'http://channels.vlive.tv/FCD4B',
'info_dict': {
'id': 'FCD4B',
'title': 'MAMAMOO',
},
'playlist_mincount': 110
- }
- _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+ }, {
+ 'url': 'https://www.vlive.tv/channel/FCD4B',
+ 'only_matching': True,
+ }]
+
+ def _call_api(self, path, channel_key_suffix, channel_value, note, query):
+ q = {
+ 'app_id': self._APP_ID,
+ 'channel' + channel_key_suffix: channel_value,
+ }
+ q.update(query)
+ return self._download_json(
+ 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
+ channel_value, note='Downloading ' + note, query=q)['result']
def _real_extract(self, url):
channel_code = self._match_id(url)
- webpage = self._download_webpage(
- 'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
+ channel_seq = self._call_api(
+ 'decodeChannelCode', 'Code', channel_code,
+ 'decode channel code', {})['channelSeq']
- app_id = None
-
- app_js_url = self._search_regex(
- r'