From 2b23dc1cbdd752e75311c2f8e78a1dd079908bc9 Mon Sep 17 00:00:00 2001 From: df Date: Sun, 28 Mar 2021 06:21:35 +0100 Subject: [PATCH] Morph-based pages: add Weather --- youtube_dl/extractor/bbc.py | 88 ++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index a97f96db0..8fc572bd6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -812,6 +813,18 @@ class BBCIE(BBCCoUkIE): 'upload_date': '20190604', 'categories': ['Psychology'], }, + }, { + # BBC Weather + 'url': 'https://www.bbc.co.uk/weather/features/55581056', + 'info_dict': { + 'id': 'p093xhxl', + 'ext': 'mp4', + 'title': 'Weather for the Week Ahead', + 'description': 'There\'ll be a battle between colder and milder weather in the coming few days, before it turns chillier once again.', + 'duration': 209, + 'thumbnail': r're:https?://.+/p093xk3z.jpg', + 'upload_date': '20210113', + }, }] @classmethod @@ -1034,19 +1047,17 @@ class BBCIE(BBCCoUkIE): } # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one + # Several setPayload calls may be present so pick the one with 'asset-data' + # For Weather, use 'asset-with-media' morph_payload = self._parse_json( self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', + r'Morph\.setPayload\s*\([^,]+-asset-(?:data|with-media)/[^,]+,\s*(\{.+[]}]\s*})\s*\)\s*;', webpage, 'morph payload', default='{}'), playlist_id, fatal=False) if morph_payload: # - obsolete? 
components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] for component in components: - if not isinstance(component, dict): - continue lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) if not lead_media: continue @@ -1077,6 +1088,73 @@ class BBCIE(BBCCoUkIE): 'formats': formats, 'subtitles': subtitles, } + # 'components' may be obsolete? + body_media = try_get(morph_payload, lambda x: x['body'], dict) or {} + # check for variant but similar format found with Weather + primary_video = try_get(body_media, lambda x: list(x['media']['videos']['primary'].values())[0], dict) + if primary_video: + body_media.update(primary_video) + programme_id = body_media.get('versionPid') + else: + body_media.update(body_media.get('media') or {}) + programme_id = body_media.get('pid') + if programme_id: + title = body_media.get('title') or \ + self._og_search_title(webpage, default=None) or \ + self._html_search_meta('title', webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + image_url = body_media.get('holdingImageUrl') + return { + 'id': programme_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': re.sub(r'(\{width}xn|\$recipe)', 'raw', image_url) if image_url else None, + 'duration': parse_duration(dict_get(body_media, ('duration', 'durationSeconds'))), + 'description': try_get(body_media, lambda x: x['promos']['summary'], compat_str) or \ + body_media.get('summary') or \ + self._html_search_meta('description', webpage), + 'timestamp': parse_iso8601(dict_get(body_media, ('dateTime', 'lastUpdated'))), + } + + # morph-based playlist (replaces playlist.sxml) + # a JS setPayload call with arg1 containing the playlist_id has JSON in arg2; + # deeply nested within it is our target string containing more JSON ... 
+ morph_payload = self._parse_json( + self._search_regex( + r'Morph\.setPayload\s*\([^,]+%s%s%s[^,]+,\s*(\{.+[]}]\s*})\s*\)\s*;' % ('%2F', re.escape(playlist_id), '%22%2CisStory%3Atrue'), + webpage, 'morph playlist payload', default='{}'), + playlist_id, fatal=False) + if morph_payload: + # looking for a string containing a JSON list + components = try_get(morph_payload, lambda x: x['body']['content']['article']['body'], compat_str) or '[]' + components = self._parse_json(components, playlist_id, fatal=False) or [] + for component in components: + if not isinstance(component, dict) or component.get('name') != 'video': + continue + component = component.get('videoData') or {} + programme_id = dict_get(component, ('vpid', 'pid')) + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + if not formats: + continue + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': component.get('title', 'Unnamed clip %s' % programme_id), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': dict_get(component, ('iChefImage', 'image')), + 'duration': parse_duration(component.get('duration')), + 'description': component.get('caption'), + }) + if entries: + return self.playlist_result( + entries, + playlist_id, + playlist_title, + playlist_description) body_media = try_get(morph_payload, lambda x: x['body'], dict) or {} body_media.update(body_media.get('media') or {})