import base64 import json import re import urllib.parse import xml.etree.ElementTree from .adobepass import AdobePassIE from .common import InfoExtractor from .theplatform import ThePlatformBaseIE, ThePlatformIE, default_ns from ..networking import HEADRequest from ..utils import ( ExtractorError, RegexNotFoundError, UserNotLive, clean_html, determine_ext, extract_attributes, float_or_none, get_element_html_by_class, int_or_none, join_nonempty, make_archive_id, mimetype2ext, parse_age_limit, parse_duration, parse_iso8601, remove_end, try_get, unescapeHTML, unified_timestamp, update_url_query, url_basename, url_or_none, ) from ..utils.traversal import require, traverse_obj class NBCUniversalBaseIE(ThePlatformBaseIE): _GEO_COUNTRIES = ['US'] _GEO_BYPASS = False _M3U8_RE = r'https?://[^/?#]+/prod/[\w-]+/(?P[^?#]+/)cmaf/mpeg_(?:cbcs|cenc)\w*/master_cmaf\w*\.m3u8' def _download_nbcu_smil_and_extract_m3u8_url(self, tp_path, video_id, query): smil = self._download_xml( f'https://link.theplatform.com/s/{tp_path}', video_id, 'Downloading SMIL manifest', 'Failed to download SMIL manifest', query={ **query, 'format': 'SMIL', # XXX: Do not confuse "format" with "formats" 'manifest': 'm3u', 'switch': 'HLSServiceSecure', # Or else we get broken mp4 http URLs instead of HLS }, headers=self.geo_verification_headers()) ns = f'//{{{default_ns}}}' if url := traverse_obj(smil, (f'{ns}video/@src', lambda _, v: determine_ext(v) == 'm3u8', any)): return url exc = traverse_obj(smil, (f'{ns}param', lambda _, v: v.get('name') == 'exception', '@value', any)) if exc == 'GeoLocationBlocked': self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(traverse_obj(smil, (f'{ns}ref/@abstract', ..., any)), expected=exc == 'Expired') def _extract_nbcu_formats_and_subtitles(self, tp_path, video_id, query): # formats='mpeg4' will return either a working m3u8 URL or an m3u8 template for non-DRM HLS # formats='m3u+none,mpeg4' may return DRM HLS but w/the "folders" needed for non-DRM template query['formats'] = 'm3u+none,mpeg4' m3u8_url = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) if mobj := re.fullmatch(self._M3U8_RE, m3u8_url): query['formats'] = 'mpeg4' m3u8_tmpl = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) # Example: https://vod-lf-oneapp-prd.akamaized.net/prod/video/{folders}master_hls.m3u8 if '{folders}' in m3u8_tmpl: self.write_debug('Found m3u8 URL template, formatting URL path') m3u8_url = m3u8_tmpl.format(folders=mobj.group('folders')) if '/mpeg_cenc' in m3u8_url or '/mpeg_cbcs' in m3u8_url: self.report_drm(video_id) return self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') def _extract_nbcu_video(self, url, display_id, old_ie_key=None): webpage = self._download_webpage(url, display_id) settings = self._search_json( r']+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) query = {} tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') if tve: account_pid = tve.get('data-mpx-media-account-pid') or tve['data-mpx-account-pid'] account_id = tve['data-mpx-media-account-id'] metadata = self._parse_json( tve.get('data-normalized-video') or '', display_id, fatal=False, transform_source=unescapeHTML) video_id = tve.get('data-guid') or metadata['guid'] if tve.get('data-entitlement') == 'auth': auth = settings['tve_adobe_auth'] release_pid = tve['data-release-pid'] resource = self._get_mvpd_resource( tve.get('data-adobe-pass-resource-id') or auth['adobePassResourceId'], tve['data-title'], release_pid, tve.get('data-rating')) query['auth'] = self._extract_mvpd_auth( url, release_pid, auth['adobePassRequestorId'], resource, auth['adobePassSoftwareStatement']) else: ls_playlist = traverse_obj(settings, ( 'ls_playlist', lambda _, v: v['defaultGuid'], any, {require('LS playlist')})) video_id = ls_playlist['defaultGuid'] account_pid = ls_playlist.get('mpxMediaAccountPid') or ls_playlist['mpxAccountPid'] account_id = ls_playlist['mpxMediaAccountId'] metadata = traverse_obj(ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, any)) or {} tp_path = f'{account_pid}/media/guid/{account_id}/{video_id}' formats, subtitles = self._extract_nbcu_formats_and_subtitles(tp_path, video_id, query) tp_metadata = self._download_theplatform_metadata(tp_path, video_id, fatal=False) parsed_info = self._parse_theplatform_metadata(tp_metadata) self._merge_subtitles(parsed_info['subtitles'], target=subtitles) return { **parsed_info, **traverse_obj(metadata, { 'title': ('title', {str}), 'description': ('description', {str}), 'duration': ('durationInSeconds', {int_or_none}), 'timestamp': ('airDate', {parse_iso8601}), 'thumbnail': ('thumbnailUrl', {url_or_none}), 'season_number': ('seasonNumber', {int_or_none}), 'episode_number': ('episodeNumber', {int_or_none}), 'episode': ('episodeTitle', {str}), 'series': ('show', {str}), }), 'id': video_id, 'display_id': display_id, 'formats': formats, 'subtitles': subtitles, '_old_archive_ids': [make_archive_id(old_ie_key, video_id)] if old_ie_key else None, } class NBCIE(NBCUniversalBaseIE): _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/?#]+/video/[^/?#]+/(?P\w+))' _TESTS = [ { 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237', 'info_dict': { 'id': '2848237', 'ext': 'mp4', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', 'timestamp': 1424246400, 'upload_date': '20150218', 'uploader': 'NBCU-COM', 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'episode_number': 86, 'season': 'Season 2', 'season_number': 2, 'series': 'Tonight', 'duration': 236.504, 'tags': 'count:2', 'thumbnail': r're:https?://.+\.jpg', 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'], 'media_type': 'Full Episode', 'age_limit': 14, '_old_archive_ids': ['theplatform 2848237'], }, 'params': { 'skip_download': 'm3u8', }, }, { 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', 'info_dict': { 'id': '3646439', 'ext': 'mp4', 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', 'episode_number': 1, 'season': 'Season 75', 'season_number': 75, 'series': 'Golden Globes', 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', 'uploader': 'NBCU-COM', 'upload_date': '20180107', 'timestamp': 1515312000, 'duration': 569.703, 'tags': 'count:8', 'thumbnail': r're:https?://.+\.jpg', 'media_type': 'Highlight', 'age_limit': 0, 'categories': ['Series/The Golden Globe Awards'], '_old_archive_ids': ['theplatform 3646439'], }, 'params': { 'skip_download': 'm3u8', }, }, { # Needs to be extracted from webpage instead of GraphQL 'url': 'https://www.nbc.com/paris2024/video/ali-truwit-found-purpose-pool-after-her-life-changed/para24_sww_alitruwittodayshow_240823', 'info_dict': { 'id': 'para24_sww_alitruwittodayshow_240823', 'ext': 'mp4', 'title': 'Ali Truwit found purpose in the pool after her life changed', 'description': 'md5:c16d7489e1516593de1cc5d3f39b9bdb', 'uploader': 'NBCU-SPORTS', 'duration': 311.077, 'thumbnail': r're:https?://.+\.jpg', 'episode': 'Ali Truwit found purpose in the pool after her life changed', 'timestamp': 1724435902.0, 'upload_date': '20240823', '_old_archive_ids': ['theplatform para24_sww_alitruwittodayshow_240823'], }, 'params': { 'skip_download': 'm3u8', }, }, { 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', 'only_matching': True, }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', 'only_matching': True, }, { # Percent escaped url 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189', 'only_matching': True, }, ] _SOFTWARE_STATEMENT = 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI1Yzg2YjdkYy04NDI3LTRjNDUtOGQwZi1iNDkzYmE3MmQwYjQiLCJuYmYiOjE1Nzg3MDM2MzEsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTc4NzAzNjMxfQ.QQKIsBhAjGQTMdAqRTqhcz2Cddr4Y2hEjnSiOeKKki4nLrkDOsjQMmqeTR0hSRarraxH54wBgLvsxI7LHwKMvr7G8QpynNAxylHlQD3yhN9tFhxt4KR5wW3as02B-W2TznK9bhNWPKIyHND95Uo2Mi6rEQoq8tM9O09WPWaanE5BX_-r6Llr6dPq5F0Lpx2QOn2xYRb1T4nFxdFTNoss8GBds8OvChTiKpXMLHegLTc1OS4H_1a8tO_37jDwSdJuZ8iTyRLV4kZ2cpL6OL5JPMObD4-HQiec_dfcYgMKPiIfP9ZqdXpec2SVaCLsWEk86ZYvD97hLIQrK5rrKd1y-A' def _real_extract(self, url): permalink, video_id = self._match_valid_url(url).groups() permalink = 'http' + urllib.parse.unquote(permalink) video_data = self._download_json( 'https://friendship.nbc.co/v2/graphql', video_id, query={ 'query': '''query bonanzaPage( $app: NBCUBrands! = nbc $name: String! $oneApp: Boolean $platform: SupportedPlatforms! = web $type: EntityPageType! = VIDEO $userId: String! ) { bonanzaPage( app: $app name: $name oneApp: $oneApp platform: $platform type: $type userId: $userId ) { metadata { ... on VideoPageData { description episodeNumber keywords locked mpxAccountId mpxGuid rating resourceId seasonNumber secondaryTitle seriesShortTitle } } } }''', 'variables': json.dumps({ 'name': permalink, 'oneApp': True, 'userId': '0', }), })['data']['bonanzaPage']['metadata'] if not video_data: # Some videos are not available via GraphQL API webpage = self._download_webpage(url, video_id) video_data = self._search_json( r'