diff --git a/README.md b/README.md index 9f542844e0..aaa3beb716 100644 --- a/README.md +++ b/README.md @@ -1805,7 +1805,7 @@ The following extractors use this feature: * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) -* `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request) +* `po_token`: Proof of Origin (PO) Token(s) to use. Comma separated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player+XXX,web_safari.gvs+YYY`. Context can be any of `gvs` (Google Video Server URLs), `player` (Innertube player request) or `subs` (Subtitles) * `pot_trace`: Enable debug logging for PO Token fetching. Either `true` or `false` (default) * `fetch_pot`: Policy to use for fetching a PO Token from providers. 
One of `always` (always try fetch a PO Token regardless if the client requires one for the given context), `never` (never fetch a PO Token), or `auto` (default; only fetch a PO Token if the client requires one for the given context) diff --git a/test/test_pot/test_pot_builtin_utils.py b/test/test_pot/test_pot_builtin_utils.py index 1682e42a16..a95fc4e159 100644 --- a/test/test_pot/test_pot_builtin_utils.py +++ b/test/test_pot/test_pot_builtin_utils.py @@ -15,6 +15,7 @@ class TestGetWebPoContentBinding: for context, is_authenticated, expected in [ (PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), (PoTokenContext.PLAYER, False, ('example-video-id', ContentBindingType.VIDEO_ID)), + (PoTokenContext.SUBS, False, ('example-video-id', ContentBindingType.VIDEO_ID)), (PoTokenContext.GVS, True, ('example-data-sync-id', ContentBindingType.DATASYNC_ID)), ]], ('WEB_REMIX', PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 4194e1c217..9c5bb75fe4 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -35,6 +35,7 @@ from ...utils import ( class _PoTokenContext(enum.Enum): PLAYER = 'player' GVS = 'gvs' + SUBS = 'subs' # any clients starting with _ cannot be explicitly requested by the user @@ -787,6 +788,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _download_ytcfg(self, client, video_id): url = { + 'mweb': 'https://m.youtube.com', 'web': 'https://www.youtube.com', 'web_music': 'https://music.youtube.com', 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index b4c6ba4538..9f929664fb 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -72,6 +72,9 @@ from ...utils.networking import clean_headers, clean_proxies, select_proxy 
STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' +STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token' +STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context' + PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' @@ -2863,7 +2866,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, - data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, **kwargs): + data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, + required=False, **kwargs): """ Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client. @@ -2878,6 +2882,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @param player_url: player URL. @param video_id: video ID. @param webpage: video webpage. + @param required: Whether the PO Token is required (i.e. try to fetch unless policy is "never"). @param kwargs: Additional arguments to pass down. May be more added in the future. @return: The fetched PO Token. None if it could not be fetched. 
""" @@ -2926,6 +2931,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url=player_url, video_id=video_id, video_webpage=webpage, + required=required, **kwargs, ) @@ -2945,6 +2951,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or ( fetch_pot_policy == 'auto' and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] + and not kwargs.get('required', False) ) ): return None @@ -3133,6 +3140,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url = self._download_player_url(video_id) tried_iframe_fallback = True + pr = initial_pr if client == 'web' else None + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) @@ -3147,12 +3156,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ytcfg': player_ytcfg or self._get_default_ytcfg(client), } - player_po_token = self.fetch_po_token( + # Don't need a player PO token for WEB if using player response from webpage + player_po_token = None if pr else self.fetch_po_token( context=_PoTokenContext.PLAYER, **fetch_po_token_args) gvs_po_token = self.fetch_po_token( context=_PoTokenContext.GVS, **fetch_po_token_args) + fetch_subs_po_token_func = functools.partial( + self.fetch_po_token, + context=_PoTokenContext.SUBS, + **fetch_po_token_args, + ) + required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] if ( @@ -3179,7 +3195,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): only_once=True) deprioritize_pr = True - pr = initial_pr if client == 'web' else None try: pr = pr or self._extract_player_response( client, video_id, @@ -3197,10 +3212,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: - # Save client name for introspection later - sd = traverse_obj(pr, ('streamingData', {dict})) or {} + # Save client details for 
introspection later + innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT') + sd = pr.setdefault('streamingData', {}) sd[STREAMING_DATA_CLIENT_NAME] = client sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token + sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context + sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token @@ -3262,6 +3280,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self.report_warning(msg, only_once=True) + def _report_pot_subtitles_skipped(self, video_id, client_name, msg=None): + msg = msg or ( + f'{video_id}: Some {client_name} client subtitles require a PO Token which was not provided. ' + 'They will be discarded since they are not downloadable as-is. ' + f'You can manually pass a Subtitles PO Token for this client with ' + f'--extractor-args "youtube:po_token={client_name}.subs+XXX" . ' + f'For more information, refer to {PO_TOKEN_GUIDE_URL}') + + subs_wanted = any(( + self.get_param('writesubtitles'), + self.get_param('writeautomaticsub'), + self.get_param('listsubtitles'))) + + # Only raise a warning for non-default clients, to not confuse users. 
+ if not subs_wanted or client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS): + self.write_debug(msg, only_once=True) + else: + self.report_warning(msg, only_once=True) + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 @@ -3553,6 +3590,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + for sub in traverse_obj(subs, (..., ..., {dict})): + # HLS subs (m3u8) do not need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( @@ -3564,6 +3604,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if po_token: dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + for sub in traverse_obj(subs, (..., ..., {dict})): + # TODO: Investigate if DASH subs ever need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): @@ -3890,47 +3933,81 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), } + def get_lang_code(track): + return (remove_start(track.get('vssId') or '', '.').replace('.', '-') + or track.get('languageCode')) + + def process_language(container, base_url, lang_code, sub_name, client_name, query): + lang_subs = container.setdefault(lang_code, []) + for fmt in 
self._SUBTITLE_FORMATS: + query = {**query, 'fmt': fmt} + lang_subs.append({ + 'ext': fmt, + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), + 'name': sub_name, + STREAMING_DATA_CLIENT_NAME: client_name, + }) + subtitles = {} - pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) - if pctr: - def get_lang_code(track): - return (remove_start(track.get('vssId') or '', '.').replace('.', '-') - or track.get('languageCode')) + skipped_subs_clients = set() + prs = traverse_obj(player_responses, ( + # Filter out initial_pr which does not have streamingData (smuggled client context) + lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer'])) - # Converted into dicts to remove duplicates - captions = { - get_lang_code(sub): sub - for sub in traverse_obj(pctr, (..., 'captionTracks', ...))} - translation_languages = { - lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) - for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))} + pctrs = traverse_obj(prs, (..., 'captions', 'playerCaptionsTracklistRenderer', {dict})) + translation_languages = { + lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) + for lang in traverse_obj(pctrs, (..., 'translationLanguages', ..., {dict}))} + # NB: Constructing the full subtitle dictionary is slow + get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( + self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - def process_language(container, base_url, lang_code, sub_name, query): - lang_subs = container.setdefault(lang_code, []) - for fmt in self._SUBTITLE_FORMATS: - query.update({ - 'fmt': fmt, - }) - lang_subs.append({ - 'ext': fmt, - 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), - 'name': sub_name, - }) + all_captions = traverse_obj(pctrs, (..., 'captionTracks', ..., 
{dict})) + need_subs_langs = {get_lang_code(sub) for sub in all_captions if sub.get('kind') != 'asr'} + need_caps_langs = { + remove_start(get_lang_code(sub), 'a-') + for sub in all_captions if sub.get('kind') == 'asr'} - # NB: Constructing the full subtitle dictionary is slow - get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( - self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - for lang_code, caption_track in captions.items(): - base_url = caption_track.get('baseUrl') - orig_lang = parse_qs(base_url).get('lang', [None])[-1] - if not base_url: - continue + for pr in prs: + pctr = pr['captions']['playerCaptionsTracklistRenderer'] + client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME] + innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName'] + required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN] + + pot_params = {} + already_fetched_pot = False + + for caption_track in traverse_obj(pctr, ('captionTracks', lambda _, v: v['baseUrl'])): + base_url = caption_track['baseUrl'] + qs = parse_qs(base_url) + lang_code = get_lang_code(caption_track) + requires_pot = ( + # We can detect the experiment for now + any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv')) + or _PoTokenContext.SUBS in required_contexts) + + if not already_fetched_pot: + already_fetched_pot = True + if subs_po_token := fetch_subs_po_token_func(required=requires_pot): + pot_params.update({ + 'pot': subs_po_token, + 'potc': '1', + 'c': innertube_client_name, + }) + + if not pot_params and requires_pot: + skipped_subs_clients.add(client_name) + self._report_pot_subtitles_skipped(video_id, client_name) + break + + orig_lang = qs.get('lang', [None])[-1] lang_name = self._get_text(caption_track, 'name', max_runs=1) if caption_track.get('kind') != 'asr': if not 
lang_code: continue process_language( - subtitles, base_url, lang_code, lang_name, {}) + subtitles, base_url, lang_code, lang_name, client_name, pot_params) if not caption_track.get('isTranslatable'): continue for trans_code, trans_name in translation_languages.items(): @@ -3950,10 +4027,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Add an "-orig" label to the original language so that it can be distinguished. # The subs are returned without "-orig" as well for compatibility process_language( - automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) + automatic_captions, base_url, f'{trans_code}-orig', + f'{trans_name} (Original)', client_name, pot_params) # Setting tlang=lang returns damaged subtitles. - process_language(automatic_captions, base_url, trans_code, trans_name, - {} if orig_lang == orig_trans_code else {'tlang': trans_code}) + process_language( + automatic_captions, base_url, trans_code, trans_name, client_name, + pot_params if orig_lang == orig_trans_code else {'tlang': trans_code, **pot_params}) + + # Avoid duplication if we've already got everything we need + need_subs_langs.difference_update(subtitles) + need_caps_langs.difference_update(automatic_captions) + if not (need_subs_langs or need_caps_langs): + break + + if skipped_subs_clients and (need_subs_langs or need_caps_langs): + self._report_pot_subtitles_skipped(video_id, True, msg=join_nonempty( + f'{video_id}: There are missing subtitles languages because a PO token was not provided.', + need_subs_langs and f'Subtitles for these languages are missing: {", ".join(need_subs_langs)}.', + need_caps_langs and f'Automatic captions for {len(need_caps_langs)} languages are missing.', + delim=' ')) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles diff --git a/yt_dlp/extractor/youtube/pot/provider.py b/yt_dlp/extractor/youtube/pot/provider.py index 53af92d30b..13b3b1f9bb 100644 --- a/yt_dlp/extractor/youtube/pot/provider.py +++ 
b/yt_dlp/extractor/youtube/pot/provider.py @@ -39,6 +39,7 @@ __all__ = [ class PoTokenContext(enum.Enum): GVS = 'gvs' PLAYER = 'player' + SUBS = 'subs' @dataclasses.dataclass diff --git a/yt_dlp/extractor/youtube/pot/utils.py b/yt_dlp/extractor/youtube/pot/utils.py index 1c0db243bf..7a5b7d4ab3 100644 --- a/yt_dlp/extractor/youtube/pot/utils.py +++ b/yt_dlp/extractor/youtube/pot/utils.py @@ -51,7 +51,7 @@ def get_webpo_content_binding( return visitor_id, ContentBindingType.VISITOR_ID return request.visitor_data, ContentBindingType.VISITOR_DATA - elif request.context == PoTokenContext.PLAYER or client_name != 'WEB_REMIX': + elif request.context in (PoTokenContext.PLAYER, PoTokenContext.SUBS): return request.video_id, ContentBindingType.VIDEO_ID return None, None