Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2025-06-18 15:55:30 +02:00)
[ie/youtube] Add PO token support for subtitles (#13234)
Closes #13075

Authored by: bashonly, coletdjnz
Co-authored-by: coletdjnz <coletdjnz@protonmail.com>
parent 167d7a9f0f
commit 32ed5f107c
@@ -1805,7 +1805,7 @@ The following extractors use this feature:
* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning
* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage`
* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID)
* `po_token`: Proof of Origin (PO) Token(s) to use. Comma-separated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player+XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request)
* `po_token`: Proof of Origin (PO) Token(s) to use. Comma-separated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player+XXX,web_safari.gvs+YYY`. Context can be any of `gvs` (Google Video Server URLs), `player` (Innertube player request) or `subs` (Subtitles)
* `pot_trace`: Enable debug logging for PO Token fetching. Either `true` or `false` (default)
* `fetch_pot`: Policy to use for fetching a PO Token from providers. One of `always` (always try to fetch a PO Token regardless of whether the client requires one for the given context), `never` (never fetch a PO Token), or `auto` (default; only fetch a PO Token if the client requires one for the given context)
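The `CLIENT.CONTEXT+PO_TOKEN` syntax documented in the `po_token` entry above is compact but easy to get wrong. The following is a minimal standalone sketch (not yt-dlp's actual parser; the function name and return shape are illustrative only) of how such a comma-separated value splits into per-client, per-context tokens:

```python
def parse_po_token_arg(value: str) -> dict[tuple[str, str], str]:
    """Split 'CLIENT.CONTEXT+PO_TOKEN,...' into {(client, context): token}."""
    tokens = {}
    for entry in value.split(','):
        prefix, _, po_token = entry.partition('+')   # 'web.subs+XXX' -> 'web.subs', 'XXX'
        client, _, context = prefix.partition('.')   # 'web.subs' -> 'web', 'subs'
        tokens[(client, context)] = po_token
    return tokens

# e.g. tokens for the gvs, player and (new) subs contexts of the web client
print(parse_po_token_arg('web.gvs+AAA,web.player+BBB,web.subs+CCC'))
```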
@@ -15,6 +15,7 @@ class TestGetWebPoContentBinding:
for context, is_authenticated, expected in [
(PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)),
(PoTokenContext.PLAYER, False, ('example-video-id', ContentBindingType.VIDEO_ID)),
(PoTokenContext.SUBS, False, ('example-video-id', ContentBindingType.VIDEO_ID)),
(PoTokenContext.GVS, True, ('example-data-sync-id', ContentBindingType.DATASYNC_ID)),
]],
('WEB_REMIX', PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)),
@@ -35,6 +35,7 @@ from ...utils import (
class _PoTokenContext(enum.Enum):
PLAYER = 'player'
GVS = 'gvs'
SUBS = 'subs'

# any clients starting with _ cannot be explicitly requested by the user
@@ -787,6 +788,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _download_ytcfg(self, client, video_id):
url = {
'mweb': 'https://m.youtube.com',
'web': 'https://www.youtube.com',
'web_music': 'https://music.youtube.com',
'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1',
@@ -72,6 +72,9 @@ from ...utils.networking import clean_headers, clean_proxies, select_proxy
STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client'
STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token'
STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token'
STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context'

PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide'
@@ -2863,7 +2866,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue

def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None,
data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, **kwargs):
data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None,
required=False, **kwargs):
"""
Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client.

@@ -2878,6 +2882,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
@param player_url: player URL.
@param video_id: video ID.
@param webpage: video webpage.
@param required: Whether the PO Token is required (i.e. try to fetch unless policy is "never").
@param kwargs: Additional arguments to pass down. May be more added in the future.
@return: The fetched PO Token. None if it could not be fetched.
"""

@@ -2926,6 +2931,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_url=player_url,
video_id=video_id,
video_webpage=webpage,
required=required,
**kwargs,
)
@@ -2945,6 +2951,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
or (
fetch_pot_policy == 'auto'
and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS']
and not kwargs.get('required', False)
)
):
return None
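Taken together with the `fetch_pot` policy documented in the README hunk above, the early return shown here reduces to a small predicate. The following is a minimal standalone sketch, assuming the policy string and the client's required contexts are already known; it mirrors the logic in the hunk but is not the extractor's actual code:

```python
import enum

class PoTokenContext(enum.Enum):
    GVS = 'gvs'
    PLAYER = 'player'
    SUBS = 'subs'

def should_fetch_po_token(policy, context, required_contexts, required=False):
    """Mirror of the early return above: 'never' skips, 'always' fetches,
    'auto' fetches only for contexts the client requires or when the caller
    passes required=True (e.g. subtitles flagged by the server)."""
    if policy == 'never':
        return False
    if policy == 'always':
        return True
    # 'auto' (default)
    return context in required_contexts or required

# e.g. a web client that only requires GVS tokens, asked for a subs token:
print(should_fetch_po_token('auto', PoTokenContext.SUBS, {PoTokenContext.GVS}))        # False
print(should_fetch_po_token('auto', PoTokenContext.SUBS, {PoTokenContext.GVS}, True))  # True
```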
@@ -3133,6 +3140,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_url = self._download_player_url(video_id)
tried_iframe_fallback = True

pr = initial_pr if client == 'web' else None

visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg)
data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg)

@@ -3147,12 +3156,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ytcfg': player_ytcfg or self._get_default_ytcfg(client),
}

player_po_token = self.fetch_po_token(
# Don't need a player PO token for WEB if using player response from webpage
player_po_token = None if pr else self.fetch_po_token(
context=_PoTokenContext.PLAYER, **fetch_po_token_args)

gvs_po_token = self.fetch_po_token(
context=_PoTokenContext.GVS, **fetch_po_token_args)

fetch_subs_po_token_func = functools.partial(
self.fetch_po_token,
context=_PoTokenContext.SUBS,
**fetch_po_token_args,
)

required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS']

if (
@@ -3179,7 +3195,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
only_once=True)
deprioritize_pr = True

pr = initial_pr if client == 'web' else None
try:
pr = pr or self._extract_player_response(
client, video_id,
@@ -3197,10 +3212,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if pr_id := self._invalid_player_response(pr, video_id):
skipped_clients[client] = pr_id
elif pr:
# Save client name for introspection later
sd = traverse_obj(pr, ('streamingData', {dict})) or {}
# Save client details for introspection later
innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT')
sd = pr.setdefault('streamingData', {})
sd[STREAMING_DATA_CLIENT_NAME] = client
sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token
sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context
sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = client
f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token
@@ -3262,6 +3280,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
self.report_warning(msg, only_once=True)

def _report_pot_subtitles_skipped(self, video_id, client_name, msg=None):
msg = msg or (
f'{video_id}: Some {client_name} client subtitles require a PO Token which was not provided. '
'They will be discarded since they are not downloadable as-is. '
f'You can manually pass a Subtitles PO Token for this client with '
f'--extractor-args "youtube:po_token={client_name}.subs+XXX" . '
f'For more information, refer to {PO_TOKEN_GUIDE_URL}')

subs_wanted = any((
self.get_param('writesubtitles'),
self.get_param('writeautomaticsub'),
self.get_param('listsubtitles')))

# Only raise a warning for non-default clients, to not confuse users.
if not subs_wanted or client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS):
self.write_debug(msg, only_once=True)
else:
self.report_warning(msg, only_once=True)

def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
CHUNK_SIZE = 10 << 20
PREFERRED_LANG_VALUE = 10
@@ -3553,6 +3590,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}'
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
for sub in traverse_obj(subs, (..., ..., {dict})):
# HLS subs (m3u8) do not need a PO token; save client name for debugging
sub[STREAMING_DATA_CLIENT_NAME] = client_name
subtitles = self._merge_subtitles(subs, subtitles)
for f in fmts:
if process_manifest_format(f, 'hls', client_name, self._search_regex(
@@ -3564,6 +3604,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if po_token:
dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}'
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
for sub in traverse_obj(subs, (..., ..., {dict})):
# TODO: Investigate if DASH subs ever need a PO token; save client name for debugging
sub[STREAMING_DATA_CLIENT_NAME] = client_name
subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
for f in formats:
if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token):
@@ -3890,47 +3933,81 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'),
}

def get_lang_code(track):
return (remove_start(track.get('vssId') or '', '.').replace('.', '-')
or track.get('languageCode'))

def process_language(container, base_url, lang_code, sub_name, client_name, query):
lang_subs = container.setdefault(lang_code, [])
for fmt in self._SUBTITLE_FORMATS:
query = {**query, 'fmt': fmt}
lang_subs.append({
'ext': fmt,
'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)),
'name': sub_name,
STREAMING_DATA_CLIENT_NAME: client_name,
})

subtitles = {}
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
if pctr:
def get_lang_code(track):
return (remove_start(track.get('vssId') or '', '.').replace('.', '-')
or track.get('languageCode'))
skipped_subs_clients = set()
prs = traverse_obj(player_responses, (
# Filter out initial_pr which does not have streamingData (smuggled client context)
lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer']))

# Converted into dicts to remove duplicates
captions = {
get_lang_code(sub): sub
for sub in traverse_obj(pctr, (..., 'captionTracks', ...))}
translation_languages = {
lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1)
for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))}
pctrs = traverse_obj(prs, (..., 'captions', 'playerCaptionsTracklistRenderer', {dict}))
translation_languages = {
lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1)
for lang in traverse_obj(pctrs, (..., 'translationLanguages', ..., {dict}))}
# NB: Constructing the full subtitle dictionary is slow
get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and (
self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles'))
def process_language(container, base_url, lang_code, sub_name, query):
lang_subs = container.setdefault(lang_code, [])
for fmt in self._SUBTITLE_FORMATS:
query.update({
'fmt': fmt,
})
lang_subs.append({
'ext': fmt,
'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)),
'name': sub_name,
})
all_captions = traverse_obj(pctrs, (..., 'captionTracks', ..., {dict}))
need_subs_langs = {get_lang_code(sub) for sub in all_captions if sub.get('kind') != 'asr'}
need_caps_langs = {
remove_start(get_lang_code(sub), 'a-')
for sub in all_captions if sub.get('kind') == 'asr'}

# NB: Constructing the full subtitle dictionary is slow
get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and (
self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles'))
for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
orig_lang = parse_qs(base_url).get('lang', [None])[-1]
if not base_url:
continue
for pr in prs:
pctr = pr['captions']['playerCaptionsTracklistRenderer']
client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME]
innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName']
required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS']
fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN]

pot_params = {}
already_fetched_pot = False
for caption_track in traverse_obj(pctr, ('captionTracks', lambda _, v: v['baseUrl'])):
base_url = caption_track['baseUrl']
qs = parse_qs(base_url)
lang_code = get_lang_code(caption_track)
requires_pot = (
# We can detect the experiment for now
any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv'))
or _PoTokenContext.SUBS in required_contexts)

if not already_fetched_pot:
already_fetched_pot = True
if subs_po_token := fetch_subs_po_token_func(required=requires_pot):
pot_params.update({
'pot': subs_po_token,
'potc': '1',
'c': innertube_client_name,
})

if not pot_params and requires_pot:
skipped_subs_clients.add(client_name)
self._report_pot_subtitles_skipped(video_id, client_name)
break

orig_lang = qs.get('lang', [None])[-1]
lang_name = self._get_text(caption_track, 'name', max_runs=1)
if caption_track.get('kind') != 'asr':
if not lang_code:
continue
process_language(
subtitles, base_url, lang_code, lang_name, {})
subtitles, base_url, lang_code, lang_name, client_name, pot_params)
if not caption_track.get('isTranslatable'):
continue
for trans_code, trans_name in translation_languages.items():
@@ -3950,10 +4027,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Add an "-orig" label to the original language so that it can be distinguished.
# The subs are returned without "-orig" as well for compatibility
process_language(
automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
automatic_captions, base_url, f'{trans_code}-orig',
f'{trans_name} (Original)', client_name, pot_params)
# Setting tlang=lang returns damaged subtitles.
process_language(automatic_captions, base_url, trans_code, trans_name,
{} if orig_lang == orig_trans_code else {'tlang': trans_code})
process_language(
automatic_captions, base_url, trans_code, trans_name, client_name,
pot_params if orig_lang == orig_trans_code else {'tlang': trans_code, **pot_params})

# Avoid duplication if we've already got everything we need
need_subs_langs.difference_update(subtitles)
need_caps_langs.difference_update(automatic_captions)
if not (need_subs_langs or need_caps_langs):
break

if skipped_subs_clients and (need_subs_langs or need_caps_langs):
self._report_pot_subtitles_skipped(video_id, True, msg=join_nonempty(
f'{video_id}: There are missing subtitles languages because a PO token was not provided.',
need_subs_langs and f'Subtitles for these languages are missing: {", ".join(need_subs_langs)}.',
need_caps_langs and f'Automatic captions for {len(need_caps_langs)} languages are missing.',
delim=' '))

info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
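The net effect of the subtitle changes above is that each caption entry is built from the track's timedtext `baseUrl` plus the PO Token query parameters set in the diff (`pot`, `potc`, `c`) and one `fmt` per supported subtitle format. The following is a minimal standalone sketch of that URL construction; the base URL, token value, and format list here are illustrative assumptions, not taken from the diff:

```python
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

# Illustrative subset of subtitle formats; yt-dlp keeps its own list in _SUBTITLE_FORMATS
SUBTITLE_FORMATS = ('json3', 'ttml', 'vtt')

def build_subtitle_urls(base_url, pot_params):
    """Return one timedtext URL per format, with PO Token params merged in."""
    parsed = urlparse(base_url)
    base_query = {k: v[-1] for k, v in parse_qs(parsed.query).items()}
    return [
        urlunparse(parsed._replace(query=urlencode({**base_query, **pot_params, 'fmt': fmt})))
        for fmt in SUBTITLE_FORMATS
    ]

# Hypothetical values, mirroring the pot/potc/c params set in the diff
print(build_subtitle_urls(
    'https://www.youtube.com/api/timedtext?v=XXXXXXXXXXX&lang=en',
    {'pot': 'EXAMPLE_SUBS_PO_TOKEN', 'potc': '1', 'c': 'WEB'}))
```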
@@ -39,6 +39,7 @@ __all__ = [
class PoTokenContext(enum.Enum):
GVS = 'gvs'
PLAYER = 'player'
SUBS = 'subs'

@dataclasses.dataclass
@@ -51,7 +51,7 @@ def get_webpo_content_binding(
return visitor_id, ContentBindingType.VISITOR_ID
return request.visitor_data, ContentBindingType.VISITOR_DATA

elif request.context == PoTokenContext.PLAYER or client_name != 'WEB_REMIX':
elif request.context in (PoTokenContext.PLAYER, PoTokenContext.SUBS):
return request.video_id, ContentBindingType.VIDEO_ID

return None, None