From 51887484e46ab6015c041cb1ab626a55f25a03bd Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 12 Jun 2025 17:15:01 -0500 Subject: [PATCH] [ie] Add `_search_nuxt_json` helper (#13386) * Adds InfoExtractor._search_nuxt_json for webpage extraction * Adds InfoExtractor._resolve_nuxt_array for direct use with payload JSON * Adds yt_dlp.utils.jslib module for Python solutions to common JavaScript libraries * Adds devalue.parse and devalue.parse_iter to jslib utils Ref: * https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57 * https://github.com/Rich-Harris/devalue/blob/f3fd2aa93d79f21746555671f955a897335edb1b/src/parse.js Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- test/test_InfoExtractor.py | 131 ++++++++++++++++++ test/test_devalue.py | 235 +++++++++++++++++++++++++++++++++ yt_dlp/extractor/common.py | 58 ++++++++ yt_dlp/utils/jslib/__init__.py | 1 + yt_dlp/utils/jslib/devalue.py | 167 +++++++++++++++++++++++ 5 files changed, 592 insertions(+) create mode 100644 test/test_devalue.py create mode 100644 yt_dlp/utils/jslib/__init__.py create mode 100644 yt_dlp/utils/jslib/devalue.py diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index bc89b2955e..e6c8d574e0 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1947,6 +1947,137 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ with self.assertWarns(DeprecationWarning): self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) + def test_search_nuxt_json(self): + HTML_TMPL = '' + VALID_DATA = ''' + ["ShallowReactive",1], + {"data":2,"state":21,"once":25,"_errors":28,"_server_errors":30}, + ["ShallowReactive",3], + {"$abcdef123456":4}, + {"podcast":5,"activeEpisodeData":7}, + {"podcast":6,"seasons":14}, + {"title":10,"id":11}, + ["Reactive",8], + {"episode":9,"creators":18,"empty_list":20}, + {"title":12,"id":13,"refs":34,"empty_refs":35}, + "Series Title", + "podcast-id-01", + "Episode Title", + "episode-id-99", + [15,16,17], + 1, + 2, + 3, + [19], + "Podcast Creator", + [], + {"$ssite-config":22}, + {"env":23,"name":24,"map":26,"numbers":14}, + "production", + "podcast-website", + ["Set"], + ["Reactive",27], + ["Map"], + ["ShallowReactive",29], + {}, + ["NuxtError",31], + {"status":32,"message":33}, + 503, + "Service Unavailable", + [36,37], + [38,39], + ["Ref",40], + ["ShallowRef",41], + ["EmptyRef",42], + ["EmptyShallowRef",43], + "ref", + "shallow_ref", + "{\\"ref\\":1}", + "{\\"shallow_ref\\":2}" + ''' + PAYLOAD = { + 'data': { + '$abcdef123456': { + 'podcast': { + 'podcast': { + 'title': 'Series Title', + 'id': 'podcast-id-01', + }, + 'seasons': [1, 2, 3], + }, + 'activeEpisodeData': { + 'episode': { + 'title': 'Episode Title', + 'id': 'episode-id-99', + 'refs': ['ref', 'shallow_ref'], + 'empty_refs': [{'ref': 1}, {'shallow_ref': 2}], + }, + 'creators': ['Podcast Creator'], + 'empty_list': [], + }, + }, + }, + 'state': { + '$ssite-config': { + 'env': 'production', + 'name': 'podcast-website', + 'map': [], + 'numbers': [1, 2, 3], + }, + }, + 'once': [], + '_errors': {}, + '_server_errors': { + 'status': 503, + 'message': 'Service Unavailable', + }, + } + PARTIALLY_INVALID = [( + ''' + {"data":1}, + {"invalid_raw_list":2}, + [15,16,17] + ''', + {'data': {'invalid_raw_list': [None, None, None]}}, + ), ( + ''' + {"data":1}, + ["EmptyRef",2], + "not valid JSON" + ''', + {'data': None}, + ), ( + ''' + {"data":1}, + ["EmptyShallowRef",2], + "not valid JSON" + ''', + {'data': None}, + )] + INVALID = [ + ''' + [] + ''', + ''' + ["unsupported",1], + {"data":2}, + {} + ''', + ] + DEFAULT = object() + + self.assertEqual(self.ie._search_nuxt_json(HTML_TMPL.format(VALID_DATA), None), PAYLOAD) + self.assertEqual(self.ie._search_nuxt_json('', None, fatal=False), {}) + self.assertIs(self.ie._search_nuxt_json('', None, default=DEFAULT), DEFAULT) + + for data, expected in PARTIALLY_INVALID: + self.assertEqual( + self.ie._search_nuxt_json(HTML_TMPL.format(data), None, fatal=False), expected) + + for data in INVALID: + self.assertIs( + self.ie._search_nuxt_json(HTML_TMPL.format(data), None, default=DEFAULT), DEFAULT) + if __name__ == '__main__': unittest.main() diff --git a/test/test_devalue.py b/test/test_devalue.py new file mode 100644 index 0000000000..29eb89e87f --- /dev/null +++ b/test/test_devalue.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 + +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +import datetime as dt +import json +import math +import re +import unittest + +from yt_dlp.utils.jslib import devalue + + +TEST_CASES_EQUALS = [{ + 'name': 'int', + 'unparsed': [-42], + 'parsed': -42, +}, { + 'name': 'str', + 'unparsed': ['woo!!!'], + 'parsed': 'woo!!!', +}, { + 'name': 'Number', + 'unparsed': [['Object', 42]], + 'parsed': 42, +}, { + 'name': 'String', + 'unparsed': [['Object', 'yar']], + 'parsed': 'yar', +}, { + 'name': 'Infinity', + 'unparsed': -4, + 'parsed': math.inf, +}, { + 'name': 'negative Infinity', + 'unparsed': -5, + 'parsed': -math.inf, +}, { + 'name': 'negative zero', + 'unparsed': -6, + 'parsed': -0.0, +}, { + 'name': 'RegExp', + 'unparsed': [['RegExp', 'regexp', 'gim']], # XXX: flags are ignored + 'parsed': re.compile('regexp'), +}, { + 'name': 'Date', + 'unparsed': [['Date', '2001-09-09T01:46:40.000Z']], + 'parsed': dt.datetime.fromtimestamp(1e9, tz=dt.timezone.utc), +}, { + 'name': 'Array', + 'unparsed': [[1, 2, 3], 'a', 'b', 'c'], + 'parsed': ['a', 'b', 'c'], +}, { + 'name': 'Array (empty)', + 'unparsed': [[]], + 'parsed': [], +}, { + 'name': 'Array (sparse)', + 'unparsed': [[-2, 1, -2], 'b'], + 'parsed': [None, 'b', None], +}, { + 'name': 'Object', + 'unparsed': [{'foo': 1, 'x-y': 2}, 'bar', 'z'], + 'parsed': {'foo': 'bar', 'x-y': 'z'}, +}, { + 'name': 'Set', + 'unparsed': [['Set', 1, 2, 3], 1, 2, 3], + 'parsed': [1, 2, 3], +}, { + 'name': 'Map', + 'unparsed': [['Map', 1, 2], 'a', 'b'], + 'parsed': [['a', 'b']], +}, { + 'name': 'BigInt', + 'unparsed': [['BigInt', '1']], + 'parsed': 1, +}, { + 'name': 'Uint8Array', + 'unparsed': [['Uint8Array', 'AQID']], + 'parsed': [1, 2, 3], +}, { + 'name': 'ArrayBuffer', + 'unparsed': [['ArrayBuffer', 'AQID']], + 'parsed': [1, 2, 3], +}, { + 'name': 'str (repetition)', + 'unparsed': [[1, 1], 'a string'], + 'parsed': ['a string', 'a string'], +}, { + 'name': 'None (repetition)', + 'unparsed': [[1, 1], None], + 'parsed': [None, None], +}, { + 'name': 'dict (repetition)', + 'unparsed': [[1, 1], {}], + 'parsed': [{}, {}], +}, { + 'name': 'Object without prototype', + 'unparsed': [['null']], + 'parsed': {}, +}, { + 'name': 'cross-realm POJO', + 'unparsed': [{}], + 'parsed': {}, +}] + +TEST_CASES_IS = [{ + 'name': 'bool', + 'unparsed': [True], + 'parsed': True, +}, { + 'name': 'Boolean', + 'unparsed': [['Object', False]], + 'parsed': False, +}, { + 'name': 'undefined', + 'unparsed': -1, + 'parsed': None, +}, { + 'name': 'null', + 'unparsed': [None], + 'parsed': None, +}, { + 'name': 'NaN', + 'unparsed': -3, + 'parsed': math.nan, +}] + +TEST_CASES_INVALID = [{ + 'name': 'empty string', + 'unparsed': '', + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'hole', + 'unparsed': -2, + 'error': ValueError, + 'pattern': r'invalid integer input', +}, { + 'name': 'string', + 'unparsed': 'hello', + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'number', + 'unparsed': 42, + 'error': ValueError, + 'pattern': r'invalid integer input', +}, { + 'name': 'boolean', + 'unparsed': True, + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'null', + 'unparsed': None, + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'object', + 'unparsed': {}, + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'empty array', + 'unparsed': [], + 'error': ValueError, + 'pattern': r'expected a non-empty list as input', +}, { + 'name': 'Python negative indexing', + 'unparsed': [[1, 2, 3, 4, 5, 6, 7, -7], 1, 2, 3, 4, 5, 6, 7], + 'error': IndexError, + 'pattern': r'invalid index: -7', +}] + + +class TestDevalue(unittest.TestCase): + def test_devalue_parse_equals(self): + for tc in TEST_CASES_EQUALS: + self.assertEqual(devalue.parse(tc['unparsed']), tc['parsed'], tc['name']) + + def test_devalue_parse_is(self): + for tc in TEST_CASES_IS: + self.assertIs(devalue.parse(tc['unparsed']), tc['parsed'], tc['name']) + + def test_devalue_parse_invalid(self): + for tc in TEST_CASES_INVALID: + with self.assertRaisesRegex(tc['error'], tc['pattern'], msg=tc['name']): + devalue.parse(tc['unparsed']) + + def test_devalue_parse_cyclical(self): + name = 'Map (cyclical)' + result = devalue.parse([['Map', 1, 0], 'self']) + self.assertEqual(result[0][0], 'self', name) + self.assertIs(result, result[0][1], name) + + name = 'Set (cyclical)' + result = devalue.parse([['Set', 0, 1], 42]) + self.assertEqual(result[1], 42, name) + self.assertIs(result, result[0], name) + + result = devalue.parse([[0]]) + self.assertIs(result, result[0], 'Array (cyclical)') + + name = 'Object (cyclical)' + result = devalue.parse([{'self': 0}]) + self.assertIs(result, result['self'], name) + + name = 'Object with null prototype (cyclical)' + result = devalue.parse([['null', 'self', 0]]) + self.assertIs(result, result['self'], name) + + name = 'Objects (cyclical)' + result = devalue.parse([[1, 2], {'second': 2}, {'first': 1}]) + self.assertIs(result[0], result[1]['first'], name) + self.assertIs(result[1], result[0]['second'], name) + + def test_devalue_parse_revivers(self): + self.assertEqual( + devalue.parse([['indirect', 1], {'a': 2}, 'b'], revivers={'indirect': lambda x: x}), + {'a': 'b'}, 'revivers (indirect)') + + self.assertEqual( + devalue.parse([['parse', 1], '{"a":0}'], revivers={'parse': lambda x: json.loads(x)}), + {'a': 0}, 'revivers (parse)') + + +if __name__ == '__main__': + unittest.main() diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 1174bd4f5e..6058f66aea 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -101,6 +101,7 @@ from ..utils import ( xpath_with_ns, ) from ..utils._utils import _request_dump_filename +from ..utils.jslib import devalue class InfoExtractor: @@ -1795,6 +1796,63 @@ class InfoExtractor: ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) return traverse_obj(ret, traverse) or {} + def _resolve_nuxt_array(self, array, video_id, *, fatal=True, default=NO_DEFAULT): + """Resolves Nuxt rich JSON payload arrays""" + # Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57 + # https://github.com/nuxt/nuxt/pull/19205 + if default is not NO_DEFAULT: + fatal = False + + if not isinstance(array, list) or not array: + error_msg = 'Unable to resolve Nuxt JSON data: invalid input' + if fatal: + raise ExtractorError(error_msg, video_id=video_id) + elif default is NO_DEFAULT: + self.report_warning(error_msg, video_id=video_id) + return {} if default is NO_DEFAULT else default + + def indirect_reviver(data): + return data + + def json_reviver(data): + return json.loads(data) + + gen = devalue.parse_iter(array, revivers={ + 'NuxtError': indirect_reviver, + 'EmptyShallowRef': json_reviver, + 'EmptyRef': json_reviver, + 'ShallowRef': indirect_reviver, + 'ShallowReactive': indirect_reviver, + 'Ref': indirect_reviver, + 'Reactive': indirect_reviver, + }) + + while True: + try: + error_msg = f'Error resolving Nuxt JSON: {gen.send(None)}' + if fatal: + raise ExtractorError(error_msg, video_id=video_id) + elif default is NO_DEFAULT: + self.report_warning(error_msg, video_id=video_id, only_once=True) + else: + self.write_debug(f'{video_id}: {error_msg}', only_once=True) + except StopIteration as error: + return error.value or ({} if default is NO_DEFAULT else default) + + def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT): + """Parses metadata from Nuxt rich JSON payloads embedded in HTML""" + passed_default = default is not NO_DEFAULT + + array = self._search_json( + r']+\bid="__NUXT_DATA__"[^>]*>', webpage, + 'Nuxt JSON data', video_id, contains_pattern=r'\[(?s:.+)\]', + fatal=fatal, default=NO_DEFAULT if not passed_default else None) + + if not array: + return default if passed_default else {} + + return self._resolve_nuxt_array(array, video_id, fatal=fatal, default=default) + @staticmethod def _hidden_inputs(html): html = re.sub(r'', '', html) diff --git a/yt_dlp/utils/jslib/__init__.py b/yt_dlp/utils/jslib/__init__.py new file mode 100644 index 0000000000..19df08b120 --- /dev/null +++ b/yt_dlp/utils/jslib/__init__.py @@ -0,0 +1 @@ +# Utility functions for handling web input based on commonly used JavaScript libraries diff --git a/yt_dlp/utils/jslib/devalue.py b/yt_dlp/utils/jslib/devalue.py new file mode 100644 index 0000000000..d82880d921 --- /dev/null +++ b/yt_dlp/utils/jslib/devalue.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import array +import base64 +import datetime as dt +import math +import re + +from .._utils import parse_iso8601 + +TYPE_CHECKING = False +if TYPE_CHECKING: + import collections.abc + import typing + + T = typing.TypeVar('T') + + +_ARRAY_TYPE_LOOKUP = { + 'Int8Array': 'b', + 'Uint8Array': 'B', + 'Uint8ClampedArray': 'B', + 'Int16Array': 'h', + 'Uint16Array': 'H', + 'Int32Array': 'i', + 'Uint32Array': 'I', + 'Float32Array': 'f', + 'Float64Array': 'd', + 'BigInt64Array': 'l', + 'BigUint64Array': 'L', + 'ArrayBuffer': 'B', +} + + +def parse_iter(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Callable[[list], typing.Any]] | None = None): + # based on https://github.com/Rich-Harris/devalue/blob/f3fd2aa93d79f21746555671f955a897335edb1b/src/parse.js + resolved = { + -1: None, + -2: None, + -3: math.nan, + -4: math.inf, + -5: -math.inf, + -6: -0.0, + } + + if isinstance(parsed, int) and not isinstance(parsed, bool): + if parsed not in resolved or parsed == -2: + raise ValueError('invalid integer input') + return resolved[parsed] + elif not isinstance(parsed, list): + raise ValueError('expected int or list as input') + elif not parsed: + raise ValueError('expected a non-empty list as input') + + if revivers is None: + revivers = {} + return_value = [None] + stack: list[tuple] = [(return_value, 0, 0)] + + while stack: + target, index, source = stack.pop() + if isinstance(source, tuple): + name, source, reviver = source + try: + resolved[source] = target[index] = reviver(target[index]) + except Exception as error: + yield TypeError(f'failed to parse {source} as {name!r}: {error}') + resolved[source] = target[index] = None + continue + + if source in resolved: + target[index] = resolved[source] + continue + + # guard against Python negative indexing + if source < 0: + yield IndexError(f'invalid index: {source!r}') + continue + + try: + value = parsed[source] + except IndexError as error: + yield error + continue + + if isinstance(value, list): + if value and isinstance(value[0], str): + # TODO: implement zips `strict=True` + if reviver := revivers.get(value[0]): + if value[1] == source: + # XXX: avoid infinite loop + yield IndexError(f'{value[0]!r} cannot point to itself (index: {source})') + continue + # inverse order: resolve index, revive value + stack.append((target, index, (value[0], value[1], reviver))) + stack.append((target, index, value[1])) + continue + + elif value[0] == 'Date': + try: + result = dt.datetime.fromtimestamp(parse_iso8601(value[1]), tz=dt.timezone.utc) + except Exception: + yield ValueError(f'invalid date: {value[1]!r}') + result = None + + elif value[0] == 'Set': + result = [None] * (len(value) - 1) + for offset, new_source in enumerate(value[1:]): + stack.append((result, offset, new_source)) + + elif value[0] == 'Map': + result = [] + for key, new_source in zip(*(iter(value[1:]),) * 2): + pair = [None, None] + stack.append((pair, 0, key)) + stack.append((pair, 1, new_source)) + result.append(pair) + + elif value[0] == 'RegExp': + # XXX: use jsinterp to translate regex flags + # currently ignores `value[2]` + result = re.compile(value[1]) + + elif value[0] == 'Object': + result = value[1] + + elif value[0] == 'BigInt': + result = int(value[1]) + + elif value[0] == 'null': + result = {} + for key, new_source in zip(*(iter(value[1:]),) * 2): + stack.append((result, key, new_source)) + + elif value[0] in _ARRAY_TYPE_LOOKUP: + typecode = _ARRAY_TYPE_LOOKUP[value[0]] + data = base64.b64decode(value[1]) + result = array.array(typecode, data).tolist() + + else: + yield TypeError(f'invalid type at {source}: {value[0]!r}') + result = None + else: + result = len(value) * [None] + for offset, new_source in enumerate(value): + stack.append((result, offset, new_source)) + + elif isinstance(value, dict): + result = {} + for key, new_source in value.items(): + stack.append((result, key, new_source)) + + else: + result = value + + target[index] = resolved[source] = result + + return return_value[0] + + +def parse(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Callable[[typing.Any], typing.Any]] | None = None): + generator = parse_iter(parsed, revivers=revivers) + while True: + try: + raise generator.send(None) + except StopIteration as error: + return error.value