From 85c8a405e3651dc041b758f4744d4fb3c4c55e01 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Jun 2025 18:09:47 -0500 Subject: [PATCH] [ie] Improve JSON LD thumbnails extraction (#13368) Authored by: bashonly, doe1080 Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com> --- test/test_InfoExtractor.py | 14 ++++++++++++++ yt_dlp/extractor/common.py | 6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index c6ff6209a8..bc89b2955e 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -314,6 +314,20 @@ class TestInfoExtractor(unittest.TestCase): }, {}, ), + ( + # test thumbnail_url key without URL scheme + r''' +''', + { + 'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}], + }, + {}, + ), ] for html, expected_dict, search_json_ld_kwargs in _TESTS: expect_dict( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d5607296df..1174bd4f5e 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1675,9 +1675,9 @@ class InfoExtractor: 'ext': mimetype2ext(e.get('encodingFormat')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnails': [{'url': unescapeHTML(url)} - for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) - if url_or_none(url)], + 'thumbnails': traverse_obj(e, (('thumbnailUrl', 'thumbnailURL', 'thumbnail_url'), (None, ...), { + 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), + })), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types.