[ie] Improve JSON LD thumbnails extraction (#13368)

Authored by: bashonly, doe1080

Co-authored-by: doe1080 <98906116+doe1080@users.noreply.github.com>
This commit is contained in:
bashonly 2025-06-01 18:09:47 -05:00 committed by GitHub
parent 943083edcd
commit 85c8a405e3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 17 additions and 3 deletions

View File

@@ -314,6 +314,20 @@ class TestInfoExtractor(unittest.TestCase):
}, },
{}, {},
), ),
(
# test thumbnail_url key without URL scheme
r'''
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "VideoObject",
"thumbnail_url": "//www.nobelprize.org/images/12693-landscape-medium-gallery.jpg"
}</script>''',
{
'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}],
},
{},
),
] ]
for html, expected_dict, search_json_ld_kwargs in _TESTS: for html, expected_dict, search_json_ld_kwargs in _TESTS:
expect_dict( expect_dict(

View File

@@ -1675,9 +1675,9 @@ class InfoExtractor:
'ext': mimetype2ext(e.get('encodingFormat')), 'ext': mimetype2ext(e.get('encodingFormat')),
'title': unescapeHTML(e.get('name')), 'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')), 'description': unescapeHTML(e.get('description')),
'thumbnails': [{'url': unescapeHTML(url)} 'thumbnails': traverse_obj(e, (('thumbnailUrl', 'thumbnailURL', 'thumbnail_url'), (None, ...), {
for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}),
if url_or_none(url)], })),
'duration': parse_duration(e.get('duration')), 'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')), 'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types. # author can be an instance of 'Organization' or 'Person' types.