class IcareusNextIE(InfoExtractor):
    """Extract videos from pages that ship playback data in Next.js script payloads.

    The page is downloaded, every ``self.__next_f.push([...])`` script payload
    is decoded, and the dict describing the requested video (it carries the
    ``src``, ``videoInfo`` and ``id`` keys) is used to build formats,
    subtitles and thumbnails.
    """

    _DOMAINS = '|'.join(
        re.escape(domain)
        for domain in (
            'players.icareus.com',
            'helsinkikanava.fi',
        )
    )
    # NOTE(review): only the `id` group is read by this class (via _match_id);
    # the other group names are descriptive reconstructions - confirm them
    # against any external consumer of these patterns.
    _VALID_URL = (
        rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/(?P<language>.+?)/(video|event)/details/(?P<id>\d+)',
        r'https?://players\.icareus\.com/(?P<organization>.+?)/embed/vod/(?P<id>\d+)',
    )
    _TESTS = [
        {  # Regular VOD
            'url': 'https://www.helsinkikanava.fi/fi/video/details/68021894',
            'md5': '3e048a91cd6be16d34b98a1548ceed27',
            'info_dict': {
                'id': '68021894',
                'ext': 'mp4',
                'title': 'Perheiden parhaaksi',
                'description': 'md5:fe4e4ec742a34f53022f3a0409b0f6e7',
                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=68021900',
            },
        },
        {  # Recorded livestream
            'url': 'https://www.helsinkikanava.fi/fi/event/details/76241489',
            'md5': 'a063a7ef36969ced44af9fe3d10a7f47',
            'info_dict': {
                'id': '76241489',
                'ext': 'mp4',
                'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020',
                'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c',
                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=76288630',
            },
        },
        {  # Embedded player
            'url': 'https://players.icareus.com/elonet/embed/vod/256250758',
            'md5': '420616d561582b9491f0a622b1a3d831',
            'info_dict': {
                'id': '256250758',
                'ext': 'mp4',
                'title': 'Shell Hurriganes',
                'description': 'Shell Hurriganes',
                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=266941624',
            },
        },
    ]

    def _is_playback_data_dict(self, element, display_id):
        """Return True if *element* is the playback-data dict for *display_id*.

        A match is a dict that has both the ``src`` and ``videoInfo`` keys and
        whose ``id`` (compared as a string) equals *display_id*.
        """
        if not isinstance(element, dict):
            return False
        if 'src' not in element or 'videoInfo' not in element:
            return False
        element_id = element.get('id')
        # A missing or None id can never match a real display id
        return element_id is not None and str(element_id) == str(display_id)

    def _find_playback_data(self, webpage: str, display_id: str):
        """Locate the playback-data dict for *display_id* in *webpage*.

        Returns the matching dict, or None when no script payload contains one.
        """
        # Adapted from Goplay
        nextjs_data = traverse_obj(
            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?])\s*\);?\s*</script>', webpage),
            (
                ...,
                {js_to_json},
                {json.loads},
                ...,
                {
                    lambda s: self._search_json(
                        r'\w+\s*:\s*',
                        s,
                        'next js data',
                        None,
                        contains_pattern=r'\[(?s:.+)\]',
                        default=None,
                    ),
                },
                ...,
            ),
        )

        # First pass: the decoded top-level elements themselves
        for element in nextjs_data:
            if self._is_playback_data_dict(element, display_id):
                return element

        # If the playback data is not found in the first pass, try to find it in the children of the RSC data
        for element in traverse_obj(nextjs_data, (..., 'children', ...)):
            if self._is_playback_data_dict(element, display_id):
                return element

        return None

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        Raises ExtractorError (expected) when the page carries no playback
        data for the requested id.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        playback_data = self._find_playback_data(webpage, display_id)
        if playback_data is None:
            raise ExtractorError('No playback data found', expected=True, video_id=display_id)
        video_id = str(playback_data['id'])
        video_info = playback_data['videoInfo']
        if not isinstance(video_info, dict):
            # Tolerate a null/malformed videoInfo instead of raising AttributeError
            video_info = {}

        subtitles = {}
        for sub_info in video_info.get('subtitles') or []:
            # Each entry is unpacked as (ignored, description, url); skip anything else
            if not isinstance(sub_info, (list, tuple)) or len(sub_info) < 3:
                continue
            _, sdesc, surl = sub_info[:3]
            sub_url = url_or_none(surl)
            if not sub_url or not isinstance(sdesc, str):
                continue
            # The subtitle key is the first word of the description, minus a trailing colon
            sub_name = remove_end(sdesc.split(' ')[0], ':')
            subtitles.setdefault(sub_name, []).append({'url': sub_url})

        formats = []
        for audio_url_datum in video_info.get('audio_urls') or []:
            audio_url = url_or_none(audio_url_datum.get('url'))
            if audio_url is None:
                continue
            formats.append(
                {
                    'format': audio_url_datum.get('name'),
                    'format_id': 'audio',
                    'vcodec': 'none',
                    'url': audio_url,
                },
            )

        for url_datum in video_info.get('urls') or []:
            video_url = url_or_none(url_datum.get('url'))
            if video_url is None:
                continue
            if determine_ext(video_url) != 'm3u8':
                continue  # TODO: unsupported for now, no examples of this
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                video_url,
                video_id,
                'mp4',
                m3u8_id='hls',
                fatal=False,
            )
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        # This is weird, but it's the more robust way to find the video file URL for now
        if m := re.search(r'\{\\"videoFileUrl\\":\\"(http.+?)\\"', webpage):
            try:
                # The match is still JSON-string-escaped; decode it as a JSON string literal
                decoded_url = json.loads(f'"{m.group(1)}"')
            except json.JSONDecodeError:
                decoded_url = None
            if video_file_url := url_or_none(decoded_url):
                formats.append(
                    {
                        'url': video_file_url,
                        'format_id': 'download',
                    },
                )

        thumbnails = []
        if thumbnail := url_or_none(video_info.get('thumbnail')):
            thumbnails.append({'url': thumbnail})

        description = clean_html(self._html_search_meta(['description'], webpage))
        title = clean_html(self._html_search_meta(['og:title'], webpage))

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnails': thumbnails or None,
        }