[ie/icareus] Add IcareusNext extractor, for new-style Icareus sites

2025-01-18 23:03:05 +01:00 · 2025-01-14 15:39:52 +02:00 · 2025-01-14 15:39:52 +02:00 · 22ed863fad
commit 22ed863fad
parent a3c0321825
2 changed files with 174 additions and 1 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -839,7 +839,10 @@ from .huya import (
 from .hypem import HypemIE
 from .hypergryph import MonsterSirenHypergryphMusicIE
 from .hytale import HytaleIE
-from .icareus import IcareusIE
+from .icareus import (
+    IcareusIE,
+    IcareusNextIE,
+)
 from .ichinanalive import (
    IchinanaLiveClipIE,
    IchinanaLiveIE,
--- a/yt_dlp/extractor/icareus.py
+++ b/yt_dlp/extractor/icareus.py
@ -1,11 +1,15 @@
+import json
 import re

 from .common import InfoExtractor
+from ..utils.traversal import traverse_obj
 from ..utils import (
+    ExtractorError,
    clean_html,
    determine_ext,
    get_element_by_class,
    int_or_none,
+    js_to_json,
    merge_dicts,
    parse_bitrate,
    parse_resolution,
@ -177,3 +181,169 @@ class IcareusIE(InfoExtractor):
            'description': clean_html(info.get('description')),
            'thumbnails': thumbnails if thumbnails[0]['url'] else None,
        }, info)
+
+
+class IcareusNextIE(InfoExtractor):
+    """Extractor for new-style Icareus pages and embedded players.
+
+    Playback metadata is read from the ``self.__next_f.push(...)`` script
+    payloads embedded in the downloaded page.
+    """
+
+    _DOMAINS = '|'.join(
+        re.escape(domain)
+        for domain in (
+            'players.icareus.com',
+            'helsinkikanava.fi',
+        )
+    )
+    _VALID_URL = (
+        # Video/event detail pages on any of the known domains
+        rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/(?P<language>.+?)/(?:video|event)/details/(?P<id>\d+)',
+        # Embedded player; dots are escaped so they only match a literal "."
+        r'https?://players\.icareus\.com/(?P<brand>.+?)/embed/vod/(?P<id>\d+)',
+    )
+    _TESTS = [
+        {  # Regular VOD
+            'url': 'https://www.helsinkikanava.fi/fi/video/details/68021894',
+            'md5': '3e048a91cd6be16d34b98a1548ceed27',
+            'info_dict': {
+                'id': '68021894',
+                'ext': 'mp4',
+                'title': 'Perheiden parhaaksi',
+                'description': 'md5:fe4e4ec742a34f53022f3a0409b0f6e7',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=68021900',
+            },
+        },
+        {  # Recorded livestream
+            'url': 'https://www.helsinkikanava.fi/fi/event/details/76241489',
+            'md5': 'a063a7ef36969ced44af9fe3d10a7f47',
+            'info_dict': {
+                'id': '76241489',
+                'ext': 'mp4',
+                'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020',
+                'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=76288630',
+            },
+        },
+        {  # Embedded player
+            'url': 'https://players.icareus.com/elonet/embed/vod/256250758',
+            'md5': '420616d561582b9491f0a622b1a3d831',
+            'info_dict': {
+                'id': '256250758',
+                'ext': 'mp4',
+                'title': 'Shell Hurriganes',
+                'description': 'Shell Hurriganes',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=266941624',
+            },
+        },
+    ]
+
+    def _is_playback_data_dict(self, element, display_id):
+        """Return True if element is a dict with 'src' and 'videoInfo' keys whose 'id' equals display_id."""
+        return (
+            isinstance(element, dict)
+            and 'src' in element
+            and 'videoInfo' in element
+            and str_or_none(element.get('id')) == str(display_id)
+        )
+
+    def _find_playback_data(self, webpage: str, display_id: str):
+        """Return the playback data dict for display_id found in webpage, or None if there is none."""
+        # Adapted from Goplay
+        # Every pushed array is converted with js_to_json and parsed; each of its items
+        # is then searched for a JSON array that follows a "<word>:" prefix.
+        nextjs_data = traverse_obj(
+            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?])\s*\);?\s*</script>', webpage),
+            (
+                ...,
+                {js_to_json},
+                {json.loads},
+                ...,
+                {
+                    lambda s: self._search_json(
+                        r'\w+\s*:\s*',
+                        s,
+                        'next js data',
+                        None,
+                        contains_pattern=r'\[(?s:.+)\]',
+                        default=None,
+                    ),
+                },
+                ...,
+            ),
+        )
+
+        for element in nextjs_data:
+            if self._is_playback_data_dict(element, display_id):
+                return element
+
+        # If the playback data is not found in the first pass, try to find it in the children of the RSC data
+        for element in traverse_obj(nextjs_data, (..., 'children', ...)):
+            if self._is_playback_data_dict(element, display_id):
+                return element
+
+        return None
+
+    def _real_extract(self, url):
+        """Download the page at url and build the info dict from its embedded playback data.
+
+        Raises ExtractorError when no matching playback data is found in the page.
+        """
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        playback_data = self._find_playback_data(webpage, display_id)
+        if playback_data is None:
+            raise ExtractorError('No playback data found', expected=True, video_id=display_id)
+        video_id = str(playback_data['id'])
+        # The key is guaranteed by _is_playback_data_dict, but its value may still be null
+        video_info = playback_data['videoInfo'] or {}
+
+        subtitles = {}
+        for sub_info in video_info.get('subtitles') or []:
+            # Entries are unpacked positionally as (<unused>, description, URL);
+            # malformed entries are skipped instead of aborting the extraction
+            if not isinstance(sub_info, (list, tuple)) or len(sub_info) < 3:
+                continue
+            _, sdesc, surl = sub_info[:3]
+            sub_url = url_or_none(surl)
+            if not sub_url or not isinstance(sdesc, str):
+                continue
+            # The first word of the description, minus a trailing colon, is the subtitle key
+            sub_name = remove_end(sdesc.split(' ')[0], ':')
+            # Append rather than assign so tracks sharing a key do not overwrite each other
+            subtitles.setdefault(sub_name, []).append({'url': sub_url})
+
+        formats = []
+        for audio_url_datum in video_info.get('audio_urls') or []:
+            # Validate like the video URLs below so malformed values never become formats
+            audio_url = url_or_none(audio_url_datum.get('url'))
+            if audio_url is None:
+                continue
+            formats.append({
+                'format': audio_url_datum.get('name'),
+                'format_id': 'audio',
+                'vcodec': 'none',
+                'url': audio_url,
+                'tbr': None,
+            })
+
+        for url_datum in video_info.get('urls') or []:
+            video_url = url_or_none(url_datum.get('url'))
+            if video_url is None:
+                continue
+            if determine_ext(video_url) != 'm3u8':
+                continue  # TODO: unsupported for now, no examples of this
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                video_url,
+                video_id,
+                'mp4',
+                m3u8_id='hls',
+                fatal=False,
+            )
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+
+        # This is weird, but it's the more robust way to find the video file URL for now
+        if m := re.search(r'\{\\"videoFileUrl\\":\\"(http.+?)\\"', webpage):
+            # The match is the inside of an escaped JSON string; decode it as one.
+            # This format is best-effort, so an undecodable value is simply ignored.
+            try:
+                video_file_url = url_or_none(json.loads(f'"{m.group(1)}"'))
+            except json.JSONDecodeError:
+                video_file_url = None
+            if video_file_url:
+                formats.append({
+                    'url': video_file_url,
+                    'format_id': 'download',
+                })
+
+        thumbnails = []
+        if thumbnail := url_or_none(video_info.get('thumbnail')):
+            thumbnails.append({'url': thumbnail})
+
+        description = clean_html(self._html_search_meta(['description'], webpage))
+        title = clean_html(self._html_search_meta(['og:title'], webpage))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': description,
+            'thumbnails': thumbnails or None,
+        }