mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-01-18 23:03:05 +01:00
[ie/icareus] Add IcareusNext extractor, for new-style Icareus sites
This commit is contained in:
parent
a3c0321825
commit
22ed863fad
2 changed files with 174 additions and 1 deletions
|
@ -839,7 +839,10 @@ from .huya import (
|
|||
from .hypem import HypemIE
|
||||
from .hypergryph import MonsterSirenHypergryphMusicIE
|
||||
from .hytale import HytaleIE
|
||||
from .icareus import IcareusIE
|
||||
from .icareus import (
|
||||
IcareusIE,
|
||||
IcareusNextIE,
|
||||
)
|
||||
from .ichinanalive import (
|
||||
IchinanaLiveClipIE,
|
||||
IchinanaLiveIE,
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
import json
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .. import traverse_obj
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
clean_html,
|
||||
determine_ext,
|
||||
get_element_by_class,
|
||||
int_or_none,
|
||||
js_to_json,
|
||||
merge_dicts,
|
||||
parse_bitrate,
|
||||
parse_resolution,
|
||||
|
@ -177,3 +181,169 @@ class IcareusIE(InfoExtractor):
|
|||
'description': clean_html(info.get('description')),
|
||||
'thumbnails': thumbnails if thumbnails[0]['url'] else None,
|
||||
}, info)
|
||||
|
||||
|
||||
class IcareusNextIE(InfoExtractor):
    """Extractor for new-style Icareus sites and the embeddable Icareus player.

    The playback metadata is read from the Next.js `self.__next_f.push(...)`
    script payloads embedded in the page rather than from a separate API call.
    """

    # Hosts served by the new-style Icareus frontend, escaped for use in a regex
    _DOMAINS = '|'.join(
        re.escape(domain)
        for domain in (
            'players.icareus.com',
            'helsinkikanava.fi',
        )
    )
    _VALID_URL = (
        # Regular site pages: /<language>/video/details/<id> and /<language>/event/details/<id>
        rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/(?P<language>.+?)/(?:video|event)/details/(?P<id>\d+)',
        # Embedded player: /<brand>/embed/vod/<id>
        r'https?://players\.icareus\.com/(?P<brand>.+?)/embed/vod/(?P<id>\d+)',
    )
    _TESTS = [
        {  # Regular VOD
            'url': 'https://www.helsinkikanava.fi/fi/video/details/68021894',
            'md5': '3e048a91cd6be16d34b98a1548ceed27',
            'info_dict': {
                'id': '68021894',
                'ext': 'mp4',
                'title': 'Perheiden parhaaksi',
                'description': 'md5:fe4e4ec742a34f53022f3a0409b0f6e7',
                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=68021900',
            },
        },
        {  # Recorded livestream
            'url': 'https://www.helsinkikanava.fi/fi/event/details/76241489',
            'md5': 'a063a7ef36969ced44af9fe3d10a7f47',
            'info_dict': {
                'id': '76241489',
                'ext': 'mp4',
                'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020',
                'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c',
                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=76288630',
            },
        },
        {  # Embedded player
            'url': 'https://players.icareus.com/elonet/embed/vod/256250758',
            'md5': '420616d561582b9491f0a622b1a3d831',
            'info_dict': {
                'id': '256250758',
                'ext': 'mp4',
                'title': 'Shell Hurriganes',
                'description': 'Shell Hurriganes',
                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=266941624',
            },
        },
    ]

    def _is_playback_data_dict(self, element, display_id):
        """Return True if `element` is a dict with 'src' and 'videoInfo' keys whose 'id' equals `display_id`."""
        return (
            isinstance(element, dict)
            and 'src' in element
            and 'videoInfo' in element
            and str_or_none(element.get('id')) == str(display_id)
        )

    def _find_playback_data(self, webpage: str, display_id: str):
        """Locate the playback data dict for `display_id` in the page's Next.js payload.

        Returns the first matching dict, or None if no element matches.
        """
        # Adapted from Goplay
        # Each `self.__next_f.push([...])` script argument is parsed as JSON; every
        # item of the parsed array is then searched for a "<key>: [...]" JSON array,
        # and the items of all such arrays are flattened into one list
        nextjs_data = traverse_obj(
            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?])\s*\);?\s*</script>', webpage),
            (
                ...,
                {js_to_json},
                {json.loads},
                ...,
                {
                    lambda s: self._search_json(
                        r'\w+\s*:\s*',
                        s,
                        'next js data',
                        None,
                        contains_pattern=r'\[(?s:.+)\]',
                        default=None,
                    ),
                },
                ...,
            ),
        )

        # Top-level elements are checked first; if none matches, fall back to the
        # values found one level down under a 'children' key of the RSC data
        candidates = (
            *nextjs_data,
            *traverse_obj(nextjs_data, (..., 'children', ...)),
        )
        return next(
            (element for element in candidates if self._is_playback_data_dict(element, display_id)),
            None,
        )

    def _real_extract(self, url):
        """Download the page at `url` and build the info dict from its embedded playback data.

        Raises ExtractorError (expected) when the page contains no playback data
        for the ID taken from the URL.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        playback_data = self._find_playback_data(webpage, display_id)
        if playback_data is None:
            raise ExtractorError('No playback data found', expected=True, video_id=display_id)
        # Both keys are guaranteed to exist by _is_playback_data_dict
        video_id = str(playback_data['id'])
        video_info = playback_data['videoInfo']

        subtitles = {}
        for sub_info in video_info.get('subtitles') or []:
            # Only the second (description) and third (URL) items of an entry are used;
            # malformed entries are skipped instead of aborting the whole extraction
            if not isinstance(sub_info, (list, tuple)) or len(sub_info) < 3:
                continue
            sub_desc, sub_url = sub_info[1], url_or_none(sub_info[2])
            if not sub_url or not isinstance(sub_desc, str):
                continue
            # The subtitle key is the first word of the description without a trailing colon
            sub_name = remove_end(sub_desc.split(' ')[0], ':')
            subtitles[sub_name] = [{'url': sub_url}]

        formats = []
        for audio_url_datum in video_info.get('audio_urls') or []:
            # Validate audio URLs the same way as the video URLs below
            audio_url = url_or_none(audio_url_datum.get('url'))
            if not audio_url:
                continue
            formats.append({
                'format': audio_url_datum.get('name'),
                'format_id': 'audio',
                'vcodec': 'none',
                'url': audio_url,
            })

        for url_datum in video_info.get('urls') or []:
            video_url = url_or_none(url_datum.get('url'))
            if video_url is None:
                continue
            # TODO: non-m3u8 URLs are unsupported for now, no examples of this
            if determine_ext(video_url) != 'm3u8':
                continue
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                video_url,
                video_id,
                'mp4',
                m3u8_id='hls',
                fatal=False,
            )
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        # This is weird, but it's the more robust way to find the video file URL for now
        if m := re.search(r'\{\\"videoFileUrl\\":\\"(http.+?)\\"', webpage):
            # The match comes from an escaped JSON string, so decode it as a JSON
            # string literal; this format is best-effort and is skipped when undecodable
            try:
                video_file_url = url_or_none(json.loads(f'"{m.group(1)}"'))
            except json.JSONDecodeError:
                video_file_url = None
            if video_file_url:
                formats.append({
                    'url': video_file_url,
                    'format_id': 'download',
                })

        thumbnails = []
        if thumbnail := url_or_none(video_info.get('thumbnail')):
            thumbnails.append({'url': thumbnail})

        description = clean_html(self._html_search_meta(['description'], webpage))
        title = clean_html(self._html_search_meta(['og:title'], webpage))

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnails': thumbnails or None,
        }
|
||||
|
|
Loading…
Add table
Reference in a new issue