1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-01-31 04:22:25 +01:00

[ie/globo] Fix extractor (#11795)

Closes #9512, Closes #11541, Closes #11772
Authored by: slipinthedove, YoshiTabletopGamer

Co-authored-by: YoshiTabletopGamer <88633614+YoshiTabletopGamer@users.noreply.github.com>
This commit is contained in:
dove 2025-01-29 20:55:40 -03:00 committed by GitHub
parent d59f14a0a7
commit f8d0161455
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,32 +1,48 @@
import base64
import hashlib
import json import json
import random
import re import re
import uuid
from .common import InfoExtractor from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import ( from ..utils import (
ExtractorError, determine_ext,
filter_dict,
float_or_none, float_or_none,
int_or_none,
orderedSet, orderedSet,
str_or_none, str_or_none,
try_get, try_get,
url_or_none,
) )
from ..utils.traversal import subs_list_to_dict, traverse_obj
class GloboIE(InfoExtractor): class GloboIE(InfoExtractor):
_VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' _VALID_URL = r'(?:globo:|https?://[^/?#]+?\.globo\.com/(?:[^/?#]+/))(?P<id>\d{7,})'
_NETRC_MACHINE = 'globo' _NETRC_MACHINE = 'globo'
_VIDEO_VIEW = '''
query getVideoView($videoId: ID!) {
video(id: $videoId) {
duration
description
relatedEpisodeNumber
relatedSeasonNumber
headline
title {
originProgramId
headline
}
}
}
'''
_TESTS = [{ _TESTS = [{
'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'url': 'https://globoplay.globo.com/v/3607726/',
'info_dict': { 'info_dict': {
'id': '3607726', 'id': '3607726',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
'duration': 103.204, 'duration': 103.204,
'uploader': 'G1', 'uploader': 'G1 ao vivo',
'uploader_id': '2015', 'uploader_id': '4209',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -38,39 +54,36 @@ class GloboIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
'duration': 137.973, 'duration': 137.973,
'uploader': 'Rede Globo', 'uploader': 'Bom Dia Brasil',
'uploader_id': '196', 'uploader_id': '810',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, {
'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
'only_matching': True,
}, {
'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
'only_matching': True,
}, {
'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
'only_matching': True,
}, {
'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
'only_matching': True,
}, {
'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
'only_matching': True,
}, { }, {
'url': 'globo:3607726', 'url': 'globo:3607726',
'only_matching': True, 'only_matching': True,
}, { },
'url': 'https://globoplay.globo.com/v/10248083/', {
'url': 'globo:8013907', # needs subscription to globoplay
'info_dict': { 'info_dict': {
'id': '10248083', 'id': '8013907',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Melhores momentos: Equador 1 x 1 Brasil pelas Eliminatórias da Copa do Mundo 2022', 'title': 'Capítulo de 14081989',
'duration': 530.964, 'episode_number': 1,
'uploader': 'SporTV', },
'uploader_id': '698', 'params': {
'skip_download': True,
},
},
{
'url': 'globo:12824146',
'info_dict': {
'id': '12824146',
'ext': 'mp4',
'title': 'Acordo de damas',
'episode_number': 1,
'season_number': 2,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -80,98 +93,70 @@ class GloboIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
self._request_webpage( info = self._download_json(
HEADRequest('https://globo-ab.globo.com/v2/selected-alternatives?experiments=player-isolated-experiment-02&skipImpressions=true'), 'https://cloud-jarvis.globo.com/graphql', video_id,
video_id, 'Getting cookies') query={
'operationName': 'getVideoView',
video = self._download_json( 'variables': json.dumps({'videoId': video_id}),
f'http://api.globovideos.com/videos/{video_id}/playlist', 'query': self._VIDEO_VIEW,
video_id)['videos'][0] }, headers={
if not self.get_param('allow_unplayable_formats') and video.get('encrypted') is True: 'content-type': 'application/json',
self.report_drm(video_id) 'x-platform-id': 'web',
'x-device-id': 'desktop',
title = video['title'] 'x-client-version': '2024.12-5',
})['data']['video']
formats = [] formats = []
security = self._download_json( video = self._download_json(
'https://playback.video.globo.com/v2/video-session', video_id, f'Downloading security hash for {video_id}', 'https://playback.video.globo.com/v4/video-session', video_id,
headers={'content-type': 'application/json'}, data=json.dumps({ f'Downloading resource info for {video_id}',
'player_type': 'desktop', headers={'Content-Type': 'application/json'},
data=json.dumps(filter_dict({
'player_type': 'mirakulo_8k_hdr',
'video_id': video_id, 'video_id': video_id,
'quality': 'max', 'quality': 'max',
'content_protection': 'widevine', 'content_protection': 'widevine',
'vsid': '581b986b-4c40-71f0-5a58-803e579d5fa2', 'vsid': f'{uuid.uuid4()}',
'tz': '-3.0:00', 'consumption': 'streaming',
}).encode()) 'capabilities': {'low_latency': True},
'tz': '-03:00',
'Authorization': try_get(self._get_cookies('https://globo.com'),
lambda x: f'Bearer {x["GLBID"].value}'),
'version': 1,
})).encode())
self._request_webpage(HEADRequest(security['sources'][0]['url_template']), video_id, 'Getting locksession cookie') if traverse_obj(video, ('resource', 'drm_protection_enabled', {bool})):
self.report_drm(video_id)
security_hash = security['sources'][0]['token'] main_source = video['sources'][0]
if not security_hash:
message = security.get('message')
if message:
raise ExtractorError(
f'{self.IE_NAME} returned error: {message}', expected=True)
hash_code = security_hash[:2] # 4k streams are exclusively outputted in dash, so we need to filter these out
padding = '%010d' % random.randint(1, 10000000000) if determine_ext(main_source['url']) == 'mpd':
if hash_code in ('04', '14'): formats, subtitles = self._extract_mpd_formats_and_subtitles(main_source['url'], video_id, mpd_id='dash')
received_time = security_hash[3:13] else:
received_md5 = security_hash[24:] formats, subtitles = self._extract_m3u8_formats_and_subtitles(
hash_prefix = security_hash[:23] main_source['url'], video_id, 'mp4', m3u8_id='hls')
elif hash_code in ('02', '12', '03', '13'): self._merge_subtitles(traverse_obj(main_source, ('text', ..., {
received_time = security_hash[2:12] 'url': ('subtitle', 'srt', 'url', {url_or_none}),
received_md5 = security_hash[22:] }, all, {subs_list_to_dict(lang='en')})), target=subtitles)
padding += '1'
hash_prefix = '05' + security_hash[:22]
padded_sign_time = str(int(received_time) + 86400) + padding
md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
signed_hash = hash_prefix + padded_sign_time + signed_md5
source = security['sources'][0]['url_parts']
resource_url = source['scheme'] + '://' + source['domain'] + source['path']
signed_url = '{}?h={}&k=html5&a={}'.format(resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
formats.extend(fmts)
for resource in video['resources']:
if resource.get('type') == 'subtitle':
subtitles.setdefault(resource.get('language') or 'por', []).append({
'url': resource.get('url'),
})
subs = try_get(security, lambda x: x['source']['subtitles'], expected_type=dict) or {}
for sub_lang, sub_url in subs.items():
if sub_url:
subtitles.setdefault(sub_lang or 'por', []).append({
'url': sub_url,
})
subs = try_get(security, lambda x: x['source']['subtitles_webvtt'], expected_type=dict) or {}
for sub_lang, sub_url in subs.items():
if sub_url:
subtitles.setdefault(sub_lang or 'por', []).append({
'url': sub_url,
})
duration = float_or_none(video.get('duration'), 1000)
uploader = video.get('channel')
uploader_id = str_or_none(video.get('channel_id'))
return { return {
'id': video_id, 'id': video_id,
'title': title, **traverse_obj(info, {
'duration': duration, 'title': ('headline', {str}),
'uploader': uploader, 'duration': ('duration', {float_or_none(scale=1000)}),
'uploader_id': uploader_id, 'uploader': ('title', 'headline', {str}),
'uploader_id': ('title', 'originProgramId', {str_or_none}),
'episode_number': ('relatedEpisodeNumber', {int_or_none}),
'season_number': ('relatedSeasonNumber', {int_or_none}),
}),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }
class GloboArticleIE(InfoExtractor): class GloboArticleIE(InfoExtractor):
_VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?' _VALID_URL = r'https?://(?!globoplay).+?\.globo\.com/(?:[^/?#]+/)*(?P<id>[^/?#.]+)(?:\.html)?'
_VIDEOID_REGEXES = [ _VIDEOID_REGEXES = [
r'\bdata-video-id=["\'](\d{7,})["\']', r'\bdata-video-id=["\'](\d{7,})["\']',