1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-01-31 12:32:27 +01:00

Fix SenateISVPIE

This commit is contained in:
sepro 2025-01-15 19:20:03 +01:00
parent ebce89e107
commit 6b883f9d8c

View file

@ -5,9 +5,12 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
parse_qs, parse_qs,
remove_end,
unsmuggle_url, unsmuggle_url,
) )
class SenateBaseIE(InfoExtractor):
_COMMITTEES = { _COMMITTEES = {
'ag': ('76440', 'https://ag-f.akamaihd.net', '2036803', 'agriculture'), 'ag': ('76440', 'https://ag-f.akamaihd.net', '2036803', 'agriculture'),
'aging': ('76442', 'https://aging-f.akamaihd.net', '2036801', 'aging'), 'aging': ('76442', 'https://aging-f.akamaihd.net', '2036801', 'aging'),
@ -42,8 +45,21 @@ _COMMITTEES = {
'vetaff': ('76462', 'https://vetaff-f.akamaihd.net', '2036785', 'veteransaffairs'), 'vetaff': ('76462', 'https://vetaff-f.akamaihd.net', '2036785', 'veteransaffairs'),
} }
def _extract_formats(self, commitee, filename, video_id):
    """Return the HLS formats for a committee recording.

    Candidate master playlists are probed one after another and the formats
    of the first playlist that yields any are returned.

    Parameters:
        commitee: key into ``self._COMMITTEES`` (the ``comm`` query value).
            The spelling of the parameter name is kept for compatibility.
        filename: the ``filename`` query value; it may or may not carry a
            trailing ``.mp4``.
        video_id: identifier passed through to ``_extract_m3u8_formats``.

    Returns the result of the first successful ``_extract_m3u8_formats`` call,
    or the (empty) result of the last attempt when every candidate fails.

    Raises KeyError if ``commitee`` is not a key of ``self._COMMITTEES``.
    """
    stream_num, stream_domain, stream_id, msl3 = self._COMMITTEES[commitee]

    # The name is tried exactly as given first, so every lookup that already
    # worked behaves the same. If it ends in '.mp4', the templates below would
    # build '<name>.mp4_1' and '<name>.mp4.mp4', so the bare name is tried
    # afterwards as a fallback.
    suffix = '.mp4'
    names = [filename]
    if filename.endswith(suffix) and len(filename) > len(suffix):
        names.append(filename[:-len(suffix)])

    formats = []
    for name in names:
        candidates = (
            f'https://www-senate-gov-media-srs.akamaized.net/hls/live/{stream_id}/{commitee}/{name}/master.m3u8',
            f'https://www-senate-gov-msl3archive.akamaized.net/{msl3}/{name}_1/master.m3u8',
            f'{stream_domain}/i/{name}_1@{stream_num}/master.m3u8',
            f'{stream_domain}/i/{name}.mp4/master.m3u8',
        )
        for m3u8_url in candidates:
            # fatal=False: a dead candidate must not abort the remaining ones
            formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', fatal=False)
            if formats:
                return formats
    return formats
class SenateISVPIE(SenateBaseIE):
_IE_NAME = 'senate.gov:isvp' _IE_NAME = 'senate.gov:isvp'
_VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
_EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"] _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"]
@ -53,13 +69,14 @@ class SenateISVPIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'judiciary031715', 'id': 'judiciary031715',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Integrated Senate Video Player', 'title': 'ISVP',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Failed to download m3u8 information'],
}, { }, {
'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
'info_dict': { 'info_dict': {
@ -71,14 +88,16 @@ class SenateISVPIE(InfoExtractor):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'skip': 'This video is not available.',
}, { }, {
'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
# checksum differs each time # checksum differs each time
'info_dict': { 'info_dict': {
'id': 'intel090613', 'id': 'intel090613',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Integrated Senate Video Player', 'title': 'ISVP',
}, },
'expected_warnings': ['Failed to download m3u8 information'],
}, { }, {
# From http://www.c-span.org/video/?96791-1 # From http://www.c-span.org/video/?96791-1
'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
@ -91,8 +110,8 @@ class SenateISVPIE(InfoExtractor):
qs = urllib.parse.parse_qs(self._match_valid_url(url).group('qs')) qs = urllib.parse.parse_qs(self._match_valid_url(url).group('qs'))
if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
raise ExtractorError('Invalid URL', expected=True) raise ExtractorError('Invalid URL', expected=True)
filename = qs['filename'][0]
video_id = re.sub(r'.mp4$', '', qs['filename'][0]) video_id = remove_end(filename, '.mp4')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
@ -103,40 +122,17 @@ class SenateISVPIE(InfoExtractor):
poster = qs.get('poster') poster = qs.get('poster')
thumbnail = poster[0] if poster else None thumbnail = poster[0] if poster else None
video_type = qs['type'][0] committee = qs['comm'][0]
committee = video_type if video_type == 'arch' else qs['comm'][0]
stream_num, domain = _COMMITTEES[committee]
formats = []
if video_type == 'arch':
filename = video_id if '.' in video_id else video_id + '.mp4'
m3u8_url = urllib.parse.urljoin(domain, 'i/' + filename + '/master.m3u8')
formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8')
else:
hdcore_sign = 'hdcore=3.1.0'
url_params = (domain, video_id, stream_num)
f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params
m3u8_url = '{}/i/{}_1@{}/master.m3u8'.format(*url_params)
for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
# URLs without the extra param induce an 404 error
entry.update({'extra_param_to_segment_url': hdcore_sign})
formats.append(entry)
for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url'])
if mobj:
entry['format_id'] += mobj.group('tag')
formats.append(entry)
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'formats': formats, 'formats': self._extract_formats(committee, filename, video_id),
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }
class SenateGovIE(InfoExtractor): class SenateGovIE(SenateBaseIE):
_IE_NAME = 'senate.gov' _IE_NAME = 'senate.gov'
_VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov' _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov'
_TESTS = [{ _TESTS = [{
@ -147,6 +143,8 @@ class SenateGovIE(InfoExtractor):
'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health', 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health',
'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions', 'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions',
'ext': 'mp4', 'ext': 'mp4',
'age_limit': 0,
'thumbnail': 'https://www.help.senate.gov/assets/images/sharelogo.jpg',
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, { }, {
@ -156,8 +154,10 @@ class SenateGovIE(InfoExtractor):
'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD',
'title': 'Review of the FY2019 Budget Request for the U.S. Army', 'title': 'Review of the FY2019 Budget Request for the U.S. Army',
'ext': 'mp4', 'ext': 'mp4',
'age_limit': 0,
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
'expected_warnings': ['Failed to download m3u8 information'],
}, { }, {
'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization',
'info_dict': { 'info_dict': {
@ -166,6 +166,8 @@ class SenateGovIE(InfoExtractor):
'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization', 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization',
'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs', 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs',
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': 'https://www.banking.senate.gov/themes/banking/images/sharelogo.jpg',
'age_limit': 0,
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}] }]
@ -178,28 +180,18 @@ class SenateGovIE(InfoExtractor):
r'<iframe title="[^>"]*[^>"]*"\s[^>]*\bsrc="([^">]*)'), r'<iframe title="[^>"]*[^>"]*"\s[^>]*\bsrc="([^">]*)'),
webpage, 'hearing URL').replace('&amp;', '&') webpage, 'hearing URL').replace('&amp;', '&')
parse_info = parse_qs(iframe_src) parse_info = parse_qs(iframe_src)
comm = parse_info['comm'][-1] committee = parse_info['comm'][-1]
stream_num, stream_domain, stream_id, msl3 = _COMMITTEES[comm]
filename = parse_info['filename'][-1] filename = parse_info['filename'][-1]
urls_alternatives = [f'https://www-senate-gov-media-srs.akamaized.net/hls/live/{stream_id}/{comm}/{filename}/master.m3u8',
f'https://www-senate-gov-msl3archive.akamaized.net/{msl3}/{filename}_1/master.m3u8',
f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8',
f'{stream_domain}/i/{filename}.mp4/master.m3u8']
for video_url in urls_alternatives:
formats = self._extract_m3u8_formats(video_url, display_id, ext='mp4', fatal=False)
if formats:
break
title = self._html_search_regex( title = self._html_search_regex(
(*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title') (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title', fatal=False)
return { return {
'id': re.sub(r'.mp4$', '', filename), 'id': remove_end(filename, '.mp4'),
'display_id': display_id, 'display_id': display_id,
'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(),
'description': self._og_search_description(webpage, default=None), 'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None),
'age_limit': self._rta_search(webpage), 'age_limit': self._rta_search(webpage),
'formats': formats, 'formats': self._extract_formats(committee, filename, display_id),
} }