1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-01-31 12:32:27 +01:00

rewrite code to use json api

This commit is contained in:
subsense 2025-01-13 13:59:02 +09:00
parent 75fbe5dc83
commit 9174bc2a81

View file

@ -1,123 +1,53 @@
import re
import secrets
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
str_or_none,
traverse_obj,
url_or_none,
)
class EggsBaseIE(InfoExtractor):
def _parse_artist_name(self, webpage):
artist = self._search_regex(
r'<div[^>]+class=(["\'])artist_name\1[^>]*>([^<]+)</div>',
webpage, 'artist name', fatal=False, default=None, group=2)
if artist:
return artist.strip()
# Base HTTP headers for Eggs API requests: _download_eggs_json copies this
# mapping and adds a per-request 'deviceId' before calling _download_json.
# NOTE(review): the values look like they imitate the Android app client
# (app version 8.2.00) - confirm against the service before changing them.
_API_HEADERS = {
'Accept': '*/*',
'apVersion': '8.2.00',
'deviceName': 'Android',
}
og_title = self._html_search_meta(['og:title'], webpage, 'og:title', default=None)
if og_title:
artist_match = re.search(r'(?P<artist>[^()]+)(?:\([^)]*\))?のEggsページ', og_title)
if artist_match:
return artist_match.group('artist').strip()
@staticmethod
def _generate_random_device_id():
    """Return a new random device id: 8 random bytes as 16 lowercase hex characters."""
    # `secrets` draws from the OS CSPRNG, so every call yields an unrelated id.
    return secrets.token_bytes(8).hex()
return 'Unknown Artist'
def _download_eggs_json(self, url, music_id):
    """Download *url* as JSON using the shared API headers plus a fresh device id.

    The class-level ``_API_HEADERS`` mapping is not modified; a new dict is
    built for each call with a newly generated ``deviceId`` added to it.
    ``music_id`` is forwarded to ``_download_json`` as its ``video_id``.
    Returns whatever ``_download_json`` returns.
    """
    request_headers = {
        **self._API_HEADERS,
        'deviceId': self._generate_random_device_id(),
    }
    return self._download_json(url, video_id=music_id, headers=request_headers)
def _parse_single_song(self, url, webpage, song_id, default_artist='Unknown Artist'):
track_title = self._search_regex(
r'<div[^>]+class=(["\'])product_name\1[^>]*>\s*<p>([^<]+)</p>',
webpage, 'track title', fatal=False, default=None, group=2)
def _extract_music_info(self, data, song_id):
music_info = traverse_obj(data, {
'id': ('musicId', {str_or_none}, {lambda x: x or song_id}),
'title': ('musicTitle', {str}, {lambda x: x or 'Unknown Title'}),
'url': ('musicDataPath', {url_or_none}),
'uploader': ('artist', 'displayName', {str}, {lambda x: x or 'Unknown Artist'}),
'thumbnail': ('imageDataPath', {url_or_none}),
'youtube_url': ('youtubeUrl', {url_or_none}),
'youtube_id': ('youtubeVideoId', {str_or_none}),
'source_type': ('sourceType', {int}),
'vcodec': (None, {lambda x: 'none'}),
}, get_all=False)
if not track_title:
page_title = self._search_regex(
r'<title>(?P<title>[^<]+)</title>',
webpage, 'page title', fatal=False, default=None, group='title')
if page_title:
inner_match = re.search(r'「(?P<inner>[^」]+)」', page_title)
if inner_match:
track_title = inner_match.group('inner').strip()
if not music_info.get('url') and not (music_info.get('source_type') == 2 and music_info.get('youtube_url')):
raise ExtractorError('Audio URL not found (possibly an unsupported sourceType)', expected=True)
if not track_title:
track_title = 'Unknown Title'
artist = default_artist
if not artist or artist == 'Unknown Artist':
artist_regex = r'<span[^>]+class=(["\'])artist_name\1[^>]*>\s*<a[^>]*>([^<]+)</a>'
fallback_artist = self._search_regex(
artist_regex, webpage, 'artist name',
fatal=False, default=None, group=2)
if fallback_artist:
artist = fallback_artist.strip()
audio_url = self._search_regex(
r'<div[^>]+class=(["\'])[^"\']*player[^"\']*\1[^>]+data-src=(["\'])(?P<audio_url>[^"\']+)\2',
webpage, 'audio url', fatal=True, group='audio_url')
audio_url = url_or_none(unescapeHTML(audio_url))
if not audio_url:
raise ExtractorError('Invalid audio URL.', expected=True)
thumbnail = (
self._html_search_meta(['og:image'], webpage, 'thumbnail', default=None)
or self._search_regex(
r'<span[^>]*>\s*<img[^>]+src=(["\'])(?P<thumb>[^"\']+)\1',
webpage, 'thumbnail', fatal=False, default=None, group='thumb')
)
return {
'id': song_id,
'url': audio_url,
'title': track_title,
'uploader': artist,
'vcodec': 'none',
'thumbnail': thumbnail,
}
def _parse_artist_page(self, webpage, artist_id, artist_name):
    """Build a list of track entries from the ``<li id="songsN">`` items of an artist page.

    An item is kept only if it carries a ``data-src`` http(s) URL containing
    a ``.mp3``/``.m4a`` extension (and accepted by ``url_or_none`` after HTML
    unescaping) as well as a ``data-srcid``; items missing either are
    skipped. The title is taken from ``data-srcname`` and falls back to
    ``'Unknown Title'``; the thumbnail is taken from an ``<img src>`` inside
    the item and may be ``None``. ``artist_name`` becomes the uploader of
    every entry. ``artist_id`` is accepted but not used here.
    """
    def find_in(fragment, pattern, label, group):
        # Non-fatal lookup with default=None, so a miss is reported as None.
        return self._search_regex(
            pattern, fragment, label, fatal=False, default=None, group=group)

    tracks = []
    for item_html in re.findall(r'(?s)<li[^>]+id="songs\d+"[^>]*>.*?</li>', webpage):
        stream_url = url_or_none(unescapeHTML(find_in(
            item_html, r'data-src=(["\'])(?P<url>https?://.*?\.(?:mp3|m4a).*?)\1',
            'audio url', 'url')))
        if not stream_url:
            continue

        song_id = find_in(
            item_html, r'data-srcid=(["\'])(?P<id>[^"\'<>]+)\1', 'track id', 'id')
        if not song_id:
            continue

        song_title = find_in(
            item_html, r'data-srcname=(["\'])(?P<title>[^"\']+)\1', 'track title', 'title')
        cover = find_in(
            item_html, r'<img[^>]+src=(["\'])(?P<th>[^"\']+)\1', 'thumbnail', 'th')

        tracks.append({
            'id': song_id,
            'url': stream_url,
            'title': song_title or 'Unknown Title',
            'uploader': artist_name,
            'vcodec': 'none',
            'thumbnail': cover,
        })
    return tracks
return music_info
class EggsIE(EggsBaseIE):
IE_NAME = 'eggs:single'
_VALID_URL = (
r'https?://(?:www\.)?eggs\.mu/artist/[^/]+/song/(?P<song_id>[^/]+)'
)
_VALID_URL = r'https?://eggs\.mu/artist/[^/]+/song/(?P<song_id>[^/]+)'
_TESTS = [{
'url': 'https://eggs.mu/artist/32_sunny_girl/song/0e95fd1d-4d61-4d5b-8b18-6092c551da90',
'info_dict': {
@ -125,39 +55,105 @@ class EggsIE(EggsBaseIE):
'ext': 'm4a',
'title': 'シネマと信号',
'uploader': 'Sunny Girl',
'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$',
'source_type': 1,
'thumbnail': r're:https?://.*\.jpg(?:\?.*)?$',
},
}, {
'url': 'https://eggs.mu/artist/KAMO_3pband/song/1d4bc45f-1af6-47a9-8b30-a70cae350b4f',
'info_dict': {
'id': '80cLKA2wnoA',
'ext': 'mp4',
'title': 'KAMO「いい女だから」Audio',
'uploader': 'KAMO',
'live_status': 'not_live',
'channel_id': 'UCsHLBw2__5Q9y55skXPotOg',
'channel_follower_count': int,
'description': 'md5:d260da711ecbec3e720293dc11401b87',
'availability': 'public',
'uploader_id': '@KAMO_band',
'upload_date': '20240925',
'thumbnail': 'https://i.ytimg.com/vi/80cLKA2wnoA/maxresdefault.jpg',
'comment_count': int,
'channel_url': 'https://www.youtube.com/channel/UCsHLBw2__5Q9y55skXPotOg',
'view_count': int,
'duration': 151,
'like_count': int,
'channel': 'KAMO',
'playable_in_embed': True,
'uploader_url': 'https://www.youtube.com/@KAMO_band',
'tags': [],
'timestamp': 1727271121,
'age_limit': 0,
'categories': ['People & Blogs'],
},
'add_ie': ['Youtube'],
'params': {'skip_download': 'Youtube'},
}]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
song_id = mobj.group('song_id')
webpage = self._download_webpage(url, song_id)
artist_name = self._parse_artist_name(webpage)
return self._parse_single_song(url, webpage, song_id, artist_name)
song_id = self._match_valid_url(url).group('song_id')
json_data = self._download_eggs_json(
f'https://app-front-api.eggs.mu/v1/musics/{song_id}', music_id=song_id)
music_info = self._extract_music_info(json_data, song_id)
if music_info['source_type'] == 2 and music_info['youtube_url']:
return self.url_result(
music_info['youtube_url'], ie='Youtube', video_id=music_info['youtube_id'])
return music_info
class EggsArtistIE(EggsBaseIE):
IE_NAME = 'eggs:artist'
_VALID_URL = (
r'https?://(?:www\.)?eggs\.mu/artist/(?P<artist_id>[^/]+)$'
)
_TESTS = [{
'url': 'https://eggs.mu/artist/32_sunny_girl',
'info_dict': {
'id': '32_sunny_girl',
'title': 'Sunny Girl',
_VALID_URL = r'https?://eggs\.mu/artist/(?P<artist_id>[^/]+)$'
_TESTS = [
{
'url': 'https://eggs.mu/artist/32_sunny_girl',
'info_dict': {
'id': '32_sunny_girl',
'title': 'Sunny Girl',
},
'playlist_mincount': 18,
},
'playlist_count': 18,
}]
{
'url': 'https://eggs.mu/artist/KAMO_3pband',
'info_dict': {
'id': 'KAMO_3pband',
'title': 'KAMO',
},
'playlist_mincount': 2,
},
]
def _real_extract(self, url):
artist_id = self._match_valid_url(url).group('artist_id')
webpage = self._download_webpage(url, artist_id)
artist_name = self._parse_artist_name(webpage)
entries = self._parse_artist_page(webpage, artist_id, artist_name)
json_data = self._download_eggs_json(
f'https://app-front-api.eggs.mu/v1/artists/{artist_id}/musics', music_id=artist_id)
items = traverse_obj(json_data, 'data', default=[])
entries = []
display_name = None
for item in items:
music_info = self._extract_music_info(item, '')
if not music_info['id']:
continue
if not display_name:
display_name = music_info['uploader']
if music_info['source_type'] == 2 and music_info['youtube_url']:
entries.append(
self.url_result(
music_info['youtube_url'], ie='Youtube', video_id=music_info['youtube_id']))
continue
if not music_info.get('url'):
continue
entries.append(music_info)
return self.playlist_result(
entries,
playlist_id=artist_id,
playlist_title=artist_name,
)
playlist_title=display_name or artist_id)