1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-01-18 23:03:05 +01:00

[snagfilms] Improve and simplify

This commit is contained in:
Sergey M․ 2015-06-27 18:20:42 +06:00
parent 533b99fbf9
commit 654fd03c73
2 changed files with 114 additions and 62 deletions

View file

@ -493,7 +493,10 @@ from .smotri import (
SmotriUserIE, SmotriUserIE,
SmotriBroadcastIE, SmotriBroadcastIE,
) )
from .snagfilms import SnagFilmsIE from .snagfilms import (
SnagFilmsIE,
SnagFilmsEmbedIE,
)
from .snotr import SnotrIE from .snotr import SnotrIE
from .sohu import SohuIE from .sohu import SohuIE
from .soompi import ( from .soompi import (

View file

@ -1,84 +1,133 @@
from re import match,DOTALL from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import js_to_json from ..utils import (
clean_html,
determine_ext,
int_or_none,
js_to_json,
parse_duration,
)
class SnagFilmsEmbedIE(InfoExtractor):
    """Extractor for SnagFilms embedded player pages (``/embed/player?filmId=...``)."""

    # The film id is a 36-character string of hex digits and dashes.
    _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
    _TESTS = [{
        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
        'md5': '2924e9215c6eff7a55ed35b72276bd93',
        'info_dict': {
            'id': '74849a00-85a9-11e1-9660-123139220831',
            'ext': 'mp4',
            'title': '#whilewewatch',
        }
    }, {
        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the embed page and build the format list from its ``sources`` array.

        Returns a dict with the keys ``id``, ``title`` and ``formats``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The player configuration is a JavaScript literal, so it is passed
        # through js_to_json before being parsed.
        sources = self._parse_json(js_to_json(self._search_regex(
            r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id)

        formats = []
        for source in sources:
            file_ = source.get('file')
            if not file_:
                # Nothing to download for this entry.
                continue
            type_ = source.get('type')
            format_id = source.get('label')
            ext = determine_ext(file_)
            if 'm3u8' in (type_, ext):
                formats.extend(self._extract_m3u8_formats(
                    file_, video_id, 'mp4', m3u8_id='hls'))
            else:
                bitrate = int_or_none(self._search_regex(
                    r'(\d+)kbps', file_, 'bitrate', default=None))
                # 'label' is optional; fall back to an empty string so the
                # regex search does not receive None and raise TypeError.
                height = int_or_none(self._search_regex(
                    r'^(\d+)[pP]$', format_id or '', 'height', default=None))
                formats.append({
                    'url': file_,
                    'format_id': format_id,
                    'tbr': bitrate,
                    'height': height,
                })
        self._sort_formats(formats)

        # Prefer the title from the player config, then the HTML <title>.
        title = self._search_regex(
            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
            webpage, 'title')

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }
class SnagFilmsIE(InfoExtractor):
    """Extractor for SnagFilms film pages (``/films/title/<display_id>``).

    Metadata is read from the film page; the media itself is delegated to the
    embed player URL through a ``url_transparent`` result.
    """

    _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/films/title/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
        'md5': '19844f897b35af219773fd63bdec2942',
        'info_dict': {
            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
            'display_id': 'lost_for_life',
            'ext': 'mp4',
            'title': 'Lost for Life',
            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 4489,
            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
        }
    }

    def _real_extract(self, url):
        """Collect film metadata from the page and hand off to the embed player URL.

        Returns a ``url_transparent`` dict pointing at the embed player for the
        film id found in the page, together with ``display_id``, ``title``,
        ``description``, ``thumbnail``, ``duration`` and ``categories``.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')

        snag = self._parse_json(
            self._search_regex(
                r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
            display_id)

        # Look for the page-data entry describing this film.
        for item in snag:
            if item.get('data', {}).get('film', {}).get('id') == film_id:
                data = item['data']['film']
                title = data['title']
                description = clean_html(data.get('synopsis'))
                thumbnail = data.get('image')
                duration = int_or_none(data.get('duration') or data.get('runtime'))
                categories = [
                    category['title'] for category in data.get('categories', [])
                    if category.get('title')]
                break
        else:
            # No matching entry in the page data: scrape the HTML instead.
            title = self._search_regex(
                r'itemprop="title">([^<]+)<', webpage, 'title')
            description = self._html_search_regex(
                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
                webpage, 'description', default=None) or self._og_search_description(webpage)
            thumbnail = self._og_search_thumbnail(webpage)
            duration = parse_duration(self._search_regex(
                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
                webpage, 'duration', fatal=False))
            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)

        return {
            '_type': 'url_transparent',
            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
            'id': film_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'categories': categories,
        }