2013-07-16 20:50:16 +02:00
import re
2024-06-12 01:09:58 +02:00
import urllib . parse
2013-07-16 20:50:16 +02:00
from . common import InfoExtractor
2014-12-13 12:24:42 +01:00
from . . utils import (
2016-08-04 18:28:49 +01:00
determine_ext ,
2017-03-22 23:22:14 +07:00
extract_attributes ,
2016-08-04 18:28:49 +01:00
int_or_none ,
2017-03-22 23:22:14 +07:00
js_to_json ,
mimetype2ext ,
orderedSet ,
2016-08-04 18:28:49 +01:00
parse_iso8601 ,
2020-11-20 00:52:59 +05:30
strip_or_none ,
try_get ,
2024-11-04 01:33:21 +01:00
urljoin ,
2014-12-13 12:24:42 +01:00
)
2013-07-16 20:50:16 +02:00
class CondeNastIE(InfoExtractor):
    """
    Condé Nast is a media group, some of its sites use a custom HTML5 player
    that works the same in all of them.
    """

    # The keys are the supported sites and the values are the name to be shown
    # to the user and in the extractor description.
    _SITES = {
        'allure': 'Allure',
        'architecturaldigest': 'Architectural Digest',
        'arstechnica': 'Ars Technica',
        'bonappetit': 'Bon Appétit',
        'brides': 'Brides',
        'cnevids': 'Condé Nast',
        'cntraveler': 'Condé Nast Traveler',
        'details': 'Details',
        'epicurious': 'Epicurious',
        'glamour': 'Glamour',
        'golfdigest': 'Golf Digest',
        'gq': 'GQ',
        'newyorker': 'The New Yorker',
        'self': 'SELF',
        'teenvogue': 'Teen Vogue',
        'vanityfair': 'Vanity Fair',
        'vogue': 'Vogue',
        'wired': 'WIRED',
        'wmagazine': 'W Magazine',
    }

    # Two URL shapes are accepted:
    #   * player/embed URLs, which carry the 24-hex-digit video id (and
    #     optionally a player id and a ``target`` query parameter);
    #   * watch/series/video pages, which only carry a display id.
    # Literal braces are doubled because the pattern goes through str.format().
    _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:{})\.com/
        (?:
            (?:
                embed(?:js)?|
                (?:script|inline)/video
            )/(?P<id>[0-9a-f]{{24}})(?:/(?P<player_id>[0-9a-f]{{24}}))?(?:.+?\btarget=(?P<target>[^&]+))?|
            (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
        )'''.format('|'.join(_SITES))
    IE_DESC = 'Condé Nast media group: {}'.format(', '.join(sorted(_SITES.values())))

    _EMBED_REGEX = [r'''(?x)
        <(?:iframe|script)[^>]+?src=(["\'])(?P<url>
            (?:https?:)?//player(?:-backend)?\.(?:{})\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
        )\1'''.format('|'.join(_SITES))]

    _TESTS = [{
        'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
        'md5': '1921f713ed48aabd715691f774c451f7',
        'info_dict': {
            'id': '5171b343c2b4c00dd0c1ccb3',
            'ext': 'mp4',
            'title': '3D Printed Speakers Lit With LED',
            'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
            'uploader': 'wired',
            'upload_date': '20130314',
            'timestamp': 1363219200,
        },
    }, {
        'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
        'info_dict': {
            'id': '58d1865bfd2e6126e2000015',
            'ext': 'mp4',
            'title': 'The Only True Surprise? Trump’s an Idiot',
            'uploader': 'gq',
            'upload_date': '20170321',
            'timestamp': 1490126427,
            'description': 'How much grimmer would things be if these people were competent?',
        },
    }, {
        # JS embed
        'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
        'md5': 'f1a6f9cafb7083bab74a710f65d08999',
        'info_dict': {
            'id': '55f9cf8b61646d1acf00000c',
            'ext': 'mp4',
            'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
            'uploader': 'arstechnica',
            'upload_date': '20150916',
            'timestamp': 1442434920,
        },
    }, {
        'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
        'only_matching': True,
    }, {
        'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js',
        'only_matching': True,
    }]

    def _extract_series(self, url, webpage):
        """Build a playlist result from the /watch/ links found on a series page.

        The playlist title is taken from the ``cne-series-info`` header and each
        distinct ``/watch/...`` path (first-seen order) becomes one entry that is
        delegated back to this extractor.
        """
        title = self._html_search_regex(
            r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>',
            webpage, 'series title')
        url_object = urllib.parse.urlparse(url)
        base_url = f'{url_object.scheme}://{url_object.netloc}'
        m_paths = re.finditer(
            r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
        # The same path can be matched more than once; keep the first occurrence only.
        paths = orderedSet(m.group(1) for m in m_paths)
        entries = [self.url_result(urljoin(base_url, path), 'CondeNast') for path in paths]
        return self.playlist_result(entries, playlist_title=title)

    def _extract_video_params(self, webpage, display_id):
        """Collect the player query parameters from a watch page.

        Tries the inline ``var params = {...}`` object first and adds the video
        id found in the markup; when that object is missing, empty or cannot be
        parsed, falls back to the attributes of the ``data-js="video-player"``
        element. Returns a dict that contains at least the ``videoId`` key.
        """
        # _parse_json(fatal=False) is treated as possibly returning None; normalise
        # to a dict so the fallback branch below can always call .update() on it.
        query = self._parse_json(
            self._search_regex(
                r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params',
                default='{}'),
            display_id, transform_source=js_to_json, fatal=False) or {}
        if query:
            query['videoId'] = self._search_regex(
                r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)',
                webpage, 'video id', default=None)
        else:
            params = extract_attributes(self._search_regex(
                r'(<[^>]+data-js="video-player"[^>]+>)',
                webpage, 'player params element'))
            query.update({
                'videoId': params['data-video'],
                'playerId': params['data-player'],
                'target': params['id'],
            })
        return query

    def _extract_video(self, params):
        """Fetch the video metadata for ``params['videoId']`` and build the info dict.

        ``params`` must contain ``videoId``; ``playerId`` and ``target`` are
        optional and may be present with a ``None`` value.
        """
        video_id = params['videoId']

        # New API path
        query = params.copy()
        query['embedType'] = 'inline'
        info_page = self._download_json(
            'http://player.cnevids.com/embed-api.json', video_id,
            'Downloading embed info', fatal=False, query=query)

        # Old fallbacks
        if not info_page and params.get('playerId'):
            info_page = self._download_json(
                'http://player.cnevids.com/player/video.js', video_id,
                'Downloading video info', fatal=False, query=params)

        video_info = info_page.get('video') if info_page else None

        if not video_info:
            # NOTE(review): the loader.js response is always replaced by the
            # inline download right below, so only a failure of this request
            # can affect the outcome. Kept to preserve the request sequence;
            # confirm whether it can be dropped.
            info_page = self._download_webpage(
                'http://player.cnevids.com/player/loader.js',
                video_id, 'Downloading loader info', query=params)
            info_page = self._download_webpage(
                f'https://player.cnevids.com/inline/video/{video_id}.js',
                video_id, 'Downloading inline info', query={
                    # `or` instead of a .get() default: _real_extract always
                    # passes the 'target' key, with None when the URL has no
                    # target, and a .get() default would not apply then.
                    'target': params.get('target') or 'embedplayer',
                })
            video_info = self._parse_json(
                self._search_regex(
                    r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
                video_id, transform_source=js_to_json)['video']

        title = video_info['title']

        formats = []
        for fdata in video_info['sources']:
            src = fdata.get('src')
            if not src:
                continue
            ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue
            quality = fdata.get('quality')
            formats.append({
                'format_id': ext + (f'-{quality}' if quality else ''),
                'url': src,
                'ext': ext,
                'quality': 1 if quality == 'high' else 0,
            })

        subtitles = {}
        # `or {}` also covers a 'captions' key that is present but null.
        for t, caption in (video_info.get('captions') or {}).items():
            caption_url = caption.get('src')
            if not (t in ('vtt', 'srt', 'tml') and caption_url):
                continue
            subtitles.setdefault('en', []).append({'url': caption_url})

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'thumbnail': video_info.get('poster_frame'),
            'uploader': video_info.get('brand'),
            'duration': int_or_none(video_info.get('duration')),
            'tags': video_info.get('tags'),
            'series': video_info.get('series_title'),
            'season': video_info.get('season_title'),
            'timestamp': parse_iso8601(video_info.get('premiere_date')),
            'categories': video_info.get('categories'),
            'subtitles': subtitles,
        }

    def _real_extract(self, url):
        """Dispatch on the URL shape: direct embed, series page or watch page."""
        # Select groups by name so the unpacking does not depend on the order
        # in which the groups happen to appear inside _VALID_URL.
        video_id, player_id, target, url_type, display_id = self._match_valid_url(url).group(
            'id', 'player_id', 'target', 'type', 'display_id')

        if video_id:
            # Embed/player URL: everything needed is already in the URL.
            return self._extract_video({
                'videoId': video_id,
                'playerId': player_id,
                'target': target,
            })

        webpage = self._download_webpage(url, display_id)

        if url_type == 'series':
            return self._extract_series(url, webpage)

        video = try_get(self._parse_json(self._search_regex(
            r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
            'preload state', '{}'), display_id),
            lambda x: x['transformed']['video'])
        if video:
            params = {'videoId': video['id']}
            info = {'description': strip_or_none(video.get('description'))}
        else:
            params = self._extract_video_params(webpage, display_id)
            info = self._search_json_ld(
                webpage, display_id, fatal=False)
        # Values from the player API take precedence over the page metadata.
        info.update(self._extract_video(params))
        return info