2013-10-23 16:31:53 +02:00
# encoding: utf-8
2014-01-06 23:38:16 +01:00
from __future__ import unicode_literals
2013-06-23 20:18:21 +02:00
import json
import re
2013-07-29 13:12:09 +02:00
import itertools
2013-06-23 20:18:21 +02:00
from . common import InfoExtractor
from . . utils import (
compat_urllib_parse ,
compat_urllib_request ,
clean_html ,
get_element_by_attribute ,
ExtractorError ,
2013-10-23 14:38:03 +02:00
RegexNotFoundError ,
2013-06-23 20:18:21 +02:00
std_headers ,
2013-10-15 12:05:13 +02:00
unsmuggle_url ,
2013-06-23 20:18:21 +02:00
)
2013-12-22 03:17:56 +01:00
2013-06-23 20:18:21 +02:00
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs (vimeo.com, vimeopro.com and
    # player.vimeo.com); the numeric video id is captured as ``id``.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)?
        (?:(?:www|(?P<player>player))\.)?
        vimeo(?P<pro>pro)?\.com/
        (?:.*?/)?
        (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
        (?:videos?/)?
        (?P<id>[0-9]+)
        /?(?:[?&].*)?(?:[#].*)?$'''
    _NETRC_MACHINE = 'vimeo'
    IE_NAME = 'vimeo'
    _TESTS = [
        {
            'url': 'http://vimeo.com/56015672#at=0',
            'file': '56015672.mp4',
            'md5': '8879b6cc097e987f02484baf890129e5',
            'info_dict': {
                "upload_date": "20121220",
                "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
                "uploader_id": "user7108434",
                "uploader": "Filippo Valsorda",
                "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
            },
        },
        {
            'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
            'file': '68093876.mp4',
            'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
            'note': 'Vimeo Pro video (#1197)',
            'info_dict': {
                'uploader_id': 'openstreetmapus',
                'uploader': 'OpenStreetMap US',
                'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
            },
        },
        {
            'url': 'http://player.vimeo.com/video/54469442',
            'file': '54469442.mp4',
            'md5': '619b811a4417aa4abe78dc653becf511',
            'note': 'Videos that embed the url in the player page',
            'info_dict': {
                'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software',
                'uploader': 'The BLN & Business of Software',
                'uploader_id': 'theblnbusinessofsoftware',
            },
        },
        {
            'url': 'http://vimeo.com/68375962',
            'file': '68375962.mp4',
            'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
            'note': 'Video protected with password',
            'info_dict': {
                'title': 'youtube-dl password protected test video',
                'upload_date': '20130614',
                'uploader_id': 'user18948128',
                'uploader': 'Jaime Marquínez Ferrándiz',
            },
            'params': {
                'videopassword': 'youtube-dl',
            },
        },
    ]

    def _login(self):
        """Log in to vimeo.com if login credentials are available.

        Does nothing when ``_get_login_info()`` yields no username.
        """
        (username, password) = self._get_login_info()
        if username is None:
            return
        self.report_login()
        login_url = 'https://vimeo.com/log_in'
        webpage = self._download_webpage(login_url, None, False)
        # The login form must echo back the xsrft token found in the page,
        # both as a form field and as a cookie.
        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
        data = compat_urllib_parse.urlencode({
            'email': username,
            'password': password,
            'action': 'login',
            'service': 'vimeo',
            'token': token,
        })
        login_request = compat_urllib_request.Request(login_url, data)
        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        login_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(login_request, None, False, 'Wrong login info')

    def _verify_video_password(self, url, video_id, webpage):
        """Submit the video password for a password protected video.

        The password is read from the ``videopassword`` downloader param and
        posted, together with the xsrft token scraped from ``webpage``, to
        ``<url>/password``.

        Raises ExtractorError when no password was supplied.
        """
        password = self._downloader.params.get('videopassword', None)
        if password is None:
            raise ExtractorError('This video is protected by a password, use the --video-password option')
        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
        data = compat_urllib_parse.urlencode({
            'password': password,
            'token': token,
        })
        # I didn't manage to use the password with https
        if url.startswith('https'):
            # Only downgrade the scheme; leave any later "https" in the URL
            # untouched.
            pass_url = url.replace('https', 'http', 1)
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url + '/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(password_request, video_id,
                               'Verifying the password',
                               'Wrong password')

    def _real_initialize(self):
        """Perform the (optional) login before any extraction."""
        self._login()

    def _real_extract(self, url):
        """Extract the info dictionary for a single Vimeo video.

        ``url`` may carry smuggled data (see ``unsmuggle_url``); when present
        it is merged into a copy of ``std_headers`` for the page request.

        Returns a dict with id, title, uploader info, upload date, thumbnail,
        description, formats, webpage_url and view/like/comment counts.
        Raises ExtractorError for invalid URLs, embed restrictions, missing
        config or when no known codec is found.
        """
        url, data = unsmuggle_url(url)
        headers = std_headers
        if data is not None:
            # Copy first so the shared std_headers dict is not modified.
            headers = headers.copy()
            headers.update(data)

        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if mobj.group('pro') or mobj.group('player'):
            url = 'http://player.vimeo.com/video/' + video_id
        else:
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            try:
                config_url = self._html_search_regex(
                    r' data-config-url="(.+?)"', webpage, 'config URL')
                config_json = self._download_webpage(config_url, video_id)
                config = json.loads(config_json)
            except RegexNotFoundError:
                # For pro videos or player.vimeo.com urls
                # We try to find out to which variable is assigned the config dic
                m_variable_name = re.search(r'(\w)\.video\.id', webpage)
                if m_variable_name is not None:
                    config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
                else:
                    config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
                config = self._search_regex(config_re, webpage, 'info section',
                                            flags=re.DOTALL)
                config = json.loads(config)
        except Exception as e:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')

            if re.search('<form[^>]+?id="pw_form"', webpage) is not None:
                # NOTE(review): the retry below is not bounded and is issued
                # with the rewritten plain URL (no smuggled data) -- confirm
                # that a rejected password makes the request above raise.
                self._verify_video_password(url, video_id, webpage)
                return self._real_extract(url)
            else:
                raise ExtractorError('Unable to extract info section',
                                     cause=e)
        else:
            if config.get('view') == 4:
                self._verify_video_password(url, video_id, webpage)
                return self._real_extract(url)

        video_config = config["video"]

        # Extract title
        video_title = video_config["title"]

        # Extract uploader and uploader_id
        video_uploader = video_config["owner"]["name"]
        owner_url = video_config["owner"]["url"]
        video_uploader_id = owner_url.split('/')[-1] if owner_url else None

        # Extract video thumbnail
        video_thumbnail = video_config.get("thumbnail")
        if video_thumbnail is None:
            # Fall back to the widest entry of the "thumbs" mapping
            # (width -> url); leave the thumbnail unset if there is none.
            thumbs = video_config.get("thumbs")
            if thumbs:
                _, video_thumbnail = max(
                    (int(width), t_url) for (width, t_url) in thumbs.items())

        # Extract video description
        video_description = None
        try:
            video_description = get_element_by_attribute("itemprop", "description", webpage)
            if video_description:
                video_description = clean_html(video_description)
        except AssertionError as err:
            # On some pages like (http://player.vimeo.com/video/54469442) the
            # html tags are not closed, python 2.6 cannot handle it
            if err.args[0] == 'we should not get here!':
                pass
            else:
                raise

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        try:
            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
        except RegexNotFoundError:
            # This info is only available in vimeo.com/{id} urls
            view_count = None
            like_count = None
            comment_count = None

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')]
        files = {'hd': [], 'sd': [], 'other': []}
        # Either location may be absent; an empty mapping leads to the
        # "No known codec found" error below instead of an AttributeError.
        config_files = video_config.get("files") or config["request"].get("files") or {}
        for codec_name, codec_extension in codecs:
            codec_files = config_files.get(codec_name) or []
            for quality in codec_files:
                format_id = '-'.join((codec_name, quality)).lower()
                key = quality if quality in files else 'other'
                video_url = None
                if isinstance(codec_files, dict):
                    file_info = codec_files[quality]
                    video_url = file_info.get('url')
                else:
                    file_info = {}
                if video_url is None:
                    video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                        % (video_id, sig, timestamp, quality, codec_name.upper())

                files[key].append({
                    'ext': codec_extension,
                    'url': video_url,
                    'format_id': format_id,
                    'width': file_info.get('width'),
                    'height': file_info.get('height'),
                })

        # Formats are listed in the order 'other', 'sd', 'hd'.
        formats = []
        for key in ('other', 'sd', 'hd'):
            formats += files[key]
        if not formats:
            raise ExtractorError('No known codec found')

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'formats': formats,
            'webpage_url': url,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
        }
2013-07-29 13:12:09 +02:00
class VimeoChannelIE(InfoExtractor):
    """Playlist extractor for vimeo.com/channels/<id> pages.

    Also serves as the base class for the other paginated Vimeo list
    extractors, which override ``_VALID_URL``, ``_TITLE_RE``, ``_page_url``
    and/or ``_extract_list_title``.
    """
    IE_NAME = 'vimeo:channel'
    # The dot before "com" must be escaped: the previous ``vimeo.\com`` used
    # the invalid escape ``\c``, which the re module rejects on Python 3.7+.
    _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)'
    _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
    _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'

    def _page_url(self, base_url, pagenum):
        """Return the URL of page number ``pagenum`` of the list."""
        return '%s/videos/page:%d/' % (base_url, pagenum)

    def _extract_list_title(self, webpage):
        """Return the list title found in ``webpage`` via ``_TITLE_RE``."""
        return self._html_search_regex(self._TITLE_RE, webpage, 'list title')

    def _extract_videos(self, list_id, base_url):
        """Walk all pages of a list and build the playlist result.

        Pages are downloaded starting at 1 until a page no longer matches
        ``_MORE_PAGES_INDICATOR``. The title is taken from the last page
        downloaded.
        """
        video_ids = []
        for pagenum in itertools.count(1):
            webpage = self._download_webpage(
                self._page_url(base_url, pagenum), list_id,
                'Downloading page %s' % pagenum)
            video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                break

        entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
                   for video_id in video_ids]
        return {
            '_type': 'playlist',
            'id': list_id,
            'title': self._extract_list_title(webpage),
            'entries': entries,
        }

    def _real_extract(self, url):
        """Extract the playlist for the channel named in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
class VimeoUserIE(VimeoChannelIE):
    """Playlist extractor for the videos of a Vimeo user (vimeo.com/<name>)."""
    IE_NAME = 'vimeo:user'
    # The dot before "com" must be escaped: the previous ``vimeo.\com`` used
    # the invalid escape ``\c``, which the re module rejects on Python 3.7+.
    _VALID_URL = r'(?:https?://)?vimeo\.com/(?P<name>[^/]+)(?:/videos|[#?]|$)'
    _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'

    @classmethod
    def suitable(cls, url):
        """Return whether this extractor should handle ``url``.

        URLs accepted by the channel, single-video, album or group
        extractors are rejected here; anything else is decided by the
        inherited check.
        """
        if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url):
            return False
        return super(VimeoUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Extract the playlist of videos for the user named in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        name = mobj.group('name')
        return self._extract_videos(name, 'http://vimeo.com/%s' % name)
2013-12-06 21:47:32 +01:00
class VimeoAlbumIE(VimeoChannelIE):
    """Playlist extractor for vimeo.com/album/<id> pages."""
    IE_NAME = 'vimeo:album'
    # The dot before "com" must be escaped: the previous ``vimeo.\com`` used
    # the invalid escape ``\c``, which the re module rejects on Python 3.7+.
    _VALID_URL = r'(?:https?://)?vimeo\.com/album/(?P<id>\d+)'
    _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'

    def _page_url(self, base_url, pagenum):
        """Return the URL of page ``pagenum`` (no ``/videos`` path segment)."""
        return '%s/page:%d/' % (base_url, pagenum)

    def _real_extract(self, url):
        """Extract the playlist for the album id found in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        album_id = mobj.group('id')
        return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
2013-12-06 22:01:41 +01:00
class VimeoGroupsIE(VimeoAlbumIE):
    """Playlist extractor for vimeo.com/groups/<name> pages."""
    IE_NAME = 'vimeo:group'
    # The dot before "com" must be escaped: the previous ``vimeo.\com`` used
    # the invalid escape ``\c``, which the re module rejects on Python 3.7+.
    _VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)'

    def _extract_list_title(self, webpage):
        """Return the list title via ``_og_search_title`` instead of ``_TITLE_RE``."""
        return self._og_search_title(webpage)

    def _real_extract(self, url):
        """Extract the playlist for the group named in ``url``."""
        mobj = re.match(self._VALID_URL, url)
        name = mobj.group('name')
        return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
2014-01-06 17:31:47 +01:00
class VimeoReviewIE(InfoExtractor):
    """Extractor for Vimeo review pages; delegates to the 'Vimeo' extractor."""
    IE_NAME = 'vimeo:review'
    IE_DESC = 'Review pages on vimeo'
    # The dot before "com" must be escaped: the previous ``vimeo.\com`` used
    # the invalid escape ``\c``, which the re module rejects on Python 3.7+.
    _VALID_URL = r'(?:https?://)?vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
    _TEST = {
        'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
        'file': '75524534.mp4',
        'md5': 'c507a72f780cacc12b2248bb4006d253',
        'info_dict': {
            'title': "DICK HARDWICK 'Comedian'",
            'uploader': 'Richard Hardwick',
        }
    }

    def _real_extract(self, url):
        """Return a url_result pointing at the player page for the review's video id."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        player_url = 'https://player.vimeo.com/player/' + video_id
        return self.url_result(player_url, 'Vimeo', video_id)