1
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-01-19 07:13:05 +01:00

Merge branch 'master' into extractor-cablecasttv

This commit is contained in:
Alejandro Garcia 2024-05-19 15:18:57 -04:00 committed by GitHub
commit c61f86c641
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
57 changed files with 1879 additions and 1425 deletions

View file

@ -12,6 +12,9 @@ on:
unix: unix:
default: true default: true
type: boolean type: boolean
linux_static:
default: true
type: boolean
linux_arm: linux_arm:
default: true default: true
type: boolean type: boolean
@ -27,9 +30,6 @@ on:
windows32: windows32:
default: true default: true
type: boolean type: boolean
meta_files:
default: true
type: boolean
origin: origin:
required: false required: false
default: '' default: ''
@ -52,7 +52,11 @@ on:
default: stable default: stable
type: string type: string
unix: unix:
description: yt-dlp, yt-dlp.tar.gz, yt-dlp_linux, yt-dlp_linux.zip description: yt-dlp, yt-dlp.tar.gz
default: true
type: boolean
linux_static:
description: yt-dlp_linux
default: true default: true
type: boolean type: boolean
linux_arm: linux_arm:
@ -75,10 +79,6 @@ on:
description: yt-dlp_x86.exe description: yt-dlp_x86.exe
default: true default: true
type: boolean type: boolean
meta_files:
description: SHA2-256SUMS, SHA2-512SUMS, _update_spec
default: true
type: boolean
origin: origin:
description: Origin description: Origin
required: false required: false
@ -112,27 +112,9 @@ jobs:
- uses: actions/setup-python@v5 - uses: actions/setup-python@v5
with: with:
python-version: "3.10" python-version: "3.10"
- uses: conda-incubator/setup-miniconda@v3
with:
miniforge-variant: Mambaforge
use-mamba: true
channels: conda-forge
auto-update-conda: true
activate-environment: ""
auto-activate-base: false
- name: Install Requirements - name: Install Requirements
run: | run: |
sudo apt -y install zip pandoc man sed sudo apt -y install zip pandoc man sed
cat > ./requirements.txt << EOF
python=3.10.*
pyinstaller
brotli-python
EOF
python devscripts/install_deps.py --print \
--exclude brotli --exclude brotlicffi \
--include secretstorage >> ./requirements.txt
mamba create -n build --file ./requirements.txt
- name: Prepare - name: Prepare
run: | run: |
python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}"
@ -141,30 +123,15 @@ jobs:
- name: Build Unix platform-independent binary - name: Build Unix platform-independent binary
run: | run: |
make all tar make all tar
- name: Build Unix standalone binary
shell: bash -l {0}
run: |
unset LD_LIBRARY_PATH # Harmful; set by setup-python
conda activate build
python -m bundle.pyinstaller --onedir
(cd ./dist/yt-dlp_linux && zip -r ../yt-dlp_linux.zip .)
python -m bundle.pyinstaller
mv ./dist/yt-dlp_linux ./yt-dlp_linux
mv ./dist/yt-dlp_linux.zip ./yt-dlp_linux.zip
- name: Verify --update-to - name: Verify --update-to
if: vars.UPDATE_TO_VERIFICATION if: vars.UPDATE_TO_VERIFICATION
run: | run: |
binaries=("yt-dlp" "yt-dlp_linux") chmod +x ./yt-dlp
for binary in "${binaries[@]}"; do cp ./yt-dlp ./yt-dlp_downgraded
chmod +x ./${binary} version="$(./yt-dlp --version)"
cp ./${binary} ./${binary}_downgraded ./yt-dlp_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04
version="$(./${binary} --version)" downgraded_version="$(./yt-dlp_downgraded --version)"
./${binary}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 [[ "$version" != "$downgraded_version" ]]
downgraded_version="$(./${binary}_downgraded --version)"
[[ "$version" != "$downgraded_version" ]]
done
- name: Upload artifacts - name: Upload artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
@ -172,8 +139,39 @@ jobs:
path: | path: |
yt-dlp yt-dlp
yt-dlp.tar.gz yt-dlp.tar.gz
yt-dlp_linux compression-level: 0
yt-dlp_linux.zip
linux_static:
needs: process
if: inputs.linux_static
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build static executable
env:
channel: ${{ inputs.channel }}
origin: ${{ needs.process.outputs.origin }}
version: ${{ inputs.version }}
run: |
mkdir ~/build
cd bundle/docker
docker compose up --build static
sudo chown "${USER}:docker" ~/build/yt-dlp_linux
- name: Verify --update-to
if: vars.UPDATE_TO_VERIFICATION
run: |
chmod +x ~/build/yt-dlp_linux
cp ~/build/yt-dlp_linux ~/build/yt-dlp_linux_downgraded
version="$(~/build/yt-dlp_linux --version)"
~/build/yt-dlp_linux_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04
downgraded_version="$(~/build/yt-dlp_linux_downgraded --version)"
[[ "$version" != "$downgraded_version" ]]
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: build-bin-${{ github.job }}
path: |
~/build/yt-dlp_linux
compression-level: 0 compression-level: 0
linux_arm: linux_arm:
@ -300,7 +298,7 @@ jobs:
macos_legacy: macos_legacy:
needs: process needs: process
if: inputs.macos_legacy if: inputs.macos_legacy
runs-on: macos-latest runs-on: macos-12
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@ -447,10 +445,11 @@ jobs:
compression-level: 0 compression-level: 0
meta_files: meta_files:
if: inputs.meta_files && always() && !cancelled() if: always() && !cancelled()
needs: needs:
- process - process
- unix - unix
- linux_static
- linux_arm - linux_arm
- macos - macos
- macos_legacy - macos_legacy

View file

@ -53,7 +53,7 @@ jobs:
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install test requirements - name: Install test requirements
run: python3 ./devscripts/install_deps.py --include dev --include curl_cffi run: python3 ./devscripts/install_deps.py --include dev --include curl-cffi
- name: Run tests - name: Run tests
continue-on-error: False continue-on-error: False
run: | run: |

View file

@ -666,7 +666,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git
The name of the browser to load cookies The name of the browser to load cookies
from. Currently supported browsers are: from. Currently supported browsers are:
brave, chrome, chromium, edge, firefox, brave, chrome, chromium, edge, firefox,
opera, safari, vivaldi. Optionally, the opera, safari, vivaldi, whale. Optionally, the
KEYRING used for decrypting Chromium cookies KEYRING used for decrypting Chromium cookies
on Linux, the name/path of the PROFILE to on Linux, the name/path of the PROFILE to
load cookies from, and the CONTAINER name load cookies from, and the CONTAINER name
@ -1760,7 +1760,7 @@ The following extractors use this feature:
#### youtube #### youtube
* `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes
* `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients.
* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
* `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
@ -1813,8 +1813,8 @@ The following extractors use this feature:
* `app_name`: Default app name to use with mobile API calls, e.g. `trill` * `app_name`: Default app name to use with mobile API calls, e.g. `trill`
* `app_version`: Default app version to use with mobile API calls - should be set along with `manifest_app_version`, e.g. `34.1.2` * `app_version`: Default app version to use with mobile API calls - should be set along with `manifest_app_version`, e.g. `34.1.2`
* `manifest_app_version`: Default numeric app version to use with mobile API calls, e.g. `2023401020` * `manifest_app_version`: Default numeric app version to use with mobile API calls, e.g. `2023401020`
* `aid`: Default app ID to use with API calls, e.g. `1180` * `aid`: Default app ID to use with mobile API calls, e.g. `1180`
* `app_info`: One or more app info strings in the format of `<iid>/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` * `app_info`: Enable mobile API extraction with one or more app info strings in the format of `<iid>/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001`
#### rokfinchannel #### rokfinchannel
* `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks`

10
bundle/docker/compose.yml Normal file
View file

@ -0,0 +1,10 @@
services:
static:
build: static
environment:
channel: ${channel}
origin: ${origin}
version: ${version}
volumes:
- ~/build:/build
- ../..:/yt-dlp

View file

@ -0,0 +1,21 @@
FROM alpine:3.19 as base
RUN apk --update add --no-cache \
build-base \
python3 \
pipx \
;
RUN pipx install pyinstaller
# Requires above step to prepare the shared venv
RUN ~/.local/share/pipx/shared/bin/python -m pip install -U wheel
RUN apk --update add --no-cache \
scons \
patchelf \
binutils \
;
RUN pipx install staticx
WORKDIR /yt-dlp
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT /entrypoint.sh

View file

@ -0,0 +1,13 @@
#!/bin/ash
set -e
source ~/.local/share/pipx/venvs/pyinstaller/bin/activate
python -m devscripts.install_deps --include secretstorage
python -m devscripts.make_lazy_extractors
python devscripts/update-version.py -c "${channel}" -r "${origin}" "${version}"
python -m bundle.pyinstaller
deactivate
source ~/.local/share/pipx/venvs/staticx/bin/activate
staticx /yt-dlp/dist/yt-dlp_linux /build/yt-dlp_linux
deactivate

View file

@ -1,4 +1,3 @@
import functools
import inspect import inspect
import pytest import pytest
@ -10,7 +9,9 @@ from yt_dlp.utils._utils import _YDLLogger as FakeLogger
@pytest.fixture @pytest.fixture
def handler(request): def handler(request):
RH_KEY = request.param RH_KEY = getattr(request, 'param', None)
if not RH_KEY:
return
if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler): if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler):
handler = RH_KEY handler = RH_KEY
elif RH_KEY in _REQUEST_HANDLERS: elif RH_KEY in _REQUEST_HANDLERS:
@ -18,9 +19,46 @@ def handler(request):
else: else:
pytest.skip(f'{RH_KEY} request handler is not available') pytest.skip(f'{RH_KEY} request handler is not available')
return functools.partial(handler, logger=FakeLogger) class HandlerWrapper(handler):
RH_KEY = handler.RH_KEY
def __init__(self, *args, **kwargs):
super().__init__(logger=FakeLogger, *args, **kwargs)
return HandlerWrapper
def validate_and_send(rh, req): @pytest.fixture(autouse=True)
rh.validate(req) def skip_handler(request, handler):
return rh.send(req) """usage: pytest.mark.skip_handler('my_handler', 'reason')"""
for marker in request.node.iter_markers('skip_handler'):
if marker.args[0] == handler.RH_KEY:
pytest.skip(marker.args[1] if len(marker.args) > 1 else '')
@pytest.fixture(autouse=True)
def skip_handler_if(request, handler):
"""usage: pytest.mark.skip_handler_if('my_handler', lambda request: True, 'reason')"""
for marker in request.node.iter_markers('skip_handler_if'):
if marker.args[0] == handler.RH_KEY and marker.args[1](request):
pytest.skip(marker.args[2] if len(marker.args) > 2 else '')
@pytest.fixture(autouse=True)
def skip_handlers_if(request, handler):
"""usage: pytest.mark.skip_handlers_if(lambda request, handler: True, 'reason')"""
for marker in request.node.iter_markers('skip_handlers_if'):
if handler and marker.args[0](request, handler):
pytest.skip(marker.args[1] if len(marker.args) > 1 else '')
def pytest_configure(config):
config.addinivalue_line(
"markers", "skip_handler(handler): skip test for the given handler",
)
config.addinivalue_line(
"markers", "skip_handler_if(handler): skip test for the given handler if condition is true"
)
config.addinivalue_line(
"markers", "skip_handlers_if(handler): skip test for handlers when the condition is true"
)

View file

@ -338,3 +338,8 @@ def http_server_port(httpd):
def verify_address_availability(address): def verify_address_availability(address):
if find_available_port(address) is None: if find_available_port(address) is None:
pytest.skip(f'Unable to bind to source address {address} (address may not exist)') pytest.skip(f'Unable to bind to source address {address} (address may not exist)')
def validate_and_send(rh, req):
rh.validate(req)
return rh.send(req)

379
test/test_http_proxy.py Normal file
View file

@ -0,0 +1,379 @@
import abc
import base64
import contextlib
import functools
import json
import os
import random
import ssl
import threading
from http.server import BaseHTTPRequestHandler
from socketserver import ThreadingTCPServer
import pytest
from test.helper import http_server_port, verify_address_availability
from test.test_networking import TEST_DIR
from test.test_socks import IPv6ThreadingTCPServer
from yt_dlp.dependencies import urllib3
from yt_dlp.networking import Request
from yt_dlp.networking.exceptions import HTTPError, ProxyError, SSLError
class HTTPProxyAuthMixin:
def proxy_auth_error(self):
self.send_response(407)
self.send_header('Proxy-Authenticate', 'Basic realm="test http proxy"')
self.end_headers()
return False
def do_proxy_auth(self, username, password):
if username is None and password is None:
return True
proxy_auth_header = self.headers.get('Proxy-Authorization', None)
if proxy_auth_header is None:
return self.proxy_auth_error()
if not proxy_auth_header.startswith('Basic '):
return self.proxy_auth_error()
auth = proxy_auth_header[6:]
try:
auth_username, auth_password = base64.b64decode(auth).decode().split(':', 1)
except Exception:
return self.proxy_auth_error()
if auth_username != (username or '') or auth_password != (password or ''):
return self.proxy_auth_error()
return True
class HTTPProxyHandler(BaseHTTPRequestHandler, HTTPProxyAuthMixin):
def __init__(self, *args, proxy_info=None, username=None, password=None, request_handler=None, **kwargs):
self.username = username
self.password = password
self.proxy_info = proxy_info
super().__init__(*args, **kwargs)
def do_GET(self):
if not self.do_proxy_auth(self.username, self.password):
self.server.close_request(self.request)
return
if self.path.endswith('/proxy_info'):
payload = json.dumps(self.proxy_info or {
'client_address': self.client_address,
'connect': False,
'connect_host': None,
'connect_port': None,
'headers': dict(self.headers),
'path': self.path,
'proxy': ':'.join(str(y) for y in self.connection.getsockname()),
})
self.send_response(200)
self.send_header('Content-Type', 'application/json; charset=utf-8')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload.encode())
else:
self.send_response(404)
self.end_headers()
self.server.close_request(self.request)
if urllib3:
import urllib3.util.ssltransport
class SSLTransport(urllib3.util.ssltransport.SSLTransport):
"""
Modified version of urllib3 SSLTransport to support server side SSL
This allows us to chain multiple TLS connections.
"""
def __init__(self, socket, ssl_context, server_hostname=None, suppress_ragged_eofs=True, server_side=False):
self.incoming = ssl.MemoryBIO()
self.outgoing = ssl.MemoryBIO()
self.suppress_ragged_eofs = suppress_ragged_eofs
self.socket = socket
self.sslobj = ssl_context.wrap_bio(
self.incoming,
self.outgoing,
server_hostname=server_hostname,
server_side=server_side
)
self._ssl_io_loop(self.sslobj.do_handshake)
@property
def _io_refs(self):
return self.socket._io_refs
@_io_refs.setter
def _io_refs(self, value):
self.socket._io_refs = value
def shutdown(self, *args, **kwargs):
self.socket.shutdown(*args, **kwargs)
else:
SSLTransport = None
class HTTPSProxyHandler(HTTPProxyHandler):
def __init__(self, request, *args, **kwargs):
certfn = os.path.join(TEST_DIR, 'testcert.pem')
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.load_cert_chain(certfn, None)
if isinstance(request, ssl.SSLSocket):
request = SSLTransport(request, ssl_context=sslctx, server_side=True)
else:
request = sslctx.wrap_socket(request, server_side=True)
super().__init__(request, *args, **kwargs)
class HTTPConnectProxyHandler(BaseHTTPRequestHandler, HTTPProxyAuthMixin):
protocol_version = 'HTTP/1.1'
default_request_version = 'HTTP/1.1'
def __init__(self, *args, username=None, password=None, request_handler=None, **kwargs):
self.username = username
self.password = password
self.request_handler = request_handler
super().__init__(*args, **kwargs)
def do_CONNECT(self):
if not self.do_proxy_auth(self.username, self.password):
self.server.close_request(self.request)
return
self.send_response(200)
self.end_headers()
proxy_info = {
'client_address': self.client_address,
'connect': True,
'connect_host': self.path.split(':')[0],
'connect_port': int(self.path.split(':')[1]),
'headers': dict(self.headers),
'path': self.path,
'proxy': ':'.join(str(y) for y in self.connection.getsockname()),
}
self.request_handler(self.request, self.client_address, self.server, proxy_info=proxy_info)
self.server.close_request(self.request)
class HTTPSConnectProxyHandler(HTTPConnectProxyHandler):
def __init__(self, request, *args, **kwargs):
certfn = os.path.join(TEST_DIR, 'testcert.pem')
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.load_cert_chain(certfn, None)
request = sslctx.wrap_socket(request, server_side=True)
self._original_request = request
super().__init__(request, *args, **kwargs)
def do_CONNECT(self):
super().do_CONNECT()
self.server.close_request(self._original_request)
@contextlib.contextmanager
def proxy_server(proxy_server_class, request_handler, bind_ip=None, **proxy_server_kwargs):
server = server_thread = None
try:
bind_address = bind_ip or '127.0.0.1'
server_type = ThreadingTCPServer if '.' in bind_address else IPv6ThreadingTCPServer
server = server_type(
(bind_address, 0), functools.partial(proxy_server_class, request_handler=request_handler, **proxy_server_kwargs))
server_port = http_server_port(server)
server_thread = threading.Thread(target=server.serve_forever)
server_thread.daemon = True
server_thread.start()
if '.' not in bind_address:
yield f'[{bind_address}]:{server_port}'
else:
yield f'{bind_address}:{server_port}'
finally:
server.shutdown()
server.server_close()
server_thread.join(2.0)
class HTTPProxyTestContext(abc.ABC):
REQUEST_HANDLER_CLASS = None
REQUEST_PROTO = None
def http_server(self, server_class, *args, **kwargs):
return proxy_server(server_class, self.REQUEST_HANDLER_CLASS, *args, **kwargs)
@abc.abstractmethod
def proxy_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs) -> dict:
"""return a dict of proxy_info"""
class HTTPProxyHTTPTestContext(HTTPProxyTestContext):
# Standard HTTP Proxy for http requests
REQUEST_HANDLER_CLASS = HTTPProxyHandler
REQUEST_PROTO = 'http'
def proxy_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs):
request = Request(f'http://{target_domain or "127.0.0.1"}:{target_port or "40000"}/proxy_info', **req_kwargs)
handler.validate(request)
return json.loads(handler.send(request).read().decode())
class HTTPProxyHTTPSTestContext(HTTPProxyTestContext):
# HTTP Connect proxy, for https requests
REQUEST_HANDLER_CLASS = HTTPSProxyHandler
REQUEST_PROTO = 'https'
def proxy_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs):
request = Request(f'https://{target_domain or "127.0.0.1"}:{target_port or "40000"}/proxy_info', **req_kwargs)
handler.validate(request)
return json.loads(handler.send(request).read().decode())
CTX_MAP = {
'http': HTTPProxyHTTPTestContext,
'https': HTTPProxyHTTPSTestContext,
}
@pytest.fixture(scope='module')
def ctx(request):
return CTX_MAP[request.param]()
@pytest.mark.parametrize(
'handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
@pytest.mark.parametrize('ctx', ['http'], indirect=True) # pure http proxy can only support http
class TestHTTPProxy:
def test_http_no_auth(self, handler, ctx):
with ctx.http_server(HTTPProxyHandler) as server_address:
with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert proxy_info['connect'] is False
assert 'Proxy-Authorization' not in proxy_info['headers']
def test_http_auth(self, handler, ctx):
with ctx.http_server(HTTPProxyHandler, username='test', password='test') as server_address:
with handler(proxies={ctx.REQUEST_PROTO: f'http://test:test@{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert 'Proxy-Authorization' in proxy_info['headers']
def test_http_bad_auth(self, handler, ctx):
with ctx.http_server(HTTPProxyHandler, username='test', password='test') as server_address:
with handler(proxies={ctx.REQUEST_PROTO: f'http://test:bad@{server_address}'}) as rh:
with pytest.raises(HTTPError) as exc_info:
ctx.proxy_info_request(rh)
assert exc_info.value.response.status == 407
exc_info.value.response.close()
def test_http_source_address(self, handler, ctx):
with ctx.http_server(HTTPProxyHandler) as server_address:
source_address = f'127.0.0.{random.randint(5, 255)}'
verify_address_availability(source_address)
with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'},
source_address=source_address) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert proxy_info['client_address'][0] == source_address
@pytest.mark.skip_handler('Urllib', 'urllib does not support https proxies')
def test_https(self, handler, ctx):
with ctx.http_server(HTTPSProxyHandler) as server_address:
with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert proxy_info['connect'] is False
assert 'Proxy-Authorization' not in proxy_info['headers']
@pytest.mark.skip_handler('Urllib', 'urllib does not support https proxies')
def test_https_verify_failed(self, handler, ctx):
with ctx.http_server(HTTPSProxyHandler) as server_address:
with handler(verify=True, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh:
# Accept SSLError as may not be feasible to tell if it is proxy or request error.
# note: if request proto also does ssl verification, this may also be the error of the request.
# Until we can support passing custom cacerts to handlers, we cannot properly test this for all cases.
with pytest.raises((ProxyError, SSLError)):
ctx.proxy_info_request(rh)
def test_http_with_idn(self, handler, ctx):
with ctx.http_server(HTTPProxyHandler) as server_address:
with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh, target_domain='中文.tw')
assert proxy_info['proxy'] == server_address
assert proxy_info['path'].startswith('http://xn--fiq228c.tw')
assert proxy_info['headers']['Host'].split(':', 1)[0] == 'xn--fiq228c.tw'
@pytest.mark.parametrize(
'handler,ctx', [
('Requests', 'https'),
('CurlCFFI', 'https'),
], indirect=True)
class TestHTTPConnectProxy:
def test_http_connect_no_auth(self, handler, ctx):
with ctx.http_server(HTTPConnectProxyHandler) as server_address:
with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert proxy_info['connect'] is True
assert 'Proxy-Authorization' not in proxy_info['headers']
def test_http_connect_auth(self, handler, ctx):
with ctx.http_server(HTTPConnectProxyHandler, username='test', password='test') as server_address:
with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://test:test@{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert 'Proxy-Authorization' in proxy_info['headers']
@pytest.mark.skip_handler(
'Requests',
'bug in urllib3 causes unclosed socket: https://github.com/urllib3/urllib3/issues/3374'
)
def test_http_connect_bad_auth(self, handler, ctx):
with ctx.http_server(HTTPConnectProxyHandler, username='test', password='test') as server_address:
with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://test:bad@{server_address}'}) as rh:
with pytest.raises(ProxyError):
ctx.proxy_info_request(rh)
def test_http_connect_source_address(self, handler, ctx):
with ctx.http_server(HTTPConnectProxyHandler) as server_address:
source_address = f'127.0.0.{random.randint(5, 255)}'
verify_address_availability(source_address)
with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'},
source_address=source_address,
verify=False) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert proxy_info['client_address'][0] == source_address
@pytest.mark.skipif(urllib3 is None, reason='requires urllib3 to test')
def test_https_connect_proxy(self, handler, ctx):
with ctx.http_server(HTTPSConnectProxyHandler) as server_address:
with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert proxy_info['connect'] is True
assert 'Proxy-Authorization' not in proxy_info['headers']
@pytest.mark.skipif(urllib3 is None, reason='requires urllib3 to test')
def test_https_connect_verify_failed(self, handler, ctx):
with ctx.http_server(HTTPSConnectProxyHandler) as server_address:
with handler(verify=True, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh:
# Accept SSLError as may not be feasible to tell if it is proxy or request error.
# note: if request proto also does ssl verification, this may also be the error of the request.
# Until we can support passing custom cacerts to handlers, we cannot properly test this for all cases.
with pytest.raises((ProxyError, SSLError)):
ctx.proxy_info_request(rh)
@pytest.mark.skipif(urllib3 is None, reason='requires urllib3 to test')
def test_https_connect_proxy_auth(self, handler, ctx):
with ctx.http_server(HTTPSConnectProxyHandler, username='test', password='test') as server_address:
with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'https://test:test@{server_address}'}) as rh:
proxy_info = ctx.proxy_info_request(rh)
assert proxy_info['proxy'] == server_address
assert 'Proxy-Authorization' in proxy_info['headers']

View file

@ -6,6 +6,8 @@ import sys
import pytest import pytest
from yt_dlp.networking.common import Features, DEFAULT_TIMEOUT
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import gzip import gzip
@ -27,8 +29,12 @@ import zlib
from email.message import Message from email.message import Message
from http.cookiejar import CookieJar from http.cookiejar import CookieJar
from test.conftest import validate_and_send from test.helper import (
from test.helper import FakeYDL, http_server_port, verify_address_availability FakeYDL,
http_server_port,
validate_and_send,
verify_address_availability,
)
from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.cookies import YoutubeDLCookieJar
from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3
from yt_dlp.networking import ( from yt_dlp.networking import (
@ -62,21 +68,6 @@ from yt_dlp.utils.networking import HTTPHeaderDict, std_headers
TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.dirname(os.path.abspath(__file__))
def _build_proxy_handler(name):
class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
proxy_name = name
def log_message(self, format, *args):
pass
def do_GET(self):
self.send_response(200)
self.send_header('Content-Type', 'text/plain; charset=utf-8')
self.end_headers()
self.wfile.write(f'{self.proxy_name}: {self.path}'.encode())
return HTTPTestRequestHandler
class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1' protocol_version = 'HTTP/1.1'
default_request_version = 'HTTP/1.1' default_request_version = 'HTTP/1.1'
@ -317,8 +308,9 @@ class TestRequestHandlerBase:
cls.https_server_thread.start() cls.https_server_thread.start()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
class TestHTTPRequestHandler(TestRequestHandlerBase): class TestHTTPRequestHandler(TestRequestHandlerBase):
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_verify_cert(self, handler): def test_verify_cert(self, handler):
with handler() as rh: with handler() as rh:
with pytest.raises(CertificateVerifyError): with pytest.raises(CertificateVerifyError):
@ -329,7 +321,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert r.status == 200 assert r.status == 200
r.close() r.close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_ssl_error(self, handler): def test_ssl_error(self, handler):
# HTTPS server with too old TLS version # HTTPS server with too old TLS version
# XXX: is there a better way to test this than to create a new server? # XXX: is there a better way to test this than to create a new server?
@ -347,7 +338,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers'))
assert not issubclass(exc_info.type, CertificateVerifyError) assert not issubclass(exc_info.type, CertificateVerifyError)
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_percent_encode(self, handler): def test_percent_encode(self, handler):
with handler() as rh: with handler() as rh:
# Unicode characters should be encoded with uppercase percent-encoding # Unicode characters should be encoded with uppercase percent-encoding
@ -359,7 +349,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.status == 200 assert res.status == 200
res.close() res.close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
@pytest.mark.parametrize('path', [ @pytest.mark.parametrize('path', [
'/a/b/./../../headers', '/a/b/./../../headers',
'/redirect_dotsegments', '/redirect_dotsegments',
@ -375,15 +364,13 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.url == f'http://127.0.0.1:{self.http_port}/headers' assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
res.close() res.close()
# Not supported by CurlCFFI (non-standard) @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi (non-standard)')
@pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True)
def test_unicode_path_redirection(self, handler): def test_unicode_path_redirection(self, handler):
with handler() as rh: with handler() as rh:
r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect'))
assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html'
r.close() r.close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_raise_http_error(self, handler): def test_raise_http_error(self, handler):
with handler() as rh: with handler() as rh:
for bad_status in (400, 500, 599, 302): for bad_status in (400, 500, 599, 302):
@ -393,7 +380,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
# Should not raise an error # Should not raise an error
validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_response_url(self, handler): def test_response_url(self, handler):
with handler() as rh: with handler() as rh:
# Response url should be that of the last url in redirect chain # Response url should be that of the last url in redirect chain
@ -405,7 +391,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
res2.close() res2.close()
# Covers some basic cases we expect some level of consistency between request handlers for # Covers some basic cases we expect some level of consistency between request handlers for
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
@pytest.mark.parametrize('redirect_status,method,expected', [ @pytest.mark.parametrize('redirect_status,method,expected', [
# A 303 must either use GET or HEAD for subsequent request # A 303 must either use GET or HEAD for subsequent request
(303, 'POST', ('', 'GET', False)), (303, 'POST', ('', 'GET', False)),
@ -447,7 +432,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert expected[1] == res.headers.get('method') assert expected[1] == res.headers.get('method')
assert expected[2] == ('content-length' in headers.decode().lower()) assert expected[2] == ('content-length' in headers.decode().lower())
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_request_cookie_header(self, handler): def test_request_cookie_header(self, handler):
# We should accept a Cookie header being passed as in normal headers and handle it appropriately. # We should accept a Cookie header being passed as in normal headers and handle it appropriately.
with handler() as rh: with handler() as rh:
@ -480,19 +464,16 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert b'cookie: test=ytdlp' not in data.lower() assert b'cookie: test=ytdlp' not in data.lower()
assert b'cookie: test=test3' in data.lower() assert b'cookie: test=test3' in data.lower()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_redirect_loop(self, handler): def test_redirect_loop(self, handler):
with handler() as rh: with handler() as rh:
with pytest.raises(HTTPError, match='redirect loop'): with pytest.raises(HTTPError, match='redirect loop'):
validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop'))
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_incompleteread(self, handler): def test_incompleteread(self, handler):
with handler(timeout=2) as rh: with handler(timeout=2) as rh:
with pytest.raises(IncompleteRead, match='13 bytes read, 234221 more expected'): with pytest.raises(IncompleteRead, match='13 bytes read, 234221 more expected'):
validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_cookies(self, handler): def test_cookies(self, handler):
cookiejar = YoutubeDLCookieJar() cookiejar = YoutubeDLCookieJar()
cookiejar.set_cookie(http.cookiejar.Cookie( cookiejar.set_cookie(http.cookiejar.Cookie(
@ -509,7 +490,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read()
assert b'cookie: test=ytdlp' in data.lower() assert b'cookie: test=ytdlp' in data.lower()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_headers(self, handler): def test_headers(self, handler):
with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh:
@ -525,7 +505,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert b'test2: test2' not in data assert b'test2: test2' not in data
assert b'test3: test3' in data assert b'test3: test3' in data
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_read_timeout(self, handler): def test_read_timeout(self, handler):
with handler() as rh: with handler() as rh:
# Default timeout is 20 seconds, so this should go through # Default timeout is 20 seconds, so this should go through
@ -541,26 +520,21 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
validate_and_send( validate_and_send(
rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4}))
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_connect_timeout(self, handler): def test_connect_timeout(self, handler):
# nothing should be listening on this port # nothing should be listening on this port
connect_timeout_url = 'http://10.255.255.255' connect_timeout_url = 'http://10.255.255.255'
with handler(timeout=0.01) as rh: with handler(timeout=0.01) as rh, pytest.raises(TransportError):
now = time.time() now = time.time()
with pytest.raises(TransportError): validate_and_send(rh, Request(connect_timeout_url))
validate_and_send( assert time.time() - now < DEFAULT_TIMEOUT
rh, Request(connect_timeout_url))
assert 0.01 <= time.time() - now < 20
with handler() as rh: # Per request timeout, should override handler timeout
with pytest.raises(TransportError): request = Request(connect_timeout_url, extensions={'timeout': 0.01})
# Per request timeout, should override handler timeout with handler() as rh, pytest.raises(TransportError):
now = time.time() now = time.time()
validate_and_send( validate_and_send(rh, request)
rh, Request(connect_timeout_url, extensions={'timeout': 0.01})) assert time.time() - now < DEFAULT_TIMEOUT
assert 0.01 <= time.time() - now < 20
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_source_address(self, handler): def test_source_address(self, handler):
source_address = f'127.0.0.{random.randint(5, 255)}' source_address = f'127.0.0.{random.randint(5, 255)}'
# on some systems these loopback addresses we need for testing may not be available # on some systems these loopback addresses we need for testing may not be available
@ -572,13 +546,13 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert source_address == data assert source_address == data
# Not supported by CurlCFFI # Not supported by CurlCFFI
@pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi')
def test_gzip_trailing_garbage(self, handler): def test_gzip_trailing_garbage(self, handler):
with handler() as rh: with handler() as rh:
data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode()
assert data == '<html><video src="/vid.mp4" /></html>' assert data == '<html><video src="/vid.mp4" /></html>'
@pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) @pytest.mark.skip_handler('CurlCFFI', 'not applicable to curl-cffi')
@pytest.mark.skipif(not brotli, reason='brotli support is not installed') @pytest.mark.skipif(not brotli, reason='brotli support is not installed')
def test_brotli(self, handler): def test_brotli(self, handler):
with handler() as rh: with handler() as rh:
@ -589,7 +563,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.headers.get('Content-Encoding') == 'br' assert res.headers.get('Content-Encoding') == 'br'
assert res.read() == b'<html><video src="/vid.mp4" /></html>' assert res.read() == b'<html><video src="/vid.mp4" /></html>'
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_deflate(self, handler): def test_deflate(self, handler):
with handler() as rh: with handler() as rh:
res = validate_and_send( res = validate_and_send(
@ -599,7 +572,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.headers.get('Content-Encoding') == 'deflate' assert res.headers.get('Content-Encoding') == 'deflate'
assert res.read() == b'<html><video src="/vid.mp4" /></html>' assert res.read() == b'<html><video src="/vid.mp4" /></html>'
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_gzip(self, handler): def test_gzip(self, handler):
with handler() as rh: with handler() as rh:
res = validate_and_send( res = validate_and_send(
@ -609,7 +581,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.headers.get('Content-Encoding') == 'gzip' assert res.headers.get('Content-Encoding') == 'gzip'
assert res.read() == b'<html><video src="/vid.mp4" /></html>' assert res.read() == b'<html><video src="/vid.mp4" /></html>'
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_multiple_encodings(self, handler): def test_multiple_encodings(self, handler):
with handler() as rh: with handler() as rh:
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
@ -620,8 +591,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.headers.get('Content-Encoding') == pair assert res.headers.get('Content-Encoding') == pair
assert res.read() == b'<html><video src="/vid.mp4" /></html>' assert res.read() == b'<html><video src="/vid.mp4" /></html>'
# Not supported by curl_cffi @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi')
@pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True)
def test_unsupported_encoding(self, handler): def test_unsupported_encoding(self, handler):
with handler() as rh: with handler() as rh:
res = validate_and_send( res = validate_and_send(
@ -631,7 +601,6 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.headers.get('Content-Encoding') == 'unsupported' assert res.headers.get('Content-Encoding') == 'unsupported'
assert res.read() == b'raw' assert res.read() == b'raw'
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_read(self, handler): def test_read(self, handler):
with handler() as rh: with handler() as rh:
res = validate_and_send( res = validate_and_send(
@ -642,83 +611,48 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
assert res.read().decode().endswith('\n\n') assert res.read().decode().endswith('\n\n')
assert res.read() == b'' assert res.read() == b''
def test_request_disable_proxy(self, handler):
for proxy_proto in handler._SUPPORTED_PROXY_SCHEMES or ['http']:
# Given the handler is configured with a proxy
with handler(proxies={'http': f'{proxy_proto}://10.255.255.255'}, timeout=5) as rh:
# When a proxy is explicitly set to None for the request
res = validate_and_send(
rh, Request(f'http://127.0.0.1:{self.http_port}/headers', proxies={'http': None}))
# Then no proxy should be used
res.close()
assert res.status == 200
class TestHTTPProxy(TestRequestHandlerBase): @pytest.mark.skip_handlers_if(
# Note: this only tests http urls over non-CONNECT proxy lambda _, handler: Features.NO_PROXY not in handler._SUPPORTED_FEATURES, 'handler does not support NO_PROXY')
@classmethod
def setup_class(cls):
super().setup_class()
# HTTP Proxy server
cls.proxy = http.server.ThreadingHTTPServer(
('127.0.0.1', 0), _build_proxy_handler('normal'))
cls.proxy_port = http_server_port(cls.proxy)
cls.proxy_thread = threading.Thread(target=cls.proxy.serve_forever)
cls.proxy_thread.daemon = True
cls.proxy_thread.start()
# Geo proxy server
cls.geo_proxy = http.server.ThreadingHTTPServer(
('127.0.0.1', 0), _build_proxy_handler('geo'))
cls.geo_port = http_server_port(cls.geo_proxy)
cls.geo_proxy_thread = threading.Thread(target=cls.geo_proxy.serve_forever)
cls.geo_proxy_thread.daemon = True
cls.geo_proxy_thread.start()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_http_proxy(self, handler):
http_proxy = f'http://127.0.0.1:{self.proxy_port}'
geo_proxy = f'http://127.0.0.1:{self.geo_port}'
# Test global http proxy
# Test per request http proxy
# Test per request http proxy disables proxy
url = 'http://foo.com/bar'
# Global HTTP proxy
with handler(proxies={'http': http_proxy}) as rh:
res = validate_and_send(rh, Request(url)).read().decode()
assert res == f'normal: {url}'
# Per request proxy overrides global
res = validate_and_send(rh, Request(url, proxies={'http': geo_proxy})).read().decode()
assert res == f'geo: {url}'
# and setting to None disables all proxies for that request
real_url = f'http://127.0.0.1:{self.http_port}/headers'
res = validate_and_send(
rh, Request(real_url, proxies={'http': None})).read().decode()
assert res != f'normal: {real_url}'
assert 'Accept' in res
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_noproxy(self, handler): def test_noproxy(self, handler):
with handler(proxies={'proxy': f'http://127.0.0.1:{self.proxy_port}'}) as rh: for proxy_proto in handler._SUPPORTED_PROXY_SCHEMES or ['http']:
# NO_PROXY # Given the handler is configured with a proxy
for no_proxy in (f'127.0.0.1:{self.http_port}', '127.0.0.1', 'localhost'): with handler(proxies={'http': f'{proxy_proto}://10.255.255.255'}, timeout=5) as rh:
nop_response = validate_and_send( for no_proxy in (f'127.0.0.1:{self.http_port}', '127.0.0.1', 'localhost'):
rh, Request(f'http://127.0.0.1:{self.http_port}/headers', proxies={'no': no_proxy})).read().decode( # When request no proxy includes the request url host
'utf-8') nop_response = validate_and_send(
assert 'Accept' in nop_response rh, Request(f'http://127.0.0.1:{self.http_port}/headers', proxies={'no': no_proxy}))
# Then the proxy should not be used
assert nop_response.status == 200
nop_response.close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) @pytest.mark.skip_handlers_if(
lambda _, handler: Features.ALL_PROXY not in handler._SUPPORTED_FEATURES, 'handler does not support ALL_PROXY')
def test_allproxy(self, handler): def test_allproxy(self, handler):
url = 'http://foo.com/bar' # This is a bit of a hacky test, but it should be enough to check whether the handler is using the proxy.
with handler() as rh: # 0.1s might not be enough of a timeout if proxy is not used in all cases, but should still get failures.
response = validate_and_send(rh, Request(url, proxies={'all': f'http://127.0.0.1:{self.proxy_port}'})).read().decode( with handler(proxies={'all': 'http://10.255.255.255'}, timeout=0.1) as rh:
'utf-8') with pytest.raises(TransportError):
assert response == f'normal: {url}' validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) with handler(timeout=0.1) as rh:
def test_http_proxy_with_idn(self, handler): with pytest.raises(TransportError):
with handler(proxies={ validate_and_send(
'http': f'http://127.0.0.1:{self.proxy_port}', rh, Request(
}) as rh: f'http://127.0.0.1:{self.http_port}/headers', proxies={'all': 'http://10.255.255.255'})).close()
url = 'http://中文.tw/'
response = rh.send(Request(url)).read().decode()
# b'xn--fiq228c' is '中文'.encode('idna')
assert response == 'normal: http://xn--fiq228c.tw/'
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
class TestClientCertificate: class TestClientCertificate:
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
@ -745,27 +679,23 @@ class TestClientCertificate:
) as rh: ) as rh:
validate_and_send(rh, Request(f'https://127.0.0.1:{self.port}/video.html')).read().decode() validate_and_send(rh, Request(f'https://127.0.0.1:{self.port}/video.html')).read().decode()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_certificate_combined_nopass(self, handler): def test_certificate_combined_nopass(self, handler):
self._run_test(handler, client_cert={ self._run_test(handler, client_cert={
'client_certificate': os.path.join(self.certdir, 'clientwithkey.crt'), 'client_certificate': os.path.join(self.certdir, 'clientwithkey.crt'),
}) })
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_certificate_nocombined_nopass(self, handler): def test_certificate_nocombined_nopass(self, handler):
self._run_test(handler, client_cert={ self._run_test(handler, client_cert={
'client_certificate': os.path.join(self.certdir, 'client.crt'), 'client_certificate': os.path.join(self.certdir, 'client.crt'),
'client_certificate_key': os.path.join(self.certdir, 'client.key'), 'client_certificate_key': os.path.join(self.certdir, 'client.key'),
}) })
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_certificate_combined_pass(self, handler): def test_certificate_combined_pass(self, handler):
self._run_test(handler, client_cert={ self._run_test(handler, client_cert={
'client_certificate': os.path.join(self.certdir, 'clientwithencryptedkey.crt'), 'client_certificate': os.path.join(self.certdir, 'clientwithencryptedkey.crt'),
'client_certificate_password': 'foobar', 'client_certificate_password': 'foobar',
}) })
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_certificate_nocombined_pass(self, handler): def test_certificate_nocombined_pass(self, handler):
self._run_test(handler, client_cert={ self._run_test(handler, client_cert={
'client_certificate': os.path.join(self.certdir, 'client.crt'), 'client_certificate': os.path.join(self.certdir, 'client.crt'),
@ -785,6 +715,25 @@ class TestHTTPImpersonateRequestHandler(TestRequestHandlerBase):
assert res.status == 200 assert res.status == 200
assert std_headers['user-agent'].lower() not in res.read().decode().lower() assert std_headers['user-agent'].lower() not in res.read().decode().lower()
def test_response_extensions(self, handler):
with handler() as rh:
for target in rh.supported_targets:
request = Request(
f'http://127.0.0.1:{self.http_port}/gen_200', extensions={'impersonate': target})
res = validate_and_send(rh, request)
assert res.extensions['impersonate'] == rh._get_request_target(request)
def test_http_error_response_extensions(self, handler):
with handler() as rh:
for target in rh.supported_targets:
request = Request(
f'http://127.0.0.1:{self.http_port}/gen_404', extensions={'impersonate': target})
try:
validate_and_send(rh, request)
except HTTPError as e:
res = e.response
assert res.extensions['impersonate'] == rh._get_request_target(request)
class TestRequestHandlerMisc: class TestRequestHandlerMisc:
"""Misc generic tests for request handlers, not related to request or validation testing""" """Misc generic tests for request handlers, not related to request or validation testing"""
@ -805,8 +754,8 @@ class TestRequestHandlerMisc:
assert len(logging_handlers) == before_count assert len(logging_handlers) == before_count
@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
class TestUrllibRequestHandler(TestRequestHandlerBase): class TestUrllibRequestHandler(TestRequestHandlerBase):
@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_file_urls(self, handler): def test_file_urls(self, handler):
# See https://github.com/ytdl-org/youtube-dl/issues/8227 # See https://github.com/ytdl-org/youtube-dl/issues/8227
tf = tempfile.NamedTemporaryFile(delete=False) tf = tempfile.NamedTemporaryFile(delete=False)
@ -828,7 +777,6 @@ class TestUrllibRequestHandler(TestRequestHandlerBase):
os.unlink(tf.name) os.unlink(tf.name)
@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_http_error_returns_content(self, handler): def test_http_error_returns_content(self, handler):
# urllib HTTPError will try close the underlying response if reference to the HTTPError object is lost # urllib HTTPError will try close the underlying response if reference to the HTTPError object is lost
def get_response(): def get_response():
@ -841,7 +789,6 @@ class TestUrllibRequestHandler(TestRequestHandlerBase):
assert get_response().read() == b'<html></html>' assert get_response().read() == b'<html></html>'
@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_verify_cert_error_text(self, handler): def test_verify_cert_error_text(self, handler):
# Check the output of the error message # Check the output of the error message
with handler() as rh: with handler() as rh:
@ -851,7 +798,6 @@ class TestUrllibRequestHandler(TestRequestHandlerBase):
): ):
validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers')) validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers'))
@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
@pytest.mark.parametrize('req,match,version_check', [ @pytest.mark.parametrize('req,match,version_check', [
# https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256 # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
# bpo-39603: Check implemented in 3.7.9+, 3.8.5+ # bpo-39603: Check implemented in 3.7.9+, 3.8.5+
@ -1183,7 +1129,7 @@ class TestRequestHandlerValidation:
] ]
PROXY_SCHEME_TESTS = [ PROXY_SCHEME_TESTS = [
# scheme, expected to fail # proxy scheme, expected to fail
('Urllib', 'http', [ ('Urllib', 'http', [
('http', False), ('http', False),
('https', UnsupportedRequest), ('https', UnsupportedRequest),
@ -1209,30 +1155,41 @@ class TestRequestHandlerValidation:
('socks5', False), ('socks5', False),
('socks5h', False), ('socks5h', False),
]), ]),
('Websockets', 'ws', [
('http', UnsupportedRequest),
('https', UnsupportedRequest),
('socks4', False),
('socks4a', False),
('socks5', False),
('socks5h', False),
]),
(NoCheckRH, 'http', [('http', False)]), (NoCheckRH, 'http', [('http', False)]),
(HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]), (HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]),
('Websockets', 'ws', [('http', UnsupportedRequest)]),
(NoCheckRH, 'http', [('http', False)]), (NoCheckRH, 'http', [('http', False)]),
(HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]), (HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]),
] ]
PROXY_KEY_TESTS = [ PROXY_KEY_TESTS = [
# key, expected to fail # proxy key, proxy scheme, expected to fail
('Urllib', [ ('Urllib', 'http', [
('all', False), ('all', 'http', False),
('unrelated', False), ('unrelated', 'http', False),
]), ]),
('Requests', [ ('Requests', 'http', [
('all', False), ('all', 'http', False),
('unrelated', False), ('unrelated', 'http', False),
]), ]),
('CurlCFFI', [ ('CurlCFFI', 'http', [
('all', False), ('all', 'http', False),
('unrelated', False), ('unrelated', 'http', False),
]), ]),
(NoCheckRH, [('all', False)]), ('Websockets', 'ws', [
(HTTPSupportedRH, [('all', UnsupportedRequest)]), ('all', 'socks5', False),
(HTTPSupportedRH, [('no', UnsupportedRequest)]), ('unrelated', 'socks5', False),
]),
(NoCheckRH, 'http', [('all', 'http', False)]),
(HTTPSupportedRH, 'http', [('all', 'http', UnsupportedRequest)]),
(HTTPSupportedRH, 'http', [('no', 'http', UnsupportedRequest)]),
] ]
EXTENSION_TESTS = [ EXTENSION_TESTS = [
@ -1274,28 +1231,54 @@ class TestRequestHandlerValidation:
]), ]),
] ]
@pytest.mark.parametrize('handler,fail,scheme', [
('Urllib', False, 'http'),
('Requests', False, 'http'),
('CurlCFFI', False, 'http'),
('Websockets', False, 'ws')
], indirect=['handler'])
def test_no_proxy(self, handler, fail, scheme):
run_validation(handler, fail, Request(f'{scheme}://', proxies={'no': '127.0.0.1,github.com'}))
run_validation(handler, fail, Request(f'{scheme}://'), proxies={'no': '127.0.0.1,github.com'})
@pytest.mark.parametrize('handler,scheme', [
('Urllib', 'http'),
(HTTPSupportedRH, 'http'),
('Requests', 'http'),
('CurlCFFI', 'http'),
('Websockets', 'ws')
], indirect=['handler'])
def test_empty_proxy(self, handler, scheme):
run_validation(handler, False, Request(f'{scheme}://', proxies={scheme: None}))
run_validation(handler, False, Request(f'{scheme}://'), proxies={scheme: None})
@pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1', '/a/b/c'])
@pytest.mark.parametrize('handler,scheme', [
('Urllib', 'http'),
(HTTPSupportedRH, 'http'),
('Requests', 'http'),
('CurlCFFI', 'http'),
('Websockets', 'ws')
], indirect=['handler'])
def test_invalid_proxy_url(self, handler, scheme, proxy_url):
run_validation(handler, UnsupportedRequest, Request(f'{scheme}://', proxies={scheme: proxy_url}))
@pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [ @pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [
(handler_tests[0], scheme, fail, handler_kwargs) (handler_tests[0], scheme, fail, handler_kwargs)
for handler_tests in URL_SCHEME_TESTS for handler_tests in URL_SCHEME_TESTS
for scheme, fail, handler_kwargs in handler_tests[1] for scheme, fail, handler_kwargs in handler_tests[1]
], indirect=['handler']) ], indirect=['handler'])
def test_url_scheme(self, handler, scheme, fail, handler_kwargs): def test_url_scheme(self, handler, scheme, fail, handler_kwargs):
run_validation(handler, fail, Request(f'{scheme}://'), **(handler_kwargs or {})) run_validation(handler, fail, Request(f'{scheme}://'), **(handler_kwargs or {}))
@pytest.mark.parametrize('handler,fail', [('Urllib', False), ('Requests', False), ('CurlCFFI', False)], indirect=['handler']) @pytest.mark.parametrize('handler,scheme,proxy_key,proxy_scheme,fail', [
def test_no_proxy(self, handler, fail): (handler_tests[0], handler_tests[1], proxy_key, proxy_scheme, fail)
run_validation(handler, fail, Request('http://', proxies={'no': '127.0.0.1,github.com'}))
run_validation(handler, fail, Request('http://'), proxies={'no': '127.0.0.1,github.com'})
@pytest.mark.parametrize('handler,proxy_key,fail', [
(handler_tests[0], proxy_key, fail)
for handler_tests in PROXY_KEY_TESTS for handler_tests in PROXY_KEY_TESTS
for proxy_key, fail in handler_tests[1] for proxy_key, proxy_scheme, fail in handler_tests[2]
], indirect=['handler']) ], indirect=['handler'])
def test_proxy_key(self, handler, proxy_key, fail): def test_proxy_key(self, handler, scheme, proxy_key, proxy_scheme, fail):
run_validation(handler, fail, Request('http://', proxies={proxy_key: 'http://example.com'})) run_validation(handler, fail, Request(f'{scheme}://', proxies={proxy_key: f'{proxy_scheme}://example.com'}))
run_validation(handler, fail, Request('http://'), proxies={proxy_key: 'http://example.com'}) run_validation(handler, fail, Request(f'{scheme}://'), proxies={proxy_key: f'{proxy_scheme}://example.com'})
@pytest.mark.parametrize('handler,req_scheme,scheme,fail', [ @pytest.mark.parametrize('handler,req_scheme,scheme,fail', [
(handler_tests[0], handler_tests[1], scheme, fail) (handler_tests[0], handler_tests[1], scheme, fail)
@ -1306,16 +1289,6 @@ class TestRequestHandlerValidation:
run_validation(handler, fail, Request(f'{req_scheme}://', proxies={req_scheme: f'{scheme}://example.com'})) run_validation(handler, fail, Request(f'{req_scheme}://', proxies={req_scheme: f'{scheme}://example.com'}))
run_validation(handler, fail, Request(f'{req_scheme}://'), proxies={req_scheme: f'{scheme}://example.com'}) run_validation(handler, fail, Request(f'{req_scheme}://'), proxies={req_scheme: f'{scheme}://example.com'})
@pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH, 'Requests', 'CurlCFFI'], indirect=True)
def test_empty_proxy(self, handler):
run_validation(handler, False, Request('http://', proxies={'http': None}))
run_validation(handler, False, Request('http://'), proxies={'http': None})
@pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1', '/a/b/c'])
@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
def test_invalid_proxy_url(self, handler, proxy_url):
run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url}))
@pytest.mark.parametrize('handler,scheme,extensions,fail', [ @pytest.mark.parametrize('handler,scheme,extensions,fail', [
(handler_tests[0], handler_tests[1], extensions, fail) (handler_tests[0], handler_tests[1], extensions, fail)
for handler_tests in EXTENSION_TESTS for handler_tests in EXTENSION_TESTS

View file

@ -2059,7 +2059,22 @@ Line 1
assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz')
@unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows') @unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows')
def test_Popen_windows_escaping(self): def test_windows_escaping(self):
tests = [
'test"&',
'%CMDCMDLINE:~-1%&',
'a\nb',
'"',
'\\',
'!',
'^!',
'a \\ b',
'a \\" b',
'a \\ b\\',
# We replace \r with \n
('a\r\ra', 'a\n\na'),
]
def run_shell(args): def run_shell(args):
stdout, stderr, error = Popen.run( stdout, stderr, error = Popen.run(
args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@ -2067,15 +2082,15 @@ Line 1
assert not error assert not error
return stdout return stdout
# Test escaping for argument in tests:
assert run_shell(['echo', 'test"&']) == '"test""&"\n' if isinstance(argument, str):
assert run_shell(['echo', '%CMDCMDLINE:~-1%&']) == '"%CMDCMDLINE:~-1%&"\n' expected = argument
assert run_shell(['echo', 'a\nb']) == '"a"\n"b"\n' else:
assert run_shell(['echo', '"']) == '""""\n' argument, expected = argument
assert run_shell(['echo', '\\']) == '\\\n'
# Test if delayed expansion is disabled args = [sys.executable, '-c', 'import sys; print(end=sys.argv[1])', argument, 'end']
assert run_shell(['echo', '^!']) == '"^!"\n' assert run_shell(args) == expected
assert run_shell('echo "^!"') == '"^!"\n' assert run_shell(shell_quote(args, shell=True)) == expected
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -3,10 +3,12 @@
# Allow direct execution # Allow direct execution
import os import os
import sys import sys
import time
import pytest import pytest
from test.helper import verify_address_availability from test.helper import verify_address_availability
from yt_dlp.networking.common import Features, DEFAULT_TIMEOUT
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@ -18,7 +20,7 @@ import random
import ssl import ssl
import threading import threading
from yt_dlp import socks from yt_dlp import socks, traverse_obj
from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.cookies import YoutubeDLCookieJar
from yt_dlp.dependencies import websockets from yt_dlp.dependencies import websockets
from yt_dlp.networking import Request from yt_dlp.networking import Request
@ -114,6 +116,7 @@ def ws_validate_and_send(rh, req):
@pytest.mark.skipif(not websockets, reason='websockets must be installed to test websocket request handlers') @pytest.mark.skipif(not websockets, reason='websockets must be installed to test websocket request handlers')
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
class TestWebsSocketRequestHandlerConformance: class TestWebsSocketRequestHandlerConformance:
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
@ -129,7 +132,6 @@ class TestWebsSocketRequestHandlerConformance:
cls.mtls_wss_thread, cls.mtls_wss_port = create_mtls_wss_websocket_server() cls.mtls_wss_thread, cls.mtls_wss_port = create_mtls_wss_websocket_server()
cls.mtls_wss_base_url = f'wss://127.0.0.1:{cls.mtls_wss_port}' cls.mtls_wss_base_url = f'wss://127.0.0.1:{cls.mtls_wss_port}'
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_basic_websockets(self, handler): def test_basic_websockets(self, handler):
with handler() as rh: with handler() as rh:
ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws = ws_validate_and_send(rh, Request(self.ws_base_url))
@ -141,7 +143,6 @@ class TestWebsSocketRequestHandlerConformance:
# https://www.rfc-editor.org/rfc/rfc6455.html#section-5.6 # https://www.rfc-editor.org/rfc/rfc6455.html#section-5.6
@pytest.mark.parametrize('msg,opcode', [('str', 1), (b'bytes', 2)]) @pytest.mark.parametrize('msg,opcode', [('str', 1), (b'bytes', 2)])
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_send_types(self, handler, msg, opcode): def test_send_types(self, handler, msg, opcode):
with handler() as rh: with handler() as rh:
ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws = ws_validate_and_send(rh, Request(self.ws_base_url))
@ -149,7 +150,6 @@ class TestWebsSocketRequestHandlerConformance:
assert int(ws.recv()) == opcode assert int(ws.recv()) == opcode
ws.close() ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_verify_cert(self, handler): def test_verify_cert(self, handler):
with handler() as rh: with handler() as rh:
with pytest.raises(CertificateVerifyError): with pytest.raises(CertificateVerifyError):
@ -160,14 +160,12 @@ class TestWebsSocketRequestHandlerConformance:
assert ws.status == 101 assert ws.status == 101
ws.close() ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_ssl_error(self, handler): def test_ssl_error(self, handler):
with handler(verify=False) as rh: with handler(verify=False) as rh:
with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info: with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info:
ws_validate_and_send(rh, Request(self.bad_wss_host)) ws_validate_and_send(rh, Request(self.bad_wss_host))
assert not issubclass(exc_info.type, CertificateVerifyError) assert not issubclass(exc_info.type, CertificateVerifyError)
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('path,expected', [ @pytest.mark.parametrize('path,expected', [
# Unicode characters should be encoded with uppercase percent-encoding # Unicode characters should be encoded with uppercase percent-encoding
('/中文', '/%E4%B8%AD%E6%96%87'), ('/中文', '/%E4%B8%AD%E6%96%87'),
@ -182,7 +180,6 @@ class TestWebsSocketRequestHandlerConformance:
assert ws.status == 101 assert ws.status == 101
ws.close() ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_remove_dot_segments(self, handler): def test_remove_dot_segments(self, handler):
with handler() as rh: with handler() as rh:
# This isn't a comprehensive test, # This isn't a comprehensive test,
@ -195,7 +192,6 @@ class TestWebsSocketRequestHandlerConformance:
# We are restricted to known HTTP status codes in http.HTTPStatus # We are restricted to known HTTP status codes in http.HTTPStatus
# Redirects are not supported for websockets # Redirects are not supported for websockets
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('status', (200, 204, 301, 302, 303, 400, 500, 511)) @pytest.mark.parametrize('status', (200, 204, 301, 302, 303, 400, 500, 511))
def test_raise_http_error(self, handler, status): def test_raise_http_error(self, handler, status):
with handler() as rh: with handler() as rh:
@ -203,17 +199,30 @@ class TestWebsSocketRequestHandlerConformance:
ws_validate_and_send(rh, Request(f'{self.ws_base_url}/gen_{status}')) ws_validate_and_send(rh, Request(f'{self.ws_base_url}/gen_{status}'))
assert exc_info.value.status == status assert exc_info.value.status == status
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('params,extensions', [ @pytest.mark.parametrize('params,extensions', [
({'timeout': sys.float_info.min}, {}), ({'timeout': sys.float_info.min}, {}),
({}, {'timeout': sys.float_info.min}), ({}, {'timeout': sys.float_info.min}),
]) ])
def test_timeout(self, handler, params, extensions): def test_read_timeout(self, handler, params, extensions):
with handler(**params) as rh: with handler(**params) as rh:
with pytest.raises(TransportError): with pytest.raises(TransportError):
ws_validate_and_send(rh, Request(self.ws_base_url, extensions=extensions)) ws_validate_and_send(rh, Request(self.ws_base_url, extensions=extensions))
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True) def test_connect_timeout(self, handler):
# nothing should be listening on this port
connect_timeout_url = 'ws://10.255.255.255'
with handler(timeout=0.01) as rh, pytest.raises(TransportError):
now = time.time()
ws_validate_and_send(rh, Request(connect_timeout_url))
assert time.time() - now < DEFAULT_TIMEOUT
# Per request timeout, should override handler timeout
request = Request(connect_timeout_url, extensions={'timeout': 0.01})
with handler() as rh, pytest.raises(TransportError):
now = time.time()
ws_validate_and_send(rh, request)
assert time.time() - now < DEFAULT_TIMEOUT
def test_cookies(self, handler): def test_cookies(self, handler):
cookiejar = YoutubeDLCookieJar() cookiejar = YoutubeDLCookieJar()
cookiejar.set_cookie(http.cookiejar.Cookie( cookiejar.set_cookie(http.cookiejar.Cookie(
@ -239,7 +248,6 @@ class TestWebsSocketRequestHandlerConformance:
assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' assert json.loads(ws.recv())['cookie'] == 'test=ytdlp'
ws.close() ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_source_address(self, handler): def test_source_address(self, handler):
source_address = f'127.0.0.{random.randint(5, 255)}' source_address = f'127.0.0.{random.randint(5, 255)}'
verify_address_availability(source_address) verify_address_availability(source_address)
@ -249,7 +257,6 @@ class TestWebsSocketRequestHandlerConformance:
assert source_address == ws.recv() assert source_address == ws.recv()
ws.close() ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_response_url(self, handler): def test_response_url(self, handler):
with handler() as rh: with handler() as rh:
url = f'{self.ws_base_url}/something' url = f'{self.ws_base_url}/something'
@ -257,7 +264,6 @@ class TestWebsSocketRequestHandlerConformance:
assert ws.url == url assert ws.url == url
ws.close() ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_request_headers(self, handler): def test_request_headers(self, handler):
with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh:
# Global Headers # Global Headers
@ -293,7 +299,6 @@ class TestWebsSocketRequestHandlerConformance:
'client_certificate_password': 'foobar', 'client_certificate_password': 'foobar',
} }
)) ))
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_mtls(self, handler, client_cert): def test_mtls(self, handler, client_cert):
with handler( with handler(
# Disable client-side validation of unacceptable self-signed testcert.pem # Disable client-side validation of unacceptable self-signed testcert.pem
@ -303,6 +308,44 @@ class TestWebsSocketRequestHandlerConformance:
) as rh: ) as rh:
ws_validate_and_send(rh, Request(self.mtls_wss_base_url)).close() ws_validate_and_send(rh, Request(self.mtls_wss_base_url)).close()
def test_request_disable_proxy(self, handler):
for proxy_proto in handler._SUPPORTED_PROXY_SCHEMES or ['ws']:
# Given handler is configured with a proxy
with handler(proxies={'ws': f'{proxy_proto}://10.255.255.255'}, timeout=5) as rh:
# When a proxy is explicitly set to None for the request
ws = ws_validate_and_send(rh, Request(self.ws_base_url, proxies={'http': None}))
# Then no proxy should be used
assert ws.status == 101
ws.close()
@pytest.mark.skip_handlers_if(
lambda _, handler: Features.NO_PROXY not in handler._SUPPORTED_FEATURES, 'handler does not support NO_PROXY')
def test_noproxy(self, handler):
for proxy_proto in handler._SUPPORTED_PROXY_SCHEMES or ['ws']:
# Given the handler is configured with a proxy
with handler(proxies={'ws': f'{proxy_proto}://10.255.255.255'}, timeout=5) as rh:
for no_proxy in (f'127.0.0.1:{self.ws_port}', '127.0.0.1', 'localhost'):
# When request no proxy includes the request url host
ws = ws_validate_and_send(rh, Request(self.ws_base_url, proxies={'no': no_proxy}))
# Then the proxy should not be used
assert ws.status == 101
ws.close()
@pytest.mark.skip_handlers_if(
lambda _, handler: Features.ALL_PROXY not in handler._SUPPORTED_FEATURES, 'handler does not support ALL_PROXY')
def test_allproxy(self, handler):
supported_proto = traverse_obj(handler._SUPPORTED_PROXY_SCHEMES, 0, default='ws')
# This is a bit of a hacky test, but it should be enough to check whether the handler is using the proxy.
# 0.1s might not be enough of a timeout if proxy is not used in all cases, but should still get failures.
with handler(proxies={'all': f'{supported_proto}://10.255.255.255'}, timeout=0.1) as rh:
with pytest.raises(TransportError):
ws_validate_and_send(rh, Request(self.ws_base_url)).close()
with handler(timeout=0.1) as rh:
with pytest.raises(TransportError):
ws_validate_and_send(
rh, Request(self.ws_base_url, proxies={'all': f'{supported_proto}://10.255.255.255'})).close()
def create_fake_ws_connection(raised): def create_fake_ws_connection(raised):
import websockets.sync.client import websockets.sync.client

View file

@ -2136,6 +2136,11 @@ class YoutubeDL:
def _check_formats(self, formats): def _check_formats(self, formats):
for f in formats: for f in formats:
working = f.get('__working')
if working is not None:
if working:
yield f
continue
self.to_screen('[info] Testing format %s' % f['format_id']) self.to_screen('[info] Testing format %s' % f['format_id'])
path = self.get_output_path('temp') path = self.get_output_path('temp')
if not self._ensure_dir_exists(f'{path}/'): if not self._ensure_dir_exists(f'{path}/'):
@ -2152,33 +2157,44 @@ class YoutubeDL:
os.remove(temp_file.name) os.remove(temp_file.name)
except OSError: except OSError:
self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
f['__working'] = success
if success: if success:
yield f yield f
else: else:
self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id']) self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
def _select_formats(self, formats, selector):
return list(selector({
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
def _default_format_spec(self, info_dict, download=True): def _default_format_spec(self, info_dict, download=True):
download = download and not self.params.get('simulate')
prefer_best = download and (
self.params['outtmpl']['default'] == '-'
or info_dict.get('is_live') and not self.params.get('live_from_start'))
def can_merge(): def can_merge():
merger = FFmpegMergerPP(self) merger = FFmpegMergerPP(self)
return merger.available and merger.can_merge() return merger.available and merger.can_merge()
prefer_best = ( if not prefer_best and download and not can_merge():
not self.params.get('simulate') prefer_best = True
and download formats = self._get_formats(info_dict)
and ( evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
not can_merge() if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'):
or info_dict.get('is_live') and not self.params.get('live_from_start') self.report_warning('ffmpeg not found. The downloaded format may not be the best available. '
or self.params['outtmpl']['default'] == '-')) 'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies')
compat = (
prefer_best
or self.params.get('allow_multiple_audio_streams', False)
or 'format-spec' in self.params['compat_opts'])
return ( compat = (self.params.get('allow_multiple_audio_streams')
'best/bestvideo+bestaudio' if prefer_best or 'format-spec' in self.params['compat_opts'])
else 'bestvideo*+bestaudio/best' if not compat
else 'bestvideo+bestaudio/best') return ('best/bestvideo+bestaudio' if prefer_best
else 'bestvideo+bestaudio/best' if compat
else 'bestvideo*+bestaudio/best')
def build_format_selector(self, format_spec): def build_format_selector(self, format_spec):
def syntax_error(note, start): def syntax_error(note, start):
@ -2928,12 +2944,7 @@ class YoutubeDL:
self.write_debug(f'Default format spec: {req_format}') self.write_debug(f'Default format spec: {req_format}')
format_selector = self.build_format_selector(req_format) format_selector = self.build_format_selector(req_format)
formats_to_download = list(format_selector({ formats_to_download = self._select_formats(formats, format_selector)
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
if interactive_format_selection and not formats_to_download: if interactive_format_selection and not formats_to_download:
self.report_error('Requested format is not available', tb=False, is_error=False) self.report_error('Requested format is not available', tb=False, is_error=False)
continue continue
@ -3060,7 +3071,7 @@ class YoutubeDL:
f = formats[-1] f = formats[-1]
self.report_warning( self.report_warning(
'No subtitle format found matching "%s" for language %s, ' 'No subtitle format found matching "%s" for language %s, '
'using %s' % (formats_query, lang, f['ext'])) 'using %s. Use --list-subs for a list of available subtitles' % (formats_query, lang, f['ext']))
subs[lang] = f subs[lang] = f
return subs return subs

View file

@ -46,7 +46,7 @@ from .utils import (
from .utils._utils import _YDLLogger from .utils._utils import _YDLLogger
from .utils.networking import normalize_url from .utils.networking import normalize_url
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'), 'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'), 'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'), 'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
}[browser_name] }[browser_name]
elif sys.platform == 'darwin': elif sys.platform == 'darwin':
@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(appdata, 'Microsoft Edge'), 'edge': os.path.join(appdata, 'Microsoft Edge'),
'opera': os.path.join(appdata, 'com.operasoftware.Opera'), 'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
'vivaldi': os.path.join(appdata, 'Vivaldi'), 'vivaldi': os.path.join(appdata, 'Vivaldi'),
'whale': os.path.join(appdata, 'Naver/Whale'),
}[browser_name] }[browser_name]
else: else:
@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(config, 'microsoft-edge'), 'edge': os.path.join(config, 'microsoft-edge'),
'opera': os.path.join(config, 'opera'), 'opera': os.path.join(config, 'opera'),
'vivaldi': os.path.join(config, 'vivaldi'), 'vivaldi': os.path.join(config, 'vivaldi'),
'whale': os.path.join(config, 'naver-whale'),
}[browser_name] }[browser_name]
# Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:
@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium', 'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium', 'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome', 'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
'whale': 'Whale',
}[browser_name] }[browser_name]
browsers_without_profiles = {'opera'} browsers_without_profiles = {'opera'}
@ -347,6 +351,11 @@ def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, pa
if value is None: if value is None:
return is_encrypted, None return is_encrypted, None
# In chrome, session cookies have expires_utc set to 0
# In our cookie-store, cookies that do not expire should have expires set to None
if not expires_utc:
expires_utc = None
return is_encrypted, http.cookiejar.Cookie( return is_encrypted, http.cookiejar.Cookie(
version=0, name=name, value=value, port=None, port_specified=False, version=0, name=name, value=value, port=None, port_specified=False,
domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'),

View file

@ -387,7 +387,11 @@ from .comedycentral import (
ComedyCentralIE, ComedyCentralIE,
ComedyCentralTVIE, ComedyCentralTVIE,
) )
from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonmistakes import (
BlobIE,
CommonMistakesIE,
UnicodeBOMIE,
)
from .commonprotocols import ( from .commonprotocols import (
MmsIE, MmsIE,
RtmpIE, RtmpIE,
@ -544,7 +548,6 @@ from .egghead import (
EggheadLessonIE, EggheadLessonIE,
) )
from .eighttracks import EightTracksIE from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
from .eitb import EitbIE from .eitb import EitbIE
from .elementorembed import ElementorEmbedIE from .elementorembed import ElementorEmbedIE
from .elonet import ElonetIE from .elonet import ElonetIE
@ -857,10 +860,6 @@ from .iwara import (
) )
from .ixigua import IxiguaIE from .ixigua import IxiguaIE
from .izlesene import IzleseneIE from .izlesene import IzleseneIE
from .jable import (
JableIE,
JablePlaylistIE,
)
from .jamendo import ( from .jamendo import (
JamendoIE, JamendoIE,
JamendoAlbumIE, JamendoAlbumIE,
@ -1495,7 +1494,6 @@ from .polskieradio import (
) )
from .popcorntimes import PopcorntimesIE from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE from .popcorntv import PopcornTVIE
from .porn91 import Porn91IE
from .pornbox import PornboxIE from .pornbox import PornboxIE
from .pornflip import PornFlipIE from .pornflip import PornFlipIE
from .pornhub import ( from .pornhub import (
@ -2373,7 +2371,6 @@ from .wykop import (
) )
from .xanimu import XanimuIE from .xanimu import XanimuIE
from .xboxclips import XboxClipsIE from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
from .xhamster import ( from .xhamster import (
XHamsterIE, XHamsterIE,
XHamsterEmbedIE, XHamsterEmbedIE,
@ -2428,8 +2425,6 @@ from .younow import (
YouNowMomentIE, YouNowMomentIE,
) )
from .youporn import YouPornIE from .youporn import YouPornIE
from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .zaiko import ( from .zaiko import (
ZaikoIE, ZaikoIE,
ZaikoETicketIE, ZaikoETicketIE,

View file

@ -39,7 +39,7 @@ class AluraIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
course, video_id = self._match_valid_url(url) course, video_id = self._match_valid_url(url).group('course_name', 'id')
video_url = self._VIDEO_URL % (course, video_id) video_url = self._VIDEO_URL % (course, video_id)
video_dict = self._download_json(video_url, video_id, 'Searching for videos') video_dict = self._download_json(video_url, video_id, 'Searching for videos')
@ -52,7 +52,7 @@ class AluraIE(InfoExtractor):
formats = [] formats = []
for video_obj in video_dict: for video_obj in video_dict:
video_url_m3u8 = video_obj.get('link') video_url_m3u8 = video_obj.get('mp4')
video_format = self._extract_m3u8_formats( video_format = self._extract_m3u8_formats(
video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native', video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False) m3u8_id='hls', fatal=False)

View file

@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'url': 'http://www.bbc.com/news/world-europe-32668511', 'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': { 'info_dict': {
'id': 'world-europe-32668511', 'id': 'world-europe-32668511',
'title': 'Russia stages massive WW2 parade', 'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
}, },
'playlist_count': 2, 'playlist_count': 2,
@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': { 'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460', 'id': '3662a707-0af9-3149-963f-47bea720b460',
'title': 'BUGGER', 'title': 'BUGGER',
'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
}, },
'playlist_count': 18, 'playlist_count': 18,
}, { }, {
@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': { 'info_dict': {
'id': 'p02mprgb', 'id': 'p02mprgb',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'title': 'Germanwings crash site aerial video',
'description': 'md5:2868290467291b37feda7863f7a83f54', 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': 47, 'duration': 47,
'timestamp': 1427219242, 'timestamp': 1427219242,
'upload_date': '20150324', 'upload_date': '20150324',
'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
}, },
'params': { 'params': {
# rtmp download
'skip_download': True, 'skip_download': True,
} }
}, { }, {
@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
'skip': 'now SIMORGH_DATA with no video',
}, { }, {
# single video embedded with data-playable containing XML playlists (regional section) # single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'info_dict': { 'info_dict': {
'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'id': '39275083',
'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'timestamp': 1434713142, 'timestamp': 1434713142,
'upload_date': '20150619', 'upload_date': '20150619',
'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
}, { }, {
# single video from video playlist embedded with vxp-playlist-data JSON # single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
'skip': '404 Not Found',
}, { }, {
# single video story with digitalData # single video story with __PWA_PRELOADED_STATE__
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': { 'info_dict': {
'id': 'p02q6gc4', 'id': 'p02q6gc4',
'ext': 'flv', 'ext': 'mp4',
'title': 'Sri Lankas spicy secret', 'title': 'Tasting the spice of life in Jaffna',
'description': 'As a new train line to Jaffna opens up the countrys north, travellers can experience a truly distinct slice of Tamil culture.', 'description': r're:(?s)BBC Travel Shows Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
'timestamp': 1437674293, 'timestamp': 1646058397,
'upload_date': '20150723', 'upload_date': '20220228',
'duration': 255,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
}, },
'params': {
# rtmp download
'skip_download': True,
}
}, { }, {
# single video story without digitalData # single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'timestamp': 1415867444, 'timestamp': 1415867444,
'upload_date': '20141113', 'upload_date': '20141113',
}, },
'params': { 'skip': 'redirects to TopGear home page',
# rtmp download
'skip_download': True,
}
}, { }, {
# single video embedded with Morph # single video embedded with Morph
# TODO: replacement test page
'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
'info_dict': { 'info_dict': {
'id': 'p041vhd0', 'id': 'p041vhd0',
@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'BBC Sport', 'uploader': 'BBC Sport',
'uploader_id': 'bbc_sport', 'uploader_id': 'bbc_sport',
}, },
'params': { 'skip': 'Video no longer in page',
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to UK',
}, { }, {
# single video with playlist.sxml URL in playlist param # single video in __INITIAL_DATA__
'url': 'http://www.bbc.com/sport/0/football/33653409', 'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': { 'info_dict': {
'id': 'p02xycnp', 'id': 'p02xycnp',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', 'title': 'Ronaldo to Man Utd, Arsenal to spend?',
'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
'timestamp': 1437750175,
'upload_date': '20150724',
'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': 140, 'duration': 140,
}, },
'params': {
# rtmp download
'skip_download': True,
}
}, { }, {
# article with multiple videos embedded with playlist.sxml in playlist param # article with multiple videos embedded with Morph.setPayload
'url': 'http://www.bbc.com/sport/0/football/34475836', 'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': { 'info_dict': {
'id': '34475836', 'id': '34475836',
@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
}, },
'playlist_count': 3, 'playlist_count': 3,
}, {
# Testing noplaylist
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': 'p034ppnv',
'ext': 'mp4',
'title': 'All you need to know about Jurgen Klopp',
'timestamp': 1444335081,
'upload_date': '20151008',
'duration': 122.0,
'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
},
'params': {
'noplaylist': True,
},
}, { }, {
# school report article with single video # school report article with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779', 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': 'School which breaks down barriers in Jerusalem', 'title': 'School which breaks down barriers in Jerusalem',
}, },
'playlist_count': 1, 'playlist_count': 1,
'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
}, { }, {
# single video with playlist URL from weather section # single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775', 'url': 'http://www.bbc.com/weather/features/33601775',
@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'thumbnail': r're:https?://.+/.+\.jpg', 'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1437785037, 'timestamp': 1437785037,
'upload_date': '20150725', 'upload_date': '20150725',
'duration': 105,
}, },
}, { }, {
# video with window.__INITIAL_DATA__ and value as JSON string # video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682', 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': { 'info_dict': {
'id': 'p0b71qth', 'id': 'p0b779gc',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Why France is making this woman a national hero', 'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
'thumbnail': r're:https?://.+/.+\.jpg', 'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731, 'timestamp': 1638215626,
'upload_date': '20211130', 'upload_date': '20211129',
'duration': 125,
},
}, {
# video with script id __NEXT_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/uk-68546268',
'info_dict': {
'id': 'p0hj0lq7',
'ext': 'mp4',
'title': 'Nasser Hospital doctor describes his treatment by IDF',
'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1710188248,
'upload_date': '20240311',
'duration': 104,
}, },
}, { }, {
# single video article embedded with data-media-vpid # single video article embedded with data-media-vpid
@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'Radio 3', 'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three', 'uploader_id': 'bbc_radio_three',
}, },
'skip': '404 Not Found',
}, { }, {
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': { 'info_dict': {
@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4', 'ext': 'mp4',
'title': 'md5:2fabf12a726603193a2879a055f72514', 'title': 'md5:2fabf12a726603193a2879a055f72514',
'description': 'Learn English words and phrases from this story', 'description': 'Learn English words and phrases from this story',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
}, },
'add_ie': [BBCCoUkIE.ie_key()], 'add_ie': [BBCCoUkIE.ie_key()],
}, { }, {
@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': { 'info_dict': {
'id': 'p07c6sb9', 'id': 'p07c6sb9',
'ext': 'mp4', 'ext': 'mp4',
'title': 'How positive thinking is harming your happiness', 'title': 'The downsides of positive thinking',
'alt_title': 'The downsides of positive thinking', 'description': 'The downsides of positive thinking',
'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
'duration': 235, 'duration': 235,
'thumbnail': r're:https?://.+/p07c9dsr.jpg', 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
'upload_date': '20190604', 'upload_date': '20220223',
'categories': ['Psychology'], 'timestamp': 1645632746,
}, },
}, { }, {
# BBC Sounds # BBC Sounds
'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
'info_dict': { 'info_dict': {
'id': 'm001q789', 'id': 'p0hrw4nr',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The Night Tracks Mix - Music for the darkling hour', 'title': 'Are our coastlines being washed away?',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
'chapters': 'count:8', 'timestamp': 1713556800,
'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', 'upload_date': '20240419',
'uploader': 'Radio 3', 'duration': 1588,
'duration': 1800, 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
'uploader_id': 'bbc_radio_three', 'uploader': 'World Service',
}, 'uploader_id': 'bbc_world_service',
'series': 'CrowdScience',
'chapters': [],
}
}, { # onion routes }, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
'only_matching': True, 'only_matching': True,
@ -1008,8 +1039,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
webpage, 'group id', default=None) webpage, 'group id', default=None)
if group_id: if group_id:
return self.url_result( return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id, f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
ie=BBCCoUkIE.ie_key())
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex( programme_id = self._search_regex(
@ -1069,83 +1099,133 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
} }
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video # Several setPayload calls may be present but the video(s)
# seems to be always related to the first one # should be in one that mentions leadMedia or videoData
morph_payload = self._parse_json( morph_payload = self._search_json(
self._search_regex( r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
r'Morph\.setPayload\([^,]+,\s*({.+?})\);', contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
webpage, 'morph payload', default='{}'), default={})
playlist_id, fatal=False)
if morph_payload: if morph_payload:
components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] for lead_media in traverse_obj(morph_payload, (
for component in components: 'body', 'components', ..., 'props', 'leadMedia', {dict})):
if not isinstance(component, dict): programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
continue
lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
if not lead_media:
continue
identifiers = lead_media.get('identifiers')
if not identifiers or not isinstance(identifiers, dict):
continue
programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
if not programme_id: if not programme_id:
continue continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id) formats, subtitles = self._download_media_selector(programme_id)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
duration = None
duration_d = lead_media.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return { return {
'id': programme_id, 'id': programme_id,
'title': title, 'title': lead_media.get('title') or self._og_search_title(webpage),
'description': description, **traverse_obj(lead_media, {
'duration': duration, 'description': ('summary', {str}),
'uploader': uploader, 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
'uploader_id': uploader_id, 'uploader': ('masterBrand', {str}),
'uploader_id': ('mid', {str}),
}),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }
body = self._parse_json(traverse_obj(morph_payload, (
'body', 'content', 'article', 'body')), playlist_id, fatal=False)
for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
if video_data.get('vpid'):
video_id = video_data['vpid']
formats, subtitles = self._download_media_selector(video_id)
entry = {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
else:
video_id = video_data['pid']
entry = self.url_result(
f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
video_id, url_transparent=True)
entry.update({
'timestamp': traverse_obj(morph_payload, (
'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
),
**traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}),
})
if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
return entry
entries.append(entry)
if entries:
playlist_title = traverse_obj(morph_payload, (
'body', 'content', 'article', 'headline', {str})) or playlist_title
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
preload_state = self._parse_json(self._search_regex( # various PRELOADED_STATE JSON
r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, preload_state = self._search_json(
'preload state', default='{}'), playlist_id, fatal=False) r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
if preload_state: 'preload state', playlist_id, transform_source=js_to_json, default={})
current_programme = preload_state.get('programmes', {}).get('current') or {} # PRELOADED_STATE with current programmme
programme_id = current_programme.get('id') current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
if current_programme and programme_id and current_programme.get('type') == 'playable_item': programme_id = traverse_obj(current_programme, ('id', {str}))
title = current_programme.get('titles', {}).get('tertiary') or playlist_title if programme_id and current_programme.get('type') == 'playable_item':
formats, subtitles = self._download_media_selector(programme_id) title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
synopses = current_programme.get('synopses') or {} formats, subtitles = self._download_media_selector(programme_id)
network = current_programme.get('network') or {} return {
duration = int_or_none( 'id': programme_id,
current_programme.get('duration', {}).get('value')) 'title': title,
thumbnail = None 'formats': formats,
image_url = current_programme.get('image_url') **traverse_obj(current_programme, {
if image_url: 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
thumbnail = image_url.replace('{recipe}', 'raw') 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
'series': ('titles', 'primary', {str}),
}),
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})
),
}
# PWA_PRELOADED_STATE with article video asset
asset_id = traverse_obj(preload_state, (
'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
'assetVideo', 0, {str}, any))
if asset_id:
video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
if video_id:
article = traverse_obj(preload_state, (
'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
def image_url(image_id):
return traverse_obj(preload_state, (
'entities', 'images', image_id, 'url',
{lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
formats, subtitles = self._download_media_selector(video_id)
return { return {
'id': programme_id, 'id': video_id,
'title': title, **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
'description': dict_get(synopses, ('long', 'medium', 'short')), 'title': ('title', {str}),
'thumbnail': thumbnail, 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
'duration': duration, 'thumbnail': (0, {image_url}),
'uploader': network.get('short_title'), 'duration': ('duration', {int_or_none}),
'uploader_id': network.get('id'), })),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'chapters': traverse_obj(preload_state, ( 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})) or None,
} }
else:
return self.url_result(
f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
asset_id, playlist_title, display_id=playlist_id,
description=playlist_description)
bbc3_config = self._parse_json( bbc3_config = self._parse_json(
self._search_regex( self._search_regex(
@ -1191,6 +1271,28 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
return self.playlist_result( return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description) entries, playlist_id, playlist_title, playlist_description)
def parse_model(model):
"""Extract single video from model structure"""
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
formats, subtitles = self._download_media_selector(item_id)
return {
'id': item_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
})
}
def is_type(*types):
return lambda _, v: v['type'] in types
initial_data = self._search_regex( initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None) 'quoted preload state', default=None)
@ -1202,6 +1304,19 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data: if initial_data:
for video_data in traverse_obj(initial_data, (
'stores', 'article', 'articleBodyContent', is_type('video'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
entry = parse_model(model)
if entry:
entries.append(entry)
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_media(media): def parse_media(media):
if not media: if not media:
return return
@ -1234,27 +1349,90 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'subtitles': subtitles, 'subtitles': subtitles,
'timestamp': item_time, 'timestamp': item_time,
'description': strip_or_none(item_desc), 'description': strip_or_none(item_desc),
'duration': int_or_none(item.get('duration')),
}) })
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name') for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
name = resp['name']
if name == 'media-experience': if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article': elif name == 'article':
for block in (try_get(resp, for block in traverse_obj(resp, (
(lambda x: x['data']['blocks'], 'data', (None, ('content', 'model')), 'blocks',
lambda x: x['data']['content']['model']['blocks'],), is_type('media', 'video'), 'model', {dict})):
list) or []): parse_media(block)
if block.get('type') not in ['media', 'video']:
continue
parse_media(block.get('model'))
return self.playlist_result( return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description) entries, playlist_id, playlist_title, playlist_description)
# extract from SIMORGH_DATA hydration JSON
simorgh_data = self._search_json(
r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
'simorgh data', playlist_id, default={})
if simorgh_data:
done = False
for video_data in traverse_obj(simorgh_data, (
'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
if video_data['type'] == 'video':
entry = parse_model(model)
else: # legacyMedia: no duration, subtitles
block_id, entry = traverse_obj(model, ('blockId', {str})), None
media_data = traverse_obj(simorgh_data, (
'pageData', 'promo', 'media',
{lambda x: x if x['id'] == block_id else None}))
formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'ext': ('format', {str}),
'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
}))
if formats:
entry = {
'id': block_id,
'display_id': playlist_id,
'formats': formats,
'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
}),
}
done = True
if entry:
entries.append(entry)
if done:
break
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def extract_all(pattern): def extract_all(pattern):
return list(filter(None, map( return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False), lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage)))) re.findall(pattern, webpage))))
# US accessed article with single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
('props', 'pageProps', 'page'))
model = traverse_obj(next_data, (
..., 'contents', is_type('video'),
'model', 'blocks', is_type('media'),
'model', 'blocks', is_type('mediaMetadata'),
'model', {dict}, any))
if model and (entry := parse_model(model)):
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', is_type('timestamp'), 'model',
'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
entries.append(entry)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# Multiple video article (e.g. # Multiple video article (e.g.
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX

View file

@ -93,11 +93,11 @@ class BilibiliBaseIE(InfoExtractor):
return formats return formats
def _download_playinfo(self, video_id, cid): def _download_playinfo(self, video_id, cid, headers=None):
return self._download_json( return self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id, 'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note=f'Downloading video formats for cid {cid}')['data'] note=f'Downloading video formats for cid {cid}', headers=headers)['data']
def json2srt(self, json_data): def json2srt(self, json_data):
srt_data = '' srt_data = ''
@ -493,7 +493,8 @@ class BiliBiliIE(BilibiliBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage, urlh = self._download_webpage_handle(url, video_id) headers = self.geo_verification_headers()
webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers)
if not self._match_valid_url(urlh.url): if not self._match_valid_url(urlh.url):
return self.url_result(urlh.url) return self.url_result(urlh.url)
@ -531,7 +532,7 @@ class BiliBiliIE(BilibiliBaseIE):
self._download_json( self._download_json(
'https://api.bilibili.com/x/player/pagelist', video_id, 'https://api.bilibili.com/x/player/pagelist', video_id,
fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
note='Extracting videos in anthology'), note='Extracting videos in anthology', headers=headers),
'data', expected_type=list) or [] 'data', expected_type=list) or []
is_anthology = len(page_list_json) > 1 is_anthology = len(page_list_json) > 1
@ -552,7 +553,7 @@ class BiliBiliIE(BilibiliBaseIE):
festival_info = {} festival_info = {}
if is_festival: if is_festival:
play_info = self._download_playinfo(video_id, cid) play_info = self._download_playinfo(video_id, cid, headers=headers)
festival_info = traverse_obj(initial_state, { festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'), 'uploader': ('videoInfo', 'upName'),
@ -666,14 +667,15 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
episode_id = self._match_id(url) episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id) headers = self.geo_verification_headers()
webpage = self._download_webpage(url, episode_id, headers=headers)
if '您所在的地区无法观看本片' in webpage: if '您所在的地区无法观看本片' in webpage:
raise GeoRestrictedError('This video is restricted') raise GeoRestrictedError('This video is restricted')
elif '正在观看预览,大会员免费看全片' in webpage: elif '正在观看预览,大会员免费看全片' in webpage:
self.raise_login_required('This video is for premium members only') self.raise_login_required('This video is for premium members only')
headers = {'Referer': url, **self.geo_verification_headers()} headers['Referer'] = url
play_info = self._download_json( play_info = self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
@ -724,7 +726,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'duration': float_or_none(play_info.get('timelength'), scale=1000), 'duration': float_or_none(play_info.get('timelength'), scale=1000),
'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid),
'__post_extractor': self.extract_comments(aid), '__post_extractor': self.extract_comments(aid),
'http_headers': headers, 'http_headers': {'Referer': url},
} }
@ -1043,15 +1045,17 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
try: try:
response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
playlist_id, note=f'Downloading page {page_idx}', query=query) playlist_id, note=f'Downloading page {page_idx}', query=query,
headers={'referer': url})
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 412: if isinstance(e.cause, HTTPError) and e.cause.status == 412:
raise ExtractorError( raise ExtractorError(
'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
raise raise
if response['code'] == -401: if response['code'] in (-352, -401):
raise ExtractorError( raise ExtractorError(
'Request is blocked by server (401), please add cookies, wait and try later.', expected=True) f'Request is blocked by server ({-response["code"]}), '
'please add cookies, wait and try later.', expected=True)
return response['data'] return response['data']
def get_metadata(page_data): def get_metadata(page_data):

View file

@ -1,7 +1,11 @@
import json
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from .youtube import YoutubeIE
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
bug_reports_message,
int_or_none, int_or_none,
qualities, qualities,
str_or_none, str_or_none,
@ -162,9 +166,19 @@ class BoostyIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
user, post_id = self._match_valid_url(url).group('user', 'post_id') user, post_id = self._match_valid_url(url).group('user', 'post_id')
auth_headers = {}
auth_cookie = self._get_cookies('https://boosty.to/').get('auth')
if auth_cookie is not None:
try:
auth_data = json.loads(urllib.parse.unquote(auth_cookie.value))
auth_headers['Authorization'] = f'Bearer {auth_data["accessToken"]}'
except (json.JSONDecodeError, KeyError):
self.report_warning(f'Failed to extract token from auth cookie{bug_reports_message()}')
post = self._download_json( post = self._download_json(
f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id, f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id,
note='Downloading post data', errnote='Unable to download post data') note='Downloading post data', errnote='Unable to download post data', headers=auth_headers)
post_title = post.get('title') post_title = post.get('title')
if not post_title: if not post_title:
@ -202,7 +216,9 @@ class BoostyIE(InfoExtractor):
'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}), 'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}),
}, get_all=False)}) }, get_all=False)})
if not entries: if not entries and not post.get('hasAccess'):
self.raise_login_required('This post requires a subscription', metadata_available=True)
elif not entries:
raise ExtractorError('No videos found', expected=True) raise ExtractorError('No videos found', expected=True)
if len(entries) == 1: if len(entries) == 1:
return entries[0] return entries[0]

View file

@ -1,32 +0,0 @@
from .common import InfoExtractor
class CableAVIE(InfoExtractor):
_VALID_URL = r'https?://cableav\.tv/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://cableav.tv/lS4iR9lWjN8/',
'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18',
'info_dict': {
'id': 'lS4iR9lWjN8',
'ext': 'mp4',
'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV',
'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家',
'thumbnail': r're:^https?://.*\.jpg$',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._og_search_video_url(webpage, secure=False)
formats = self._extract_m3u8_formats(video_url, video_id, 'mp4')
return {
'id': video_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
}

View file

@ -40,7 +40,7 @@ class CanalAlphaIE(InfoExtractor):
'id': '24484', 'id': '24484',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Ces innovations qui veulent rendre lagriculture plus durable', 'title': 'Ces innovations qui veulent rendre lagriculture plus durable',
'description': 'md5:3de3f151180684621e85be7c10e4e613', 'description': 'md5:85d594a3b5dc6ccfc4a85aba6e73b129',
'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg',
'upload_date': '20211026', 'upload_date': '20211026',
'duration': 360, 'duration': 360,
@ -58,14 +58,25 @@ class CanalAlphaIE(InfoExtractor):
'duration': 360, 'duration': 360,
}, },
'params': {'skip_download': True} 'params': {'skip_download': True}
}, {
'url': 'https://www.canalalpha.ch/play/le-journal/topic/33500/encore-des-mesures-deconomie-dans-le-jura',
'info_dict': {
'id': '33500',
'ext': 'mp4',
'title': 'Encore des mesures d\'économie dans le Jura',
'description': 'md5:938b5b556592f2d1b9ab150268082a80',
'thumbnail': 'https://static.canalalpha.ch/poster/news/news_46665.jpg',
'upload_date': '20240411',
'duration': 105,
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, id) webpage = self._download_webpage(url, video_id)
data_json = self._parse_json(self._search_regex( data_json = self._parse_json(self._search_regex(
r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;',
webpage, 'data_json'), id)['1']['data']['data'] webpage, 'data_json'), video_id)['1']['data']['data']
manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {}
subtitles = {} subtitles = {}
formats = [{ formats = [{
@ -75,15 +86,17 @@ class CanalAlphaIE(InfoExtractor):
'height': try_get(video, lambda x: x['res']['height'], expected_type=int), 'height': try_get(video, lambda x: x['res']['height'], expected_type=int),
} for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')]
if manifests.get('hls'): if manifests.get('hls'):
m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id) fmts, subs = self._extract_m3u8_formats_and_subtitles(
formats.extend(m3u8_frmts) manifests['hls'], video_id, m3u8_id='hls', fatal=False)
subtitles = self._merge_subtitles(subtitles, m3u8_subs) formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if manifests.get('dash'): if manifests.get('dash'):
dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) fmts, subs = self._extract_mpd_formats_and_subtitles(
formats.extend(dash_frmts) manifests['dash'], video_id, mpd_id='dash', fatal=False)
subtitles = self._merge_subtitles(subtitles, dash_subs) formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return { return {
'id': id, 'id': video_id,
'title': data_json.get('title').strip(), 'title': data_json.get('title').strip(),
'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))),
'thumbnail': data_json.get('poster'), 'thumbnail': data_json.get('poster'),

View file

@ -151,7 +151,7 @@ class CBCIE(InfoExtractor):
class CBCPlayerIE(InfoExtractor): class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player' IE_NAME = 'cbc.ca:player'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)' _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193', 'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c', 'md5': '64d25f841ddf4ddb28a235338af32e2c',
@ -277,6 +277,28 @@ class CBCPlayerIE(InfoExtractor):
'location': 'Canada', 'location': 'Canada',
'media_type': 'Full Program', 'media_type': 'Full Program',
}, },
}, {
'url': 'https://www.cbc.ca/player/play/video/1.7194274',
'md5': '188b96cf6bdcb2540e178a6caa957128',
'info_dict': {
'id': '2334524995812',
'ext': 'mp4',
'title': '#TheMoment a rare white spirit moose was spotted in Alberta',
'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3',
'timestamp': 1714788791,
'duration': 77.678,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg',
'uploader': 'CBCC-NEW',
'chapters': 'count:0',
'upload_date': '20240504',
'categories': 'count:3',
'series': 'The National',
'tags': 'count:15',
'creators': ['encoder'],
'location': 'Canada',
'media_type': 'Excerpt',
},
}, { }, {
'url': 'cbcplayer:1.7159484', 'url': 'cbcplayer:1.7159484',
'only_matching': True, 'only_matching': True,

View file

@ -16,7 +16,6 @@ from ..utils import (
merge_dicts, merge_dicts,
multipart_encode, multipart_encode,
parse_duration, parse_duration,
random_birthday,
traverse_obj, traverse_obj,
try_call, try_call,
try_get, try_get,
@ -63,38 +62,57 @@ class CDAIE(InfoExtractor):
'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'crash404', 'uploader': 'crash404',
'view_count': int,
'average_rating': float, 'average_rating': float,
'duration': 137, 'duration': 137,
'age_limit': 0, 'age_limit': 0,
'upload_date': '20160220',
'timestamp': 1455968218,
} }
}, { }, {
# Age-restricted # Age-restricted with vfilm redirection
'url': 'http://www.cda.pl/video/1273454c4', 'url': 'https://www.cda.pl/video/8753244c4',
'md5': 'd8eeb83d63611289507010d3df3bb8b3',
'info_dict': { 'info_dict': {
'id': '1273454c4', 'id': '8753244c4',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Bronson (2008) napisy HD 1080p', 'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', 'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
'height': 1080, 'height': 1080,
'uploader': 'boniek61', 'uploader': 'arhn eu',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 5554, 'duration': 991,
'age_limit': 18, 'age_limit': 18,
'view_count': int,
'average_rating': float, 'average_rating': float,
}, 'timestamp': 1633888264,
'upload_date': '20211010',
}
}, {
# Age-restricted without vfilm redirection
'url': 'https://www.cda.pl/video/17028157b8',
'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
'info_dict': {
'id': '17028157b8',
'ext': 'mp4',
'title': 'STENDUPY MICHAŁ OGIŃSKI',
'description': 'md5:5851f3272bfc31f762d616040a1d609a',
'height': 480,
'uploader': 'oginski',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 18855,
'age_limit': 18,
'average_rating': float,
'timestamp': 1699705901,
'upload_date': '20231111',
}
}, { }, {
'url': 'http://ebd.cda.pl/0x0/5749950c', 'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True, 'only_matching': True,
}] }]
def _download_age_confirm_page(self, url, video_id, *args, **kwargs): def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
form_data = random_birthday('rok', 'miesiac', 'dzien') data, content_type = multipart_encode({'age_confirm': ''})
form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
data, content_type = multipart_encode(form_data)
return self._download_webpage( return self._download_webpage(
urljoin(url, '/a/validatebirth'), video_id, *args, url, video_id, *args,
data=data, headers={ data=data, headers={
'Referer': url, 'Referer': url,
'Content-Type': content_type, 'Content-Type': content_type,
@ -164,7 +182,7 @@ class CDAIE(InfoExtractor):
if 'Authorization' in self._API_HEADERS: if 'Authorization' in self._API_HEADERS:
return self._api_extract(video_id) return self._api_extract(video_id)
else: else:
return self._web_extract(video_id, url) return self._web_extract(video_id)
def _api_extract(self, video_id): def _api_extract(self, video_id):
meta = self._download_json( meta = self._download_json(
@ -197,9 +215,9 @@ class CDAIE(InfoExtractor):
'view_count': meta.get('views'), 'view_count': meta.get('views'),
} }
def _web_extract(self, video_id, url): def _web_extract(self, video_id):
self._set_cookie('cda.pl', 'cda.player', 'html5') self._set_cookie('cda.pl', 'cda.player', 'html5')
webpage = self._download_webpage( webpage, urlh = self._download_webpage_handle(
f'{self._BASE_URL}/video/{video_id}/vfilm', video_id) f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
if 'Ten film jest dostępny dla użytkowników premium' in webpage: if 'Ten film jest dostępny dla użytkowników premium' in webpage:
@ -209,10 +227,10 @@ class CDAIE(InfoExtractor):
self.raise_geo_restricted() self.raise_geo_restricted()
need_confirm_age = False need_confirm_age = False
if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
webpage, 'birthday validate form', default=None): webpage, 'birthday validate form', default=None):
webpage = self._download_age_confirm_page( webpage = self._download_age_confirm_page(
url, video_id, note='Confirming age') urlh.url, video_id, note='Confirming age')
need_confirm_age = True need_confirm_age = True
formats = [] formats = []
@ -222,9 +240,6 @@ class CDAIE(InfoExtractor):
(?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
<(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
''', webpage, 'uploader', default=None, group='uploader') ''', webpage, 'uploader', default=None, group='uploader')
view_count = self._search_regex(
r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
'view_count', default=None)
average_rating = self._search_regex( average_rating = self._search_regex(
(r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
@ -235,7 +250,6 @@ class CDAIE(InfoExtractor):
'title': self._og_search_title(webpage), 'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'uploader': uploader, 'uploader': uploader,
'view_count': int_or_none(view_count),
'average_rating': float_or_none(average_rating), 'average_rating': float_or_none(average_rating),
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats, 'formats': formats,

View file

@ -957,7 +957,8 @@ class InfoExtractor:
if urlh is False: if urlh is False:
assert not fatal assert not fatal
return False return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
encoding=encoding, data=data)
return (content, urlh) return (content, urlh)
@staticmethod @staticmethod
@ -1005,8 +1006,10 @@ class InfoExtractor:
'Visit http://blocklist.rkn.gov.ru/ for a block reason.', 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True) expected=True)
def _request_dump_filename(self, url, video_id): def _request_dump_filename(self, url, video_id, data=None):
basen = f'{video_id}_{url}' if data is not None:
data = hashlib.md5(data).hexdigest()
basen = join_nonempty(video_id, data, url, delim='_')
trim_length = self.get_param('trim_file_name') or 240 trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length: if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@ -1028,7 +1031,8 @@ class InfoExtractor:
except LookupError: except LookupError:
return webpage_bytes.decode('utf-8', 'replace') return webpage_bytes.decode('utf-8', 'replace')
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read() webpage_bytes = urlh.read()
if prefix is not None: if prefix is not None:
webpage_bytes = prefix + webpage_bytes webpage_bytes = prefix + webpage_bytes
@ -1037,7 +1041,9 @@ class InfoExtractor:
dump = base64.b64encode(webpage_bytes).decode('ascii') dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump) self._downloader.to_screen(dump)
if self.get_param('write_pages'): if self.get_param('write_pages'):
filename = self._request_dump_filename(urlh.url, video_id) if isinstance(url_or_request, Request):
data = self._create_request(url_or_request, data).data
filename = self._request_dump_filename(urlh.url, video_id, data)
self.to_screen(f'Saving request to {filename}') self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf: with open(filename, 'wb') as outf:
outf.write(webpage_bytes) outf.write(webpage_bytes)
@ -1098,7 +1104,7 @@ class InfoExtractor:
impersonate=None, require_impersonation=False): impersonate=None, require_impersonation=False):
if self.get_param('load_pages'): if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query) url_or_request = self._create_request(url_or_request, data, headers, query)
filename = self._request_dump_filename(url_or_request.url, video_id) filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
self.to_screen(f'Loading request from {filename}') self.to_screen(f'Loading request from {filename}')
try: try:
with open(filename, 'rb') as dumpf: with open(filename, 'rb') as dumpf:

View file

@ -40,3 +40,19 @@ class UnicodeBOMIE(InfoExtractor):
'Your URL starts with a Byte Order Mark (BOM). ' 'Your URL starts with a Byte Order Mark (BOM). '
'Removing the BOM and looking for "%s" ...' % real_url) 'Removing the BOM and looking for "%s" ...' % real_url)
return self.url_result(real_url) return self.url_result(real_url)
class BlobIE(InfoExtractor):
IE_DESC = False
_VALID_URL = r'blob:'
_TESTS = [{
'url': 'blob:https://www.youtube.com/4eb3d090-a761-46e6-8083-c32016a36e3b',
'only_matching': True,
}]
def _real_extract(self, url):
raise ExtractorError(
'You\'ve asked yt-dlp to download a blob URL. '
'A blob URL exists only locally in your browser. '
'It is not possible for yt-dlp to access it.', expected=True)

View file

@ -53,15 +53,19 @@ class CrunchyrollBaseIE(InfoExtractor):
CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10) CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10)
def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'): def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'):
try: # TODO: Add impersonation support here try:
return self._download_json( return self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote, f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote,
headers=headers, data=urlencode_postdata(data)) headers=headers, data=urlencode_postdata(data), impersonate=True)
except ExtractorError as error: except ExtractorError as error:
if not isinstance(error.cause, HTTPError) or error.cause.status != 403: if not isinstance(error.cause, HTTPError) or error.cause.status != 403:
raise raise
if target := error.cause.response.extensions.get('impersonate'):
raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"')
raise ExtractorError( raise ExtractorError(
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' 'Request blocked by Cloudflare. '
'Install the required impersonation dependency if possible, '
'or else navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' 'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True) 'and your browser\'s User-Agent (with --user-agent)', expected=True)
@ -394,10 +398,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
message = f'This {object_type} is for premium members only' message = f'This {object_type} is for premium members only'
if CrunchyrollBaseIE._REFRESH_TOKEN: if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True) self.raise_no_formats(message, expected=True, video_id=internal_id)
self.raise_login_required(message, method='password') else:
self.raise_login_required(message, method='password', metadata_available=True)
result['formats'], result['subtitles'] = self._extract_stream(internal_id) else:
result['formats'], result['subtitles'] = self._extract_stream(internal_id)
result['chapters'] = self._extract_chapters(internal_id) result['chapters'] = self._extract_chapters(internal_id)
@ -583,14 +588,16 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
if not response: if not response:
raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
result = self._transform_music_response(response)
if not self._IS_PREMIUM and response.get('isPremiumOnly'): if not self._IS_PREMIUM and response.get('isPremiumOnly'):
message = f'This {response.get("type") or "media"} is for premium members only' message = f'This {response.get("type") or "media"} is for premium members only'
if CrunchyrollBaseIE._REFRESH_TOKEN: if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True) self.raise_no_formats(message, expected=True, video_id=internal_id)
self.raise_login_required(message, method='password') else:
self.raise_login_required(message, method='password', metadata_available=True)
result = self._transform_music_response(response) else:
result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
return result return result

View file

@ -1,105 +0,0 @@
import json
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_str,
compat_urlparse,
)
from ..utils import (
extract_attributes,
ExtractorError,
get_elements_by_class,
urlencode_postdata,
)
class EinthusanIE(InfoExtractor):
_VALID_URL = r'https?://(?P<host>einthusan\.(?:tv|com|ca))/movie/watch/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://einthusan.tv/movie/watch/9097/',
'md5': 'ff0f7f2065031b8a2cf13a933731c035',
'info_dict': {
'id': '9097',
'ext': 'mp4',
'title': 'Ae Dil Hai Mushkil',
'description': 'md5:33ef934c82a671a94652a9b4e54d931b',
'thumbnail': r're:^https?://.*\.jpg$',
}
}, {
'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi',
'only_matching': True,
}, {
'url': 'https://einthusan.com/movie/watch/9097/',
'only_matching': True,
}, {
'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi',
'only_matching': True,
}]
# reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js
def _decrypt(self, encrypted_data, video_id):
return self._parse_json(compat_b64decode((
encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1]
)).decode('utf-8'), video_id)
def _real_extract(self, url):
mobj = self._match_valid_url(url)
host = mobj.group('host')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h3>([^<]+)</h3>', webpage, 'title')
player_params = extract_attributes(self._search_regex(
r'(<section[^>]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters'))
page_id = self._html_search_regex(
'<html[^>]+data-pageid="([^"]+)"', webpage, 'page ID')
video_data = self._download_json(
'https://%s/ajax/movie/watch/%s/' % (host, video_id), video_id,
data=urlencode_postdata({
'xEvent': 'UIVideoPlayer.PingOutcome',
'xJson': json.dumps({
'EJOutcomes': player_params['data-ejpingables'],
'NativeHLS': False
}),
'arcVersion': 3,
'appVersion': 59,
'gorilla.csrf.Token': page_id,
}))['Data']
if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'):
raise ExtractorError(
'Download rate reached. Please try again later.', expected=True)
ej_links = self._decrypt(video_data['EJLinks'], video_id)
formats = []
m3u8_url = ej_links.get('HLSLink')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native'))
mp4_url = ej_links.get('MP4Link')
if mp4_url:
formats.append({
'url': mp4_url,
})
description = get_elements_by_class('synopsis', webpage)[0]
thumbnail = self._html_search_regex(
r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''',
webpage, 'thumbnail url', fatal=False, group='url')
if thumbnail is not None:
thumbnail = compat_urlparse.urljoin(url, thumbnail)
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'description': description,
}

View file

@ -16,13 +16,31 @@ class EplusIbIE(InfoExtractor):
_VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)', _VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)',
r'https?://live\.eplus\.jp/(?P<id>sample|\d+)'] r'https?://live\.eplus\.jp/(?P<id>sample|\d+)']
_TESTS = [{ _TESTS = [{
'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', 'url': 'https://live.eplus.jp/ex/player?ib=41K6Wzbr3PlcMD%2FOKHFlC%2FcZCe2Eaw7FK%2BpJS1ooUHki8d0vGSy2mYqxillQBe1dSnOxU%2B8%2FzXKls4XPBSb3vw%3D%3D',
'info_dict': { 'info_dict': {
'id': '354502-0001-002', 'id': '335699-0001-006',
'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022LIVE with a smile!【Streaming+(配信)】', 'title': '少女☆歌劇 レヴュースタァライト -The LIVE 青嵐- BLUE GLITTER <定点映像配信>【Streaming+(配信)】',
'live_status': 'was_live', 'live_status': 'was_live',
'release_date': '20211231', 'release_date': '20201221',
'release_timestamp': 1640952000, 'release_timestamp': 1608544800,
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
'expected_warnings': [
'This event may not be accessible',
'No video formats found',
'Requested format is not available',
],
}, {
'url': 'https://live.eplus.jp/ex/player?ib=6QSsQdyRAwOFZrEHWlhRm7vocgV%2FO0YzBZ%2BaBEBg1XR%2FmbLn0R%2F048dUoAY038%2F%2F92MJ73BsoAtvUpbV6RLtDQ%3D%3D&show_id=2371511',
'info_dict': {
'id': '348021-0054-001',
'title': 'ラブライブ!スーパースター!! Liella! First LoveLive! Tour Starlines【東京/DAY.1】',
'live_status': 'was_live',
'release_date': '20220115',
'release_timestamp': 1642233600,
'description': str, 'description': str,
}, },
'params': { 'params': {
@ -124,6 +142,10 @@ class EplusIbIE(InfoExtractor):
if data_json.get('drm_mode') == 'ON': if data_json.get('drm_mode') == 'ON':
self.report_drm(video_id) self.report_drm(video_id)
if data_json.get('is_pass_ticket') == 'YES':
raise ExtractorError(
'This URL is for a pass ticket instead of a player page', expected=True)
delivery_status = data_json.get('delivery_status') delivery_status = data_json.get('delivery_status')
archive_mode = data_json.get('archive_mode') archive_mode = data_json.get('archive_mode')
release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400)

View file

@ -94,13 +94,14 @@ class EuropaIE(InfoExtractor):
class EuroParlWebstreamIE(InfoExtractor): class EuroParlWebstreamIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?://multimedia\.europarl\.europa\.eu/[^/#?]+/ https?://multimedia\.europarl\.europa\.eu/
(?:(?!video)[^/#?]+/[\w-]+_)(?P<id>[\w-]+) (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)
''' '''
_TESTS = [{ _TESTS = [{
'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
'info_dict': { 'info_dict': {
'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
'display_id': '20220914-0900-PLENARY',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Plenary session', 'title': 'Plenary session',
'release_timestamp': 1663139069, 'release_timestamp': 1663139069,
@ -125,6 +126,7 @@ class EuroParlWebstreamIE(InfoExtractor):
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT',
'info_dict': { 'info_dict': {
'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7',
'display_id': '20230301-1130-COMMITTEE-CULT',
'ext': 'mp4', 'ext': 'mp4',
'release_date': '20230301', 'release_date': '20230301',
'title': 'Committee on Culture and Education', 'title': 'Committee on Culture and Education',
@ -142,6 +144,19 @@ class EuroParlWebstreamIE(InfoExtractor):
'live_status': 'is_live', 'live_status': 'is_live',
}, },
'skip': 'Not live anymore' 'skip': 'Not live anymore'
}, {
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER',
'info_dict': {
'id': 'c1f11567-5b52-470a-f3e1-08dc3c216ace',
'display_id': '20240320-1345-SPECIAL-PRESSER',
'ext': 'mp4',
'release_date': '20240320',
'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234',
'release_timestamp': 1710939767,
}
}, {
'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -166,6 +181,7 @@ class EuroParlWebstreamIE(InfoExtractor):
return { return {
'id': json_info['id'], 'id': json_info['id'],
'display_id': display_id,
'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,

View file

@ -1,9 +1,11 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import compat_parse_qs from ..compat import compat_parse_qs
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
bug_reports_message,
determine_ext, determine_ext,
extract_attributes, extract_attributes,
get_element_by_class, get_element_by_class,
@ -38,6 +40,17 @@ class GoogleDriveIE(InfoExtractor):
'duration': 45, 'duration': 45,
'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
} }
}, {
# has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922)
'url': 'https://drive.google.com/uc?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
'md5': '322db8d63dd19788c04050a4bba67073',
'info_dict': {
'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
'ext': 'mp3',
'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3',
'duration': 184,
'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
},
}, { }, {
# video can't be watched anonymously due to view count limit reached, # video can't be watched anonymously due to view count limit reached,
# but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
@ -58,22 +71,8 @@ class GoogleDriveIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
_FORMATS_EXT = { _FORMATS_EXT = {
'5': 'flv', **{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')},
'6': 'flv', '50': 'm4a',
'13': '3gp',
'17': '3gp',
'18': 'mp4',
'22': 'mp4',
'34': 'flv',
'35': 'flv',
'36': '3gp',
'37': 'mp4',
'38': 'mp4',
'43': 'webm',
'44': 'webm',
'45': 'webm',
'46': 'webm',
'59': 'mp4',
} }
_BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
_CAPTIONS_ENTRY_TAG = { _CAPTIONS_ENTRY_TAG = {
@ -194,10 +193,13 @@ class GoogleDriveIE(InfoExtractor):
if len(fmt_stream_split) < 2: if len(fmt_stream_split) < 2:
continue continue
format_id, format_url = fmt_stream_split[:2] format_id, format_url = fmt_stream_split[:2]
ext = self._FORMATS_EXT.get(format_id)
if not ext:
self.report_warning(f'Unknown format {format_id}{bug_reports_message()}')
f = { f = {
'url': lowercase_escape(format_url), 'url': lowercase_escape(format_url),
'format_id': format_id, 'format_id': format_id,
'ext': self._FORMATS_EXT[format_id], 'ext': ext,
} }
resolution = resolutions.get(format_id) resolution = resolutions.get(format_id)
if resolution: if resolution:

View file

@ -1,7 +1,8 @@
import re import re
from .cloudflarestream import CloudflareStreamIE
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import traverse_obj from ..utils.traversal import traverse_obj
class HytaleIE(InfoExtractor): class HytaleIE(InfoExtractor):
@ -49,7 +50,7 @@ class HytaleIE(InfoExtractor):
entries = [ entries = [
self.url_result( self.url_result(
f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com', f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com',
title=self._titles.get(video_hash), url_transparent=True) CloudflareStreamIE, title=self._titles.get(video_hash), url_transparent=True)
for video_hash in re.findall( for video_hash in re.findall(
r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"', r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"',
webpage) webpage)

View file

@ -1,103 +0,0 @@
import re
from .common import InfoExtractor
from ..utils import (
InAdvancePagedList,
int_or_none,
orderedSet,
unified_strdate,
)
class JableIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?jable\.tv/videos/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://jable.tv/videos/pppd-812/',
'md5': 'f1537283a9bc073c31ff86ca35d9b2a6',
'info_dict': {
'id': 'pppd-812',
'ext': 'mp4',
'title': 'PPPD-812 只要表現好巨乳女教師吉根柚莉愛就獎勵學生們在白虎穴內射出精液',
'description': 'md5:5b6d4199a854f62c5e56e26ccad19967',
'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 18,
'like_count': int,
'view_count': int,
},
}, {
'url': 'https://jable.tv/videos/apak-220/',
'md5': '71f9239d69ced58ab74a816908847cc1',
'info_dict': {
'id': 'apak-220',
'ext': 'mp4',
'title': 'md5:5c3861b7cf80112a6e2b70bccf170824',
'description': '',
'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 18,
'like_count': int,
'view_count': int,
'upload_date': '20220319',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
formats = self._extract_m3u8_formats(
self._search_regex(r'var\s+hlsUrl\s*=\s*\'([^\']+)', webpage, 'hls_url'), video_id, 'mp4', m3u8_id='hls')
return {
'id': video_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage, default=''),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'formats': formats,
'age_limit': 18,
'upload_date': unified_strdate(self._search_regex(
r'class="inactive-color">\D+\s+(\d{4}-\d+-\d+)', webpage, 'upload_date', default=None)),
'view_count': int_or_none(self._search_regex(
r'#icon-eye"></use></svg>\n*<span class="mr-3">([\d ]+)',
webpage, 'view_count', default='').replace(' ', '')),
'like_count': int_or_none(self._search_regex(
r'#icon-heart"></use></svg><span class="count">(\d+)', webpage, 'link_count', default=None)),
}
class JablePlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?jable\.tv/(?:categories|models|tags)/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://jable.tv/models/kaede-karen/',
'info_dict': {
'id': 'kaede-karen',
'title': '楓カレン',
},
'playlist_count': 34,
}, {
'url': 'https://jable.tv/categories/roleplay/',
'only_matching': True,
}, {
'url': 'https://jable.tv/tags/girl/',
'only_matching': True,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
def page_func(page_num):
return [
self.url_result(player_url, JableIE)
for player_url in orderedSet(re.findall(
r'href="(https://jable.tv/videos/[\w-]+/?)"',
self._download_webpage(url, playlist_id, query={
'mode': 'async',
'from': page_num + 1,
'function': 'get_block',
'block_id': 'list_videos_common_videos_list',
}, note=f'Downloading page {page_num + 1}')))]
return self.playlist_result(
InAdvancePagedList(page_func, int_or_none(self._search_regex(
r'from:(\d+)">[^<]+\s*&raquo;', webpage, 'last page number', default=1)), 24),
playlist_id, self._search_regex(
r'<h2 class="h3-md mb-1">([^<]+)', webpage, 'playlist title', default=None))

View file

@ -1,6 +1,12 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..networking.exceptions import HTTPError from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none from ..utils import (
ExtractorError,
UserNotLive,
int_or_none,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj from ..utils.traversal import traverse_obj
@ -9,17 +15,20 @@ class MixchIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://mixch.tv/u/16236849/live', 'url': 'https://mixch.tv/u/16943797/live',
'skip': 'don\'t know if this live persists', 'skip': 'don\'t know if this live persists',
'info_dict': { 'info_dict': {
'id': '16236849', 'id': '16943797',
'title': '24配信シェア⭕投票🙏💦', 'ext': 'mp4',
'comment_count': 13145, 'title': '#EntView #カリナ #セブチ 2024-05-05 06:58',
'view_count': 28348, 'comment_count': int,
'timestamp': 1636189377, 'view_count': int,
'uploader': '🦥伊咲👶🏻#フレアワ', 'timestamp': 1714726805,
'uploader_id': '16236849', 'uploader': 'Ent.View K-news🎶💕',
} 'uploader_id': '16943797',
'live_status': 'is_live',
'upload_date': '20240503',
},
}, { }, {
'url': 'https://mixch.tv/u/16137876/live', 'url': 'https://mixch.tv/u/16137876/live',
'only_matching': True, 'only_matching': True,
@ -48,8 +57,20 @@ class MixchIE(InfoExtractor):
'protocol': 'm3u8', 'protocol': 'm3u8',
}], }],
'is_live': True, 'is_live': True,
'__post_extractor': self.extract_comments(video_id),
} }
def _get_comments(self, video_id):
yield from traverse_obj(self._download_json(
f'https://mixch.tv/api-web/lives/{video_id}/messages', video_id,
note='Downloading comments', errnote='Failed to download comments'), (..., {
'author': ('name', {str}),
'author_id': ('user_id', {str_or_none}),
'id': ('message_id', {str}, {lambda x: x or None}),
'text': ('body', {str}),
'timestamp': ('created', {int}),
}))
class MixchArchiveIE(InfoExtractor): class MixchArchiveIE(InfoExtractor):
IE_NAME = 'mixch:archive' IE_NAME = 'mixch:archive'

View file

@ -561,7 +561,8 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
'timestamp': ('createTime', {self.kilo_or_none}), 'timestamp': ('createTime', {self.kilo_or_none}),
}) })
if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): if not self._yes_playlist(
info['songs'] and program_id, info['mainSong']['id'], playlist_label='program', video_label='song'):
formats = self.extract_formats(info['mainSong']) formats = self.extract_formats(info['mainSong'])
return { return {

View file

@ -5,7 +5,6 @@ from ..utils import (
merge_dicts, merge_dicts,
parse_count, parse_count,
url_or_none, url_or_none,
urljoin,
) )
from ..utils.traversal import traverse_obj from ..utils.traversal import traverse_obj
@ -16,8 +15,7 @@ class NFBBaseIE(InfoExtractor):
def _extract_ep_data(self, webpage, video_id, fatal=False): def _extract_ep_data(self, webpage, video_id, fatal=False):
return self._search_json( return self._search_json(
r'const\s+episodesData\s*=', webpage, 'episode data', video_id, r'episodesData\s*:', webpage, 'episode data', video_id, fatal=fatal) or {}
contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or []
def _extract_ep_info(self, data, video_id, slug=None): def _extract_ep_info(self, data, video_id, slug=None):
info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], { info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], {
@ -224,18 +222,14 @@ class NFBIE(NFBBaseIE):
# type_ can change from film to serie(s) after redirect; new slug may have episode number # type_ can change from film to serie(s) after redirect; new slug may have episode number
type_, slug = self._match_valid_url(urlh.url).group('type', 'id') type_, slug = self._match_valid_url(urlh.url).group('type', 'id')
embed_url = urljoin(f'https://www.{site}.ca', self._html_search_regex( player_data = self._search_json(
r'<[^>]+\bid=["\']player-iframe["\'][^>]*\bsrc=["\']([^"\']+)', webpage, 'embed url')) r'window\.PLAYER_OPTIONS\[[^\]]+\]\s*=', webpage, 'player data', slug)
video_id = self._match_id(embed_url) # embed url has unique slug video_id = self._match_id(player_data['overlay']['url']) # overlay url always has unique slug
player = self._download_webpage(embed_url, video_id, 'Downloading player page')
if 'MESSAGE_GEOBLOCKED' in player:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
formats, subtitles = self._extract_m3u8_formats_and_subtitles( formats, subtitles = self._extract_m3u8_formats_and_subtitles(
self._html_search_regex(r'source:\s*\'([^\']+)', player, 'm3u8 url'), player_data['source'], video_id, 'mp4', m3u8_id='hls')
video_id, 'mp4', m3u8_id='hls')
if dv_source := self._html_search_regex(r'dvSource:\s*\'([^\']+)', player, 'dv', default=None): if dv_source := url_or_none(player_data.get('dvSource')):
fmts, subs = self._extract_m3u8_formats_and_subtitles( fmts, subs = self._extract_m3u8_formats_and_subtitles(
dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False) dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False)
for fmt in fmts: for fmt in fmts:
@ -246,17 +240,16 @@ class NFBIE(NFBBaseIE):
info = { info = {
'id': video_id, 'id': video_id,
'title': self._html_search_regex( 'title': self._html_search_regex(
r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>', r'["\']nfb_version_title["\']\s*:\s*["\']([^"\']+)',
webpage, 'title', default=None), webpage, 'title', default=None),
'description': self._html_search_regex( 'description': self._html_search_regex(
r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)', r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
webpage, 'description', default=None), webpage, 'description', default=None),
'thumbnail': self._html_search_regex( 'thumbnail': url_or_none(player_data.get('poster')),
r'poster:\s*\'([^\']+)', player, 'thumbnail', default=None),
'uploader': self._html_search_regex( 'uploader': self._html_search_regex(
r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', webpage, 'uploader', default=None), r'<[^>]+\bitemprop=["\']director["\'][^>]*>([^<]+)', webpage, 'uploader', default=None),
'release_year': int_or_none(self._html_search_regex( 'release_year': int_or_none(self._html_search_regex(
r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)', r'["\']nfb_version_year["\']\s*:\s*["\']([^"\']+)',
webpage, 'release_year', default=None)), webpage, 'release_year', default=None)),
} if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id) } if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id)

View file

@ -219,7 +219,29 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': r're:^https?://.+', 'thumbnail': r're:^https?://.+',
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, {
# multiple attachments/embeds
'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977',
'playlist_count': 3,
'info_dict': {
'id': '100601977',
'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis',
'description': 'md5:d099ab976edfce6de2a65c2b169a88d3',
'uploader': 'Bradley Hall',
'uploader_id': '24401883',
'uploader_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_id': '3193932',
'channel_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_follower_count': int,
'timestamp': 1710777855,
'upload_date': '20240318',
'like_count': int,
'comment_count': int,
'thumbnail': r're:^https?://.+',
},
'skip': 'Patron-only content',
}] }]
_RETURN_TYPE = 'video'
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -234,58 +256,54 @@ class PatreonIE(PatreonBaseIE):
'include': 'audio,user,user_defined_tags,campaign,attachments_media', 'include': 'audio,user,user_defined_tags,campaign,attachments_media',
}) })
attributes = post['data']['attributes'] attributes = post['data']['attributes']
title = attributes['title'].strip() info = traverse_obj(attributes, {
image = attributes.get('image') or {} 'title': ('title', {str.strip}),
info = { 'description': ('content', {clean_html}),
'id': video_id, 'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any),
'title': title, 'timestamp': ('published_at', {parse_iso8601}),
'description': clean_html(attributes.get('content')), 'like_count': ('like_count', {int_or_none}),
'thumbnail': image.get('large_url') or image.get('url'), 'comment_count': ('comment_count', {int_or_none}),
'timestamp': parse_iso8601(attributes.get('published_at')), })
'like_count': int_or_none(attributes.get('like_count')),
'comment_count': int_or_none(attributes.get('comment_count')),
}
can_view_post = traverse_obj(attributes, 'current_user_can_view')
if can_view_post and info['comment_count']:
info['__post_extractor'] = self.extract_comments(video_id)
for i in post.get('included', []): entries = []
i_type = i.get('type') idx = 0
if i_type == 'media': for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
media_attributes = i.get('attributes') or {} include_type = include['type']
download_url = media_attributes.get('download_url') if include_type == 'media':
media_attributes = traverse_obj(include, ('attributes', {dict})) or {}
download_url = url_or_none(media_attributes.get('download_url'))
ext = mimetype2ext(media_attributes.get('mimetype')) ext = mimetype2ext(media_attributes.get('mimetype'))
# if size_bytes is None, this media file is likely unavailable # if size_bytes is None, this media file is likely unavailable
# See: https://github.com/yt-dlp/yt-dlp/issues/4608 # See: https://github.com/yt-dlp/yt-dlp/issues/4608
size_bytes = int_or_none(media_attributes.get('size_bytes')) size_bytes = int_or_none(media_attributes.get('size_bytes'))
if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None:
# XXX: what happens if there are multiple attachments? idx += 1
return { entries.append({
**info, 'id': f'{video_id}-{idx}',
'ext': ext, 'ext': ext,
'filesize': size_bytes, 'filesize': size_bytes,
'url': download_url, 'url': download_url,
}
elif i_type == 'user':
user_attributes = i.get('attributes')
if user_attributes:
info.update({
'uploader': user_attributes.get('full_name'),
'uploader_id': str_or_none(i.get('id')),
'uploader_url': user_attributes.get('url'),
}) })
elif i_type == 'post_tag': elif include_type == 'user':
info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value'))) info.update(traverse_obj(include, {
'uploader': ('attributes', 'full_name', {str}),
'uploader_id': ('id', {str_or_none}),
'uploader_url': ('attributes', 'url', {url_or_none}),
}))
elif i_type == 'campaign': elif include_type == 'post_tag':
info.update({ if post_tag := traverse_obj(include, ('attributes', 'value', {str})):
'channel': traverse_obj(i, ('attributes', 'title')), info.setdefault('tags', []).append(post_tag)
'channel_id': str_or_none(i.get('id')),
'channel_url': traverse_obj(i, ('attributes', 'url')), elif include_type == 'campaign':
'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))), info.update(traverse_obj(include, {
}) 'channel': ('attributes', 'title', {str}),
'channel_id': ('id', {str_or_none}),
'channel_url': ('attributes', 'url', {url_or_none}),
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
}))
# handle Vimeo embeds # handle Vimeo embeds
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
@ -296,36 +314,50 @@ class PatreonIE(PatreonBaseIE):
v_url, video_id, 'Checking Vimeo embed URL', v_url, video_id, 'Checking Vimeo embed URL',
headers={'Referer': 'https://patreon.com/'}, headers={'Referer': 'https://patreon.com/'},
fatal=False, errnote=False): fatal=False, errnote=False):
return self.url_result( entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
VimeoIE, url_transparent=True, **info) VimeoIE, url_transparent=True))
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
return self.url_result(embed_url, **info) entries.append(self.url_result(embed_url))
post_file = traverse_obj(attributes, 'post_file') post_file = traverse_obj(attributes, ('post_file', {dict}))
if post_file: if post_file:
name = post_file.get('name') name = post_file.get('name')
ext = determine_ext(name) ext = determine_ext(name)
if ext in KNOWN_EXTENSIONS: if ext in KNOWN_EXTENSIONS:
return { entries.append({
**info, 'id': video_id,
'ext': ext, 'ext': ext,
'url': post_file['url'], 'url': post_file['url'],
} })
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
return { entries.append({
**info, 'id': video_id,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} })
if can_view_post is False: can_view_post = traverse_obj(attributes, 'current_user_can_view')
comments = None
if can_view_post and info.get('comment_count'):
comments = self.extract_comments(video_id)
if not entries and can_view_post is False:
self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True) self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True)
else: elif not entries:
self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True) self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True)
elif len(entries) == 1:
info.update(entries[0])
else:
for entry in entries:
entry.update(info)
return self.playlist_result(entries, video_id, **info, __post_extractor=comments)
info['id'] = video_id
info['__post_extractor'] = comments
return info return info
def _get_comments(self, post_id): def _get_comments(self, post_id):

View file

@ -1,95 +0,0 @@
import urllib.parse
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
parse_duration,
remove_end,
unified_strdate,
ExtractorError,
)
class Porn91IE(InfoExtractor):
IE_NAME = '91porn'
_VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/view_video.php\?([^#]+&)?viewkey=(?P<id>\w+)'
_TESTS = [{
'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
'md5': 'd869db281402e0ef4ddef3c38b866f86',
'info_dict': {
'id': '7e42283b4f5ab36da134',
'title': '18岁大一漂亮学妹水嫩性感再爽一次',
'description': 'md5:1ff241f579b07ae936a54e810ad2e891',
'ext': 'mp4',
'duration': 431,
'upload_date': '20150520',
'comment_count': int,
'view_count': int,
'age_limit': 18,
}
}, {
'url': 'https://91porn.com/view_video.php?viewkey=7ef0cf3d362c699ab91c',
'md5': 'f8fd50540468a6d795378cd778b40226',
'info_dict': {
'id': '7ef0cf3d362c699ab91c',
'title': '真实空乘,冲上云霄第二部',
'description': 'md5:618bf9652cafcc66cd277bd96789baea',
'ext': 'mp4',
'duration': 248,
'upload_date': '20221119',
'comment_count': int,
'view_count': int,
'age_limit': 18,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
self._set_cookie('91porn.com', 'language', 'cn_CN')
webpage = self._download_webpage(
'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id)
if '视频不存在,可能已经被删除或者被举报为不良内容!' in webpage:
raise ExtractorError('91 Porn says: Video does not exist', expected=True)
daily_limit = self._search_regex(
r'作为游客,你每天只可观看([\d]+)个视频', webpage, 'exceeded daily limit', default=None, fatal=False)
if daily_limit:
raise ExtractorError(f'91 Porn says: Daily limit {daily_limit} videos exceeded', expected=True)
video_link_url = self._search_regex(
r'document\.write\(\s*strencode2\s*\(\s*((?:"[^"]+")|(?:\'[^\']+\'))', webpage, 'video link')
video_link_url = self._search_regex(
r'src=["\']([^"\']+)["\']', urllib.parse.unquote(video_link_url), 'unquoted video link')
formats, subtitles = self._get_formats_and_subtitle(video_link_url, video_id)
return {
'id': video_id,
'title': remove_end(self._html_extract_title(webpage).replace('\n', ''), 'Chinese homemade video').strip(),
'formats': formats,
'subtitles': subtitles,
'upload_date': unified_strdate(self._search_regex(
r'<span\s+class=["\']title-yakov["\']>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload_date', fatal=False)),
'description': self._html_search_regex(
r'<span\s+class=["\']more title["\']>\s*([^<]+)', webpage, 'description', fatal=False),
'duration': parse_duration(self._search_regex(
r'时长:\s*<span[^>]*>\s*(\d+(?::\d+){1,2})', webpage, 'duration', fatal=False)),
'comment_count': int_or_none(self._search_regex(
r'留言:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)),
'view_count': int_or_none(self._search_regex(
r'热度:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'view count', fatal=False)),
'age_limit': 18,
}
def _get_formats_and_subtitle(self, video_link_url, video_id):
ext = determine_ext(video_link_url)
if ext == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_link_url, video_id, ext='mp4')
else:
formats = [{'url': video_link_url, 'ext': ext}]
subtitles = {}
return formats, subtitles

View file

@ -97,7 +97,7 @@ class PornHubBaseIE(InfoExtractor):
login_form = self._hidden_inputs(login_page) login_form = self._hidden_inputs(login_page)
login_form.update({ login_form.update({
'username': username, 'email': username,
'password': password, 'password': password,
}) })

View file

@ -361,7 +361,7 @@ class SoundcloudBaseIE(InfoExtractor):
'like_count': extract_count('favoritings') or extract_count('likes'), 'like_count': extract_count('favoritings') or extract_count('likes'),
'comment_count': extract_count('comment'), 'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'), 'repost_count': extract_count('reposts'),
'genre': info.get('genre'), 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)),
'formats': formats if not extract_flat else None 'formats': formats if not extract_flat else None
} }
@ -395,10 +395,10 @@ class SoundcloudIE(SoundcloudBaseIE):
_TESTS = [ _TESTS = [
{ {
'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
'md5': 'ebef0a451b909710ed1d7787dddbf0d7', 'md5': 'de9bac153e7427a7333b4b0c1b6a18d2',
'info_dict': { 'info_dict': {
'id': '62986583', 'id': '62986583',
'ext': 'mp3', 'ext': 'opus',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music', 'uploader': 'E.T. ExTerrestrial Music',
@ -411,6 +411,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
'uploader_url': 'https://soundcloud.com/ethmusic',
'genres': [],
} }
}, },
# geo-restricted # geo-restricted
@ -418,7 +421,7 @@ class SoundcloudIE(SoundcloudBaseIE):
'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
'info_dict': { 'info_dict': {
'id': '47127627', 'id': '47127627',
'ext': 'mp3', 'ext': 'opus',
'title': 'Goldrushed', 'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept', 'uploader': 'The Royal Concept',
@ -431,6 +434,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'uploader_url': 'https://soundcloud.com/the-concept-band',
'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
'genres': ['Alternative'],
}, },
}, },
# private link # private link
@ -452,6 +458,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'uploader_url': 'https://soundcloud.com/jaimemf',
'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
'genres': ['youtubedl'],
}, },
}, },
# private link (alt format) # private link (alt format)
@ -473,6 +482,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'uploader_url': 'https://soundcloud.com/jaimemf',
'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
'genres': ['youtubedl'],
}, },
}, },
# downloadable song # downloadable song
@ -482,6 +494,21 @@ class SoundcloudIE(SoundcloudBaseIE):
'info_dict': { 'info_dict': {
'id': '343609555', 'id': '343609555',
'ext': 'wav', 'ext': 'wav',
'title': 'The Following',
'description': '',
'uploader': '80M',
'uploader_id': '312384765',
'uploader_url': 'https://soundcloud.com/the80m',
'upload_date': '20170922',
'timestamp': 1506120436,
'duration': 397.228,
'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg',
'license': 'all-rights-reserved',
'like_count': int,
'comment_count': int,
'repost_count': int,
'view_count': int,
'genres': ['Dance & EDM'],
}, },
}, },
# private link, downloadable format # private link, downloadable format
@ -503,6 +530,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
'uploader_url': 'https://soundcloud.com/oriuplift',
'genres': ['Trance'],
}, },
}, },
# no album art, use avatar pic for thumbnail # no album art, use avatar pic for thumbnail
@ -525,6 +555,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'uploader_url': 'https://soundcloud.com/garyvee',
'genres': [],
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -532,13 +564,13 @@ class SoundcloudIE(SoundcloudBaseIE):
}, },
{ {
'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', 'md5': '8227c3473a4264df6b02ad7e5b7527ac',
'info_dict': { 'info_dict': {
'id': '583011102', 'id': '583011102',
'ext': 'mp3', 'ext': 'opus',
'title': 'Mezzo Valzer', 'title': 'Mezzo Valzer',
'description': 'md5:4138d582f81866a530317bae316e8b61', 'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a',
'uploader': 'Micronie', 'uploader': 'Giovanni Sarani',
'uploader_id': '3352531', 'uploader_id': '3352531',
'timestamp': 1551394171, 'timestamp': 1551394171,
'upload_date': '20190228', 'upload_date': '20190228',
@ -549,6 +581,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'genres': ['Piano'],
'uploader_url': 'https://soundcloud.com/giovannisarani',
}, },
}, },
{ {

View file

@ -45,19 +45,18 @@ class TikTokBaseIE(InfoExtractor):
# "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
'aid': '0', 'aid': '0',
} }
_KNOWN_APP_INFO = [
'7351144126450059040',
'7351149742343391009',
'7351153174894626592',
]
_APP_INFO_POOL = None _APP_INFO_POOL = None
_APP_INFO = None _APP_INFO = None
_APP_USER_AGENT = None _APP_USER_AGENT = None
@property
def _KNOWN_APP_INFO(self):
return self._configuration_arg('app_info', ie_key=TikTokIE)
@property @property
def _API_HOSTNAME(self): def _API_HOSTNAME(self):
return self._configuration_arg( return self._configuration_arg(
'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0] 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]
def _get_next_app_info(self): def _get_next_app_info(self):
if self._APP_INFO_POOL is None: if self._APP_INFO_POOL is None:
@ -66,13 +65,10 @@ class TikTokBaseIE(InfoExtractor):
for key, default in self._APP_INFO_DEFAULTS.items() for key, default in self._APP_INFO_DEFAULTS.items()
if key != 'iid' if key != 'iid'
} }
app_info_list = (
self._configuration_arg('app_info', ie_key=TikTokIE)
or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO)))
self._APP_INFO_POOL = [ self._APP_INFO_POOL = [
{**defaults, **dict( {**defaults, **dict(
(k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
)} for app_info in app_info_list )} for app_info in self._KNOWN_APP_INFO
] ]
if not self._APP_INFO_POOL: if not self._APP_INFO_POOL:
@ -757,11 +753,13 @@ class TikTokIE(TikTokBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id, user_id = self._match_valid_url(url).group('id', 'user_id') video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
try:
return self._extract_aweme_app(video_id) if self._KNOWN_APP_INFO:
except ExtractorError as e: try:
e.expected = True return self._extract_aweme_app(video_id)
self.report_warning(f'{e}; trying with webpage') except ExtractorError as e:
e.expected = True
self.report_warning(f'{e}; trying with webpage')
url = self._create_url(user_id, video_id) url = self._create_url(user_id, video_id)
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}) webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})

View file

@ -2,85 +2,88 @@ import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_html,
determine_ext, determine_ext,
extract_attributes, extract_attributes,
get_element_by_class,
get_element_html_by_class,
int_or_none, int_or_none,
parse_duration,
traverse_obj,
try_get,
url_or_none, url_or_none,
) )
from ..utils.traversal import traverse_obj
class TV5MondePlusIE(InfoExtractor): class TV5MondePlusIE(InfoExtractor):
IE_DESC = 'TV5MONDE+' IE_NAME = 'TV5MONDE'
_VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:www\.)?tv5monde\.com/tv/video/(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
# movie # documentary
'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/les-novices', 'url': 'https://www.tv5monde.com/tv/video/65931-baudouin-l-heritage-d-un-roi-baudouin-l-heritage-d-un-roi',
'md5': 'c86f60bf8b75436455b1b205f9745955', 'md5': 'd2a708902d3df230a357c99701aece05',
'info_dict': { 'info_dict': {
'id': 'ZX0ipMyFQq_6D4BA7b', 'id': '3FPa7JMu21_6D4BA7b',
'display_id': 'les-novices', 'display_id': '65931-baudouin-l-heritage-d-un-roi-baudouin-l-heritage-d-un-roi',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Les novices', 'title': "Baudouin, l'héritage d'un roi",
'description': 'md5:2e7c33ba3ad48dabfcc2a956b88bde2b', 'thumbnail': 'https://psi.tv5monde.com/upsilon-images/960x540/6f/baudouin-f49c6b0e.jpg',
'upload_date': '20230821', 'duration': 4842,
'thumbnail': 'https://revoir.tv5monde.com/uploads/media/video_thumbnail/0738/60/01e952b7ccf36b7c6007ec9131588954ab651de9.jpeg', 'upload_date': '20240130',
'duration': 5177, 'timestamp': 1706641242,
'episode': 'Les novices', 'episode': "BAUDOUIN, L'HERITAGE D'UN ROI",
'description': 'md5:78125c74a5cac06d7743a2d09126edad',
'series': "Baudouin, l'héritage d'un roi",
}, },
}, { }, {
# series episode # series episode
'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/opj-les-dents-de-la-terre-2', 'url': 'https://www.tv5monde.com/tv/video/52952-toute-la-vie-mardi-23-mars-2021',
'md5': 'f5e09637cadd55639c05874e22eb56bf',
'info_dict': { 'info_dict': {
'id': 'wJ0eeEPozr_6D4BA7b', 'id': 'obRRZ8m6g9_6D4BA7b',
'display_id': 'opj-les-dents-de-la-terre-2', 'display_id': '52952-toute-la-vie-mardi-23-mars-2021',
'ext': 'mp4', 'ext': 'mp4',
'title': "OPJ - Les dents de la Terre (2)", 'title': 'Toute la vie',
'description': 'md5:288f87fd68d993f814e66e60e5302d9d', 'description': 'md5:a824a2e1dfd94cf45fa379a1fb43ce65',
'upload_date': '20230823', 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/5880553.jpg',
'series': 'OPJ', 'duration': 2526,
'episode': 'Les dents de la Terre (2)', 'upload_date': '20230721',
'duration': 2877, 'timestamp': 1689971646,
'thumbnail': 'https://dl-revoir.tv5monde.com/images/1a/5753448.jpg' 'series': 'Toute la vie',
'episode': 'Mardi 23 mars 2021',
}, },
}, { }, {
# movie # movie
'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent', 'url': 'https://www.tv5monde.com/tv/video/8771-ce-fleuve-qui-nous-charrie-ce-fleuve-qui-nous-charrie-p001-ce-fleuve-qui-nous-charrie',
'md5': '32fa0cde16a4480d1251502a66856d5f', 'md5': '87cefc34e10a6bf4f7823cccd7b36eb2',
'info_dict': { 'info_dict': {
'id': 'dc57a011-ec4b-4648-2a9a-4f03f8352ed3', 'id': 'DOcfvdLKXL_6D4BA7b',
'display_id': 'ceux-qui-travaillent', 'display_id': '8771-ce-fleuve-qui-nous-charrie-ce-fleuve-qui-nous-charrie-p001-ce-fleuve-qui-nous-charrie',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Ceux qui travaillent', 'title': 'Ce fleuve qui nous charrie',
'description': 'md5:570e8bb688036ace873b2d50d24c026d', 'description': 'md5:62ba3f875343c7fc4082bdfbbc1be992',
'upload_date': '20210819', 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/5476617.jpg',
'duration': 5300,
'upload_date': '20210822',
'timestamp': 1629594105,
'episode': 'CE FLEUVE QUI NOUS CHARRIE-P001-CE FLEUVE QUI NOUS CHARRIE',
'series': 'Ce fleuve qui nous charrie',
}, },
'skip': 'no longer available',
}, { }, {
# series episode # news
'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice', 'url': 'https://www.tv5monde.com/tv/video/70402-tv5monde-le-journal-edition-du-08-05-24-11h',
'md5': 'c62977d6d10754a2ecebba70ad370479',
'info_dict': { 'info_dict': {
'id': '9e9d599e-23af-6915-843e-ecbf62e97925', 'id': 'LgQFrOCNsc_6D4BA7b',
'display_id': 'vestiaires-caro-actrice', 'display_id': '70402-tv5monde-le-journal-edition-du-08-05-24-11h',
'ext': 'mp4', 'ext': 'mp4',
'title': "Vestiaires - Caro actrice", 'title': 'TV5MONDE, le journal',
'description': 'md5:db15d2e1976641e08377f942778058ea', 'description': 'md5:777dc209eaa4423b678477c36b0b04a8',
'upload_date': '20210819', 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/6184105.jpg',
'series': "Vestiaires", 'duration': 854,
'episode': 'Caro actrice', 'upload_date': '20240508',
'timestamp': 1715159640,
'series': 'TV5MONDE, le journal',
'episode': 'EDITION DU 08/05/24 - 11H',
}, },
'params': {
'skip_download': True,
},
'skip': 'no longer available',
}, {
'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver',
'only_matching': True,
}, {
'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30',
'only_matching': True,
}] }]
_GEO_BYPASS = False _GEO_BYPASS = False
@ -98,7 +101,6 @@ class TV5MondePlusIE(InfoExtractor):
if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
self.raise_geo_restricted(countries=['FR']) self.raise_geo_restricted(countries=['FR'])
title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
vpl_data = extract_attributes(self._search_regex( vpl_data = extract_attributes(self._search_regex(
r'(<[^>]+class="video_player_loader"[^>]+>)', r'(<[^>]+class="video_player_loader"[^>]+>)',
webpage, 'video player loader')) webpage, 'video player loader'))
@ -147,26 +149,7 @@ class TV5MondePlusIE(InfoExtractor):
process_video_files(video_files) process_video_files(video_files)
metadata = self._parse_json( metadata = self._parse_json(
vpl_data['data-metadata'], display_id) vpl_data.get('data-metadata') or '{}', display_id, fatal=False)
duration = (int_or_none(try_get(metadata, lambda x: x['content']['duration']))
or parse_duration(self._html_search_meta('duration', webpage)))
description = self._html_search_regex(
r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage,
'description', fatal=False)
series = self._html_search_regex(
r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage,
'series', default=None)
if series and series != title:
title = '%s - %s' % (series, title)
upload_date = self._search_regex(
r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})',
webpage, 'upload date', default=None)
if upload_date:
upload_date = upload_date.replace('_', '')
if not video_id: if not video_id:
video_id = self._search_regex( video_id = self._search_regex(
@ -175,16 +158,20 @@ class TV5MondePlusIE(InfoExtractor):
default=display_id) default=display_id)
return { return {
**traverse_obj(metadata, ('content', {
'id': ('id', {str}),
'title': ('title', {str}),
'episode': ('title', {str}),
'series': ('series', {str}),
'timestamp': ('publishDate_ts', {int_or_none}),
'duration': ('duration', {int_or_none}),
})),
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'title': title, 'title': clean_html(get_element_by_class('main-title', webpage)),
'description': description, 'description': clean_html(get_element_by_class('text', get_element_html_by_class('ep-summary', webpage) or '')),
'thumbnail': vpl_data.get('data-image'), 'thumbnail': url_or_none(vpl_data.get('data-image')),
'duration': duration,
'upload_date': upload_date,
'formats': formats, 'formats': formats,
'subtitles': self._extract_subtitles(self._parse_json( 'subtitles': self._extract_subtitles(self._parse_json(
traverse_obj(vpl_data, ('data-captions', {str}), default='{}'), display_id, fatal=False)), traverse_obj(vpl_data, ('data-captions', {str}), default='{}'), display_id, fatal=False)),
'series': series,
'episode': episode,
} }

View file

@ -1,10 +1,9 @@
import functools
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none
float_or_none, from ..utils.traversal import traverse_obj
int_or_none,
smuggle_url,
strip_or_none,
)
class TVAIE(InfoExtractor): class TVAIE(InfoExtractor):
@ -49,11 +48,20 @@ class QubIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '6084352463001', 'id': '6084352463001',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Épisode 01', 'title': 'Ép 01. Mon dernier jour',
'uploader_id': '5481942443001', 'uploader_id': '5481942443001',
'upload_date': '20190907', 'upload_date': '20190907',
'timestamp': 1567899756, 'timestamp': 1567899756,
'description': 'md5:9c0d7fbb90939420c651fd977df90145', 'description': 'md5:9c0d7fbb90939420c651fd977df90145',
'thumbnail': r're:https://.+\.jpg',
'episode': 'Ép 01. Mon dernier jour',
'episode_number': 1,
'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'],
'duration': 2625.963,
'season': 'Season 1',
'season_number': 1,
'series': 'Alerte Amber',
'channel': 'TVA',
}, },
}, { }, {
'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943',
@ -64,22 +72,24 @@ class QubIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
entity_id = self._match_id(url) entity_id = self._match_id(url)
entity = self._download_json( webpage = self._download_webpage(url, entity_id)
'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData']
entity_id, query={'id': entity_id})
video_id = entity['videoId'] video_id = entity['videoId']
episode = strip_or_none(entity.get('name')) episode = strip_or_none(entity.get('name'))
return { return {
'_type': 'url_transparent', '_type': 'url_transparent',
'url': f'https://videos.tva.ca/details/_{video_id}',
'ie_key': TVAIE.ie_key(),
'id': video_id, 'id': video_id,
'title': episode, 'title': episode,
# 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'],
'url': 'https://videos.tva.ca/details/_' + video_id,
'description': entity.get('longDescription'),
'duration': float_or_none(entity.get('durationMillis'), 1000),
'episode': episode, 'episode': episode,
'episode_number': int_or_none(entity.get('episodeNumber')), **traverse_obj(entity, {
# 'ie_key': 'BrightcoveNew', 'description': ('longDescription', {str}),
'ie_key': TVAIE.ie_key(), 'duration': ('durationMillis', {functools.partial(float_or_none, scale=1000)}),
'channel': ('knownEntities', 'channel', 'name', {str}),
'series': ('knownEntities', 'videoShow', 'name', {str}),
'season_number': ('slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none}),
'episode_number': ('episodeNumber', {int_or_none}),
}),
} }

View file

@ -34,9 +34,9 @@ from ..utils import (
class TwitterBaseIE(InfoExtractor): class TwitterBaseIE(InfoExtractor):
_NETRC_MACHINE = 'twitter' _NETRC_MACHINE = 'twitter'
_API_BASE = 'https://api.twitter.com/1.1/' _API_BASE = 'https://api.x.com/1.1/'
_GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _GRAPHQL_API_BASE = 'https://x.com/i/api/graphql/'
_BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
_AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
_LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE' _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
_flow_token = None _flow_token = None
@ -153,6 +153,14 @@ class TwitterBaseIE(InfoExtractor):
def is_logged_in(self): def is_logged_in(self):
return bool(self._get_cookies(self._API_BASE).get('auth_token')) return bool(self._get_cookies(self._API_BASE).get('auth_token'))
# XXX: Temporary workaround until twitter.com => x.com migration is completed
def _real_initialize(self):
if self.is_logged_in or not self._get_cookies('https://twitter.com/').get('auth_token'):
return
# User has not yet been migrated to x.com and has passed twitter.com cookies
TwitterBaseIE._API_BASE = 'https://api.twitter.com/1.1/'
TwitterBaseIE._GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
@functools.cached_property @functools.cached_property
def _selected_api(self): def _selected_api(self):
return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0]
@ -196,17 +204,15 @@ class TwitterBaseIE(InfoExtractor):
if self.is_logged_in: if self.is_logged_in:
return return
webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') guest_token = self._fetch_guest_token(None)
guest_token = self._search_regex(
r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None)
headers = { headers = {
**self._set_base_headers(), **self._set_base_headers(),
'content-type': 'application/json', 'content-type': 'application/json',
'x-guest-token': guest_token, 'x-guest-token': guest_token,
'x-twitter-client-language': 'en', 'x-twitter-client-language': 'en',
'x-twitter-active-user': 'yes', 'x-twitter-active-user': 'yes',
'Referer': 'https://twitter.com/', 'Referer': 'https://x.com/',
'Origin': 'https://twitter.com', 'Origin': 'https://x.com',
} }
def build_login_json(*subtask_inputs): def build_login_json(*subtask_inputs):
@ -1191,6 +1197,31 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1724884212803834154'], '_old_archive_ids': ['twitter 1724884212803834154'],
}, },
}, {
# x.com
'url': 'https://x.com/historyinmemes/status/1790637656616943991',
'md5': 'daca3952ba0defe2cfafb1276d4c1ea5',
'info_dict': {
'id': '1790637589910654976',
'ext': 'mp4',
'title': 'Historic Vids - One of the most intense moments in history',
'description': 'One of the most intense moments in history https://t.co/Zgzhvix8ES',
'display_id': '1790637656616943991',
'uploader': 'Historic Vids',
'uploader_id': 'historyinmemes',
'uploader_url': 'https://twitter.com/historyinmemes',
'channel_id': '855481986290524160',
'upload_date': '20240515',
'timestamp': 1715756260.0,
'duration': 15.488,
'tags': [],
'comment_count': int,
'repost_count': int,
'like_count': int,
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'age_limit': 0,
'_old_archive_ids': ['twitter 1790637656616943991'],
}
}, { }, {
# onion route # onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',

View file

@ -173,6 +173,20 @@ class KnownPiracyIE(UnsupportedInfoExtractor):
r'filemoon\.sx', r'filemoon\.sx',
r'hentai\.animestigma\.com', r'hentai\.animestigma\.com',
r'thisav\.com', r'thisav\.com',
r'gounlimited\.to',
r'highstream\.tv',
r'uqload\.com',
r'vedbam\.xyz',
r'vadbam\.net'
r'vidlo\.us',
r'wolfstream\.tv',
r'xvideosharing\.com',
r'(?:\w+\.)?viidshar\.com',
r'sxyprn\.com',
r'jable\.tv',
r'91porn\.com',
r'einthusan\.(?:tv|com|ca)',
r'yourupload\.com',
) )
_TESTS = [{ _TESTS = [{

View file

@ -451,6 +451,7 @@ class VKIE(VKBaseIE):
info_page, 'view count', default=None)) info_page, 'view count', default=None))
formats = [] formats = []
subtitles = {}
for format_id, format_url in data.items(): for format_id, format_url in data.items():
format_url = url_or_none(format_url) format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//', 'rtmp')): if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
@ -462,12 +463,21 @@ class VKIE(VKBaseIE):
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
'url': format_url, 'url': format_url,
'ext': 'mp4',
'source_preference': 1,
'height': height, 'height': height,
}) })
elif format_id == 'hls': elif format_id == 'hls':
formats.extend(self._extract_m3u8_formats( fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', 'm3u8_native', format_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False, live=is_live)) m3u8_id=format_id, fatal=False, live=is_live)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif format_id.startswith('dash_'):
fmts, subs = self._extract_mpd_formats_and_subtitles(
format_url, video_id, mpd_id=format_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif format_id == 'rtmp': elif format_id == 'rtmp':
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -475,7 +485,6 @@ class VKIE(VKBaseIE):
'ext': 'flv', 'ext': 'flv',
}) })
subtitles = {}
for sub in data.get('subs') or {}: for sub in data.get('subs') or {}:
subtitles.setdefault(sub.get('lang', 'en'), []).append({ subtitles.setdefault(sub.get('lang', 'en'), []).append({
'ext': sub.get('title', '.srt').split('.')[-1], 'ext': sub.get('title', '.srt').split('.')[-1],
@ -496,6 +505,7 @@ class VKIE(VKBaseIE):
'comment_count': int_or_none(mv_data.get('commcount')), 'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live, 'is_live': is_live,
'subtitles': subtitles, 'subtitles': subtitles,
'_format_sort_fields': ('res', 'source'),
} }

View file

@ -12,6 +12,7 @@ from ..utils import (
jwt_decode_hs256, jwt_decode_hs256,
traverse_obj, traverse_obj,
try_call, try_call,
url_basename,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
variadic, variadic,
@ -194,8 +195,7 @@ class WrestleUniverseVODIE(WrestleUniverseBaseIE):
return { return {
'id': video_id, 'id': video_id,
'formats': self._get_formats(video_data, ( 'formats': self._get_formats(video_data, ('protocolHls', 'url', {url_or_none}), video_id),
(('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id),
**traverse_obj(metadata, { **traverse_obj(metadata, {
'title': ('displayName', {str}), 'title': ('displayName', {str}),
'description': ('description', {str}), 'description': ('description', {str}),
@ -259,6 +259,10 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE):
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',
}, },
}, {
'note': 'manifest provides live-a (partial) and live-b (full) streams',
'url': 'https://www.wrestle-universe.com/en/lives/umc99R9XsexXrxr9VjTo9g',
'only_matching': True,
}] }]
_API_PATH = 'events' _API_PATH = 'events'
@ -285,12 +289,16 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE):
video_data, decrypt = self._call_encrypted_api( video_data, decrypt = self._call_encrypted_api(
video_id, ':watchArchive', 'watch archive', data={'method': 1}) video_id, ':watchArchive', 'watch archive', data={'method': 1})
info['formats'] = self._get_formats(video_data, ( # 'chromecastUrls' can be only partial videos, avoid
('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id) info['formats'] = self._get_formats(video_data, ('hls', (('urls', ...), 'url'), {url_or_none}), video_id)
for f in info['formats']: for f in info['formats']:
# bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
if f.get('tbr'): if f.get('tbr'):
f['tbr'] = int(f['tbr'] / 2.5) f['tbr'] = int(f['tbr'] / 2.5)
# prefer variants with the same basename as the master playlist to avoid partial streams
f['format_id'] = url_basename(f['url']).partition('.')[0]
if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]):
f['preference'] = -10
hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt})) hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt}))
if hls_aes_key: if hls_aes_key:

View file

@ -1,198 +0,0 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
decode_packed_codes,
determine_ext,
int_or_none,
js_to_json,
urlencode_postdata,
)
# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58
def aa_decode(aa_code):
symbol_table = [
('7', '((゚ー゚) + (o^_^o))'),
('6', '((o^_^o) +(o^_^o))'),
('5', '((゚ー゚) + (゚Θ゚))'),
('2', '((o^_^o) - (゚Θ゚))'),
('4', '(゚ー゚)'),
('3', '(o^_^o)'),
('1', '(゚Θ゚)'),
('0', '(c^_^o)'),
]
delim = '(゚Д゚)[゚ε゚]+'
ret = ''
for aa_char in aa_code.split(delim):
for val, pat in symbol_table:
aa_char = aa_char.replace(pat, val)
aa_char = aa_char.replace('+ ', '')
m = re.match(r'^\d+', aa_char)
if m:
ret += chr(int(m.group(0), 8))
else:
m = re.match(r'^u([\da-f]+)', aa_char)
if m:
ret += chr(int(m.group(1), 16))
return ret
class XFileShareIE(InfoExtractor):
_SITES = (
(r'aparat\.cam', 'Aparat'),
(r'clipwatching\.com', 'ClipWatching'),
(r'gounlimited\.to', 'GoUnlimited'),
(r'govid\.me', 'GoVid'),
(r'holavid\.com', 'HolaVid'),
(r'streamty\.com', 'Streamty'),
(r'thevideobee\.to', 'TheVideoBee'),
(r'uqload\.com', 'Uqload'),
(r'vidbom\.com', 'VidBom'),
(r'vidlo\.us', 'vidlo'),
(r'vidlocker\.xyz', 'VidLocker'),
(r'vidshare\.tv', 'VidShare'),
(r'vup\.to', 'VUp'),
(r'wolfstream\.tv', 'WolfStream'),
(r'xvideosharing\.com', 'XVideoSharing'),
)
IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
_VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
% '|'.join(site for site in list(zip(*_SITES))[0]))
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])]
_FILE_NOT_FOUND_REGEXES = (
r'>(?:404 - )?File Not Found<',
r'>The file was removed by administrator<',
)
_TESTS = [{
'url': 'https://uqload.com/dltx1wztngdz',
'md5': '3cfbb65e4c90e93d7b37bcb65a595557',
'info_dict': {
'id': 'dltx1wztngdz',
'ext': 'mp4',
'title': 'Rick Astley Never Gonna Give You mp4',
'thumbnail': r're:https://.*\.jpg'
}
}, {
'url': 'http://xvideosharing.com/fq65f94nd2ve',
'md5': '4181f63957e8fe90ac836fa58dc3c8a6',
'info_dict': {
'id': 'fq65f94nd2ve',
'ext': 'mp4',
'title': 'sample',
'thumbnail': r're:http://.*\.jpg',
},
}, {
'url': 'https://aparat.cam/n4d6dh0wvlpr',
'only_matching': True,
}, {
'url': 'https://wolfstream.tv/nthme29v9u2x',
'only_matching': True,
}]
def _real_extract(self, url):
host, video_id = self._match_valid_url(url).groups()
url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
webpage = self._download_webpage(url, video_id)
if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
fields = self._hidden_inputs(webpage)
if fields.get('op') == 'download1':
countdown = int_or_none(self._search_regex(
r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>',
webpage, 'countdown', default=None))
if countdown:
self._sleep(countdown, video_id)
webpage = self._download_webpage(
url, video_id, 'Downloading video page',
data=urlencode_postdata(fields), headers={
'Referer': url,
'Content-type': 'application/x-www-form-urlencoded',
})
title = (self._search_regex(
(r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>',
r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+)[ <]',
r'<h2 class="video-page-head">([^<]+)</h2>',
r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to
r'title\s*:\s*"([^"]+)"'), # govid.me
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or video_id).strip()
for regex, func in (
(r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes),
(r'(゚.+)', aa_decode)):
obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
if obf_code:
webpage = webpage.replace(obf_code, func(obf_code))
formats = []
jwplayer_data = self._search_regex(
[
r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);',
r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);',
], webpage,
'jwplayer data', default=None)
if jwplayer_data:
jwplayer_data = self._parse_json(
jwplayer_data.replace(r"\'", "'"), video_id, js_to_json)
if jwplayer_data:
formats = self._parse_jwplayer_data(
jwplayer_data, video_id, False,
m3u8_id='hls', mpd_id='dash')['formats']
if not formats:
urls = []
for regex in (
r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
for mobj in re.finditer(regex, webpage):
video_url = mobj.group('url')
if video_url not in urls:
urls.append(video_url)
sources = self._search_regex(
r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None)
if sources:
urls.extend(self._parse_json(sources, video_id))
formats = []
for video_url in urls:
if determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
else:
formats.append({
'url': video_url,
'format_id': 'sd',
})
thumbnail = self._search_regex(
[
r'<video[^>]+poster="([^"]+)"',
r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],',
], webpage, 'thumbnail', default=None)
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
'http_headers': {'Referer': url}
}

View file

@ -173,8 +173,41 @@ class XVideosIE(InfoExtractor):
class XVideosQuickiesIE(InfoExtractor): class XVideosQuickiesIE(InfoExtractor):
IE_NAME = 'xvideos:quickies' IE_NAME = 'xvideos:quickies'
_VALID_URL = r'https?://(?P<domain>(?:[^/]+\.)?xvideos2?\.com)/amateur-channels/[^#]+#quickies/a/(?P<id>\d+)' _VALID_URL = r'https?://(?P<domain>(?:[^/?#]+\.)?xvideos2?\.com)/(?:profiles/|amateur-channels/)?[^/?#]+#quickies/a/(?P<id>\w+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.xvideos.com/lili_love#quickies/a/ipdtikh1a4c',
'md5': 'f9e4f518ff1de14b99a400bbd0fc5ee0',
'info_dict': {
'id': 'ipdtikh1a4c',
'ext': 'mp4',
'title': 'Mexican chichóna putisima',
'age_limit': 18,
'duration': 81,
'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://www.xvideos.com/profiles/lili_love#quickies/a/ipphaob6fd1',
'md5': '5340938aac6b46e19ebdd1d84535862e',
'info_dict': {
'id': 'ipphaob6fd1',
'ext': 'mp4',
'title': 'Puta chichona mexicana squirting',
'age_limit': 18,
'duration': 56,
'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://www.xvideos.com/amateur-channels/lili_love#quickies/a/hfmffmd7661',
'md5': '92428518bbabcb4c513e55922e022491',
'info_dict': {
'id': 'hfmffmd7661',
'ext': 'mp4',
'title': 'Chichona mexican slut',
'age_limit': 18,
'duration': 9,
'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683', 'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683',
'md5': '16e322a93282667f1963915568f782c1', 'md5': '16e322a93282667f1963915568f782c1',
'info_dict': { 'info_dict': {
@ -189,4 +222,4 @@ class XVideosQuickiesIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
domain, id_ = self._match_valid_url(url).group('domain', 'id') domain, id_ = self._match_valid_url(url).group('domain', 'id')
return self.url_result(f'https://{domain}/video{id_}/_', XVideosIE, id_) return self.url_result(f'https://{domain}/video{"" if id_.isdecimal() else "."}{id_}/_', XVideosIE, id_)

View file

@ -259,15 +259,15 @@ class ZenYandexIE(InfoExtractor):
webpage = self._download_webpage(redirect, video_id, note='Redirecting') webpage = self._download_webpage(redirect, video_id, note='Redirecting')
data_json = self._search_json( data_json = self._search_json(
r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state')
webpage, 'server state').replace('State', 'Settings')
uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)', uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)',
webpage, 'uploader', default='<a>') webpage, 'uploader', default='<a>')
uploader_name = extract_attributes(uploader).get('aria-label') uploader_name = extract_attributes(uploader).get('aria-label')
video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict) item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str}))
stream_urls = try_get(video_json, lambda x: x['video']['streams']) video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {}
formats, subtitles = [], {} formats, subtitles = [], {}
for s_url in stream_urls: for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})):
ext = determine_ext(s_url) ext = determine_ext(s_url)
if ext == 'mpd': if ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash') fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')

View file

@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor):
'id': '16290308', 'id': '16290308',
'age_limit': 18, 'age_limit': 18,
'categories': [], 'categories': [],
'description': 'md5:00ea70f642f431c379763c17c2f396bc', 'description': str, # TODO: detect/remove SEO spam description in ytdl backport
'display_id': 'tinderspecial-trailer1', 'display_id': 'tinderspecial-trailer1',
'duration': 298.0, 'duration': 298.0,
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20201123', 'upload_date': '20201123',
'uploader': 'Ersties', 'uploader': 'Ersties',
'tags': [], 'tags': [],
'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg', 'thumbnail': r're:https://.+\.jpg',
'timestamp': 1606089600, 'timestamp': 1606147564,
'title': 'Tinder In Real Life', 'title': 'Tinder In Real Life',
'view_count': int, 'view_count': int,
} }
@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id') video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
definitions = self._download_json( self._set_cookie('.youporn.com', 'age_verified', '1')
f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id) webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id)
definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions']
def get_format_data(data, f): def get_format_data(data, stream_type):
return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl'])) info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any))
if not info_url:
return []
return traverse_obj(
self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False),
lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl']))
formats = [] formats = []
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
@ -123,10 +129,6 @@ class YouPornIE(InfoExtractor):
f['height'] = height f['height'] = height
formats.append(f) formats.append(f)
webpage = self._download_webpage(
'http://www.youporn.com/watch/%s' % video_id, display_id,
headers={'Cookie': 'age_verified=1'})
title = self._html_search_regex( title = self._html_search_regex(
r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
webpage, 'title', default=None) or self._og_search_title( webpage, 'title', default=None) or self._og_search_title(

View file

@ -1,65 +0,0 @@
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
parse_duration,
urljoin,
)
class YourPornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?sxyprn\.com/post/(?P<id>[^/?#&.]+)'
_TESTS = [{
'url': 'https://sxyprn.com/post/57ffcb2e1179b.html',
'md5': '6f8682b6464033d87acaa7a8ff0c092e',
'info_dict': {
'id': '57ffcb2e1179b',
'ext': 'mp4',
'title': 'md5:c9f43630bd968267672651ba905a7d35',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 165,
'age_limit': 18,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://sxyprn.com/post/57ffcb2e1179b.html',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
parts = self._parse_json(
self._search_regex(
r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info',
group='data'),
video_id)[video_id].split('/')
num = 0
for c in parts[6] + parts[7]:
if c.isnumeric():
num += int(c)
parts[5] = compat_str(int(parts[5]) - num)
parts[1] += '8'
video_url = urljoin(url, '/'.join(parts))
title = (self._search_regex(
r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title',
default=None) or self._og_search_description(webpage)).strip()
thumbnail = self._og_search_thumbnail(webpage)
duration = parse_duration(self._search_regex(
r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration',
default=None))
return {
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
'duration': duration,
'age_limit': 18,
'ext': 'mp4',
}

View file

@ -1,43 +0,0 @@
from .common import InfoExtractor
from ..utils import urljoin
class YourUploadIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:yourupload\.com/(?:watch|embed)|embed\.yourupload\.com)/(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://yourupload.com/watch/14i14h',
'md5': '5e2c63385454c557f97c4c4131a393cd',
'info_dict': {
'id': '14i14h',
'ext': 'mp4',
'title': 'BigBuckBunny_320x180.mp4',
'thumbnail': r're:^https?://.*\.jpe?g',
}
}, {
'url': 'http://www.yourupload.com/embed/14i14h',
'only_matching': True,
}, {
'url': 'http://embed.yourupload.com/14i14h',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
embed_url = 'http://www.yourupload.com/embed/%s' % video_id
webpage = self._download_webpage(embed_url, video_id)
title = self._og_search_title(webpage)
video_url = urljoin(embed_url, self._og_search_video_url(webpage))
thumbnail = self._og_search_thumbnail(webpage, default=None)
return {
'id': video_id,
'title': title,
'url': video_url,
'thumbnail': thumbnail,
'http_headers': {
'Referer': embed_url,
},
}

View file

@ -240,6 +240,16 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 85 'INNERTUBE_CONTEXT_CLIENT_NAME': 85
}, },
# This client has pre-merged video+audio 720p/1080p streams
'mediaconnect': {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'MEDIA_CONNECT_FRONTEND',
'clientVersion': '0.1',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 95
},
} }
@ -1171,7 +1181,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
) )
_formats = { _formats = { # NB: Used in YoutubeWebArchiveIE and GoogleDriveIE
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
@ -2343,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'format': '17', # 3gp format available on android 'format': '17', # 3gp format available on android
'extractor_args': {'youtube': {'player_client': ['android']}}, 'extractor_args': {'youtube': {'player_client': ['android']}},
}, },
'skip': 'android client broken',
}, },
{ {
# Skip download of additional client configs (remix client config in this case) # Skip download of additional client configs (remix client config in this case)
@ -2720,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'heatmap': 'count:100', 'heatmap': 'count:100',
}, },
'params': { 'params': {
'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
}, },
}, },
] ]
@ -3307,7 +3318,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'value': ('intensityScoreNormalized', {float_or_none}), 'value': ('intensityScoreNormalized', {float_or_none}),
})) or None })) or None
def _extract_comment(self, comment_renderer, parent=None): def _extract_comment(self, entities, parent=None):
comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
return
toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
return {
'id': comment_id,
'parent': parent or 'root',
**traverse_obj(comment_entity_payload, {
'text': ('properties', 'content', 'content', {str}),
'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
'author_id': ('author', 'channelId', {self.ucid_or_none}),
'author': ('author', 'displayName', {str}),
'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
'author_is_uploader': ('author', 'isCreator', {bool}),
'author_is_verified': ('author', 'isVerified', {bool}),
'author_url': ('author', 'channelCommand', 'innertubeCommand', (
('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
), {lambda x: urljoin('https://www.youtube.com', x)}),
}, get_all=False),
'is_favorited': (None if toolbar_entity_payload is None else
toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
'_time_text': time_text, # FIXME: non-standard, but we need a way of showing that it is an estimate.
'timestamp': self._parse_time_text(time_text),
}
def _extract_comment_old(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId') comment_id = comment_renderer.get('commentId')
if not comment_id: if not comment_id:
return return
@ -3388,21 +3428,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break break
return _continuation return _continuation
def extract_thread(contents): def extract_thread(contents, entity_payloads):
if not parent: if not parent:
tracker['current_page_thread'] = 0 tracker['current_page_thread'] = 0
for content in contents: for content in contents:
if not parent and tracker['total_parent_comments'] >= max_parents: if not parent and tracker['total_parent_comments'] >= max_parents:
yield yield
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})
comment = self._extract_comment(comment_renderer, parent) # old comment format
if not entity_payloads:
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})
comment = self._extract_comment_old(comment_renderer, parent)
# new comment format
else:
view_model = (
traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
or traverse_obj(content, ('commentViewModel', {dict})))
comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
if not comment_keys:
continue
entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
comment = self._extract_comment(entities, parent)
if comment:
comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
if not comment: if not comment:
continue continue
comment_id = comment['id'] comment_id = comment['id']
if comment.get('is_pinned'): if comment.get('is_pinned'):
tracker['pinned_comment_ids'].add(comment_id) tracker['pinned_comment_ids'].add(comment_id)
# Sometimes YouTube may break and give us infinite looping comments. # Sometimes YouTube may break and give us infinite looping comments.
@ -3495,7 +3553,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
check_get_keys = None check_get_keys = None
if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0): if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
check_get_keys = [[*continuation_items_path, ..., ( check_get_keys = [[*continuation_items_path, ..., (
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]] 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
try: try:
response = self._extract_response( response = self._extract_response(
item_id=None, query=continuation, item_id=None, query=continuation,
@ -3519,6 +3577,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise raise
is_forced_continuation = False is_forced_continuation = False
continuation = None continuation = None
mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
if is_first_continuation: if is_first_continuation:
continuation = extract_header(continuation_items) continuation = extract_header(continuation_items)
@ -3527,7 +3586,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break break
continue continue
for entry in extract_thread(continuation_items): for entry in extract_thread(continuation_items, mutations):
if not entry: if not entry:
return return
yield entry yield entry
@ -3604,8 +3663,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
yt_query = { yt_query = {
'videoId': video_id, 'videoId': video_id,
} }
if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
yt_query['params'] = 'CgIIAQ=='
pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
if pp_arg: if pp_arg:
@ -3621,19 +3678,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _get_requested_clients(self, url, smuggled_data): def _get_requested_clients(self, url, smuggled_data):
requested_clients = [] requested_clients = []
default = ['ios', 'android', 'web'] android_clients = []
default = ['ios', 'web']
allowed_clients = sorted( allowed_clients = sorted(
(client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'), (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
for client in self._configuration_arg('player_client'): for client in self._configuration_arg('player_client'):
if client in allowed_clients: if client == 'default':
requested_clients.append(client)
elif client == 'default':
requested_clients.extend(default) requested_clients.extend(default)
elif client == 'all': elif client == 'all':
requested_clients.extend(allowed_clients) requested_clients.extend(allowed_clients)
else: elif client not in allowed_clients:
self.report_warning(f'Skipping unsupported client {client}') self.report_warning(f'Skipping unsupported client {client}')
elif client.startswith('android'):
android_clients.append(client)
else:
requested_clients.append(client)
# Force deprioritization of broken Android clients for format de-duplication
requested_clients.extend(android_clients)
if not requested_clients: if not requested_clients:
requested_clients = default requested_clients = default
@ -3852,6 +3914,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
# Android client formats are broken due to integrity check enforcement
# Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
is_broken = client_name and client_name.startswith(short_client_name('android'))
if is_broken:
self.report_warning(
f'{video_id}: Android client formats are broken and may yield HTTP Error 403. '
'They will be deprioritized', only_once=True)
name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
fps = int_or_none(fmt.get('fps')) or 0 fps = int_or_none(fmt.get('fps')) or 0
dct = { dct = {
@ -3864,7 +3934,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
name, fmt.get('isDrc') and 'DRC', name, fmt.get('isDrc') and 'DRC',
try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
throttled and 'THROTTLED', is_damaged and 'DAMAGED', throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
(self.get_param('verbose') or all_formats) and client_name, (self.get_param('verbose') or all_formats) and client_name,
delim=', '), delim=', '),
# Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
@ -3882,8 +3952,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'language': join_nonempty(audio_track.get('id', '').split('.')[0], 'language': join_nonempty(audio_track.get('id', '').split('.')[0],
'desc' if language_preference < -1 else '') or None, 'desc' if language_preference < -1 else '') or None,
'language_preference': language_preference, 'language_preference': language_preference,
# Strictly de-prioritize damaged and 3gp formats # Strictly de-prioritize broken, damaged and 3gp formats
'preference': -10 if is_damaged else -2 if itag == '17' else None, 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
} }
mime_mobj = re.match( mime_mobj = re.match(
r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')

View file

@ -21,7 +21,7 @@ from .exceptions import (
TransportError, TransportError,
) )
from .impersonate import ImpersonateRequestHandler, ImpersonateTarget from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
from ..dependencies import curl_cffi from ..dependencies import curl_cffi, certifi
from ..utils import int_or_none from ..utils import int_or_none
if curl_cffi is None: if curl_cffi is None:
@ -132,6 +132,16 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
extensions.pop('cookiejar', None) extensions.pop('cookiejar', None)
extensions.pop('timeout', None) extensions.pop('timeout', None)
def send(self, request: Request) -> Response:
target = self._get_request_target(request)
try:
response = super().send(request)
except HTTPError as e:
e.response.extensions['impersonate'] = target
raise
response.extensions['impersonate'] = target
return response
def _send(self, request: Request): def _send(self, request: Request):
max_redirects_exceeded = False max_redirects_exceeded = False
session: curl_cffi.requests.Session = self._get_instance( session: curl_cffi.requests.Session = self._get_instance(
@ -156,6 +166,13 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
# See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1) session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)
# curl_cffi does not currently set these for proxies
session.curl.setopt(CurlOpt.PROXY_CAINFO, certifi.where())
if not self.verify:
session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYPEER, 0)
session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYHOST, 0)
headers = self._get_impersonate_headers(request) headers = self._get_impersonate_headers(request)
if self._client_cert: if self._client_cert:
@ -203,7 +220,10 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
max_redirects_exceeded = True max_redirects_exceeded = True
curl_response = e.response curl_response = e.response
elif e.code == CurlECode.PROXY: elif (
e.code == CurlECode.PROXY
or (e.code == CurlECode.RECV_ERROR and 'Received HTTP code 407 from proxy after CONNECT' in str(e))
):
raise ProxyError(cause=e) from e raise ProxyError(cause=e) from e
else: else:
raise TransportError(cause=e) from e raise TransportError(cause=e) from e

View file

@ -31,6 +31,8 @@ from ..utils import (
) )
from ..utils.networking import HTTPHeaderDict, normalize_url from ..utils.networking import HTTPHeaderDict, normalize_url
DEFAULT_TIMEOUT = 20
def register_preference(*handlers: type[RequestHandler]): def register_preference(*handlers: type[RequestHandler]):
assert all(issubclass(handler, RequestHandler) for handler in handlers) assert all(issubclass(handler, RequestHandler) for handler in handlers)
@ -235,7 +237,7 @@ class RequestHandler(abc.ABC):
self._logger = logger self._logger = logger
self.headers = headers or {} self.headers = headers or {}
self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar() self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar()
self.timeout = float(timeout or 20) self.timeout = float(timeout or DEFAULT_TIMEOUT)
self.proxies = proxies or {} self.proxies = proxies or {}
self.source_address = source_address self.source_address = source_address
self.verbose = verbose self.verbose = verbose
@ -497,6 +499,7 @@ class Response(io.IOBase):
@param headers: response headers. @param headers: response headers.
@param status: Response HTTP status code. Default is 200 OK. @param status: Response HTTP status code. Default is 200 OK.
@param reason: HTTP status reason. Will use built-in reasons based on status code if not provided. @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
@param extensions: Dictionary of handler-specific response extensions.
""" """
def __init__( def __init__(
@ -505,7 +508,9 @@ class Response(io.IOBase):
url: str, url: str,
headers: Mapping[str, str], headers: Mapping[str, str],
status: int = 200, status: int = 200,
reason: str = None): reason: str = None,
extensions: dict = None
):
self.fp = fp self.fp = fp
self.headers = Message() self.headers = Message()
@ -517,6 +522,7 @@ class Response(io.IOBase):
self.reason = reason or HTTPStatus(status).phrase self.reason = reason or HTTPStatus(status).phrase
except ValueError: except ValueError:
self.reason = None self.reason = None
self.extensions = extensions or {}
def readable(self): def readable(self):
return self.fp.readable() return self.fp.readable()

View file

@ -69,6 +69,10 @@ def _get_variant_and_executable_path():
# Ref: https://en.wikipedia.org/wiki/Uname#Examples # Ref: https://en.wikipedia.org/wiki/Uname#Examples
if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'): if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'):
machine = '_x86' if platform.architecture()[0][:2] == '32' else '' machine = '_x86' if platform.architecture()[0][:2] == '32' else ''
# sys.executable returns a /tmp/ path for staticx builds (linux_static)
# Ref: https://staticx.readthedocs.io/en/latest/usage.html#run-time-information
if static_exe_path := os.getenv('STATICX_PROG_PATH'):
path = static_exe_path
return f'{remove_end(sys.platform, "32")}{machine}_exe', path return f'{remove_end(sys.platform, "32")}{machine}_exe', path
path = os.path.dirname(__file__) path = os.path.dirname(__file__)

View file

@ -1638,16 +1638,14 @@ def get_filesystem_encoding():
return encoding if encoding is not None else 'utf-8' return encoding if encoding is not None else 'utf-8'
_WINDOWS_QUOTE_TRANS = str.maketrans({'"': '\\"', '\\': '\\\\'}) _WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
_CMD_QUOTE_TRANS = str.maketrans({ _CMD_QUOTE_TRANS = str.maketrans({
# Keep quotes balanced by replacing them with `""` instead of `\\"` # Keep quotes balanced by replacing them with `""` instead of `\\"`
'"': '""', '"': '""',
# Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`) # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
# `=` should be unique since variables containing `=` cannot be set using cmd # `=` should be unique since variables containing `=` cannot be set using cmd
'\n': '%=%', '\n': '%=%',
# While we are only required to escape backslashes immediately before quotes, '\r': '%=%',
# we instead escape all of 'em anyways to be consistent
'\\': '\\\\',
# Use zero length variable replacement so `%` doesn't get expanded # Use zero length variable replacement so `%` doesn't get expanded
# `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`) # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
'%': '%%cd:~,%', '%': '%%cd:~,%',
@ -1656,19 +1654,14 @@ _CMD_QUOTE_TRANS = str.maketrans({
def shell_quote(args, *, shell=False): def shell_quote(args, *, shell=False):
args = list(variadic(args)) args = list(variadic(args))
if any(isinstance(item, bytes) for item in args):
deprecation_warning('Passing bytes to utils.shell_quote is deprecated')
encoding = get_filesystem_encoding()
for index, item in enumerate(args):
if isinstance(item, bytes):
args[index] = item.decode(encoding)
if compat_os_name != 'nt': if compat_os_name != 'nt':
return shlex.join(args) return shlex.join(args)
trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
return ' '.join( return ' '.join(
s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) else s.translate(trans).join('""') s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII)
else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""')
for s in args) for s in args)