diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 9b8207b28b..4b69642603 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -707,3 +707,9 @@ Sakura286 SamDecrock stratus-ss subrat-lima +gitninja1234 +jkruse +xiaomac +wesson09 +Crypto90 +MutantPiggieGolem1 diff --git a/Changelog.md b/Changelog.md index 4dc0323683..22a9a6e4bb 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,85 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.12.23 + +#### Core changes +- [Don't sanitize filename on Unix when `--no-windows-filenames`](https://github.com/yt-dlp/yt-dlp/commit/6fc85f617a5850307fd5b258477070e6ee177796) ([#9591](https://github.com/yt-dlp/yt-dlp/issues/9591)) by [pukkandan](https://github.com/pukkandan) +- **update** + - [Check 64-bitness when upgrading ARM builds](https://github.com/yt-dlp/yt-dlp/commit/b91c3925c2059970daa801cb131c0c2f4f302e72) ([#11819](https://github.com/yt-dlp/yt-dlp/issues/11819)) by [bashonly](https://github.com/bashonly) + - [Fix endless update loop for `linux_exe` builds](https://github.com/yt-dlp/yt-dlp/commit/3d3ee458c1fe49dd5ebd7651a092119d23eb7000) ([#11827](https://github.com/yt-dlp/yt-dlp/issues/11827)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **soundcloud**: [Various fixes](https://github.com/yt-dlp/yt-dlp/commit/d298693b1b266d198e8eeecb90ea17c4a031268f) ([#11820](https://github.com/yt-dlp/yt-dlp/issues/11820)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add age-gate workaround for some embeddable videos](https://github.com/yt-dlp/yt-dlp/commit/09a6c687126f04e243fcb105a828787efddd1030) ([#11821](https://github.com/yt-dlp/yt-dlp/issues/11821)) by [bashonly](https://github.com/bashonly) + - [Fix `uploader_id` extraction](https://github.com/yt-dlp/yt-dlp/commit/1a8851b689763e5173b96f70f8a71df0e4a44b66) ([#11818](https://github.com/yt-dlp/yt-dlp/issues/11818)) by [bashonly](https://github.com/bashonly) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/65cf46cddd873fd229dbb0fc0689bca4c201c6b6) ([#11893](https://github.com/yt-dlp/yt-dlp/issues/11893)) by [bashonly](https://github.com/bashonly) + - [Skip iOS formats that require PO Token](https://github.com/yt-dlp/yt-dlp/commit/9f42e68a74f3f00b0253fe70763abd57cac4237b) ([#11890](https://github.com/yt-dlp/yt-dlp/issues/11890)) by [coletdjnz](https://github.com/coletdjnz) + +### 2024.12.13 + +#### Extractor changes +- **patreon**: campaign: [Support /c/ URLs](https://github.com/yt-dlp/yt-dlp/commit/bc262bcad4d3683ceadf61a7eb87e233e72adef3) ([#11756](https://github.com/yt-dlp/yt-dlp/issues/11756)) by [bashonly](https://github.com/bashonly) +- **soundcloud**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/f4d3e9e6dc25077b79849a31a2f67f93fdc01e62) ([#11777](https://github.com/yt-dlp/yt-dlp/issues/11777)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Fix `release_date` extraction](https://github.com/yt-dlp/yt-dlp/commit/d5e2a379f2adcb28bc48c7d9e90716d7278f89d2) ([#11759](https://github.com/yt-dlp/yt-dlp/issues/11759)) by [MutantPiggieGolem1](https://github.com/MutantPiggieGolem1) + - [Fix signature function extraction for `2f1832d2`](https://github.com/yt-dlp/yt-dlp/commit/5460cd91891bf613a2065e2fc278d9903c37a127) ([#11801](https://github.com/yt-dlp/yt-dlp/issues/11801)) by [bashonly](https://github.com/bashonly) + - [Prioritize original language over auto-dubbed audio](https://github.com/yt-dlp/yt-dlp/commit/dc3c4fddcc653989dae71fc563d82a308fc898cc) ([#11803](https://github.com/yt-dlp/yt-dlp/issues/11803)) by [bashonly](https://github.com/bashonly) + - search_url: [Fix playlist searches](https://github.com/yt-dlp/yt-dlp/commit/f6c73aad5f1a67544bea137ebd9d1e22e0e56567) ([#11782](https://github.com/yt-dlp/yt-dlp/issues/11782)) by [Crypto90](https://github.com/Crypto90) + +#### Misc. changes +- **cleanup**: [Make more playlist entries lazy](https://github.com/yt-dlp/yt-dlp/commit/54216696261bc07cacd9a837c501d9e0b7fed09e) ([#11763](https://github.com/yt-dlp/yt-dlp/issues/11763)) by [seproDev](https://github.com/seproDev) + +### 2024.12.06 + +#### Core changes +- **cookies**: [Add `--cookies-from-browser` support for MS Store Firefox](https://github.com/yt-dlp/yt-dlp/commit/354cb4026cf2191e1a130ec2a627b95cabfbc60a) ([#11731](https://github.com/yt-dlp/yt-dlp/issues/11731)) by [wesson09](https://github.com/wesson09) + +#### Extractor changes +- **bilibili**: [Fix HD formats extraction](https://github.com/yt-dlp/yt-dlp/commit/fca3eb5f8be08d5fab2e18b45b7281a12e566725) ([#11734](https://github.com/yt-dlp/yt-dlp/issues/11734)) by [grqz](https://github.com/grqz) +- **soundcloud**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/2feb28028ee48f2185d2d95076e62accb09b9e2e) ([#11742](https://github.com/yt-dlp/yt-dlp/issues/11742)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Fix `n` sig extraction for player `3bb1f723`](https://github.com/yt-dlp/yt-dlp/commit/a95ee6d8803fca9157adecf63732ab58bf87fd88) ([#11750](https://github.com/yt-dlp/yt-dlp/issues/11750)) by [bashonly](https://github.com/bashonly) (With fixes in [4bd2655](https://github.com/yt-dlp/yt-dlp/commit/4bd2655398aed450456197a6767639114a24eac2)) + - [Fix signature function extraction](https://github.com/yt-dlp/yt-dlp/commit/4c85ccd1366c88cf93982f8350f58eed17355981) ([#11751](https://github.com/yt-dlp/yt-dlp/issues/11751)) by [bashonly](https://github.com/bashonly) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/2e49c789d3eebc39af8910705d65a98bca0e4c4f) ([#11724](https://github.com/yt-dlp/yt-dlp/issues/11724)) by [bashonly](https://github.com/bashonly) + +### 2024.12.03 + +#### Core changes +- [Add `playlist_webpage_url` field](https://github.com/yt-dlp/yt-dlp/commit/7d6c259a03bc4707a319e5e8c6eff0278707874b) ([#11613](https://github.com/yt-dlp/yt-dlp/issues/11613)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- [Handle fragmented formats in `_remove_duplicate_formats`](https://github.com/yt-dlp/yt-dlp/commit/e0500cbf796323551bbabe5b8ed8c75a511ba47a) ([#11637](https://github.com/yt-dlp/yt-dlp/issues/11637)) by [Grub4K](https://github.com/Grub4K) +- **bilibili** + - [Always try to extract HD formats](https://github.com/yt-dlp/yt-dlp/commit/dc1687648077c5bf64863b307ecc5ab7e029bd8d) ([#10559](https://github.com/yt-dlp/yt-dlp/issues/10559)) by [grqz](https://github.com/grqz) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/239f5f36fe04603bec59c8b975f6a792f10246db) ([#11667](https://github.com/yt-dlp/yt-dlp/issues/11667)) by [grqz](https://github.com/grqz) (With fixes in [f05a1cd](https://github.com/yt-dlp/yt-dlp/commit/f05a1cd1492fc98dc8d80d2081d632a1879913d2) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz)) + - [Fix subtitles and chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a13a336aa6f906812701abec8101b73b73db8ff7) ([#11708](https://github.com/yt-dlp/yt-dlp/issues/11708)) by [xiaomac](https://github.com/xiaomac) +- **chaturbate**: [Fix support for non-public streams](https://github.com/yt-dlp/yt-dlp/commit/4b5eec0aaa7c02627f27a386591b735b90e681a8) ([#11624](https://github.com/yt-dlp/yt-dlp/issues/11624)) by [jkruse](https://github.com/jkruse) +- **dacast**: [Fix HLS AES formats extraction](https://github.com/yt-dlp/yt-dlp/commit/0a0d80800b9350d1a4c4b18d82cfb77ffbc3c507) ([#11644](https://github.com/yt-dlp/yt-dlp/issues/11644)) by [bashonly](https://github.com/bashonly) +- **dropbox**: [Fix password-protected video extraction](https://github.com/yt-dlp/yt-dlp/commit/00dcde728635633eee969ad4d498b9f233c4a94e) ([#11636](https://github.com/yt-dlp/yt-dlp/issues/11636)) by [bashonly](https://github.com/bashonly) +- **duoplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/62cba8a1bedbfc0ddde7267ae57b72bf5f7ea7b1) ([#11588](https://github.com/yt-dlp/yt-dlp/issues/11588)) by [bashonly](https://github.com/bashonly), [glensc](https://github.com/glensc) +- **facebook**: [Support more groups URLs](https://github.com/yt-dlp/yt-dlp/commit/e0f1ae813b36e783e2348ba2a1566e12f5cd8f6e) ([#11576](https://github.com/yt-dlp/yt-dlp/issues/11576)) by [grqz](https://github.com/grqz) +- **instagram**: [Support `share` URLs](https://github.com/yt-dlp/yt-dlp/commit/360aed810ad85db950df586282d256516c98cd2d) ([#11677](https://github.com/yt-dlp/yt-dlp/issues/11677)) by [grqz](https://github.com/grqz) +- **microsoftembed**: [Make format extraction non fatal](https://github.com/yt-dlp/yt-dlp/commit/2bea7936323ca4b6f3b9b1fdd892566223e30efa) ([#11654](https://github.com/yt-dlp/yt-dlp/issues/11654)) by [seproDev](https://github.com/seproDev) +- **mitele**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/cd0f934604587ed793e9177f6a127e5dcf99a7dd) ([#11683](https://github.com/yt-dlp/yt-dlp/issues/11683)) by [DarkZeros](https://github.com/DarkZeros) +- **stripchat**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/16336c51d0848a6868a4fa04e749fa03548b4913) ([#11596](https://github.com/yt-dlp/yt-dlp/issues/11596)) by [gitninja1234](https://github.com/gitninja1234) +- **tiktok**: [Deprioritize animated thumbnails](https://github.com/yt-dlp/yt-dlp/commit/910ecc422930bca14e2abe4986f5f92359e3cea8) ([#11645](https://github.com/yt-dlp/yt-dlp/issues/11645)) by [bashonly](https://github.com/bashonly) +- **vk**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/c038a7b187ba24360f14134842a7a2cf897c33b1) ([#11715](https://github.com/yt-dlp/yt-dlp/issues/11715)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Adjust player clients for site changes](https://github.com/yt-dlp/yt-dlp/commit/0d146c1e36f467af30e87b7af651bdee67b73500) ([#11663](https://github.com/yt-dlp/yt-dlp/issues/11663)) by [bashonly](https://github.com/bashonly) + - tab: [Fix playlists tab extraction](https://github.com/yt-dlp/yt-dlp/commit/fe70f20aedf528fdee332131bc9b6710e54e6f10) ([#11615](https://github.com/yt-dlp/yt-dlp/issues/11615)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- **Request Handler**: websockets: [Support websockets 14.0+](https://github.com/yt-dlp/yt-dlp/commit/c7316373c0a886f65a07a51e50ee147bb3294c85) ([#11616](https://github.com/yt-dlp/yt-dlp/issues/11616)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **cleanup** + - [Bump ruff to 0.8.x](https://github.com/yt-dlp/yt-dlp/commit/d8fb3490863653182864d2a53522f350d67a9ff8) ([#11608](https://github.com/yt-dlp/yt-dlp/issues/11608)) by [seproDev](https://github.com/seproDev) + - Miscellaneous + - [ccf0a6b](https://github.com/yt-dlp/yt-dlp/commit/ccf0a6b86b7f68a75463804fe485ec240b8635f0) by [bashonly](https://github.com/bashonly), [pzhlkj6612](https://github.com/pzhlkj6612) + - [2b67ac3](https://github.com/yt-dlp/yt-dlp/commit/2b67ac300ac8b44368fb121637d1743cea8c5b6b) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + ### 2024.11.18 #### Important changes diff --git a/README.md b/README.md index 062ea6608f..895be8fb4f 100644 --- a/README.md +++ b/README.md @@ -617,8 +617,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-restrict-filenames Allow Unicode characters, "&" and spaces in filenames (default) --windows-filenames Force filenames to be Windows-compatible - --no-windows-filenames Make filenames Windows-compatible only if - using Windows (default) + --no-windows-filenames Sanitize filenames only minimally --trim-filenames LENGTH Limit the filename length (excluding extension) to the specified number of characters @@ -1780,7 +1779,7 @@ The following extractors use this feature: * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total -* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) +* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning @@ -1864,7 +1863,7 @@ The following extractors use this feature: * `cdn`: One or more CDN IDs to use with the API call for stream URLs, e.g. `gcp_cdn`, `gs_cdn_pc_app`, `gs_cdn_mobile_web`, `gs_cdn_pc_web` #### soundcloud -* `formats`: Formats to request from the API. Requested values should be in the format of `{protocol}_{extension}` (omitting the bitrate), e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can be passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known extensions include `aac`, `opus` and `mp3`. Original `download` formats are always extracted. Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` +* `formats`: Formats to request from the API. Requested values should be in the format of `{protocol}_{codec}`, e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can be passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known codecs include `aac`, `opus` and `mp3`. Original `download` formats are always extracted. Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` #### orfon (orf:on) * `prefer_segments_playlist`: Prefer a playlist of program segments instead of a single complete video when available. If individual segments are desired, use `--concat-playlist never --extractor-args "orfon:prefer_segments_playlist"` diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 966d27a498..6b022a7eaa 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -761,6 +761,13 @@ class TestYoutubeDL(unittest.TestCase): test('%(width)06d.%%(ext)s', 'NA.%(ext)s') test('%%(width)06d.%(ext)s', '%(width)06d.mp4') + # Sanitization options + test('%(title3)s', (None, 'foo⧸bar⧹test')) + test('%(title5)s', (None, 'aei_A'), restrictfilenames=True) + test('%(title3)s', (None, 'foo_bar_test'), windowsfilenames=False, restrictfilenames=True) + if sys.platform != 'win32': + test('%(title3)s', (None, 'foo⧸bar\\test'), windowsfilenames=False) + # ID sanitization test('%(id)s', '_abcd', info={'id': '_abcd'}) test('%(some_id)s', '_abcd', info={'some_id': '_abcd'}) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 0f7ae34f44..13436f0884 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -68,6 +68,16 @@ _SIG_TESTS = [ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ), + ( + 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), + ( + 'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', + ), ] _NSIG_TESTS = [ @@ -183,6 +193,14 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', ), + ( + 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', + 'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ', + ), + ( + 'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', + 'YWt1qdbe8SAfkoPHW5d', 'RrRjWQOJmBiP', + ), ] @@ -254,8 +272,11 @@ def signature(jscode, sig_input): def n_sig(jscode, sig_input): - funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) - return JSInterpreter(jscode).call_function(funcname, sig_input) + ie = YoutubeIE(FakeYDL()) + funcname = ie._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname))) + return func([sig_input]) make_sig_test = t_factory( diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index ab58c15014..ea14893acd 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -267,7 +267,9 @@ class YoutubeDL: outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names trim_file_name: Limit length of filename (extension excluded) - windowsfilenames: Force the filenames to be windows compatible + windowsfilenames: True: Force filenames to be Windows compatible + False: Sanitize filenames only minimally + This option has no effect when running on Windows ignoreerrors: Do not stop on download/postprocessing errors. Can be 'only_download' to ignore only download errors. Default is 'only_download' for CLI, but False for API @@ -1193,8 +1195,7 @@ class YoutubeDL: def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict - @param sanitize Whether to sanitize the output as a filename. - For backward compatibility, a function can also be passed + @param sanitize Whether to sanitize the output as a filename """ info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set @@ -1310,14 +1311,23 @@ class YoutubeDL: na = self.params.get('outtmpl_na_placeholder', 'NA') - def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + def filename_sanitizer(key, value, restricted): return sanitize_filename(str(value), restricted=restricted, is_id=( bool(re.search(r'(^|[_.])id(\.|$)', key)) if 'filename-sanitization' in self.params['compat_opts'] else NO_DEFAULT)) - sanitizer = sanitize if callable(sanitize) else filename_sanitizer - sanitize = bool(sanitize) + if callable(sanitize): + self.deprecation_warning('Passing a callable "sanitize" to YoutubeDL.prepare_outtmpl is deprecated') + elif not sanitize: + pass + elif (sys.platform != 'win32' and not self.params.get('restrictfilenames') + and self.params.get('windowsfilenames') is False): + def sanitize(key, value): + return str(value).replace('/', '\u29F8').replace('\0', '') + else: + def sanitize(key, value): + return filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')) def _dumpjson_default(obj): if isinstance(obj, (set, LazyList)): @@ -1400,13 +1410,13 @@ class YoutubeDL: if sanitize: # If value is an object, sanitize might convert it to a string - # So we convert it to repr first + # So we manually convert it before sanitizing if fmt[-1] == 'r': value, fmt = repr(value), str_fmt elif fmt[-1] == 'a': value, fmt = ascii(value), str_fmt if fmt[-1] in 'csra': - value = sanitizer(last_field, value) + value = sanitize(last_field, value) key = '{}\0{}'.format(key.replace('%', '%\0'), outer_mobj.group('format')) TMPL_DICT[key] = value diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 286f41f509..bbe95473c3 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -231,9 +231,11 @@ def validate_options(opts): elif value in ('inf', 'infinite'): return float('inf') try: - return int(value) + int_value = int(value) except (TypeError, ValueError): validate(False, f'{name} retry count', value) + validate_positive(f'{name} retry count', int_value) + return int_value def parse_range_with_arg(name, arg_name, value, parse_limits=parse_duration, parse_arg=parse_retries): @@ -253,7 +255,6 @@ def validate_options(opts): if opts.wait_for_video is not None: min_wait, max_wait, wait_retries = parse_range_with_arg( 'time range to wait for video', 'waiting', opts.wait_for_video) - validate_positive('waiting retry count', wait_retries) opts.wait_for_video = (min_wait, max_wait, wait_retries) # Format sort diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 772433b0f2..fad323c901 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -195,7 +195,10 @@ def _extract_firefox_cookies(profile, container, logger): def _firefox_browser_dirs(): if sys.platform in ('cygwin', 'win32'): - yield os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') + yield from map(os.path.expandvars, ( + R'%APPDATA%\Mozilla\Firefox\Profiles', + R'%LOCALAPPDATA%\Packages\Mozilla.Firefox_n80bbvh6b1yt2\LocalCache\Roaming\Mozilla\Firefox\Profiles', + )) elif sys.platform == 'darwin': yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles') diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 967010826e..bbd6d21bd7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1551,6 +1551,7 @@ from .pluralsight import ( PluralsightIE, ) from .plutotv import PlutoTVIE +from .plvideo import PlVideoIE from .podbayfm import ( PodbayFMChannelIE, PodbayFMIE, diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index 919e1d6af5..7dff40556b 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -232,7 +232,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' error = self._parse_json(e.cause.response.read(), video_id) message = error.get('message') - if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': + if e.cause.status == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) raise ExtractorError(message) else: diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 91619d9d5c..2db951a608 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -681,12 +681,6 @@ class BiliBiliIE(BilibiliBaseIE): old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') - play_info = ( - traverse_obj( - self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None), - ('data', {dict})) - or self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1})) - festival_info = {} if is_festival: festival_info = traverse_obj(initial_state, { @@ -724,6 +718,13 @@ class BiliBiliIE(BilibiliBaseIE): duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})), __post_extractor=self.extract_comments(aid)) + play_info = None + if self.is_logged_in: + play_info = traverse_obj( + self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None), + ('data', {dict})) + if not play_info: + play_info = self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1}) formats = self.extract_formats(play_info) if video_data.get('is_upower_exclusive'): diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 2526f25dac..3ada1fd5de 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -31,6 +31,7 @@ from ..utils import ( update_url_query, url_or_none, ) +from ..utils.traversal import traverse_obj class BrightcoveLegacyIE(InfoExtractor): @@ -935,8 +936,8 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): if content_type == 'playlist': return self.playlist_result( - [self._parse_brightcove_metadata(vid, vid.get('id'), headers) - for vid in json_data.get('videos', []) if vid.get('id')], + (self._parse_brightcove_metadata(vid, vid['id'], headers) + for vid in traverse_obj(json_data, ('videos', lambda _, v: v['id']))), json_data.get('id'), json_data.get('name'), json_data.get('description')) diff --git a/yt_dlp/extractor/cultureunplugged.py b/yt_dlp/extractor/cultureunplugged.py index 8e6579c355..c7ccd27479 100644 --- a/yt_dlp/extractor/cultureunplugged.py +++ b/yt_dlp/extractor/cultureunplugged.py @@ -1,7 +1,4 @@ -import time - from .common import InfoExtractor -from ..networking import HEADRequest from ..utils import int_or_none @@ -31,9 +28,6 @@ class CultureUnpluggedIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request - self._request_webpage(HEADRequest( - 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( f'http://www.cultureunplugged.com/movie-data/cu-{video_id}.json', display_id) diff --git a/yt_dlp/extractor/dvtv.py b/yt_dlp/extractor/dvtv.py index 3e442b339b..52d67d2bd0 100644 --- a/yt_dlp/extractor/dvtv.py +++ b/yt_dlp/extractor/dvtv.py @@ -162,7 +162,7 @@ class DVTVIE(InfoExtractor): items = re.findall(r'(?s)playlist\.push\(({.+?})\);', webpage) if items: return self.playlist_result( - [self._parse_video_metadata(i, video_id, timestamp) for i in items], + (self._parse_video_metadata(i, video_id, timestamp) for i in items), video_id, self._html_search_meta('twitter:title', webpage)) item = self._search_regex( diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index 9ef57410ac..a97add71a4 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -343,7 +343,7 @@ class NYTimesCookingIE(NYTimesBaseIE): if media_ids: media_ids.append(lead_video_id) return self.playlist_result( - [self._extract_video(media_id) for media_id in media_ids], page_id, title, description) + map(self._extract_video, media_ids), page_id, title, description) return { **self._extract_video(lead_video_id), diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 6bdeaf1571..a0e831a5ce 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -457,7 +457,7 @@ class PatreonCampaignIE(PatreonBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?patreon\.com/(?: (?:m|api/campaigns)/(?P\d+)| - (?P(?!creation[?/]|posts/|rss[?/])[\w-]+) + (?:c/)?(?P(?!creation[?/]|posts/|rss[?/])[\w-]+) )(?:/posts)?/?(?:$|[?#])''' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', @@ -509,6 +509,26 @@ class PatreonCampaignIE(PatreonBaseIE): 'thumbnail': r're:^https?://.*$', }, 'playlist_mincount': 201, + }, { + 'url': 'https://www.patreon.com/c/OgSog', + 'info_dict': { + 'id': '8504388', + 'title': 'OGSoG', + 'description': r're:(?s)Hello and welcome to our Patreon page. We are Mari, Lasercorn, .+', + 'channel': 'OGSoG', + 'channel_id': '8504388', + 'channel_url': 'https://www.patreon.com/OgSog', + 'uploader_url': 'https://www.patreon.com/OgSog', + 'uploader_id': '72323575', + 'uploader': 'David Moss', + 'thumbnail': r're:https?://.+/.+', + 'channel_follower_count': int, + 'age_limit': 0, + }, + 'playlist_mincount': 331, + }, { + 'url': 'https://www.patreon.com/c/OgSog/posts', + 'only_matching': True, }, { 'url': 'https://www.patreon.com/dissonancepod/posts', 'only_matching': True, diff --git a/yt_dlp/extractor/pixivsketch.py b/yt_dlp/extractor/pixivsketch.py index 344cdb3d05..50b7af5354 100644 --- a/yt_dlp/extractor/pixivsketch.py +++ b/yt_dlp/extractor/pixivsketch.py @@ -1,4 +1,5 @@ from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, traverse_obj, @@ -110,8 +111,8 @@ class PixivSketchUserIE(PixivSketchBaseIE): if not traverse_obj(data, 'is_broadcasting'): try: self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure') - except ExtractorError as ex: - if ex.cause and ex.cause.code == 401: + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies') raise ExtractorError('This user is offline', expected=True) diff --git a/yt_dlp/extractor/plvideo.py b/yt_dlp/extractor/plvideo.py new file mode 100644 index 0000000000..9351af10ae --- /dev/null +++ b/yt_dlp/extractor/plvideo.py @@ -0,0 +1,130 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + parse_resolution, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class PlVideoIE(InfoExtractor): + IE_DESC = 'Платформа' + _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/(?:watch\?(?:[^#]+&)?v=|shorts/)(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://plvideo.ru/watch?v=Y5JzUzkcQTMK', + 'md5': 'fe8e18aca892b3b31f3bf492169f8a26', + 'info_dict': { + 'id': 'Y5JzUzkcQTMK', + 'ext': 'mp4', + 'thumbnail': 'https://img.plvideo.ru/images/fp-2024-images/v/cover/37/dd/37dd00a4c96c77436ab737e85947abd7/original663a4a3bb713e5.33151959.jpg', + 'title': 'Presidente de Cuba llega a Moscú en una visita de trabajo', + 'channel': 'RT en Español', + 'channel_id': 'ZH4EKqunVDvo', + 'media_type': 'video', + 'comment_count': int, + 'tags': ['rusia', 'cuba', 'russia', 'miguel díaz-canel'], + 'description': 'md5:a1a395d900d77a86542a91ee0826c115', + 'released_timestamp': 1715096124, + 'channel_is_verified': True, + 'like_count': int, + 'timestamp': 1715095911, + 'duration': 44320, + 'view_count': int, + 'dislike_count': int, + 'upload_date': '20240507', + 'modified_date': '20240701', + 'channel_follower_count': int, + 'modified_timestamp': 1719824073, + }, + }, { + 'url': 'https://plvideo.ru/shorts/S3Uo9c-VLwFX', + 'md5': '7d8fa2279406c69d2fd2a6fc548a9805', + 'info_dict': { + 'id': 'S3Uo9c-VLwFX', + 'ext': 'mp4', + 'channel': 'Romaatom', + 'tags': 'count:22', + 'dislike_count': int, + 'upload_date': '20241130', + 'description': 'md5:452e6de219bf2f32bb95806c51c3b364', + 'duration': 58433, + 'modified_date': '20241130', + 'thumbnail': 'https://img.plvideo.ru/images/fp-2024-11-cover/S3Uo9c-VLwFX/f9318999-a941-482b-b700-2102a7049366.jpg', + 'media_type': 'shorts', + 'like_count': int, + 'modified_timestamp': 1732961458, + 'channel_is_verified': True, + 'channel_id': 'erJyyTIbmUd1', + 'timestamp': 1732961355, + 'comment_count': int, + 'title': 'Белоусов отменил приказы о кадровом резерве на гражданской службе', + 'channel_follower_count': int, + 'view_count': int, + 'released_timestamp': 1732961458, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._download_json( + f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18', video_id) + + is_live = False + formats = [] + subtitles = {} + automatic_captions = {} + for quality, data in traverse_obj(video_data, ('item', 'profiles', {dict.items}, lambda _, v: url_or_none(v[1]['hls']))): + formats.append({ + 'format_id': quality, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + **traverse_obj(data, { + 'url': 'hls', + 'fps': ('fps', {float_or_none}), + 'aspect_ratio': ('aspectRatio', {float_or_none}), + }), + **parse_resolution(quality), + }) + if livestream_url := traverse_obj(video_data, ('item', 'livestream', 'url', {url_or_none})): + is_live = True + formats.extend(self._extract_m3u8_formats(livestream_url, video_id, 'mp4', live=True)) + for lang, url in traverse_obj(video_data, ('item', 'subtitles', {dict.items}, lambda _, v: url_or_none(v[1]))): + if lang.endswith('-auto'): + automatic_captions.setdefault(lang[:-5], []).append({ + 'url': url, + }) + else: + subtitles.setdefault(lang, []).append({ + 'url': url, + }) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + 'is_live': is_live, + **traverse_obj(video_data, ('item', { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('cover', 'paths', 'original', 'src', {url_or_none}), + 'duration': ('uploadFile', 'videoDuration', {int_or_none}), + 'channel': ('channel', 'name', {str}), + 'channel_id': ('channel', 'id', {str}), + 'channel_follower_count': ('channel', 'stats', 'subscribers', {int_or_none}), + 'channel_is_verified': ('channel', 'verified', {bool}), + 'tags': ('tags', ..., {str}), + 'timestamp': ('createdAt', {parse_iso8601}), + 'released_timestamp': ('publishedAt', {parse_iso8601}), + 'modified_timestamp': ('updatedAt', {parse_iso8601}), + 'view_count': ('stats', 'viewTotalCount', {int_or_none}), + 'like_count': ('stats', 'likeCount', {int_or_none}), + 'dislike_count': ('stats', 'dislikeCount', {int_or_none}), + 'comment_count': ('stats', 'commentCount', {int_or_none}), + 'media_type': ('type', {str}), + })), + } diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 03089e98ea..85779e91ab 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -7,7 +7,6 @@ from .common import InfoExtractor, SearchInfoExtractor from ..networking import HEADRequest from ..networking.exceptions import HTTPError from ..utils import ( - KNOWN_EXTENSIONS, ExtractorError, float_or_none, int_or_none, @@ -211,6 +210,7 @@ class SoundcloudBaseIE(InfoExtractor): format_urls = set() formats = [] + has_drm = False query = {'client_id': self._CLIENT_ID} if secret_token: query['secret_token'] = secret_token @@ -246,55 +246,24 @@ class SoundcloudBaseIE(InfoExtractor): 'url': format_url, 'quality': 10, 'format_note': 'Original', + 'vcodec': 'none', }) def invalid_url(url): return not url or url in format_urls - def add_format(f, protocol, is_preview=False): - mobj = re.search(r'\.(?P\d+)\.(?P[0-9a-z]{3,4})(?=[/?])', stream_url) - if mobj: - for k, v in mobj.groupdict().items(): - if not f.get(k): - f[k] = v - format_id_list = [] - if protocol: - format_id_list.append(protocol) - ext = f.get('ext') - if ext == 'aac': - f.update({ - 'abr': 256, - 'quality': 5, - 'format_note': 'Premium', - }) - for k in ('ext', 'abr'): - v = str_or_none(f.get(k)) - if v: - format_id_list.append(v) - preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) - if preview: - format_id_list.append('preview') - abr = f.get('abr') - if abr: - f['abr'] = int(abr) - if protocol in ('hls', 'hls-aes'): - protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' - else: - protocol = 'http' - f.update({ - 'format_id': '_'.join(format_id_list), - 'protocol': protocol, - 'preference': -10 if preview else None, - }) - formats.append(f) - # New API - for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))): + for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']) and v['preset'])): if extract_flat: break format_url = t['url'] + preset = t['preset'] + preset_base = preset.partition('_')[0] - protocol = traverse_obj(t, ('format', 'protocol', {str})) + protocol = traverse_obj(t, ('format', 'protocol', {str})) or 'http' + if protocol.startswith(('ctr-', 'cbc-')): + has_drm = True + continue if protocol == 'progressive': protocol = 'http' if protocol != 'hls' and '/hls' in format_url: @@ -302,35 +271,60 @@ class SoundcloudBaseIE(InfoExtractor): if protocol == 'encrypted-hls' or '/encrypted-hls' in format_url: protocol = 'hls-aes' - ext = None - if preset := traverse_obj(t, ('preset', {str_or_none})): - ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - ext = mimetype2ext(traverse_obj(t, ('format', 'mime_type', {str}))) - - identifier = join_nonempty(protocol, ext, delim='_') - if not self._is_requested(identifier): - self.write_debug(f'"{identifier}" is not a requested format, skipping') + short_identifier = f'{protocol}_{preset_base}' + if preset_base == 'abr': + self.write_debug(f'Skipping broken "{short_identifier}" format') + continue + if not self._is_requested(short_identifier): + self.write_debug(f'"{short_identifier}" is not a requested format, skipping') continue # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called stream_url = traverse_obj(self._call_api( - format_url, track_id, f'Downloading {identifier} format info JSON', + format_url, track_id, f'Downloading {short_identifier} format info JSON', query=query, headers=self._HEADERS), ('url', {url_or_none})) - if invalid_url(stream_url): continue format_urls.add(stream_url) - add_format({ + + mime_type = traverse_obj(t, ('format', 'mime_type', {str})) + codec = self._search_regex(r'codecs="([^"]+)"', mime_type, 'codec', default=None) + ext = { + 'mp4a': 'm4a', + 'opus': 'opus', + }.get(codec[:4] if codec else None) or mimetype2ext(mime_type, default=None) + if not ext or ext == 'm3u8': + ext = preset_base + + is_premium = t.get('quality') == 'hq' + abr = int_or_none( + self._search_regex(r'(\d+)k$', preset, 'abr', default=None) + or self._search_regex(r'\.(\d+)\.(?:opus|mp3)[/?]', stream_url, 'abr', default=None) + or (256 if (is_premium and 'aac' in preset) else None)) + + is_preview = (t.get('snipped') + or '/preview/' in format_url + or re.search(r'/(?:preview|playlist)/0/30/', stream_url)) + + formats.append({ + 'format_id': join_nonempty(protocol, preset, is_preview and 'preview', delim='_'), 'url': stream_url, 'ext': ext, - }, protocol, t.get('snipped') or '/preview/' in format_url) + 'acodec': codec, + 'vcodec': 'none', + 'abr': abr, + 'protocol': 'm3u8_native' if protocol in ('hls', 'hls-aes') else 'http', + 'container': 'm4a_dash' if ext == 'm4a' else None, + 'quality': 5 if is_premium else 0 if (abr and abr >= 160) else -1, + 'format_note': 'Premium' if is_premium else None, + 'preference': -10 if is_preview else None, + }) - for f in formats: - f['vcodec'] = 'none' - - if not formats and info.get('policy') == 'BLOCK': - self.raise_geo_restricted(metadata_available=True) + if not formats: + if has_drm: + self.report_drm(track_id) + if info.get('policy') == 'BLOCK': + self.raise_geo_restricted(metadata_available=True) user = info.get('user') or {} diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py index 2f6d1f4c51..89a89b13f1 100644 --- a/yt_dlp/extractor/vidyard.py +++ b/yt_dlp/extractor/vidyard.py @@ -421,5 +421,5 @@ class VidyardIE(VidyardBaseIE): return self._process_video_json(video_json['chapters'][0], video_id) return self.playlist_result( - [self._process_video_json(chapter, video_id) for chapter in video_json['chapters']], + (self._process_video_json(chapter, video_id) for chapter in video_json['chapters']), str(video_json['playerUuid']), video_json.get('name')) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 6ccc701a2b..4b36e41ffb 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -17,10 +17,10 @@ from ..utils import ( get_element_html_by_id, int_or_none, join_nonempty, + parse_qs, parse_resolution, str_or_none, str_to_int, - traverse_obj, try_call, unescapeHTML, unified_timestamp, @@ -29,6 +29,7 @@ from ..utils import ( urlencode_postdata, urljoin, ) +from ..utils.traversal import require, traverse_obj class VKBaseIE(InfoExtractor): @@ -91,17 +92,17 @@ class VKBaseIE(InfoExtractor): class VKIE(VKBaseIE): IE_NAME = 'vk' IE_DESC = 'VK' - _EMBED_REGEX = [r']+?src=(["\'])(?Phttps?://vk\.com/video_ext\.php.+?)\1'] + _EMBED_REGEX = [r']+?src=(["\'])(?Phttps?://vk(?:(?:video)?\.ru|\.com)/video_ext\.php.+?)\1'] _VALID_URL = r'''(?x) https?:// (?: (?: - (?:(?:m|new)\.)?vk\.com/video_| + (?:(?:m|new)\.)?vk(?:(?:video)?\.ru|\.com)/video_| (?:www\.)?daxab\.com/ ) ext\.php\?(?P.*?\boid=(?P-?\d+).*?\bid=(?P\d+).*)| (?: - (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)| + (?:(?:m|new)\.)?vk(?:(?:video)?\.ru|\.com)/(?:.+?\?.*?z=)?(?:video|clip)| (?:www\.)?daxab\.com/embed/ ) (?P-?\d+_\d+)(?:.*\blist=(?P([\da-f]+)|(ln-[\da-zA-Z]+)))? @@ -110,7 +111,7 @@ class VKIE(VKBaseIE): _TESTS = [ { - 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'url': 'https://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 'info_dict': { 'id': '-77521_162222515', 'ext': 'mp4', @@ -127,7 +128,7 @@ class VKIE(VKBaseIE): 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://vk.com/video205387401_165548505', + 'url': 'https://vk.com/video205387401_165548505', 'info_dict': { 'id': '205387401_165548505', 'ext': 'mp4', @@ -182,10 +183,10 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', - 'duration': 178, + 'duration': 179, 'upload_date': '20130117', 'uploader': "Children's Joy Foundation Inc.", - 'uploader_id': 'thecjf', + 'uploader_id': '@CJFIofficial', 'view_count': int, 'channel_id': 'UCgzCNQ11TmR9V97ECnhi3gw', 'availability': 'public', @@ -193,7 +194,7 @@ class VKIE(VKBaseIE): 'live_status': 'not_live', 'playable_in_embed': True, 'channel': 'Children\'s Joy Foundation Inc.', - 'uploader_url': 'http://www.youtube.com/user/thecjf', + 'uploader_url': 'https://www.youtube.com/@CJFIofficial', 'thumbnail': r're:https?://.+\.jpg$', 'tags': 'count:27', 'start_time': 0.0, @@ -201,6 +202,7 @@ class VKIE(VKBaseIE): 'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw', 'channel_follower_count': int, 'age_limit': 0, + 'timestamp': 1358394935, }, }, { @@ -222,6 +224,7 @@ class VKIE(VKBaseIE): 'thumbnail': r're:https?://.+x1080$', 'tags': list, }, + 'skip': 'This video has been deleted and is no longer available.', }, { 'url': 'https://vk.com/clips-74006511?z=clip-74006511_456247211', @@ -235,13 +238,13 @@ class VKIE(VKBaseIE): 'timestamp': 1664995597, 'title': 'Clip by @madempress', 'upload_date': '20221005', - 'uploader': 'Шальная императрица', + 'uploader': 'Шальная Императрица', 'uploader_id': '-74006511', }, }, { # video key is extra_data not url\d+ - 'url': 'http://vk.com/video-110305615_171782105', + 'url': 'https://vk.com/video-110305615_171782105', 'md5': 'e13fcda136f99764872e739d13fac1d1', 'info_dict': { 'id': '-110305615_171782105', @@ -273,6 +276,7 @@ class VKIE(VKBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No formats found', }, { # live stream, hls and rtmp links, most likely already finished live @@ -312,7 +316,16 @@ class VKIE(VKBaseIE): { 'url': 'https://vk.com/clip30014565_456240946', 'only_matching': True, - }] + }, + { + 'url': 'https://vkvideo.ru/video-127553155_456242961', + 'only_matching': True, + }, + { + 'url': 'https://vk.ru/video-220754053_456242564', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -338,7 +351,7 @@ class VKIE(VKBaseIE): video_id = '{}_{}'.format(mobj.group('oid'), mobj.group('id')) info_page = self._download_webpage( - 'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) + 'https://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) error_message = self._html_search_regex( [r'(?s)]+class="video_layer_message"[^>]*>(.+?)', @@ -432,7 +445,7 @@ class VKIE(VKBaseIE): if m_opts_url: opts_url = m_opts_url.group(1) if opts_url.startswith('//'): - opts_url = 'http:' + opts_url + opts_url = 'https:' + opts_url return self.url_result(opts_url) data = player['params'][0] @@ -512,8 +525,11 @@ class VKIE(VKBaseIE): class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/(?:playlist/)?(?P[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P
\w+))?|$)' - _TEMPLATE_URL = 'https://vk.com/videos' + _BASE_URL_RE = r'https?://(?:(?:m|new)\.)?vk(?:video\.ru|\.com/video)' + _VALID_URL = [ + rf'{_BASE_URL_RE}/playlist/(?P-?\d+_\d+)', + rf'{_BASE_URL_RE}/(?P@[^/?#]+)(?:/all)?/?(?!\?.*\bz=video)(?:[?#]|$)', + ] _TESTS = [{ 'url': 'https://vk.com/video/@mobidevices', 'info_dict': { @@ -527,12 +543,20 @@ class VKUserVideosIE(VKBaseIE): }, 'playlist_mincount': 182, }, { - 'url': 'https://vk.com/video/playlist/-174476437_2', + 'url': 'https://vkvideo.ru/playlist/-204353299_426', 'info_dict': { - 'id': '-174476437_playlist_2', - 'title': 'Анонсы', + 'id': '-204353299_playlist_426', }, - 'playlist_mincount': 108, + 'playlist_mincount': 33, + }, { + 'url': 'https://vk.com/video/@gorkyfilmstudio/all', + 'only_matching': True, + }, { + 'url': 'https://vkvideo.ru/@mobidevices', + 'only_matching': True, + }, { + 'url': 'https://vk.com/video/playlist/-174476437_2', + 'only_matching': True, }] _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) @@ -552,7 +576,7 @@ class VKUserVideosIE(VKBaseIE): v = self._VIDEO._make(video[:2]) video_id = '%d_%d' % (v.owner_id, v.id) yield self.url_result( - 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id) + 'https://vk.com/video' + video_id, VKIE.ie_key(), video_id) if count >= total: break video_list_json = self._download_payload('al_video', page_id, { @@ -561,23 +585,25 @@ class VKUserVideosIE(VKBaseIE): 'oid': page_id, 'section': section, })[0][section] - count += video_list_json['count'] + new_count = video_list_json['count'] + if not new_count: + self.to_screen(f'{page_id}: Skipping {total - count} unavailable videos') + break + count += new_count video_list = video_list_json['list'] def _real_extract(self, url): - u_id, section = self._match_valid_url(url).groups() + u_id = self._match_id(url) webpage = self._download_webpage(url, u_id) if u_id.startswith('@'): - page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') - elif '_' in u_id: - page_id, section = u_id.split('_', 1) - section = f'playlist_{section}' + page_id = traverse_obj( + self._search_json(r'\bvar newCur\s*=', webpage, 'cursor data', u_id), + ('oid', {int}, {str_or_none}, {require('page id')})) + section = traverse_obj(parse_qs(url), ('section', 0)) or 'all' else: - raise ExtractorError('Invalid URL', expected=True) - - if not section: - section = 'all' + page_id, _, section = u_id.partition('_') + section = f'playlist_{section}' playlist_title = clean_html(get_element_by_class('VideoInfoPanel__title', webpage)) return self.playlist_result(self._entries(page_id, section), f'{page_id}_{section}', playlist_title) @@ -717,7 +743,7 @@ class VKWallPostIE(VKBaseIE): class VKPlayBaseIE(InfoExtractor): - _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vkplay\.ru)/' + _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vk(?:play|video)\.ru)/' _RESOLUTIONS = { 'tiny': '256x144', 'lowest': '426x240', @@ -797,6 +823,9 @@ class VKPlayIE(VKPlayBaseIE): }, { 'url': 'https://live.vkplay.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records', 'only_matching': True, + }, { + 'url': 'https://live.vkvideo.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records', + 'only_matching': True, }] def _real_extract(self, url): @@ -839,6 +868,9 @@ class VKPlayLiveIE(VKPlayBaseIE): }, { 'url': 'https://live.vkplay.ru/lebwa', 'only_matching': True, + }, { + 'url': 'https://live.vkvideo.ru/panterka', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/xiaohongshu.py b/yt_dlp/extractor/xiaohongshu.py index 1280ca6a9c..46543b823e 100644 --- a/yt_dlp/extractor/xiaohongshu.py +++ b/yt_dlp/extractor/xiaohongshu.py @@ -10,7 +10,7 @@ from ..utils.traversal import traverse_obj class XiaoHongShuIE(InfoExtractor): - _VALID_URL = r'https?://www\.xiaohongshu\.com/explore/(?P[\da-f]+)' + _VALID_URL = r'https?://www\.xiaohongshu\.com/(?:explore|discovery/item)/(?P[\da-f]+)' IE_DESC = '小红书' _TESTS = [{ 'url': 'https://www.xiaohongshu.com/explore/6411cf99000000001300b6d9', @@ -25,6 +25,18 @@ class XiaoHongShuIE(InfoExtractor): 'duration': 101.726, 'thumbnail': r're:https?://sns-webpic-qc\.xhscdn\.com/\d+/[a-z0-9]+/[\w]+', }, + }, { + 'url': 'https://www.xiaohongshu.com/discovery/item/674051740000000007027a15?xsec_token=CBgeL8Dxd1ZWBhwqRd568gAZ_iwG-9JIf9tnApNmteU2E=', + 'info_dict': { + 'id': '674051740000000007027a15', + 'ext': 'mp4', + 'title': '相互喜欢就可以了', + 'uploader_id': '63439913000000001901f49a', + 'duration': 28.073, + 'description': '#广州[话题]# #深圳[话题]# #香港[话题]# #街头采访[话题]# #是你喜欢的类型[话题]#', + 'thumbnail': r're:https?://sns-webpic-qc\.xhscdn\.com/\d+/[\da-f]+/[^/]+', + 'tags': ['广州', '深圳', '香港', '街头采访', '是你喜欢的类型'], + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 41cd90db95..1e83e41b8f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -78,7 +78,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20240726.00.00', + 'clientVersion': '2.20241126.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, @@ -90,7 +90,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20240726.00.00', + 'clientVersion': '2.20241126.01.00', 'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)', }, }, @@ -102,7 +102,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20240723.01.00', + 'clientVersion': '1.20241201.00.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, @@ -113,7 +113,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20240724.00.00', + 'clientVersion': '1.20241127.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, @@ -124,7 +124,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20240723.03.00', + 'clientVersion': '1.20241203.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, @@ -162,7 +162,6 @@ INNERTUBE_CLIENTS = { 'REQUIRE_JS_PLAYER': False, 'REQUIRE_PO_TOKEN': True, 'REQUIRE_AUTH': True, - 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 'android_creator': { @@ -197,7 +196,6 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, 'REQUIRE_JS_PLAYER': False, - 'SUPPORTS_COOKIES': True, }, # iOS clients have HLS live streams. Setting device model to get 60fps formats. # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 @@ -214,6 +212,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_PO_TOKEN': True, 'REQUIRE_JS_PLAYER': False, }, # This client now requires sign-in for every video @@ -232,7 +231,6 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, 'REQUIRE_AUTH': True, - 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 'ios_creator': { @@ -257,7 +255,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20240726.01.00', + 'clientVersion': '2.20241202.07.00', + # mweb does not require PO Token with this UA + 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, @@ -267,7 +267,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'TVHTML5', - 'clientVersion': '7.20240724.13.00', + 'clientVersion': '7.20241201.18.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, @@ -517,11 +517,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) def handle_or_none(self, handle): - return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None) + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', urllib.parse.unquote(handle or ''), + '@-handle', default=None) def handle_from_url(self, url): return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', - url, 'channel handle', default=None) + urllib.parse.unquote(url or ''), 'channel handle', default=None) def ucid_from_url(self, url): return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', @@ -1494,7 +1495,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000 { - 'note': 'Embed allowed age-gate video', + 'note': 'Embed allowed age-gate video; works with web_embedded', 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', @@ -1524,7 +1525,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'heatmap': 'count:100', 'timestamp': 1401991663, }, - 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video with embed allowed in public site', @@ -2800,6 +2800,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, }, }, + { + # uploader_id has non-ASCII characters that are percent-encoded in YT's JSON + 'url': 'https://www.youtube.com/shorts/18NGQq7p3LY', + 'info_dict': { + 'id': '18NGQq7p3LY', + 'ext': 'mp4', + 'title': '아이브 이서 장원영 리즈 삐끼삐끼 챌린지', + 'description': '', + 'uploader': 'ㅇㅇ', + 'uploader_id': '@으아-v1k', + 'uploader_url': 'https://www.youtube.com/@으아-v1k', + 'channel': 'ㅇㅇ', + 'channel_id': 'UCC25oTm2J7ZVoi5TngOHg9g', + 'channel_url': 'https://www.youtube.com/channel/UCC25oTm2J7ZVoi5TngOHg9g', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'playable_in_embed': True, + 'age_limit': 0, + 'duration': 3, + 'timestamp': 1724306170, + 'upload_date': '20240822', + 'availability': 'public', + 'live_status': 'not_live', + 'view_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'categories': ['People & Blogs'], + 'tags': [], + }, + }, ] _WEBPAGE_TESTS = [ @@ -3118,19 +3147,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): + # Examples where `sig` is funcname: + # sig=function(a){a=a.split(""); ... ;return a.join("")}; + # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; + # {var l=f,m=h.sp,n=sig(decodeURIComponent(h.s));l.set(m,encodeURIComponent(n))} + # sig=function(J){J=J.split(""); ... ;return J.join("")}; + # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J}; + # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))} funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + (r'\b(?P[a-zA-Z0-9_$]+)&&\((?P=var)=(?P[a-zA-Z0-9_$]{2,})\(decodeURIComponent\((?P=var)\)\)', + r'(?P[a-zA-Z0-9_$]+)\s*=\s*function\(\s*(?P[a-zA-Z0-9_$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9_$])(?P[a-zA-Z0-9_$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9_$]{2}\.[a-zA-Z0-9_$]{2}\(a,\d+\))?', + # Old patterns + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', - r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', - r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'("|\')signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') @@ -3204,6 +3240,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # * a.D&&(b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") # * a.D&&(PL(a),b=a.j.n||null)&&(b=narray[0](b),a.set("n",b),narray.length||nfunc("") # * a.D&&(b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") + # * J.J="";J.url="";J.Z&&(R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}}; funcname, idx = self._search_regex( r'''(?x) (?: @@ -3220,7 +3257,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): )\)&&\(c=| \b(?P[a-zA-Z0-9_$]+)= )(?P[a-zA-Z0-9_$]+)(?:\[(?P\d+)\])?\([a-zA-Z]\) - (?(var),[a-zA-Z0-9_$]+\.set\("n"\,(?P=var)\),(?P=nfunc)\.length)''', + (?(var),[a-zA-Z0-9_$]+\.set\((?:"n+"|[a-zA-Z0-9_$]+)\,(?P=var)\))''', jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) if not funcname: self.report_warning(join_nonempty( @@ -3229,7 +3266,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self._search_regex( r'''(?xs) ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) - \s*\{(?:(?!};).)+?["']enhanced_except_''', + \s*\{(?:(?!};).)+?return\s*(?P["'])[\w-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+''', jscode, 'Initial JS player n function name', group='name') elif not idx: return funcname @@ -3238,6 +3275,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] + def _fixup_n_function_code(self, argnames, code): + return argnames, re.sub( + rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(["\'])undefined\1\s*\)\s*return\s+{argnames[0]};', + ';', code) + def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') @@ -3249,7 +3291,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): func_name = self._extract_n_function_name(jscode, player_url=player_url) - func_code = jsi.extract_function_code(func_name) + # XXX: Workaround for the `typeof` gotcha + func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name)) self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code @@ -3265,7 +3308,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except Exception as e: raise JSInterpreter.Exception(traceback.format_exc(), cause=e) - if ret.startswith('enhanced_except_'): + if ret.startswith('enhanced_except_') or ret.endswith(s): raise JSInterpreter.Exception('Signature function returned an exception') return ret @@ -3929,13 +3972,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') - if not po_token and require_po_token: + if not po_token and require_po_token and 'missing_pot' in self._configuration_arg('formats'): self.report_warning( f'No PO Token provided for {client} client, ' - f'which is required for working {client} formats. ' - f'You can manually pass a PO Token for this client with ' - f'--extractor-args "youtube:po_token={client}+XXX"', - only_once=True) + f'which may be required for working {client} formats. This client will be deprioritized', only_once=True) deprioritize_pr = True pr = initial_pr if client == 'web' else None @@ -3968,15 +4008,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: prs.append(pr) + # web_embedded can work around age-gate and age-verification for some embeddable videos + if self._is_agegated(pr) and variant != 'web_embedded': + append_client(f'web_embedded.{base_client}') + # Unauthenticated users will only get web_embedded client formats if age-gated + if self._is_agegated(pr) and not self.is_authenticated: + self.to_screen( + f'{video_id}: This video is age-restricted; some formats may be missing ' + f'without authentication. {self._login_hint()}', only_once=True) + ''' This code is pointless while web_creator is in _DEFAULT_AUTHED_CLIENTS # EU countries require age-verification for accounts to access age-restricted videos # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients - if self.is_authenticated and self._is_agegated(pr): + embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr) + if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): self.to_screen( f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) # web_creator can work around the age-verification requirement - # android_vr may also be able to work around age-verification # tv_embedded may(?) still work around age-verification if the video is embeddable append_client('web_creator') ''' @@ -3999,6 +4048,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or (live_status == 'post_live' and (duration or 0) > 2 * 3600)): return live_status + def _report_pot_format_skipped(self, video_id, client_name, proto): + msg = ( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' + 'They will be skipped as they may yield HTTP Error 403. ' + f'You can manually pass a PO Token for this client with --extractor-args "youtube:po_token={client_name}+XXX". ' + 'For more information, refer to https://github.com/yt-dlp/yt-dlp/wiki/Extractors#po-token-guide . ' + 'To enable these broken formats anyway, pass --extractor-args "youtube:formats=missing_pot"') + + # Only raise a warning for non-default clients, to not confuse users. + # iOS HLS formats still work without PO Token, so we don't need to warn about them. + if client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS): + self.write_debug(msg, only_once=True) + else: + self.report_warning(msg, only_once=True) + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 @@ -4052,10 +4116,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if height: res_qualities[height] = quality + display_name = audio_track.get('displayName') or '' + is_original = 'original' in display_name.lower() + is_descriptive = 'descriptive' in display_name.lower() is_default = audio_track.get('audioIsDefault') - is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower() language_code = audio_track.get('id', '').split('.')[0] - if language_code and is_default: + if language_code and (is_original or (is_default and not original_language)): original_language = language_code # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment @@ -4123,11 +4189,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmt_url = update_url_query(fmt_url, {'pot': po_token}) # Clients that require PO Token return videoplayback URLs that may return 403 - is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) - if is_broken: - self.report_warning( - f'{video_id}: {client_name} client formats require a PO Token which was not provided. ' - 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + require_po_token = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) + if require_po_token and 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, 'https') + continue name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -4136,11 +4201,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( - join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''), + join_nonempty(display_name, is_default and ' (default)', delim=''), name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - is_damaged and 'DAMAGED', is_broken and 'BROKEN', + is_damaged and 'DAMAGED', require_po_token and 'MISSING POT', (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 @@ -4155,9 +4220,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, - 'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1, + 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, # Strictly de-prioritize broken, damaged and 3gp formats - 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, + 'preference': -20 if require_po_token else -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -4207,7 +4272,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False - itags[itag].add(key) if f.get('source_preference') is None: f['source_preference'] = -1 @@ -4215,12 +4279,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Clients that require PO Token return videoplayback URLs that may return 403 # hls does not currently require PO Token if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': - self.report_warning( - f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' - 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) - f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + if 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, proto) + return False + f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ') f['source_preference'] -= 20 + itags[itag].add(key) + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): @@ -4674,7 +4740,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?=(?P[^\n]+))(?P=artist)\n+ (?=(?P[^\n]+))(?P=album)\n (?:.+?℗\s*(?P\d{4})(?!\d))? - (?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? + (?:.+?Released\ on\s*:\s*(?P\d{4}-\d{2}-\d{2}))? (.+?\nArtist\s*:\s* (?=(?P[^\n]+))(?P=clean_artist)\n )?.+\nAuto-generated\ by\ YouTube\.\s*$ @@ -5267,6 +5333,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)], 'richGridRenderer': lambda x: self._extract_entries(x, continuation_list), + 'lockupViewModel': lambda x: [self._extract_lockup_view_model(x)], } for key, renderer in isr_content.items(): if key not in known_renderers: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index c05910a586..093fc1b466 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1372,12 +1372,12 @@ def create_parser(): help='Allow Unicode characters, "&" and spaces in filenames (default)') filesystem.add_option( '--windows-filenames', - action='store_true', dest='windowsfilenames', default=False, + action='store_true', dest='windowsfilenames', default=None, help='Force filenames to be Windows-compatible') filesystem.add_option( '--no-windows-filenames', action='store_false', dest='windowsfilenames', - help='Make filenames Windows-compatible only if using Windows (default)') + help='Sanitize filenames only minimally') filesystem.add_option( '--trim-filenames', '--trim-file-names', metavar='LENGTH', dest='trim_file_name', default=0, type=int, diff --git a/yt_dlp/update.py b/yt_dlp/update.py index ca2ec5f376..360f5ad58c 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -65,9 +65,14 @@ def _get_variant_and_executable_path(): machine = '_legacy' if version_tuple(platform.mac_ver()[0]) < (10, 15) else '' else: machine = f'_{platform.machine().lower()}' + is_64bits = sys.maxsize > 2**32 # Ref: https://en.wikipedia.org/wiki/Uname#Examples if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'): - machine = '_x86' if platform.architecture()[0][:2] == '32' else '' + machine = '_x86' if not is_64bits else '' + # platform.machine() on 32-bit raspbian OS may return 'aarch64', so check "64-bitness" + # See: https://github.com/yt-dlp/yt-dlp/issues/11813 + elif machine[1:] == 'aarch64' and not is_64bits: + machine = '_armv7l' # sys.executable returns a /tmp/ path for staticx builds (linux_static) # Ref: https://staticx.readthedocs.io/en/latest/usage.html#run-time-information if static_exe_path := os.getenv('STATICX_PROG_PATH'): @@ -525,11 +530,16 @@ class Updater: @functools.cached_property def cmd(self): """The command-line to run the executable, if known""" + argv = None # There is no sys.orig_argv in py < 3.10. Also, it can be [] when frozen if getattr(sys, 'orig_argv', None): - return sys.orig_argv + argv = sys.orig_argv elif getattr(sys, 'frozen', False): - return sys.argv + argv = sys.argv + # linux_static exe's argv[0] will be /tmp/staticx-NNNN/yt-dlp_linux if we don't fixup here + if argv and os.getenv('STATICX_PROG_PATH'): + argv = [self.filename, *argv[1:]] + return argv def restart(self): """Restart the executable""" diff --git a/yt_dlp/version.py b/yt_dlp/version.py index f4b9400bc5..1ff43c611f 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.11.18' +__version__ = '2024.12.23' -RELEASE_GIT_HEAD = '7ea2787920cccc6b8ea30791993d114fbd564434' +RELEASE_GIT_HEAD = '65cf46cddd873fd229dbb0fc0689bca4c201c6b6' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.11.18' +_pkg_version = '2024.12.23'