Repository: soimort/you-get Branch: develop Commit: 049548f3f3f3 Files: 139 Total size: 574.3 KB Directory structure: gitextract_fe0wl05y/ ├── .github/ │ └── workflows/ │ └── python-package.yml ├── .gitignore ├── CHANGELOG.rst ├── CONTRIBUTING.md ├── MANIFEST.in ├── Makefile ├── README.md ├── README.rst ├── SECURITY.md ├── contrib/ │ └── completion/ │ ├── you-get-completion.bash │ └── you-get.fish ├── setup.cfg ├── setup.py ├── src/ │ └── you_get/ │ ├── cli_wrapper/ │ │ ├── player/ │ │ │ ├── dragonplayer.py │ │ │ ├── gnome_mplayer.py │ │ │ ├── mplayer.py │ │ │ ├── vlc.py │ │ │ └── wmp.py │ │ └── transcoder/ │ │ ├── ffmpeg.py │ │ ├── libav.py │ │ └── mencoder.py │ ├── common.py │ ├── extractor.py │ ├── extractors/ │ │ ├── acfun.py │ │ ├── alive.py │ │ ├── archive.py │ │ ├── baidu.py │ │ ├── bandcamp.py │ │ ├── baomihua.py │ │ ├── bigthink.py │ │ ├── bilibili.py │ │ ├── bokecc.py │ │ ├── cbs.py │ │ ├── ckplayer.py │ │ ├── cntv.py │ │ ├── coub.py │ │ ├── dailymotion.py │ │ ├── douban.py │ │ ├── douyin.py │ │ ├── douyutv.py │ │ ├── ehow.py │ │ ├── embed.py │ │ ├── facebook.py │ │ ├── fc2video.py │ │ ├── flickr.py │ │ ├── freesound.py │ │ ├── funshion.py │ │ ├── giphy.py │ │ ├── google.py │ │ ├── heavymusic.py │ │ ├── huomaotv.py │ │ ├── icourses.py │ │ ├── ifeng.py │ │ ├── imgur.py │ │ ├── infoq.py │ │ ├── instagram.py │ │ ├── interest.py │ │ ├── iqilu.py │ │ ├── iqiyi.py │ │ ├── iwara.py │ │ ├── ixigua.py │ │ ├── joy.py │ │ ├── kakao.py │ │ ├── khan.py │ │ ├── ku6.py │ │ ├── kuaishou.py │ │ ├── kugou.py │ │ ├── kuwo.py │ │ ├── le.py │ │ ├── lizhi.py │ │ ├── longzhu.py │ │ ├── lrts.py │ │ ├── magisto.py │ │ ├── metacafe.py │ │ ├── mgtv.py │ │ ├── miaopai.py │ │ ├── miomio.py │ │ ├── missevan.py │ │ ├── mixcloud.py │ │ ├── mtv81.py │ │ ├── nanagogo.py │ │ ├── naver.py │ │ ├── netease.py │ │ ├── nicovideo.py │ │ ├── pinterest.py │ │ ├── pixnet.py │ │ ├── pptv.py │ │ ├── qie.py │ │ ├── qie_video.py │ │ ├── qingting.py │ │ ├── qq.py │ │ ├── qq_egame.py │ │ ├── showroom.py │ │ ├── sina.py │ │ ├── sohu.py │ │ ├── soundcloud.py │ │ ├── suntv.py │ │ ├── ted.py │ │ ├── theplatform.py │ │ ├── tiktok.py │ │ ├── toutiao.py │ │ ├── tucao.py │ │ ├── tudou.py │ │ ├── tumblr.py │ │ ├── twitter.py │ │ ├── ucas.py │ │ ├── universal.py │ │ ├── veoh.py │ │ ├── vimeo.py │ │ ├── vk.py │ │ ├── w56.py │ │ ├── wanmen.py │ │ ├── ximalaya.py │ │ ├── xinpianchang.py │ │ ├── yixia.py │ │ ├── yizhibo.py │ │ ├── youku.py │ │ ├── youtube.py │ │ ├── zhanqi.py │ │ ├── zhibo.py │ │ └── zhihu.py │ ├── json_output.py │ ├── processor/ │ │ ├── ffmpeg.py │ │ ├── join_flv.py │ │ ├── join_mp4.py │ │ ├── join_ts.py │ │ └── rtmpdump.py │ ├── util/ │ │ ├── fs.py │ │ ├── git.py │ │ ├── log.py │ │ ├── os.py │ │ ├── strings.py │ │ └── term.py │ └── version.py ├── tests/ │ ├── test.py │ ├── test_common.py │ └── test_util.py ├── you-get └── you-get.plugin.zsh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/python-package.yml ================================================ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions name: develop on: push: branches: [ develop ] pull_request: branches: [ develop ] jobs: build: runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: [3.8, 3.9, '3.10', '3.11', '3.12', '3.13', pypy-3.8, pypy-3.9, pypy-3.10] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: 
actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip setuptools pip install flake8 if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --ignore=F824 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with unittest run: | make test ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ # Misc _* *_ *.3gp *.asf *.download *.f4v *.flv *.gif *.html *.jpg *.lrc *.mkv *.mp3 *.mp4 *.mpg *.png *.srt *.ts *.webm *.xml *.json /.env /.idea *.m4a *.DS_Store *.txt *.sw[a-p] *.zip .emacs* .vscode ================================================ FILE: CHANGELOG.rst ================================================ Changelog ========= 0.3.36 ------ *Date: 2015-10-05* * New command-line option: --json * New site support: - Internet Archive * Bug fixes: - iQIYI - SoundCloud 0.3.35 ------ *Date: 2015-09-21* * New site support: - 755 http://7gogo.jp/ (via #659 by @soimort) - Funshion http://www.fun.tv/ (via #619 by @cnbeining) - iQilu http://v.iqilu.com/ (via #636 by @cnbeining) - Metacafe http://www.metacafe.com/ (via #620 by @cnbeining) - Qianmo http://qianmo.com/ (via #600 by @cnbeining) - Weibo Miaopai http://weibo.com/ (via #605 by @cnbeining) * Bug fixes: - 163 (by @lilydjwg) - CNTV (by @Red54) - Dailymotion (by @jackyzy823 and @ddumitran) - iQIYI (by @jackyzy823 and others) - QQ (by @soimort) - SoundCloud (by @soimort) - Tudou (by @CzBiX) - Vimeo channel (by @cnbeining) - YinYueTai (by @soimort) - Youku (by @junzh0u) - Embedded Youku/Tudou player (by @zhangn1985) 0.3.34 ------ *Date: 2015-07-12* * Bug fix release 0.3.33 ------ *Date: 2015-06-10* * Many bug fixes by our awesome contributors 0.3.32 ------ *Date: 2014-12-10* * New site support: - baomihua.com - zhanqi.tv * Bug fixes: - DouyuTV - Tudou - Tumblr - Vine - Youku 0.3.31 ------ *Date: 2014-11-01* * New site support: - Dongting (by @lilydjwg) - DouyuTV (by @0x00-pl) - LeTV cloud (by @cnbeining) * Bug fixes: - AcFun - Bilibili - Niconico - iQIYI 0.3.30 ------ *Date: 2014-09-21* * First Alpha release * Support PyPy3 * Bug fixes: - YouTube - Youku - Tudou - Niconico - AcFun 0.3.30dev-20140907 ------------------ *Date: 2014-09-07* * Bug fixes: - AcFun - iQIYI - MioMio - QQ 0.3.30dev-20140820 ------------------ *Date: 2014-08-20* * Bug fix release 0.3.30dev-20140812 ------------------ *Date: 2014-08-12* * Bug fixes: - Youku * New site support: - VideoBam (by @cnbeining) 0.3.30dev-20140806 
------------------ *Date: 2014-08-06* * Bug fixes: - Youku - Nicovideo - Bilibili - Letv * New site support: - Tucao.cc * Use FFmpeg concat demuxer to join video segments (ffmpeg>=1.1) 0.3.30dev-20140730 ------------------ *Date: 2014-07-30* * YouTube: support fixed * Youku: password-protected video support 0.3.30dev-20140723 ------------------ *Date: 2014-07-23* * YouTube: (experimental) video format selection * Youku: playlist support * NetEase Music: high quality download (by @farseer90718) * PPTV: support fixed (by @jackyzy823) * Catfun.tv: new site support (by @jackyzy823) * AcFun.tv: domain name fixed 0.3.30dev-20140716 ------------------ *Date: 2014-07-16* * Bug fix release for: - YouTube - Youku * New site support: (by @jackyzy823) - MTV 81 http://www.mtv81.com - Kugou (酷狗音乐) http://www.kugou.com - Kuwo (酷我音乐) http://www.kuwo.cn - NetEase Music (网易云音乐) http://music.163.com 0.3.30dev-20140629 ------------------ *Date: 2014-06-29* * Bug fix release for: - Youku - YouTube - TED - Bilibili * (Experimental) Video format selection (for Youku only) 0.3.29 ------ *Date: 2014-05-29* * Bug fix release 0.3.28.3 -------- *Date: 2014-05-18* * New site support: - CBS.com 0.3.28.2 -------- *Date: 2014-04-13* * Bug fix release 0.3.28.1 -------- *Date: 2014-02-28* * Bug fix release 0.3.28 ------ *Date: 2014-02-21* * New site support: - Magisto.com - VK.com 0.3.27 ------ *Date: 2014-02-14* * Bug fix release 0.3.26 ------ *Date: 2014-02-08* * New features: - Play video in players (#286) - LeTV support (#289) - Youku 1080P support * Bug fixes: - YouTube (#282, #292) - Sina (#246, #280) - Mixcloud - NetEase - QQ - Vine 0.3.25 ------ *Date: 2013-12-20* * Bug fix release 0.3.24 ------ *Date: 2013-10-30* * Experimental: Sogou proxy server * Fix issues for: - Vimeo 0.3.23 ------ *Date: 2013-10-23* * Support YouTube playlists * Support general short URLs * Fix issues for: - Sina 0.3.22 ------ *Date: 2013-10-18* * Fix issues for: - Baidu - Bilibili - JPopsuki TV - Niconico - PPTV - TED - Tumblr - YinYueTai - YouTube - ... 0.3.21 ------ *Date: 2013-08-17* * Fix issues for: - YouTube - YinYueTai - pan.baidu.com 0.3.20 ------ *Date: 2013-08-16* * Add support for: - eHow - Khan Academy - TED - 5sing * Fix issues for: - Tudou 0.3.18 ------ *Date: 2013-07-19* * Fix issues for: - Dailymotion - Youku - Sina - AcFun - bilibili 0.3.17 ------ *Date: 2013-07-12* * Fix issues for: - YouTube - 163 - bilibili * Code cleanup. 0.3.16 ------ *Date: 2013-06-28* * Fix issues for: - YouTube - Sohu - Google+ (enable HTTPS proxy) 0.3.15 ------ *Date: 2013-06-21* * Add support for: - Instagram 0.3.14 ------ *Date: 2013-06-14* * Add support for: - Alive.in.th * Remove support of: - JPopsuki * Fix issues for: - AcFun - iQIYI 0.3.13 ------ *Date: 2013-06-07* * Add support for: - Baidu Wangpan (video only) * Fix issue for: - Google+ 0.3.12 ------ *Date: 2013-05-19* * Fix issues for: - Google+ - Mixcloud - Tudou 0.3.11 ------ *Date: 2013-04-26* * Add support for: - Google Drive (Google Docs) 0.3.10 ------ *Date: 2013-04-19* * Add support for: - SongTaste * Support Libav as well as FFmpeg. 0.3.9 ----- *Date: 2013-04-12* * Add support for: - Freesound 0.3.8 ----- *Date: 2013-04-05* * Add support for: - Coursera 0.3.7 ----- *Date: 2013-03-29* * Add support for: - Baidu 0.3.6 ----- *Date: 2013-03-22* * Add support for: - Vine * Fix issue for: - YouTube 0.3.5 ----- *Date: 2013-03-15* * Default to use FFmpeg for merging .flv files. 
0.3.4 ----- *Date: 2013-03-08* * Add support for: - Blip - VID48 0.3.3 ----- *Date: 2013-03-01* * Add support for: - Douban - MioMio * Fix issues for: - Tudou - Vimeo 0.3.2 ----- *Date: 2013-02-22* * Add support for: - JPopsuki * Fix issue for Xiami. 0.3.1 ----- *Date: 2013-02-15* * Fix issues for Google+ and Mixcloud. * API changed. 0.3.0 ----- *Date: 2013-02-08* * Add support for: - Niconico 0.3dev-20130201 --------------- *Date: 2013-02-01* * Add support for: - Mixcloud - Facebook - Joy.cn 0.3dev-20130125 --------------- *Date: 2013-01-25* * Dailymotion: downloading best quality available now. * iQIYI: fix `#77 `_. 0.3dev-20130118 --------------- *Date: 2013-01-18* * YinYueTai: downloading best quality available now. * Sohu: fix `#69 `_. 0.3dev-20130111 --------------- *Date: 2013-01-11* * Add support for: - NetEase (v.163.com) - YouTube short URLs * Vimeo: downloading best quality available now. 0.3dev-20130104 --------------- *Date: 2013-01-04* * Sohu: - fix `#53 `_. - merge pull request `#54 `_; downloading best quality available now. 0.3dev-20121228 --------------- *Date: 2012-12-28* * Add support for: - Xiami - Tumblr audios 0.3dev-20121221 --------------- *Date: 2012-12-21* * YouTube: fix `#45 `_. * Merge pull request `#46 `_; fix title parsing issue on Tudou. 0.3dev-20121220 --------------- *Date: 2012-12-20* * YouTube: quick dirty fix to `#45 `_. 0.3dev-20121219 --------------- *Date: 2012-12-19* * Add support for: - Tumblr 0.3dev-20121217 --------------- *Date: 2012-12-17* * Google+: downloading best quality available now. * Fix issues `#42 `_, `#43 `_ for Google+. * Merge pull request `#40 `_; fix some issues for Ku6, Sina and 56. 0.3dev-20121212 --------------- *Date: 2012-12-12* * YouTube: fix some major issues on parsing video titles. 0.3dev-20121210 --------------- *Date: 2012-12-10* * YouTube: downloading best quality available now. * Add support for: - SoundCloud 0.2.16 ------ *Date: 2012-12-01* * Add support for: - QQ * Small fixes merged from youku-lixian. 0.2.15 ------ *Date: 2012-11-30* * Fix issue `#30 `_ for bilibili. 0.2.14 ------ *Date: 2012-11-29* * Fix issue `#28 `_ for Tudou. * Better support for AcFun. 0.2.13 ------ *Date: 2012-10-30* * Nothing new. 0.2.12 ------ *Date: 2012-10-30* * Fix issue `#20 `_ for AcFun. 0.2.11 ------ *Date: 2012-10-23* * Move on to Python 3.3! * Fix issues: - `#17 `_ - `#18 `_ - `#19 `_ 0.2.10 ------ *Date: 2012-10-16* * Add support for: - Google+ 0.2.9 ----- *Date: 2012-10-09* * Fix issue `#16 `_. 0.2.8 ----- *Date: 2012-10-02* * Fix issue `#15 `_ for AcFun. 0.2.7 ----- *Date: 2012-09-28* * Fix issue `#6 `_ for YouTube. 0.2.6 ----- *Date: 2012-09-26* * Fix issue `#5 `_ for YinYueTai. 0.2.5 ----- *Date: 2012-09-25* * Add support for: - Dailymotion 0.2.4 ----- *Date: 2012-09-18* * Use FFmpeg for converting and joining video files. * Add '--url' and '--debug' options. 0.2.2 ----- *Date: 2012-09-17* * Add danmaku support for AcFun and bilibili. * Fix issue `#2 `_ and `#4 `_ for YouTube. * Temporarily fix issue for iQIYI (use .ts instead of .f4v). 0.2.1 ----- *Date: 2012-09-02* * Add support for: - ifeng 0.2 --- *Date: 2012-09-02* * Add support for: - Vimeo - AcFun - bilibili - CNTV - iQIYI - Ku6 - PPTV - Sina - Sohu - 56 0.1.3 ----- *Date: 2012-09-01* * Playlist URLs are now automatically handled. ('--playlist' option is no longer needed) * Handle KeyboardInterrupt silently. * Fix Unicode character display on code pages. 0.1 --- *Date: 2012-09-01* * First PyPI release. * Fix issue `#1 `_. 
0.0.1
-----

*Date: 2012-08-21*

* Initial release, forked from `iambus/youku-lixian `_; add:

  - YouTube support.
  - Pausing and resuming of downloads.
  - HTTP proxy settings.

================================================
FILE: CONTRIBUTING.md
================================================
# How to Report an Issue

To prevent abuse of GitHub Issues, this project does not accept general issues.

If you would like to report a problem you find when using `you-get`, please open a [Pull Request](https://github.com/soimort/you-get/pulls), which should include:

1. A detailed description of the encountered problem;
2. At least one commit, addressing the problem through some unit test(s). **Do not submit a PR that merely makes arbitrary changes to unrelated files!**
   * Examples of good commits: [#2675](https://github.com/soimort/you-get/pull/2675/files), [#2680](https://github.com/soimort/you-get/pull/2680/files), [#2685](https://github.com/soimort/you-get/pull/2685/files)

PRs that fail to meet the above criteria may be closed summarily with no further action.

A valid PR will remain open until the problem it addresses is fixed.

================================================
FILE: MANIFEST.in
================================================
include *.rst
include *.txt
include Makefile
include CONTRIBUTING.md
include README.md
include you-get
include you-get.json
include you-get.plugin.zsh
recursive-include contrib *

================================================
FILE: Makefile
================================================
.PHONY: default i test clean all html rst build install release

default: i

i:
	@(cd src; python -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)')

test:
	(cd src; python -m unittest discover -s ../tests)

clean:
	zenity --question
	rm -fr build/ dist/ src/*.egg-info/
	find . | grep __pycache__ | xargs rm -fr
	find . | grep .pyc | xargs rm -f

all: build

html:
	pandoc README.md > README.html

rst:
	pandoc -s -t rst README.md > README.rst

build:
	python -m build

install:
	python -m pip install .

release: build
	@echo 'Upload new version to PyPI using:'
	@echo ' twine upload --sign dist/you_get-VERSION*'

================================================
FILE: README.md
================================================
# You-Get

[![Build Status](https://github.com/soimort/you-get/workflows/develop/badge.svg)](https://github.com/soimort/you-get/actions)
[![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/)
[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))**

**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**

---

[You-Get](https://you-get.org/) is a tiny command-line utility to download media contents (videos, audios, images) from the Web, in case there is no other handy way to do it.
Here's how you use `you-get` to download a video from [YouTube](https://www.youtube.com/watch?v=jNQXAC9IVRw):

```console
$ you-get 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
site:  YouTube
title: Me at the zoo
stream:
    - itag:      43
      container: webm
      quality:   medium
      size:      0.5 MiB (564215 bytes)
    # download-with: you-get --itag=43 [URL]

Downloading Me at the zoo.webm ...
 100% ( 0.5/ 0.5MB) ├██████████████████████████████████┤[1/1]    6 MB/s

Saving Me at the zoo.en.srt ... Done.
```

And here's why you might want to use it:

* You enjoyed something on the Internet, and just want to download it for your own pleasure.
* You watch your favorite videos online from your computer, but you are prohibited from saving them. You feel that you have no control over your own computer. (And it's not how an open Web is supposed to work.)
* You want to get rid of any closed-source technology or proprietary JavaScript code, and disallow things like Flash running on your computer.
* You are an adherent of hacker culture and free software.

What `you-get` can do for you:

* Download videos / audios from popular websites such as YouTube, Youku, Niconico, and a bunch more. (See the [full list of supported sites](#supported-sites))
* Stream an online video in your media player. No web browser, no more ads.
* Download images (of interest) by scraping a web page.
* Download arbitrary non-HTML contents, i.e., binary files.

Interested? [Install it](#installation) now and [get started by examples](#getting-started).

Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it!

![](https://i.imgur.com/GfthFAz.png)

## Installation

### Prerequisites

The following dependencies are recommended:

* **[Python](https://www.python.org/downloads/)** 3.7.4 or above
* **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above
* (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/)

### Option 1: Install via pip

The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager: (Note that you must use the Python 3 version of `pip`)

    $ pip install you-get

### Option 2: Install via [Antigen](https://github.com/zsh-users/antigen) (for Zsh users)

Add the following line to your `.zshrc`:

    antigen bundle soimort/you-get

### Option 3: Download from GitHub

You may either download the [stable](https://github.com/soimort/you-get/archive/master.zip) (identical to the latest release on PyPI) or the [develop](https://github.com/soimort/you-get/archive/develop.zip) (more hotfixes, unstable features) branch of `you-get`. Unzip it, and put the directory containing the `you-get` script into your `PATH`.

Alternatively, run

```
$ cd path/to/you-get
$ [sudo] python -m pip install .
```

Or

```
$ cd path/to/you-get
$ python -m pip install . --user
```

to install `you-get` to a permanent path. (And don't omit the dot `.` representing the current directory)

You can also use [pipenv](https://pipenv.pypa.io/en/latest) to install `you-get` in a Python virtual environment:

```
$ pipenv install -e .
$ pipenv run you-get --version
you-get: version 0.4.1555, a tiny downloader that scrapes the web.
```

### Option 4: Git clone

This is the recommended way for all developers, even if you don't often code in Python.
```
$ git clone git://github.com/soimort/you-get.git
```

Then put the cloned directory into your `PATH`, or run `python -m pip install path/to/you-get` to install `you-get` to a permanent path.

### Option 5: Homebrew (Mac only)

You can install `you-get` easily via:

```
$ brew install you-get
```

### Option 6: pkg (FreeBSD only)

You can install `you-get` easily via:

```
# pkg install you-get
```

### Option 7: Flox (Mac, Linux, and Windows WSL)

You can install `you-get` easily via:

```
$ flox install you-get
```

### Shell completion

Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.

## Upgrading

Based on which option you chose to install `you-get`, you may upgrade it via:

```
$ pip install --upgrade you-get
```

or download the latest release via:

```
$ you-get https://github.com/soimort/you-get/archive/master.zip
```

In order to get the latest `develop` branch without messing up your pip installation, you can try:

```
$ pip install --upgrade --force-reinstall git+https://github.com/soimort/you-get@develop
```

## Getting Started

### Download a video

When you get a video of interest, you might want to use the `--info`/`-i` option to see all the available qualities and formats:

```
$ you-get -i 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
site:  YouTube
title: Me at the zoo
streams:    # Available quality and codecs
    [ DASH ] ____________________________________
    - itag:      242
      container: webm
      quality:   320x240
      size:      0.6 MiB (618358 bytes)
    # download-with: you-get --itag=242 [URL]

    - itag:      395
      container: mp4
      quality:   320x240
      size:      0.5 MiB (550743 bytes)
    # download-with: you-get --itag=395 [URL]

    - itag:      133
      container: mp4
      quality:   320x240
      size:      0.5 MiB (498558 bytes)
    # download-with: you-get --itag=133 [URL]

    - itag:      278
      container: webm
      quality:   192x144
      size:      0.4 MiB (392857 bytes)
    # download-with: you-get --itag=278 [URL]

    - itag:      160
      container: mp4
      quality:   192x144
      size:      0.4 MiB (370882 bytes)
    # download-with: you-get --itag=160 [URL]

    - itag:      394
      container: mp4
      quality:   192x144
      size:      0.4 MiB (367261 bytes)
    # download-with: you-get --itag=394 [URL]

    [ DEFAULT ] _________________________________
    - itag:      43
      container: webm
      quality:   medium
      size:      0.5 MiB (568748 bytes)
    # download-with: you-get --itag=43 [URL]

    - itag:      18
      container: mp4
      quality:   small
    # download-with: you-get --itag=18 [URL]

    - itag:      36
      container: 3gp
      quality:   small
    # download-with: you-get --itag=36 [URL]

    - itag:      17
      container: 3gp
      quality:   small
    # download-with: you-get --itag=17 [URL]
```

By default, the one on the top is the one you will get. If that looks cool to you, download it:

```
$ you-get 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
site:  YouTube
title: Me at the zoo
stream:
    - itag:      242
      container: webm
      quality:   320x240
      size:      0.6 MiB (618358 bytes)
    # download-with: you-get --itag=242 [URL]

Downloading Me at the zoo.webm ...
 100% ( 0.6/ 0.6MB) ├██████████████████████████████████████████████████████████████████████████████┤[2/2]    2 MB/s

Merging video parts... Merged into Me at the zoo.webm

Saving Me at the zoo.en.srt ... Done.
```

(If a YouTube video has any closed captions, they will be downloaded together with the video file, in SubRip subtitle format.)
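If you would rather pick a stream from a script than eyeball the list, the same information is available in machine-readable form via the `--json` option (see [Reuse extracted data](#reuse-extracted-data) below). Here is a minimal sketch; note that the field names (`title`, `streams`) are assumptions about the current output, since the JSON schema is not yet stable:

```python
#!/usr/bin/env python3
# Sketch: query you-get for machine-readable info and list the streams.
# Assumes `you-get` is on PATH; the 'title' and 'streams' keys reflect
# the current (unstable) JSON schema and may change.
import json
import subprocess

url = 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
result = subprocess.run(['you-get', '--json', url],
                        capture_output=True, text=True, check=True)
info = json.loads(result.stdout)

print(info.get('title'))
for stream_id, stream in info.get('streams', {}).items():
    print(stream_id, stream.get('container'), stream.get('quality'))
```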
Or, if you prefer another format (mp4), just use whatever option `you-get` shows you:

```
$ you-get --itag=18 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
```

**Note:**

* At this point, format selection has not been generally implemented for most of our supported sites; in that case, the default format to download is the one with the highest quality.
* `ffmpeg` is a required dependency, for downloading and joining videos streamed in multiple parts (e.g. on some sites like Youku), and for YouTube videos of 1080p or higher resolution.
* If you don't want `you-get` to join video parts after downloading them, use the `--no-merge`/`-n` option.

### Download anything else

If you already have the URL of the exact resource you want, you can download it directly with:

```
$ you-get https://stallman.org/rms.jpg
Site:  stallman.org
Title: rms
Type:  JPEG Image (image/jpeg)
Size:  0.06 MiB (66482 Bytes)

Downloading rms.jpg ...
 100% ( 0.1/ 0.1MB) ├████████████████████████████████████████┤[1/1]  127 kB/s
```

Otherwise, `you-get` will scrape the web page and try to figure out if there's anything interesting to you:

```
$ you-get https://kopasas.tumblr.com/post/69361932517
Site:  Tumblr.com
Title: [tumblr] tumblr_mxhg13jx4n1sftq6do1_640
Type:  Portable Network Graphics (image/png)
Size:  0.11 MiB (118484 Bytes)

Downloading [tumblr] tumblr_mxhg13jx4n1sftq6do1_640.png ...
 100% ( 0.1/ 0.1MB) ├████████████████████████████████████████┤[1/1]   22 MB/s
```

**Note:**

* This feature is experimental and far from perfect. It works best on scraping large-sized images from popular websites like Tumblr and Blogger, but there is really no universal pattern that can apply to any site on the Internet.

### Search on Google Videos and download

You can pass literally anything to `you-get`. If it isn't a valid URL, `you-get` will do a Google search and download the most relevant video for you. (It might not be exactly the thing you wish to see, but it very likely will be.)

```
$ you-get "Richard Stallman eats"
```

### Pause and resume a download

You may use Ctrl+C to interrupt a download.

A temporary `.download` file is kept in the output directory. Next time you run `you-get` with the same arguments, the download progress will resume from the last session.

In case the file is completely downloaded (the temporary `.download` extension is gone), `you-get` will just skip the download.

To force re-downloading, use the `--force`/`-f` option. (**Warning:** doing so will overwrite any existing file or temporary file with the same name!)

### Set the path and name of the downloaded file

Use the `--output-dir`/`-o` option to set the path, and `--output-filename`/`-O` to set the name of the downloaded file:

```
$ you-get -o ~/Videos -O zoo.webm 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
```

**Tips:**

* These options are helpful if you encounter problems with the default video titles, which may contain special characters that do not play well with your current shell / operating system / filesystem.
* These options are also helpful if you write a script to batch download files and put them into designated folders with designated names.

### Proxy settings

You may specify an HTTP proxy for `you-get` to use, via the `--http-proxy`/`-x` option:

```
$ you-get -x 127.0.0.1:8087 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
```

However, the system proxy setting (i.e. the environment variable `http_proxy`) is applied by default. To disable any proxy, use the `--no-proxy` option.
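Under the hood, `you-get` talks to the network through the standard library's `urllib`, and a proxy configured this way ends up installed as a global opener. A minimal sketch of that mechanism (illustrative only, not `you-get`'s actual code):

```python
#!/usr/bin/env python3
# Sketch: install an HTTP proxy globally for urllib -- the mechanism a
# --http-proxy style option builds on. Illustrative, not you-get's code.
from urllib import request

proxy = '127.0.0.1:8087'  # same host:port format as `you-get -x`
opener = request.build_opener(
    request.ProxyHandler({'http': proxy, 'https': proxy})
)
request.install_opener(opener)

# Every subsequent request.urlopen() call now goes through the proxy.
# An empty mapping, ProxyHandler({}), disables proxies (including the
# http_proxy environment variable) -- the equivalent of --no-proxy.
print(request.urlopen('https://example.com/').status)
```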
**Tips:**

* If you need to use proxies a lot (in case your network is blocking certain sites), you might want to use `you-get` with [proxychains](https://github.com/rofl0r/proxychains-ng) and set `alias you-get="proxychains -q you-get"` (in Bash).
* For some websites (e.g. Youku), if you need access to some videos that are only available in mainland China, there is an option of using a specific proxy to extract video information from the site: `--extractor-proxy`/`-y`.

### Watch a video

Use the `--player`/`-p` option to feed the video into your media player of choice, e.g. `mpv` or `vlc`, instead of downloading it:

```
$ you-get -p vlc 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
```

Or, if you prefer to watch the video in a browser, just without ads or the comment section:

```
$ you-get -p chromium 'https://www.youtube.com/watch?v=jNQXAC9IVRw'
```

**Tips:**

* It is possible to use the `-p` option to start another download manager, e.g., `you-get -p uget-gtk 'https://www.youtube.com/watch?v=jNQXAC9IVRw'`, though they may not play together very well.

### Load cookies

Not all videos are publicly available to anyone. If you need to log in to your account to access something (e.g., a private video), it would be unavoidable to feed the browser cookies to `you-get` via the `--cookies`/`-c` option.

**Note:**

* As of now, we support two formats of browser cookies: Mozilla `cookies.sqlite` and Netscape `cookies.txt`.

### Reuse extracted data

Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the page. Use `--json` to get an abstract of the extracted data in JSON format.

**Warning:**

* For the time being, this feature has **NOT** been stabilized and the JSON schema may have breaking changes in the future.

## Supported Sites

| Site | URL | Videos? | Images? | Audios? |
| :--: | :-- | :-----: | :-----: | :-----: |
| **YouTube** | |✓| | |
| **X (Twitter)** | |✓|✓| |
| VK | |✓|✓| |
| Vimeo | |✓| | |
| Veoh | |✓| | |
| **Tumblr** | |✓|✓|✓|
| TED | |✓| | |
| SoundCloud | | | |✓|
| SHOWROOM | |✓| | |
| Pinterest | | |✓| |
| MTV81 | |✓| | |
| Mixcloud | | | |✓|
| Metacafe | |✓| | |
| Magisto | |✓| | |
| Khan Academy | |✓| | |
| Internet Archive | |✓| | |
| **Instagram** | |✓|✓| |
| InfoQ | |✓| | |
| Imgur | | |✓| |
| Heavy Music Archive | | | |✓|
| Freesound | | | |✓|
| Flickr | |✓|✓| |
| FC2 Video | |✓| | |
| Facebook | |✓| | |
| eHow | |✓| | |
| Dailymotion | |✓| | |
| Coub | |✓| | |
| CBS | |✓| | |
| Bandcamp | | | |✓|
| AliveThai | |✓| | |
| interest.me | |✓| | |
| **755<br/>ナナゴーゴー** | |✓|✓| |
| **niconico<br/>ニコニコ動画** | |✓| | |
| **163<br/>网易视频<br/>网易云音乐** | |✓| |✓|
| 56网 | |✓| | |
| **AcFun** | |✓| | |
| **Baidu<br/>百度贴吧** | |✓|✓| |
| 爆米花网 | |✓| | |
| **bilibili<br/>哔哩哔哩** | |✓|✓|✓|
| 豆瓣 | |✓| |✓|
| 斗鱼 | |✓| | |
| 凤凰视频 | |✓| | |
| 风行网 | |✓| | |
| iQIYI<br/>爱奇艺 | |✓| | |
| 激动网 | |✓| | |
| 酷6网 | |✓| | |
| 酷狗音乐 | | | |✓|
| 酷我音乐 | | | |✓|
| 乐视网 | |✓| | |
| 荔枝FM | | | |✓|
| 懒人听书 | | | |✓|
| 秒拍 | |✓| | |
| MioMio弹幕网 | |✓| | |
| MissEvan<br/>猫耳FM | | | |✓|
| 痞客邦 | |✓| | |
| PPTV聚力 | |✓| | |
| 齐鲁网 | |✓| | |
| QQ<br/>腾讯视频 | |✓| | |
| 企鹅直播 | |✓| | |
| Sina<br/>新浪视频<br/>微博秒拍视频 | |✓| | |
| Sohu<br/>搜狐视频 | |✓| | |
| **Tudou<br/>土豆** | |✓| | |
| 阳光卫视 | |✓| | |
| **Youku<br/>优酷** | |✓| | |
| 战旗TV | |✓| | |
| 央视网 | |✓| | |
| Naver<br/>네이버 | |✓| | |
| 芒果TV | |✓| | |
| 火猫TV | |✓| | |
| 阳光宽频网 | |✓| | |
| 西瓜视频 | |✓| | |
| 新片场 | |✓| | |
| 快手 | |✓|✓| |
| 抖音 | |✓| | |
| TikTok | |✓| | |
| 中国体育(TV) | |✓| | |
| 知乎 | |✓| | |

For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

### Known bugs

If something is broken and `you-get` can't get you things you want, don't panic. (Yes, this happens all the time!)

Check if it's already a known problem on . If not, follow the guidelines on [how to report an issue](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md).

## Getting Involved

You can reach us on the Gitter channel [#soimort/you-get](https://gitter.im/soimort/you-get) (here's how you [set up your IRC client](https://irc.gitter.im) for Gitter). If you have a quick question regarding `you-get`, ask it there.

If you are seeking to report an issue or contribute, please make sure to read [the guidelines](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) first.

## Legal Issues

This software is distributed under the [MIT license](https://raw.github.com/soimort/you-get/master/LICENSE.txt).

In particular, please be aware that

> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Translated to human words:

*In case your use of the software forms the basis of copyright infringement, or you use the software for any other illegal purposes, the authors cannot take any responsibility for you.*

We only ship the code here, and how you are going to use it is left to your own discretion.

## Authors

Made by [@soimort](https://github.com/soimort), who is in turn powered by :coffee:, :beer: and :ramen:.

You can find the [list of all contributors](https://github.com/soimort/you-get/graphs/contributors) here.

================================================
FILE: README.rst
================================================
You-Get
=======

|PyPI version| |Build Status| |Gitter|

`You-Get `__ is a tiny command-line utility to download media contents (videos, audios, images) from the Web, in case there is no other handy way to do it.

Here's how you use ``you-get`` to download a video from `this web page `__:

.. code:: console

    $ you-get http://www.fsf.org/blogs/rms/20140407-geneva-tedx-talk-free-software-free-society
    Site:  fsf.org
    Title: TEDxGE2014_Stallman05_LQ
    Type:  WebM video (video/webm)
    Size:  27.12 MiB (28435804 Bytes)

    Downloading TEDxGE2014_Stallman05_LQ.webm ...
    100.0% ( 27.1/27.1 MB) ├████████████████████████████████████████┤[1/1]   12 MB/s

And here's why you might want to use it:

- You enjoyed something on the Internet, and just want to download it for your own pleasure.
- You watch your favorite videos online from your computer, but you are prohibited from saving them. You feel that you have no control over your own computer. (And it's not how an open Web is supposed to work.)
- You want to get rid of any closed-source technology or proprietary JavaScript code, and disallow things like Flash running on your computer.
- You are an adherent of hacker culture and free software.

What ``you-get`` can do for you:

- Download videos / audios from popular websites such as YouTube, Youku, Niconico, and a bunch more.
(See the `full list of supported sites <#supported-sites>`__) - Stream an online video in your media player. No web browser, no more ads. - Download images (of interest) by scraping a web page. - Download arbitrary non-HTML contents, i.e., binary files. Interested? `Install it <#installation>`__ now and `get started by examples <#getting-started>`__. Are you a Python programmer? Then check out `the source `__ and fork it! .. |PyPI version| image:: https://badge.fury.io/py/you-get.png :target: http://badge.fury.io/py/you-get .. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg :target: https://github.com/soimort/you-get/actions .. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge ================================================ FILE: SECURITY.md ================================================ # Security Policy ## Reporting a Vulnerability Please report security issues to . ================================================ FILE: contrib/completion/you-get-completion.bash ================================================ # Bash completion definition for you-get. _you-get () { COMPREPLY=() local IFS=$' \n' local cur=$2 prev=$3 local -a opts_without_arg opts_with_arg opts_without_arg=( -V --version -h --help -i --info -u --url --json -n --no-merge --no-caption -f --force --no-proxy -d --debug ) opts_with_arg=( -F --format -O --output-filename -o --output-dir -p --player -c --cookies -x --http-proxy -y --extractor-proxy -t --timeout ) # Do not complete non option names [[ $cur == -* ]] || return 1 # Do not complete when the previous arg is an option expecting an argument for opt in "${opts_with_arg[@]}"; do [[ $opt == $prev ]] && return 1 done # Complete option names COMPREPLY=( $(compgen -W "${opts_without_arg[*]} ${opts_with_arg[*]}" \ -- "$cur") ) return 0 } complete -F _you-get you-get ================================================ FILE: contrib/completion/you-get.fish ================================================ # Fish completion definition for you-get. 
complete -c you-get -s V -l version -d 'print version and exit' complete -c you-get -s h -l help -d 'print help and exit' complete -c you-get -s i -l info -d 'print extracted information' complete -c you-get -s u -l url -d 'print extracted information' complete -c you-get -l json -d 'print extracted URLs in JSON format' complete -c you-get -s n -l no-merge -d 'do not merge video parts' complete -c you-get -l no-caption -d 'do not download captions' complete -c you-get -s f -l force -d 'force overwrite existing files' complete -c you-get -s F -l format -x -d 'set video format to the specified stream id' complete -c you-get -s O -l output-filename -d 'set output filename' \ -x -a '(__fish_complete_path (commandline -ct) "output filename")' complete -c you-get -s o -l output-dir -d 'set output directory' \ -x -a '(__fish_complete_directories (commandline -ct) "output directory")' complete -c you-get -s p -l player -x -d 'stream extracted URL to the specified player' complete -c you-get -s c -l cookies -d 'load cookies.txt or cookies.sqlite' \ -x -a '(__fish_complete_path (commandline -ct) "cookies.txt or cookies.sqlite")' complete -c you-get -s x -l http-proxy -x -d 'use the specified HTTP proxy for downloading' complete -c you-get -s y -l extractor-proxy -x -d 'use the specified HTTP proxy for extraction only' complete -c you-get -l no-proxy -d 'do not use a proxy' complete -c you-get -s t -l timeout -x -d 'set socket timeout' complete -c you-get -s d -l debug -d 'show traceback and other debug info' ================================================ FILE: setup.cfg ================================================ [build] force = 0 [global] verbose = 0 [egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 ================================================ FILE: setup.py ================================================ #!/usr/bin/env python3 PROJ_NAME = 'you-get' PACKAGE_NAME = 'you_get' PROJ_METADATA = '%s.json' % PROJ_NAME import importlib.util import importlib.machinery def load_source(modname, filename): loader = importlib.machinery.SourceFileLoader(modname, filename) spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) module = importlib.util.module_from_spec(spec) # The module is always executed and not cached in sys.modules. # Uncomment the following line to cache the module. 
# sys.modules[module.__name__] = module loader.exec_module(module) return module import os, json here = os.path.abspath(os.path.dirname(__file__)) proj_info = json.loads(open(os.path.join(here, PROJ_METADATA), encoding='utf-8').read()) try: README = open(os.path.join(here, 'README.rst'), encoding='utf-8').read() except: README = "" CHANGELOG = open(os.path.join(here, 'CHANGELOG.rst'), encoding='utf-8').read() VERSION = load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__ from setuptools import setup, find_packages setup( name = proj_info['name'], version = VERSION, author = proj_info['author'], author_email = proj_info['author_email'], url = proj_info['url'], license = proj_info['license'], description = proj_info['description'], keywords = proj_info['keywords'], long_description = README, packages = find_packages('src'), package_dir = {'' : 'src'}, test_suite = 'tests', platforms = 'any', zip_safe = True, include_package_data = True, classifiers = proj_info['classifiers'], entry_points = {'console_scripts': proj_info['console_scripts']}, install_requires = ['dukpy'], extras_require = { 'socks': ['PySocks'], } ) ================================================ FILE: src/you_get/cli_wrapper/player/dragonplayer.py ================================================ ================================================ FILE: src/you_get/cli_wrapper/player/gnome_mplayer.py ================================================ ================================================ FILE: src/you_get/cli_wrapper/player/mplayer.py ================================================ ================================================ FILE: src/you_get/cli_wrapper/player/vlc.py ================================================ #!/usr/bin/env python ================================================ FILE: src/you_get/cli_wrapper/player/wmp.py ================================================ ================================================ FILE: src/you_get/cli_wrapper/transcoder/ffmpeg.py ================================================ ================================================ FILE: src/you_get/cli_wrapper/transcoder/libav.py ================================================ ================================================ FILE: src/you_get/cli_wrapper/transcoder/mencoder.py ================================================ ================================================ FILE: src/you_get/common.py ================================================ #!/usr/bin/env python import io import os import re import sys import time import json import socket import locale import logging import argparse import ssl from http import cookiejar from importlib import import_module from urllib import request, parse, error from .version import __version__ from .util import log, term from .util.git import get_version from .util.strings import get_filename, unescape_html from . 
import json_output as json_output_ sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8') SITES = { '163' : 'netease', '56' : 'w56', '365yg' : 'toutiao', 'acfun' : 'acfun', 'archive' : 'archive', 'baidu' : 'baidu', 'bandcamp' : 'bandcamp', 'baomihua' : 'baomihua', 'bigthink' : 'bigthink', 'bilibili' : 'bilibili', 'cctv' : 'cntv', 'cntv' : 'cntv', 'cbs' : 'cbs', 'coub' : 'coub', 'dailymotion' : 'dailymotion', 'douban' : 'douban', 'douyin' : 'douyin', 'douyu' : 'douyutv', 'ehow' : 'ehow', 'facebook' : 'facebook', 'fc2' : 'fc2video', 'flickr' : 'flickr', 'freesound' : 'freesound', 'fun' : 'funshion', 'google' : 'google', 'giphy' : 'giphy', 'heavy-music' : 'heavymusic', 'huomao' : 'huomaotv', 'iask' : 'sina', 'icourses' : 'icourses', 'ifeng' : 'ifeng', 'imgur' : 'imgur', 'in' : 'alive', 'infoq' : 'infoq', 'instagram' : 'instagram', 'interest' : 'interest', 'iqilu' : 'iqilu', 'iqiyi' : 'iqiyi', 'ixigua' : 'ixigua', 'isuntv' : 'suntv', 'iwara' : 'iwara', 'joy' : 'joy', 'kankanews' : 'bilibili', 'kakao' : 'kakao', 'khanacademy' : 'khan', 'ku6' : 'ku6', 'kuaishou' : 'kuaishou', 'kugou' : 'kugou', 'kuwo' : 'kuwo', 'le' : 'le', 'letv' : 'le', 'lizhi' : 'lizhi', 'longzhu' : 'longzhu', 'lrts' : 'lrts', 'magisto' : 'magisto', 'metacafe' : 'metacafe', 'mgtv' : 'mgtv', 'miomio' : 'miomio', 'missevan' : 'missevan', 'mixcloud' : 'mixcloud', 'mtv81' : 'mtv81', 'miaopai' : 'yixia', 'naver' : 'naver', '7gogo' : 'nanagogo', 'nicovideo' : 'nicovideo', 'pinterest' : 'pinterest', 'pixnet' : 'pixnet', 'pptv' : 'pptv', 'qingting' : 'qingting', 'qq' : 'qq', 'showroom-live' : 'showroom', 'sina' : 'sina', 'smgbb' : 'bilibili', 'sohu' : 'sohu', 'soundcloud' : 'soundcloud', 'ted' : 'ted', 'theplatform' : 'theplatform', 'tiktok' : 'tiktok', 'tucao' : 'tucao', 'tudou' : 'tudou', 'tumblr' : 'tumblr', 'twimg' : 'twitter', 'twitter' : 'twitter', 'ucas' : 'ucas', 'vimeo' : 'vimeo', 'wanmen' : 'wanmen', 'weibo' : 'miaopai', 'veoh' : 'veoh', 'vk' : 'vk', 'x' : 'twitter', 'xiaokaxiu' : 'yixia', 'xiaojiadianvideo' : 'fc2video', 'ximalaya' : 'ximalaya', 'xinpianchang' : 'xinpianchang', 'yizhibo' : 'yizhibo', 'youku' : 'youku', 'youtu' : 'youtube', 'youtube' : 'youtube', 'zhanqi' : 'zhanqi', 'zhibo' : 'zhibo', 'zhihu' : 'zhihu', } dry_run = False json_output = False force = False skip_existing_file_size_check = False player = None extractor_proxy = None cookies = None output_filename = None auto_rename = False insecure = False m3u8 = False postfix = False prefix = None fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/126.0.2592.113' # Latest Edge } if sys.stdout.isatty(): default_encoding = sys.stdout.encoding.lower() else: default_encoding = locale.getpreferredencoding().lower() def rc4(key, data): # all encryption algo should work on bytes assert type(key) == type(data) and type(key) == type(b'') state = list(range(256)) j = 0 for i in range(256): j += state[i] + key[i % len(key)] j &= 0xff state[i], state[j] = state[j], state[i] i = 0 j = 0 out_list = [] for char in data: i += 1 i &= 0xff j += state[i] j &= 0xff state[i], state[j] = state[j], state[i] prn = state[(state[i] + state[j]) & 0xff] out_list.append(char ^ prn) return bytes(out_list) def general_m3u8_extractor(url, headers={}): m3u8_list = get_content(url, 
headers=headers).split('\n') urls = [] for line in m3u8_list: line = line.strip() if line and not line.startswith('#'): if line.startswith('http'): urls.append(line) else: seg_url = parse.urljoin(url, line) urls.append(seg_url) return urls def maybe_print(*s): try: print(*s) except: pass def tr(s): if default_encoding == 'utf-8': return s else: return s # return str(s.encode('utf-8'))[2:-1] # DEPRECATED in favor of match1() def r1(pattern, text): m = re.search(pattern, text) if m: return m.group(1) # DEPRECATED in favor of match1() def r1_of(patterns, text): for p in patterns: x = r1(p, text) if x: return x def match1(text, *patterns): """Scans through a string for substrings matched some patterns (first-subgroups only). Args: text: A string to be scanned. patterns: Arbitrary number of regex patterns. Returns: When only one pattern is given, returns a string (None if no match found). When more than one pattern are given, returns a list of strings ([] if no match found). """ if len(patterns) == 1: pattern = patterns[0] match = re.search(pattern, text) if match: return match.group(1) else: return None else: ret = [] for pattern in patterns: match = re.search(pattern, text) if match: ret.append(match.group(1)) return ret def matchall(text, patterns): """Scans through a string for substrings matched some patterns. Args: text: A string to be scanned. patterns: a list of regex pattern. Returns: a list if matched. empty if not. """ ret = [] for pattern in patterns: match = re.findall(pattern, text) ret += match return ret def launch_player(player, urls): import subprocess import shlex urls = list(urls) for url in urls.copy(): if type(url) is list: urls.extend(url) urls = [url for url in urls if type(url) is str] assert urls if (sys.version_info >= (3, 3)): import shutil exefile=shlex.split(player)[0] if shutil.which(exefile) is not None: subprocess.call(shlex.split(player) + urls) else: log.wtf('[Failed] Cannot find player "%s"' % exefile) else: subprocess.call(shlex.split(player) + urls) def parse_query_param(url, param): """Parses the query string of a URL and returns the value of a parameter. Args: url: A URL. param: A string representing the name of the parameter. Returns: The value of the parameter. """ try: return parse.parse_qs(parse.urlparse(url).query)[param][0] except: return None def unicodize(text): return re.sub( r'\\u([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])', lambda x: chr(int(x.group(0)[2:], 16)), text ) # DEPRECATED in favor of util.legitimize() def escape_file_path(path): path = path.replace('/', '-') path = path.replace('\\', '-') path = path.replace('*', '-') path = path.replace('?', '-') return path def ungzip(data): """Decompresses data for Content-Encoding: gzip. """ from io import BytesIO import gzip buffer = BytesIO(data) f = gzip.GzipFile(fileobj=buffer) return f.read() def undeflate(data): """Decompresses data for Content-Encoding: deflate. (the zlib compression is used.) 
""" import zlib decompressobj = zlib.decompressobj(-zlib.MAX_WBITS) return decompressobj.decompress(data)+decompressobj.flush() # an http.client implementation of get_content() # because urllib does not support "Connection: keep-alive" def getHttps(host, url, headers, debuglevel=0): import http.client conn = http.client.HTTPSConnection(host) conn.set_debuglevel(debuglevel) conn.request("GET", url, headers=headers) resp = conn.getresponse() logging.debug('getHttps: %s' % resp.getheaders()) set_cookie = resp.getheader('set-cookie') data = resp.read() try: data = ungzip(data) # gzip data = undeflate(data) # deflate except: pass conn.close() return str(data, encoding='utf-8'), set_cookie # TODO: support raw data # DEPRECATED in favor of get_content() def get_response(url, faker=False): logging.debug('get_response: %s' % url) ctx = None if insecure: # ignore ssl errors ctx = ssl.create_default_context() ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) request.install_opener(opener) if faker: response = request.urlopen( request.Request(url, headers=fake_headers), None, context=ctx, ) else: response = request.urlopen(url, context=ctx) data = response.read() if response.info().get('Content-Encoding') == 'gzip': data = ungzip(data) elif response.info().get('Content-Encoding') == 'deflate': data = undeflate(data) response.data = data return response # DEPRECATED in favor of get_content() def get_html(url, encoding=None, faker=False): content = get_response(url, faker).data return str(content, 'utf-8', 'ignore') # DEPRECATED in favor of get_content() def get_decoded_html(url, faker=False): response = get_response(url, faker) data = response.data charset = r1(r'charset=([\w-]+)', response.headers['content-type']) if charset: return data.decode(charset, 'ignore') else: return data def get_location(url, headers=None, get_method='HEAD'): logging.debug('get_location: %s' % url) if headers: req = request.Request(url, headers=headers) else: req = request.Request(url) req.get_method = lambda: get_method res = urlopen_with_retry(req) return res.geturl() def urlopen_with_retry(*args, **kwargs): retry_time = 3 for i in range(retry_time): try: if insecure: # ignore ssl errors ctx = ssl.create_default_context() ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE return request.urlopen(*args, context=ctx, **kwargs) else: return request.urlopen(*args, **kwargs) except socket.timeout as e: logging.debug('request attempt %s timeout' % str(i + 1)) if i + 1 == retry_time: raise e # try to tackle youku CDN fails except error.HTTPError as http_error: logging.debug('HTTP Error with code{}'.format(http_error.code)) if i + 1 == retry_time: raise http_error def get_content(url, headers={}, decoded=True): """Gets the content of a URL via sending a HTTP GET request. Args: url: A URL. headers: Request headers used by the client. decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type. Returns: The content as a string. 
""" logging.debug('get_content: %s' % url) req = request.Request(url, headers=headers) if cookies: # NOTE: Do not use cookies.add_cookie_header(req) # #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10 # See also: # - https://github.com/python/cpython/pull/17471 # - https://bugs.python.org/issue2190 # Here we add cookies to the request headers manually cookie_strings = [] for cookie in list(cookies): cookie_strings.append(cookie.name + '=' + cookie.value) cookie_headers = {'Cookie': '; '.join(cookie_strings)} req.headers.update(cookie_headers) response = urlopen_with_retry(req) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) content_encoding = response.getheader('Content-Encoding') if content_encoding == 'gzip': data = ungzip(data) elif content_encoding == 'deflate': data = undeflate(data) # Decode the response body if decoded: charset = match1( response.getheader('Content-Type', ''), r'charset=([\w-]+)' ) if charset is not None: data = data.decode(charset, 'ignore') else: data = data.decode('utf-8', 'ignore') return data def post_content(url, headers={}, post_data={}, decoded=True, **kwargs): """Post the content of a URL via sending a HTTP POST request. Args: url: A URL. headers: Request headers used by the client. decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type. Returns: The content as a string. """ if kwargs.get('post_data_raw'): logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw'])) else: logging.debug('post_content: %s\npost_data: %s' % (url, post_data)) req = request.Request(url, headers=headers) if cookies: # NOTE: Do not use cookies.add_cookie_header(req) # #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10 # See also: # - https://github.com/python/cpython/pull/17471 # - https://bugs.python.org/issue2190 # Here we add cookies to the request headers manually cookie_strings = [] for cookie in list(cookies): cookie_strings.append(cookie.name + '=' + cookie.value) cookie_headers = {'Cookie': '; '.join(cookie_strings)} req.headers.update(cookie_headers) if kwargs.get('post_data_raw'): post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8') else: post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') response = urlopen_with_retry(req, data=post_data_enc) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) content_encoding = response.getheader('Content-Encoding') if content_encoding == 'gzip': data = ungzip(data) elif content_encoding == 'deflate': data = undeflate(data) # Decode the response body if decoded: charset = match1( response.getheader('Content-Type'), r'charset=([\w-]+)' ) if charset is not None: data = data.decode(charset) else: data = data.decode('utf-8') return data def url_size(url, faker=False, headers={}): if faker: response = urlopen_with_retry( request.Request(url, headers=fake_headers) ) elif headers: response = urlopen_with_retry(request.Request(url, headers=headers)) else: response = urlopen_with_retry(url) size = response.headers['content-length'] return int(size) if size is not None else float('inf') def urls_size(urls, faker=False, headers={}): return sum([url_size(url, faker=faker, headers=headers) for url in urls]) def get_head(url, headers=None, get_method='HEAD'): logging.debug('get_head: %s' % url) if headers: req = request.Request(url, headers=headers) else: req = request.Request(url) req.get_method = lambda: get_method res = 
urlopen_with_retry(req) return res.headers def url_info(url, faker=False, headers={}): logging.debug('url_info: %s' % url) if faker: response = urlopen_with_retry( request.Request(url, headers=fake_headers) ) elif headers: response = urlopen_with_retry(request.Request(url, headers=headers)) else: response = urlopen_with_retry(request.Request(url)) headers = response.headers type = headers['content-type'] if type == 'image/jpg; charset=UTF-8' or type == 'image/jpg': type = 'audio/mpeg' # fix for netease mapping = { 'video/3gpp': '3gp', 'video/f4v': 'flv', 'video/mp4': 'mp4', 'video/MP2T': 'ts', 'video/quicktime': 'mov', 'video/webm': 'webm', 'video/x-flv': 'flv', 'video/x-ms-asf': 'asf', 'audio/mp4': 'mp4', 'audio/mpeg': 'mp3', 'audio/wav': 'wav', 'audio/x-wav': 'wav', 'audio/wave': 'wav', 'image/jpeg': 'jpg', 'image/png': 'png', 'image/gif': 'gif', 'application/pdf': 'pdf', } if type in mapping: ext = mapping[type] else: type = None if headers['content-disposition']: try: filename = parse.unquote( r1(r'filename="?([^"]+)"?', headers['content-disposition']) ) if len(filename.split('.')) > 1: ext = filename.split('.')[-1] else: ext = None except: ext = None else: ext = None if headers['transfer-encoding'] != 'chunked': size = headers['content-length'] and int(headers['content-length']) else: size = None return type, ext, size def url_locations(urls, faker=False, headers={}): locations = [] for url in urls: logging.debug('url_locations: %s' % url) if faker: response = urlopen_with_retry( request.Request(url, headers=fake_headers) ) elif headers: response = urlopen_with_retry( request.Request(url, headers=headers) ) else: response = urlopen_with_retry(request.Request(url)) locations.append(response.url) return locations def url_save( url, filepath, bar, refer=None, is_part=False, faker=False, headers=None, timeout=None, **kwargs ): tmp_headers = headers.copy() if headers is not None else {} # When a referer specified with param refer, # the key must be 'Referer' for the hack here if refer is not None: tmp_headers['Referer'] = refer if type(url) is list: chunk_sizes = [url_size(url, faker=faker, headers=tmp_headers) for url in url] file_size = sum(chunk_sizes) is_chunked, urls = True, url else: file_size = url_size(url, faker=faker, headers=tmp_headers) chunk_sizes = [file_size] is_chunked, urls = False, [url] continue_renameing = True while continue_renameing: continue_renameing = False if os.path.exists(filepath): if not force and (file_size == os.path.getsize(filepath) or skip_existing_file_size_check): if not is_part: if bar: bar.done() if skip_existing_file_size_check: log.w( 'Skipping {} without checking size: file already exists'.format( tr(os.path.basename(filepath)) ) ) else: log.w( 'Skipping {}: file already exists'.format( tr(os.path.basename(filepath)) ) ) else: if bar: bar.update_received(file_size) return else: if not is_part: if bar: bar.done() if not force and auto_rename: path, ext = os.path.basename(filepath).rsplit('.', 1) finder = re.compile(r' \([1-9]\d*?\)$') if (finder.search(path) is None): thisfile = path + ' (1).' + ext else: def numreturn(a): return ' (' + str(int(a.group()[2:-1]) + 1) + ').' thisfile = finder.sub(numreturn, path) + ext filepath = os.path.join(os.path.dirname(filepath), thisfile) print('Changing name to %s' % tr(os.path.basename(filepath)), '...') continue_renameing = True continue if log.yes_or_no('File with this name already exists. Overwrite?'): log.w('Overwriting %s ...' 
% tr(os.path.basename(filepath))) else: return elif not os.path.exists(os.path.dirname(filepath)): os.mkdir(os.path.dirname(filepath)) temp_filepath = filepath + '.download' if file_size != float('inf') \ else filepath received = 0 if not force: open_mode = 'ab' if os.path.exists(temp_filepath): received += os.path.getsize(temp_filepath) if bar: bar.update_received(os.path.getsize(temp_filepath)) else: open_mode = 'wb' chunk_start = 0 chunk_end = 0 for i, url in enumerate(urls): received_chunk = 0 chunk_start += 0 if i == 0 else chunk_sizes[i - 1] chunk_end += chunk_sizes[i] if received < file_size and received < chunk_end: if faker: tmp_headers = fake_headers ''' if parameter headers passed in, we have it copied as tmp_header elif headers: headers = headers else: headers = {} ''' if received: # chunk_start will always be 0 if not chunked tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-' if refer: tmp_headers['Referer'] = refer if timeout: response = urlopen_with_retry( request.Request(url, headers=tmp_headers), timeout=timeout ) else: response = urlopen_with_retry( request.Request(url, headers=tmp_headers) ) try: range_start = int( response.headers[ 'content-range' ][6:].split('/')[0].split('-')[0] ) end_length = int( response.headers['content-range'][6:].split('/')[1] ) range_length = end_length - range_start except: content_length = response.headers['content-length'] range_length = int(content_length) if content_length is not None \ else float('inf') if is_chunked: # always append if chunked open_mode = 'ab' elif file_size != received + range_length: # is it ever necessary? received = 0 if bar: bar.received = 0 open_mode = 'wb' with open(temp_filepath, open_mode) as output: while True: buffer = None try: buffer = response.read(1024 * 256) except socket.timeout: pass if not buffer: if file_size == float('+inf'): # Prevent infinite downloading break if is_chunked and received_chunk == range_length: break elif not is_chunked and received == file_size: # Download finished break # Unexpected termination. 
Retry request tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-' response = urlopen_with_retry( request.Request(url, headers=tmp_headers) ) continue output.write(buffer) received += len(buffer) received_chunk += len(buffer) if bar: bar.update_received(len(buffer)) assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % ( received, os.path.getsize(temp_filepath), temp_filepath ) if os.access(filepath, os.W_OK) and file_size != float('inf'): # on Windows rename could fail if destination filepath exists # we should simply choose a new name instead of brutal os.remove(filepath) filepath = filepath + " (2)" os.rename(temp_filepath, filepath) class SimpleProgressBar: term_size = term.get_terminal_size()[1] def __init__(self, total_size, total_pieces=1): self.displayed = False self.total_size = total_size self.total_pieces = total_pieces self.current_piece = 1 self.received = 0 self.speed = '' self.last_updated = time.time() total_pieces_len = len(str(total_pieces)) # 38 is the size of all statically known size in self.bar total_str = '%5s' % round(self.total_size / 1048576, 1) total_str_width = max(len(total_str), 5) self.bar_size = self.term_size - 28 - 2 * total_pieces_len \ - 2 * total_str_width self.bar = '{:>4}%% ({:>%s}/%sMB) ├{:─<%s}┤[{:>%s}/{:>%s}] {}' % ( total_str_width, total_str, self.bar_size, total_pieces_len, total_pieces_len ) def update(self): self.displayed = True bar_size = self.bar_size percent = round(self.received * 100 / self.total_size, 1) if percent >= 100: percent = 100 dots = bar_size * int(percent) // 100 plus = int(percent) - dots // bar_size * 100 if plus > 0.8: plus = '█' elif plus > 0.4: plus = '>' else: plus = '' bar = '█' * dots + plus bar = self.bar.format( percent, round(self.received / 1048576, 1), bar, self.current_piece, self.total_pieces, self.speed ) sys.stdout.write('\r' + bar) sys.stdout.flush() def update_received(self, n): self.received += n time_diff = time.time() - self.last_updated bytes_ps = n / time_diff if time_diff else 0 if bytes_ps >= 1024 ** 3: self.speed = '{:4.0f} GB/s'.format(bytes_ps / 1024 ** 3) elif bytes_ps >= 1024 ** 2: self.speed = '{:4.0f} MB/s'.format(bytes_ps / 1024 ** 2) elif bytes_ps >= 1024: self.speed = '{:4.0f} kB/s'.format(bytes_ps / 1024) else: self.speed = '{:4.0f} B/s'.format(bytes_ps) self.last_updated = time.time() self.update() def update_piece(self, n): self.current_piece = n def done(self): if self.displayed: print() self.displayed = False class PiecesProgressBar: def __init__(self, total_size, total_pieces=1): self.displayed = False self.total_size = total_size self.total_pieces = total_pieces self.current_piece = 1 self.received = 0 def update(self): self.displayed = True bar = '{0:>5}%[{1:<40}] {2}/{3}'.format( '', '=' * 40, self.current_piece, self.total_pieces ) sys.stdout.write('\r' + bar) sys.stdout.flush() def update_received(self, n): self.received += n self.update() def update_piece(self, n): self.current_piece = n def done(self): if self.displayed: print() self.displayed = False class DummyProgressBar: def __init__(self, *args): pass def update_received(self, n): pass def update_piece(self, n): pass def done(self): pass def get_output_filename(urls, title, ext, output_dir, merge, **kwargs): # lame hack for the --output-filename option global output_filename if output_filename: result = output_filename if kwargs.get('part', -1) >= 0: result = '%s[%02d]' % (result, kwargs.get('part')) if ext: result = '%s.%s' % (result, ext) return result merged_ext = ext if (len(urls) > 1) 
and merge: from .processor.ffmpeg import has_ffmpeg_installed if ext in ['flv', 'f4v']: if has_ffmpeg_installed(): merged_ext = 'mp4' else: merged_ext = 'flv' elif ext == 'mp4': merged_ext = 'mp4' elif ext == 'ts': if has_ffmpeg_installed(): merged_ext = 'mkv' else: merged_ext = 'ts' result = title if kwargs.get('part', -1) >= 0: result = '%s[%02d]' % (result, kwargs.get('part')) result = '%s.%s' % (result, merged_ext) return result.replace("'", "_") def print_user_agent(faker=False): urllib_default_user_agent = 'Python-urllib/%d.%d' % sys.version_info[:2] user_agent = fake_headers['User-Agent'] if faker else urllib_default_user_agent print('User Agent: %s' % user_agent) def download_urls( urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers={}, **kwargs ): assert urls if json_output: json_output_.download_urls( urls=urls, title=title, ext=ext, total_size=total_size, refer=refer ) return if dry_run: print_user_agent(faker=faker) try: print('Real URLs:\n%s' % '\n'.join(urls)) except: print('Real URLs:\n%s' % '\n'.join([j for i in urls for j in i])) return if player: launch_player(player, urls) return if not total_size: try: total_size = urls_size(urls, faker=faker, headers=headers) except: import traceback traceback.print_exc(file=sys.stdout) pass title = tr(get_filename(title)) if postfix and 'vid' in kwargs: title = "%s [%s]" % (title, kwargs['vid']) if prefix is not None: title = "[%s] %s" % (prefix, title) output_filename = get_output_filename(urls, title, ext, output_dir, merge) output_filepath = os.path.join(output_dir, output_filename) if total_size: if not force and os.path.exists(output_filepath) and not auto_rename\ and (os.path.getsize(output_filepath) >= total_size * 0.9\ or skip_existing_file_size_check): if skip_existing_file_size_check: log.w('Skipping %s without checking size: file already exists' % output_filepath) else: log.w('Skipping %s: file already exists' % output_filepath) print() return bar = SimpleProgressBar(total_size, len(urls)) else: bar = PiecesProgressBar(total_size, len(urls)) if len(urls) == 1: url = urls[0] print('Downloading %s ...' % tr(output_filename)) bar.update() url_save( url, output_filepath, bar, refer=refer, faker=faker, headers=headers, **kwargs ) bar.done() else: parts = [] print('Downloading %s ...' % tr(output_filename)) bar.update() for i, url in enumerate(urls): output_filename_i = get_output_filename(urls, title, ext, output_dir, merge, part=i) output_filepath_i = os.path.join(output_dir, output_filename_i) parts.append(output_filepath_i) # print 'Downloading %s [%s/%s]...' 
% (tr(filename), i + 1, len(urls)) bar.update_piece(i + 1) url_save( url, output_filepath_i, bar, refer=refer, is_part=True, faker=faker, headers=headers, **kwargs ) bar.done() if not merge: print() return if 'av' in kwargs and kwargs['av']: from .processor.ffmpeg import has_ffmpeg_installed if has_ffmpeg_installed(): from .processor.ffmpeg import ffmpeg_concat_av ret = ffmpeg_concat_av(parts, output_filepath, ext) print('Merged into %s' % output_filename) if ret == 0: for part in parts: os.remove(part) elif ext in ['flv', 'f4v']: try: from .processor.ffmpeg import has_ffmpeg_installed if has_ffmpeg_installed(): from .processor.ffmpeg import ffmpeg_concat_flv_to_mp4 ffmpeg_concat_flv_to_mp4(parts, output_filepath) else: from .processor.join_flv import concat_flv concat_flv(parts, output_filepath) print('Merged into %s' % output_filename) except: raise else: for part in parts: os.remove(part) elif ext == 'mp4': try: from .processor.ffmpeg import has_ffmpeg_installed if has_ffmpeg_installed(): from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4 ffmpeg_concat_mp4_to_mp4(parts, output_filepath) else: from .processor.join_mp4 import concat_mp4 concat_mp4(parts, output_filepath) print('Merged into %s' % output_filename) except: raise else: for part in parts: os.remove(part) elif ext == 'ts': try: from .processor.ffmpeg import has_ffmpeg_installed if has_ffmpeg_installed(): from .processor.ffmpeg import ffmpeg_concat_ts_to_mkv ffmpeg_concat_ts_to_mkv(parts, output_filepath) else: from .processor.join_ts import concat_ts concat_ts(parts, output_filepath) print('Merged into %s' % output_filename) except: raise else: for part in parts: os.remove(part) elif ext == 'mp3': try: from .processor.ffmpeg import has_ffmpeg_installed assert has_ffmpeg_installed() from .processor.ffmpeg import ffmpeg_concat_mp3_to_mp3 ffmpeg_concat_mp3_to_mp3(parts, output_filepath) print('Merged into %s' % output_filename) except: raise else: for part in parts: os.remove(part) else: print("Can't merge %s files" % ext) print() def download_rtmp_url( url, title, ext, params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False ): assert url if dry_run: print_user_agent(faker=faker) print('Real URL:\n%s\n' % [url]) if params.get('-y', False): # None or unset -> False print('Real Playpath:\n%s\n' % [params.get('-y')]) return if player: from .processor.rtmpdump import play_rtmpdump_stream play_rtmpdump_stream(player, url, params) return from .processor.rtmpdump import ( has_rtmpdump_installed, download_rtmpdump_stream ) assert has_rtmpdump_installed(), 'RTMPDump not installed.' download_rtmpdump_stream(url, title, ext, params, output_dir) def download_url_ffmpeg( url, title, ext, params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False, stream=True ): assert url if dry_run: print_user_agent(faker=faker) print('Real URL:\n%s\n' % [url]) if params.get('-y', False): # None or unset ->False print('Real Playpath:\n%s\n' % [params.get('-y')]) return if player: launch_player(player, [url]) return from .processor.ffmpeg import has_ffmpeg_installed, ffmpeg_download_stream assert has_ffmpeg_installed(), 'FFmpeg not installed.' 
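    # (Editor's annotation; not part of the original source.) The block below
    # implements the --output-filename override for the ffmpeg download path:
    # the user-supplied name is split at its last dot, so an explicit extension
    # there takes precedence over the extractor-supplied ext. Hypothetical
    # examples of the intended behavior:
    #
    #     output_filename = 'my_talk.mkv'  ->  title 'my_talk', ext 'mkv'
    #     output_filename = 'my_talk'      ->  title 'my_talk', ext unchanged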
global output_filename if output_filename: dotPos = output_filename.rfind('.') if dotPos > 0: title = output_filename[:dotPos] ext = output_filename[dotPos+1:] else: title = output_filename title = tr(get_filename(title)) ffmpeg_download_stream(url, title, ext, params, output_dir, stream=stream) def playlist_not_supported(name): def f(*args, **kwargs): raise NotImplementedError('Playlist is not supported for ' + name) return f def print_info(site_info, title, type, size, **kwargs): if json_output: json_output_.print_info( site_info=site_info, title=title, type=type, size=size ) return if type: type = type.lower() if type in ['3gp']: type = 'video/3gpp' elif type in ['asf', 'wmv']: type = 'video/x-ms-asf' elif type in ['flv', 'f4v']: type = 'video/x-flv' elif type in ['mkv']: type = 'video/x-matroska' elif type in ['mp3']: type = 'audio/mpeg' elif type in ['mp4']: type = 'video/mp4' elif type in ['mov']: type = 'video/quicktime' elif type in ['ts']: type = 'video/MP2T' elif type in ['webm']: type = 'video/webm' elif type in ['jpg']: type = 'image/jpeg' elif type in ['png']: type = 'image/png' elif type in ['gif']: type = 'image/gif' if type in ['video/3gpp']: type_info = '3GPP multimedia file (%s)' % type elif type in ['video/x-flv', 'video/f4v']: type_info = 'Flash video (%s)' % type elif type in ['video/mp4', 'video/x-m4v']: type_info = 'MPEG-4 video (%s)' % type elif type in ['video/MP2T']: type_info = 'MPEG-2 transport stream (%s)' % type elif type in ['video/webm']: type_info = 'WebM video (%s)' % type # elif type in ['video/ogg']: # type_info = 'Ogg video (%s)' % type elif type in ['video/quicktime']: type_info = 'QuickTime video (%s)' % type elif type in ['video/x-matroska']: type_info = 'Matroska video (%s)' % type # elif type in ['video/x-ms-wmv']: # type_info = 'Windows Media video (%s)' % type elif type in ['video/x-ms-asf']: type_info = 'Advanced Systems Format (%s)' % type # elif type in ['video/mpeg']: # type_info = 'MPEG video (%s)' % type elif type in ['audio/mp4', 'audio/m4a']: type_info = 'MPEG-4 audio (%s)' % type elif type in ['audio/mpeg']: type_info = 'MP3 (%s)' % type elif type in ['audio/wav', 'audio/wave', 'audio/x-wav']: type_info = 'Waveform Audio File Format ({})'.format(type) elif type in ['image/jpeg']: type_info = 'JPEG Image (%s)' % type elif type in ['image/png']: type_info = 'Portable Network Graphics (%s)' % type elif type in ['image/gif']: type_info = 'Graphics Interchange Format (%s)' % type elif type in ['m3u8']: if 'm3u8_type' in kwargs: if kwargs['m3u8_type'] == 'master': type_info = 'M3U8 Master {}'.format(type) else: type_info = 'M3U8 Playlist {}'.format(type) else: type_info = 'Unknown type (%s)' % type maybe_print('Site: ', site_info) maybe_print('Title: ', unescape_html(tr(title))) print('Type: ', type_info) if type != 'm3u8': print( 'Size: ', round(size / 1048576, 2), 'MiB (' + str(size) + ' Bytes)' ) if type == 'm3u8' and 'm3u8_url' in kwargs: print('M3U8 Url: {}'.format(kwargs['m3u8_url'])) print() def mime_to_container(mime): mapping = { 'video/3gpp': '3gp', 'video/mp4': 'mp4', 'video/webm': 'webm', 'video/x-flv': 'flv', } if mime in mapping: return mapping[mime] else: return mime.split('/')[1] def parse_host(host): """Parses host name and port number from a string. 
""" if re.match(r'^(\d+)$', host) is not None: return ("0.0.0.0", int(host)) if re.match(r'^(\w+)://', host) is None: host = "//" + host o = parse.urlparse(host) hostname = o.hostname or "0.0.0.0" port = o.port or 0 return (hostname, port) def set_proxy(proxy): proxy_handler = request.ProxyHandler({ 'http': '%s:%s' % proxy, 'https': '%s:%s' % proxy, }) opener = request.build_opener(proxy_handler) request.install_opener(opener) def unset_proxy(): proxy_handler = request.ProxyHandler({}) opener = request.build_opener(proxy_handler) request.install_opener(opener) # DEPRECATED in favor of set_proxy() and unset_proxy() def set_http_proxy(proxy): if proxy is None: # Use system default setting proxy_support = request.ProxyHandler() elif proxy == '': # Don't use any proxy proxy_support = request.ProxyHandler({}) else: # Use proxy proxy_support = request.ProxyHandler( {'http': '%s' % proxy, 'https': '%s' % proxy} ) opener = request.build_opener(proxy_support) request.install_opener(opener) def print_more_compatible(*args, **kwargs): import builtins as __builtin__ """Overload default print function as py (<3.3) does not support 'flush' keyword. Although the function name can be same as print to get itself overloaded automatically, I'd rather leave it with a different name and only overload it when importing to make less confusion. """ # nothing happens on py3.3 and later if sys.version_info[:2] >= (3, 3): return __builtin__.print(*args, **kwargs) # in lower pyver (e.g. 3.2.x), remove 'flush' keyword and flush it as requested doFlush = kwargs.pop('flush', False) ret = __builtin__.print(*args, **kwargs) if doFlush: kwargs.get('file', sys.stdout).flush() return ret def download_main(download, download_playlist, urls, playlist, **kwargs): for url in urls: if re.match(r'https?://', url) is None: url = 'http://' + url if m3u8: if output_filename: title = output_filename else: title = "m3u8file" download_url_ffmpeg(url=url, title=title,ext = 'mp4',output_dir = '.') elif playlist: download_playlist(url, **kwargs) else: download(url, **kwargs) def load_cookies(cookiefile): global cookies if cookiefile.endswith('.txt'): # MozillaCookieJar treats prefix '#HttpOnly_' as comments incorrectly! # do not use its load() # see also: # - https://docs.python.org/3/library/http.cookiejar.html#http.cookiejar.MozillaCookieJar # - https://github.com/python/cpython/blob/4b219ce/Lib/http/cookiejar.py#L2014 # - https://curl.haxx.se/libcurl/c/CURLOPT_COOKIELIST.html#EXAMPLE #cookies = cookiejar.MozillaCookieJar(cookiefile) #cookies.load() from http.cookiejar import Cookie cookies = cookiejar.MozillaCookieJar() now = time.time() ignore_discard, ignore_expires = False, False with open(cookiefile, 'r', encoding='utf-8') as f: for line in f: # last field may be absent, so keep any trailing tab if line.endswith("\n"): line = line[:-1] # skip comments and blank lines XXX what is $ for? if (line.strip().startswith(("#", "$")) or line.strip() == ""): if not line.strip().startswith('#HttpOnly_'): # skip for #HttpOnly_ continue domain, domain_specified, path, secure, expires, name, value = \ line.split("\t") secure = (secure == "TRUE") domain_specified = (domain_specified == "TRUE") if name == "": # cookies.txt regards 'Set-Cookie: foo' as a cookie # with no name, whereas http.cookiejar regards it as a # cookie with no value. 
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                if not line.strip().startswith('#HttpOnly_'):  # skip for #HttpOnly_
                    assert domain_specified == initial_dot

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                cookies.set_cookie(c)

    elif cookiefile.endswith(('.sqlite', '.sqlite3')):
        import sqlite3, shutil, tempfile
        temp_dir = tempfile.gettempdir()
        temp_cookiefile = os.path.join(temp_dir, 'temp_cookiefile.sqlite')
        shutil.copy2(cookiefile, temp_cookiefile)

        cookies = cookiejar.MozillaCookieJar()
        con = sqlite3.connect(temp_cookiefile)
        cur = con.cursor()
        cur.execute("""SELECT host, path, isSecure, expiry, name, value FROM moz_cookies""")
        for item in cur.fetchall():
            c = cookiejar.Cookie(
                0, item[4], item[5], None, False, item[0],
                item[0].startswith('.'), item[0].startswith('.'),
                item[1], False, item[2], item[3], item[3] == '',
                None, None, {},
            )
            cookies.set_cookie(c)
    else:
        log.e('[error] unsupported cookies format')
    # TODO: Chromium Cookies
    # SELECT host_key, path, secure, expires_utc, name, encrypted_value
    # FROM cookies
    # http://n8henrie.com/2013/11/use-chromes-cookies-for-easier-downloading-with-python-requests/


def set_socks_proxy(proxy):
    try:
        import socks
        if '@' in proxy:
            proxy_info = proxy.split("@")
            socks_proxy_addrs = proxy_info[1].split(':')
            socks_proxy_auth = proxy_info[0].split(":")
            socks.set_default_proxy(
                socks.SOCKS5,
                socks_proxy_addrs[0],
                int(socks_proxy_addrs[1]),
                True,
                socks_proxy_auth[0],
                socks_proxy_auth[1]
            )
        else:
            socks_proxy_addrs = proxy.split(':')
            socks.set_default_proxy(
                socks.SOCKS5,
                socks_proxy_addrs[0],
                int(socks_proxy_addrs[1]),
            )
        socket.socket = socks.socksocket

        # Skip local DNS resolution, so that hostnames are passed through
        # to the SOCKS proxy as-is
        def getaddrinfo(*args):
            return [
                (socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))
            ]
        socket.getaddrinfo = getaddrinfo
    except ImportError:
        log.w(
            'Error importing PySocks library, socks proxy ignored. '
            'In order to use a socks proxy, please install PySocks.'
        )


def script_main(download, download_playlist, **kwargs):
    logging.basicConfig(format='[%(levelname)s] %(message)s')

    def print_version():
        version = get_version(
            kwargs['repo_path'] if 'repo_path' in kwargs else __version__
        )
        log.i(
            'version {}, a tiny downloader that scrapes the web.'.format(
                version
            )
        )

    parser = argparse.ArgumentParser(
        prog='you-get',
        usage='you-get [OPTION]... 
URL...', description='A tiny downloader that scrapes the web', add_help=False, ) parser.add_argument( '-V', '--version', action='store_true', help='Print version and exit' ) parser.add_argument( '-h', '--help', action='store_true', help='Print this help message and exit' ) dry_run_grp = parser.add_argument_group( 'Dry-run options', '(no actual downloading)' ) dry_run_grp = dry_run_grp.add_mutually_exclusive_group() dry_run_grp.add_argument( '-i', '--info', action='store_true', help='Print extracted information' ) dry_run_grp.add_argument( '-u', '--url', action='store_true', help='Print extracted information with URLs' ) dry_run_grp.add_argument( '--json', action='store_true', help='Print extracted URLs in JSON format' ) download_grp = parser.add_argument_group('Download options') download_grp.add_argument( '-n', '--no-merge', action='store_true', default=False, help='Do not merge video parts' ) download_grp.add_argument( '--no-caption', action='store_true', help='Do not download captions (subtitles, lyrics, danmaku, ...)' ) download_grp.add_argument( '--post', '--postfix', dest='postfix', action='store_true', default=False, help='Postfix downloaded files with unique identifiers' ) download_grp.add_argument( '--pre', '--prefix', dest='prefix', metavar='PREFIX', default=None, help='Prefix downloaded files with string' ) download_grp.add_argument( '-f', '--force', action='store_true', default=False, help='Force overwriting existing files' ) download_grp.add_argument( '--skip-existing-file-size-check', action='store_true', default=False, help='Skip existing file without checking file size' ) download_grp.add_argument( '-F', '--format', metavar='STREAM_ID', help='Set video format to STREAM_ID' ) download_grp.add_argument( '-O', '--output-filename', metavar='FILE', help='Set output filename' ) download_grp.add_argument( '-o', '--output-dir', metavar='DIR', default='.', help='Set output directory' ) download_grp.add_argument( '-p', '--player', metavar='PLAYER', help='Stream extracted URL to a PLAYER' ) download_grp.add_argument( '-c', '--cookies', metavar='COOKIES_FILE', help='Load cookies.txt or cookies.sqlite' ) download_grp.add_argument( '-t', '--timeout', metavar='SECONDS', type=int, default=600, help='Set socket timeout' ) download_grp.add_argument( '-d', '--debug', action='store_true', help='Show traceback and other debug info' ) download_grp.add_argument( '-I', '--input-file', metavar='FILE', type=argparse.FileType('r'), help='Read non-playlist URLs from FILE' ) download_grp.add_argument( '-P', '--password', help='Set video visit password to PASSWORD' ) download_grp.add_argument( '-l', '--playlist', action='store_true', help='Prefer to download a playlist' ) playlist_grp = parser.add_argument_group('Playlist optional options') playlist_grp.add_argument( '--first', metavar='FIRST', help='the first number' ) playlist_grp.add_argument( '--last', metavar='LAST', help='the last number' ) playlist_grp.add_argument( '--size', '--page-size', metavar='PAGE_SIZE', help='the page size number' ) download_grp.add_argument( '-a', '--auto-rename', action='store_true', default=False, help='Auto rename same name different files' ) download_grp.add_argument( '-k', '--insecure', action='store_true', default=False, help='ignore ssl errors' ) proxy_grp = parser.add_argument_group('Proxy options') proxy_grp = proxy_grp.add_mutually_exclusive_group() proxy_grp.add_argument( '-x', '--http-proxy', metavar='HOST:PORT', help='Use an HTTP proxy for downloading' ) proxy_grp.add_argument( '-y', '--extractor-proxy', 
metavar='HOST:PORT', help='Use an HTTP proxy for extracting only' ) proxy_grp.add_argument( '--no-proxy', action='store_true', help='Never use a proxy' ) proxy_grp.add_argument( '-s', '--socks-proxy', metavar='HOST:PORT or USERNAME:PASSWORD@HOST:PORT', help='Use an SOCKS5 proxy for downloading' ) download_grp.add_argument('--stream', help=argparse.SUPPRESS) download_grp.add_argument('--itag', help=argparse.SUPPRESS) download_grp.add_argument('-m', '--m3u8', action='store_true', default=False, help = 'download video using an m3u8 url') parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS) args = parser.parse_args() if args.help: print_version() parser.print_help() sys.exit() if args.version: print_version() sys.exit() if args.debug: # Set level of root logger to DEBUG logging.getLogger().setLevel(logging.DEBUG) global force global skip_existing_file_size_check global dry_run global json_output global player global extractor_proxy global output_filename global auto_rename global insecure global m3u8 global postfix global prefix output_filename = args.output_filename extractor_proxy = args.extractor_proxy info_only = args.info if args.force: force = True if args.skip_existing_file_size_check: skip_existing_file_size_check = True if args.auto_rename: auto_rename = True if args.url: dry_run = True if args.json: json_output = True # to fix extractors not use VideoExtractor dry_run = True info_only = False if args.cookies: load_cookies(args.cookies) if args.m3u8: m3u8 = True caption = True stream_id = args.format or args.stream or args.itag if args.no_caption: caption = False if args.player: player = args.player caption = False if args.insecure: # ignore ssl insecure = True postfix = args.postfix prefix = args.prefix if args.no_proxy: set_http_proxy('') else: set_http_proxy(args.http_proxy) if args.socks_proxy: set_socks_proxy(args.socks_proxy) URLs = [] if args.input_file: logging.debug('you are trying to load urls from %s', args.input_file) if args.playlist: log.e( "reading playlist from a file is unsupported " "and won't make your life easier" ) sys.exit(2) URLs.extend(args.input_file.read().splitlines()) args.input_file.close() URLs.extend(args.URL) if not URLs: parser.print_help() sys.exit() socket.setdefaulttimeout(args.timeout) try: extra = {'args': args} if extractor_proxy: extra['extractor_proxy'] = extractor_proxy if stream_id: extra['stream_id'] = stream_id download_main( download, download_playlist, URLs, args.playlist, output_dir=args.output_dir, merge=not args.no_merge, info_only=info_only, json_output=json_output, caption=caption, password=args.password, **extra ) except KeyboardInterrupt: if args.debug: raise else: sys.exit(1) except UnicodeEncodeError: if args.debug: raise log.e( '[error] oops, the current environment does not seem to support ' 'Unicode.' ) log.e('please set it to a UTF-8-aware locale first,') log.e( 'so as to save the video (with some Unicode characters) correctly.' ) log.e('you can do it like this:') log.e(' (Windows) % chcp 65001 ') log.e(' (Linux) $ LC_CTYPE=en_US.UTF-8') sys.exit(1) except Exception: if not args.debug: log.e('[error] oops, something went wrong.') log.e( 'don\'t panic, c\'est la vie. 
please try the following steps:' ) log.e(' (1) Rule out any network problem.') log.e(' (2) Make sure you-get is up-to-date.') log.e(' (3) Check if the issue is already known, on') log.e(' https://github.com/soimort/you-get/wiki/Known-Bugs') log.e(' https://github.com/soimort/you-get/issues') log.e(' (4) Run the command with \'--debug\' option,') log.e(' and report this issue with the full output.') else: print_version() log.i(args) raise sys.exit(1) def google_search(url): keywords = r1(r'https?://(.*)', url) url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords) page = get_content(url, headers=fake_headers) videos = re.findall( r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page ) print('Best matched result:') return(videos[0]) def url_to_module(url): try: video_host = r1(r'https?://([^/]+)/', url) video_url = r1(r'https?://[^/]+(.*)', url) assert video_host and video_url except AssertionError: url = google_search(url) video_host = r1(r'https?://([^/]+)/', url) video_url = r1(r'https?://[^/]+(.*)', url) if video_host.endswith('.com.cn') or video_host.endswith('.ac.cn'): video_host = video_host[:-3] domain = r1(r'(\.[^.]+\.[^.]+)$', video_host) or video_host assert domain, 'unsupported url: ' + url # all non-ASCII code points must be quoted (percent-encoded UTF-8) url = ''.join([ch if ord(ch) in range(128) else parse.quote(ch) for ch in url]) video_host = r1(r'https?://([^/]+)/', url) video_url = r1(r'https?://[^/]+(.*)', url) k = r1(r'([^.]+)', domain) if k in SITES: return ( import_module('.'.join(['you_get', 'extractors', SITES[k]])), url ) else: try: try: location = get_location(url) # t.co isn't happy with fake_headers except: location = get_location(url, headers=fake_headers) except: location = get_location(url, headers=fake_headers, get_method='GET') if location and location != url and not location.startswith('/'): return url_to_module(location) else: return import_module('you_get.extractors.universal'), url def any_download(url, **kwargs): m, url = url_to_module(url) m.download(url, **kwargs) def any_download_playlist(url, **kwargs): m, url = url_to_module(url) m.download_playlist(url, **kwargs) def main(**kwargs): script_main(any_download, any_download_playlist, **kwargs) ================================================ FILE: src/you_get/extractor.py ================================================ #!/usr/bin/env python from .common import match1, maybe_print, download_urls, get_filename, parse_host, set_proxy, unset_proxy, get_content, dry_run, player from .common import print_more_compatible as print from .util import log from . 
import json_output import os import sys class Extractor(): def __init__(self, *args): self.url = None self.title = None self.vid = None self.streams = {} self.streams_sorted = [] if args: self.url = args[0] class VideoExtractor(): def __init__(self, *args): self.url = None self.title = None self.vid = None self.m3u8_url = None self.streams = {} self.streams_sorted = [] self.audiolang = None self.password_protected = False self.dash_streams = {} self.caption_tracks = {} self.out = False self.ua = None self.referer = None self.danmaku = None self.lyrics = None if args: self.url = args[0] def download_by_url(self, url, **kwargs): self.url = url self.vid = None if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: set_proxy(parse_host(kwargs['extractor_proxy'])) self.prepare(**kwargs) if self.out: return if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: unset_proxy() try: self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] except: self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams] self.extract(**kwargs) self.download(**kwargs) def download_by_vid(self, vid, **kwargs): self.url = None self.vid = vid if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: set_proxy(parse_host(kwargs['extractor_proxy'])) self.prepare(**kwargs) if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: unset_proxy() try: self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] except: self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams] self.extract(**kwargs) self.download(**kwargs) def prepare(self, **kwargs): pass #raise NotImplementedError() def extract(self, **kwargs): pass #raise NotImplementedError() def p_stream(self, stream_id): if stream_id in self.streams: stream = self.streams[stream_id] else: stream = self.dash_streams[stream_id] if 'itag' in stream: print(" - itag: %s" % log.sprint(stream_id, log.NEGATIVE)) else: print(" - format: %s" % log.sprint(stream_id, log.NEGATIVE)) if 'container' in stream: print(" container: %s" % stream['container']) if 'video_profile' in stream: maybe_print(" video-profile: %s" % stream['video_profile']) if 'quality' in stream: print(" quality: %s" % stream['quality']) if 'size' in stream and 'container' in stream and stream['container'].lower() != 'm3u8': if stream['size'] != float('inf') and stream['size'] != 0: print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size'])) if 'm3u8_url' in stream: print(" m3u8_url: {}".format(stream['m3u8_url'])) if 'itag' in stream: print(" # download-with: %s" % log.sprint("you-get --itag=%s [URL]" % stream_id, log.UNDERLINE)) else: print(" # download-with: %s" % log.sprint("you-get --format=%s [URL]" % stream_id, log.UNDERLINE)) print() def p_i(self, stream_id): if stream_id in self.streams: stream = self.streams[stream_id] else: stream = self.dash_streams[stream_id] maybe_print(" - title: %s" % self.title) print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size'])) print(" url: %s" % self.url) 
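        # (Editor's annotation; not part of the original source.) p_i() is the
        # compact single-stream printer used when an 'index' kwarg is passed
        # down (e.g. playlist listings); p() below renders the full menu,
        # falling back to the best-quality stream when stream_id is None and
        # listing every available stream when called as p([]).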
print() sys.stdout.flush() def p(self, stream_id=None): maybe_print("site: %s" % self.__class__.name) maybe_print("title: %s" % self.title) if stream_id: # Print the stream print("stream:") self.p_stream(stream_id) elif stream_id is None: # Print stream with best quality print("stream: # Best quality") stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] self.p_stream(stream_id) elif stream_id == []: print("streams: # Available quality and codecs") # Print DASH streams if self.dash_streams: print(" [ DASH ] %s" % ('_' * 36)) itags = sorted(self.dash_streams, key=lambda i: -self.dash_streams[i]['size']) for stream in itags: self.p_stream(stream) # Print all other available streams if self.streams_sorted: print(" [ DEFAULT ] %s" % ('_' * 33)) for stream in self.streams_sorted: self.p_stream(stream['id'] if 'id' in stream else stream['itag']) if self.audiolang: print("audio-languages:") for i in self.audiolang: print(" - lang: {}".format(i['lang'])) print(" download-url: {}\n".format(i['url'])) sys.stdout.flush() def p_playlist(self, stream_id=None): maybe_print("site: %s" % self.__class__.name) print("playlist: %s" % self.title) print("videos:") def download(self, **kwargs): if 'json_output' in kwargs and kwargs['json_output']: json_output.output(self) elif 'info_only' in kwargs and kwargs['info_only']: if 'stream_id' in kwargs and kwargs['stream_id']: # Display the stream stream_id = kwargs['stream_id'] if 'index' not in kwargs: self.p(stream_id) else: self.p_i(stream_id) else: # Display all available streams if 'index' not in kwargs: self.p([]) else: stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] self.p_i(stream_id) else: if 'stream_id' in kwargs and kwargs['stream_id']: # Download the stream stream_id = kwargs['stream_id'] else: # Download stream with the best quality from .processor.ffmpeg import has_ffmpeg_installed if has_ffmpeg_installed() and player is None and self.dash_streams or not self.streams_sorted: #stream_id = list(self.dash_streams)[-1] itags = sorted(self.dash_streams, key=lambda i: -self.dash_streams[i]['size']) stream_id = itags[0] else: stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] if 'index' not in kwargs: self.p(stream_id) else: self.p_i(stream_id) if stream_id in self.streams: urls = self.streams[stream_id]['src'] ext = self.streams[stream_id]['container'] total_size = self.streams[stream_id]['size'] else: urls = self.dash_streams[stream_id]['src'] ext = self.dash_streams[stream_id]['container'] total_size = self.dash_streams[stream_id]['size'] if ext == 'm3u8' or ext == 'm4a': ext = 'mp4' if not urls: log.wtf('[Failed] Cannot extract video source.') # For legacy main() headers = {} if self.ua is not None: headers['User-Agent'] = self.ua if self.referer is not None: headers['Referer'] = self.referer download_urls(urls, self.title, ext, total_size, headers=headers, output_dir=kwargs['output_dir'], merge=kwargs['merge'], av=stream_id in self.dash_streams, vid=self.vid) if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions or danmaku.') return for lang in self.caption_tracks: filename = '%s.%s.srt' % (get_filename(self.title), lang) print('Saving %s ... 
' % filename, end="", flush=True) srt = self.caption_tracks[lang] with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf-8') as x: x.write(srt) print('Done.') if self.danmaku is not None and not dry_run: filename = '{}.cmt.xml'.format(get_filename(self.title)) print('Downloading {} ...\n'.format(filename)) with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp: fp.write(self.danmaku) if self.lyrics is not None and not dry_run: filename = '{}.lrc'.format(get_filename(self.title)) print('Downloading {} ...\n'.format(filename)) with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp: fp.write(self.lyrics) # For main_dev() #download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size']) keep_obj = kwargs.get('keep_obj', False) if not keep_obj: self.__init__() ================================================ FILE: src/you_get/extractors/acfun.py ================================================ #!/usr/bin/env python from ..common import * from ..extractor import VideoExtractor class AcFun(VideoExtractor): name = "AcFun" stream_types = [ {'id': '2160P', 'qualityType': '2160p'}, {'id': '1080P60', 'qualityType': '1080p60'}, {'id': '720P60', 'qualityType': '720p60'}, {'id': '1080P+', 'qualityType': '1080p+'}, {'id': '1080P', 'qualityType': '1080p'}, {'id': '720P', 'qualityType': '720p'}, {'id': '540P', 'qualityType': '540p'}, {'id': '360P', 'qualityType': '360p'} ] def prepare(self, **kwargs): assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', self.url) if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', self.url): html = get_content(self.url, headers=fake_headers) json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});") json_data = json.loads(json_text) vid = json_data.get('currentVideoInfo').get('id') up = json_data.get('user').get('name') self.title = json_data.get('title') video_list = json_data.get('videoList') if len(video_list) > 1: self.title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] currentVideoInfo = json_data.get('currentVideoInfo') elif re.match(r"https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", self.url): html = get_content(self.url, headers=fake_headers) tag_script = match1(html, r'') json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] json_data = json.loads(json_text) self.title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] vid = str(json_data['videoId']) up = "acfun" currentVideoInfo = json_data.get('currentVideoInfo') else: raise NotImplementedError() if 'ksPlayJson' in currentVideoInfo: durationMillis = currentVideoInfo['durationMillis'] ksPlayJson = ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) representation = ksPlayJson.get('adaptationSet')[0].get('representation') stream_list = representation for stream in stream_list: m3u8_url = stream["url"] size = durationMillis * stream["avgBitrate"] / 8 # size = float('inf') container = 'mp4' stream_id = stream["qualityLabel"] quality = stream["qualityType"] stream_data = dict(src=m3u8_url, size=size, container=container, quality=quality) self.streams[stream_id] = stream_data assert self.title and m3u8_url self.title = unescape_html(self.title) self.title = escape_file_path(self.title) p_title = r1('active">([^<]+)', html) self.title = '%s (%s)' % (self.title, up) if p_title: self.title = '%s - %s' % (self.title, p_title) def download(self, **kwargs): if 'json_output' in kwargs and 
kwargs['json_output']: json_output.output(self) elif 'info_only' in kwargs and kwargs['info_only']: if 'stream_id' in kwargs and kwargs['stream_id']: # Display the stream stream_id = kwargs['stream_id'] if 'index' not in kwargs: self.p(stream_id) else: self.p_i(stream_id) else: # Display all available streams if 'index' not in kwargs: self.p([]) else: stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] self.p_i(stream_id) else: if 'stream_id' in kwargs and kwargs['stream_id']: # Download the stream stream_id = kwargs['stream_id'] else: stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] if 'index' not in kwargs: self.p(stream_id) else: self.p_i(stream_id) if stream_id in self.streams: url = self.streams[stream_id]['src'] ext = self.streams[stream_id]['container'] total_size = self.streams[stream_id]['size'] if ext == 'm3u8' or ext == 'm4a': ext = 'mp4' if not url: log.wtf('[Failed] Cannot extract video source.') # For legacy main() headers = {} if self.ua is not None: headers['User-Agent'] = self.ua if self.referer is not None: headers['Referer'] = self.referer download_url_ffmpeg(url, self.title, ext, output_dir=kwargs['output_dir'], merge=kwargs['merge']) if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions or danmaku.') return for lang in self.caption_tracks: filename = '%s.%s.srt' % (get_filename(self.title), lang) print('Saving %s ... ' % filename, end="", flush=True) srt = self.caption_tracks[lang] with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf-8') as x: x.write(srt) print('Done.') if self.danmaku is not None and not dry_run: filename = '{}.cmt.xml'.format(get_filename(self.title)) print('Downloading {} ...\n'.format(filename)) with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp: fp.write(self.danmaku) if self.lyrics is not None and not dry_run: filename = '{}.lrc'.format(get_filename(self.title)) print('Downloading {} ...\n'.format(filename)) with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp: fp.write(self.lyrics) # For main_dev() #download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size']) keep_obj = kwargs.get('keep_obj', False) if not keep_obj: self.__init__() def acfun_download(self, url, output_dir='.', merge=True, info_only=False, **kwargs): assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url) def getM3u8UrlFromCurrentVideoInfo(currentVideoInfo): if 'playInfos' in currentVideoInfo: return currentVideoInfo['playInfos'][0]['playUrls'][0] elif 'ksPlayJson' in currentVideoInfo: ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) representation = ksPlayJson.get('adaptationSet')[0].get('representation') reps = [] for one in representation: reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) ) return max(reps)[1] if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url): html = get_content(url, headers=fake_headers) json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});") json_data = json.loads(json_text) vid = json_data.get('currentVideoInfo').get('id') up = json_data.get('user').get('name') title = json_data.get('title') video_list = json_data.get('videoList') if len(video_list) > 1: title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] currentVideoInfo = json_data.get('currentVideoInfo') m3u8_url = 
getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) elif re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)', url): html = get_content(url, headers=fake_headers) tag_script = match1(html, r'') json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] json_data = json.loads(json_text) title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] vid = str(json_data['videoId']) up = "acfun" currentVideoInfo = json_data.get('currentVideoInfo') m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) else: raise NotImplementedError() assert title and m3u8_url title = unescape_html(title) title = escape_file_path(title) p_title = r1('active">([^<]+)', html) title = '%s (%s)' % (title, up) if p_title: title = '%s - %s' % (title, p_title) print_info(site_info, title, 'm3u8', float('inf')) if not info_only: download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge) site = AcFun() site_info = "AcFun.cn" download = site.download_by_url download_playlist = playlist_not_supported('acfun') ================================================ FILE: src/you_get/extractors/alive.py ================================================ #!/usr/bin/env python __all__ = ['alive_download'] from ..common import * def alive_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): html = get_html(url) title = r1(r'(.+?)<\/h2>', html) artist = r1(r'', html) output_dir = '%s/%s - %s' % (output_dir, artist, album_name) ids = json.loads(r1(r'', html).replace('"', '').replace(';', '"'))['ids'] track_nr = 1 for id in ids: song_data = baidu_get_song_data(id) song_url = baidu_get_song_url(song_data) song_title = baidu_get_song_title(song_data) song_lrc = baidu_get_song_lyric(song_data) file_name = '%02d.%s' % (track_nr, song_title) type, ext, size = url_info(song_url, faker=True) print_info(site_info, song_title, type, size) if not info_only: download_urls([song_url], file_name, ext, size, output_dir, merge=merge, faker=True) if song_lrc: type, ext, size = url_info(song_lrc, faker=True) print_info(site_info, song_title, type, size) if not info_only: download_urls([song_lrc], file_name, ext, size, output_dir, faker=True) track_nr += 1 def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=False, **kwargs): if re.match(r'https?://pan.baidu.com', url): real_url, title, ext, size = baidu_pan_download(url) print_info('BaiduPan', title, ext, size) if not info_only: print('Hold on...') time.sleep(5) download_urls([real_url], title, ext, size, output_dir, url, merge=merge, faker=True) elif re.match(r'https?://music.baidu.com/album/\d+', url): id = r1(r'https?://music.baidu.com/album/(\d+)', url) baidu_download_album(id, output_dir, merge, info_only) elif re.match(r'https?://music.baidu.com/song/\d+', url): id = r1(r'https?://music.baidu.com/song/(\d+)', url) baidu_download_song(id, output_dir, merge, info_only) elif re.match('https?://tieba.baidu.com/', url): try: # embedded videos embed_download(url, output_dir, merge=merge, info_only=info_only, **kwargs) except: # images html = get_html(url) title = r1(r'title:"([^"]+)"', html) vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \ re.findall(r'vhsrc="([^"]+)"', html) if len(vhsrc) > 0: ext = 'mp4' size = url_size(vhsrc[0]) print_info(site_info, title, ext, size) if not info_only: download_urls(vhsrc, title, ext, size, output_dir=output_dir, merge=False) items = re.findall( r'//tiebapic.baidu.com/forum/w[^"]+/([^/"]+)', html) urls = 
['http://tiebapic.baidu.com/forum/pic/item/' + i for i in set(items)] # handle albums kw = r1(r'kw=([^&]+)', html) or r1(r"kw:'([^']+)'", html) tid = r1(r'tid=(\d+)', html) or r1(r"tid:'([^']+)'", html) album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s&pe=%s' % (kw, tid, 1000) album_info = json.loads(get_content(album_url)) for i in album_info['data']['pic_list']: urls.append( 'http://tiebapic.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg') ext = 'jpg' size = float('Inf') print_info(site_info, title, ext, size) if not info_only: download_urls(urls, title, ext, size, output_dir=output_dir, merge=False) def baidu_pan_download(url): errno_patt = r'errno":([^"]+),' refer_url = "" fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', 'Host': 'pan.baidu.com', 'Origin': 'http://pan.baidu.com', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36', 'Referer': refer_url } if cookies: print('Use user specified cookies') else: print('Generating cookies...') fake_headers['Cookie'] = baidu_pan_gen_cookies(url) refer_url = "http://pan.baidu.com" html = get_content(url, fake_headers, decoded=True) isprotected = False sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( html) if sign is None: if re.findall(r'\baccess-code\b', html): isprotected = True sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk = baidu_pan_protected_share( url) # raise NotImplementedError("Password required!") if isprotected != True: raise AssertionError("Share not found or canceled: %s" % url) if bdstoken is None: bdstoken = "" if isprotected != True: sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( html) request_url = "http://pan.baidu.com/api/sharedownload?sign=%s×tamp=%s&bdstoken=%s&channel=chunlei&clienttype=0&web=1&app_id=%s" % ( sign, timestamp, bdstoken, appid) refer_url = url post_data = { 'encrypt': 0, 'product': 'share', 'uk': uk, 'primaryid': primary_id, 'fid_list': '[' + fs_id + ']' } if isprotected == True: post_data['sekey'] = psk response_content = post_content(request_url, fake_headers, post_data, True) errno = match1(response_content, errno_patt) if errno != "0": raise AssertionError( "Server refused to provide download link! 
(Errno:%s)" % errno) real_url = r1(r'dlink":"([^"]+)"', response_content).replace('\\/', '/') title = r1(r'server_filename":"([^"]+)"', response_content) assert real_url type, ext, size = url_info(real_url, faker=True) title_wrapped = json.loads('{"wrapper":"%s"}' % title) title = title_wrapped['wrapper'] logging.debug(real_url) return real_url, title, ext, size def baidu_pan_parse(html): sign_patt = r'sign":"([^"]+)"' timestamp_patt = r'timestamp":([^"]+),' appid_patt = r'app_id":"([^"]+)"' bdstoken_patt = r'bdstoken":"([^"]+)"' fs_id_patt = r'fs_id":([^"]+),' uk_patt = r'uk":([^"]+),' errno_patt = r'errno":([^"]+),' primary_id_patt = r'shareid":([^"]+),' sign = match1(html, sign_patt) timestamp = match1(html, timestamp_patt) appid = match1(html, appid_patt) bdstoken = match1(html, bdstoken_patt) fs_id = match1(html, fs_id_patt) uk = match1(html, uk_patt) primary_id = match1(html, primary_id_patt) return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk def baidu_pan_gen_cookies(url, post_data=None): from http import cookiejar cookiejar = cookiejar.CookieJar() opener = request.build_opener(request.HTTPCookieProcessor(cookiejar)) resp = opener.open('http://pan.baidu.com') if post_data != None: resp = opener.open(url, bytes(parse.urlencode(post_data), 'utf-8')) return cookjar2hdr(cookiejar) def baidu_pan_protected_share(url): print('This share is protected by password!') inpwd = input('Please provide unlock password: ') inpwd = inpwd.replace(' ', '').replace('\t', '') print('Please wait...') post_pwd = { 'pwd': inpwd, 'vcode': None, 'vstr': None } from http import cookiejar import time cookiejar = cookiejar.CookieJar() opener = request.build_opener(request.HTTPCookieProcessor(cookiejar)) resp = opener.open('http://pan.baidu.com') resp = opener.open(url) init_url = resp.geturl() verify_url = 'http://pan.baidu.com/share/verify?%s&t=%s&channel=chunlei&clienttype=0&web=1' % ( init_url.split('?', 1)[1], int(time.time())) refer_url = init_url fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', 'Host': 'pan.baidu.com', 'Origin': 'http://pan.baidu.com', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36', 'Referer': refer_url } opener.addheaders = dict2triplet(fake_headers) pwd_resp = opener.open(verify_url, bytes( parse.urlencode(post_pwd), 'utf-8')) pwd_resp_str = ungzip(pwd_resp.read()).decode('utf-8') pwd_res = json.loads(pwd_resp_str) if pwd_res['errno'] != 0: raise AssertionError( 'Server returned an error: %s (Incorrect password?)' % pwd_res['errno']) pg_resp = opener.open('http://pan.baidu.com/share/link?%s' % init_url.split('?', 1)[1]) content = ungzip(pg_resp.read()).decode('utf-8') sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( content) psk = query_cookiejar(cookiejar, 'BDCLND') psk = parse.unquote(psk) fake_headers['Cookie'] = cookjar2hdr(cookiejar) return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk def cookjar2hdr(cookiejar): cookie_str = '' for i in cookiejar: cookie_str = cookie_str + i.name + '=' + i.value + ';' return cookie_str[:-1] def query_cookiejar(cookiejar, name): for i in cookiejar: if i.name == name: return i.value def dict2triplet(dictin): out_triplet = [] for i in dictin: out_triplet.append((i, dictin[i])) return out_triplet site_info = "Baidu.com" download = baidu_download 
download_playlist = playlist_not_supported("baidu") ================================================ FILE: src/you_get/extractors/bandcamp.py ================================================ #!/usr/bin/env python __all__ = ['bandcamp_download'] from ..common import * def bandcamp_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) trackinfo = json.loads(r1(r'(\[{"(video_poster_url|video_caption)".*}\]),', html)) for track in trackinfo: track_num = track['track_num'] title = '%s. %s' % (track_num, track['title']) file_url = 'http:' + track['file']['mp3-128'] mime, ext, size = url_info(file_url) print_info(site_info, title, mime, size) if not info_only: download_urls([file_url], title, ext, size, output_dir, merge=merge) site_info = "Bandcamp.com" download = bandcamp_download download_playlist = bandcamp_download ================================================ FILE: src/you_get/extractors/baomihua.py ================================================ #!/usr/bin/env python __all__ = ['baomihua_download', 'baomihua_download_by_id'] from ..common import * import urllib def baomihua_headers(referer=None, cookie=None): # a reasonable UA ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36' headers = {'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'User-Agent': ua} if referer is not None: headers.update({'Referer': referer}) if cookie is not None: headers.update({'Cookie': cookie}) return headers def baomihua_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html('http://play.baomihua.com/getvideourl.aspx?flvid=%s&devicetype=phone_app' % id) host = r1(r'host=([^&]*)', html) assert host type = r1(r'videofiletype=([^&]*)', html) assert type vid = r1(r'&stream_name=([^&]*)', html) assert vid dir_str = r1(r'&dir=([^&]*)', html).strip() url = "http://%s/%s/%s.%s" % (host, dir_str, vid, type) _, ext, size = url_info(url, headers=baomihua_headers()) print_info(site_info, title, type, size) if not info_only: download_urls([url], title, ext, size, output_dir, merge = merge, headers=baomihua_headers()) def baomihua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) title = r1(r'(.*)', html) assert title id = r1(r'flvid\s*=\s*(\d+)', html) assert id baomihua_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only) site_info = "baomihua.com" download = baomihua_download download_playlist = playlist_not_supported('baomihua') ================================================ FILE: src/you_get/extractors/bigthink.py ================================================ #!/usr/bin/env python from ..common import * from ..extractor import VideoExtractor import json class Bigthink(VideoExtractor): name = "Bigthink" stream_types = [ #this is just a sample. Will make it in prepare() # {'id': '1080'}, # {'id': '720'}, # {'id': '360'}, # {'id': '288'}, # {'id': '190'}, # {'id': '180'}, ] @staticmethod def get_streams_by_id(account_number, video_id): """ int, int->list Get the height of the videos. Since brightcove is using 3 kinds of links: rtmp, http and https, we will be using the HTTPS one to make it secure. If somehow akamaihd.net is blocked by the Great Fucking Wall, change the "startswith https" to http. 
""" endpoint = 'https://edge.api.brightcove.com/playback/v1/accounts/{account_number}/videos/{video_id}'.format(account_number = account_number, video_id = video_id) fake_header_id = fake_headers #is this somehow related to the time? Magic.... fake_header_id['Accept'] ='application/json;pk=BCpkADawqM1cc6wmJQC2tvoXZt4mrB7bFfi6zGt9QnOzprPZcGLE9OMGJwspQwKfuFYuCjAAJ53JdjI8zGFx1ll4rxhYJ255AXH1BQ10rnm34weknpfG-sippyQ' html = get_content(endpoint, headers= fake_header_id) html_json = json.loads(html) link_list = [] for i in html_json['sources']: if 'src' in i: #to avoid KeyError if i['src'].startswith('https'): link_list.append((str(i['height']), i['src'])) return link_list def prepare(self, **kwargs): html = get_content(self.url) self.title = match1(html, r' bangumi/play/ep # redirect: bangumi.bilibili.com/anime -> bangumi/play/ep elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/play/ss(\d+)', self.url) or \ re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)/play', self.url): initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME initial_state = json.loads(initial_state_text) ep_id = initial_state['epList'][0]['id'] self.url = 'https://www.bilibili.com/bangumi/play/ep%s' % ep_id html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url)) # redirect: s elif re.match(r'https?://(www\.)?bilibili\.com/s/(.+)', self.url): self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') html_content = get_content(self.url, headers=self.bilibili_headers()) # redirect: festival elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url): self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)') html_content = get_content(self.url, headers=self.bilibili_headers()) # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): sort = 'audio' elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/play/ep(\d+)', self.url): sort = 'bangumi' elif match1(html_content, r'))', html) json_data = json.loads(coub_data) return json_data def get_file_path(merge, output_dir, title, url): mime, ext, size = url_info(url) file_name = get_output_filename([], title, ext, output_dir, merge) file_path = os.path.join(output_dir, file_name) return file_name, file_path def get_loop_file_path(title, output_dir): return os.path.join(output_dir, get_output_filename([], title, "txt", None, False)) def cleanup_files(files): for file in files: os.remove(file) site_info = "coub.com" download = coub_download download_playlist = playlist_not_supported('coub') ================================================ FILE: src/you_get/extractors/dailymotion.py ================================================ #!/usr/bin/env python __all__ = ['dailymotion_download'] from ..common import * import urllib.parse def rebuilt_url(url): path = urllib.parse.urlparse(url).path aid = path.split('/')[-1].split('_')[0] return 'http://www.dailymotion.com/embed/video/{}?autoplay=1'.format(aid) def dailymotion_download(url, output_dir='.', merge=True, info_only=False, **kwargs): """Downloads Dailymotion videos by URL. 
""" html = get_content(rebuilt_url(url)) info = json.loads(match1(html, r'qualities":({.+?}),"')) title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \ match1(html, r'"title"\s*:\s*"([^"]+)"') title = unicodize(title) for quality in ['1080','720','480','380','240','144','auto']: try: real_url = info[quality][1]["url"] if real_url: break except KeyError: pass mime, ext, size = url_info(real_url) print_info(site_info, title, mime, size) if not info_only: download_urls([real_url], title, ext, size, output_dir=output_dir, merge=merge) site_info = "Dailymotion.com" download = dailymotion_download download_playlist = playlist_not_supported('dailymotion') ================================================ FILE: src/you_get/extractors/douban.py ================================================ #!/usr/bin/env python __all__ = ['douban_download'] import urllib.request, urllib.parse from ..common import * def douban_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): html = get_html(url) if re.match(r'https?://movie', url): title = match1(html, 'name="description" content="([^"]+)') tid = match1(url, r'trailer/(\d+)') real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid type, ext, size = url_info(real_url) print_info(site_info, title, type, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge = merge) elif 'subject' in url: titles = re.findall(r'data-title="([^"]*)">', html) song_id = re.findall(r'
  • (.+?)' hit = re.search(patt, url) if hit is None: log.wtf('Unknown url pattern') vid = hit.group(1) page = get_content(url, headers=headers) hit = re.search(title_patt, page) if hit is None: title = vid else: title = hit.group(1) meta = json.loads(get_content(ep + vid)) if meta['error'] != 0: log.wtf('Error from API server') m3u8_url = meta['data']['video_url'] print_info('Douyu Video', title, 'm3u8', 0, m3u8_url=m3u8_url) if not info_only: urls = general_m3u8_extractor(m3u8_url) download_urls(urls, title, 'ts', 0, output_dir=output_dir, merge=merge, **kwargs) def douyutv_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if 'v.douyu.com/show/' in url: douyutv_video_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) return url = re.sub(r'.*douyu.com','https://m.douyu.com/room', url) html = get_content(url, headers) room_id_patt = r'"rid"\s*:\s*(\d+),' room_id = match1(html, room_id_patt) if room_id == "0": room_id = url[url.rfind('/') + 1:] api_url = "http://www.douyutv.com/api/v1/" args = "room/%s?aid=wp&client_sys=wp&time=%d" % (room_id, int(time.time())) auth_md5 = (args + "zNzMV1y4EMxOHS6I5WKm").encode("utf-8") auth_str = hashlib.md5(auth_md5).hexdigest() json_request_url = "%s%s&auth=%s" % (api_url, args, auth_str) content = get_content(json_request_url, headers) json_content = json.loads(content) data = json_content['data'] server_status = json_content.get('error', 0) if server_status != 0: raise ValueError("Server returned error: %s" % server_status) title = data.get('room_name') show_status = data.get('show_status') if show_status != "1": raise ValueError("The live stream is not online! (show_status: %s)" % show_status) real_url = data.get('rtmp_url') + '/' + data.get('rtmp_live') print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', params={}, output_dir=output_dir, merge=merge) site_info = "douyu.com" download = douyutv_download download_playlist = playlist_not_supported('douyu') ================================================ FILE: src/you_get/extractors/ehow.py ================================================ #!/usr/bin/env python __all__ = ['ehow_download'] from ..common import * def ehow_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): assert re.search(r'http://www.ehow.com/video_', url), "The URL you entered is not supported" html = get_html(url) contentid = r1(r'', html) vid = r1(r'"demand_ehow_videoid":"([^"]+)"', html) assert vid xml = get_html('http://www.ehow.com/services/video/series.xml?demand_ehow_videoid=%s' % vid) from xml.dom.minidom import parseString doc = parseString(xml) tab = doc.getElementsByTagName('related')[0].firstChild for video in tab.childNodes: if re.search(contentid, video.attributes['link'].value): url = video.attributes['flv'].value break title = video.attributes['title'].value assert title type, ext, size = url_info(url) print_info(site_info, title, type, size) if not info_only: download_urls([url], title, ext, size, output_dir, merge = merge) site_info = "ehow.com" download = ehow_download download_playlist = playlist_not_supported('ehow') ================================================ FILE: src/you_get/extractors/embed.py ================================================ __all__ = ['embed_download'] import urllib.parse from ..common import * from .bilibili import bilibili_download from .dailymotion import dailymotion_download from .iqiyi import iqiyi_download_by_vid from .le import letvcloud_download_by_vu from
.netease import netease_download from .qq import qq_download_by_vid from .sina import sina_download_by_vid from .tudou import tudou_download_by_id from .vimeo import vimeo_download_by_id from .youku import youku_download_by_vid from . import iqiyi from . import bokecc """ refer to http://open.youku.com/tools """ youku_embed_patterns = [ r'youku\.com/v_show/id_([a-zA-Z0-9=]+)', r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf', r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)', r'player\.youku\.com/embed/([a-zA-Z0-9=]+)', r'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\'' ] """ http://www.tudou.com/programs/view/html5embed.action?type=0&code=3LS_URGvl54&lcode=&resourceId=0_06_05_99 """ tudou_embed_patterns = [ r'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&', r'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf' ] """ refer to http://open.tudou.com/wiki/video/info """ tudou_api_patterns = [ ] iqiyi_embed_patterns = [ r'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.swf[^"]+tvId=(\d+)' ] netease_embed_patterns = [ r'(http://\w+\.163\.com/movie/[^\'"]+)' ] vimeo_embed_patters = [ r'player\.vimeo\.com/video/(\d+)' ] dailymotion_embed_patterns = [ r'www\.dailymotion\.com/embed/video/(\w+)' ] """ check the share button on http://www.bilibili.com/video/av5079467/ """ bilibili_embed_patterns = [ r'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ] ''' http://open.iqiyi.com/lib/player.html ''' iqiyi_patterns = [r'(?:\"|\')(https?://dispatcher\.video\.qiyi\.com\/disp\/shareplayer\.swf\?.+?)(?:\"|\')', r'(?:\"|\')(https?://open\.iqiyi\.com\/developer\/player_js\/coopPlayerIndex\.html\?.+?)(?:\"|\')'] bokecc_patterns = [r'bokecc\.com/flash/pocle/player\.swf\?siteid=(.+?)&vid=(.{32})'] recur_limit = 3 def embed_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): content = get_content(url, headers=fake_headers) found = False title = match1(content, '([^<>]+)') vids = matchall(content, youku_embed_patterns) for vid in set(vids): found = True youku_download_by_vid(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) vids = matchall(content, tudou_embed_patterns) for vid in set(vids): found = True tudou_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) vids = matchall(content, iqiyi_embed_patterns) for vid in vids: found = True iqiyi_download_by_vid((vid[1], vid[0]), title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) urls = matchall(content, netease_embed_patterns) for url in urls: found = True netease_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) urls = matchall(content, vimeo_embed_patters) for url in urls: found = True vimeo_download_by_id(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only, referer=url, **kwargs) urls = matchall(content, dailymotion_embed_patterns) for url in urls: found = True dailymotion_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) aids = matchall(content, bilibili_embed_patterns) for aid in aids: found = True url = 'http://www.bilibili.com/video/av%s/' % aid bilibili_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) iqiyi_urls = matchall(content, iqiyi_patterns) for url in iqiyi_urls: found = True iqiyi.download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) bokecc_metas = matchall(content, bokecc_patterns) for meta in bokecc_metas: found = True 
bokecc.bokecc_download_by_id(meta[1], output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) if found: return True # Try harder, check all iframes if 'recur_lv' not in kwargs or kwargs['recur_lv'] < recur_limit: r = kwargs.get('recur_lv') if r is None: r = 1 else: r += 1 iframes = matchall(content, [r'(.+)', html) if title is None: title = url sd_urls = list(set([ unicodize(str.replace(i, '\\/', '/')) for i in re.findall(r'sd_src_no_ratelimit:"([^"]*)"', html) ])) hd_urls = list(set([ unicodize(str.replace(i, '\\/', '/')) for i in re.findall(r'hd_src_no_ratelimit:"([^"]*)"', html) ])) urls = hd_urls if hd_urls else sd_urls type, ext, size = url_info(urls[0], True) size = urls_size(urls) print_info(site_info, title, type, size) if not info_only: download_urls(urls, title, ext, size, output_dir, merge=False) site_info = "Facebook.com" download = facebook_download download_playlist = playlist_not_supported('facebook') ================================================ FILE: src/you_get/extractors/fc2video.py ================================================ #!/usr/bin/env python __all__ = ['fc2video_download'] from ..common import * from hashlib import md5 from urllib.parse import urlparse #---------------------------------------------------------------------- def makeMimi(upid): """From http://cdn37.atwikiimg.com/sitescript/pub/dksitescript/FC2.site.js Also com.hps.util.fc2.FC2EncrptUtil.makeMimiLocal L110""" strSeed = "gGddgPfeaf_gzyr" prehash = upid + "_" + strSeed return md5(prehash.encode('utf-8')).hexdigest() #---------------------------------------------------------------------- def fc2video_download_by_upid(upid, output_dir = '.', merge = True, info_only = False, **kwargs): """""" fake_headers = { 'DNT': '1', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'en-CA,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.58 Safari/537.36', 'Accept': '*/*', 'X-Requested-With': 'ShockwaveFlash/19.0.0.245', 'Connection': 'keep-alive', } api_base = 'http://video.fc2.com/ginfo.php?upid={upid}&mimi={mimi}'.format(upid = upid, mimi = makeMimi(upid)) html = get_content(api_base, headers=fake_headers) video_url = match1(html, r'filepath=(.+)&sec') video_url = video_url.replace('&mid', '?mid') title = match1(html, r'&title=([^&]+)') type, ext, size = url_info(video_url, headers=fake_headers) print_info(site_info, title, type, size) if not info_only: download_urls([video_url], title, ext, size, output_dir, merge=merge, headers = fake_headers) #---------------------------------------------------------------------- def fc2video_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): """wrapper""" #'http://video.fc2.com/en/content/20151021bTVKnbEw' #'http://xiaojiadianvideo.asia/content/20151021bTVKnbEw' #'http://video.fc2.com/ja/content/20151021bTVKnbEw' #'http://video.fc2.com/tw/content/20151021bTVKnbEw' hostname = urlparse(url).hostname if not ('fc2.com' in hostname or 'xiaojiadianvideo.asia' in hostname): return False upid = match1(url, r'.+/content/(\w+)') fc2video_download_by_upid(upid, output_dir, merge, info_only) site_info = "FC2Video" download = fc2video_download download_playlist = playlist_not_supported('fc2video') ================================================ FILE: src/you_get/extractors/flickr.py ================================================ #!/usr/bin/env python __all__ = ['flickr_download_main'] from ..common import * import json 
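# How the pieces below fit together: an input URL is matched against the
# entries of url_patterns; the matching entry names the remote flickr.* API
# method and the id parameter it needs. fetch_photo_url_list_impl then pages
# through the JSON results, and get_url_of_largest picks the best available
# size for each photo (or the original video source for videos).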
pattern_url_photoset = r'https?://www\.flickr\.com/photos/.+/(?:(?:sets)|(?:albums))?/([^/]+)' pattern_url_photostream = r'https?://www\.flickr\.com/photos/([^/]+)(?:/|(?:/page))?$' pattern_url_single_photo = r'https?://www\.flickr\.com/photos/[^/]+/(\d+)' pattern_url_gallery = r'https?://www\.flickr\.com/photos/[^/]+/galleries/(\d+)' pattern_url_group = r'https?://www\.flickr\.com/groups/([^/]+)' pattern_url_favorite = r'https?://www\.flickr\.com/photos/([^/]+)/favorites' pattern_inline_title = r'([^<]*)' pattern_inline_api_key = r'api\.site_key\s*=\s*"([^"]+)"' pattern_inline_img_url = r'"url":"([^"]+)","key":"[^"]+"}}' pattern_inline_NSID = r'"nsid"\s*:\s*"([^"]+)"' pattern_inline_video_mark = r'("mediaType":"video")' # (api_key, method, ext, page) tmpl_api_call = ( 'https://api.flickr.com/services/rest?' '&format=json&nojsoncallback=1' # UNCOMMENT FOR TESTING #'&per_page=5' '&per_page=500' # this parameter CANNOT take control of 'flickr.galleries.getPhotos' # though the doc says it should. # it's always considered to be 500 '&api_key=%s' '&method=flickr.%s' '&extras=url_sq,url_q,url_t,url_s,url_n,url_m,url_z,url_c,url_l,url_h,url_k,url_o,media' '%s&page=%d' ) tmpl_api_call_video_info = ( 'https://api.flickr.com/services/rest?' '&format=json&nojsoncallback=1' '&method=flickr.video.getStreamInfo' '&api_key=%s' '&photo_id=%s' '&secret=%s' ) tmpl_api_call_photo_info = ( 'https://api.flickr.com/services/rest?' '&format=json&nojsoncallback=1' '&method=flickr.photos.getInfo' '&api_key=%s' '&photo_id=%s' ) # it looks like flickr won't return urls for all the sizes # requested in the 'extras' field without an acceptable header dummy_header = { 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0' } def get_content_headered(url): return get_content(url, dummy_header) def get_photoset_id(url, page): return match1(url, pattern_url_photoset) def get_photo_id(url, page): return match1(url, pattern_url_single_photo) def get_gallery_id(url, page): return match1(url, pattern_url_gallery) def get_api_key(page): match = match1(page, pattern_inline_api_key) # this happens only when the url points to a gallery page # that contains no inline api_key (and never makes xhr api calls) # in fact this might be a better approach for getting a temporary api key # since there's no place for a user to add custom information that may # misguide the regex in the homepage if not match: return match1(get_html('https://flickr.com'), pattern_inline_api_key) return match def get_NSID(url, page): return match1(page, pattern_inline_NSID) # [ # ( # regex_match_url, # remote_api_method, # additional_query_parameter_for_method, # parser_for_additional_parameter, # field_where_photourls_are_saved # ) # ] url_patterns = [ # www.flickr.com/photos/{username|NSID}/sets|albums/{album-id} ( pattern_url_photoset, 'photosets.getPhotos', 'photoset_id', get_photoset_id, 'photoset' ), # www.flickr.com/photos/{username|NSID}/{pageN}?
( pattern_url_photostream, # according to flickr api documentation, this method needs to be # authenticated in order to filter photos visible to the calling user # but it seems to work fine anonymously as well 'people.getPhotos', 'user_id', get_NSID, 'photos' ), # www.flickr.com/photos/{username|NSID}/galleries/{gallery-id} ( pattern_url_gallery, 'galleries.getPhotos', 'gallery_id', get_gallery_id, 'photos' ), # www.flickr.com/groups/{groupname|groupNSID}/ ( pattern_url_group, 'groups.pools.getPhotos', 'group_id', get_NSID, 'photos' ), # www.flickr.com/photos/{username|NSID}/favorites/* ( pattern_url_favorite, 'favorites.getList', 'user_id', get_NSID, 'photos' ) ] def flickr_download_main(url, output_dir = '.', merge = False, info_only = False, **kwargs): urls = None size = 'o' # works for collections only title = None if 'stream_id' in kwargs: size = kwargs['stream_id'] if match1(url, pattern_url_single_photo): url, title = get_single_photo_url(url) urls = [url] else: urls, title = fetch_photo_url_list(url, size) index = 0 for url in urls: mime, ext, size = url_info(url) print_info('Flickr.com', title, mime, size) if not info_only: suffix = '[%d]' % index download_urls([url], title + suffix, ext, False, output_dir, None, False, False) index = index + 1 def fetch_photo_url_list(url, size): for pattern in url_patterns: # FIXME: fix multiple matching since the match group is dropped if match1(url, pattern[0]): return fetch_photo_url_list_impl(url, size, *pattern[1:]) raise NotImplementedError('Flickr extractor does not support %s.' % url) def fetch_photo_url_list_impl(url, size, method, id_field, id_parse_func, collection_name): page = get_html(url) api_key = get_api_key(page) ext_field = '' if id_parse_func: ext_field = '&%s=%s' % (id_field, id_parse_func(url, page)) page_number = 1 urls = [] while True: call_url = tmpl_api_call % (api_key, method, ext_field, page_number) photoset = json.loads(get_content_headered(call_url))[collection_name] pagen = photoset['page'] pages = photoset['pages'] for info in photoset['photo']: url = get_url_of_largest(info, api_key, size) urls.append(url) page_number = page_number + 1 # the type of 'page' and 'pages' may change between methods if str(pagen) == str(pages): break return urls, match1(page, pattern_inline_title) # image size suffixes used in inline json 'key' field # listed in descending order size_suffixes = ['o', 'k', 'h', 'l', 'c', 'z', 'm', 'n', 's', 't', 'q', 'sq'] def get_orig_video_source(api_key, pid, secret): parsed = json.loads(get_content_headered(tmpl_api_call_video_info % (api_key, pid, secret))) for stream in parsed['streams']['stream']: if stream['type'] == 'orig': return stream['_content'].replace('\\', '') return None def get_url_of_largest(info, api_key, size): if info['media'] == 'photo': sizes = size_suffixes if size in sizes: sizes = sizes[sizes.index(size):] for suffix in sizes: if 'url_' + suffix in info: return info['url_' + suffix].replace('\\', '') return None else: return get_orig_video_source(api_key, info['id'], info['secret']) def get_single_photo_url(url): page = get_html(url) pid = get_photo_id(url, page) title = match1(page, pattern_inline_title) if match1(page, pattern_inline_video_mark): api_key = get_api_key(page) reply = get_content(tmpl_api_call_photo_info % (api_key, get_photo_id(url, page))) secret = json.loads(reply)['photo']['secret'] return get_orig_video_source(api_key, pid, secret), title # the last match always has the best resolution match = match1(page, pattern_inline_img_url) return 'https:' +
match.replace('\\', ''), title site_info = "Flickr.com" download = flickr_download_main download_playlist = playlist_not_supported('flickr') ================================================ FILE: src/you_get/extractors/freesound.py ================================================ #!/usr/bin/env python __all__ = ['freesound_download'] from ..common import * def freesound_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): page = get_html(url) title = r1(r' 0: res.append(self.mapping_table[num % self.base]) num = num // self.base return ''.join(res[::-1]) class Funshion(VideoExtractor): name = "funshion" stream_types = [ {'id': 'sdvd'}, {'id': 'sdvd_h265'}, {'id': 'hd'}, {'id': 'hd_h265'}, {'id': 'dvd'}, {'id': 'dvd_h265'}, {'id': 'tv'}, {'id': 'tv_h265'} ] a_mobile_url = 'http://m.fun.tv/implay/?mid=302555' video_ep = 'http://pv.funshion.com/v7/video/play/?id={}&cl=mweb&uc=111' media_ep = 'http://pm.funshion.com/v7/media/play/?id={}&cl=mweb&uc=111' coeff = None @classmethod def fetch_magic(cls, url): def search_dict(a_dict, target): for key, val in a_dict.items(): if val == target: return key magic_list = [] page = get_content(url) src = re.findall(r'src="(.+?)"', page) js = [path for path in src if path.endswith('.js')] host = 'http://' + urllib.parse.urlparse(url).netloc js_path = [urllib.parse.urljoin(host, rel_path) for rel_path in js] for p in js_path: if 'mtool' in p or 'mcore' in p: js_text = get_content(p) hit = re.search(r'\(\'(.+?)\',(\d+),(\d+),\'(.+?)\'\.split\(\'\|\'\),\d+,\{\}\)', js_text) code = hit.group(1) base = hit.group(2) size = hit.group(3) names = hit.group(4).split('|') mapping = KBaseMapping(base=int(base)) sym_to_name = {} for no in range(int(size), 0, -1): no_in_base = mapping.mapping(no) val = names[no] if no < len(names) and names[no] else no_in_base sym_to_name[no_in_base] = val moz_ec_name = search_dict(sym_to_name, 'mozEcName') push = search_dict(sym_to_name, 'push') patt = r'{}\.{}\("(.+?)"\)'.format(moz_ec_name, push) ec_list = re.findall(patt, code) for ec in ec_list: magic_list.append(sym_to_name[ec]) return magic_list @classmethod def get_coeff(cls, magic_list): magic_set = set(magic_list) no_dup = [] for item in magic_list: if item in magic_set: magic_set.remove(item) no_dup.append(item) # really necessary?
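        # each magic pair encodes one coefficient: its last character is the
        # index (0-3) and its leading characters are the hex value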
coeff = [0, 0, 0, 0] for num_pair in no_dup: idx = int(num_pair[-1]) val = int(num_pair[:-1], 16) coeff[idx] = val return coeff @classmethod def funshion_decrypt(cls, a_bytes, coeff): res_list = [] pos = 0 while pos < len(a_bytes): a = a_bytes[pos] if pos == len(a_bytes) - 1: res_list.append(a) pos += 1 else: b = a_bytes[pos + 1] m = a * coeff[0] + b * coeff[2] n = a * coeff[1] + b * coeff[3] res_list.append(m & 0xff) res_list.append(n & 0xff) pos += 2 return bytes(res_list).decode('utf8') @classmethod def funshion_decrypt_str(cls, a_str, coeff): # r'.{27}0' pattern, untested if len(a_str) == 28 and a_str[-1] == '0': data_bytes = base64.b64decode(a_str[:27] + '=') clear = cls.funshion_decrypt(data_bytes, coeff) return binascii.hexlify(clear.encode('utf8')).upper() data_bytes = base64.b64decode(a_str[2:]) return cls.funshion_decrypt(data_bytes, coeff) @classmethod def checksum(cls, sha1_str): if len(sha1_str) != 41: return False if not re.match(r'[0-9A-Za-z]{41}', sha1_str): return False sha1 = sha1_str[:-1] if (15 & sum([int(char, 16) for char in sha1])) == int(sha1_str[-1], 16): return True return False @classmethod def get_cdninfo(cls, hashid): url = 'http://jobsfe.funshion.com/query/v1/mp4/{}.json'.format(hashid) meta = json.loads(get_content(url, decoded=False).decode('utf8')) return meta['playlist'][0]['urls'] @classmethod def dec_playinfo(cls, info, coeff): res = None clear = cls.funshion_decrypt_str(info['infohash'], coeff) if cls.checksum(clear): res = dict(hashid=clear[:40], token=cls.funshion_decrypt_str(info['token'], coeff)) else: clear = cls.funshion_decrypt_str(info['infohash_prev'], coeff) if cls.checksum(clear): res = dict(hashid=clear[:40], token=cls.funshion_decrypt_str(info['token_prev'], coeff)) return res def prepare(self, **kwargs): if self.__class__.coeff is None: magic_list = self.__class__.fetch_magic(self.__class__.a_mobile_url) self.__class__.coeff = self.__class__.get_coeff(magic_list) if 'title' not in kwargs: url = 'http://pv.funshion.com/v5/video/profile/?id={}&cl=mweb&uc=111'.format(self.vid) meta = json.loads(get_content(url)) self.title = meta['name'] else: self.title = kwargs['title'] ep_url = self.__class__.video_ep if 'single_video' in kwargs else self.__class__.media_ep url = ep_url.format(self.vid) meta = json.loads(get_content(url)) streams = meta['playlist'] for stream in streams: definition = stream['code'] for s in stream['playinfo']: codec = 'h' + s['codec'][2:] # h.264 -> h264 for st in self.__class__.stream_types: s_id = '{}_{}'.format(definition, codec) if codec == 'h264': s_id = definition if s_id == st['id']: clear_info = self.__class__.dec_playinfo(s, self.__class__.coeff) cdn_list = self.__class__.get_cdninfo(clear_info['hashid']) base_url = cdn_list[0] vf = urllib.parse.quote(s['vf']) video_size = int(s['filesize']) token = urllib.parse.quote(base64.b64encode(clear_info['token'].encode('utf8'))) video_url = '{}?token={}&vf={}'.format(base_url, token, vf) self.streams[s_id] = dict(size=video_size, src=[video_url], container='mp4') def funshion_download(url, **kwargs): if re.match(r'http://www.fun.tv/vplay/v-(\w+)', url): vid = re.search(r'http://www.fun.tv/vplay/v-(\w+)', url).group(1) Funshion().download_by_vid(vid, single_video=True, **kwargs) elif re.match(r'http://www.fun.tv/vplay/.*g-(\w+)', url): epid = re.search(r'http://www.fun.tv/vplay/.*g-(\w+)', url).group(1) url = 'http://pm.funshion.com/v5/media/episode?id={}&cl=mweb&uc=111'.format(epid) meta = json.loads(get_content(url)) drama_name = meta['name'] extractor = Funshion() for 
ep in meta['episodes']: title = '{}_{}_{}'.format(drama_name, ep['num'], ep['name']) extractor.download_by_vid(ep['id'], title=title, **kwargs) else: log.wtf('Unknown url pattern') site_info = "funshion" download = funshion_download download_playlist = playlist_not_supported('funshion') ================================================ FILE: src/you_get/extractors/giphy.py ================================================ #!/usr/bin/env python __all__ = ['giphy_download'] from ..common import * def giphy_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) url = list(set([ unicodize(str.replace(i, '\\/', '/')) for i in re.findall(r'', html) ])) title = r1(r'', html) if title is None: title = url[0] type, ext, size = url_info(url[0], True) size = urls_size(url) type = "video/mp4" ext = "mp4" print_info(site_info, title, type, size) if not info_only: download_urls(url, title, ext, size, output_dir, merge=False) site_info = "Giphy.com" download = giphy_download download_playlist = playlist_not_supported('giphy') ================================================ FILE: src/you_get/extractors/google.py ================================================ #!/usr/bin/env python __all__ = ['google_download'] from ..common import * import re # YouTube media encoding options, in descending quality order. # taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013. youtube_codecs = [ {'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': 102, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '2', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': 45, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': '', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': '', 'audio_bitrate': ''}, {'itag': 22, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': 84, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'}, {'itag': 120, 'container': 'FLV', 'video_resolution': '720p', 'video_encoding': 'AVC', 'video_profile': 'Main@L3.1', 'video_bitrate': '2', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, {'itag': 85, 'container': 'MP4', 'video_resolution': '520p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'}, {'itag': 44, 'container': 'WebM', 'video_resolution': '480p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '1', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': 35, 'container': 'FLV', 'video_resolution': '480p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.8-1', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, {'itag': 101, 'container': 'WebM', 'video_resolution': '360p', 
'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': 100, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': 43, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': 34, 'container': 'FLV', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, {'itag': 82, 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': 18, 'container': 'MP4', 'video_resolution': '270p/360p', 'video_encoding': 'H.264', 'video_profile': 'Baseline', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': 6, 'container': 'FLV', 'video_resolution': '270p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.8', 'audio_encoding': 'MP3', 'audio_bitrate': '64'}, {'itag': 83, 'container': 'MP4', 'video_resolution': '240p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': 13, 'container': '3GP', 'video_resolution': '', 'video_encoding': 'MPEG-4 Visual', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': ''}, {'itag': 5, 'container': 'FLV', 'video_resolution': '240p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.25', 'audio_encoding': 'MP3', 'audio_bitrate': '64'}, {'itag': 36, 'container': '3GP', 'video_resolution': '240p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.17', 'audio_encoding': 'AAC', 'audio_bitrate': '38'}, {'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] fmt_level = dict( zip( [str(codec['itag']) for codec in youtube_codecs], range(len(youtube_codecs)))) def google_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): # Percent-encoding Unicode URL url = parse.quote(url, safe = ':/+%?=') service = url.split('/')[2].split('.')[0] if service == 'plus': # Google Plus # attempt to extract images first # TBD: posts with > 4 images # TBD: album links html = get_html(parse.unquote(url), faker=True) real_urls = [] for src in re.findall(r'src="([^"]+)"[^>]*itemprop="image"', html): t = src.split('/') t[0], t[-2] = t[0] or 'https:', 's0-d' u = '/'.join(t) real_urls.append(u) if not real_urls: real_urls = re.findall(r']*>([^<\n]+)', post_html) if title is None: response = request.urlopen(request.Request(real_url)) if response.headers['content-disposition']: filename = parse.unquote(r1(r'filename="?(.+)"?', response.headers['content-disposition'])).split('.') title = ''.join(filename[:-1]) except: pass for (i, real_url) in enumerate(real_urls): title_i = "%s[%s]" % (title, i) if len(real_urls) > 1 else title type, ext, size = url_info(real_url) if ext is None: ext = 'mp4' print_info(site_info, title_i, ext, size) if not info_only: download_urls([real_url], title_i, ext, size, output_dir, merge = merge) elif service in ['docs', 'drive'] : # Google Docs html = 
get_content(url, headers=fake_headers) title = r1(r'"title":"([^"]*)"', html) or r1(r' 1: title = ".".join(title.split('.')[:-1]) docid = r1('/file/d/([^/]+)', url) request.install_opener(request.build_opener(request.HTTPCookieProcessor())) real_url = "https://docs.google.com/uc?export=download&confirm=no_antivirus&id=%s" % docid redirected_url = get_location(real_url) if real_url != redirected_url: # tiny file - get real url here type, ext, size = url_info(redirected_url) real_url = redirected_url else: # huge file - the real_url is a confirm page and real url is in it confirm_page = get_content(real_url) hrefs = re.findall(r'href="(.+?)"', confirm_page) for u in hrefs: if u.startswith('/uc?export=download'): rel = unescape_html(u) confirm_url = 'https://docs.google.com' + rel real_url = get_location(confirm_url) _, ext, size = url_info(real_url, headers=fake_headers) if size is None: size = 0 print_info(site_info, title, ext, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge = merge) site_info = "Google.com" download = google_download download_playlist = playlist_not_supported('google') ================================================ FILE: src/you_get/extractors/heavymusic.py ================================================ #!/usr/bin/env python __all__ = ['heavymusic_download'] from ..common import * def heavymusic_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) tracks = re.findall(r'href="(online2\.php[^"]+)"', html) for track in tracks: band = r1(r'band=([^&]*)', track) album = r1(r'album=([^&]*)', track) title = r1(r'track=([^&]*)', track) file_url = 'http://www.heavy-music.ru/online2.php?band=%s&album=%s&track=%s' % (parse.quote(band), parse.quote(album), parse.quote(title)) _, _, size = url_info(file_url) print_info(site_info, title, 'mp3', size) if not info_only: download_urls([file_url], title[:-4], 'mp3', size, output_dir, merge=merge) site_info = "heavy-music.ru" download = heavymusic_download download_playlist = heavymusic_download ================================================ FILE: src/you_get/extractors/huomaotv.py ================================================ #!/usr/bin/env python __all__ = ['huomaotv_download'] from ..common import * def get_mobile_room_url(room_id): return 'http://www.huomao.com/mobile/mob_live/%s' % room_id def get_m3u8_url(stream_id): return 'http://live-ws.huomaotv.cn/live/%s/playlist.m3u8' % stream_id def huomaotv_download(url, output_dir='.', merge=True, info_only=False, **kwargs): room_id_pattern = r'huomao.com/(\d+)' room_id = match1(url, room_id_pattern) html = get_content(get_mobile_room_url(room_id)) stream_id_pattern = r'id="html_stream" value="(\w+)"' stream_id = match1(html, stream_id_pattern) m3u8_url = get_m3u8_url(stream_id) title = match1(html, r'([^<]{1,9999})') print_info(site_info, title, 'm3u8', float('inf')) if not info_only: download_url_ffmpeg(m3u8_url, title, 'm3u8', None, output_dir=output_dir, merge=merge) site_info = 'huomao.com' download = huomaotv_download download_playlist = playlist_not_supported('huomao') ================================================ FILE: src/you_get/extractors/icourses.py ================================================ #!/usr/bin/env python from ..common import * from urllib import parse, error import random from time import sleep import datetime import hashlib import base64 import logging import re from xml.dom.minidom import parseString __all__ = ['icourses_download', 'icourses_playlist_download'] def 
icourses_download(url, output_dir='.', **kwargs): if 'showResDetail.action' in url: hit = re.search(r'id=(\d+)&courseId=(\d+)', url) url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'.format(hit.group(1), hit.group(2)) if re.match(r'http://www.icourses.cn/coursestatic/course_(\d+).html', url): raise Exception('You can download it with the -l flag') icourses_parser = ICousesExactor(url=url) icourses_parser.basic_extract() title = icourses_parser.title size = None for i in range(5): try: # use this url only for size size_url = icourses_parser.generate_url(0) _, type_, size = url_info(size_url, headers=fake_headers) except error.HTTPError: logging.warning('Failed to fetch the video file! Retrying...') sleep(random.Random().randint(2, 5)) # prevent being blocked else: print_info(site_info, title, type_, size) break if size is None: raise Exception('Failed to fetch the video size after 5 attempts') if not kwargs['info_only']: real_url = icourses_parser.update_url(0) headers = fake_headers.copy() headers['Referer'] = url download_urls_icourses(real_url, title, 'flv', total_size=size, output_dir=output_dir, max_size=15728640, dyn_callback=icourses_parser.update_url) return def get_course_title(url, course_type, page=None): if page is None: try: # shared course page could be gbk but with charset="utf-8" page = get_content(url, decoded=False).decode('gbk') except UnicodeDecodeError: page = get_content(url, decoded=False).decode('utf8') if course_type == 'shared_old': patt = r'(.+?)<\/div>' elif course_type == 'shared_new': patt = r'

    (.+?)<\/h1>' else: patt = r'(.+?)<\/div>' return re.search(patt, page).group(1) def public_course_playlist(url, page=None): host = 'http://www.icourses.cn/' patt = r'(?:.|\n)+?' if page is None: page = get_content(url) playlist = re.findall(patt, page) return [(host+i[0], i[1]) for i in playlist] def public_course_get_title(url, page=None): patt = r'.+?第(\d+)讲' if page is None: page = get_content(url) seq_num = int(re.search(patt, page).group(1)) - 1 course_main_title = get_course_title(url, 'public', page) return '{}_第{}讲_{}'.format(course_main_title, seq_num+1, public_course_playlist(url, page)[seq_num][1]) def icourses_playlist_download(url, output_dir='.', **kwargs): page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' resid_courseid_patt = r'changeforvideo\(\'(\d+)\',\'(\d+)\',\'(\d+)\'\)' ep = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}' change_for_video_ip = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}' video_list = [] if 'viewVCourse' in url: playlist = public_course_playlist(url) for video in playlist: icourses_download(video[0], output_dir=output_dir, **kwargs) return elif 'coursestatic' in url: course_page = get_content(url) page_navi_vars = re.search(page_type_patt, course_page) if page_navi_vars is None: # type 2 shared course video_list = icourses_playlist_new(url, course_page) else: # type 1 shared course sec_page = get_content(ep.format(page_navi_vars.group(2), page_navi_vars.group(1))) video_list = re.findall(resid_courseid_patt, sec_page) elif 'viewCharacterDetail.action' in url or 'changeforVideo.action' in url: page = get_content(url) video_list = re.findall(resid_courseid_patt, page) if not video_list: raise Exception('Unknown url pattern') for video in video_list: video_url = change_for_video_ip.format(video[0], video[1]) sleep(random.Random().randint(0, 5)) # Prevent from blockage icourses_download(video_url, output_dir=output_dir, **kwargs) def icourses_playlist_new(url, page=None): # 2 helpers using same interface in the js code def to_chap(course_id, chap_id, mod): ep = 'http://www.icourses.cn/jpk/viewCharacterDetail2.action?courseId={}&characId={}&mod={}' req = post_content(ep.format(course_id, chap_id, mod), post_data={}) return req def to_sec(course_id, chap_id, mod): ep = 'http://www.icourses.cn/jpk/viewCharacterDetail2.action?courseId={}&characId={}&mod={}' req = post_content(ep.format(course_id, chap_id, mod), post_data={}) return req def show_sec(course_id, chap_id): ep = 'http://www.icourses.cn/jpk/getSectionNode.action?courseId={}&characId={}&mod=2' req = post_content(ep.format(course_id, chap_id), post_data={}) return req if page is None: page = get_content(url) chap_patt = r'

    .+?id="parent_row_(\d+)".+?onclick="(\w+)\((.+)\)"' to_chap_patt = r'this,(\d+),(\d+),(\d)' show_sec_patt = r'this,(\d+),(\d+)' res_patt = r'res_showResDetail\(\'(\d+)\',\'.+?\',\'\d+\',\'mp4\',\'(\d+)\'\)' l = re.findall(chap_patt, page) for i in l: if i[1] == 'ajaxtocharac': hit = re.search(to_chap_patt, i[2]) page = to_chap(hit.group(1), hit.group(2), hit.group(3)) hit_list = re.findall(res_patt, page) if hit_list: return get_playlist(hit_list[0][0], hit_list[0][1]) for hit in hit_list: print(hit) elif i[1] == 'showSectionNode2': hit = re.search(show_sec_patt, i[2]) page = show_sec(hit.group(1), hit.group(2)) # print(page) patt = r'ajaxtosection\(this,(\d+),(\d+),(\d+)\)' hit_list = re.findall(patt, page) # print(hit_list) for hit in hit_list: page = to_sec(hit[0], hit[1], hit[2]) vlist = re.findall(res_patt, page) if vlist: return get_playlist(vlist[0][0], vlist[0][1]) raise Exception("No video found in this playlist") def get_playlist(res_id, course_id): ep = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}' req = get_content(ep.format(res_id, course_id)) patt = r' (.*?)' title_b_patt = r'
    ((.|\n)*?)
    ' title_a = match1(self.page, title_a_patt).strip() title_b = match1(self.page, title_b_patt).strip() title = title_a + title_b title = re.sub('( +|\n|\t|\r| )', '', unescape_html(title).replace(' ', '')) self.title = title def get_flashvars(self): patt = r'var flashvars\s*=\s*(\{(?:.|\n)+?\});' hit = re.search(patt, self.page) if hit is None: raise Exception('Cannot find flashvars') flashvar_str = hit.group(1) uuid = re.search(r'uuid\s*:\s*\"?(\w+)\"?', flashvar_str).group(1) other = re.search(r'other\s*:\s*"(.*?)"', flashvar_str).group(1) isvc = re.search(r'IService\s*:\s*\'(.+?)\'', flashvar_str).group(1) player_time_patt = r'MPlayer.swf\?v\=(\d+)' player_time = re.search(player_time_patt, self.page).group(1) self.flashvars = dict(IService=isvc, uuid=uuid, other=other, v=player_time) def api_req(self, url): xml_str = get_content(url) dom = parseString(xml_str) status = dom.getElementsByTagName('result')[0].getAttribute('status') if status != 'success': raise Exception('API returned fail') api_res = {} meta = dom.getElementsByTagName('metadata') for m in meta: key = m.getAttribute('name') val = m.firstChild.nodeValue api_res[key] = val self.api_data = api_res def basic_extract(self): self.get_title() self.get_flashvars() api_req_url = '{}?{}'.format(self.flashvars['IService'], parse.urlencode(self.flashvars)) self.api_req(api_req_url) def do_extract(self, received=0): self.basic_extract() return self.generate_url(received) def update_url(self, received): args = self.common_args.copy() play_type = 'seek' if received else 'play' received = received if received else -1 args['ls'] = play_type args['start'] = received + 1 args['lt'] = self.get_date_str() if self.enc_mode: ssl_ts, sign = self.get_sign(self.media_url) extra_args = dict(h=sign, r=ssl_ts, p=self.__class__.ENCRYPT_MOD_VER) args.update(extra_args) return '{}?{}'.format(self.media_url, parse.urlencode(args)) @classmethod def get_date_str(self): fmt_str = '%-m-%-d/%-H:%-M:%-S' now = datetime.datetime.now() try: date_str = now.strftime(fmt_str) except ValueError: # msvcrt date_str = '{}-{}/{}:{}:{}'.format(now.month, now.day, now.hour, now.minute, now.second) return date_str def generate_url(self, received): media_host = self.get_media_host(self.api_data['host']) media_url = media_host + self.api_data['url'] self.media_url = media_url common_args = dict(lv=self.__class__.PLAYER_BASE_VER) h = self.api_data.get('h') r = self.api_data.get('p', self.__class__.ENCRYPT_MOD_VER) if self.api_data['ssl'] != 'true': self.enc_mode = False common_args.update(dict(h=h, r=r)) else: self.enc_mode = True common_args['p'] = self.__class__.ENCRYPT_MOD_VER self.common_args = common_args return self.update_url(received) def get_sign(self, media_url): media_host = parse.urlparse(media_url).netloc ran = random.randint(0, 9999999) ssl_callback = get_content('http://{}/ssl/ssl.shtml?r={}'.format(media_host, ran)).split(',') ssl_ts = int(datetime.datetime.strptime(ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) sign_this = self.__class__.ENCRYPT_SALT + parse.urlparse(media_url).path + str(ssl_ts) arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest(), altchars=b'-_') return ssl_ts, arg_h.decode('utf-8').strip('=') def get_media_host(self, ori_host): res = get_content(ori_host + '/ssl/host.shtml').strip() path = parse.urlparse(ori_host).path return ''.join([res, path]) def download_urls_icourses(url, title, ext, total_size, output_dir='.', headers=None, **kwargs): if dry_run or player: log.wtf('Non standard 
protocol') title = get_filename(title) filename = '%s.%s' % (title, ext) filepath = os.path.join(output_dir, filename) if not force and os.path.exists(filepath): print('Skipping {}: file already exists\n'.format(filepath)) return bar = SimpleProgressBar(total_size, 1) print('Downloading %s ...' % tr(filename)) url_save_icourses(url, filepath, bar, total_size, headers=headers, **kwargs) bar.done() print() def url_save_icourses(url, filepath, bar, total_size, dyn_callback=None, is_part=False, max_size=0, headers=None): def dyn_update_url(received): if callable(dyn_callback): logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received)) return dyn_callback(received) if bar is None: bar = DummyProgressBar() if os.path.exists(filepath): if not force: if not is_part: bar.done() print('Skipping %s: file already exists' % tr(os.path.basename(filepath))) else: filesize = os.path.getsize(filepath) bar.update_received(filesize) return else: if not is_part: bar.done() print('Overwriting %s' % os.path.basename(filepath), '...') elif not os.path.exists(os.path.dirname(filepath)): os.mkdir(os.path.dirname(filepath)) temp_filepath = filepath + '.download' received = 0 if not force: open_mode = 'ab' if os.path.exists(temp_filepath): tempfile_size = os.path.getsize(temp_filepath) received += tempfile_size bar.update_received(tempfile_size) else: open_mode = 'wb' if received: url = dyn_update_url(received) if headers is None: headers = {} response = urlopen_with_retry(request.Request(url, headers=headers)) # Do not update content-length here. # Only the 1st segment's content-length is the content-length of the file. # For other segments, content-length is the standard one, 15 * 1024 * 1024 with open(temp_filepath, open_mode) as output: before_this_uri = received # received - before_this_uri is size of the buf we get from one uri while True: update_bs = 256 * 1024 left_bytes = total_size - received to_read = left_bytes if left_bytes <= update_bs else update_bs # calc the block size to read -- The server can fail to send an EOF buffer = response.read(to_read) if not buffer: logging.debug('Got EOF from server') break output.write(buffer) received += len(buffer) bar.update_received(len(buffer)) if received >= total_size: break if max_size and (received - before_this_uri) >= max_size: url = dyn_update_url(received) before_this_uri = received response = urlopen_with_retry(request.Request(url, headers=headers)) assert received == os.path.getsize(temp_filepath), '%s == %s' % (received, os.path.getsize(temp_filepath)) if os.access(filepath, os.W_OK): os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) site_info = 'icourses.cn' download = icourses_download download_playlist = icourses_playlist_download ================================================ FILE: src/you_get/extractors/ifeng.py ================================================ #!/usr/bin/env python __all__ = ['ifeng_download', 'ifeng_download_by_id'] from ..common import * def ifeng_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False): assert r1(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', id), id url = 'http://vxml.ifengimg.com/video_info_new/%s/%s/%s.xml' % (id[-2], id[-2:], id) xml = get_html(url, 'utf-8') title = r1(r'Name="([^"]+)"', xml) title = unescape_html(title) url = r1(r'VideoPlayUrl="([^"]+)"', xml) from random import randint r = randint(10, 19) url = url.replace('http://wideo.ifeng.com/', 
'http://ips.ifeng.com/wideo.ifeng.com/') type, ext, size = url_info(url) print_info(site_info, title, ext, size) if not info_only: download_urls([url], title, ext, size, output_dir, merge = merge) def ifeng_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): # old pattern /uuid.shtml # now it could be #uuid id = r1(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', url) if id: return ifeng_download_by_id(id, None, output_dir = output_dir, merge = merge, info_only = info_only) html = get_content(url) uuid_pattern = r'"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})"' id = r1(r'var vid="([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})"', html) if id is None: video_pattern = r'"vid"\s*:\s*' + uuid_pattern id = match1(html, video_pattern) assert id, "can't find video info" return ifeng_download_by_id(id, None, output_dir = output_dir, merge = merge, info_only = info_only) site_info = "ifeng.com" download = ifeng_download download_playlist = playlist_not_supported('ifeng') ================================================ FILE: src/you_get/extractors/imgur.py ================================================ #!/usr/bin/env python from ..common import * from ..extractor import VideoExtractor from .universal import * class Imgur(VideoExtractor): name = "Imgur" stream_types = [ {'id': 'original'}, {'id': 'thumbnail'}, ] def prepare(self, **kwargs): self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/123.0.2420.97' if re.search(r'imgur\.com/a/', self.url): # album content = get_content(self.url, headers=fake_headers) album = match1(content, r'album\s*:\s*({.*}),') or \ match1(content, r'image\s*:\s*({.*}),') album = json.loads(album) count = album['album_images']['count'] images = album['album_images']['images'] ext = images[0]['ext'] self.streams = { 'original': { 'src': ['http://i.imgur.com/%s%s' % (i['hash'], ext) for i in images], 'size': sum([i['size'] for i in images]), 'container': ext[1:] }, 'thumbnail': { 'src': ['http://i.imgur.com/%ss%s' % (i['hash'], '.jpg') for i in images], 'container': 'jpg' } } self.title = album['title'] elif re.search(r'i\.imgur\.com/', self.url): # direct image _, container, size = url_info(self.url, faker=True) self.streams = { 'original': { 'src': [self.url], 'size': size, 'container': container } } self.title = r1(r'i\.imgur\.com/([^./]*)', self.url) else: # gallery image content = get_content(self.url, headers=fake_headers) url = match1(content, r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)') or \ match1(content, r'meta property="og:image"[^>]+(https?://i.imgur.com/[^"?]+)') _, container, size = url_info(url, headers={'User-Agent': fake_headers['User-Agent']}) self.streams = { 'original': { 'src': [url], 'size': size, 'container': container } } self.title = r1(r'i\.imgur\.com/([^./]*)', url) def extract(self, **kwargs): if 'stream_id' in kwargs and kwargs['stream_id']: i = kwargs['stream_id'] if 'size' not in self.streams[i]: self.streams[i]['size'] = urls_size(self.streams[i]['src']) site = Imgur() download = site.download_by_url download_playlist = site.download_by_url ================================================ FILE: src/you_get/extractors/infoq.py ================================================ #!/usr/bin/env python from ..common import * from ..extractor import VideoExtractor import ssl class Infoq(VideoExtractor): name = "InfoQ" stream_types = [ {'id': 'video'}, {'id': 'audio'}, {'id': 
'slides'} ] def prepare(self, **kwargs): content = get_content(self.url) self.title = match1(content, r'([^<]+)') s = match1(content, r'P\.s\s*=\s*\'([^\']+)\'') scp = match1(content, r'InfoQConstants\.scp\s*=\s*\'([^\']+)\'') scs = match1(content, r'InfoQConstants\.scs\s*=\s*\'([^\']+)\'') sck = match1(content, r'InfoQConstants\.sck\s*=\s*\'([^\']+)\'') mp3 = match1(content, r'name="filename"\s*value="([^"]+\.mp3)"') if mp3: mp3 = 'http://res.infoq.com/downloads/mp3downloads/%s' % mp3 pdf = match1(content, r'name="filename"\s*value="([^"]+\.pdf)"') if pdf: pdf = 'http://res.infoq.com/downloads/pdfdownloads/%s' % pdf # cookie handler ssl_context = request.HTTPSHandler( context=ssl.SSLContext(ssl.PROTOCOL_TLSv1)) cookie_handler = request.HTTPCookieProcessor() opener = request.build_opener(ssl_context, cookie_handler) opener.addheaders = [ ('Referer', self.url), ('Cookie', 'CloudFront-Policy=%s;CloudFront-Signature=%s;CloudFront-Key-Pair-Id=%s' % (scp, scs, sck)) ] request.install_opener(opener) if s: self.streams['video'] = {'url': s } if mp3: self.streams['audio'] = { 'url': mp3 } if pdf: self.streams['slides'] = { 'url': pdf } def extract(self, **kwargs): for i in self.streams: s = self.streams[i] _, s['container'], s['size'] = url_info(s['url']) s['src'] = [s['url']] site = Infoq() download = site.download_by_url download_playlist = site.download_by_url ================================================ FILE: src/you_get/extractors/instagram.py ================================================ #!/usr/bin/env python __all__ = ['instagram_download'] from ..common import * def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.2592.87', 'sec-fetch-mode': 'navigate' # important } url = r1(r'([^?]*)', url) cont = get_content(url, headers=headers) vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r'([^<]*)', cont) # with logged-in cookies title = "{} [{}]".format(description.replace("\n", " "), vid) appId = r1(r'"appId":"(\d+)"', cont) media_id = r1(r'"media_id":"(\d+)"', cont) logging.debug('appId: %s' % appId) logging.debug('media_id: %s' % media_id) api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id try: api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) post = json.loads(api_cont) except: log.wtf('[Error] Please specify a cookie file.') for item in post['items']: code = item['code'] carousel_media = item.get('carousel_media') or [item] for i, media in enumerate(carousel_media): title = '%s [%s]' % (code, i) image_url = media['image_versions2']['candidates'][0]['url'] ext = image_url.split('?')[0].split('.')[-1] size = int(get_head(image_url)['Content-Length']) print_info(site_info, title, ext, size) if not info_only: download_urls(urls=[image_url], title=title, ext=ext, total_size=size, output_dir=output_dir) # download videos (if any) if 'video_versions' in media: video_url = media['video_versions'][0]['url'] ext = video_url.split('?')[0].split('.')[-1] size = int(get_head(video_url)['Content-Length']) print_info(site_info, title, ext, size) if not info_only: download_urls(urls=[video_url], title=title, ext=ext, total_size=size, output_dir=output_dir) site_info = "Instagram.com" download = instagram_download download_playlist = playlist_not_supported('instagram') ================================================ FILE: src/you_get/extractors/interest.py 
================================================ #!/usr/bin/env python from ..common import * from json import loads def interest_download(url, output_dir='.', merge=True, info_only=False, **kwargs): #http://ch.interest.me/zhtv/VOD/View/114789 #http://program.interest.me/zhtv/sonja/8/Vod/View/15794 html = get_content(url) #get title title = match1(html, r' http://www.iqiyi.com/common/flashplayer/20150916/MainPlayer_5_2_28_c3_3_7_4.swf use @fffonion 's method in #617. Add trace AVM(asasm) code in Iqiyi's encode function where the salt is put into the encode array and reassemble by RABCDasm(or WinRABCDasm),then use Fiddler to response modified file to replace the src file with its AutoResponder function ,set browser Fiddler proxy and play with !debug version! Flash Player ,finially get result in flashlog.txt(its location can be easily found in search engine). Code Like (without letters after #comment:),it just do the job : trace("{IQIYI_SALT}:"+salt_array.join("")) ```(Position After getTimer) findpropstrict QName(PackageNamespace(""), "trace") pushstring "{IQIYI_SALT}:" #comment for you to locate the salt getscopeobject 1 getslot 17 #comment: 17 is the salt slots number defined in code pushstring "" callproperty QName(Namespace("http://adobe.com/AS3/2006/builtin"), "join"), 1 add callpropvoid QName(PackageNamespace(""), "trace"), 1 ``` -> http://www.iqiyi.com/common/flashplayer/20150820/MainPlayer_5_2_27_2_c3_3_7_3.swf some small changes in Zombie.bite function ''' ''' com.qiyi.player.core.model.def.DefinitonEnum bid meaning for quality 0 none 1 standard 2 high 3 super 4 suprt-high 5 fullhd 10 4k 96 topspeed ''' ''' def mix(tvid): salt = '4a1caba4b4465345366f28da7c117d20' tm = str(randint(2000,4000)) sc = hashlib.new('md5', bytes(salt + tm + tvid, 'utf-8')).hexdigest() return tm, sc, 'eknas' def getVRSXORCode(arg1,arg2): loc3=arg2 %3 if loc3 == 1: return arg1^121 if loc3 == 2: return arg1^72 return arg1^103 def getVrsEncodeCode(vlink): loc6=0 loc2='' loc3=vlink.split("-") loc4=len(loc3) # loc5=loc4-1 for i in range(loc4-1,-1,-1): loc6=getVRSXORCode(int(loc3[loc4-i-1],16),i) loc2+=chr(loc6) return loc2[::-1] def getDispathKey(rid): tp=")(*&^flash@#$%a" #magic from swf time=json.loads(get_content("http://data.video.qiyi.com/t?tn="+str(random())))["t"] t=str(int(floor(int(time)/(10*60.0)))) return hashlib.new("md5",bytes(t+tp+rid,"utf-8")).hexdigest() ''' def getVMS(tvid, vid): t = int(time.time() * 1000) src = '76f90cbd92f94a2e925d83e8ccd22cb7' key = 'd5fb4bd9d50c4be6948c97edd7254b0e' sc = hashlib.new('md5', bytes(str(t) + key + vid, 'utf-8')).hexdigest() vmsreq= url = 'http://cache.m.iqiyi.com/tmts/{0}/{1}/?t={2}&sc={3}&src={4}'.format(tvid,vid,t,sc,src) return json.loads(get_content(vmsreq)) class Iqiyi(VideoExtractor): name = "爱奇艺 (Iqiyi)" stream_types = [ {'id': '4k', 'container': 'm3u8', 'video_profile': '4k'}, {'id': 'BD', 'container': 'm3u8', 'video_profile': '1080p'}, {'id': 'TD', 'container': 'm3u8', 'video_profile': '720p'}, {'id': 'TD_H265', 'container': 'm3u8', 'video_profile': '720p H265'}, {'id': 'HD', 'container': 'm3u8', 'video_profile': '540p'}, {'id': 'HD_H265', 'container': 'm3u8', 'video_profile': '540p H265'}, {'id': 'SD', 'container': 'm3u8', 'video_profile': '360p'}, {'id': 'LD', 'container': 'm3u8', 'video_profile': '210p'}, ] ''' supported_stream_types = [ 'high', 'standard'] stream_to_bid = { '4k': 10, 'fullhd' : 5, 'suprt-high' : 4, 'super' : 3, 'high' : 2, 'standard' :1, 'topspeed' :96} ''' ids = ['4k','BD', 'TD', 'HD', 'SD', 'LD'] vd_2_id = {10: '4k', 19: '4k', 
5:'BD', 18: 'BD', 21: 'HD_H265', 2: 'HD', 4: 'TD', 17: 'TD_H265', 96: 'LD', 1: 'SD', 14: 'TD'} id_2_profile = {'4k':'4k', 'BD': '1080p','TD': '720p', 'HD': '540p', 'SD': '360p', 'LD': '210p', 'HD_H265': '540p H265', 'TD_H265': '720p H265'} def download_playlist_by_url(self, url, **kwargs): self.url = url video_page = get_content(url) videos = set(re.findall(r' np try: if info["data"]['vp']["tkl"]=='' : raise ValueError except: log.e("[Error] Do not support for iQIYI VIP video.") exit(-1) vs = info["data"]["vp"]["tkl"][0]["vs"] self.baseurl=info["data"]["vp"]["du"].split("/") for stream in self.stream_types: for i in vs: if self.stream_to_bid[stream['id']] == i['bid']: video_links=i["fs"] #now in i["flvs"] not in i["fs"] if not i["fs"][0]["l"].startswith("/"): tmp = getVrsEncodeCode(i["fs"][0]["l"]) if tmp.endswith('mp4'): video_links = i["flvs"] self.stream_urls[stream['id']] = video_links size = 0 for l in video_links: size += l['b'] self.streams[stream['id']] = {'container': stream['container'], 'video_profile': stream['video_profile'], 'size' : size} break def extract(self, **kwargs): if 'stream_id' in kwargs and kwargs['stream_id']: # Extract the stream stream_id = kwargs['stream_id'] if stream_id not in self.streams: log.e('[Error] Invalid video format.') log.e('Run \'-i\' command with no specific video format to view all available formats.') exit(2) else: # Extract stream with the best quality stream_id = self.streams_sorted[0]['id'] urls=[] for i in self.stream_urls[stream_id]: vlink=i["l"] if not vlink.startswith("/"): #vlink is encode vlink=getVrsEncodeCode(vlink) key=getDispathKey(vlink.split("/")[-1].split(".")[0]) baseurl = [x for x in self.baseurl] baseurl.insert(-1,key) url="/".join(baseurl)+vlink+'?su='+self.gen_uid+'&qyid='+uuid4().hex+'&client=&z=&bt=&ct=&tn='+str(randint(10000,20000)) urls.append(json.loads(get_content(url))["l"]) #download should be complete in 10 minutes #because the url is generated before start downloading #and the key may be expired after 10 minutes self.streams[stream_id]['src'] = urls ''' site = Iqiyi() download = site.download_by_url iqiyi_download_by_vid = site.download_by_vid download_playlist = site.download_playlist_by_url ================================================ FILE: src/you_get/extractors/iwara.py ================================================ #!/usr/bin/env python __all__ = ['iwara_download'] from ..common import * headers = { 'DNT': '1', 'Accept-Encoding': 'gzip, deflate, sdch, br', 'Accept-Language': 'en-CA,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Save-Data': 'on', 'Cookie':'has_js=1;show_adult=1', } stream_types = [ {'id': 'Source', 'container': 'mp4', 'video_profile': '原始'}, {'id': '540p', 'container': 'mp4', 'video_profile': '540p'}, {'id': '360p', 'container': 'mp4', 'video_profile': '360P'}, ] def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs): global headers video_hash = match1(url, r'https?://\w+.iwara.tv/videos/(\w+)') video_url = match1(url, r'(https?://\w+.iwara.tv)/videos/\w+') html = get_content(url, headers=headers) title = r1(r'(.*)', html) api_url = video_url + '/api/video/' + video_hash content = get_content(api_url, headers=headers) data = json.loads(content) if 
len(data)<1 : print('Maybe a private video? '+'['+title+']') return True down_urls = 'https:' + data[0]['uri'] type, ext, size = url_info(down_urls, headers=headers) print_info(site_info, title+data[0]['resolution'], type, size) if not info_only: download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers) def download_playlist_by_url( url, **kwargs): video_page = get_html(url) url_first=match1(url, r"(http[s]?://[^/]+)") videos = set(re.findall(r'<a href="(/videos/[^"]+)"', video_page)) if (len(videos) > 0): for video in videos: iwara_download(url_first+video, **kwargs) else: maybe_print('no videos found on this page') site_info = "Iwara" download = iwara_download download_playlist = download_playlist_by_url ================================================ FILE: src/you_get/extractors/ixigua.py ================================================ #!/usr/bin/env python import base64 from ..common import * from json import loads from urllib import request __all__ = ['ixigua_download', 'ixigua_download_playlist_by_url'] headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 " "Safari/537.36", } def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; " resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) headers['cookie'] += ' '.join(_cookies) match_txt = match1(html, r"', home_page)[-1] client_id = get_content(js_url) return re.search(r'client_id:"(.+?)"', client_id).group(1) def get_resource_info(resource_url, client_id): cont = get_content(resource_url, decoded=True) x = re.escape('forEach(function(e){n(e)})}catch(e){}})},') x = re.search(r'' + x + r'(.*)\);', cont) info = json.loads(x.group(1))[-1]['data'][0] info = info['tracks'] if info.get('track_count') else [info] ids = [i['id'] for i in info if i.get('comment_count') is None] ids = list(map(str, ids)) ids_split = ['%2C'.join(ids[i:i+10]) for i in range(0, len(ids), 10)] api_url = 'https://api-v2.soundcloud.com/tracks?ids={ids}&client_id={client_id}&%5Bobject%20Object%5D=&app_version=1584348206&app_locale=en' res = [] for ids in ids_split: uri = api_url.format(ids=ids, client_id=client_id) cont = get_content(uri, decoded=True) res += json.loads(cont) res = iter(res) info = [next(res) if i.get('comment_count') is None else i for i in info] return info def sndcd_download(url, output_dir='.', merge=True, info_only=False, **kwargs): client_id = get_sndcd_apikey() r_info = get_resource_info(url, client_id) for info in r_info: title = info['title'] metadata = info.get('publisher_metadata') transcodings = info['media']['transcodings'] sq = [i for i in transcodings if i['quality'] == 'sq'] hq = [i for i in transcodings if i['quality'] == 'hq'] # source url surl = sq[0] if hq == [] else hq[0] surl = surl['url'] uri = surl + '?client_id=' + client_id r = get_content(uri) surl = json.loads(r)['url'] m3u8 = get_content(surl) # url list urll = 
re.findall(r'http.*?(?=\n)', m3u8) size = urls_size(urll) print_info(site_info, title, 'audio/mpeg', size) print(end='', flush=True) if not info_only: download_urls(urll, title=title, ext='mp3', total_size=size, output_dir=output_dir, merge=True) site_info = "SoundCloud.com" download = sndcd_download download_playlist = sndcd_download ================================================ FILE: src/you_get/extractors/suntv.py ================================================ #!/usr/bin/env python __all__ = ['suntv_download'] from ..common import * import urllib import re def suntv_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): if re.match(r'http://www.isuntv.com/\w+', url): API_URL = "http://www.isuntv.com/ajaxpro/SunTv.pro_vod_playcatemp4,App_Web_playcatemp4.ascx.9f08f04f.ashx" itemid = match1(url, r'http://www.isuntv.com/pro/ct(\d+).html') values = {"itemid" : itemid, "vodid": ""} data = str(values).replace("'", '"') data = data.encode('utf-8') req = urllib.request.Request(API_URL, data) req.add_header('AjaxPro-Method', 'ToPlay') #important! resp = urllib.request.urlopen(req) respData = resp.read() respData = respData.decode('ascii').strip('"') #Ahhhhhhh! video_url = 'http://www.isuntv.com' + str(respData) html = get_content(url, decoded=False) html = html.decode('gbk') title = match1(html, '<title>([^<]+)</title>').strip() #get rid of \r\n s size = 0 type, ext, size = url_info(video_url) print_info(site_info, title, type, size) if not info_only: download_urls([video_url], title, 'mp4', size, output_dir, merge=merge) site_info = "SunTV" download = suntv_download download_playlist = playlist_not_supported('suntv') ================================================ FILE: src/you_get/extractors/ted.py ================================================ #!/usr/bin/env python __all__ = ['ted_download'] from ..common import * import json def ted_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) patt = r'"__INITIAL_DATA__"\s*:\s*\{(.+)\}' metadata = json.loads('{' + match1(html, patt) + '}') title = metadata['talks'][0]['title'] nativeDownloads = metadata['talks'][0]['downloads']['nativeDownloads'] for quality in ['high', 'medium', 'low']: if quality in nativeDownloads: url = nativeDownloads[quality] type, ext, size = url_info(url) print_info(site_info, title, type, size) if not info_only: download_urls([url], title, ext, size, output_dir, merge=merge) break site_info = "TED.com" download = ted_download download_playlist = playlist_not_supported('ted') ================================================ FILE: src/you_get/extractors/theplatform.py ================================================ #!/usr/bin/env python from ..common import * def theplatform_download_by_pid(pid, title, output_dir='.', merge=True, info_only=False, **kwargs): smil_url = "http://link.theplatform.com/s/dJ5BDC/%s/meta.smil?format=smil&mbr=true" % pid smil = get_content(smil_url) smil_base = unescape_html(match1(smil, r'<meta base="([^"]+)"')) smil_videos = {y:x for x,y in dict(re.findall(r'<video src="([^"]+)".+height="([^"]+)"', smil)).items()} for height in ['1080', '720', '480', '360', '240', '216']: if height in smil_videos: smil_video = smil_videos[height] break assert smil_video type, ext, size = 'mp4', 'mp4', 0 print_info(site_info, title, type, size) if not info_only: download_rtmp_url(url=smil_base, title=title, ext=ext,params={"-y":ext+':'+smil_video}, output_dir=output_dir) site_info = "thePlatform.com" download = theplatform_download_by_pid download_playlist = 
playlist_not_supported('theplatform') ================================================ FILE: src/you_get/extractors/tiktok.py ================================================ #!/usr/bin/env python __all__ = ['tiktok_download'] from ..common import * def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Referer': 'https://www.tiktok.com/', 'Connection': 'keep-alive' # important } m = re.match('(https?://)?([^/]+)(/.*)', url) host = m.group(2) if host != 'www.tiktok.com': # non-canonical URL if host == 'vt.tiktok.com': # short URL url = get_location(url) vid = r1(r'/video/(\d+)', url) url = 'https://www.tiktok.com/@/video/%s/' % vid host = 'www.tiktok.com' else: url = m.group(3).split('?')[0] vid = url.split('/')[3] # should be a string of numbers html, set_cookie = getHttps(host, url, headers=headers) tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie) headers['Cookie'] = 'tt_chain_token=%s' % tt_chain_token data = r1(r'<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" type="application/json">(.*?)</script>', html) info = json.loads(data) itemStruct = info['__DEFAULT_SCOPE__']['webapp.video-detail']['itemInfo']['itemStruct'] downloadAddr = itemStruct['video']['downloadAddr'] author = itemStruct['author']['uniqueId'] nickname = itemStruct['author']['nickname'] title = '%s [%s]' % (nickname or author, vid) mime, ext, size = url_info(downloadAddr, headers=headers) print_info(site_info, title, mime, size) if not info_only: download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download download_playlist = playlist_not_supported('tiktok') ================================================ FILE: src/you_get/extractors/toutiao.py ================================================ #!/usr/bin/env python import binascii import random from json import loads from urllib.parse import urlparse from ..common import * try: from base64 import decodebytes except ImportError: from base64 import decodestring decodebytes = decodestring __all__ = ['toutiao_download', ] def random_with_n_digits(n): return random.randint(10 ** (n - 1), (10 ** n) - 1) def sign_video_url(vid): r = str(random_with_n_digits(16)) url = 'https://ib.365yg.com/video/urls/v/1/toutiao/mp4/{vid}'.format(vid=vid) n = urlparse(url).path + '?r=' + r b_n = bytes(n, encoding="utf-8") s = binascii.crc32(b_n) aid = 1364 ts = int(time.time() * 1000) return url + '?r={r}&s={s}&aid={aid}&vfrom=xgplayer&callback=axiosJsonpCallback1&_={ts}'.format(r=r, s=s, aid=aid, ts=ts) class ToutiaoVideoInfo(object): def __init__(self): self.bitrate = None self.definition = None self.size = None self.height = None self.width = None self.type = None self.url = None def __str__(self): return json.dumps(self.__dict__) def get_file_by_vid(video_id): vRet = [] url = sign_video_url(video_id) ret = get_content(url) ret = loads(ret[20:-1]) vlist = ret.get('data').get('video_list') if len(vlist) > 0: vInfo = vlist.get(sorted(vlist.keys(), reverse=True)[0]) vUrl = vInfo.get('main_url') vUrl = decodebytes(vUrl.encode('ascii')).decode('ascii') videoInfo = ToutiaoVideoInfo() videoInfo.bitrate = vInfo.get('bitrate') videoInfo.definition = vInfo.get('definition') videoInfo.size = vInfo.get('size') videoInfo.height = vInfo.get('vheight') videoInfo.width = vInfo.get('vwidth') videoInfo.type = 
vInfo.get('vtype') videoInfo.url = vUrl vRet.append(videoInfo) return vRet def toutiao_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url, faker=True) video_id = match1(html, r".*?videoId: '(?P<vid>.*)'") title = match1(html, '.*?<title>(?P<title>.*?)</title>') video_file_list = get_file_by_vid(video_id) # call the API to fetch the video source files type, ext, size = url_info(video_file_list[0].url, faker=True) print_info(site_info=site_info, title=title, type=type, size=size) if not info_only: download_urls([video_file_list[0].url], title, ext, size, output_dir, merge=merge, faker=True) site_info = "Toutiao.com" download = toutiao_download download_playlist = playlist_not_supported("toutiao") ================================================ FILE: src/you_get/extractors/tucao.py ================================================ #!/usr/bin/env python __all__ = ['tucao_download'] from ..common import * # import re import random import time from xml.dom import minidom
#possible raw list types
#1. <li>type=tudou&vid=199687639</li>
#2. <li>type=tudou&vid=199506910|</li>
#3. <li>type=video&file=http://xiaoshen140731.qiniudn.com/lovestage04.flv|</li>
#4 may ? <li>type=video&file=http://xiaoshen140731.qiniudn.com/lovestage04.flv|xx**type=&vid=?</li>
#5. <li>type=tudou&vid=200003098|07**type=tudou&vid=200000350|08</li>
#6. <li>vid=49454694&type=sina|</li>
#7. <li>type=189&vid=513031813243909|</li>
# re_pattern=re.compile(r"(type=(.+?)&(vid|file)=(.*?))[\|<]")
def tucao_single_download(type_link, title, output_dir=".", merge=True, info_only=False): if "file" in type_link: url=type_link[type_link.find("file=")+5:] vtype, ext, size=url_info(url) print_info(site_info, title, vtype, size) if not info_only: download_urls([url], title, ext, size, output_dir) #fix for 189 video source, see raw list types 7 elif "189" in type_link: vid = match1(type_link, r"vid=(\d+)") assert vid, "vid does not exist" url = "http://api.tucao.tv/api/down/{}".format(vid) vtype, ext, size=url_info(url) print_info(site_info, title, vtype, size) if not info_only: download_urls([url], title, ext, size, output_dir) else: u="http://www.tucao.tv/api/playurl.php?{}&key=tucao{:07x}.cc&r={}".format(type_link,random.getrandbits(28),int(time.time()*1000)) xml=minidom.parseString(get_content(u)) urls=[] size=0 for i in xml.getElementsByTagName("url"): urls.append(i.firstChild.nodeValue) vtype, ext, _size=url_info(i.firstChild.nodeValue) size+=_size print_info(site_info, title, vtype, size) if not info_only: download_urls(urls, title, ext, size, output_dir) def tucao_download(url, output_dir=".", merge=True, info_only=False, **kwargs): html=get_content(url) title=match1(html,r'<h1 class="show_title">(.*?)<\w') #fix for raw list that vid goes before type, see raw list types 6 raw_list=match1(html,r"<li>\s*(type=.+?|vid=.+?)</li>") raw_l=raw_list.split("**") if len(raw_l)==1: format_link=raw_l[0][:-1] if raw_l[0].endswith("|") else raw_l[0] tucao_single_download(format_link,title,output_dir,merge,info_only) else: for i in raw_l: format_link,sub_title=i.split("|") tucao_single_download(format_link,title+"-"+sub_title,output_dir,merge,info_only) site_info = "tucao.tv" download = tucao_download download_playlist = playlist_not_supported("tucao") ================================================ FILE: src/you_get/extractors/tudou.py ================================================ #!/usr/bin/env python __all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id', 'tudou_download_by_iid'] from ..common import * from xml.dom.minidom import parseString import you_get.extractors.acfun def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False): data = json.loads(get_decoded_html('http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid)) temp = max([data[i] for i in data if 'size' in data[i][0]], key=lambda x:sum([part['size'] for part in x])) vids, size = [t["k"] for t in temp], sum([t["size"] for t in temp]) urls = [] for vid in vids: for i in parseString(get_html('http://ct.v2.tudou.com/f?id=%s' % vid)).getElementsByTagName('f'): urls.append(i.firstChild.nodeValue.strip()) ext = r1(r'http://[\w.]*/(\w+)/[\w.]*', urls[0]) print_info(site_info, title, ext, size) if not info_only: download_urls(urls, title, ext, size, output_dir=output_dir, merge = merge) def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False): html = get_html('http://www.tudou.com/programs/view/%s/' % id) iid = r1(r'iid\s*[:=]\s*(\S+)', html) try: title = r1(r'kw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'") except AttributeError: title = '' tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only) def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): if 'acfun.tudou.com' in url: #wrong way! url = url.replace('acfun.tudou.com', 'www.acfun.tv') you_get.extractors.acfun.acfun_download(url, output_dir, merge, info_only) return #throw you back # Embedded player id = r1(r'http://www.tudou.com/v/([^/]+)/', url) if id: return tudou_download_by_id(id, title="", info_only=info_only) html = get_content(url) try: title = r1(r'\Wkw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'") assert title title = unescape_html(title) except AttributeError: title = match1(html, r'id=\"subtitle\"\s*title\s*=\s*\"([^\"]+)\"') if title is None: title = '' vcode = r1(r'vcode\s*[:=]\s*\'([^\']+)\'', html) if vcode is None: vcode = match1(html, r'viden\s*[:=]\s*\"([\w+/=]+)\"') if vcode: from .youku import youku_download_by_vid return youku_download_by_vid(vcode, title=title, output_dir=output_dir, merge=merge, info_only=info_only, src='tudou', **kwargs) iid = r1(r'iid\s*[:=]\s*(\d+)', html) if not iid: return tudou_download_playlist(url, output_dir, merge, info_only) tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only) # obsolete?
def parse_playlist(url): aid = r1(r'http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url) html = get_decoded_html(url) if not aid: aid = r1(r"aid\s*[:=]\s*'(\d+)'", html) if re.match(r'http://www.tudou.com/albumcover/', url): atitle = r1(r"title\s*:\s*'([^']+)'", html) elif re.match(r'http://www.tudou.com/playlist/p/', url): atitle = r1(r'atitle\s*=\s*"([^"]+)"', html) else: raise NotImplementedError(url) assert aid assert atitle import json #url = 'http://www.tudou.com/playlist/service/getZyAlbumItems.html?aid='+aid url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']] def parse_plist(url): html = get_decoded_html(url) lcode = r1(r"lcode:\s*'([^']+)'", html) plist_info = json.loads(get_content('http://www.tudou.com/crp/plist.action?lcode=' + lcode)) return ([(item['kw'], item['iid']) for item in plist_info['items']]) def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs): videos = parse_plist(url) for i, (title, id) in enumerate(videos): print('Processing %s of %s videos...' % (i + 1, len(videos))) tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge, info_only = info_only) site_info = "Tudou.com" download = tudou_download download_playlist = tudou_download_playlist ================================================ FILE: src/you_get/extractors/tumblr.py ================================================ #!/usr/bin/env python __all__ = ['tumblr_download'] from ..common import * from .universal import * from .dailymotion import dailymotion_download from .vimeo import vimeo_download def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if re.match(r'https?://\d+\.media\.tumblr\.com/', url): universal_download(url, output_dir, merge=merge, info_only=info_only) return import ssl ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)) # server requires TLS v1.2 cookie_handler = request.HTTPCookieProcessor() opener = request.build_opener(ssl_context, cookie_handler) request.install_opener(opener) page = get_html(url) form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"') if form_key is not None: # bypass GDPR consent page referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url) post_content('https://www.tumblr.com/svc/privacy/consent', headers={ 'Content-Type': 'application/json', 'User-Agent': fake_headers['User-Agent'], 'Referer': referer, 'X-tumblr-form-key': form_key, 'X-Requested-With': 'XMLHttpRequest' }, post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url) page = get_html(url, faker=True) html = parse.unquote(page).replace(r'\/', '/') feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html) if feed in ['photo', 'photoset', 'entry'] or feed is None: # try to extract photos page_title = r1(r'<title>([^<\n]*)', html) urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\ re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\ re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.gif)', html) +\ re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.jpg)', html) +\ re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.png)', html) +\ re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.gif)', html) tuggles = {} for url in urls: if url.endswith('.gif'): hd_url = url elif url.endswith('.jpg'): hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' elif url.endswith('.png'): hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.png$', url) + '_1280.png' else: continue filename = parse.unquote(hd_url.split('/')[-1]) title = '.'.join(filename.split('.')[:-1]) tumblr_id = r1(r'^tumblr_(.+)_\d+$', title) or title try: quality = int(r1(r'^tumblr_.+_(\d+)$', title)) except: quality = int(r1(r'/s(\d+)x\d+/', hd_url)) ext = filename.split('.')[-1] try: size = int(get_head(hd_url)['Content-Length']) if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality: tuggles[tumblr_id] = { 'title': title, 'url': hd_url, 'quality': quality, 'ext': ext, 'size': size, } except: pass if tuggles: #size = sum([tuggles[t]['size'] for t in tuggles]) #print_info(site_info, page_title, None, size) for t in tuggles: title = '[tumblr] ' + tuggles[t]['title'] ext = tuggles[t]['ext'] size = tuggles[t]['size'] url = tuggles[t]['url'] print_info(site_info, title, ext, size) if not info_only: download_urls([url], title, ext, size, output_dir=output_dir) return # feed == 'audio' or feed == 'video' or feed is None # try to extract video / audio real_url = r1(r'source src=\\x22([^\\]+)\\', html) if not real_url: real_url = r1(r'audio_file=([^&]+)&', html) if real_url: real_url = real_url + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio' if not real_url: real_url = r1(r'<source src="([^"]*)"', html) if not real_url: iframe_url = r1(r'<[^>]+tumblr_video_container[^>]+><iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html) if iframe_url is None: universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs) return if iframe_url: iframe_html = get_content(iframe_url, headers=fake_headers) real_url = r1(r'<video[^>]*>[\n ]*<source[^>]+src=[\'"]([^\'"]*)[\'"]', iframe_html) else: iframe_url = r1(r'<iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html) if iframe_url[:2] == '//': iframe_url = 'http:' + iframe_url if re.search(r'player\.vimeo\.com', iframe_url): vimeo_download(iframe_url, output_dir, merge=merge, info_only=info_only, referer='http://tumblr.com/', **kwargs) return elif re.search(r'dailymotion\.com', iframe_url): dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs) return else: iframe_html = get_content(iframe_url) real_url = r1(r'<source src="([^"]*)"', iframe_html) title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)"', html) or r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '') # this is better vcode = r1(r'tumblr_(\w+)', real_url) real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode type, ext, size = url_info(real_url, faker=True) print_info(site_info, title, type, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge=merge) site_info = "Tumblr.com" download = tumblr_download download_playlist = playlist_not_supported('tumblr') ================================================ FILE: src/you_get/extractors/twitter.py ================================================ #!/usr/bin/env python __all__ = ['twitter_download'] from ..common import * from .universal import * def extract_m3u(source): r1 = get_content(source) s1 = re.findall(r'(/ext_tw_video/.*)', r1) s1 += re.findall(r'(/amplify_video/.*)', r1) r2 = get_content('https://video.twimg.com%s' % s1[-1]) s2 = re.findall(r'(/ext_tw_video/.*)', r2) s2 += re.findall(r'(/amplify_video/.*)', r2) return ['https://video.twimg.com%s' % i for i in s2] def twitter_download(url, output_dir='.', merge=True, 
info_only=False, **kwargs): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*' } if re.match(r'https?://pbs\.twimg\.com', url): universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs) return if re.match(r'https?://mobile', url): # normalize mobile URL url = 'https://' + match1(url, r'//mobile\.(.+)') if re.match(r'https?://twitter\.com/i/moments/', url): # FIXME: moments html = get_html(url, faker=True) paths = re.findall(r'data-permalink-path="([^"]+)"', html) for path in paths: twitter_download('https://twitter.com' + path, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) return m = re.match(r'^https?://(mobile\.)?(x|twitter)\.com/([^/]+)/status/(\d+)', url) assert m screen_name, item_id = m.group(3), m.group(4) page_title = "{} [{}]".format(screen_name, item_id) # FIXME: this API won't work for protected or nsfw contents api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s&token=!' % item_id content = get_content(api_url) info = json.loads(content) author = info['user']['name'] url = 'https://twitter.com/%s/status/%s' % (info['user']['screen_name'], item_id) full_text = info['text'] if 'photos' in info: for photo in info['photos']: photo_url = photo['url'] title = item_id + '_' + photo_url.split('.')[-2].split('/')[-1] urls = [ photo_url + ':orig' ] size = urls_size(urls, headers=headers) ext = photo_url.split('.')[-1] print_info(site_info, title, ext, size) if not info_only: download_urls(urls, title, ext, size, output_dir, merge=merge) if 'video' in info: for mediaDetail in info['mediaDetails']: if 'video_info' not in mediaDetail: continue variants = mediaDetail['video_info']['variants'] variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0)) title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0] urls = [ variants[-1]['url'] ] size = urls_size(urls, headers=headers) mime, ext = variants[-1]['content_type'], 'mp4' print_info(site_info, title, ext, size) if not info_only: download_urls(urls, title, ext, size, output_dir, merge=merge, headers=headers) # TODO: should we deal with quoted tweets? site_info = "X.com" download = twitter_download download_playlist = playlist_not_supported('twitter') ================================================ FILE: src/you_get/extractors/ucas.py ================================================ #!/usr/bin/env python __all__ = ['ucas_download', 'ucas_download_single', 'ucas_download_playlist'] from ..common import * import http.client from time import time from random import random import xml.etree.ElementTree as ET from copy import copy """ Do not replace http.client with get_content for UCAS's server is not correctly returning data! 
""" def dictify(r,root=True): """http://stackoverflow.com/a/30923963/2946714""" if root: return {r.tag : dictify(r, False)} d=copy(r.attrib) if r.text: d["_text"]=r.text for x in r.findall("./*"): if x.tag not in d: d[x.tag]=[] d[x.tag].append(dictify(x,False)) return d def _get_video_query_url(resourceID): # has to be like this headers = { 'DNT': '1', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-CA,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.47 Safari/537.36', 'Accept': '*/*', 'Referer': 'http://v.ucas.ac.cn/', 'Connection': 'keep-alive', } conn = http.client.HTTPConnection("210.76.211.10") conn.request("GET", "/vplus/remote.do?method=query2&loginname=videocas&pwd=af1c7a4c5f77f790722f7cae474c37e281203765d423a23b&resource=%5B%7B%22resourceID%22%3A%22" + resourceID + "%22%2C%22on%22%3A1%2C%22time%22%3A600%2C%22eid%22%3A100%2C%22w%22%3A800%2C%22h%22%3A600%7D%5D&timeStamp=" + str(int(time())), headers=headers) res = conn.getresponse() data = res.read() info = data.decode("utf-8") return match1(info, r'video":"(.+)"') def _get_virtualPath(video_query_url): #getResourceJsCode2 html = get_content(video_query_url) return match1(html, r"function\s+getVirtualPath\(\)\s+{\s+return\s+'(\w+)'") def _get_video_list(resourceID): """""" conn = http.client.HTTPConnection("210.76.211.10") conn.request("GET", '/vplus/member/resource.do?isyulan=0&method=queryFlashXmlByResourceId&resourceId={resourceID}&randoms={randoms}'.format(resourceID = resourceID, randoms = random())) res = conn.getresponse() data = res.read() video_xml = data.decode("utf-8") root = ET.fromstring(video_xml.split('___!!!___')[0]) r = dictify(root) huge_list = [] # main huge_list.append([i['value'] for i in sorted(r['video']['mainUrl'][0]['_flv'][0]['part'][0]['video'], key=lambda k: int(k['index']))]) # sub if '_flv' in r['video']['subUrl'][0]: huge_list.append([i['value'] for i in sorted(r['video']['subUrl'][0]['_flv'][0]['part'][0]['video'], key=lambda k: int(k['index']))]) return huge_list def _ucas_get_url_lists_by_resourceID(resourceID): video_query_url = _get_video_query_url(resourceID) assert video_query_url != '', 'Cannot find video GUID!' virtualPath = _get_virtualPath(video_query_url) assert virtualPath != '', 'Cannot find virtualPath!' url_lists = _get_video_list(resourceID) assert url_lists, 'Cannot find any URL to download!' # make real url # credit to a mate in UCAS for video_type_id, video_urls in enumerate(url_lists): for k, path in enumerate(video_urls): url_lists[video_type_id][k] = 'http://210.76.211.10/vplus/member/resource.do?virtualPath={virtualPath}&method=getImgByStream&imgPath={path}'.format(virtualPath = virtualPath, path = path) return url_lists def ucas_download_single(url, output_dir = '.', merge = False, info_only = False, **kwargs): '''video page''' html = get_content(url) # resourceID is UUID resourceID = re.findall( r'resourceID":"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', html)[0] assert resourceID != '', 'Cannot find resourceID!' title = match1(html, r'<div class="bc-h">(.+)</div>') url_lists = _ucas_get_url_lists_by_resourceID(resourceID) assert url_lists, 'Cannot find any URL of such class!' 
for k, part in enumerate(url_lists): part_title = title + '_' + str(k) print_info(site_info, part_title, 'flv', 0) if not info_only: download_urls(part, part_title, 'flv', total_size=None, output_dir=output_dir, merge=merge) def ucas_download_playlist(url, output_dir = '.', merge = False, info_only = False, **kwargs): '''course page''' html = get_content(url) parts = re.findall( r'(getplaytitle.do\?.+)"', html) assert parts, 'No part found!' for part_path in parts: ucas_download('http://v.ucas.ac.cn/course/' + part_path, output_dir=output_dir, merge=merge, info_only=info_only) def ucas_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): if 'classid=' in url and 'getplaytitle.do' in url: ucas_download_single(url, output_dir=output_dir, merge=merge, info_only=info_only) elif 'CourseIndex.do' in url: ucas_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) site_info = "UCAS" download = ucas_download download_playlist = ucas_download_playlist ================================================ FILE: src/you_get/extractors/universal.py ================================================ #!/usr/bin/env python __all__ = ['universal_download'] from ..common import * from .embed import * def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs): try: content_type = get_head(url, headers=fake_headers)['Content-Type'] except: content_type = get_head(url, headers=fake_headers, get_method='GET')['Content-Type'] if content_type.startswith('text/html'): try: embed_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) except Exception: pass else: return domains = url.split('/')[2].split('.') if len(domains) > 2: domains = domains[1:] site_info = '.'.join(domains) if content_type.startswith('text/html'): # extract an HTML page response = get_response(url, faker=True) page = str(response.data) page_title = r1(r'<title>([^<]*)', page) if page_title: page_title = unescape_html(page_title) meta_videos = re.findall(r'<meta property="og:video:url" content="([^"]*)"', page) if meta_videos: try: for meta_video in meta_videos: meta_video_url = unescape_html(meta_video) type_, ext, size = url_info(meta_video_url) print_info(site_info, page_title, type_, size) if not info_only: download_urls([meta_video_url], page_title, ext, size, output_dir=output_dir, merge=merge, faker=True) except: pass else: return hls_urls = re.findall(r'(https?://[^;"\'\\]+' + r'\.m3u8?' + r'[^;"\'\\]*)', page) if hls_urls: try: for hls_url in hls_urls: type_, ext, size = url_info(hls_url) print_info(site_info, page_title, type_, size) if not info_only: download_url_ffmpeg(url=hls_url, title=page_title, ext='mp4', output_dir=output_dir) except: pass else: return # most common media file extensions on the Internet media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm', r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g', # tumblr r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g', r'[-_][6-9]\d\dx1\d\d\d\.jpe?g', r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g', r's1600/[\w%]+\.jpe?g', # blogger r'blogger\.googleusercontent\.com/img/a/\w*', # blogger r'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon? 
] urls = [] for i in media_exts: urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page) p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] q_urls = re.findall(r'(https?:\\\\/\\\\/[^ ;"\'<>]+' + i + r'[^ ;"\'<>]*)', page) urls += [url.replace('\\\\/', '/') for url in q_urls] # a link href to an image is often an interesting one urls += re.findall(r'href="(https?://[^"]+\.jpe?g)"', page, re.I) urls += re.findall(r'href="(https?://[^"]+\.png)"', page, re.I) urls += re.findall(r'href="(https?://[^"]+\.gif)"', page, re.I) # <img> with high widths urls += re.findall(r'<img src="([^"]*)"[^>]*width="\d\d\d+"', page, re.I) # relative path rel_urls = [] rel_urls += re.findall(r'href="(\.[^"]+\.jpe?g)"', page, re.I) rel_urls += re.findall(r'href="(\.[^"]+\.png)"', page, re.I) rel_urls += re.findall(r'href="(\.[^"]+\.gif)"', page, re.I) for rel_url in rel_urls: urls += [ r1(r'(.*/)', url) + rel_url ] # site-relative path rel_urls = [] rel_urls += re.findall(r'href="(/[^"]+\.jpe?g)"', page, re.I) rel_urls += re.findall(r'href="(/[^"]+\.png)"', page, re.I) rel_urls += re.findall(r'href="(/[^"]+\.gif)"', page, re.I) for rel_url in rel_urls: urls += [ r1(r'(https?://[^/]+)', url) + rel_url ] # sometimes naive urls += re.findall(r'data-original="(https?://[^"]+\.jpe?g)"', page, re.I) urls += re.findall(r'data-original="(https?://[^"]+\.png)"', page, re.I) urls += re.findall(r'data-original="(https?://[^"]+\.gif)"', page, re.I) # MPEG-DASH MPD mpd_urls = re.findall(r'src="(https?://[^"]+\.mpd)"', page) for mpd_url in mpd_urls: cont = get_content(mpd_url) base_url = r1(r'<BaseURL>(.*)</BaseURL>', cont) urls += [ r1(r'(.*/)[^/]*', mpd_url) + base_url ] # have some candy! candies = [] i = 1 for url in set(urls): filename = parse.unquote(url.split('/')[-1]) if 5 <= len(filename) <= 80: title = '.'.join(filename.split('.')[:-1]) or filename else: title = '%s' % i i += 1 if r1(r'(https://pinterest.com/pin/)', url): continue candies.append({'url': url, 'title': title}) for candy in candies: try: try: mime, ext, size = url_info(candy['url'], faker=False) assert size except: mime, ext, size = url_info(candy['url'], faker=True) if not size: size = float('Inf') except: continue else: print_info(site_info, candy['title'], ext, size) if not info_only: try: download_urls([candy['url']], candy['title'], ext, size, output_dir=output_dir, merge=merge, faker=False) except: download_urls([candy['url']], candy['title'], ext, size, output_dir=output_dir, merge=merge, faker=True) return else: # direct download url_trunk = url.split('?')[0] # strip query string filename = parse.unquote(url_trunk.split('/')[-1]) or parse.unquote(url_trunk.split('/')[-2]) title = '.'.join(filename.split('.')[:-1]) or filename _, ext, size = url_info(url, faker=True) print_info(site_info, title, ext, size) if not info_only: download_urls([url], title, ext, size, output_dir=output_dir, merge=merge, faker=True) return site_info = None download = universal_download download_playlist = playlist_not_supported('universal') ================================================ FILE: src/you_get/extractors/veoh.py ================================================ #!/usr/bin/env python __all__ = ['veoh_download'] from ..common import * def veoh_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): '''Get item_id''' if re.match(r'http://www.veoh.com/watch/\w+', url): item_id = match1(url, r'http://www.veoh.com/watch/(\w+)') elif 
re.match(r'http://www.veoh.com/m/watch.php\?v=\.*', url): item_id = match1(url, r'http://www.veoh.com/m/watch.php\?v=(\w+)') else: raise NotImplementedError('Cannot find item ID') veoh_download_by_id(item_id, output_dir = '.', merge = False, info_only = info_only, **kwargs) #---------------------------------------------------------------------- def veoh_download_by_id(item_id, output_dir = '.', merge = False, info_only = False, **kwargs): """Source: Android mobile""" webpage_url = 'http://www.veoh.com/m/watch.php?v={item_id}&quality=1'.format(item_id = item_id) #grab download URL a = get_content(webpage_url, decoded=True) url = match1(a, r'<source src="(.*?)\"\W') #grab title title = match1(a, r'<meta property="og:title" content="([^"]*)"') type_, ext, size = url_info(url) print_info(site_info, title, type_, size) if not info_only: download_urls([url], title, ext, total_size=None, output_dir=output_dir, merge=merge) site_info = "Veoh" download = veoh_download download_playlist = playlist_not_supported('veoh') ================================================ FILE: src/you_get/extractors/vimeo.py ================================================ #!/usr/bin/env python __all__ = ['vimeo_download', 'vimeo_download_by_id', 'vimeo_download_by_channel', 'vimeo_download_by_channel_id'] from ..common import * from ..util.log import * from ..extractor import VideoExtractor from json import loads import urllib.error import urllib.parse access_token = 'f6785418277b72c7c87d3132c79eec24' #By Beining #---------------------------------------------------------------------- def vimeo_download_by_channel(url, output_dir='.', merge=False, info_only=False, **kwargs): """str->None""" # https://vimeo.com/channels/464686 channel_id = match1(url, r'http://vimeo.com/channels/(\w+)') vimeo_download_by_channel_id(channel_id, output_dir, merge, info_only, **kwargs) #---------------------------------------------------------------------- def vimeo_download_by_channel_id(channel_id, output_dir='.', merge=False, info_only=False, **kwargs): """str/int->None""" html = get_content('https://api.vimeo.com/channels/{channel_id}/videos?access_token={access_token}'.format(channel_id=channel_id, access_token=access_token)) data = loads(html) id_list = [] #print(data) for i in data['data']: id_list.append(match1(i['uri'], r'/videos/(\w+)')) for id in id_list: try: vimeo_download_by_id(id, None, output_dir, merge, info_only, **kwargs) except urllib.error.URLError as e: log.w('{} failed with {}'.format(id, e)) class VimeoExtractor(VideoExtractor): stream_types = [ {'id': '2160p', 'video_profile': '3840x2160'}, {'id': '1440p', 'video_profile': '2560x1440'}, {'id': '1080p', 'video_profile': '1920x1080'}, {'id': '720p', 'video_profile': '1280x720'}, {'id': '540p', 'video_profile': '960x540'}, {'id': '360p', 'video_profile': '640x360'} ] name = 'Vimeo' def prepare(self, **kwargs): headers = fake_headers.copy() if 'referer' in kwargs: headers['Referer'] = kwargs['referer'] try: page = get_content('https://vimeo.com/{}'.format(self.vid)) cfg_patt = r'clip_page_config\s*=\s*(\{.+?\});' cfg = json.loads(match1(page, cfg_patt)) video_page = get_content(cfg['player']['config_url'], headers=headers) self.title = cfg['clip']['title'] info = json.loads(video_page) except Exception as e: page = get_content('https://player.vimeo.com/video/{}'.format(self.vid)) self.title = r1(r'<title>([^<]+)', page) info = json.loads(match1(page, r'var t=(\{.+?\});')) plain = info['request']['files']['progressive'] for s in plain: meta = dict(src=[s['url']], 
container='mp4') meta['video_profile'] = '{}x{}'.format(s['width'], s['height']) for stream in self.__class__.stream_types: if s['quality'] == stream['id']: self.streams[s['quality']] = meta self.master_m3u8 = info['request']['files']['hls']['cdns'] def extract(self, **kwargs): for s in self.streams: self.streams[s]['size'] = urls_size(self.streams[s]['src']) master_m3u8s = [] for m in self.master_m3u8: master_m3u8s.append(self.master_m3u8[m]['url']) master_content = None master_url = None for master_u in master_m3u8s: try: master_content = get_content(master_u).split('\n') except urllib.error.URLError: continue else: master_url = master_u if master_content is None: return lines = [] for line in master_content: if len(line.strip()) > 0: lines.append(line.strip()) pos = 0 while pos < len(lines): if lines[pos].startswith('#EXT-X-STREAM-INF'): patt = r'RESOLUTION=(\d+)x(\d+)' hit = re.search(patt, lines[pos]) if hit is None: continue width = hit.group(1) height = hit.group(2) if height in ('2160', '1440'): m3u8_url = urllib.parse.urljoin(master_url, lines[pos+1]) meta = dict(m3u8_url=m3u8_url, container='m3u8') if height == '1440': meta['video_profile'] = '2560x1440' else: meta['video_profile'] = '3840x2160' meta['size'] = 0 meta['src'] = general_m3u8_extractor(m3u8_url) self.streams[height+'p'] = meta pos += 2 else: pos += 1 self.streams_sorted = [] for stream_type in self.stream_types: if stream_type['id'] in self.streams: item = [('id', stream_type['id'])] + list(self.streams[stream_type['id']].items()) self.streams_sorted.append(dict(item)) def vimeo_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, **kwargs): site = VimeoExtractor() site.download_by_vid(id, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs) def vimeo_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if re.match(r'https?://vimeo.com/channels/\w+', url): vimeo_download_by_channel(url, output_dir, merge, info_only) else: id = r1(r'https?://[\w.]*vimeo.com[/\w]*/(\d+)', url) if id is None: video_page = get_content(url, headers=fake_headers) id = r1(r'"clip_id":(\d+)', video_page) assert id vimeo_download_by_id(id, None, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) site_info = "Vimeo.com" download = vimeo_download download_playlist = vimeo_download_by_channel ================================================ FILE: src/you_get/extractors/vk.py ================================================ #!/usr/bin/env python __all__ = ['vk_download'] from ..common import * def get_video_info(url): video_page = get_content(url) title = r1(r'
    (.[^>]+?)]+?)"', video_page) for quality in ['.1080.', '.720.', '.480.', '.360.', '.240.']: for source in sources: if source.find(quality) != -1: url = source break assert url type, ext, size = url_info(url) print_info(site_info, title, type, size) return url, title, ext, size def get_video_from_user_videolist(url): ep = 'https://vk.com/al_video.php' to_post = dict(act='show', al=1, module='direct', video=re.search(r'video(\d+_\d+)', url).group(1)) page = post_content(ep, post_data=to_post) video_pt = r'(.+?)
    ', page).group(1) mime, ext, size = url_info(url) print_info(site_info, title, mime, size) return url, title, ext, size def get_image_info(url): image_page = get_content(url) # used for title - vk page owner page_of = re.findall(r'Sender:
    (.[^>]+?)(.[^>]+?)Download full size', image_page) type, ext, size = url_info(image_link) print_info(site_info, title, type, size) return image_link, title, ext, size def vk_download(url, output_dir='.', stream_type=None, merge=True, info_only=False, **kwargs): link = None if re.match(r'(.+)z\=video(.+)', url): link, title, ext, size = get_video_info(url) elif re.match(r'(.+)vk\.com\/photo(.+)', url): link, title, ext, size = get_image_info(url) elif re.search(r'vk\.com\/video\d+_\d+', url): link, title, ext, size = get_video_from_user_videolist(url) else: raise NotImplementedError('Nothing to download here') if not info_only and link is not None: download_urls([link], title, ext, size, output_dir, merge=merge) site_info = "VK.com" download = vk_download download_playlist = playlist_not_supported('vk') ================================================ FILE: src/you_get/extractors/w56.py ================================================ #!/usr/bin/env python __all__ = ['w56_download', 'w56_download_by_id'] from ..common import * from .sohu import sohu_download import json def w56_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False): content = json.loads(get_html('http://vxml.56.com/json/%s/?src=site' % id)) info = content['info'] title = title or info['Subject'] assert title hd = info['hd'] assert hd in (0, 1, 2) hd_types = [['normal', 'qvga'], ['clear', 'vga'], ['super', 'wvga']][hd] files = [x for x in info['rfiles'] if x['type'] in hd_types] assert len(files) == 1 size = int(files[0]['filesize']) url = files[0]['url'] + '&prod=56' ext = 'mp4' print_info(site_info, title, ext, size) if not info_only: download_urls([url], title, ext, size, output_dir = output_dir, merge = merge) def w56_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): content = get_content(url) sohu_url = r1(r"url:\s*'([^']*)'", content) if sohu_url: sohu_download(sohu_url, output_dir, merge=merge, info_only=info_only, **kwargs) return id = r1(r'http://www.56.com/u\d+/v_(\w+).html', url) or \ r1(r'http://www.56.com/.*vid-(\w+).html', url) w56_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only) site_info = "56.com" download = w56_download download_playlist = playlist_not_supported('56') ================================================ FILE: src/you_get/extractors/wanmen.py ================================================ #!/usr/bin/env python __all__ = ['wanmen_download', 'wanmen_download_by_course', 'wanmen_download_by_course_topic', 'wanmen_download_by_course_topic_part'] from ..common import * from .bokecc import bokecc_download_by_id from json import loads ##Helper functions def _wanmen_get_json_api_content_by_courseID(courseID): """int->JSON Return a parsed JSON tree of WanMen's API.""" return loads(get_content('http://api.wanmen.org/course/getCourseNested/{courseID}'.format(courseID = courseID))) def _wanmen_get_title_by_json_topic_part(json_content, tIndex, pIndex): """JSON, int, int, int->str Get a proper title with courseid+topicID+partID.""" return '_'.join([json_content[0]['name'], json_content[0]['Topics'][tIndex]['name'], json_content[0]['Topics'][tIndex]['Parts'][pIndex]['name']]) def _wanmen_get_boke_id_by_json_topic_part(json_content, tIndex, pIndex): """JSON, int, int, int->str Get one BokeCC video ID with courseid+topicID+partID.""" return json_content[0]['Topics'][tIndex]['Parts'][pIndex]['ccVideoLink'] ##Parsers def wanmen_download_by_course(json_api_content, output_dir='.', merge=True, info_only=False, **kwargs): 
"""int->None Download a WHOLE course. Reuse the API call to save time.""" for tIndex in range(len(json_api_content[0]['Topics'])): for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])): wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) def wanmen_download_by_course_topic(json_api_content, tIndex, output_dir='.', merge=True, info_only=False, **kwargs): """int, int->None Download a TOPIC of a course. Reuse the API call to save time.""" for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])): wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) def wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir='.', merge=True, info_only=False, **kwargs): """int, int, int->None Download ONE PART of the course.""" html = json_api_content title = _wanmen_get_title_by_json_topic_part(html, tIndex, pIndex) bokeccID = _wanmen_get_boke_id_by_json_topic_part(html, tIndex, pIndex) bokecc_download_by_id(vid = bokeccID, title = title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) ##Main entrance def wanmen_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if not 'wanmen.org' in url: log.wtf('You are at the wrong place dude. This is for WanMen University!') raise courseID = int(match1(url, r'course\/(\d+)')) assert courseID > 0 #without courseID we cannot do anything tIndex = int(match1(url, r'tIndex=(\d+)')) pIndex = int(match1(url, r'pIndex=(\d+)')) json_api_content = _wanmen_get_json_api_content_by_courseID(courseID) if pIndex: #only download ONE single part assert tIndex >= 0 wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir = output_dir, merge = merge, info_only = info_only) elif tIndex: #download a topic wanmen_download_by_course_topic(json_api_content, tIndex, output_dir = output_dir, merge = merge, info_only = info_only) else: #download the whole course wanmen_download_by_course(json_api_content, output_dir = output_dir, merge = merge, info_only = info_only) site_info = "WanMen University" download = wanmen_download download_playlist = wanmen_download_by_course ================================================ FILE: src/you_get/extractors/ximalaya.py ================================================ #!/usr/bin/env python __all__ = ['ximalaya_download_playlist', 'ximalaya_download', 'ximalaya_download_by_id'] from ..common import * import json import re stream_types = [ {'itag': '1', 'container': 'm4a', 'bitrate': 'default'}, {'itag': '2', 'container': 'm4a', 'bitrate': '32'}, {'itag': '3', 'container': 'm4a', 'bitrate': '64'} ] def ximalaya_download_by_id(id, title = None, output_dir = '.', info_only = False, stream_id = None): BASE_URL = 'http://www.ximalaya.com/tracks/' json_url = BASE_URL + id + '.json' json_data = json.loads(get_content(json_url, headers=fake_headers)) if 'res' in json_data: if json_data['res'] == False: raise ValueError('Server reported id %s is invalid' % id) if 'is_paid' in json_data and json_data['is_paid']: if 'is_free' in json_data and not json_data['is_free']: raise ValueError('%s is paid item' % id) if (not title) and 'title' in json_data: title = json_data['title'] #no size data in the json. should it be calculated? 
size = 0 url = json_data['play_path_64'] if stream_id: if stream_id == '1': url = json_data['play_path_32'] elif stream_id == '0': url = json_data['play_path'] logging.debug('ximalaya_download_by_id: %s' % url) ext = 'm4a' urls = [url] print('Site: %s' % site_info) print('title: %s' % title) if info_only: if stream_id: print_stream_info(stream_id) else: for item in range(0, len(stream_types)): print_stream_info(item) if not info_only: print('Type: MPEG-4 audio m4a') print('Size: N/A') download_urls(urls, title, ext, size, output_dir = output_dir, merge = False) def ximalaya_download(url, output_dir = '.', info_only = False, stream_id = None, **kwargs): if re.match(r'http://www\.ximalaya\.com/(\d+)/sound/(\d+)', url): id = match1(url, r'http://www\.ximalaya\.com/\d+/sound/(\d+)') else: raise NotImplementedError(url) ximalaya_download_by_id(id, output_dir = output_dir, info_only = info_only, stream_id = stream_id) def ximalaya_download_page(playlist_url, output_dir = '.', info_only = False, stream_id = None, **kwargs): if re.match(r'http://www\.ximalaya\.com/(\d+)/album/(\d+)', playlist_url): page_content = get_content(playlist_url) pattern = re.compile(r'
<li sound_id="(\w+)"') #video_url = match1(html, r'
    400 # change it to the dispatcher of aliCDN can do better # at least a little more recoverable from HTTP 403 if cls.dispatcher_url in url: return url elif 'k.youku.com' in url: return url else: url_seg_list = list(urllib.parse.urlsplit(url)) url_seg_list[1] = cls.dispatcher_url return urllib.parse.urlunsplit(url_seg_list) def get_vid_from_url(self): # It's unreliable. check #1633 b64p = r'([a-zA-Z0-9=]+)' p_list = [r'youku\.com/v_show/id_'+b64p, r'player\.youku\.com/player\.php/sid/'+b64p+r'/v\.swf', r'loader\.swf\?VideoIDS='+b64p, r'player\.youku\.com/embed/'+b64p] if not self.url: raise Exception('No url') for p in p_list: hit = re.search(p, self.url) if hit is not None: self.vid = hit.group(1) return def get_vid_from_page(self): if not self.url: raise Exception('No url') self.page = get_content(self.url) hit = re.search(r'videoId2:"([A-Za-z0-9=]+)"', self.page) if hit is not None: self.vid = hit.group(1) def prepare(self, **kwargs): assert self.url or self.vid if self.url and not self.vid: self.get_vid_from_url() if self.vid is None: self.get_vid_from_page() if self.vid is None: log.wtf('Cannot fetch vid') if kwargs.get('src') and kwargs['src'] == 'tudou': self.ccode = '0512' if kwargs.get('password') and kwargs['password']: self.password_protected = True self.password = kwargs['password'] self.utid = fetch_cna() time.sleep(3) self.youku_ups() if self.api_data.get('stream') is None: if self.api_error_code == -6001: # wrong vid parsed from the page vid_from_url = self.vid self.get_vid_from_page() if vid_from_url == self.vid: log.wtf(self.api_error_msg) self.youku_ups() if self.api_data.get('stream') is None: if self.api_error_code == -2002: # wrong password self.password_protected = True # it can be True already(from cli). offer another chance to retry self.password = input(log.sprint('Password: ', log.YELLOW)) self.youku_ups() if self.api_data.get('stream') is None: if self.api_error_msg: log.wtf(self.api_error_msg) else: log.wtf('Unknown error') self.title = self.api_data['video']['title'] stream_types = dict([(i['id'], i) for i in self.stream_types]) audio_lang = self.api_data['stream'][0]['audio_lang'] for stream in self.api_data['stream']: stream_id = stream['stream_type'] is_preview = False if stream_id in stream_types and stream['audio_lang'] == audio_lang: if 'alias-of' in stream_types[stream_id]: stream_id = stream_types[stream_id]['alias-of'] if stream_id not in self.streams: self.streams[stream_id] = { 'container': stream_types[stream_id]['container'], 'video_profile': stream_types[stream_id]['video_profile'], 'size': stream['size'], 'pieces': [{ 'segs': stream['segs'] }], 'm3u8_url': stream['m3u8_url'] } src = [] for seg in stream['segs']: if seg.get('cdn_url'): src.append(self.__class__.change_cdn(seg['cdn_url'])) else: is_preview = True self.streams[stream_id]['src'] = src else: self.streams[stream_id]['size'] += stream['size'] self.streams[stream_id]['pieces'].append({ 'segs': stream['segs'] }) src = [] for seg in stream['segs']: if seg.get('cdn_url'): src.append(self.__class__.change_cdn(seg['cdn_url'])) else: is_preview = True self.streams[stream_id]['src'].extend(src) if is_preview: log.w('{} is a preview'.format(stream_id)) # Audio languages if 'dvd' in self.api_data: al = self.api_data['dvd'].get('audiolang') if al: self.audiolang = al for i in self.audiolang: i['url'] = 'http://v.youku.com/v_show/id_{}'.format(i['vid']) def youku_download_playlist_by_url(url, **kwargs): video_page_pt = 'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)' js_cb_pt = r'\(({.+})\)' if 
re.match(video_page_pt, url): youku_obj = Youku() youku_obj.url = url youku_obj.prepare(**kwargs) total_episode = None try: total_episode = youku_obj.api_data['show']['episode_total'] except KeyError: log.wtf('Cannot get total_episode for {}'.format(url)) next_vid = youku_obj.vid for _ in range(total_episode): this_extractor = Youku() this_extractor.download_by_vid(next_vid, keep_obj=True, **kwargs) next_vid = this_extractor.video_next['encodevid'] ''' if youku_obj.video_list is None: log.wtf('Cannot find video list for {}'.format(url)) else: vid_list = [v['encodevid'] for v in youku_obj.video_list] for v in vid_list: Youku().download_by_vid(v, **kwargs) ''' elif re.match('https?://list.youku.com/show/id_', url): # http://list.youku.com/show/id_z2ae8ee1c837b11e18195.html # official playlist page = get_content(url) show_id = re.search(r'showid:"(\d+)"', page).group(1) ep = 'http://list.youku.com/show/module?id={}&tab=showInfo&callback=jQuery'.format(show_id) xhr_page = get_content(ep).replace(r'\/', '/').replace(r'\"', '"') video_url = re.search(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_page).group(1) youku_download_playlist_by_url('http://'+video_url, **kwargs) return elif re.match(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url): # http://list.youku.com/albumlist/show/id_2336634.html # UGC playlist list_id = re.search(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1) ep = 'http://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=tuijsonp6' first_u = ep.format(list_id, 1) xhr_page = get_content(first_u) json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1)) video_cnt = json_data['data']['total'] xhr_html = json_data['html'] v_urls = re.findall(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_html) if video_cnt > 20: req_cnt = video_cnt // 20 for i in range(2, req_cnt+2): req_u = ep.format(list_id, i) xhr_page = get_content(req_u) json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1).replace(r'\/', '/')) xhr_html = json_data['html'] page_videos = re.findall(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_html) v_urls.extend(page_videos) for u in v_urls[0::2]: url = 'http://' + u Youku().download_by_url(url, **kwargs) return def youku_download_by_url(url, **kwargs): Youku().download_by_url(url, **kwargs) def youku_download_by_vid(vid, **kwargs): Youku().download_by_vid(vid, **kwargs) download = youku_download_by_url download_playlist = youku_download_playlist_by_url ================================================ FILE: src/you_get/extractors/youtube.py ================================================ #!/usr/bin/env python from ..common import * from ..extractor import VideoExtractor try: import dukpy except ImportError: log.e('Please install dukpy in order to extract videos from YouTube:') log.e('$ pip install dukpy') exit(0) from urllib.parse import urlparse, parse_qs, urlencode from xml.dom.minidom import parseString class YouTube(VideoExtractor): name = "YouTube" # Non-DASH YouTube media encoding options, in descending quality order. # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs. Retrieved July 17, 2014. 
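The stream_types table below is an ordered preference list: extraction stores every available format keyed by itag, and selection walks the table and takes the first itag that is actually present. A toy version of that lookup pattern (table truncated, data hypothetical):

    PREFERRED_ITAGS = ['38', '46', '37', '22', '18']  # descending quality

    def pick_best_itag(available):
        # 'available' maps itag -> stream info, like self.streams
        for itag in PREFERRED_ITAGS:
            if itag in available:
                return itag
        return None

    print(pick_best_itag({'18': {'container': 'MP4'}, '22': {'container': 'MP4'}}))  # '22'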
stream_types = [ {'itag': '38', 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, #{'itag': '85', 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '3-4', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': '46', 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': '37', 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, #{'itag': '102', 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': '45', 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '2', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, #{'itag': '84', 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': '22', 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '2-3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': '120', 'container': 'FLV', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'Main@L3.1', 'video_bitrate': '2', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, # Live streaming only {'itag': '44', 'container': 'WebM', 'video_resolution': '480p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '1', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': '35', 'container': 'FLV', 'video_resolution': '480p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.8-1', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, #{'itag': '101', 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, #{'itag': '100', 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': '43', 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': '34', 'container': 'FLV', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, #{'itag': '82', 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': '18', 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': 'Baseline', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': '6', 'container': 'FLV', 'video_resolution': '270p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.8', 'audio_encoding': 'MP3', 'audio_bitrate': '64'}, #{'itag': '83', 'container': 'MP4', 'video_resolution': '240p', 'video_encoding': 'H.264', 'video_profile': '3D', 
'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': '13', 'container': '3GP', 'video_resolution': '', 'video_encoding': 'MPEG-4 Visual', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': ''}, {'itag': '5', 'container': 'FLV', 'video_resolution': '240p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.25', 'audio_encoding': 'MP3', 'audio_bitrate': '64'}, {'itag': '36', 'container': '3GP', 'video_resolution': '240p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.175', 'audio_encoding': 'AAC', 'audio_bitrate': '32'}, {'itag': '17', 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] def dethrottle(js, url): def n_to_n(js, n): # Examples: # yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js # Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js # jma - https://www.youtube.com/s/player/8d9f6215/player_ias.vflset/sv_SE/base.js f1 = match1(js, r',[$\w]+\.length\|\|([$\w]+)\(""\)\)}};') # Examples: # Yla, ida - https://www.youtube.com/s/player/fb725ac8/player-plasma-ias-phone-sv_SE.vflset/base.js # Hla, eda - https://www.youtube.com/s/player/2f238d39/player-plasma-ias-phone-en_US.vflset/base.js # WyE, bE7, Gsn - https://www.youtube.com/s/player/3bb1f723/player-plasma-ias-phone-sv_SE.vflset/base.js if not f1: f0 = match1(js, r'\w=([$\w]+)\[0\]\(\w\),\w\.set\(\w,\w\)') f1 = match1(js, r'%s=\[([$\w]+)\]' % f0) f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1)) v1 = match1(f1def, r'if\(typeof ([$\w]+)==="undefined"\)') v1def = match1(js, r'(var %s=[^;]+;)' % v1) if not v1def: v1def = '' n = dukpy.evaljs('%s(%s)("%s")' % (v1def, f1def, n)) return n u = urlparse(url) qs = parse_qs(u.query) n = n_to_n(js, qs['n'][0]) qs['n'] = [n] return u._replace(query=urlencode(qs, doseq=True)).geturl() def s_to_sig(js, s): # Examples: # BPa - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js # Xva - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js js_code = '' f1 = match1(js, r'=([$\w]+)\(decodeURIComponent\(') f1def = match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) # remove . prefix f1def = 'function %s%s' % (f1, f1def) f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) # find all invoked function names for f2 in f2s: f2e = re.escape(f2) f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js) if f2def: f2def = 'function {}({}){}'.format(f2e, f2def.group(1), f2def.group(2)) else: f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) js_code += f2def + ';' js_code += f1def + ';%s("%s")' % (f1, s) sig = dukpy.evaljs(js_code) return sig def chunk_by_range(url, size): urls = [] chunk_size = 10485760 start, end = 0, chunk_size - 1 urls.append('%s&range=%s-%s' % (url, start, end)) while end + 1 < size: # processed size < expected size start, end = end + 1, end + chunk_size urls.append('%s&range=%s-%s' % (url, start, end)) return urls def get_url_from_vid(vid): return 'https://youtu.be/{}'.format(vid) def get_vid_from_url(url): """Extracts video ID from URL. 
""" return match1(url, r'youtu\.be/([^?/]+)') or \ match1(url, r'youtube\.com/embed/([^/?]+)') or \ match1(url, r'youtube\.com/shorts/([^/?]+)') or \ match1(url, r'youtube\.com/v/([^/?]+)') or \ match1(url, r'youtube\.com/watch/([^/?]+)') or \ parse_query_param(url, 'v') or \ parse_query_param(parse_query_param(url, 'u'), 'v') def get_playlist_id_from_url(url): """Extracts playlist ID from URL. """ return parse_query_param(url, 'list') or \ parse_query_param(url, 'p') def download_playlist_by_url(self, url, **kwargs): self.url = url playlist_id = self.__class__.get_playlist_id_from_url(self.url) if playlist_id is None: log.wtf('[Failed] Unsupported URL pattern.') video_page = get_content('https://www.youtube.com/playlist?list=%s' % playlist_id) playlist_json_serialized = match1(video_page, r'window\["ytInitialData"\]\s*=\s*(.+);', r'var\s+ytInitialData\s*=\s*([^;]+);') if len(playlist_json_serialized) == 0: log.wtf('[Failed] Unable to extract playlist data') ytInitialData = json.loads(playlist_json_serialized[0]) tab0 = ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][0] itemSection0 = tab0['tabRenderer']['content']['sectionListRenderer']['contents'][0] playlistVideoList0 = itemSection0['itemSectionRenderer']['contents'][0] videos = playlistVideoList0['playlistVideoListRenderer']['contents'] self.title = re.search(r'|var )', video_page).group(1)) self.check_playability_response(ytInitialPlayerResponse) # Get the video title self.title = ytInitialPlayerResponse["videoDetails"]["title"] # Check the status playabilityStatus = ytInitialPlayerResponse['playabilityStatus'] status = playabilityStatus['status'] logging.debug('status: %s' % status) if status != 'OK': # If cookies are loaded, status should be OK try: subreason = playabilityStatus['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'][0]['text'] log.e('[Error] %s (%s)' % (playabilityStatus['reason'], subreason)) except: log.e('[Error] %s' % playabilityStatus['reason']) if status == 'LOGIN_REQUIRED': log.e('View the video from a browser and export the cookies, then use --cookies to load cookies.') exit(1) stream_list = ytInitialPlayerResponse['streamingData']['formats'] for stream in stream_list: logging.debug('Found format: itag=%s' % stream['itag']) if 'signatureCipher' in stream: logging.debug(' Parsing signatureCipher for itag=%s...' 
% stream['itag']) qs = parse_qs(stream['signatureCipher']) #logging.debug(qs) sp = qs['sp'][0] sig = self.__class__.s_to_sig(self.js, qs['s'][0]) url = qs['url'][0] + '&{}={}'.format(sp, sig) elif 'url' in stream: url = stream['url'] else: log.wtf(' No signatureCipher or url for itag=%s' % stream['itag']) url = self.__class__.dethrottle(self.js, url) self.streams[str(stream['itag'])] = { 'itag': str(stream['itag']), 'url': url, 'quality': stream['quality'], 'type': stream['mimeType'], 'mime': stream['mimeType'].split(';')[0], 'container': mime_to_container(stream['mimeType'].split(';')[0]), } # FIXME: Prepare caption tracks try: caption_tracks = ytInitialPlayerResponse['captions']['playerCaptionsTracklistRenderer']['captionTracks'] for ct in caption_tracks: ttsurl, lang = ct['baseUrl'], ct['languageCode'] if ttsurl.startswith('/'): ttsurl = 'https://www.youtube.com' + ttsurl tts_xml = parseString(get_content(ttsurl)) transcript = tts_xml.getElementsByTagName('transcript')[0] texts = transcript.getElementsByTagName('text') srt = ""; seq = 0 for text in texts: if text.firstChild is None: continue # empty element seq += 1 start = float(text.getAttribute('start')) if text.getAttribute('dur'): dur = float(text.getAttribute('dur')) else: dur = 1.0 # could be ill-formed XML finish = start + dur m, s = divmod(start, 60); h, m = divmod(m, 60) start = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',') m, s = divmod(finish, 60); h, m = divmod(m, 60) finish = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',') content = unescape_html(text.firstChild.nodeValue) srt += '%s\n' % str(seq) srt += '%s --> %s\n' % (start, finish) srt += '%s\n\n' % content if 'kind' in ct: self.caption_tracks[ct['vssId']] = srt # autogenerated else: self.caption_tracks[lang] = srt except: pass # Prepare DASH streams if 'adaptiveFormats' in ytInitialPlayerResponse['streamingData']: streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] # FIXME: dead code? # streams without contentLength got broken urls, just remove them (#2767) streams = [stream for stream in streams if 'contentLength' in stream] for stream in streams: logging.debug('Found adaptiveFormat: itag=%s' % stream['itag']) stream['itag'] = str(stream['itag']) if 'qualityLabel' in stream: stream['quality_label'] = stream['qualityLabel'] del stream['qualityLabel'] logging.debug(' quality_label: \t%s' % stream['quality_label']) if 'width' in stream: stream['size'] = '{}x{}'.format(stream['width'], stream['height']) del stream['width'] del stream['height'] logging.debug(' size: \t%s' % stream['size']) stream['type'] = stream['mimeType'] logging.debug(' type: \t%s' % stream['type']) stream['clen'] = stream['contentLength'] stream['init'] = '{}-{}'.format( stream['initRange']['start'], stream['initRange']['end']) stream['index'] = '{}-{}'.format( stream['indexRange']['start'], stream['indexRange']['end']) del stream['mimeType'] del stream['contentLength'] del stream['initRange'] del stream['indexRange'] if 'signatureCipher' in stream: logging.debug(' Parsing signatureCipher for itag=%s...' 
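The signatureCipher handling above is query-string surgery: parse the cipher, decipher 's', and append the result under the parameter name carried in 'sp'. Sketched standalone, with string reversal standing in for the real s_to_sig() transform:

    from urllib.parse import parse_qs

    cipher = 's=4CBA&sp=sig&url=https%3A%2F%2Fexample.com%2Fvideoplayback%3Fitag%3D22'
    qs = parse_qs(cipher)
    sig = qs['s'][0][::-1]  # stand-in for s_to_sig(js, s)
    url = qs['url'][0] + '&{}={}'.format(qs['sp'][0], sig)
    print(url)  # https://example.com/videoplayback?itag=22&sig=ABC4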
% stream['itag']) qs = parse_qs(stream['signatureCipher']) #logging.debug(qs) sp = qs['sp'][0] sig = self.__class__.s_to_sig(self.js, qs['s'][0]) url = qs['url'][0] + '&ratebypass=yes&{}={}'.format(sp, sig) elif 'url' in stream: url = stream['url'] else: log.wtf('No signatureCipher or url for itag=%s' % stream['itag']) url = self.__class__.dethrottle(self.js, url) stream['url'] = url for stream in streams: # audio if stream['type'].startswith('audio/mp4'): dash_mp4_a_url = stream['url'] dash_mp4_a_size = stream['clen'] elif stream['type'].startswith('audio/webm'): dash_webm_a_url = stream['url'] dash_webm_a_size = stream['clen'] for stream in streams: # video if 'size' in stream: if stream['type'].startswith('video/mp4'): mimeType = 'video/mp4' dash_url = stream['url'] dash_size = stream['clen'] itag = stream['itag'] dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size)) self.dash_streams[itag] = { 'quality': '%s (%s)' % (stream['size'], stream['quality_label']), 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'mp4', 'src': [dash_urls, dash_mp4_a_urls], 'size': int(dash_size) + int(dash_mp4_a_size) } elif stream['type'].startswith('video/webm'): mimeType = 'video/webm' dash_url = stream['url'] dash_size = stream['clen'] itag = stream['itag'] audio_url = None audio_size = None try: audio_url = dash_webm_a_url audio_size = int(dash_webm_a_size) except UnboundLocalError as e: audio_url = dash_mp4_a_url audio_size = int(dash_mp4_a_size) dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) audio_urls = self.__class__.chunk_by_range(audio_url, int(audio_size)) self.dash_streams[itag] = { 'quality': '%s (%s)' % (stream['size'], stream['quality_label']), 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'webm', 'src': [dash_urls, audio_urls], 'size': int(dash_size) + int(audio_size) } def extract(self, **kwargs): if not self.streams_sorted: # No stream is available return if 'stream_id' in kwargs and kwargs['stream_id']: # Extract the stream stream_id = kwargs['stream_id'] if stream_id not in self.streams and stream_id not in self.dash_streams: log.e('[Error] Invalid video format.') log.e('Run \'-i\' command with no specific video format to view all available formats.') exit(2) else: # Extract stream with the best quality stream_id = self.streams_sorted[0]['itag'] if stream_id in self.streams: src = self.streams[stream_id]['url'] self.streams[stream_id]['src'] = [src] self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src']) site = YouTube() download = site.download_by_url download_playlist = site.download_playlist_by_url ================================================ FILE: src/you_get/extractors/zhanqi.py ================================================ #!/usr/bin/env python __all__ = ['zhanqi_download'] from ..common import * import json import base64 from urllib.parse import urlparse def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): path = urlparse(url).path[1:] if not (path.startswith('videos') or path.startswith('v2/videos')): #url = "https://www.zhanqi.tv/huashan?param_s=1_0.2.0" path_list = path.split('/') room_id = path_list[1] if path_list[0] == 'topic' else path_list[0] zhanqi_live(room_id, merge=merge, output_dir=output_dir, info_only=info_only, **kwargs) else: #url = 'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' # https://www.zhanqi.tv/v2/videos/215593.html video_id = 
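zhanqi_download() above dispatches purely on the URL path: 'videos/...' and 'v2/videos/...' go to the VOD handler, everything else is treated as a live room. The same routing shape as a runnable sketch (labels hypothetical):

    from urllib.parse import urlparse

    def route(url):
        path = urlparse(url).path[1:]
        if path.startswith(('videos', 'v2/videos')):
            return 'vod', path.split('.')[0].split('/')[-1]  # trailing video id
        parts = path.split('/')
        return 'live', parts[1] if parts[0] == 'topic' else parts[0]

    print(route('https://www.zhanqi.tv/v2/videos/215593.html'))  # ('vod', '215593')
    print(route('https://www.zhanqi.tv/huashan'))                # ('live', 'huashan')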
path.split('.')[0].split('/')[-1] zhanqi_video(video_id, merge=merge, output_dir=output_dir, info_only=info_only, **kwargs) def zhanqi_live(room_id, merge=True, output_dir='.', info_only=False, **kwargs): api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/{}.json".format(room_id) json_data = json.loads(get_content(api_url))['data'] status = json_data['status'] if status != '4': raise Exception("The live stream is not online!") nickname = json_data['nickname'] title = nickname + ": " + json_data['title'] video_levels = base64.b64decode(json_data['flashvars']['VideoLevels']).decode('utf8') m3u8_url = json.loads(video_levels)['streamUrl'] print_info(site_info, title, 'm3u8', 0, m3u8_url=m3u8_url, m3u8_type='master') if not info_only: download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge) def zhanqi_video(video_id, output_dir='.', info_only=False, merge=True, **kwargs): api_url = 'https://www.zhanqi.tv/api/static/v2.1/video/{}.json'.format(video_id) json_data = json.loads(get_content(api_url))['data'] title = json_data['title'] vid = json_data['flashvars']['VideoID'] m3u8_url = 'http://dlvod.cdn.zhanqi.tv/' + vid urls = general_m3u8_extractor(m3u8_url) print_info(site_info, title, 'm3u8', 0) if not info_only: download_urls(urls, title, 'ts', 0, output_dir=output_dir, merge=merge, **kwargs) site_info = "www.zhanqi.tv" download = zhanqi_download download_playlist = playlist_not_supported('zhanqi') ================================================ FILE: src/you_get/extractors/zhibo.py ================================================ #!/usr/bin/env python __all__ = ['zhibo_download'] from ..common import * def zhibo_vedio_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): # http://video.zhibo.tv/video/details/d103057f-663e-11e8-9d83-525400ccac43.html html = get_html(url) title = r1(r'<title>([\s\S]*)</title>', html) total_size = 0 part_urls= [] video_html = r1(r'', html) # video_guessulike = r1(r"window.xgData =([s\S'\s\.]*)\'\;[\s\S]*window.vouchData", video_html) video_url = r1(r"window.vurl = \'([s\S'\s\.]*)\'\;[\s\S]*window.imgurl", video_html) part_urls.append(video_url) ext = video_url.split('.')[-1] print_info(site_info, title, ext, total_size) if not info_only: download_urls(part_urls, title, ext, total_size, output_dir=output_dir, merge=merge) def zhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): if 'video.zhibo.tv' in url: zhibo_vedio_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) return # if 'v.zhibo.tv' in url: # http://v.zhibo.tv/31609372 html = get_html(url) title = r1(r'<title>([\s\S]*)</title>', html) is_live = r1(r"window.videoIsLive=\'([s\S'\s\.]*)\'\;[\s\S]*window.resDomain", html) if is_live != "1": raise ValueError("The live stream is not online! (Errno:%s)" % is_live) match = re.search(r""" ourStreamName .*? '(.*?)' .*? rtmpHighSource .*? '(.*?)' .*?
'(.*?)' """, html, re.S | re.X) real_url = match.group(3) + match.group(1) + match.group(2) print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', params={}, output_dir=output_dir, merge=merge) site_info = "zhibo.tv" download = zhibo_download download_playlist = playlist_not_supported('zhibo') ================================================ FILE: src/you_get/extractors/zhihu.py ================================================ #!/usr/bin/env python __all__ = ['zhihu_download', 'zhihu_download_playlist'] from ..common import * import json def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs): paths = url.split("/") # question or column if len(paths) < 3 and len(paths) < 6: raise TypeError("URL does not conform to specifications, Support column and question only." "Example URL: https://zhuanlan.zhihu.com/p/51669862 or " "https://www.zhihu.com/question/267782048/answer/490720324") if ("question" not in paths or "answer" not in paths) and "zhuanlan.zhihu.com" not in paths: raise TypeError("URL does not conform to specifications, Support column and question only." "Example URL: https://zhuanlan.zhihu.com/p/51669862 or " "https://www.zhihu.com/question/267782048/answer/490720324") html = get_html(url, faker=True) title = match1(html, r'data-react-helmet="true">(.*?)') for index, video_id in enumerate(matchall(html, [r' '0') or (vers[0] == 'avconv') try: v = vers[2][1:] if vers[2][0] == 'n' else vers[2] version = [int(i) for i in v.split('.')] except: version = [1, 0] return cmd, 'ffprobe', version except: return None FFMPEG, FFPROBE, FFMPEG_VERSION = get_usable_ffmpeg('ffmpeg') or get_usable_ffmpeg('avconv') or (None, None, None) if logging.getLogger().isEnabledFor(logging.DEBUG): LOGLEVEL = ['-loglevel', 'info'] STDIN = None else: LOGLEVEL = ['-loglevel', 'quiet'] STDIN = DEVNULL def has_ffmpeg_installed(): return FFMPEG is not None # Given a list of segments and the output path, generates the concat # list and returns the path to the concat list. def generate_concat_list(files, output): concat_list_path = output + '.txt' concat_list_dir = os.path.dirname(concat_list_path) with open(concat_list_path, 'w', encoding='utf-8') as concat_list: for file in files: if os.path.isfile(file): relpath = os.path.relpath(file, start=concat_list_dir) concat_list.write('file %s\n' % parameterize(relpath)) return concat_list_path def ffmpeg_concat_av(files, output, ext): print('Merging video parts... ', end="", flush=True) params = [FFMPEG] + LOGLEVEL for file in files: if os.path.isfile(file): params.extend(['-i', file]) params.extend(['-c', 'copy']) params.extend(['--', output]) if subprocess.call(params, stdin=STDIN): print('Merging without re-encode failed.\nTry again re-encoding audio... 
', end="", flush=True) try: os.remove(output) except FileNotFoundError: pass params = [FFMPEG] + LOGLEVEL for file in files: if os.path.isfile(file): params.extend(['-i', file]) params.extend(['-c:v', 'copy']) if ext == 'mp4': params.extend(['-c:a', 'aac']) params.extend(['-strict', 'experimental']) elif ext == 'webm': params.extend(['-c:a', 'opus']) params.extend(['--', output]) return subprocess.call(params, stdin=STDIN) else: return 0 def ffmpeg_convert_ts_to_mkv(files, output='output.mkv'): for file in files: if os.path.isfile(file): params = [FFMPEG] + LOGLEVEL params.extend(['-y', '-i', file]) params.extend(['--', output]) subprocess.call(params, stdin=STDIN) return def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: os.remove(output + '.txt') return True else: raise for file in files: if os.path.isfile(file): params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.extend([file, file + '.mpg']) subprocess.call(params, stdin=STDIN) inputs = [open(file + '.mpg', 'rb') for file in files] with open(output + '.mpg', 'wb') as o: for input in inputs: o.write(input.read()) params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append(output + '.mpg') params += ['-vcodec', 'copy', '-acodec', 'copy'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: for file in files: os.remove(file + '.mpg') os.remove(output + '.mpg') return True else: raise def ffmpeg_concat_ts_to_mkv(files, output='output.mkv'): print('Merging video parts... ', end="", flush=True) params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') for file in files: if os.path.isfile(file): params[-1] += file + '|' params += ['-f', 'matroska', '-c', 'copy'] params.extend(['--', output]) try: if subprocess.call(params, stdin=STDIN) == 0: return True else: return False except: return False def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): print('Merging video parts... ', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) subprocess.check_call(params, stdin=STDIN) os.remove(output + '.txt') return True for file in files: if os.path.isfile(file): params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append(file) params += ['-map', '0', '-c', 'copy', '-f', 'mpegts', '-bsf:v', 'h264_mp4toannexb'] params.append(file + '.ts') subprocess.call(params, stdin=STDIN) params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') for file in files: f = file + '.ts' if os.path.isfile(f): params[-1] += f + '|' if FFMPEG == 'avconv': params += ['-c', 'copy'] else: params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: for file in files: os.remove(file + '.ts') return True else: raise def ffmpeg_concat_mp3_to_mp3(files, output='output.mp3'): print('Merging video parts... 
', end="", flush=True) files = 'concat:' + '|'.join(files) params = [FFMPEG] + LOGLEVEL + ['-y'] params += ['-i', files, '-acodec', 'copy'] params.extend(['--', output]) subprocess.call(params) return True def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): print('Merging video parts... ', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) subprocess.check_call(params, stdin=STDIN) os.remove(output + '.txt') return True for file in files: if os.path.isfile(file): params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append(file) params += ['-c', 'copy', '-f', 'mpegts', '-bsf:v', 'h264_mp4toannexb'] params.append(file + '.ts') subprocess.call(params, stdin=STDIN) params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') for file in files: f = file + '.ts' if os.path.isfile(f): params[-1] += f + '|' if FFMPEG == 'avconv': params += ['-c', 'copy'] else: params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) subprocess.check_call(params, stdin=STDIN) for file in files: os.remove(file + '.ts') return True def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.', stream=True): """str, str->True WARNING: NOT THE SAME PARMS AS OTHER FUNCTIONS!!!!!! You can basically download anything with this function but better leave it alone with """ output = title + '.' + ext if not (output_dir == '.'): output = output_dir + '/' + output print('Downloading streaming content with FFmpeg, press q to stop recording...') if stream: ffmpeg_params = [FFMPEG] + ['-y', '-re', '-i'] else: ffmpeg_params = [FFMPEG] + ['-y', '-i'] ffmpeg_params.append(files) #not the same here!!!! if FFMPEG == 'avconv': #who cares? ffmpeg_params += ['-c', 'copy'] else: ffmpeg_params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] if params is not None: if len(params) > 0: for k, v in params: ffmpeg_params.append(k) ffmpeg_params.append(v) ffmpeg_params.extend(['--', output]) print(' '.join(ffmpeg_params)) try: a = subprocess.Popen(ffmpeg_params, stdin= subprocess.PIPE) a.communicate() except KeyboardInterrupt: try: a.stdin.write('q'.encode('utf-8')) except: pass return True def ffmpeg_concat_audio_and_video(files, output, ext): print('Merging video and audio parts... ', end="", flush=True) if has_ffmpeg_installed: params = [FFMPEG] + LOGLEVEL params.extend(['-f', 'concat']) params.extend(['-safe', '0']) # https://stackoverflow.com/questions/38996925/ffmpeg-concat-unsafe-file-name for file in files: if os.path.isfile(file): params.extend(['-i', file]) params.extend(['-c:v', 'copy']) params.extend(['-c:a', 'aac']) params.extend(['-strict', 'experimental']) params.extend(['--', output + "." 
+ ext]) return subprocess.call(params, stdin=STDIN) else: raise EnvironmentError('No ffmpeg found') def ffprobe_get_media_duration(file): print('Getting {} duration'.format(file)) params = [FFPROBE] params.extend(['-i', file]) params.extend(['-show_entries', 'format=duration']) params.extend(['-v', 'quiet']) params.extend(['-of', 'csv=p=0']) return subprocess.check_output(params, stdin=STDIN, stderr=subprocess.STDOUT).decode().strip() ================================================ FILE: src/you_get/processor/join_flv.py ================================================ #!/usr/bin/env python import struct from io import BytesIO TAG_TYPE_METADATA = 18 ################################################## # AMF0 ################################################## AMF_TYPE_NUMBER = 0x00 AMF_TYPE_BOOLEAN = 0x01 AMF_TYPE_STRING = 0x02 AMF_TYPE_OBJECT = 0x03 AMF_TYPE_MOVIECLIP = 0x04 AMF_TYPE_NULL = 0x05 AMF_TYPE_UNDEFINED = 0x06 AMF_TYPE_REFERENCE = 0x07 AMF_TYPE_MIXED_ARRAY = 0x08 AMF_TYPE_END_OF_OBJECT = 0x09 AMF_TYPE_ARRAY = 0x0A AMF_TYPE_DATE = 0x0B AMF_TYPE_LONG_STRING = 0x0C AMF_TYPE_UNSUPPORTED = 0x0D AMF_TYPE_RECORDSET = 0x0E AMF_TYPE_XML = 0x0F AMF_TYPE_CLASS_OBJECT = 0x10 AMF_TYPE_AMF3_OBJECT = 0x11 class ECMAObject: def __init__(self, max_number): self.max_number = max_number self.data = [] self.map = {} def put(self, k, v): self.data.append((k, v)) self.map[k] = v def get(self, k): return self.map[k] def set(self, k, v): for i in range(len(self.data)): if self.data[i][0] == k: self.data[i] = (k, v) break else: raise KeyError(k) self.map[k] = v def keys(self): return self.map.keys() def __str__(self): return 'ECMAObject<' + repr(self.map) + '>' def __eq__(self, other): return self.max_number == other.max_number and self.data == other.data def read_amf_number(stream): return struct.unpack('>d', stream.read(8))[0] def read_amf_boolean(stream): b = read_byte(stream) assert b in (0, 1) return bool(b) def read_amf_string(stream): xx = stream.read(2) if xx == b'': # dirty fix for the invalid Qiyi flv return None n = struct.unpack('>H', xx)[0] s = stream.read(n) assert len(s) == n return s.decode('utf-8') def read_amf_object(stream): obj = {} while True: k = read_amf_string(stream) if not k: assert read_byte(stream) == AMF_TYPE_END_OF_OBJECT break v = read_amf(stream) obj[k] = v return obj def read_amf_mixed_array(stream): max_number = read_uint(stream) mixed_results = ECMAObject(max_number) while True: k = read_amf_string(stream) if k is None: # dirty fix for the invalid Qiyi flv break if not k: assert read_byte(stream) == AMF_TYPE_END_OF_OBJECT break v = read_amf(stream) mixed_results.put(k, v) assert len(mixed_results.data) == max_number return mixed_results def read_amf_array(stream): n = read_uint(stream) v = [] for i in range(n): v.append(read_amf(stream)) return v amf_readers = { AMF_TYPE_NUMBER: read_amf_number, AMF_TYPE_BOOLEAN: read_amf_boolean, AMF_TYPE_STRING: read_amf_string, AMF_TYPE_OBJECT: read_amf_object, AMF_TYPE_MIXED_ARRAY: read_amf_mixed_array, AMF_TYPE_ARRAY: read_amf_array, } def read_amf(stream): return amf_readers[read_byte(stream)](stream) def write_amf_number(stream, v): stream.write(struct.pack('>d', v)) def write_amf_boolean(stream, v): if v: stream.write(b'\x01') else: stream.write(b'\x00') def write_amf_string(stream, s): s = s.encode('utf-8') stream.write(struct.pack('>H', len(s))) stream.write(s) def write_amf_object(stream, o): for k in o: write_amf_string(stream, k) write_amf(stream, o[k]) write_amf_string(stream, '') write_byte(stream, AMF_TYPE_END_OF_OBJECT) 
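The AMF0 readers and writers above are symmetric, so a write-then-read round trip over BytesIO is a quick sanity check. Here is the same big-endian, length-prefixed string encoding used by write_amf_string()/read_amf_string(), reduced to a self-contained sketch:

    import struct
    from io import BytesIO

    def w_str(stream, s):
        data = s.encode('utf-8')
        stream.write(struct.pack('>H', len(data)))  # 16-bit big-endian length prefix
        stream.write(data)

    def r_str(stream):
        n = struct.unpack('>H', stream.read(2))[0]
        return stream.read(n).decode('utf-8')

    buf = BytesIO()
    w_str(buf, 'duration')
    buf.seek(0)
    print(r_str(buf))  # 'duration'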
def write_amf_mixed_array(stream, o): write_uint(stream, o.max_number) for k, v in o.data: write_amf_string(stream, k) write_amf(stream, v) write_amf_string(stream, '') write_byte(stream, AMF_TYPE_END_OF_OBJECT) def write_amf_array(stream, o): write_uint(stream, len(o)) for v in o: write_amf(stream, v) amf_writers_tags = { float: AMF_TYPE_NUMBER, bool: AMF_TYPE_BOOLEAN, str: AMF_TYPE_STRING, dict: AMF_TYPE_OBJECT, ECMAObject: AMF_TYPE_MIXED_ARRAY, list: AMF_TYPE_ARRAY, } amf_writers = { AMF_TYPE_NUMBER: write_amf_number, AMF_TYPE_BOOLEAN: write_amf_boolean, AMF_TYPE_STRING: write_amf_string, AMF_TYPE_OBJECT: write_amf_object, AMF_TYPE_MIXED_ARRAY: write_amf_mixed_array, AMF_TYPE_ARRAY: write_amf_array, } def write_amf(stream, v): if isinstance(v, ECMAObject): tag = amf_writers_tags[ECMAObject] else: tag = amf_writers_tags[type(v)] write_byte(stream, tag) amf_writers[tag](stream, v) ################################################## # FLV ################################################## def read_int(stream): return struct.unpack('>i', stream.read(4))[0] def read_uint(stream): return struct.unpack('>I', stream.read(4))[0] def write_uint(stream, n): stream.write(struct.pack('>I', n)) def read_byte(stream): return ord(stream.read(1)) def write_byte(stream, b): stream.write(bytes([b])) def read_unsigned_medium_int(stream): x1, x2, x3 = struct.unpack('BBB', stream.read(3)) return (x1 << 16) | (x2 << 8) | x3 def read_tag(stream): # header size: 15 bytes header = stream.read(15) if len(header) == 4: return x = struct.unpack('>IBBBBBBBBBBB', header) previous_tag_size = x[0] data_type = x[1] body_size = (x[2] << 16) | (x[3] << 8) | x[4] assert body_size < 1024 * 1024 * 128, 'tag body size too big (> 128MB)' timestamp = (x[5] << 16) | (x[6] << 8) | x[7] timestamp += x[8] << 24 assert x[9:] == (0, 0, 0) body = stream.read(body_size) return (data_type, timestamp, body_size, body, previous_tag_size) #previous_tag_size = read_uint(stream) #data_type = read_byte(stream) #body_size = read_unsigned_medium_int(stream) #assert body_size < 1024*1024*128, 'tag body size too big (> 128MB)' #timestamp = read_unsigned_medium_int(stream) #timestamp += read_byte(stream) << 24 #assert read_unsigned_medium_int(stream) == 0 #body = stream.read(body_size) #return (data_type, timestamp, body_size, body, previous_tag_size) def write_tag(stream, tag): data_type, timestamp, body_size, body, previous_tag_size = tag write_uint(stream, previous_tag_size) write_byte(stream, data_type) write_byte(stream, body_size>>16 & 0xff) write_byte(stream, body_size>>8 & 0xff) write_byte(stream, body_size & 0xff) write_byte(stream, timestamp>>16 & 0xff) write_byte(stream, timestamp>>8 & 0xff) write_byte(stream, timestamp & 0xff) write_byte(stream, timestamp>>24 & 0xff) stream.write(b'\0\0\0') stream.write(body) def read_flv_header(stream): assert stream.read(3) == b'FLV' header_version = read_byte(stream) assert header_version == 1 type_flags = read_byte(stream) assert type_flags == 5 data_offset = read_uint(stream) assert data_offset == 9 def write_flv_header(stream): stream.write(b'FLV') write_byte(stream, 1) write_byte(stream, 5) write_uint(stream, 9) def read_meta_data(stream): meta_type = read_amf(stream) meta = read_amf(stream) return meta_type, meta def read_meta_tag(tag): data_type, timestamp, body_size, body, previous_tag_size = tag assert data_type == TAG_TYPE_METADATA assert timestamp == 0 assert previous_tag_size == 0 return read_meta_data(BytesIO(body)) #def write_meta_data(stream, meta_type, meta_data): # assert 
isinstance(meta_type, basesting) # write_amf(meta_type) # write_amf(meta_data) def write_meta_tag(stream, meta_type, meta_data): buffer = BytesIO() write_amf(buffer, meta_type) write_amf(buffer, meta_data) body = buffer.getvalue() write_tag(stream, (TAG_TYPE_METADATA, 0, len(body), body, 0)) ################################################## # main ################################################## def guess_output(inputs): import os.path inputs = map(os.path.basename, inputs) n = min(map(len, inputs)) for i in reversed(range(1, n)): if len(set(s[:i] for s in inputs)) == 1: return inputs[0][:i] + '.flv' return 'output.flv' def concat_flv(flvs, output = None): assert flvs, 'no flv file found' import os.path if not output: output = guess_output(flvs) elif os.path.isdir(output): output = os.path.join(output, guess_output(flvs)) print('Merging video parts...') ins = [open(flv, 'rb') for flv in flvs] for stream in ins: read_flv_header(stream) meta_tags = map(read_tag, ins) metas = list(map(read_meta_tag, meta_tags)) meta_types, metas = zip(*metas) assert len(set(meta_types)) == 1 meta_type = meta_types[0] # must merge fields: duration # TODO: check other meta info, update other meta info total_duration = sum(meta.get('duration') for meta in metas) meta_data = metas[0] meta_data.set('duration', total_duration) out = open(output, 'wb') write_flv_header(out) write_meta_tag(out, meta_type, meta_data) timestamp_start = 0 for stream in ins: while True: tag = read_tag(stream) if tag: data_type, timestamp, body_size, body, previous_tag_size = tag timestamp += timestamp_start tag = data_type, timestamp, body_size, body, previous_tag_size write_tag(out, tag) else: break timestamp_start = timestamp write_uint(out, previous_tag_size) return output def usage(): print('Usage: [python3] join_flv.py --output TARGET.flv flv...') def main(): import sys, getopt try: opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "output="]) except getopt.GetoptError as err: usage() sys.exit(1) output = None for o, a in opts: if o in ("-h", "--help"): usage() sys.exit() elif o in ("-o", "--output"): output = a else: usage() sys.exit(1) if not args: usage() sys.exit(1) concat_flv(args, output) if __name__ == '__main__': main() ================================================ FILE: src/you_get/processor/join_mp4.py ================================================ #!/usr/bin/env python # reference: c041828_ISO_IEC_14496-12_2005(E).pdf ################################################## # reader and writer ################################################## import struct from io import BytesIO def skip(stream, n): stream.seek(stream.tell() + n) def skip_zeros(stream, n): assert stream.read(n) == b'\x00' * n def read_int(stream): return struct.unpack('>i', stream.read(4))[0] def read_uint(stream): return struct.unpack('>I', stream.read(4))[0] def write_uint(stream, n): stream.write(struct.pack('>I', n)) def write_ulong(stream, n): stream.write(struct.pack('>Q', n)) def read_ushort(stream): return struct.unpack('>H', stream.read(2))[0] def read_ulong(stream): return struct.unpack('>Q', stream.read(8))[0] def read_byte(stream): return ord(stream.read(1)) def copy_stream(source, target, n): buffer_size = 1024 * 1024 while n > 0: to_read = min(buffer_size, n) s = source.read(to_read) assert len(s) == to_read, 'no enough data' target.write(s) n -= to_read class Atom: def __init__(self, type, size, body): assert len(type) == 4 self.type = type self.size = size self.body = body def __str__(self): #return '' % (self.type, 
repr(self.body)) return '' % (self.type, '') def __repr__(self): return str(self) def write1(self, stream): write_uint(stream, self.size) stream.write(self.type) def write(self, stream): assert type(self.body) == bytes, '%s: %s' % (self.type, type(self.body)) assert self.size == 8 + len(self.body) self.write1(stream) stream.write(self.body) def calsize(self): return self.size class CompositeAtom(Atom): def __init__(self, type, size, body): assert isinstance(body, list) Atom.__init__(self, type, size, body) def write(self, stream): assert type(self.body) == list self.write1(stream) for atom in self.body: atom.write(stream) def calsize(self): self.size = 8 + sum([atom.calsize() for atom in self.body]) return self.size def get1(self, k): for a in self.body: if a.type == k: return a else: raise Exception('atom not found: ' + k) def get(self, *keys): atom = self for k in keys: atom = atom.get1(k) return atom def get_all(self, k): return list(filter(lambda x: x.type == k, self.body)) class VariableAtom(Atom): def __init__(self, type, size, body, variables): assert isinstance(body, bytes) Atom.__init__(self, type, size, body) self.variables = variables def write(self, stream): self.write1(stream) i = 0 n = 0 for name, offset, value, bsize in self.variables: stream.write(self.body[i:offset]) if bsize == 4: write_uint(stream, value) elif bsize == 8: write_ulong(stream, value) else: raise NotImplementedError() n += offset - i + bsize i = offset + bsize stream.write(self.body[i:]) n += len(self.body) - i assert n == len(self.body) def get(self, k): for v in self.variables: if v[0] == k: return v[2] else: raise Exception('field not found: ' + k) def set(self, k, v): for i in range(len(self.variables)): variable = self.variables[i] if variable[0] == k: self.variables[i] = (k, variable[1], v, variable[3]) break else: raise Exception('field not found: '+k) def read_raw(stream, size, left, type): assert size == left + 8 body = stream.read(left) return Atom(type, size, body) def read_udta(stream, size, left, type): assert size == left + 8 body = stream.read(left) class Udta(Atom): def write(self, stream): return def calsize(self): return 0 return Udta(type, size, body) def read_body_stream(stream, left): body = stream.read(left) assert len(body) == left return body, BytesIO(body) def read_full_atom(stream): value = read_uint(stream) version = value >> 24 flags = value & 0xffffff assert version == 0 return value def read_full_atom2(stream): value = read_uint(stream) version = value >> 24 flags = value & 0xffffff return version, value def read_mvhd(stream, size, left, type): body, stream = read_body_stream(stream, left) value = read_full_atom(stream) left -= 4 # new Date(movieTime * 1000 - 2082850791998L); creation_time = read_uint(stream) modification_time = read_uint(stream) time_scale = read_uint(stream) duration = read_uint(stream) left -= 16 qt_preferred_fate = read_uint(stream) qt_preferred_volume = read_ushort(stream) assert stream.read(10) == b'\x00' * 10 qt_matrixA = read_uint(stream) qt_matrixB = read_uint(stream) qt_matrixU = read_uint(stream) qt_matrixC = read_uint(stream) qt_matrixD = read_uint(stream) qt_matrixV = read_uint(stream) qt_matrixX = read_uint(stream) qt_matrixY = read_uint(stream) qt_matrixW = read_uint(stream) qt_previewTime = read_uint(stream) qt_previewDuration = read_uint(stream) qt_posterTime = read_uint(stream) qt_selectionTime = read_uint(stream) qt_selectionDuration = read_uint(stream) qt_currentTime = read_uint(stream) nextTrackID = read_uint(stream) left -= 80 assert left 
== 0 return VariableAtom(b'mvhd', size, body, [('duration', 16, duration, 4)]) def read_tkhd(stream, size, left, type): body, stream = read_body_stream(stream, left) value = read_full_atom(stream) left -= 4 # new Date(movieTime * 1000 - 2082850791998L); creation_time = read_uint(stream) modification_time = read_uint(stream) track_id = read_uint(stream) assert stream.read(4) == b'\x00' * 4 duration = read_uint(stream) left -= 20 assert stream.read(8) == b'\x00' * 8 qt_layer = read_ushort(stream) qt_alternate_group = read_ushort(stream) qt_volume = read_ushort(stream) assert stream.read(2) == b'\x00\x00' qt_matrixA = read_uint(stream) qt_matrixB = read_uint(stream) qt_matrixU = read_uint(stream) qt_matrixC = read_uint(stream) qt_matrixD = read_uint(stream) qt_matrixV = read_uint(stream) qt_matrixX = read_uint(stream) qt_matrixY = read_uint(stream) qt_matrixW = read_uint(stream) qt_track_width = read_uint(stream) width = qt_track_width >> 16 qt_track_height = read_uint(stream) height = qt_track_height >> 16 left -= 60 assert left == 0 return VariableAtom(b'tkhd', size, body, [('duration', 20, duration, 4)]) def read_mdhd(stream, size, left, type): body, stream = read_body_stream(stream, left) ver, value = read_full_atom2(stream) left -= 4 if ver == 1: creation_time = read_ulong(stream) modification_time = read_ulong(stream) time_scale = read_uint(stream) duration = read_ulong(stream) var = [('duration', 24, duration, 8)] left -= 28 else: assert ver == 0, "ver=%d" % ver creation_time = read_uint(stream) modification_time = read_uint(stream) time_scale = read_uint(stream) duration = read_uint(stream) var = [('duration', 16, duration, 4)] left -= 16 packed_language = read_ushort(stream) qt_quality = read_ushort(stream) left -= 4 assert left == 0 return VariableAtom(b'mdhd', size, body, var) def read_hdlr(stream, size, left, type): body, stream = read_body_stream(stream, left) value = read_full_atom(stream) left -= 4 qt_component_type = read_uint(stream) handler_type = read_uint(stream) qt_component_manufacturer = read_uint(stream) qt_component_flags = read_uint(stream) qt_component_flags_mask = read_uint(stream) left -= 20 track_name = stream.read(left) #assert track_name[-1] == b'\x00' return Atom(b'hdlr', size, body) def read_vmhd(stream, size, left, type): body, stream = read_body_stream(stream, left) value = read_full_atom(stream) left -= 4 assert left == 8 graphic_mode = read_ushort(stream) op_color_read = read_ushort(stream) op_color_green = read_ushort(stream) op_color_blue = read_ushort(stream) return Atom(b'vmhd', size, body) def read_stsd(stream, size, left, type): value = read_full_atom(stream) left -= 4 entry_count = read_uint(stream) left -= 4 children = [] for i in range(entry_count): atom = read_atom(stream) children.append(atom) left -= atom.size assert left == 0 #return Atom('stsd', size, children) class stsd_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, len(self.body[1])) for atom in self.body[1]: atom.write(stream) def calsize(self): oldsize = self.size # TODO: remove self.size = 8 + 4 + 4 + sum([atom.calsize() for atom in self.body[1]]) assert oldsize == self.size, '%s: %d, %d' % (self.type, oldsize, self.size) # TODO: remove return self.size return stsd_atom(b'stsd', size, (value, children)) def read_avc1(stream, size, left, type): body, stream = read_body_stream(stream, left) skip_zeros(stream, 6) data_reference_index = 
read_ushort(stream) skip_zeros(stream, 2) skip_zeros(stream, 2) skip_zeros(stream, 12) width = read_ushort(stream) height = read_ushort(stream) horizontal_rez = read_uint(stream) >> 16 vertical_rez = read_uint(stream) >> 16 assert stream.read(4) == b'\x00' * 4 frame_count = read_ushort(stream) string_len = read_byte(stream) compressor_name = stream.read(31) depth = read_ushort(stream) assert stream.read(2) == b'\xff\xff' left -= 78 child = read_atom(stream) assert child.type in (b'avcC', b'pasp'), 'if the sub atom is not avcC or pasp (actual %s), you should not cache raw body' % child.type left -= child.size stream.read(left) # XXX return Atom(b'avc1', size, body) def read_avcC(stream, size, left, type): stream.read(left) return Atom(b'avcC', size, None) def read_stts(stream, size, left, type): value = read_full_atom(stream) left -= 4 entry_count = read_uint(stream) #assert entry_count == 1 left -= 4 samples = [] for i in range(entry_count): sample_count = read_uint(stream) sample_duration = read_uint(stream) samples.append((sample_count, sample_duration)) left -= 8 assert left == 0 #return Atom('stts', size, None) class stts_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, len(self.body[1])) for sample_count, sample_duration in self.body[1]: write_uint(stream, sample_count) write_uint(stream, sample_duration) def calsize(self): #oldsize = self.size # TODO: remove self.size = 8 + 4 + 4 + len(self.body[1]) * 8 #assert oldsize == self.size, '%s: %d, %d' % (self.type, oldsize, self.size) # TODO: remove return self.size return stts_atom(b'stts', size, (value, samples)) def read_stss(stream, size, left, type): value = read_full_atom(stream) left -= 4 entry_count = read_uint(stream) left -= 4 samples = [] for i in range(entry_count): sample = read_uint(stream) samples.append(sample) left -= 4 assert left == 0 #return Atom('stss', size, None) class stss_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, len(self.body[1])) for sample in self.body[1]: write_uint(stream, sample) def calsize(self): self.size = 8 + 4 + 4 + len(self.body[1]) * 4 return self.size return stss_atom(b'stss', size, (value, samples)) def read_stsc(stream, size, left, type): value = read_full_atom(stream) left -= 4 entry_count = read_uint(stream) left -= 4 chunks = [] for i in range(entry_count): first_chunk = read_uint(stream) samples_per_chunk = read_uint(stream) sample_description_index = read_uint(stream) assert sample_description_index == 1 # what is it? 
chunks.append((first_chunk, samples_per_chunk, sample_description_index)) left -= 12 #chunks, samples = zip(*chunks) #total = 0 #for c, s in zip(chunks[1:], samples): # total += c*s #print 'total', total assert left == 0 #return Atom('stsc', size, None) class stsc_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, len(self.body[1])) for first_chunk, samples_per_chunk, sample_description_index in self.body[1]: write_uint(stream, first_chunk) write_uint(stream, samples_per_chunk) write_uint(stream, sample_description_index) def calsize(self): self.size = 8 + 4 + 4 + len(self.body[1]) * 12 return self.size return stsc_atom(b'stsc', size, (value, chunks)) def read_stsz(stream, size, left, type): value = read_full_atom(stream) left -= 4 sample_size = read_uint(stream) sample_count = read_uint(stream) left -= 8 assert sample_size == 0 total = 0 sizes = [] if sample_size == 0: for i in range(sample_count): entry_size = read_uint(stream) sizes.append(entry_size) total += entry_size left -= 4 assert left == 0 #return Atom('stsz', size, None) class stsz_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, self.body[1]) write_uint(stream, self.body[2]) for entry_size in self.body[3]: write_uint(stream, entry_size) def calsize(self): self.size = 8 + 4 + 8 + len(self.body[3]) * 4 return self.size return stsz_atom(b'stsz', size, (value, sample_size, sample_count, sizes)) def read_stco(stream, size, left, type): value = read_full_atom(stream) left -= 4 entry_count = read_uint(stream) left -= 4 offsets = [] for i in range(entry_count): chunk_offset = read_uint(stream) offsets.append(chunk_offset) left -= 4 assert left == 0 #return Atom('stco', size, None) class stco_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, len(self.body[1])) for chunk_offset in self.body[1]: write_uint(stream, chunk_offset) def calsize(self): self.size = 8 + 4 + 4 + len(self.body[1]) * 4 return self.size return stco_atom(b'stco', size, (value, offsets)) def read_ctts(stream, size, left, type): value = read_full_atom(stream) left -= 4 entry_count = read_uint(stream) left -= 4 samples = [] for i in range(entry_count): sample_count = read_uint(stream) sample_offset = read_uint(stream) samples.append((sample_count, sample_offset)) left -= 8 assert left == 0 class ctts_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) write_uint(stream, self.body[0]) write_uint(stream, len(self.body[1])) for sample_count, sample_offset in self.body[1]: write_uint(stream, sample_count) write_uint(stream, sample_offset) def calsize(self): self.size = 8 + 4 + 4 + len(self.body[1]) * 8 return self.size return ctts_atom(b'ctts', size, (value, samples)) def read_smhd(stream, size, left, type): body, stream = read_body_stream(stream, left) value = read_full_atom(stream) left -= 4 balance = read_ushort(stream) assert stream.read(2) == b'\x00\x00' left -= 4 assert left == 0 return Atom(b'smhd', size, body) def read_mp4a(stream, size, left, type): body, stream = read_body_stream(stream, left) assert stream.read(6) == b'\x00' * 6 data_reference_index = read_ushort(stream) assert 
stream.read(8) == b'\x00' * 8 channel_count = read_ushort(stream) sample_size = read_ushort(stream) assert stream.read(4) == b'\x00' * 4 time_scale = read_ushort(stream) assert stream.read(2) == b'\x00' * 2 left -= 28 atom = read_atom(stream) assert atom.type == b'esds' left -= atom.size assert left == 0 return Atom(b'mp4a', size, body) def read_descriptor(stream): tag = read_byte(stream) raise NotImplementedError() def read_esds(stream, size, left, type): value = read_uint(stream) version = value >> 24 assert version == 0 flags = value & 0xffffff left -= 4 body = stream.read(left) return Atom(b'esds', size, None) def read_composite_atom(stream, size, left, type): children = [] while left > 0: atom = read_atom(stream) children.append(atom) left -= atom.size assert left == 0, left return CompositeAtom(type, size, children) def read_mdat(stream, size, left, type): source_start = stream.tell() source_size = left skip(stream, left) #return Atom(type, size, None) #raise NotImplementedError() class mdat_atom(Atom): def __init__(self, type, size, body): Atom.__init__(self, type, size, body) def write(self, stream): self.write1(stream) self.write2(stream) def write2(self, stream): source, source_start, source_size = self.body original = source.tell() source.seek(source_start) copy_stream(source, stream, source_size) def calsize(self): return self.size return mdat_atom(b'mdat', size, (stream, source_start, source_size)) atom_readers = { b'mvhd': read_mvhd, # merge duration b'tkhd': read_tkhd, # merge duration b'mdhd': read_mdhd, # merge duration b'hdlr': read_hdlr, # nothing b'vmhd': read_vmhd, # nothing b'stsd': read_stsd, # nothing b'avc1': read_avc1, # nothing b'avcC': read_avcC, # nothing b'stts': read_stts, # sample_count, sample_duration b'stss': read_stss, # join indexes b'stsc': read_stsc, # merge # sample numbers b'stsz': read_stsz, # merge # samples b'stco': read_stco, # merge # chunk offsets b'ctts': read_ctts, # merge b'smhd': read_smhd, # nothing b'mp4a': read_mp4a, # nothing b'esds': read_esds, # noting b'ftyp': read_raw, b'yqoo': read_raw, b'moov': read_composite_atom, b'trak': read_composite_atom, b'mdia': read_composite_atom, b'minf': read_composite_atom, b'dinf': read_composite_atom, b'stbl': read_composite_atom, b'iods': read_raw, b'dref': read_raw, b'free': read_raw, b'edts': read_raw, b'pasp': read_raw, b'mdat': read_mdat, b'udta': read_udta, } #stsd sample descriptions (codec types, initialization etc.) 
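Every box registered in atom_readers above begins with the same 8-byte header that read_atom() below parses: a 32-bit big-endian size (which includes the header itself) followed by a four-byte type code. A minimal sketch of just that header walk, over an in-memory dummy 'free' box:

    import struct
    from io import BytesIO

    def read_header(stream):
        raw = stream.read(8)
        if len(raw) < 8:
            return None
        size = struct.unpack('>I', raw[:4])[0]  # total box size, header included
        return size, raw[4:8]

    box = struct.pack('>I', 16) + b'free' + b'\x00' * 8  # 16-byte dummy box
    print(read_header(BytesIO(box)))  # (16, b'free')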
def read_atom(stream):
    header = stream.read(8)
    if not header:
        return
    assert len(header) == 8
    n = 0
    size = struct.unpack('>I', header[:4])[0]
    assert size > 0
    n += 4
    type = header[4:8]
    n += 4
    assert type != b'uuid'
    if size == 1:
        size = read_ulong(stream)
        n += 8

    left = size - n
    if type in atom_readers:
        return atom_readers[type](stream, size, left, type)
    raise NotImplementedError('%s: %d' % (type, left))

def write_atom(stream, atom):
    atom.write(stream)

def parse_atoms(stream):
    atoms = []
    while True:
        atom = read_atom(stream)
        if atom:
            atoms.append(atom)
        else:
            break
    return atoms

def read_mp4(stream):
    print(stream.name)
    atoms = parse_atoms(stream)
    moov = list(filter(lambda x: x.type == b'moov', atoms))
    mdat = list(filter(lambda x: x.type == b'mdat', atoms))
    assert len(moov) == 1
    assert len(mdat) == 1
    moov = moov[0]
    mdat = mdat[0]
    return atoms, moov, mdat

##################################################
# merge
##################################################

def merge_stts(samples_list):
    sample_list = []
    for samples in samples_list:
        #assert len(samples) == 1
        #sample_list.append(samples[0])
        sample_list += samples
    counts, durations = zip(*sample_list)
    #assert len(set(durations)) == 1, 'not all durations equal'
    if len(set(durations)) == 1:
        return [(sum(counts), durations[0])]
    return sample_list

def merge_stss(samples, sample_number_list):
    results = []
    start = 0
    for samples, sample_number_list in zip(samples, sample_number_list):
        results.extend(map(lambda x: start + x, samples))
        start += sample_number_list
    return results

def merge_stsc(chunks_list, total_chunk_number_list):
    results = []
    chunk_index = 1
    for chunks, total in zip(chunks_list, total_chunk_number_list):
        for i in range(len(chunks)):
            if i < len(chunks) - 1:
                chunk_number = chunks[i + 1][0] - chunks[i][0]
            else:
                chunk_number = total + 1 - chunks[i][0]
            sample_number = chunks[i][1]
            description = chunks[i][2]
            results.append((chunk_index, sample_number, description))
            chunk_index += chunk_number
    return results

def merge_stco(offsets_list, mdats):
    offset = 0
    results = []
    for offsets, mdat in zip(offsets_list, mdats):
        results.extend(offset + x - mdat.body[1] for x in offsets)
        offset += mdat.size - 8
    return results

def merge_stsz(sizes_list):
    return sum(sizes_list, [])

def merge_mdats(mdats):
    total_size = sum(x.size - 8 for x in mdats) + 8
    class multi_mdat_atom(Atom):
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            self.write2(stream)
        def write2(self, stream):
            for mdat in self.body:
                mdat.write2(stream)
        def calsize(self):
            return self.size
    return multi_mdat_atom(b'mdat', total_size, mdats)
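
# Illustrative self-check of merge_stco()'s offset rebasing (hypothetical
# stand-ins that mimic only the .size and .body fields the function reads;
# not part of the module's normal flow): each part's chunk offsets are made
# relative to its own mdat payload, then shifted by the total payload size
# of all preceding parts.
def _demo_merge_stco():
    class _FakeMdat:
        def __init__(self, source_start, payload_size):
            self.size = payload_size + 8  # 8-byte atom header + payload
            self.body = (None, source_start, payload_size)
    parts = [_FakeMdat(1000, 5000), _FakeMdat(250, 3000)]
    # part 1: 0 + 1200 - 1000 = 200; part 2: 5000 + 300 - 250 = 5050
    assert merge_stco([[1200], [300]], parts) == [200, 5050]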
def merge_moov(moovs, mdats):
    mvhd_duration = 0
    for x in moovs:
        mvhd_duration += x.get(b'mvhd').get('duration')
    tkhd_durations = [0, 0]
    mdhd_durations = [0, 0]
    for x in moovs:
        traks = x.get_all(b'trak')
        assert len(traks) == 2
        tkhd_durations[0] += traks[0].get(b'tkhd').get('duration')
        tkhd_durations[1] += traks[1].get(b'tkhd').get('duration')
        mdhd_durations[0] += traks[0].get(b'mdia', b'mdhd').get('duration')
        mdhd_durations[1] += traks[1].get(b'mdia', b'mdhd').get('duration')
    #mvhd_duration = min(mvhd_duration, tkhd_durations)

    trak0s = [x.get_all(b'trak')[0] for x in moovs]
    trak1s = [x.get_all(b'trak')[1] for x in moovs]

    stts0 = merge_stts(x.get(b'mdia', b'minf', b'stbl', b'stts').body[1] for x in trak0s)
    stts1 = merge_stts(x.get(b'mdia', b'minf', b'stbl', b'stts').body[1] for x in trak1s)

    stss = merge_stss((x.get(b'mdia', b'minf', b'stbl', b'stss').body[1] for x in trak0s),
                      (len(x.get(b'mdia', b'minf', b'stbl', b'stsz').body[3]) for x in trak0s))

    stsc0 = merge_stsc((x.get(b'mdia', b'minf', b'stbl', b'stsc').body[1] for x in trak0s),
                       (len(x.get(b'mdia', b'minf', b'stbl', b'stco').body[1]) for x in trak0s))
    stsc1 = merge_stsc((x.get(b'mdia', b'minf', b'stbl', b'stsc').body[1] for x in trak1s),
                       (len(x.get(b'mdia', b'minf', b'stbl', b'stco').body[1]) for x in trak1s))

    stco0 = merge_stco((x.get(b'mdia', b'minf', b'stbl', b'stco').body[1] for x in trak0s), mdats)
    stco1 = merge_stco((x.get(b'mdia', b'minf', b'stbl', b'stco').body[1] for x in trak1s), mdats)

    stsz0 = merge_stsz((x.get(b'mdia', b'minf', b'stbl', b'stsz').body[3] for x in trak0s))
    stsz1 = merge_stsz((x.get(b'mdia', b'minf', b'stbl', b'stsz').body[3] for x in trak1s))

    ctts = sum((x.get(b'mdia', b'minf', b'stbl', b'ctts').body[1] for x in trak0s), [])

    moov = moovs[0]
    moov.get(b'mvhd').set('duration', mvhd_duration)
    trak0 = moov.get_all(b'trak')[0]
    trak1 = moov.get_all(b'trak')[1]
    trak0.get(b'tkhd').set('duration', tkhd_durations[0])
    trak1.get(b'tkhd').set('duration', tkhd_durations[1])
    trak0.get(b'mdia', b'mdhd').set('duration', mdhd_durations[0])
    trak1.get(b'mdia', b'mdhd').set('duration', mdhd_durations[1])

    stts_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stts')
    stts_atom.body = stts_atom.body[0], stts0
    stts_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stts')
    stts_atom.body = stts_atom.body[0], stts1

    stss_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stss')
    stss_atom.body = stss_atom.body[0], stss

    stsc_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stsc')
    stsc_atom.body = stsc_atom.body[0], stsc0
    stsc_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stsc')
    stsc_atom.body = stsc_atom.body[0], stsc1

    stco_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco0
    stco_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco1

    stsz_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stsz')
    stsz_atom.body = stsz_atom.body[0], stsz_atom.body[1], len(stsz0), stsz0
    stsz_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stsz')
    stsz_atom.body = stsz_atom.body[0], stsz_atom.body[1], len(stsz1), stsz1

    ctts_atom = trak0.get(b'mdia', b'minf', b'stbl', b'ctts')
    ctts_atom.body = ctts_atom.body[0], ctts

    old_moov_size = moov.size
    new_moov_size = moov.calsize()
    new_mdat_start = mdats[0].body[1] + new_moov_size - old_moov_size
    stco0 = list(map(lambda x: x + new_mdat_start, stco0))
    stco1 = list(map(lambda x: x + new_mdat_start, stco1))
    stco_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco0
    stco_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco1

    return moov
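
# Note on the final stco pass in merge_moov() above: merge_stco() produced
# offsets relative to the start of the merged mdat payload, and that payload
# moves when the merged moov changes size. Illustrative numbers: if the first
# part's payload began at byte 1000 and the merged moov grew by 256 bytes,
# then new_mdat_start = 1000 + 256 = 1256, and a payload-relative offset of
# 200 is written out as 1456.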
def merge_mp4s(files, output):
    assert files
    ins = [open(mp4, 'rb') for mp4 in files]
    mp4s = list(map(read_mp4, ins))
    moovs = list(map(lambda x: x[1], mp4s))
    mdats = list(map(lambda x: x[2], mp4s))
    moov = merge_moov(moovs, mdats)
    mdat = merge_mdats(mdats)
    with open(output, 'wb') as output:
        for x in mp4s[0][0]:
            if x.type == b'moov':
                moov.write(output)
            elif x.type == b'mdat':
                mdat.write(output)
            else:
                x.write(output)

##################################################
# main
##################################################

# TODO: FIXME: duplicate of join_flv
def guess_output(inputs):
    import os.path
    inputs = list(map(os.path.basename, inputs))  # list() so we can iterate twice
    n = min(map(len, inputs))
    for i in reversed(range(1, n)):
        if len(set(s[:i] for s in inputs)) == 1:
            return inputs[0][:i] + '.mp4'
    return 'output.mp4'

def concat_mp4(mp4s, output=None):
    assert mp4s, 'no mp4 file found'
    import os.path
    if not output:
        output = guess_output(mp4s)
    elif os.path.isdir(output):
        output = os.path.join(output, guess_output(mp4s))

    print('Merging video parts...')
    merge_mp4s(mp4s, output)

    return output

def usage():
    print('Usage: [python3] join_mp4.py --output TARGET.mp4 mp4...')

def main():
    import sys, getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError as err:
        usage()
        sys.exit(1)
    output = None
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-o", "--output"):
            output = a
        else:
            usage()
            sys.exit(1)
    if not args:
        usage()
        sys.exit(1)

    concat_mp4(args, output)

if __name__ == '__main__':
    main()
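
# Typical use (hypothetical file names), either from Python or the CLI; with
# no -o/--output, guess_output() derives the name from the common prefix:
#
#     from you_get.processor.join_mp4 import concat_mp4
#     concat_mp4(['clip-01.mp4', 'clip-02.mp4'])   # writes 'clip-0.mp4'
#
#     $ python3 join_mp4.py -o full.mp4 clip-01.mp4 clip-02.mp4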

================================================
FILE: src/you_get/processor/join_ts.py
================================================
#!/usr/bin/env python

import struct
from io import BytesIO

##################################################
# main
##################################################

def guess_output(inputs):
    import os.path
    inputs = list(map(os.path.basename, inputs))  # list() so we can iterate twice
    n = min(map(len, inputs))
    for i in reversed(range(1, n)):
        if len(set(s[:i] for s in inputs)) == 1:
            return inputs[0][:i] + '.ts'
    return 'output.ts'

def concat_ts(ts_parts, output=None):
    assert ts_parts, 'no ts files found'
    import os.path
    if not output:
        output = guess_output(ts_parts)
    elif os.path.isdir(output):
        output = os.path.join(output, guess_output(ts_parts))

    print('Merging video parts...')
    ts_out_file = open(output, "wb")
    for ts_in in ts_parts:
        ts_in_file = open(ts_in, "rb")
        ts_in_data = ts_in_file.read()
        ts_in_file.close()
        ts_out_file.write(ts_in_data)
    ts_out_file.close()

    return output

def usage():
    print('Usage: [python3] join_ts.py --output TARGET.ts ts...')

def main():
    import sys, getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError as err:
        usage()
        sys.exit(1)
    output = None
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-o", "--output"):
            output = a
        else:
            usage()
            sys.exit(1)
    if not args:
        usage()
        sys.exit(1)

    concat_ts(args, output)

if __name__ == '__main__':
    main()

================================================
FILE: src/you_get/processor/rtmpdump.py
================================================
#!/usr/bin/env python

import os.path
import subprocess

def get_usable_rtmpdump(cmd):
    try:
        p = subprocess.Popen([cmd], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        return cmd
    except:
        return None

RTMPDUMP = get_usable_rtmpdump('rtmpdump')

def has_rtmpdump_installed():
    return RTMPDUMP is not None

# params = {"-y": "playlist", "-q": None}
# A key whose value is None is passed as a bare flag (key only).
# -r and -o should not be included in params.
def download_rtmpdump_stream(url, title, ext, params={}, output_dir='.'):
    filename = '%s.%s' % (title, ext)
    filepath = os.path.join(output_dir, filename)

    cmdline = [RTMPDUMP, '-r']
    cmdline.append(url)
    cmdline.append('-o')
    cmdline.append(filepath)

    for key in params.keys():
        cmdline.append(key)
        if params[key] is not None:
            cmdline.append(params[key])
    # cmdline.append('-y')
    # cmdline.append(playpath)
    print("Call rtmpdump:\n" + " ".join(cmdline) + "\n")
    subprocess.call(cmdline)
    return

def play_rtmpdump_stream(player, url, params={}):
    # construct left side of pipe
    cmdline = [RTMPDUMP, '-r']
    cmdline.append(url)
    # append other params if they exist
    for key in params.keys():
        cmdline.append(key)
        if params[key] is not None:
            cmdline.append(params[key])
    cmdline.append('-o')
    cmdline.append('-')
    # pipe start
    cmdline.append('|')
    cmdline.append(player)
    cmdline.append('-')
    # logging
    print("Call rtmpdump:\n" + " ".join(cmdline) + "\n")
    # call rtmpdump!
    subprocess.call(cmdline)
    # os.system("rtmpdump -r '%s' -y '%s' -o - | %s -" % (url, playpath, player))
    return
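
# Example call (hypothetical URL and title), following the params convention
# documented above: a value of None makes the key a bare flag, a string value
# is appended after its key; -r and -o are added by the function itself.
#
#     download_rtmpdump_stream('rtmp://example.com/live/stream', 'my_show',
#                              'flv', params={'-y': 'playlist', '-q': None},
#                              output_dir='.')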

================================================
FILE: src/you_get/util/fs.py
================================================
#!/usr/bin/env python

from .os import detect_os

def legitimize(text, os=detect_os()):
    """Converts a string to a valid filename.
    """
    # POSIX systems
    text = text.translate({
        0: None,
        ord('/'): '-',
        ord('|'): '-',
    })

    # FIXME: do some filesystem detection
    if os == 'windows' or os == 'cygwin' or os == 'wsl':
        # Windows (non-POSIX namespace)
        text = text.translate({
            # Reserved in Windows VFAT and NTFS
            ord(':'): '-',
            ord('*'): '-',
            ord('?'): '-',
            ord('\\'): '-',
            ord('\"'): '\'',
            # Reserved in Windows VFAT
            ord('+'): '-',
            ord('<'): '-',
            ord('>'): '-',
            ord('['): '(',
            ord(']'): ')',
            ord('\t'): ' ',
        })
    else:
        # *nix
        if os == 'mac':
            # Mac OS HFS+
            text = text.translate({
                ord(':'): '-',
            })

        # Remove leading .
        if text.startswith("."):
            text = text[1:]

    text = text[:80]  # Trim to 80 Unicode characters
    return text

================================================
FILE: src/you_get/util/git.py
================================================
#!/usr/bin/env python

import os
import subprocess
from ..version import __version__

def get_head(repo_path):
    """Get (branch, commit) from HEAD of a git repo."""
    try:
        ref = open(os.path.join(repo_path, '.git', 'HEAD'), 'r').read().strip()[5:].split('/')
        branch = ref[-1]
        commit = open(os.path.join(repo_path, '.git', *ref), 'r').read().strip()[:7]
        return branch, commit
    except:
        return None

def get_version(repo_path):
    try:
        version = __version__.split('.')
        major, minor, cn = [int(i) for i in version]
        p = subprocess.Popen(['git',
                              '--git-dir', os.path.join(repo_path, '.git'),
                              '--work-tree', repo_path,
                              'rev-list', 'HEAD', '--count'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        raw, err = p.communicate()
        c_head = int(raw.decode('ascii'))
        q = subprocess.Popen(['git',
                              '--git-dir', os.path.join(repo_path, '.git'),
                              '--work-tree', repo_path,
                              'rev-list', 'master', '--count'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        raw, err = q.communicate()
        c_master = int(raw.decode('ascii'))
        cc = c_head - c_master
        assert cc
        return '%s.%s.%s' % (major, minor, cn + cc)
    except:
        return __version__

================================================
FILE: src/you_get/util/log.py
================================================
#!/usr/bin/env python
# This file is Python 2 compliant.

from ..version import script_name

import os, sys

TERM = os.getenv('TERM', '')
IS_ANSI_TERMINAL = TERM in (
    'eterm-color',
    'linux',
    'screen',
    'vt100',
) or TERM.startswith('xterm')

# ANSI escape codes
RESET = 0
BOLD = 1
UNDERLINE = 4
NEGATIVE = 7
NO_BOLD = 21
NO_UNDERLINE = 24
POSITIVE = 27
BLACK = 30
RED = 31
GREEN = 32
YELLOW = 33
BLUE = 34
MAGENTA = 35
CYAN = 36
LIGHT_GRAY = 37
DEFAULT = 39
BLACK_BACKGROUND = 40
RED_BACKGROUND = 41
GREEN_BACKGROUND = 42
YELLOW_BACKGROUND = 43
BLUE_BACKGROUND = 44
MAGENTA_BACKGROUND = 45
CYAN_BACKGROUND = 46
LIGHT_GRAY_BACKGROUND = 47
DEFAULT_BACKGROUND = 49
DARK_GRAY = 90  # xterm
LIGHT_RED = 91  # xterm
LIGHT_GREEN = 92  # xterm
LIGHT_YELLOW = 93  # xterm
LIGHT_BLUE = 94  # xterm
LIGHT_MAGENTA = 95  # xterm
LIGHT_CYAN = 96  # xterm
WHITE = 97  # xterm
DARK_GRAY_BACKGROUND = 100  # xterm
LIGHT_RED_BACKGROUND = 101  # xterm
LIGHT_GREEN_BACKGROUND = 102  # xterm
LIGHT_YELLOW_BACKGROUND = 103  # xterm
LIGHT_BLUE_BACKGROUND = 104  # xterm
LIGHT_MAGENTA_BACKGROUND = 105  # xterm
LIGHT_CYAN_BACKGROUND = 106  # xterm
WHITE_BACKGROUND = 107  # xterm

def sprint(text, *colors):
    """Format text with color or other effects into ANSI escaped string."""
    return "\33[{}m{content}\33[{}m".format(
        ";".join([str(color) for color in colors]), RESET, content=text
    ) if IS_ANSI_TERMINAL and colors else text

def println(text, *colors):
    """Print text to standard output."""
    sys.stdout.write(sprint(text, *colors) + "\n")

def print_err(text, *colors):
    """Print text to standard error."""
    sys.stderr.write(sprint(text, *colors) + "\n")

def print_log(text, *colors):
    """Print a log message to standard error."""
    sys.stderr.write(sprint("{}: {}".format(script_name, text), *colors) + "\n")

def i(message):
    """Print a normal log message."""
    print_log(message)

def d(message):
    """Print a debug log message."""
    print_log(message, BLUE)

def w(message):
    """Print a warning log message."""
    print_log(message, YELLOW)

def e(message, exit_code=None):
    """Print an error log message."""
    print_log(message, YELLOW, BOLD)
    if exit_code is not None:
        sys.exit(exit_code)

def wtf(message, exit_code=1):
    """What a Terrible Failure!"""
    print_log(message, RED, BOLD)
    if exit_code is not None:
        sys.exit(exit_code)

def yes_or_no(message):
    ans = str(input('%s (y/N) ' % message)).lower().strip()
    return ans == 'y'
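
# Example (assuming an ANSI-capable terminal): multiple codes compose into a
# single escape sequence, so effects can be combined freely.
#
#     println('done', GREEN, BOLD)   # writes "\33[32;1mdone\33[0m"
#     w('this may take a while')     # "you-get: this may take a while" in yellow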
""" # Inspired by: # https://github.com/scivision/pybashutils/blob/78b7f2b339cb03b1c37df94015098bbe462f8526/pybashutils/windows_linux_detect.py syst = system().lower() os = 'unknown' if 'cygwin' in syst: os = 'cygwin' elif 'darwin' in syst: os = 'mac' elif 'linux' in syst: os = 'linux' # detect WSL https://github.com/Microsoft/BashOnWindows/issues/423 try: with open('/proc/version', 'r') as f: if 'microsoft' in f.read().lower(): os = 'wsl' except: pass elif 'windows' in syst: os = 'windows' elif 'bsd' in syst: os = 'bsd' return os ================================================ FILE: src/you_get/util/strings.py ================================================ try: # py 3.4 from html import unescape as unescape_html except ImportError: import re from html.entities import entitydefs def unescape_html(string): '''HTML entity decode''' string = re.sub(r'&#[^;]+;', _sharp2uni, string) string = re.sub(r'&[^;]+;', lambda m: entitydefs[m.group(0)[1:-1]], string) return string def _sharp2uni(m): '''&#...; ==> unicode''' s = m.group(0)[2:].rstrip(';;') if s.startswith('x'): return chr(int('0'+s, 16)) else: return chr(int(s)) from .fs import legitimize def get_filename(htmlstring): return legitimize(unescape_html(htmlstring)) def parameterize(string): return "'%s'" % string.replace("'", r"'\''") ================================================ FILE: src/you_get/util/term.py ================================================ #!/usr/bin/env python def get_terminal_size(): """Get (width, height) of the current terminal.""" try: import fcntl, termios, struct # fcntl module only available on Unix return struct.unpack('hh', fcntl.ioctl(1, termios.TIOCGWINSZ, '1234')) except: return (40, 80) ================================================ FILE: src/you_get/version.py ================================================ #!/usr/bin/env python script_name = 'you-get' __version__ = '0.4.1743' ================================================ FILE: tests/test.py ================================================ #!/usr/bin/env python import unittest from you_get.extractors import ( imgur, magisto, youtube, missevan, acfun, bilibili, soundcloud, tiktok, twitter, miaopai ) class YouGetTests(unittest.TestCase): def test_imgur(self): imgur.download('http://imgur.com/WVLk5nD', info_only=True) imgur.download('https://imgur.com/we-should-have-listened-WVLk5nD', info_only=True) def test_magisto(self): magisto.download( 'http://www.magisto.com/album/video/f3x9AAQORAkfDnIFDA', info_only=True ) #def test_youtube(self): #youtube.download( # 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True #) #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) #youtube.download( # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # info_only=True #) #youtube.download( # 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True #) def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) #def test_bilibili(self): #bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) #def test_soundcloud(self): ## single song #soundcloud.download( # 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True #) ## playlist #soundcloud.download( # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True #) def test_tiktok(self): tiktok.download('https://www.tiktok.com/@zukky_48/video/7398162058153315605', info_only=True) tiktok.download('https://www.tiktok.com/@/video/7398162058153315605', info_only=True) 

================================================
FILE: src/you_get/util/term.py
================================================
#!/usr/bin/env python

def get_terminal_size():
    """Get (rows, columns) of the current terminal."""
    try:
        import fcntl, termios, struct  # fcntl module only available on Unix
        return struct.unpack('hh', fcntl.ioctl(1, termios.TIOCGWINSZ, '1234'))
    except:
        return (40, 80)

================================================
FILE: src/you_get/version.py
================================================
#!/usr/bin/env python

script_name = 'you-get'
__version__ = '0.4.1743'

================================================
FILE: tests/test.py
================================================
#!/usr/bin/env python

import unittest

from you_get.extractors import (
    imgur, magisto, youtube, missevan, acfun,
    bilibili, soundcloud, tiktok, twitter, miaopai
)

class YouGetTests(unittest.TestCase):
    def test_imgur(self):
        imgur.download('http://imgur.com/WVLk5nD', info_only=True)
        imgur.download('https://imgur.com/we-should-have-listened-WVLk5nD', info_only=True)

    def test_magisto(self):
        magisto.download(
            'http://www.magisto.com/album/video/f3x9AAQORAkfDnIFDA', info_only=True
        )

    #def test_youtube(self):
        #youtube.download(
        #    'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True
        #)
        #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
        #youtube.download(
        #    'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare',  # noqa
        #    info_only=True
        #)
        #youtube.download(
        #    'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True
        #)

    def test_acfun(self):
        acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True)

    #def test_bilibili(self):
        #bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True)

    #def test_soundcloud(self):
        ## single song
        #soundcloud.download(
        #    'https://soundcloud.com/keiny-pham/impure-bird', info_only=True
        #)
        ## playlist
        #soundcloud.download(
        #    'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True
        #)

    def test_tiktok(self):
        tiktok.download('https://www.tiktok.com/@zukky_48/video/7398162058153315605', info_only=True)
        tiktok.download('https://www.tiktok.com/@/video/7398162058153315605', info_only=True)
        tiktok.download('https://t.tiktok.com/i18n/share/video/7398162058153315605/', info_only=True)
        tiktok.download('https://vt.tiktok.com/ZSYKjKt6M/', info_only=True)

    def test_twitter(self):
        twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True)
        twitter.download('https://x.com/elonmusk/status/1530516552084234244', info_only=True)

    def test_weibo(self):
        miaopai.download('https://video.weibo.com/show?fid=1034:4825403706245135', info_only=True)

if __name__ == '__main__':
    unittest.main()

================================================
FILE: tests/test_common.py
================================================
#!/usr/bin/env python

import unittest

from you_get.common import *

class TestCommon(unittest.TestCase):
    def test_match1(self):
        self.assertEqual(match1('http://youtu.be/1234567890A', r'youtu.be/([^/]+)'), '1234567890A')
        self.assertEqual(
            match1('http://youtu.be/1234567890A', r'youtu.be/([^/]+)', r'youtu.(\w+)'),
            ['1234567890A', 'be']
        )

================================================
FILE: tests/test_util.py
================================================
#!/usr/bin/env python

import unittest

from you_get.util.fs import *

class TestUtil(unittest.TestCase):
    def test_legitimize(self):
        self.assertEqual(legitimize("1*2", os="linux"), "1*2")
        self.assertEqual(legitimize("1*2", os="mac"), "1*2")
        self.assertEqual(legitimize("1*2", os="windows"), "1-2")
        self.assertEqual(legitimize("1*2", os="wsl"), "1-2")

================================================
FILE: you-get
================================================
#!/usr/bin/env python3

import os, sys

_srcdir = '%s/src/' % os.path.dirname(os.path.realpath(__file__))
_filepath = os.path.dirname(sys.argv[0])
sys.path.insert(1, os.path.join(_filepath, _srcdir))

if sys.version_info[0] == 3:
    import you_get
    if __name__ == '__main__':
        you_get.main(repo_path=_filepath)
else:  # Python 2
    from you_get.util import log
    log.e("[fatal] Python 3 is required!")
    log.wtf("try to run this script using 'python3 you-get'.")

================================================
FILE: you-get.plugin.zsh
================================================
#!/usr/bin/env zsh
alias you-get="noglob python3 $(dirname $0)/you-get"
alias you-vlc="noglob python3 $(dirname $0)/you-get --player vlc"