Repository: smacke/ffsubsync Branch: master Commit: 9c5ee3941111 Files: 66 Total size: 241.5 KB Directory structure: gitextract_fnpxmdm_/ ├── .coveragerc ├── .gitattributes ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── synchronization-problem.md │ └── workflows/ │ └── ci.yml ├── .gitignore ├── .gitmodules ├── .readthedocs.yml ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs/ │ ├── .gitignore │ ├── Makefile │ ├── _static/ │ │ └── .keep │ ├── _templates/ │ │ └── .keep │ ├── conf.py │ ├── index.rst │ ├── make.bat │ └── requirements-docs.txt ├── ffsubsync/ │ ├── __init__.py │ ├── _version.py │ ├── aligners.py │ ├── constants.py │ ├── ffmpeg_utils.py │ ├── ffsubsync.py │ ├── ffsubsync_gui.py │ ├── file_utils.py │ ├── generic_subtitles.py │ ├── golden_section_search.py │ ├── sklearn_shim.py │ ├── speech_transformers.py │ ├── subtitle_parser.py │ ├── subtitle_transformers.py │ └── version.py ├── gui/ │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── build-macos.sh │ ├── build-windows.sh │ ├── build.spec │ ├── entrypoint-windows.sh │ ├── ffsubsync-gui.py │ ├── hooks/ │ │ └── hook-webrtcvad.py │ ├── package-macos.sh │ └── requirements.txt ├── pyproject.toml ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── resources/ │ └── img/ │ └── program_icon.icns ├── scripts/ │ ├── blacken.sh │ ├── bump-version.py │ ├── deploy.sh │ └── write-version.py ├── setup.cfg ├── setup.py ├── tests/ │ ├── test_alignment.py │ ├── test_integration.py │ ├── test_misc.py │ └── test_subtitles.py └── versioneer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ [run] omit = ffsubsync/ffsubsync_gui.py, ffsubsync/_version.py, ffsubsync/version.py ================================================ FILE: 
.gitattributes ================================================ ffsubsync/_version.py export-subst ================================================ FILE: .github/FUNDING.yml ================================================ github: smacke ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: bug assignees: '' --- **Environment (please complete the following information):** - OS: [e.g. Windows 10, MacOS Mojave, etc.] - python version (`python --version`) - subsync version (`subsync --version`) **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** How to reproduce the behavior. **Expected behavior** A clear and concise description of what you expected to happen. **Output** Copy+paste stdout from running the command here. **Test case** [Optional] You can bundle additional debugging information into a tar archive as follows: ``` subsync vid.mkv -i in.srt -o out.srt --make-test-case ``` This will create a file `vid.mkv.$timestamp.tar.gz` or similar a few KiB in size; you can attach it by clicking the "attach files" button below. **Additional context** Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/synchronization-problem.md ================================================ --- name: Synchronization problem about: Help us to improve syncing by reporting failed syncs title: output subtitles still out of sync labels: out-of-sync assignees: '' --- **Upload a tarball with debugging information** 1. Run the command that produces the out-of-sync subtitle output, but with the additional `--make-test-case` flag, i.e.: `subsync ref.mkv -i in.srt -o failed.srt --make-test-case` 2. This results in a file of the form `ref.mkv.$timestamp.tar.gz` or similar. 3. 
Please upload this file using the "attach files" button at the bottom of the text prompt. That's all! Thank you for contributing a test case; this helps me to continue improving the sync and to add additional integration tests once improvements have been made. **Additional context** Add any other context about the problem here that might be helpful. ================================================ FILE: .github/workflows/ci.yml ================================================ name: ffsubsync on: [push, pull_request] jobs: build: runs-on: ${{ matrix.os }} strategy: matrix: os: [ 'ubuntu-22.04', 'windows-latest' ] python-version: [ '3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14' ] include: - python-version: '3.7' os: 'macos-15-intel' - python-version: '3.8' os: 'macos-15-intel' - python-version: '3.9' os: 'macos-15-intel' - python-version: '3.10' os: 'macos-latest' - python-version: '3.11' os: 'macos-latest' - python-version: '3.12' os: 'macos-latest' - python-version: '3.13' os: 'macos-latest' - python-version: '3.14' os: 'macos-latest' steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: smacke/submodule-checkout@v3 if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version != '3.10'}} with: ssh-key: '${{ secrets.TEST_DATA_SECRET }}' - name: Set up Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt pip install -r requirements-dev.txt pip install -e . - name: Lint with flake8 run: | pip install flake8 # stop the build if there are Python syntax errors or undefined names #flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide #flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics flake8 . 
--exit-zero - name: Run unit tests with pytest (no coverage) if: matrix.os != 'ubuntu-latest' run: | pytest --cov-config=.coveragerc --cov-report= --cov=ffsubsync -v -m 'not integration' tests/ - name: Run unit tests with pytest (with coverage) if: matrix.os == 'ubuntu-latest' run: | pytest --cov-config=.coveragerc --cov-report=xml:cov.xml --cov=ffsubsync -v -m 'not integration' tests/ - name: Run integration tests with pytest if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version != '3.10'}} run: | INTEGRATION=1 pytest --cov-config=.coveragerc --cov-report=xml:cov.xml --cov=ffsubsync -v -m 'integration' tests/ - name: Upload coverage report if: matrix.os == 'ubuntu-latest' uses: codecov/codecov-action@v1 with: token: '${{ secrets.CODECOV_TOKEN }}' files: ./cov.xml env_vars: PYTHON name: codecov-umbrella fail_ci_if_error: true verbose: true ================================================ FILE: .gitignore ================================================ scratch-notebooks/ **/__pycache__ build dist *.egg-info .vim __version__ .venv/ .coverage ================================================ FILE: .gitmodules ================================================ [submodule "test-data"] path = test-data url = git@github.com:smacke/subsync-data ================================================ FILE: .readthedocs.yml ================================================ # .readthedocs.yml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py # Optionally build your docs in additional formats such as PDF and ePub formats: [pdf] # Optionally set the version of Python and requirements required to build your docs python: version: 3.8 install: - method: setuptools path: . 
- requirements: docs/requirements-docs.txt submodules: exclude: all ================================================ FILE: .travis.yml ================================================ language: python python: - "3.6" - "3.7" - "3.8" - "3.9" os: - linux # - osx dist: xenial git: submodules: false lfs_skip_smudge: true install: - pip install -r requirements.txt - pip install -r requirements-dev.txt - pip install -e . #addons: # apt: # update: true # packages: ffmpeg # homebrew: # packages: ffmpeg script: - pytest -v -m 'not integration' tests/ - flake8 . --exit-zero ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at stephen.macke@gmail.com. 
All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: HISTORY.rst ================================================ History ======= 0.4.31 (2025-11-23) ------------------- * Add support for Python 3.14; 0.4.30 (2025-09-01) ------------------- * Remove faust-cchardet dependency on Python 3.13; 0.4.29 (2025-02-18) ------------------- * Remove six dependency; 0.4.28 (2025-02-16) ------------------- * Add support for Python 3.13; 0.4.27 (2024-12-23) ------------------- * Add support for WebVTT by @GrahamDigital; * Make setuptools an explicit requirement to improve support for Python 3.12+; 0.4.26 (2024-10-15) ------------------- * Allow progress to work for multiple syncs even if alignment fails for a particular input; * Allow specifying ffmpeg exe path using --ffmpeg-path; * Updates for Python 3.12; * Don't report sync as successful if best score is in negatives (from @ajitid); * Turn on Audio Sync for audio extraction process (from @dvh312); 0.4.25 (2023-03-26) ------------------- * Replace unmaintained cchardet with faust-cchardet; 0.4.23 (2023-01-17) ------------------- * Bugfix for waitpid on Windows; 0.4.22 (2022-12-31)
------------------- * Misc maintenance / compatibility fixes; 0.4.19 (2022-01-07) ------------------- * Blacken code and get rid of future_annotations dependency; 0.4.18 (2021-11-07) ------------------- * Allow `--apply-offset-seconds` when only subtitles specified; * Make golden section search over scale factors option (`--gss`) available from help; * Use -inf as objective for invalid offsets; 0.4.17 (2021-10-03) ------------------- * Don't remove log file if --log-dir-path explicitly requested; * Add --suppress-output-if-offset-less-than arg to suppress output for small syncs; 0.4.16 (2021-07-22) ------------------- * Fix a couple of validation bugs that prevented certain uncommon command line options from use; 0.4.15 (2021-05-25) ------------------- * Make typing_extensions a requirement 0.4.14 (2021-05-10) ------------------- * Hotfix for pysubs2 on Python 3.6; 0.4.13 (2021-05-10) ------------------- * Support SSA embedded fonts using new pysubs2 'opaque_fonts' metadata; * Set min required pysubs2 version to 1.2.0 to ensure the aforementioned functionality is available; 0.4.12 (2021-04-13) ------------------- * Pin auditok to 0.1.5 to avoid API-breaking change 0.4.11 (2021-01-29) ------------------- * Misc sync improvements: * Have webrtcvad use '0' as the non speech label instead of 0.5; * Allow the vad non speech label to be specified via the --non-speech-label command line parameter; * Don't try to infer framerate ratio based on length between first and last speech frames for non-subtitle speech detection; 0.4.10 (2021-01-18) ------------------- * Lots of improvements from PRs submitted by @alucryd (big thanks!): * Retain ASS styles; * Support syncing several subs against the same ref via --overwrite-input flag; * Add --apply-offset-seconds postprocess option to shift alignment by prespecified amount; * Filter out metadata in subtitles when extracting speech; * Add experimental --golden-section-search over framerate ratio (off by default); * Try to improve 
sync by inferring framerate ratio based on relative duration of synced vs unsynced; 0.4.9 (2020-10-11) ------------------ * Make default max offset seconds 60 and enforce during alignment as opposed to throwing away alignments with > max_offset_seconds; * Add experimental section for using golden section search to find framerate ratio; * Restore ability to read stdin and write stdout after buggy permissions check; * Exceptions that occur during syncing were mistakenly suppressed; this is now fixed; 0.4.8 (2020-09-22) ------------------ * Use webrtcvad-wheels on Windows to eliminate dependency on compiler; 0.4.7 (2020-09-05) ------------------ * Misc bugfixes and stability improvements; 0.4.6 (2020-06-10) ------------------ * Bugfix for writing subs to stdout; 0.4.5 (2020-06-09) ------------------ * Allow MicroDVD input format; * Use output extension to determine output format; 0.4.4 (2020-06-08) ------------------ * Use rich formatting for Python >= 3.6; * Use versioneer to manage versions; 0.4.3 (2020-06-07) ------------------ * Fix regression where stdout not used for default output; * Add ability to specify path to ffmpeg / ffprobe binaries; * Add ability to overwrite the input / unsynced srt with the --overwrite-input flag; 0.4.2 (2020-06-06) ------------------ * Fix Python 2 compatibility bug; 0.4.1 (2020-06-06) ------------------ * Add --reference-stream option for selecting the stream / track from the video reference to use for speech detection; 0.4.0 (2020-06-02) ------------------ * Remove dependency on scikit-learn; * Implement PyInstaller / Gooey build process for graphical application on MacOS and Windows; 0.3.7 (2020-05-11) ------------------ * Fix PyPI issues; 0.3.5 (2020-05-08) ------------------ * Fix corner case bug that occurred when multiple sync attempts were scored the same; 0.3.4 (2020-03-20) ------------------ * Attempt speech extraction from subtitle tracks embedded in video first before using VAD; 0.3.3 (2020-03-15) ------------------ * 
Hotfix for test archive creation bug; 0.3.2 (2020-03-13) ------------------ * Add ability to merge synced and reference subs into bilingual subs when reference is srt; 0.3.1 (2020-03-12) ------------------ * Fix bug when handling ass/ssa input, this format should work now; 0.3.0 (2020-03-11) ------------------ * Better detection of text file encodings; * ASS / SSA functionality (but currently untested); * Allow serializing speech with --serialize-speech flag; * Convenient --make-test-case flag to create test cases when filing sync-related bugs; * Use utf-8 as default output encoding (instead of using same encoding as input); * More robust test framework (integration tests!); 0.2.17 (2019-12-21) ------------------- * Try to correct for framerate differences by picking best framerate ratio; 0.2.16 (2019-12-04) ------------------- * Revert changes from 0.2.9 now that srt parses weird timestamps robustly; 0.2.15 (2019-10-11) ------------------- * Revert changes from 0.2.12 (caused regression on Windows); 0.2.14 (2019-10-07) ------------------- * Bump min required scikit-learn to 0.20.4; 0.2.12 (2019-10-06) ------------------- * Clear O_NONBLOCK flag on stdout stream in case it is set; 0.2.11 (2019-10-06) ------------------- * Quick and dirty fix to recover without progress info if `ffmpeg.probe` raises; 0.2.10 (2019-09-22) ------------------- * Specify utf-8 encoding at top of file for backcompat with Python2; 0.2.9 (2019-09-22) ------------------ * Quick and dirty fix to properly handle timestamp ms fields with >3 digits; 0.2.8 (2019-06-15) ------------------ * Allow user to specify start time (in seconds) for processing; 0.2.7 (2019-05-28) ------------------ * Add utf-16 to list of encodings to try for inference purposes; 0.2.6 (2019-05-15) ------------------ * Fix argument parsing regression; 0.2.5 (2019-05-14) ------------------ * Clamp subtitles to maximum duration (default 10); 0.2.4 (2019-03-19) ------------------ * Add six to requirements.txt; * Set default encoding to
utf8 to ensure non ascii filenames handled properly; 0.2.3 (2019-03-08) ------------------ * Minor change to subtitle speech extraction; 0.2.2 (2019-03-08) ------------------ * Allow reading input srt from stdin; * Allow specifying encodings for reference, input, and output srt; * Use the same encoding for both input srt and output srt by default; * Developer note: using sklearn-style data pipelines now; 0.2.1 (2019-03-07) ------------------ * Developer note: change progress-only to vlc-mode and remove from help docs; 0.2.0 (2019-03-06) ------------------ * Get rid of auditok (GPLv3, was hurting alignment algorithm); * Change to alignment algo: don't penalize matching video non-speech with subtitle speech; 0.1.7 (2019-03-05) ------------------ * Add Chinese to the list of encodings that can be inferred; * Make srt parsing more robust; 0.1.6 (2019-03-04) ------------------ * Misc bugfixes; * Proper logging; * Proper version handling; 0.1.0 (2019-02-24) ------------------ * Support srt format; * Support using srt as reference; * Support using video as reference (via ffmpeg); * Support writing to stdout or file (read from stdin not yet supported; can only read from file); ================================================ FILE: LICENSE ================================================ Copyright 2019 Stephen Macke Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ include *.rst include versioneer.py include ffsubsync/_version.py ================================================ FILE: Makefile ================================================ # -*- coding: utf-8 -*- .PHONY: clean build bump deploy black blackcheck check test tests deps devdeps clean: rm -rf dist/ build/ *.egg-info/ build: clean python setup.py sdist bdist_wheel --universal bump: ./scripts/bump-version.py deploy: build ./scripts/deploy.sh black: ./scripts/blacken.sh blackcheck: ./scripts/blacken.sh --check lint: flake8 typecheck: mypy ffsubsync check_no_typing: INTEGRATION=1 pytest --cov-config=.coveragerc --cov=ffsubsync check: blackcheck typecheck check_no_typing test: check tests: check deps: pip install -r requirements.txt devdeps: pip install -e . 
pip install -r requirements-dev.txt ================================================ FILE: README.md ================================================ FFsubsync ======= [![CI Status](https://github.com/smacke/ffsubsync/workflows/ffsubsync/badge.svg)](https://github.com/smacke/ffsubsync/actions) [![Support Ukraine](https://badgen.net/badge/support/UKRAINE/?color=0057B8&labelColor=FFD700)](https://github.com/vshymanskyy/StandWithUkraine/blob/main/docs/README.md) [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License: MIT](https://img.shields.io/badge/License-MIT-maroon.svg)](https://opensource.org/licenses/MIT) [![Python Versions](https://img.shields.io/pypi/pyversions/ffsubsync.svg)](https://pypi.org/project/ffsubsync) [![Documentation Status](https://readthedocs.org/projects/ffsubsync/badge/?version=latest)](https://ffsubsync.readthedocs.io/en/latest/?badge=latest) [![PyPI Version](https://img.shields.io/pypi/v/ffsubsync.svg)](https://pypi.org/project/ffsubsync) Language-agnostic automatic synchronization of subtitles with video, so that subtitles are aligned to the correct starting point within the video. Turn this: | Into this: :-------------------------------:|:-------------------------: ![](https://raw.githubusercontent.com/smacke/ffsubsync/master/resources/img/tearing-me-apart-wrong.gif) | ![](https://raw.githubusercontent.com/smacke/ffsubsync/master/resources/img/tearing-me-apart-correct.gif) Helping Development ------------------- Please consider [supporting Ukraine](https://github.com/vshymanskyy/StandWithUkraine/blob/main/docs/README.md) rather than donating directly to this project. 
That said, at the request of some, you can now help cover my coffee expenses using the Github Sponsors button at the top, or using the below Paypal Donate button: [![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donate_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=XJC5ANLMYECJE) Install ------- First, make sure ffmpeg is installed. On MacOS, this looks like: ~~~ brew install ffmpeg ~~~ (Windows users: make sure `ffmpeg` is on your path and can be referenced from the command line!) Next, grab the package (compatible with Python >= 3.6): ~~~ pip install ffsubsync ~~~ If you want to live dangerously, you can grab the latest version as follows: ~~~ pip install git+https://github.com/smacke/ffsubsync@latest ~~~ Usage ----- `ffs`, `subsync` and `ffsubsync` all work as entrypoints: ~~~ ffs video.mp4 -i unsynchronized.srt -o synchronized.srt ~~~ There may be occasions where you have a correctly synchronized srt file in a language you are unfamiliar with, as well as an unsynchronized srt file in your native language. In this case, you can use the correctly synchronized srt file directly as a reference for synchronization, instead of using the video as the reference: ~~~ ffsubsync reference.srt -i unsynchronized.srt -o synchronized.srt ~~~ `ffsubsync` uses the file extension to decide whether to perform voice activity detection on the audio or to directly extract speech from an srt file. 
Sync Issues ----------- If the sync fails, the following recourses are available: - Try to sync assuming identical video / subtitle framerates by passing `--no-fix-framerate`; - Try passing `--gss` to use [golden-section search](https://en.wikipedia.org/wiki/Golden-section_search) to find the optimal ratio between video and subtitle framerates (by default, only a few common ratios are evaluated); - Try a value of `--max-offset-seconds` greater than the default of 60, in the event that the subtitles are out of sync by more than 60 seconds (empirically unlikely in practice, but possible). - Try `--vad=auditok` since [auditok](https://github.com/amsehili/auditok) can sometimes work better in the case of low-quality audio than WebRTC's VAD. Auditok does not specifically detect voice, but instead detects all audio; this property can yield suboptimal syncing behavior when a proper VAD can work well, but can be effective in some cases. If the sync still fails, consider trying one of the following similar tools: - [sc0ty/subsync](https://github.com/sc0ty/subsync): does speech-to-text and looks for matching word morphemes - [kaegi/alass](https://github.com/kaegi/alass): rust-based subtitle synchronizer with a fancy dynamic programming algorithm - [tympanix/subsync](https://github.com/tympanix/subsync): neural net based approach that optimizes directly for alignment when performing speech detection - [oseiskar/autosubsync](https://github.com/oseiskar/autosubsync): performs speech detection with bespoke spectrogram + logistic regression - [pums974/srtsync](https://github.com/pums974/srtsync): similar approach to ffsubsync (WebRTC's VAD + FFT to maximize signal cross correlation) Speed ----- `ffsubsync` usually finishes in 20 to 30 seconds, depending on the length of the video. The most expensive step is actually extraction of raw audio. 
If you already have a correctly synchronized "reference" srt file (in which case audio extraction can be skipped), `ffsubsync` typically runs in less than a second. How It Works ------------ The synchronization algorithm operates in 3 steps: 1. Discretize both the video file's audio stream and the subtitles into 10ms windows. 2. For each 10ms window, determine whether that window contains speech. This is trivial to do for subtitles (we just determine whether any subtitle is "on" during each time window); for the audio stream, use an off-the-shelf voice activity detector (VAD) like the one built into [webrtc](https://webrtc.org/). 3. Now we have two binary strings: one for the subtitles, and one for the video. Try to align these strings by matching 0's with 0's and 1's with 1's. We score these alignments as (# video 1's matched w/ subtitle 1's) - (# video 1's matched with subtitle 0's). The best-scoring alignment from step 3 determines how to offset the subtitles in time so that they are properly synced with the video. Because the binary strings are fairly long (hundreds of thousands of digits for video longer than an hour), the naive O(n^2) strategy for scoring all alignments is unacceptable. Instead, we use the fact that "scoring all alignments" is a convolution operation and can be implemented with the Fast Fourier Transform (FFT), bringing the complexity down to O(n log n). Limitations ----------- In most cases, inconsistencies between video and subtitles occur when starting or ending segments present in video are not present in subtitles, or vice versa. This can occur, for example, when a TV episode recap in the subtitles was pruned from video. FFsubsync typically works well in these cases, and in my experience this covers >95% of use cases. Handling breaks and splits outside of the beginning and ending segments is left to future work (see below).
Future Work ----------- Besides general stability and usability improvements, one line of work aims to extend the synchronization algorithm to handle splits / breaks in the middle of video not present in subtitles (or vice versa). Developing a robust solution will take some time (assuming one is possible). See [#10](https://github.com/smacke/ffsubsync/issues/10) for more details. History ------- The implementation for this project was started during HackIllinois 2019, for which it received an **_Honorable Mention_** (ranked in the top 5 projects, excluding projects that won company-specific prizes). Credits ------- This project would not be possible without the following libraries: - [ffmpeg](https://www.ffmpeg.org/) and the [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) wrapper, for extracting raw audio from video - VAD from [webrtc](https://webrtc.org/) and the [py-webrtcvad](https://github.com/wiseman/py-webrtcvad) wrapper, for speech detection - [srt](https://pypi.org/project/srt/) for operating on [SRT files](https://en.wikipedia.org/wiki/SubRip#SubRip_text_file_format) - [numpy](http://www.numpy.org/) and, indirectly, [FFTPACK](https://www.netlib.org/fftpack/), which powers the FFT-based algorithm for fast scoring of alignments between subtitles (or subtitles and video) - Other excellent Python libraries like [argparse](https://docs.python.org/3/library/argparse.html), [rich](https://github.com/willmcgugan/rich), and [tqdm](https://tqdm.github.io/), not related to the core functionality, but which enable much better experiences for developers and users. # License Code in this project is [MIT licensed](https://opensource.org/licenses/MIT). 
================================================ FILE: docs/.gitignore ================================================ _build/ ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/_static/.keep ================================================ ================================================ FILE: docs/_templates/.keep ================================================ ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- project = 'ffsubsync' copyright = '2020, Stephen Macke' author = 'Stephen Macke' # -- General configuration --------------------------------------------------- # ref: https://stackoverflow.com/questions/56336234/build-fail-sphinx-error-contents-rst-not-found master_doc = 'index' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinxarg.ext', 'sphinx_rtd_theme', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] ================================================ FILE: docs/index.rst ================================================ .. ffsubsync documentation master file, created by sphinx-quickstart on Mon Dec 2 17:06:18 2019. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to FFsubsync's documentation! ===================================== .. 
# -*- coding: utf-8 -*-
import logging
import sys

try:
    from rich.console import Console
    from rich.logging import RichHandler

    # configure logging here because some other later imported library does it first otherwise
    # TODO: use a fileconfig
    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
        datefmt="[%X]",
        handlers=[RichHandler(console=Console(file=sys.stderr))],
    )
except Exception:
    # Best-effort fallback: if the rich-based setup above fails for any
    # ordinary reason (e.g. rich is not importable), log plainly to stderr.
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit raised during import are not silently swallowed.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

from .version import __version__  # noqa
from .ffsubsync import main  # noqa
================================================ FILE: ffsubsync/_version.py ================================================ # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.18 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
class VersioneerConfig:
    """Container for Versioneer configuration parameters."""


def get_config():
    """Create, populate and return the VersioneerConfig() object."""
    # these strings are filled in when 'setup.py versioneer' creates
    # _version.py
    cfg = VersioneerConfig()
    cfg.VCS = "git"
    cfg.style = "pep440-pre"
    cfg.tag_prefix = ""
    cfg.parentdir_prefix = "ffsubsync-"
    cfg.versionfile_source = "ffsubsync/_version.py"
    cfg.verbose = False
    return cfg


class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""


LONG_VERSION_PY = {}
HANDLERS = {}


def register_vcs_handler(vcs, method):  # decorator
    """Decorator to mark a method as the handler for a particular VCS."""
    def decorate(f):
        """Store f in HANDLERS[vcs][method]."""
        HANDLERS.setdefault(vcs, {})[method] = f
        return f
    return decorate


def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
                env=None):
    """Call the given command(s).

    Each entry of `commands` is tried in order as the executable name until
    one can be started. Returns `(stdout, returncode)`; `stdout` is None when
    no command could be started or the command exited non-zero (in the latter
    case the return code is still reported).
    """
    assert isinstance(commands, list)
    p = None
    for c in commands:
        try:
            dispcmd = str([c] + args)
            # remember shell=False, so use git.cmd on windows, not just git
            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
                                 stdout=subprocess.PIPE,
                                 stderr=(subprocess.PIPE if hide_stderr
                                         else None))
            break
        except EnvironmentError as e:
            if e.errno == errno.ENOENT:
                # this executable name does not exist; try the next one
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(e)
            return None, None
    else:
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None
    stdout = p.communicate()[0].strip()
    if sys.version_info[0] >= 3:
        stdout = stdout.decode()
    if p.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, p.returncode
    return stdout, p.returncode


def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory
    """
    rootdirs = []

    for _ in range(3):
        dirname = os.path.basename(root)
        if dirname.startswith(parentdir_prefix):
            return {"version": dirname[len(parentdir_prefix):],
                    "full-revisionid": None,
                    "dirty": False, "error": None, "date": None}
        rootdirs.append(root)
        root = os.path.dirname(root)  # up a level

    if verbose:
        print("Tried directories %s but none started with prefix %s" %
              (str(rootdirs), parentdir_prefix))
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")


@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract version information from the given file.

    Returns a dict with whichever of the keys "refnames", "full" and "date"
    could be parsed; an unreadable or missing file yields an empty dict.
    """
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    wanted = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    keywords = {}
    try:
        # `with` guarantees the handle is closed even if reading raises
        with open(versionfile_abs, "r") as f:
            for line in f:
                stripped = line.strip()
                for prefix, key in wanted:
                    if stripped.startswith(prefix):
                        mo = re.search(r'=\s*"(.*)"', line)
                        if mo:
                            keywords[key] = mo.group(1)
    except EnvironmentError:
        pass
    return keywords
However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. 
def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a .

    `pieces` is the version-pieces dict; only its "closest-tag" entry is
    inspected. The entry may be missing or explicitly None (no tag found),
    both of which are treated as "no + present yet".
    """
    # `.get(key, "")` alone is not enough: the key can be present with the
    # value None, and `"+" in None` raises TypeError.
    if "+" in (pieces.get("closest-tag") or ""):
        return "."
    return "+"
def render_pep440_pre(pieces):
    """TAG[.post.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post.devDISTANCE
    """
    tag = pieces["closest-tag"]
    distance = pieces["distance"]
    if not tag:
        # exception #1
        return "0.post.dev%d" % distance
    if distance:
        return tag + ".post.dev%d" % distance
    return tag


def render_pep440_post(pieces):
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        untagged = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            untagged += ".dev0"
        return untagged + "+g%s" % pieces["short"]

    if not (pieces["distance"] or pieces["dirty"]):
        # exactly on a clean tag: the tag is the whole version
        return tag

    segments = [tag, ".post%d" % pieces["distance"]]
    if pieces["dirty"]:
        segments.append(".dev0")
    segments.append(plus_or_dot(pieces))
    segments.append("g%s" % pieces["short"])
    return "".join(segments)


def render_pep440_old(pieces):
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    distance = pieces["distance"]
    if tag and not distance and not pieces["dirty"]:
        # exactly on a clean tag: the tag is the whole version
        return tag
    dev_suffix = ".dev0" if pieces["dirty"] else ""
    # exception #1: with no tag the base version is "0"
    return (tag or "0") + ".post%d" % distance + dev_suffix
def render_git_describe_long(pieces):
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always --long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        described = tag + "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        described = pieces["short"]
    return described + "-dirty" if pieces["dirty"] else described


def render(pieces, style):
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        return {"version": "unknown",
                "full-revisionid": pieces.get("long"),
                "dirty": None,
                "error": pieces["error"],
                "date": None}

    if not style or style == "default":
        style = "pep440"  # the default

    # Lambdas keep the lookup lazy: only the selected renderer is resolved
    # and invoked.
    renderers = {
        "pep440": lambda: render_pep440(pieces),
        "pep440-pre": lambda: render_pep440_pre(pieces),
        "pep440-post": lambda: render_pep440_post(pieces),
        "pep440-old": lambda: render_pep440_old(pieces),
        "git-describe": lambda: render_git_describe(pieces),
        "git-describe-long": lambda: render_git_describe_long(pieces),
    }
    chosen = renderers.get(style)
    if chosen is None:
        raise ValueError("unknown style '%s'" % style)
    rendered = chosen()

    return {"version": rendered, "full-revisionid": pieces["long"],
            "dirty": pieces["dirty"], "error": None,
            "date": pieces.get("date")}
class FFTAligner(TransformerMixin):
    """Score every relative offset between two sequences with one FFT pass.

    `fit` records the best-scoring offset in `best_offset_` and its score in
    `best_score_`; `transform` returns them.

    max_offset_samples: if given, offsets whose magnitude exceeds this many
        samples are excluded from consideration.
    """

    def __init__(self, max_offset_samples: Optional[int] = None) -> None:
        self.max_offset_samples: Optional[int] = max_offset_samples
        self.best_offset_: Optional[int] = None
        self.best_score_: Optional[float] = None
        self.get_score_: bool = False

    def _eliminate_extreme_offsets_from_solutions(
        self, convolve: np.ndarray, substring: np.ndarray
    ) -> np.ndarray:
        """Return a copy of `convolve` with out-of-range offsets set to -inf.

        With no `max_offset_samples` configured the copy is returned as is.
        """
        convolve = np.copy(convolve)
        if self.max_offset_samples is None:
            return convolve

        def _offset_to_index(offset):
            # Clamp at 0: when the allowed offset range is wider than the
            # array (short inputs), the raw index is negative, and a negative
            # slice bound would wrap around and wipe out valid candidates
            # instead of excluding nothing.
            return max(0, len(convolve) - 1 + offset - len(substring))

        convolve[: _offset_to_index(-self.max_offset_samples)] = float("-inf")
        convolve[_offset_to_index(self.max_offset_samples) :] = float("-inf")
        return convolve

    def _compute_argmax(self, convolve: np.ndarray, substring: np.ndarray) -> None:
        """Store the best score and the offset corresponding to its index."""
        best_idx = int(np.argmax(convolve))
        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
        self.best_score_ = convolve[best_idx]

    def fit(self, refstring, substring, get_score: bool = False) -> "FFTAligner":
        """Compute the best offset of `substring` relative to `refstring`.

        Both inputs may be strings of digit characters or numeric sequences;
        `get_score` controls whether `transform` also returns the score.
        """
        # strings of digits are converted to lists of ints
        refstring, substring = [
            list(map(int, s)) if isinstance(s, str) else s
            for s in [refstring, substring]
        ]
        # rescale x -> 2x - 1 (so 0/1 inputs become -1/+1)
        refstring, substring = map(
            lambda s: 2 * np.array(s).astype(float) - 1, [refstring, substring]
        )
        # zero-pad to the next power of two at least as long as both inputs
        total_bits = math.log(len(substring) + len(refstring), 2)
        total_length = int(2 ** math.ceil(total_bits))
        extra_zeros = total_length - len(substring) - len(refstring)
        subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
        refft = np.fft.fft(
            np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0)
        )
        convolve = np.real(np.fft.ifft(subft * refft))
        self._compute_argmax(
            self._eliminate_extreme_offsets_from_solutions(convolve, substring),
            substring,
        )
        self.get_score_ = get_score
        return self

    def transform(self, *_) -> Union[int, Tuple[float, int]]:
        """Return `(best_score_, best_offset_)` if a score was requested in
        `fit`, otherwise just `best_offset_`."""
        if self.get_score_:
            return self.best_score_, self.best_offset_
        else:
            return self.best_offset_
self.base_aligner: FFTAligner = base_aligner( max_offset_samples=self.max_offset_samples ) else: self.base_aligner = base_aligner self.max_offset_seconds: Optional[int] = max_offset_seconds self._scores: List[Tuple[Tuple[float, int], Pipeline]] = [] def fit_gss(self, refstring, subpipe_maker): def opt_func(framerate_ratio, is_last_iter): subpipe = subpipe_maker(framerate_ratio) substring = subpipe.fit_transform(self.srtin) score = self.base_aligner.fit_transform( refstring, substring, get_score=True ) logger.info( "got score %.0f (offset %d) for ratio %.3f", score[0], score[1], framerate_ratio, ) if is_last_iter: self._scores.append((score, subpipe)) return -score[0] gss(opt_func, MIN_FRAMERATE_RATIO, MAX_FRAMERATE_RATIO) return self def fit( self, refstring, subpipes: Union[Pipeline, List[Pipeline]] ) -> "MaxScoreAligner": if not isinstance(subpipes, list): subpipes = [subpipes] for subpipe in subpipes: if callable(subpipe): self.fit_gss(refstring, subpipe) continue elif hasattr(subpipe, "transform"): substring = subpipe.transform(self.srtin) else: substring = subpipe self._scores.append( ( self.base_aligner.fit_transform( refstring, substring, get_score=True ), subpipe, ) ) return self def transform(self, *_) -> Tuple[Tuple[float, float], Pipeline]: scores = self._scores if self.max_offset_samples is not None: scores = list( filter(lambda s: abs(s[0][1]) <= self.max_offset_samples, scores) ) if len(scores) == 0: raise FailedToFindAlignmentException( "Synchronization failed; consider passing " "--max-offset-seconds with a number larger than " "{}".format(self.max_offset_seconds) ) (score, offset), subpipe = max(scores, key=lambda x: x[0][0]) return (score, offset), subpipe ================================================ FILE: ffsubsync/constants.py ================================================ # -*- coding: utf-8 -*- from typing import List, Tuple SUBSYNC_RESOURCES_ENV_MAGIC: str = "ffsubsync_resources_xj48gjdkl340" SAMPLE_RATE: int = 100 FRAMERATE_RATIOS: 
List[float] = [24.0 / 23.976, 25.0 / 23.976, 25.0 / 24.0] DEFAULT_FRAME_RATE: int = 48000 DEFAULT_NON_SPEECH_LABEL: float = 0.0 DEFAULT_ENCODING: str = "infer" DEFAULT_MAX_SUBTITLE_SECONDS: int = 10 DEFAULT_START_SECONDS: int = 0 DEFAULT_SCALE_FACTOR: float = 1 DEFAULT_VAD: str = "subs_then_webrtc" DEFAULT_MAX_OFFSET_SECONDS: int = 60 DEFAULT_APPLY_OFFSET_SECONDS: int = 0 SUBTITLE_EXTENSIONS: Tuple[str, ...] = ("srt", "ass", "ssa", "sub") GITHUB_DEV_USER: str = "smacke" PROJECT_NAME: str = "FFsubsync" PROJECT_LICENSE: str = "MIT" COPYRIGHT_YEAR: str = "2019" GITHUB_REPO: str = "ffsubsync" DESCRIPTION: str = "Synchronize subtitles with video." LONG_DESCRIPTION: str = ( "Automatic and language-agnostic synchronization of subtitles with video." ) WEBSITE: str = "https://github.com/{}/{}/".format(GITHUB_DEV_USER, GITHUB_REPO) DEV_WEBSITE: str = "https://smacke.net/" # No trailing slash important for this one... API_RELEASE_URL: str = "https://api.github.com/repos/{}/{}/releases/latest".format( GITHUB_DEV_USER, GITHUB_REPO ) RELEASE_URL: str = "https://github.com/{}/{}/releases/latest/".format( GITHUB_DEV_USER, GITHUB_REPO ) ================================================ FILE: ffsubsync/ffmpeg_utils.py ================================================ # -*- coding: utf-8 -*- import logging import os import platform import subprocess from ffsubsync.constants import SUBSYNC_RESOURCES_ENV_MAGIC logging.basicConfig(level=logging.INFO) logger: logging.Logger = logging.getLogger(__name__) # ref: https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess # Create a set of arguments which make a ``subprocess.Popen`` (and # variants) call work with or without Pyinstaller, ``--noconsole`` or # not, on Windows and Linux. 
def subprocess_args(include_stdout=True):
    """Build keyword arguments for ``subprocess.Popen`` and its variants.

    include_stdout: when true, the result also redirects ``stdout`` to a
        pipe. Pass False for ``subprocess.check_output``, which rejects an
        explicit ``stdout`` argument.
    """
    startup_info = None
    environment = None
    # ``STARTUPINFO`` only exists on Windows.
    if hasattr(subprocess, "STARTUPINFO"):
        # On Windows, subprocess calls will pop up a command window by default
        # when run from Pyinstaller with the ``--noconsole`` option. Avoid this
        # distraction.
        startup_info = subprocess.STARTUPINFO()
        startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        # Windows doesn't search the path by default. Pass it an environment so
        # it will.
        environment = os.environ

    # ``subprocess.check_output`` raises ValueError if ``stdout`` is given,
    # so it is only added on request.
    kwargs = {"stdout": subprocess.PIPE} if include_stdout else {}

    # On Windows, running this from the binary produced by Pyinstaller
    # with the ``--noconsole`` option requires redirecting everything
    # (stdin, stdout, stderr) to avoid an OSError exception
    # "[Error 6] the handle is invalid."
    kwargs["stdin"] = subprocess.PIPE
    kwargs["stderr"] = subprocess.PIPE
    kwargs["startupinfo"] = startup_info
    kwargs["env"] = environment
    return kwargs


def ffmpeg_bin_path(bin_name, gui_mode, ffmpeg_resources_path=None):
    """Return the path (or bare name) to use for an ffmpeg-family binary.

    bin_name: binary name without extension; ".exe" is appended on Windows.
    gui_mode: when true, log a message if the resource environment variable
        is not set.
    ffmpeg_resources_path: optional directory holding the binaries, or a
        non-directory path that is returned unchanged for names starting
        with "ffmpeg" and whose parent directory is used for other names.
    """
    if platform.system() == "Windows":
        bin_name = "{}.exe".format(bin_name)

    if ffmpeg_resources_path is not None:
        search_dir = ffmpeg_resources_path
        if not os.path.isdir(search_dir):
            if bin_name.lower().startswith("ffmpeg"):
                return ffmpeg_resources_path
            search_dir = os.path.dirname(search_dir)
        return os.path.join(search_dir, bin_name)

    resource_root = os.environ.get(SUBSYNC_RESOURCES_ENV_MAGIC)
    if resource_root is None:
        if gui_mode:
            logger.info(
                "Couldn't find resource path; falling back to searching system path"
            )
    elif resource_root:
        return os.path.join(resource_root, "ffmpeg-bin", bin_name)
    # variable unset or empty: rely on the system path
    return bin_name
override(args: argparse.Namespace, **kwargs: Any) -> Dict[str, Any]: args_dict = dict(args.__dict__) args_dict.update(kwargs) return args_dict def _ref_format(ref_fname: Optional[str]) -> Optional[str]: if ref_fname is None: return None return ref_fname[-3:] def make_test_case( args: argparse.Namespace, npy_savename: Optional[str], sync_was_successful: bool ) -> int: if npy_savename is None: raise ValueError("need non-null npy_savename") tar_dir = "{}.{}".format( args.reference, datetime.now().strftime("%Y-%m-%d-%H-%M-%S") ) logger.info("creating test archive {}.tar.gz...".format(tar_dir)) os.mkdir(tar_dir) try: log_path = "ffsubsync.log" if args.log_dir_path is not None and os.path.isdir(args.log_dir_path): log_path = os.path.join(args.log_dir_path, log_path) shutil.copy(log_path, tar_dir) shutil.copy(args.srtin[0], tar_dir) if sync_was_successful: shutil.move(args.srtout, tar_dir) if _ref_format(args.reference) in SUBTITLE_EXTENSIONS: shutil.copy(args.reference, tar_dir) elif args.serialize_speech or args.reference == npy_savename: shutil.copy(npy_savename, tar_dir) else: shutil.move(npy_savename, tar_dir) supported_formats = set(list(zip(*shutil.get_archive_formats()))[0]) preferred_formats = ["gztar", "bztar", "xztar", "zip", "tar"] for archive_format in preferred_formats: if archive_format in supported_formats: shutil.make_archive(tar_dir, archive_format, os.curdir, tar_dir) break else: logger.error( "failed to create test archive; no formats supported " "(this should not happen)" ) return 1 logger.info("...done") finally: shutil.rmtree(tar_dir) return 0 def get_srt_pipe_maker( args: argparse.Namespace, srtin: Optional[str] ) -> Callable[[Optional[float]], Union[Pipeline, Callable[[float], Pipeline]]]: if srtin is None: srtin_format = "srt" else: srtin_format = os.path.splitext(srtin)[-1][1:] parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__) return lambda scale_factor: make_subtitle_speech_pipeline( **override(args, 
def get_framerate_ratios_to_try(args: argparse.Namespace) -> List[Optional[float]]:
    """Return the candidate framerate ratios to try.

    Returns an empty list when `args.no_fix_framerate` is set. Otherwise
    returns every ratio in `FRAMERATE_RATIOS` followed by their reciprocals,
    plus a trailing `None` entry when `args.gss` is set.
    """
    if args.no_fix_framerate:
        return []

    known_ratios = np.array(FRAMERATE_RATIOS)
    candidates = list(np.concatenate([known_ratios, 1.0 / np.array(FRAMERATE_RATIOS)]))
    if args.gss:
        candidates.append(None)
    return candidates
logger.info("...done") offset_seconds = ( offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds ) scale_step = best_srt_pipe.named_steps["scale"] logger.info("score: %.3f", best_score) logger.info("offset seconds: %.3f", offset_seconds) logger.info("framerate scale factor: %.3f", scale_step.scale_factor) output_steps: List[Tuple[str, TransformerMixin]] = [ ("shift", SubtitleShifter(offset_seconds)) ] if args.merge_with_reference: output_steps.append( ("merge", SubtitleMerger(reference_pipe.named_steps["parse"].subs_)) ) output_pipe = Pipeline(output_steps) out_subs = output_pipe.fit_transform(scale_step.subs_) if args.output_encoding != "same": out_subs = out_subs.set_encoding(args.output_encoding) suppress_output_thresh = args.suppress_output_if_offset_less_than if offset_seconds >= (suppress_output_thresh or float("-inf")): logger.info("writing output to {}".format(srtout or "stdout")) out_subs.write_file(srtout) else: logger.warning( "suppressing output because offset %s was less than suppression threshold %s", offset_seconds, args.suppress_output_if_offset_less_than, ) except Exception: sync_was_successful = False logger.exception("failed to sync %s", srtin) else: result["offset_seconds"] = offset_seconds result["framerate_scale_factor"] = scale_step.scale_factor result["sync_was_successful"] = sync_was_successful return sync_was_successful def make_reference_pipe(args: argparse.Namespace) -> Pipeline: ref_format = _ref_format(args.reference) if ref_format in SUBTITLE_EXTENSIONS: if args.vad is not None: logger.warning("Vad specified, but reference was not a movie") return cast( Pipeline, make_subtitle_speech_pipeline( fmt=ref_format, **override(args, encoding=args.reference_encoding or DEFAULT_ENCODING), ), ) elif ref_format in ("npy", "npz"): if args.vad is not None: logger.warning("Vad specified, but reference was not a movie") return Pipeline( [("deserialize", DeserializeSpeechTransformer(args.non_speech_label))] ) else: vad = args.vad or 
def extract_subtitles_from_reference(args: argparse.Namespace) -> int:
    """Extract a subtitle stream from the reference file with ffmpeg.

    ``args.extract_subs_from_stream`` is normalized to the ``0:s:...`` form:
    ``s:N`` gets a leading ``0:``, a specifier that already starts with
    ``0:s:`` is used as is, and anything else not starting with ``0:`` is
    prefixed with ``0:s:``. A specifier that addresses a non-subtitle stream
    of input 0 (for example ``0:a:1``) is rejected.

    Args:
        args: Parsed command-line arguments; ``extract_subs_from_stream``,
            ``reference``, ``srtout``, ``gui_mode`` and ``ffmpeg_path`` are
            read. Output goes to stdout when ``srtout`` is None.

    Returns:
        The ffmpeg return code, or 1 if the stream specifier is invalid.
    """
    stream = args.extract_subs_from_stream
    if stream.startswith("s:"):
        stream = "0:{}".format(stream)
    elif not stream.startswith("0:"):
        stream = "0:s:{}".format(stream)
    if not stream.startswith("0:s:"):
        logger.error(
            "invalid stream for subtitle extraction: %s", args.extract_subs_from_stream
        )
        # Do not hand a specifier we know to be wrong to ffmpeg.
        return 1
    ffmpeg_args = [
        ffmpeg_bin_path("ffmpeg", args.gui_mode, ffmpeg_resources_path=args.ffmpeg_path)
    ]
    ffmpeg_args.extend(
        [
            "-y",
            "-nostdin",
            "-loglevel",
            "fatal",
            "-i",
            args.reference,
            "-map",
            "{}".format(stream),
            "-f",
            "srt",
        ]
    )
    if args.srtout is None:
        # "-" makes ffmpeg write the subtitles to stdout.
        ffmpeg_args.append("-")
    else:
        ffmpeg_args.append(args.srtout)
    logger.info(
        "attempting to extract subtitles to {} ...".format(
            "stdout" if args.srtout is None else args.srtout
        )
    )
    retcode = subprocess.call(ffmpeg_args)
    if retcode == 0:
        logger.info("...done")
    else:
        logger.error(
            "ffmpeg unable to extract subtitles from reference; return code %d", retcode
        )
    return retcode
def validate_file_permissions(args: argparse.Namespace) -> None:
    """Check that the files named in ``args`` can be read or written.

    The reference and every non-None input subtitle file must be readable.
    The output subtitle file and the ``.npz`` speech archive derived from
    the reference must be writable if they already exist.

    Args:
        args: Parsed command-line arguments; ``reference``, ``srtin``,
            ``srtout``, ``make_test_case`` and ``serialize_speech`` are read.

    Raises:
        ValueError: If any of the checks above fails.
    """
    error_string_template = (
        "unable to {action} {file}; "
        "try ensuring file exists and has correct permissions"
    )
    if args.reference is not None and not os.access(args.reference, os.R_OK):
        raise ValueError(
            error_string_template.format(action="read reference", file=args.reference)
        )
    if args.srtin:
        for srtin in args.srtin:
            # A None entry stands for stdin and has no path to check.
            if srtin is not None and not os.access(srtin, os.R_OK):
                raise ValueError(
                    error_string_template.format(
                        action="read input subtitles", file=srtin
                    )
                )
    if (
        args.srtout is not None
        and os.path.exists(args.srtout)
        and not os.access(args.srtout, os.W_OK)
    ):
        raise ValueError(
            error_string_template.format(
                action="write output subtitles", file=args.srtout
            )
        )
    # Without a reference there is no archive path to derive; splitext(None)
    # would raise a TypeError that callers of this function do not handle.
    if (args.make_test_case or args.serialize_speech) and args.reference is not None:
        npy_savename = os.path.splitext(args.reference)[0] + ".npz"
        if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
            raise ValueError(
                "unable to write test case file archive %s (try checking permissions)"
                % npy_savename
            )
def run(
    parser_or_args: Union[argparse.ArgumentParser, argparse.Namespace]
) -> Dict[str, Any]:
    """Validate the arguments, run the requested operation and clean up.

    Args:
        parser_or_args: Either a parser whose arguments still need to be
            parsed or an already populated namespace.

    Returns:
        A dict with ``retval``, ``offset_seconds`` and
        ``framerate_scale_factor``; ``sync_was_successful`` is added once
        the operation has run. ``retval`` is 1 when validation fails, and
        is increased by the return value of ``make_test_case`` when a test
        case is requested.
    """
    result = dict(retval=0, offset_seconds=None, framerate_scale_factor=None)

    args = validate_and_transform_args(parser_or_args)
    if args is None:
        result["retval"] = 1
        return result

    log_path, log_handler = _setup_logging(args)
    succeeded = False
    try:
        succeeded = _run_impl(args, result)
        result["sync_was_successful"] = succeeded
    finally:
        have_log_file = log_handler is not None and log_path is not None
        if have_log_file:
            # Detach the handler first so the log file is closed before it
            # is copied into the test archive or deleted.
            log_handler.close()
            logger.removeHandler(log_handler)
            if args.make_test_case:
                result["retval"] += make_test_case(
                    args, _npy_savename(args), succeeded
                )
            keep_log = args.log_dir_path is not None and os.path.isdir(
                args.log_dir_path
            )
            if not keep_log:
                os.remove(log_path)
    return result
def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
    """Register the options that are not part of the basic argument set.

    These are added on top of the arguments from ``add_main_args_for_cli``.
    The last three flags (``--vlc-mode``, ``--gui-mode``, ``--skip-sync``)
    are hidden from the help output.

    Args:
        parser: The parser to extend in place.
    """
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="{package} {version}".format(
            package=__package__, version=get_version()
        ),
    )
    parser.add_argument(
        "--overwrite-input",
        action="store_true",
        help=(
            "If specified, will overwrite the input srt "
            "instead of writing the output to a new file."
        ),
    )
    parser.add_argument(
        "--encoding",
        default=DEFAULT_ENCODING,
        help="What encoding to use for reading input subtitles "
        "(default=%s)." % DEFAULT_ENCODING,
    )
    parser.add_argument(
        "--max-subtitle-seconds",
        type=float,
        default=DEFAULT_MAX_SUBTITLE_SECONDS,
        help="Maximum duration for a subtitle to appear on-screen "
        "(default=%.3f seconds)." % DEFAULT_MAX_SUBTITLE_SECONDS,
    )
    parser.add_argument(
        "--start-seconds",
        type=int,
        default=DEFAULT_START_SECONDS,
        help="Start time for processing "
        "(default=%d seconds)." % DEFAULT_START_SECONDS,
    )
    parser.add_argument(
        "--max-offset-seconds",
        type=float,
        default=DEFAULT_MAX_OFFSET_SECONDS,
        help="The max allowed offset seconds for any subtitle segment "
        "(default=%d seconds)." % DEFAULT_MAX_OFFSET_SECONDS,
    )
    parser.add_argument(
        "--apply-offset-seconds",
        type=float,
        default=DEFAULT_APPLY_OFFSET_SECONDS,
        help="Apply a predefined offset in seconds to all subtitle segments "
        "(default=%d seconds)." % DEFAULT_APPLY_OFFSET_SECONDS,
    )
    parser.add_argument(
        "--frame-rate",
        type=int,
        default=DEFAULT_FRAME_RATE,
        help="Frame rate for audio extraction (default=%d)." % DEFAULT_FRAME_RATE,
    )
    parser.add_argument(
        "--skip-infer-framerate-ratio",
        action="store_true",
        help="If set, do not try to infer framerate ratio based on duration ratio.",
    )
    parser.add_argument(
        "--non-speech-label",
        type=float,
        default=DEFAULT_NON_SPEECH_LABEL,
        help="Label to use for frames detected as non-speech (default=%f)"
        % DEFAULT_NON_SPEECH_LABEL,
    )
    parser.add_argument(
        "--output-encoding",
        default="utf-8",
        help="What encoding to use for writing output subtitles "
        '(default=utf-8). Can indicate "same" to use same '
        "encoding as that of the input.",
    )
    parser.add_argument(
        "--reference-encoding",
        help="What encoding to use for reading / writing reference subtitles "
        "(if applicable, default=infer).",
    )
    parser.add_argument(
        "--vad",
        choices=[
            "subs_then_webrtc",
            "webrtc",
            "subs_then_auditok",
            "auditok",
            "subs_then_silero",
            "silero",
        ],
        default=None,
        help="Which voice activity detector to use for speech extraction "
        "(if using video / audio as a reference, default={}).".format(DEFAULT_VAD),
    )
    parser.add_argument(
        "--no-fix-framerate",
        action="store_true",
        help="If specified, subsync will not attempt to correct a framerate "
        "mismatch between reference and subtitles.",
    )
    parser.add_argument(
        "--serialize-speech",
        action="store_true",
        help="If specified, serialize reference speech to a numpy array.",
    )
    parser.add_argument(
        "--extract-subs-from-stream",
        "--extract-subtitles-from-stream",
        default=None,
        help="If specified, do not attempt sync; instead, just extract subtitles"
        " from the specified stream using the reference.",
    )
    parser.add_argument(
        "--suppress-output-if-offset-less-than",
        type=float,
        default=None,
        help="If specified, do not produce output if offset below provided threshold.",
    )
    parser.add_argument(
        "--ffmpeg-path",
        "--ffmpegpath",
        default=None,
        help="Where to look for ffmpeg and ffprobe. Uses the system PATH by default.",
    )
    parser.add_argument(
        "--log-dir-path",
        default=None,
        help=(
            "If provided, will save log file ffsubsync.log to this path "
            "(must be an existing directory)."
        ),
    )
    parser.add_argument(
        "--gss",
        action="store_true",
        # The trailing space keeps the two concatenated literals from
        # rendering as "findthe".
        help="If specified, use golden-section search to try to find "
        "the optimal framerate ratio between video and subtitles.",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="If specified, refuse to parse srt files with formatting issues.",
    )
    # Internal switches, hidden from --help.
    parser.add_argument("--vlc-mode", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--gui-mode", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--skip-sync", action="store_true", help=argparse.SUPPRESS)
"url": RELEASE_URL, }, ], } ] @Gooey( program_name=PROJECT_NAME, image_dir=os.path.join(os.environ[SUBSYNC_RESOURCES_ENV_MAGIC], "img"), menu=_menu, tabbed_groups=True, progress_regex=r"(\d+)%", hide_progress_msg=True, ) def make_parser(): description = DESCRIPTION if update_available(): description += ( "\nUpdate available! Please go to " '"File" -> "Download latest release"' " to update FFsubsync." ) parser = GooeyParser(description=description) main_group = parser.add_argument_group("Basic") main_group.add_argument( "reference", help="Reference (video or subtitles file) to which to synchronize input subtitles.", widget="FileChooser", ) main_group.add_argument("srtin", help="Input subtitles file", widget="FileChooser") main_group.add_argument( "-o", "--srtout", help="Output subtitles file (default=${srtin}.synced.srt).", widget="FileSaver", ) advanced_group = parser.add_argument_group("Advanced") # TODO: these are shared between gui and cli; don't duplicate this code advanced_group.add_argument( "--merge-with-reference", "--merge", action="store_true", help="Merge reference subtitles with synced output subtitles.", ) advanced_group.add_argument( "--make-test-case", "--create-test-case", action="store_true", help="If specified, create a test archive a few KiB in size " "to send to the developer as a debugging aid.", ) advanced_group.add_argument( "--reference-stream", "--refstream", "--reference-track", "--reftrack", default=None, help="Which stream/track in the video file to use as reference, " "formatted according to ffmpeg conventions. 
def main():
    """Entry point of the GUI: parse the arguments and run the sync.

    Returns:
        The integer ``retval`` entry of the result dict produced by ``run``,
        suitable for passing to ``sys.exit``.
    """
    parser = make_parser()
    _ = parser.parse_args()  # Fool Gooey into presenting the simpler menu
    add_cli_only_args(parser)
    args = parser.parse_args()
    args.gui_mode = True
    # run() returns a result dict; handing the dict itself to sys.exit()
    # would print it and exit with status 1 even after a successful sync.
    return run(args)["retval"]
class GenericSubtitlesFile:
    """A list of ``GenericSubtitle`` entries plus the file-level properties.

    Besides the subtitles, the object remembers the format and encoding
    they were read with and, optionally, the styles, fonts and info that
    are copied onto a ``pysubs2.SSAFile`` when writing.
    """

    def __init__(self, subs: List[GenericSubtitle], *_, **kwargs: Any):
        """Store the subtitles and their file-level properties.

        Args:
            subs: The subtitles.
            **kwargs: ``sub_format`` and ``encoding`` are required;
                ``styles``, ``fonts_opaque`` and ``info`` are optional.

        Raises:
            ValueError: If ``sub_format`` or ``encoding`` is missing.
        """
        sub_format: str = cast(str, kwargs.pop("sub_format", None))
        if sub_format is None:
            raise ValueError("format must be specified")
        encoding: str = cast(str, kwargs.pop("encoding", None))
        if encoding is None:
            raise ValueError("encoding must be specified")
        self.subs_: List[GenericSubtitle] = subs
        self._sub_format: str = sub_format
        self._encoding: str = encoding
        self._styles: Optional[Dict[str, pysubs2.SSAStyle]] = kwargs.pop("styles", None)
        self._fonts_opaque: Optional[Dict[str, Any]] = kwargs.pop("fonts_opaque", None)
        self._info: Optional[Dict[str, str]] = kwargs.pop("info", None)

    def set_encoding(self, encoding: str) -> "GenericSubtitlesFile":
        """Set the output encoding in place; ``"same"`` leaves it unchanged."""
        if encoding != "same":
            self._encoding = encoding
        return self

    def __len__(self) -> int:
        return len(self.subs_)

    def __getitem__(self, item: int) -> GenericSubtitle:
        return self.subs_[item]

    def __iter__(self) -> Iterator[GenericSubtitle]:
        return iter(self.subs_)

    def clone_props_for_subs(
        self, new_subs: List[GenericSubtitle]
    ) -> "GenericSubtitlesFile":
        """Return a new file holding ``new_subs`` with this file's properties."""
        return GenericSubtitlesFile(
            new_subs,
            sub_format=self._sub_format,
            encoding=self._encoding,
            styles=self._styles,
            fonts_opaque=self._fonts_opaque,
            info=self._info,
        )

    def gen_raw_resolved_subs(self):
        """Yield the result of ``resolve_inner_timestamps()`` for each subtitle."""
        for sub in self.subs_:
            yield sub.resolve_inner_timestamps()

    def offset(self, td: timedelta) -> "GenericSubtitlesFile":
        """Return a copy in which every start and end is shifted by ``td``."""
        offset_subs = [
            GenericSubtitle(sub.start + td, sub.end + td, sub.inner)
            for sub in self.subs_
        ]
        return self.clone_props_for_subs(offset_subs)

    def write_file(self, fname: Optional[str]) -> None:
        """Serialize the subtitles and write them to ``fname`` or stdout.

        The output format is taken from the extension of ``fname``, or from
        the input format when ``fname`` is None.

        Args:
            fname: Destination path; a falsy value writes to stdout.

        Raises:
            NotImplementedError: If the output format is not supported for
                the stored input format.
        """
        # TODO: converter to go between self.subs_format and out_format
        if fname is None:
            out_format = self._sub_format
        else:
            out_format = os.path.splitext(fname)[-1][1:]
        subs = list(self.gen_raw_resolved_subs())
        if self._sub_format in ("ssa", "ass", "vtt"):
            ssaf = pysubs2.SSAFile()
            ssaf.events = subs
            if self._styles is not None:
                ssaf.styles = self._styles
            if self._info is not None:
                ssaf.info = self._info
            if self._fonts_opaque is not None:
                ssaf.fonts_opaque = self._fonts_opaque
            to_write = ssaf.to_string(out_format)
        elif self._sub_format == "srt" and out_format in ("ssa", "ass", "vtt"):
            to_write = pysubs2.SSAFile.from_string(srt.compose(subs)).to_string(
                out_format
            )
        elif out_format == "srt":
            to_write = srt.compose(subs)
        else:
            raise NotImplementedError("unsupported output format: %s" % out_format)
        # When writing to stdout we only borrow its descriptor: closefd=False
        # keeps it open for the rest of the process. A real path must use
        # closefd=True, which is what bool(fname) yields.
        with open(fname or sys.stdout.fileno(), "wb", closefd=bool(fname)) as f:
            f.write(to_write.encode(self._encoding))
logger = logging.getLogger(__name__)

invphi = (math.sqrt(5) - 1) / 2  # 1 / phi
invphi2 = (3 - math.sqrt(5)) / 2  # 1 / phi^2


def gss(f, a, b, tol=1e-4):
    """Golden-section search.

    Given a function f with a single local minimum in
    the interval [a,b], gss returns a subset interval
    [c,d] that contains the minimum with d-c <= tol.

    ``f`` may take either one argument ``x`` or two arguments
    ``(x, is_last_iter)``; the flag is True for the evaluations made in the
    final iteration.

    Args:
        f: Objective function to minimize.
        a: One end of the search interval.
        b: The other end of the search interval (the two may be swapped).
        tol: Maximum width of the returned interval.

    Returns:
        A tuple ``(lo, hi)`` bounding the minimum. The input interval is
        returned unchanged when it is already no wider than ``tol``.

    Example:
    >>> f = lambda x: (x-2)**2
    >>> a = 1
    >>> b = 5
    >>> tol = 1e-5
    >>> (c,d) = gss(f, a, b, tol)
    >>> print(c, d)
    1.9999959837979107 2.0000050911830893
    """
    (a, b) = (min(a, b), max(a, b))
    h = b - a
    if h <= tol:
        return a, b

    # Required steps to achieve tolerance
    n = int(math.ceil(math.log(tol / h) / math.log(invphi)))
    logger.info(
        "About to perform %d iterations of golden section search to find the best framerate",
        n,
    )

    def f_wrapped(x, is_last_iter):
        # Prefer the two-argument form and fall back to f(x).
        # NOTE(review): a TypeError raised inside f itself also triggers the
        # fallback, so f may be evaluated twice in that case.
        try:
            return f(x, is_last_iter)
        except TypeError:
            return f(x)

    c = a + invphi2 * h
    d = a + invphi * h
    yc = f_wrapped(c, n == 1)
    yd = f_wrapped(d, n == 1)

    for k in range(n - 1):
        if yc < yd:
            b = d
            d = c
            yd = yc
            h = invphi * h
            c = a + invphi2 * h
            yc = f_wrapped(c, k == n - 2)
        else:
            a = c
            c = d
            yc = yd
            h = invphi * h
            d = a + invphi * h
            # Go through f_wrapped here as well; calling f directly raised
            # TypeError for single-argument objectives.
            yd = f_wrapped(d, k == n - 2)

    if yc < yd:
        return a, d
    else:
        return c, b
be precise). Both are BSD licensed and allow for this sort of thing; attribution is given as a comment above each class. License reproduced below: BSD 3-Clause License Copyright (c) 2007-2022 The scikit-learn developers. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
""" from collections import defaultdict from itertools import islice from typing import Any, Callable, Optional from typing_extensions import Protocol class TransformerProtocol(Protocol): fit: Callable[..., "TransformerProtocol"] transform: Callable[[Any], Any] # Author: Gael Varoquaux # License: BSD 3 clause class TransformerMixin(TransformerProtocol): """Mixin class for all transformers.""" def fit_transform(self, X: Any, y: Optional[Any] = None, **fit_params: Any) -> Any: """ Fit to data, then transform it. Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X. Parameters ---------- X : ndarray of shape (n_samples, n_features) Training set. y : ndarray of shape (n_samples,), default=None Target values. **fit_params : dict Additional fit parameters. Returns ------- X_new : ndarray array of shape (n_samples, n_features_new) Transformed array. """ # non-optimized default implementation; override when a better # method is possible for a given clustering algorithm if y is None: # fit method of arity 1 (unsupervised transformation) return self.fit(X, **fit_params).transform(X) else: # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) # Author: Edouard Duchesnay # Gael Varoquaux # Virgile Fritsch # Alexandre Gramfort # Lars Buitinck # License: BSD class Pipeline: def __init__(self, steps, verbose=False): self.steps = steps self.verbose = verbose self._validate_steps() def _validate_steps(self): names, estimators = zip(*self.steps) # validate estimators transformers = estimators[:-1] estimator = estimators[-1] for t in transformers: if t is None or t == "passthrough": continue if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( t, "transform" ): raise TypeError( "All intermediate steps should be " "transformers and implement fit and transform " "or be the string 'passthrough' " "'%s' (type %s) doesn't" % (t, type(t)) ) # We allow last estimator to be 
None as an identity transformation if ( estimator is not None and estimator != "passthrough" and not hasattr(estimator, "fit") ): raise TypeError( "Last step of Pipeline should implement fit " "or be the string 'passthrough'. " "'%s' (type %s) doesn't" % (estimator, type(estimator)) ) def _iter(self, with_final=True, filter_passthrough=True): """ Generate (idx, (name, trans)) tuples from self.steps When filter_passthrough is True, 'passthrough' and None transformers are filtered out. """ stop = len(self.steps) if not with_final: stop -= 1 for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): if not filter_passthrough: yield idx, name, trans elif trans is not None and trans != "passthrough": yield idx, name, trans def __len__(self) -> int: """ Returns the length of the Pipeline """ return len(self.steps) def __getitem__(self, ind): """Returns a sub-pipeline or a single esimtator in the pipeline Indexing with an integer will return an estimator; using a slice returns another Pipeline instance which copies a slice of this Pipeline. This copy is shallow: modifying (or fitting) estimators in the sub-pipeline will affect the larger pipeline and vice-versa. However, replacing a value in `step` will not affect a copy. 
""" if isinstance(ind, slice): if ind.step not in (1, None): raise ValueError("Pipeline slicing only supports a step of 1") return self.__class__(self.steps[ind]) try: name, est = self.steps[ind] except TypeError: # Not an int, try get step by name return self.named_steps[ind] return est @property def _estimator_type(self): return self.steps[-1][1]._estimator_type @property def named_steps(self): return dict(self.steps) @property def _final_estimator(self): estimator = self.steps[-1][1] return "passthrough" if estimator is None else estimator def _log_message(self, step_idx): if not self.verbose: return None name, step = self.steps[step_idx] return "(step %d of %d) Processing %s" % (step_idx + 1, len(self.steps), name) # Estimator interface def _fit(self, X, y=None, **fit_params): # shallow copy of steps - this should really be steps_ self.steps = list(self.steps) self._validate_steps() fit_params_steps = {name: {} for name, step in self.steps if step is not None} for pname, pval in fit_params.items(): if "__" not in pname: raise ValueError( "Pipeline.fit does not accept the {} parameter. " "You can pass parameters to specific steps of your " "pipeline using the stepname__parameter format, e.g. " "`Pipeline.fit(X, y, logisticregression__sample_weight" "=sample_weight)`.".format(pname) ) step, param = pname.split("__", 1) fit_params_steps[step][param] = pval for step_idx, name, transformer in self._iter( with_final=False, filter_passthrough=False ): if transformer is None or transformer == "passthrough": continue # Fit or load from cache the current transformer X, fitted_transformer = _fit_transform_one( transformer, X, y, None, **fit_params_steps[name] ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. 
def fit(self, X, y=None, **fit_params):
    """Fit every step of the pipeline, finishing with the final estimator.

    The intermediate steps are fitted and applied through ``self._fit``;
    the data they produce is then used to fit the final estimator, unless
    that final step is ``'passthrough'``.

    Parameters
    ----------
    X : iterable
        Training data, in the form expected by the first step.

    y : iterable, default=None
        Training targets, forwarded to every step.

    **fit_params : dict of string -> object
        Extra arguments for the ``fit`` methods of individual steps.
        Parameter ``p`` of step ``s`` is passed as ``s__p``.

    Returns
    -------
    self : Pipeline
        This estimator, to allow call chaining.
    """
    transformed, final_params = self._fit(X, y, **fit_params)
    final_step = self._final_estimator
    if final_step != "passthrough":
        final_step.fit(transformed, y, **final_params)
    return self
def _name_estimators(estimators):
    """Pair each estimator with an automatically generated name.

    A string estimator is named after itself; any other object is named
    after its lowercased type name. Names that occur more than once get a
    ``-1``, ``-2``, ... suffix in order of appearance; unique names are
    left untouched.

    Returns a list of ``(name, estimator)`` tuples.
    """
    labels = []
    for candidate in estimators:
        if isinstance(candidate, str):
            labels.append(candidate)
        else:
            labels.append(type(candidate).__name__.lower())

    occurrences = defaultdict(int)
    for label in labels:
        occurrences[label] += 1
    # only names seen several times need a numeric suffix
    repeated = {label: count for label, count in occurrences.items() if count != 1}

    # walk backwards so that the last duplicate receives the highest number
    for position in reversed(range(len(estimators))):
        base = labels[position]
        if base in repeated:
            labels[position] = base + "-%d" % repeated[base]
            repeated[base] -= 1

    return list(zip(labels, estimators))
def _fit_transform_one(transformer, X, y, weight, **fit_params):
    """Fit ``transformer`` on ``X`` / ``y`` and transform ``X`` with it.

    ``fit_transform`` is used when the transformer provides it; otherwise
    ``fit`` followed by ``transform`` is called. When ``weight`` is not
    ``None`` the transformed output is multiplied by it.

    Returns a ``(transformed, transformer)`` pair.
    """
    if not hasattr(transformer, "fit_transform"):
        fitted = transformer.fit(X, y, **fit_params)
        output = fitted.transform(X)
    else:
        output = transformer.fit_transform(X, y, **fit_params)

    scaled = output if weight is None else output * weight
    return scaled, transformer
def _make_auditok_detector(
    sample_rate: int, frame_rate: int, non_speech_label: float
) -> Callable[[bytes], np.ndarray]:
    """Build a speech detector backed by the optional ``auditok`` package.

    Parameters
    ----------
    sample_rate : int
        Number of analysis windows per unit of time: each window handed to
        auditok lasts ``1.0 / sample_rate`` (``block_dur``) and covers
        ``frame_rate // sample_rate`` audio frames.
    frame_rate : int
        Passed to auditok as the sampling rate of the raw audio.
    non_speech_label : float
        Used to compute the step written just past the end of every
        detected token (``non_speech_label - 1.0``).

    Returns
    -------
    Callable[[bytes], np.ndarray]
        A closure turning a buffer of raw audio into a 1-D float array
        with one value per window, clipped to the range [0.0, 1.0].

    Raises
    ------
    ImportError
        Re-raised, after logging an explanatory message, when ``auditok``
        cannot be imported.
    """
    try:
        # imported lazily so the dependency is only needed when this
        # detector is actually requested
        from auditok import (
            BufferAudioSource,
            ADSFactory,
            AudioEnergyValidator,
            StreamTokenizer,
        )
    except ImportError as e:
        logger.error(
            """Error: auditok not installed! Consider installing it with `pip install auditok`. Note that auditok is GPLv3 licensed, which means that successfully importing it at runtime creates a derivative work that is GPLv3 licensed. For personal use this is fine, but note that any commercial use that relies on auditok must be open source as per the GPLv3!* *Not legal advice. Consult with a lawyer. """
        )
        raise e
    # sample width given to auditok; presumably the input is 16-bit mono
    # PCM -- TODO confirm against the code producing the audio bytes
    bytes_per_frame = 2
    frames_per_window = frame_rate // sample_rate
    validator = AudioEnergyValidator(sample_width=bytes_per_frame, energy_threshold=50)
    # NOTE(review): the three length limits below are expressed as a number
    # of windows (0.2, 5 and 0.25 times ``sample_rate``); confirm the units
    # against the auditok StreamTokenizer documentation
    tokenizer = StreamTokenizer(
        validator=validator,
        min_length=0.2 * sample_rate,
        max_length=int(5 * sample_rate),
        max_continuous_silence=0.25 * sample_rate,
    )

    def _detect(asegment: bytes) -> np.ndarray:
        """Label every window of ``asegment`` using the auditok tokenizer."""
        asource = BufferAudioSource(
            data_buffer=asegment,
            sampling_rate=frame_rate,
            sample_width=bytes_per_frame,
            channels=1,
        )
        ads = ADSFactory.ads(audio_source=asource, block_dur=1.0 / sample_rate)
        ads.open()
        tokens = tokenizer.tokenize(ads)
        # number of windows needed to cover the buffer (ceiling division)
        length = (
            len(asegment) // bytes_per_frame + frames_per_window - 1
        ) // frames_per_window
        # one extra slot so that ``token[2] + 1`` is a valid index even for
        # a token ending on the last window; it is dropped again below
        media_bstring = np.zeros(length + 1)
        for token in tokens:
            # presumably token[1] / token[2] are the first and last window
            # index of a detected segment -- verify against auditok
            media_bstring[token[1]] = 1.0
            media_bstring[token[2] + 1] = non_speech_label - 1.0
        # the array holds step changes: the running sum turns them into
        # per-window values, which are then clipped into [0.0, 1.0]
        return np.clip(np.cumsum(media_bstring)[:-1], 0.0, 1.0)

    return _detect
class ComputeSpeechFrameBoundariesMixin:
    """Mixin recording the first and last frame classified as speech.

    A frame counts as speech when its value is strictly greater than 0.5.
    Both boundaries stay ``None`` until ``fit_boundaries`` has seen at
    least one such frame.
    """

    def __init__(self) -> None:
        self.start_frame_: Optional[int] = None
        self.end_frame_: Optional[int] = None

    @property
    def num_frames(self) -> Optional[int]:
        """Difference between the last and first speech frame, if known."""
        first, last = self.start_frame_, self.end_frame_
        if first is not None and last is not None:
            return last - first
        return None

    def fit_boundaries(
        self, speech_frames: np.ndarray
    ) -> "ComputeSpeechFrameBoundariesMixin":
        """Store the indices of the first and last speech frame.

        The boundaries are left unchanged when no frame exceeds 0.5.
        Returns ``self``.
        """
        speech_positions = np.nonzero(speech_frames > 0.5)[0]
        if len(speech_positions) == 0:
            return self
        self.start_frame_ = int(speech_positions.min())
        self.end_frame_ = int(speech_positions.max())
        return self
embedded_subs_times.append(speech_step.max_time_) if len(embedded_subs) == 0: if self.ref_stream is None: error_msg = "Video file appears to lack subtitle stream" else: error_msg = "Stream {} not found".format(self.ref_stream) raise ValueError(error_msg) # use longest set of embedded subs subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))] self.video_speech_results_ = subs_to_use.subtitle_speech_results_ def fit(self, fname: str, *_) -> "VideoSpeechTransformer": if "subs" in self.vad and ( self.ref_stream is None or self.ref_stream.startswith("0:s:") ): try: logger.info("Checking video for subtitles stream...") self.try_fit_using_embedded_subs(fname) logger.info("...success!") return self except Exception as e: logger.info(e) try: total_duration = ( float( ffmpeg.probe( fname, cmd=ffmpeg_bin_path( "ffprobe", self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path, ), )["format"]["duration"] ) - self.start_seconds ) except Exception as e: logger.warning(e) total_duration = None if "webrtc" in self.vad: detector = _make_webrtcvad_detector( self.sample_rate, self.frame_rate, self._non_speech_label ) elif "auditok" in self.vad: detector = _make_auditok_detector( self.sample_rate, self.frame_rate, self._non_speech_label ) elif "silero" in self.vad: detector = _make_silero_detector( self.sample_rate, self.frame_rate, self._non_speech_label ) else: raise ValueError("unknown vad: %s" % self.vad) media_bstring: List[np.ndarray] = [] ffmpeg_args = [ ffmpeg_bin_path( "ffmpeg", self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path ) ] if self.start_seconds > 0: ffmpeg_args.extend( [ "-ss", str(timedelta(seconds=self.start_seconds)), ] ) ffmpeg_args.extend(["-loglevel", "fatal", "-nostdin", "-i", fname]) if self.ref_stream is not None and self.ref_stream.startswith("0:a:"): ffmpeg_args.extend(["-map", self.ref_stream]) ffmpeg_args.extend( [ "-f", "s16le", "-ac", "1", "-acodec", "pcm_s16le", "-af", "aresample=async=1", "-ar", str(self.frame_rate), "-", ] ) process = 
subprocess.Popen(ffmpeg_args, **subprocess_args(include_stdout=True)) bytes_per_frame = 2 frames_per_window = bytes_per_frame * self.frame_rate // self.sample_rate windows_per_buffer = 10000 simple_progress = 0.0 redirect_stderr = None tqdm_extra_args = {} should_print_redirected_stderr = self.gui_mode if self.gui_mode: try: from contextlib import redirect_stderr # type: ignore tqdm_extra_args["file"] = sys.stdout except ImportError: should_print_redirected_stderr = False if redirect_stderr is None: @contextmanager def redirect_stderr(enter_result=None): yield enter_result assert redirect_stderr is not None pbar_output = io.StringIO() with redirect_stderr(pbar_output): with tqdm.tqdm( total=total_duration, disable=self.vlc_mode, **tqdm_extra_args ) as pbar: while True: in_bytes = process.stdout.read( frames_per_window * windows_per_buffer ) if not in_bytes: break newstuff = len(in_bytes) / float(bytes_per_frame) / self.frame_rate if ( total_duration is not None and simple_progress + newstuff > total_duration ): newstuff = total_duration - simple_progress simple_progress += newstuff pbar.update(newstuff) if self.vlc_mode and total_duration is not None: print("%d" % int(simple_progress * 100.0 / total_duration)) sys.stdout.flush() if should_print_redirected_stderr: assert self.gui_mode # no need to flush since we pass -u to do unbuffered output for gui mode print(pbar_output.read()) if "silero" not in self.vad: in_bytes = np.frombuffer(in_bytes, np.uint8) media_bstring.append(detector(in_bytes)) process.wait() if len(media_bstring) == 0: raise ValueError( "Unable to detect speech. " "Perhaps try specifying a different stream / track, or a different vad." 
# Opening bracket -> matching closing bracket for text treated as metadata.
_PAIRED_NESTER: Dict[str, str] = {
    "(": ")",
    "{": "}",
    "[": "]",
    # FIXME: '<' / '>' is left out on purpose: it sometimes gives false
    # positives when the subtitle text contains html tags.
}


# TODO: need way better metadata detector
def _is_metadata(content: str, is_beginning_or_end: bool) -> bool:
    """Guess whether a subtitle's text is metadata rather than speech.

    The text is considered metadata when, once stripped, it is empty or is
    entirely wrapped in one of the bracket pairs of ``_PAIRED_NESTER``.
    For the first or last subtitle (``is_beginning_or_end``) it is also
    metadata when it mentions "english" (case-insensitive) or contains
    " - ".
    """
    text = content.strip()
    if not text:
        return True

    closing = _PAIRED_NESTER.get(text[0])
    if closing is not None and text[-1] == closing:
        return True

    if not is_beginning_or_end:
        return False
    return "english" in text.lower() or " - " in text
class DeserializeSpeechTransformer(TransformerMixin):
    """Transformer loading precomputed speech results from a numpy file.

    Parameters
    ----------
    non_speech_label : float
        Value written over every loaded entry that is below 1.0.
    """

    def __init__(self, non_speech_label: float) -> None:
        super(DeserializeSpeechTransformer, self).__init__()
        self._non_speech_label: float = non_speech_label
        self.deserialized_speech_results_: Optional[np.ndarray] = None

    def fit(self, fname, *_) -> "DeserializeSpeechTransformer":
        """Load the speech array stored in ``fname``.

        ``fname`` is handed to ``np.load``. When the result exposes a
        ``files`` attribute (an ``.npz`` archive) the array stored under
        the key ``"speech"`` is used; otherwise the loaded object itself
        is used. Entries below 1.0 are then replaced, in place, by the
        non-speech label.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If an archive is loaded that has no ``"speech"`` entry.
        """
        loaded = np.load(fname)
        if hasattr(loaded, "files"):
            # An archive keeps its underlying file open until it is closed,
            # so release it as soon as the array has been read into memory
            # (also when the expected entry is missing).
            try:
                if "speech" not in loaded.files:
                    raise ValueError(
                        'could not find "speech" array in '
                        "serialized file; only contains: %s" % loaded.files
                    )
                speech = loaded["speech"]
            finally:
                loaded.close()
        else:
            speech = loaded
        speech[speech < 1.0] = self._non_speech_label
        self.deserialized_speech_results_ = speech
        return self

    def transform(self, *_) -> np.ndarray:
        """Return the array loaded by ``fit`` (which must have been called)."""
        assert self.deserialized_speech_results_ is not None
        return self.deserialized_speech_results_
def __init__(
    self,
    fmt: str = "srt",
    encoding: str = "infer",
    caching: bool = False,
    max_subtitle_seconds: Optional[int] = None,
    start_seconds: int = 0,
    skip_ssa_info: bool = False,
    strict: bool = False,
) -> None:
    """Store the parsing options; nothing is read until ``fit`` is called.

    Parameters
    ----------
    fmt : str
        Subtitle format, kept as ``sub_format``.
    encoding : str
        Text encoding of the input; ``"infer"`` asks ``fit`` to detect it.
    caching : bool
        When true, ``fit`` returns early if called again for the file name
        it last parsed.
    max_subtitle_seconds : int, optional
        Forwarded to the subtitle preprocessing step by ``fit``.
    start_seconds : int
        Forwarded to the subtitle preprocessing step by ``fit``.
    skip_ssa_info : bool
        When true, the ``info`` section of SSA-style files is not kept.
    strict : bool
        When false, srt parsing is asked to ignore errors.
    """
    # Name the class explicitly: ``super(self.__class__, self)`` resolves
    # to the *subclass* when this initializer runs for a subclass
    # instance, which makes it call itself until the recursion limit.
    super(GenericSubtitleParser, self).__init__()
    self.sub_format: str = fmt
    self.encoding: str = encoding
    self.caching: bool = caching
    self.fit_fname: Optional[str] = None
    self.detected_encoding_: Optional[str] = None
    self.max_subtitle_seconds: Optional[int] = max_subtitle_seconds
    self.start_seconds: int = start_seconds
    # FIXME: hack to get tests to pass; remove
    self._skip_ssa_info: bool = skip_ssa_info
    self._strict: bool = strict
def make_subtitle_parser(
    fmt: str,
    encoding: str = DEFAULT_ENCODING,
    caching: bool = False,
    max_subtitle_seconds: int = DEFAULT_MAX_SUBTITLE_SECONDS,
    start_seconds: int = DEFAULT_START_SECONDS,
    **kwargs: Any,
) -> GenericSubtitleParser:
    """Create a ``GenericSubtitleParser`` using the project-wide defaults.

    Only the ``skip_ssa_info`` and ``strict`` entries of ``kwargs`` are
    used (both default to ``False``); any other extra keyword argument is
    ignored.
    """
    parser_options = dict(
        fmt=fmt,
        encoding=encoding,
        caching=caching,
        max_subtitle_seconds=max_subtitle_seconds,
        start_seconds=start_seconds,
    )
    for flag in ("skip_ssa_info", "strict"):
        parser_options[flag] = kwargs.get(flag, False)
    return GenericSubtitleParser(**parser_options)
class SubtitleShifter(SubsMixin, TransformerMixin):
    """Transformer offsetting every subtitle by a fixed amount of time.

    Parameters
    ----------
    td_seconds : timedelta or number
        The offset to apply. Anything that is not already a ``timedelta``
        is interpreted as a number of seconds.
    """

    def __init__(self, td_seconds):
        # ``super(SubsMixin, self)`` would start the method lookup *after*
        # SubsMixin and therefore skip any initializer that mixin defines;
        # start from this class so the whole chain of bases is covered.
        super(SubtitleShifter, self).__init__()
        if not isinstance(td_seconds, timedelta):
            self.td_seconds = timedelta(seconds=td_seconds)
        else:
            self.td_seconds = td_seconds

    def fit(self, subs: GenericSubtitlesFile, *_):
        """Compute the shifted subtitles via ``subs.offset``; returns self."""
        self.subs_ = subs.offset(self.td_seconds)
        return self

    def transform(self, *_):
        """Return the shifted subtitles computed by ``fit``."""
        return self.subs_
def make_version_tuple(vstr=None):
    """Turn a version string into a tuple of ints that can be compared.

    A leading ``"v"`` is dropped, everything from the first ``"+"`` on is
    ignored, and the remaining text is split on ``"."``. Conversion stops
    at the first component that is not an integer, so ``"1.2.rc1"`` gives
    ``(1, 2)`` and an empty string gives ``()``.

    Parameters
    ----------
    vstr : str, optional
        The version string; defaults to the module-level ``__version__``.

    Returns
    -------
    tuple of int
    """
    if vstr is None:
        vstr = __version__
    # ``startswith`` rather than ``vstr[0]`` so an empty string is accepted
    if vstr.startswith("v"):
        vstr = vstr[1:]
    components = []
    for component in vstr.split("+")[0].split("."):
        try:
            components.append(int(component))
        except ValueError:
            break
    return tuple(components)


def update_available():
    """Tell whether the latest published release is newer than this one.

    The release information is fetched from ``API_RELEASE_URL`` with a one
    second timeout. The check is best effort: any network failure, an
    unsuccessful HTTP status or an unexpected payload yields ``False``
    instead of raising.

    Returns
    -------
    bool
        ``True`` only if the ``tag_name`` of the latest release compares
        greater than the running version.
    """
    import requests
    from requests.exceptions import RequestException
    from .constants import API_RELEASE_URL

    try:
        resp = requests.get(API_RELEASE_URL, timeout=1)
        # check the status first: an error response need not contain JSON
        if not resp.ok:
            return False
        latest_vstr = resp.json()["tag_name"]
    except (RequestException, KeyError, TypeError, ValueError):
        # RequestException covers timeouts as well as connection errors
        # (e.g. no network); ValueError covers a body that is not JSON,
        # KeyError / TypeError a JSON body without a "tag_name" entry.
        return False
    return make_version_tuple(get_version()) < make_version_tuple(latest_vstr)
================================================ FILE: gui/build-macos.sh ================================================ #!/usr/bin/env bash python3 -m PyInstaller --clean -y --dist ./dist/macos build.spec # ref: https://github.com/chriskiehl/Gooey/issues/259#issuecomment-522432026 mkdir -p ./dist/macos/Contents ================================================ FILE: gui/build-windows.sh ================================================ #!/usr/bin/env bash nbits=${1:-64} tag="python3" if [[ "$nbits" == 32 ]]; then tag="${tag}-32bit" fi docker run -v "$(pwd):/src/" -v "$(pwd)/..:/ffsubsync/" --entrypoint /bin/sh "cdrx/pyinstaller-windows:${tag}" -c "pip install -e /ffsubsync && /ffsubsync/gui/entrypoint-windows.sh" rm -r "./dist/win${nbits}" mv ./dist/windows "./dist/win${nbits}" ================================================ FILE: gui/build.spec ================================================ # -*- mode: python -*- import os import platform import gooey root = '..' hookspath = None if platform.system() == 'Windows': root = '/ffsubsync' hookspath = [os.path.join(os.curdir, 'hooks')] ffmpeg_bin = os.path.join(root, 'resources/ffmpeg-bin') datas = [(os.path.join(root, 'resources/img/program_icon.png'), './img')] datas.append((os.path.join(root, 'resources/img/config_icon.png'), './img')) datas.append((os.path.join(root, '__version__'), '.')) if platform.system() == 'Darwin': ffmpeg_bin = os.path.join(ffmpeg_bin, 'macos') elif platform.system() == 'Windows': arch_bits = int(platform.architecture()[0][:2]) ffmpeg_bin = os.path.join(ffmpeg_bin, 'win{}'.format(arch_bits)) if arch_bits == 64: datas.append((os.path.join(root, 'resources/lib/win64/VCRUNTIME140_1.dll'), '.')) else: raise Exception('ffmpeg not available for {}'.format(platform.system())) gooey_root = os.path.dirname(gooey.__file__) gooey_languages = Tree(os.path.join(gooey_root, 'languages'), prefix = 'gooey/languages') gooey_images = Tree(os.path.join(gooey_root, 'images'), prefix = 'gooey/images') a = 
Analysis([os.path.join(os.curdir, 'ffsubsync-gui.py')], datas=datas, hiddenimports=['pkg_resources.py2_warn'], # ref: https://github.com/pypa/setuptools/issues/1963 hookspath=hookspath, runtime_hooks=None, binaries=[(ffmpeg_bin, 'ffmpeg-bin')], ) pyz = PYZ(a.pure) # runtime options to pass to interpreter -- '-u' is for unbuffered io options = [('u', None, 'OPTION')] exe = EXE(pyz, a.scripts, a.binaries, a.zipfiles, a.datas, options, gooey_languages, # Add them in to collected files gooey_images, # Same here. name='FFsubsync', debug=False, strip=None, upx=True, console=False, windowed=True, icon=os.path.join(root, 'resources', 'img', 'program_icon.ico') ) if platform.system() == 'Darwin': # info_plist = {'addition_prop': 'additional_value'} info_plist = {} app = BUNDLE(exe, icon=os.path.join(root, 'resources', 'img', 'program_icon.icns'), name='FFsubsync.app', bundle_identifier=None, info_plist=info_plist ) ================================================ FILE: gui/entrypoint-windows.sh ================================================ #!/bin/bash # Fail on errors. set -e # Make sure .bashrc is sourced . /root/.bashrc # Allow the workdir to be set using an env var. # Useful for CI pipiles which use docker for their build steps # and don't allow that much flexibility to mount volumes WORKDIR=${SRCDIR:-/src} # # In case the user specified a custom URL for PYPI, then use # that one, instead of the default one. # if [[ "$PYPI_URL" != "https://pypi.python.org/" ]] || \ [[ "$PYPI_INDEX_URL" != "https://pypi.python.org/simple" ]]; then # the funky looking regexp just extracts the hostname, excluding port # to be used as a trusted-host. 
#!/usr/bin/env bash
# Package the PyInstaller-built macOS app bundle into a .dmg with create-dmg.
set -Eeuxo pipefail

BASE=.
DIST="$BASE/dist"
# build-macos.sh invokes PyInstaller with `--dist ./dist/macos`, so the
# bundle is expected there rather than directly under ./dist.
APP_DIR="$DIST/macos"
BUILD="$BASE/build/dmg"
# The importable package is `ffsubsync`; there is no `subsync` package.
VERSION=$(python3 -c "from ffsubsync.version import __version__; print(__version__)")
# Must match the BUNDLE(name=...) value in build.spec.
APP="FFsubsync.app"
TARGET="$DIST/subsync-${VERSION}-mac-x86_64.dmg"

# Start from a clean staging directory and remove any stale image;
# `rm -f`/`rm -rf` are no-ops when the paths do not exist.
rm -rf "$BUILD"
rm -f "$TARGET"
mkdir -p "$BUILD"
cp -r "$APP_DIR/$APP" "$BUILD"

# The backtick-wrapped comment is a way to keep a disabled option inside a
# backslash-continued command line.
create-dmg \
    --volname "subsync installer" \
    `#--volicon "icon.icns"` \
    --window-pos 300 200 \
    --window-size 700 500 \
    --icon-size 150 \
    --icon "$APP" 200 200 \
    --hide-extension "$APP" \
    --app-drop-link 450 200 \
    --no-internet-enable \
    "$TARGET" "$BUILD"
def main(*_):
    """Tag the current commit with the next version.

    The version tuple reported by ``make_version_tuple()`` has its final
    component incremented by one; the dotted result is used as the name of a
    new git tag. Positional arguments are accepted and ignored.

    Returns 0 on success; a failing ``git tag`` raises
    ``subprocess.CalledProcessError``.
    """
    parts = [*make_version_tuple()]
    parts[-1] = parts[-1] + 1
    tag_name = '.'.join(map(str, parts))
    subprocess.check_output(['git', 'tag', tag_name])
    return 0
@pytest.mark.integration
@pytest.mark.parametrize("args,truth,should_detect_encoding", gen_synctest_configs())
def test_sync_matches_ground_truth(args, truth, should_detect_encoding):
    """Run a full sync for one configuration and compare against ground truth.

    The synced subtitles are written into a scratch directory (same extension
    as the first input subtitle file) that is removed afterwards whether or
    not the assertions pass.
    """
    scratch_dir = tempfile.mkdtemp()
    try:
        extension = os.path.splitext(args.srtin[0])[-1]
        args.srtout = os.path.join(scratch_dir, "test" + extension)
        args.skip_ssa_info = True
        result = ffsubsync.run(args)
        assert result["retval"] == 0
        assert timestamps_roughly_match(args.srtout, truth)
        # Encoding detection is only checked for configs that specify one.
        if should_detect_encoding is not None:
            found = detected_encoding(args.srtin[0])
            assert found == should_detect_encoding
    finally:
        shutil.rmtree(scratch_dir)
@pytest.mark.parametrize("start_seconds", [0, 2, 4, 6])
def test_start_seconds(start_seconds):
    """Check that ``start_seconds`` drops subtitles starting before the cutoff.

    A parser with ``start_seconds=0`` provides the baseline; its subtitles
    whose start is at or after the cutoff are compared pairwise with the
    output of a parser configured with the cutoff. Note that ``zip`` stops at
    the shorter sequence, so only the common prefix is compared.
    """
    baseline = GenericSubtitleParser(start_seconds=0)
    baseline.fit(BytesIO(fake_srt))
    trimmed = GenericSubtitleParser(start_seconds=start_seconds)
    trimmed.fit(BytesIO(fake_srt))
    cutoff = timedelta(seconds=start_seconds)
    expected = [sub for sub in baseline.subs_ if sub.start >= cutoff]
    for wanted, actual in zip(expected, trimmed.subs_):
        assert wanted == actual
@pytest.mark.parametrize(
    "sample_rate,start_seconds", itertools.product([10, 20, 100, 300], [0, 2, 4, 6])
)
def test_speech_extraction(sample_rate, start_seconds):
    """Check that each run of "speech" samples has the expected length.

    The parsed subtitles are converted to a boolean array by
    ``SubtitleSpeechTransformer``; the test locates the last index of every
    run of consecutive ``True`` values and asserts that the number of ``True``
    samples accumulated since the previous run equals the subtitle's rounded
    start/stop sample span.
    """
    parser = GenericSubtitleParser(start_seconds=start_seconds)
    extractor = SubtitleSpeechTransformer(
        sample_rate=sample_rate, start_seconds=start_seconds
    )
    pipe = make_pipeline(parser, extractor)
    # NOTE(review): presumably one flag per sample at `sample_rate` Hz --
    # confirm against SubtitleSpeechTransformer.
    bitstring = pipe.fit_transform(BytesIO(fake_srt)).astype(bool)
    # Neighbours of each sample, padded with False at the open end.
    bitstring_shifted_left = np.append(bitstring[1:], [False])
    bitstring_shifted_right = np.append([False], bitstring[:-1])
    # Running count of True samples up to and including each index.
    bitstring_cumsum = np.cumsum(bitstring)
    # `bitstring ^ bitstring_shifted_left` marks indices whose successor has a
    # different value; the cumsum comparison is True exactly where the sample
    # itself is True (the right-shifted cumsum lags by one element). Their
    # product is therefore nonzero only at the last index of each run of True.
    consec_ones_end_pos = np.nonzero(
        bitstring_cumsum
        * (bitstring ^ bitstring_shifted_left)
        * (bitstring_cumsum != np.cumsum(bitstring_shifted_right))
    )[0]
    prev = 0
    # Pairs the i-th run with the i-th subtitle; this assumes every subtitle
    # yields its own separate run -- TODO confirm for adjacent subtitles.
    for pos, sub in zip(consec_ones_end_pos, parser.subs_):
        start = int(round(sub.start.total_seconds() * sample_rate))
        duration = sub.end.total_seconds() - sub.start.total_seconds()
        stop = start + int(round(duration * sample_rate))
        # Difference of cumulative counts == number of True samples in this run.
        assert bitstring_cumsum[pos] - prev == stop - start
        prev = bitstring_cumsum[pos]
* https://github.com/python-versioneer/python-versioneer * Brian Warner * License: Public Domain * Compatible with: Python 3.6, 3.7, 3.8, 3.9, 3.10 and pypy3 * [![Latest Version][pypi-image]][pypi-url] * [![Build Status][travis-image]][travis-url] This is a tool for managing a recorded version number in distutils/setuptools-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. ## Quick Install * `pip install versioneer` to somewhere in your $PATH * add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) * run `versioneer install` in your source tree, commit the results * Verify version information with `python setup.py version` ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. 
For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes). The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. `_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. 
in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. 
The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. More can be found on GitHub [issues page](https://github.com/python-versioneer/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs).
* Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases. ### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. 
This happens when the install happens with one version, then the egg_info data is regenerated while a different version is checked out. Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. * re-run `versioneer install` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to be easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py . In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## Similar projects * [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time dependency * [miniver](https://github.com/jbweston/miniver) - a lightweight reimplementation of versioneer * [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools plugin ## License To make Versioneer easier to embed, all its code is dedicated to the public domain.
def get_root():
    """Get the project root directory.

    We require that all commands are run from the project root, i.e. the
    directory that contains setup.py, setup.cfg, and versioneer.py .

    The current working directory is tried first; if it contains neither
    setup.py nor versioneer.py, the directory of ``sys.argv[0]`` is tried
    (to allow ``python path/to/setup.py COMMAND``).

    Returns:
        The resolved root directory path.

    Raises:
        VersioneerBadRootError: if neither candidate directory contains
            setup.py or versioneer.py.
    """
    def _has_marker(directory):
        """Return True if `directory` holds setup.py or versioneer.py."""
        return (os.path.exists(os.path.join(directory, "setup.py"))
                or os.path.exists(os.path.join(directory, "versioneer.py")))

    root = os.path.realpath(os.path.abspath(os.getcwd()))
    if not _has_marker(root):
        # allow 'python path/to/setup.py COMMAND'
        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
    if not _has_marker(root):
        err = ("Versioneer was unable to find the project root directory. "
               "Versioneer requires setup.py to be executed from "
               "its immediate directory (like 'python setup.py COMMAND'), "
               "or in a way that lets it use sys.argv[0] to find the root "
               "(like 'python path/to/setup.py COMMAND').")
        raise VersioneerBadRootError(err)
    versioneer_py = os.path.join(root, "versioneer.py")
    try:
        # Certain runtime workflows (setup.py install/develop in a setuptools
        # tree) execute all dependencies in a single python process, so
        # "versioneer" may be imported multiple times, and python's shared
        # module-import table will cache the first one. So we can't use
        # os.path.dirname(__file__), as that will find whichever
        # versioneer.py was first imported, even in later projects.
        my_path = os.path.realpath(os.path.abspath(__file__))
        # Compare without the extension so versioneer.py/.pyc are treated alike.
        me_dir = os.path.normcase(os.path.splitext(my_path)[0])
        vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
        if me_dir != vsr_dir:
            print("Warning: build in %s is using versioneer.py from %s"
                  % (os.path.dirname(my_path), versioneer_py))
    except NameError:
        # __file__ is not defined in this execution context; skip the check.
        pass
    return root
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
                env=None):
    """Run the first launchable executable from `commands` with `args`.

    Each name in `commands` is tried in order; names that do not exist
    (ENOENT) are skipped. Returns ``(stdout, returncode)`` where stdout is the
    stripped, decoded output, or ``(None, returncode)`` if the process exited
    non-zero, or ``(None, None)`` if nothing could be launched.
    """
    assert isinstance(commands, list)
    extra_popen_args = {}
    if sys.platform == "win32":
        # This hides the console window if pythonw.exe is used
        info = subprocess.STARTUPINFO()
        info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        extra_popen_args["startupinfo"] = info

    stderr_target = subprocess.PIPE if hide_stderr else None
    proc = None
    for candidate in commands:
        argv = [candidate] + args
        shown = str(argv)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            proc = subprocess.Popen(argv, cwd=cwd, env=env,
                                    stdout=subprocess.PIPE,
                                    stderr=stderr_target,
                                    **extra_popen_args)
        except OSError as err:
            if err.errno == errno.ENOENT:
                # Executable not found under this name; try the next one.
                continue
            if verbose:
                print("unable to run %s" % shown)
                print(err)
            return None, None
        break

    if proc is None:
        # Every candidate was missing (or `commands` was empty).
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    captured = proc.communicate()[0]
    stdout = captured.strip().decode()
    exit_code = proc.returncode
    if exit_code != 0:
        if verbose:
            print("unable to run %s (error)" % shown)
            print("stdout was %s" % stdout)
        return None, exit_code
    return stdout, exit_code
git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs) break except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None 
else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, process.returncode return stdout, process.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): for prefix in [parentdir_prefix, ""]: prefix = prefix.replace("-", "_") for dirname in [os.path.basename(root)] + os.listdir(root): dirname = dirname.replace("-", "_") if not dirname.startswith(prefix): continue components = dirname[len(prefix):].split(".") components = [ comp for comp in components if all(c.isdigit() for c in comp) ] if len(components) <= 1: continue return {"version": ".".join(components), "full-revisionid": None, "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. 
The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r'\d', r): continue if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. 
env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") MATCH_ARGS = ["--match", "%%s*" %% tag_prefix] if tag_prefix else [] # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", "--always", "--long", *MATCH_ARGS], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. 
branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. Good or bad. branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." 
return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces): """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards (a feature branch will appear "older" than the master branch). Exceptions: 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def pep440_split_post(ver): """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the post-release version number (or -1 if no post-release segment is present). """ vc = str.split(ver, ".post") return vc[0], int(vc[1] or 0) if len(vc) == 2 else None def render_pep440_pre(pieces): """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post0.devDISTANCE """ if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: rendered += ".post%%d.dev%%d" %% (post_version+1, pieces["distance"]) else: rendered += ".post0.dev%%d" %% (pieces["distance"]) else: # no commits, use the tag as the version rendered = pieces["closest-tag"] else: # exception #1 rendered = "0.post0.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_post_branch(pieces): """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. Exceptions: 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-branch": rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-post-branch": rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. 
Invert # this to find the root from __file__. for _ in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} '''


@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract version information from the given file.

    The file is scanned with a regular expression instead of being imported.

    Args:
        versionfile_abs: Path of the _version.py file to scan.

    Returns:
        A dict holding whichever of the keys "refnames", "full" and "date"
        were found; empty when the file cannot be opened.
    """
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    assignments = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    keywords = {}
    try:
        with open(versionfile_abs, "r") as fobj:
            for line in fobj:
                stripped = line.strip()
                for prefix, key in assignments:
                    if stripped.startswith(prefix):
                        mo = re.search(r'=\s*"(.*)"', line)
                        if mo:
                            keywords[key] = mo.group(1)
    except OSError:
        # Deliberate best effort: an unreadable file simply yields no
        # keywords, which the caller treats as "not this method".
        pass
    return keywords


@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords.

    Args:
        keywords: Dict as produced by git_get_keywords(); "refnames" and
            "full" are read, "date" is optional.
        tag_prefix: Prefix a ref must start with to be considered a version
            tag; it is stripped from the returned version.
        verbose: When true, print progress messages.

    Returns:
        A dict with the keys "version", "full-revisionid", "dirty", "error"
        and "date". When no suitable tag is found the version is "0+unknown"
        and "error" is set.

    Raises:
        NotThisMethod: If "refnames" is missing or the keywords were never
            expanded (they still start with "$Format").
    """
    if "refnames" not in keywords:
        raise NotThisMethod("Short version file found")
    date = keywords.get("date")
    if date is not None:
        # Use only the last line. Previous lines may contain GPG signature
        # information.
        date = date.splitlines()[-1]

        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = {r.strip() for r in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {r for r in refs if re.search(r'\d', r)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix):]
            # Filter out refs that exactly match prefix or that don't start
            # with a number once the prefix is stripped (mostly a concern
            # when prefix is '')
            if not re.match(r'\d', r):
                continue
            if verbose:
                print("picking %s" % r)
            return {"version": r,
                    "full-revisionid": keywords["full"].strip(),
                    "dirty": False, "error": None,
                    "date": date}
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {"version": "0+unknown",
            "full-revisionid": keywords["full"].strip(),
            "dirty": False, "error": "no suitable tags", "date": None}


@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
    """Get version from 'git describe' in the root of the source tree.

    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Args:
        tag_prefix: Prefix that version tags must start with.
        root: Directory in which the git commands are run.
        verbose: When true, print diagnostic messages.
        runner: Callable used to run git; it is called as
            runner(commands, args, cwd=..., ...) and must return a
            (stdout, returncode) pair, with stdout None on failure.

    Returns:
        A "pieces" dict with the keys "long", "short", "error", "branch" and
        "dirty", plus "closest-tag", "distance" and "date" unless the
        'git describe' output could not be interpreted (then "error" is set
        and the dict is returned early).

    Raises:
        NotThisMethod: If root is not under git control or one of the
            required git commands fails.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]

    # GIT_DIR can interfere with correct operation of Versioneer.
    # It may be intended to be passed to the Versioneer-versioned project,
    # but that should not change where we get our version from.
    env = os.environ.copy()
    env.pop("GIT_DIR", None)
    runner = functools.partial(runner, env=env)

    _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
                   hide_stderr=True)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")

    MATCH_ARGS = ["--match", "%s*" % tag_prefix] if tag_prefix else []

    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty",
                                     "--always", "--long", *MATCH_ARGS],
                              cwd=root)
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()

    pieces = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None

    branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
                             cwd=root)
    # --abbrev-ref was added in git-1.6.3
    if rc != 0 or branch_name is None:
        raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
    branch_name = branch_name.strip()

    if branch_name == "HEAD":
        # If we aren't exactly on a branch, pick a branch which represents
        # the current commit. If all else fails, we are on a branchless
        # commit.
        branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
        # --contains was added in git-1.5.4
        if rc != 0 or branches is None:
            raise NotThisMethod("'git branch --contains' returned error")
        branches = branches.split("\n")

        # Remove the first line if we're running detached
        if "(" in branches[0]:
            branches.pop(0)

        # Strip off the leading "* " from the list of branches.
        branches = [branch[2:] for branch in branches]
        if "master" in branches:
            branch_name = "master"
        elif not branches:
            branch_name = None
        else:
            # Pick the first branch that is returned. Good or bad.
            branch_name = branches[0]

    pieces["branch"] = branch_name

    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out

    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        git_describe = git_describe[:git_describe.rindex("-dirty")]

    # now we have TAG-NUM-gHEX or HEX

    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
        if not mo:
            # unparsable. Maybe git-describe is misbehaving?
            pieces["error"] = ("unable to parse git-describe output: '%s'"
                               % describe_out)
            return pieces

        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            if verbose:
                fmt = "tag '%s' doesn't start with prefix '%s'"
                print(fmt % (full_tag, tag_prefix))
            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
                               % (full_tag, tag_prefix))
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix):]

        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))

        # commit: short hex revision ID
        pieces["short"] = mo.group(3)

    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"],
                               cwd=root)
        # The runner reports failure as None; treat it like every other
        # failed git call here instead of crashing inside int().
        if count_out is None:
            raise NotThisMethod("'git rev-list' failed")
        pieces["distance"] = int(count_out)  # total number of commits

    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    date_out, rc = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"],
                          cwd=root)
    # Use only the last line. Previous lines may contain GPG signature
    # information.
    date_lines = date_out.strip().splitlines() if date_out else []
    if date_lines:
        date = date_lines[-1]
        pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    else:
        # The date is optional downstream (render() reads it with .get()),
        # so a missing date is recorded as None rather than raised.
        pieces["date"] = None

    return pieces
def do_vcs_install(manifest_in, versionfile_source, ipy):
    """Git-specific installation logic for Versioneer.

    For Git, this means creating/changing .gitattributes to mark _version.py
    for export-subst keyword substitution.

    Args:
        manifest_in: Path added to the list of files passed to 'git add'.
        versionfile_source: Path of _version.py; it is the entry written to
            .gitattributes and is also passed to 'git add'.
        ipy: Optional extra path passed to 'git add'; skipped when falsy.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    files = [manifest_in, versionfile_source]
    if ipy:
        files.append(ipy)
    try:
        my_path = __file__
        if my_path.endswith((".pyc", ".pyo")):
            my_path = os.path.splitext(my_path)[0] + ".py"
        versioneer_file = os.path.relpath(my_path)
    except NameError:
        versioneer_file = "versioneer.py"
    files.append(versioneer_file)
    present = False
    # Tracks whether the last line read lacks a trailing newline, so that an
    # appended entry is never glued onto an existing line.
    needs_newline = False
    try:
        with open(".gitattributes", "r") as fobj:
            for line in fobj:
                needs_newline = not line.endswith("\n")
                stripped = line.strip()
                if stripped.startswith(versionfile_source):
                    if "export-subst" in stripped.split()[1:]:
                        present = True
                        break
    except OSError:
        # No readable .gitattributes: fall through and create/append it.
        pass
    if not present:
        with open(".gitattributes", "a+") as fobj:
            if needs_newline:
                fobj.write("\n")
            fobj.write(f"{versionfile_source} export-subst\n")
        files.append(".gitattributes")
    run_command(GITS, ["add", "--"] + files)


def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory.

    At each level the directory's own name and the names of its entries are
    checked against parentdir_prefix.

    Args:
        parentdir_prefix: Prefix a name must start with; the remainder of the
            name becomes the version.
        root: Directory at which the search starts.
        verbose: When true, print the directories that were tried on failure.

    Returns:
        A version dict with "version" set to the part of the matching name
        after the prefix.

    Raises:
        NotThisMethod: If no name matched at any of the three levels.
    """
    rootdirs = []

    for _ in range(3):
        for dirname in [os.path.basename(root)] + os.listdir(root):
            if dirname.startswith(parentdir_prefix):
                return {"version": dirname[len(parentdir_prefix):],
                        "full-revisionid": None,
                        "dirty": False, "error": None, "date": None}
        rootdirs.append(root)
        root = os.path.dirname(root)  # up a level

    if verbose:
        print("Tried directories %s but none started with prefix %s" %
              (str(rootdirs), parentdir_prefix))
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")


# Template for the short, pre-computed _version.py; the single %s is filled
# with the JSON-encoded version dict.
SHORT_VERSION_PY = """
# This file was generated by 'versioneer.py' (0.22) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.

import json

version_json = '''
%s
'''  # END VERSION_JSON


def get_versions():
    return json.loads(version_json)
"""
def versions_from_file(filename):
    """Try to determine the version from _version.py if present.

    Args:
        filename: Path of the _version.py file to read.

    Returns:
        The object decoded from the JSON text stored between the
        "version_json = '''" line and the "# END VERSION_JSON" marker.

    Raises:
        NotThisMethod: If the file cannot be read or holds no version_json
            section.
    """
    try:
        with open(filename) as f:
            contents = f.read()
    except OSError as err:
        raise NotThisMethod("unable to read _version.py") from err
    # One pattern covers both LF and CRLF line endings and any amount of
    # whitespace between the closing quotes and the marker comment.
    mo = re.search(r"version_json = '''\r?\n(.*)'''\s*# END VERSION_JSON",
                   contents, re.M | re.S)
    if not mo:
        raise NotThisMethod("no version_json in _version.py")
    return json.loads(mo.group(1))


def write_to_version_file(filename, versions):
    """Write the given version number to the given _version.py file.

    Args:
        filename: Path of the file to (re)create.
        versions: Version dict; it is JSON-encoded into the file and its
            "version" entry is echoed to stdout.
    """
    # Any existing file is removed first so a fresh one is created; a file
    # that is not there yet is not an error.
    try:
        os.unlink(filename)
    except FileNotFoundError:
        pass
    contents = json.dumps(versions, sort_keys=True,
                          indent=1, separators=(",", ": "))
    with open(filename, "w") as f:
        f.write(SHORT_VERSION_PY % contents)

    print("set %s to '%s'" % (filename, versions["version"]))


def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a .

    A missing or None "closest-tag" counts as having no "+".
    """
    if "+" in (pieces.get("closest-tag") or ""):
        return "."
    return "+"


def render_pep440(pieces):
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty

    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]

    Reads the "closest-tag", "distance", "short" and "dirty" entries of
    pieces and returns the rendered string.
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
                                          pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_branch(pieces):
    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .

    The ".dev0" means not master branch. Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).

    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]

    Reads the "closest-tag", "distance", "short", "dirty" and "branch"
    entries of pieces and returns the rendered string.
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0"
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+untagged.%d.g%s" % (pieces["distance"],
                                          pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered
def pep440_split_post(ver):
    """Split a pep440 version string at its post-release segment.

    Returns a pair: the text before ".post", and the post-release number as
    an int (0 when ".post" carries no number). The second item is None when
    the string does not contain exactly one ".post".
    """
    segments = str.split(ver, ".post")
    if len(segments) != 2:
        return segments[0], None
    return segments[0], int(segments[1] or 0)


def render_pep440_pre(pieces):
    """Render as TAG[.postN.devDISTANCE]; the dirty flag is ignored.

    Exceptions:
    1: no tags. 0.post0.devDISTANCE
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        return "0.post0.dev%d" % pieces["distance"]

    distance = pieces["distance"]
    if not distance:
        # no commits, use the tag as the version
        return tag

    # update the post release segment
    base, post_number = pep440_split_post(tag)
    if post_number is None:
        return base + ".post0.dev%d" % (distance)
    return base + ".post%d.dev%d" % (post_number + 1, distance)


def render_pep440_post(pieces):
    """Render as TAG[.postDISTANCE[.dev0]+gHEX].

    Here ".dev0" marks a dirty tree. Such a version sorts before the
    corresponding clean one, but dirty trees should not be released anyway.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        chunks = ["0.post%d" % pieces["distance"]]
        if pieces["dirty"]:
            chunks.append(".dev0")
        chunks.append("+g%s" % pieces["short"])
        return "".join(chunks)

    chunks = [tag]
    if pieces["distance"] or pieces["dirty"]:
        chunks.append(".post%d" % pieces["distance"])
        if pieces["dirty"]:
            chunks.append(".dev0")
        chunks.append(plus_or_dot(pieces))
        chunks.append("g%s" % pieces["short"])
    return "".join(chunks)
def render_pep440_post_branch(pieces):
    """Render as TAG[.postDISTANCE[.dev0]+gHEX[.dirty]].

    Here ".dev0" marks a branch other than master.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        chunks = ["0.post%d" % pieces["distance"]]
        if pieces["branch"] != "master":
            chunks.append(".dev0")
        chunks.append("+g%s" % pieces["short"])
        if pieces["dirty"]:
            chunks.append(".dirty")
        return "".join(chunks)

    chunks = [tag]
    if pieces["distance"] or pieces["dirty"]:
        chunks.append(".post%d" % pieces["distance"])
        if pieces["branch"] != "master":
            chunks.append(".dev0")
        chunks.append(plus_or_dot(pieces))
        chunks.append("g%s" % pieces["short"])
        if pieces["dirty"]:
            chunks.append(".dirty")
    return "".join(chunks)


def render_pep440_old(pieces):
    """Render as TAG[.postDISTANCE[.dev0]].

    Here ".dev0" marks a dirty tree.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if tag:
        chunks = [tag]
        if pieces["distance"] or pieces["dirty"]:
            chunks.append(".post%d" % pieces["distance"])
            if pieces["dirty"]:
                chunks.append(".dev0")
    else:
        # exception #1
        chunks = ["0.post%d" % pieces["distance"]]
        if pieces["dirty"]:
            chunks.append(".dev0")
    return "".join(chunks)


def render_git_describe(pieces):
    """Render as TAG[-DISTANCE-gHEX][-dirty].

    Mirrors the output of 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        chunks = [tag]
        if pieces["distance"]:
            chunks.append("-%d-g%s" % (pieces["distance"], pieces["short"]))
    else:
        # exception #1
        chunks = [pieces["short"]]
    if pieces["dirty"]:
        chunks.append("-dirty")
    return "".join(chunks)
def render_git_describe_long(pieces):
    """Render as TAG-DISTANCE-gHEX[-dirty].

    Mirrors 'git describe --tags --dirty --always --long': the distance and
    hash are always present when there is a tag.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        chunks = [tag, "-%d-g%s" % (pieces["distance"], pieces["short"])]
    else:
        # exception #1
        chunks = [pieces["short"]]
    if pieces["dirty"]:
        chunks.append("-dirty")
    return "".join(chunks)


def render(pieces, style):
    """Turn version pieces into a version dict using the requested style.

    When pieces carries an error, the result has version "unknown" and that
    error. An empty style or "default" selects "pep440". An unrecognised
    style raises ValueError.
    """
    if pieces["error"]:
        return {"version": "unknown",
                "full-revisionid": pieces.get("long"),
                "dirty": None,
                "error": pieces["error"],
                "date": None}

    if not style or style == "default":
        style = "pep440"  # the default

    # Pick the renderer first, then call it once below.
    if style == "pep440":
        renderer = render_pep440
    elif style == "pep440-branch":
        renderer = render_pep440_branch
    elif style == "pep440-pre":
        renderer = render_pep440_pre
    elif style == "pep440-post":
        renderer = render_pep440_post
    elif style == "pep440-post-branch":
        renderer = render_pep440_post_branch
    elif style == "pep440-old":
        renderer = render_pep440_old
    elif style == "git-describe":
        renderer = render_git_describe
    elif style == "git-describe-long":
        renderer = render_git_describe_long
    else:
        raise ValueError("unknown style '%s'" % style)

    version_string = renderer(pieces)
    return {"version": version_string, "full-revisionid": pieces["long"],
            "dirty": pieces["dirty"], "error": None,
            "date": pieces.get("date")}


class VersioneerBadRootError(Exception):
    """The project root directory is unknown or missing key files."""


def get_versions(verbose=False):
    """Get the project version from whatever source is available.

    The sources are tried in this order: expanded VCS keywords in
    _version.py, the JSON stored in _version.py, the VCS command, and the
    parent directory name. The first one that does not raise NotThisMethod
    wins.

    Returns a version dict. The fallback and the rendered results carry the
    keys 'version', 'full-revisionid', 'dirty', 'error' and 'date'.
    """
    if "versioneer" in sys.modules:
        # see the discussion in cmdclass.py:get_cmdclass()
        del sys.modules["versioneer"]

    root = get_root()
    cfg = get_config_from_root(root)

    assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
    handlers = HANDLERS.get(cfg.VCS)
    assert handlers, "unrecognized VCS '%s'" % cfg.VCS
    verbose = verbose or cfg.verbose
    assert cfg.versionfile_source is not None, \
        "please set versioneer.versionfile_source"
    assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"

    versionfile_abs = os.path.join(root, cfg.versionfile_source)

    # extract version from first of: _version.py, VCS command (e.g. 'git
    # describe'), parentdir. This is meant to work for developers using a
    # source checkout, for users of a tarball created by 'setup.py sdist',
    # and for users of a tarball/zipball created by 'git archive' or github's
    # download-from-tag feature or the equivalent in other VCSes.

    read_keywords = handlers.get("get_keywords")
    parse_keywords = handlers.get("keywords")
    if read_keywords and parse_keywords:
        try:
            keywords = read_keywords(versionfile_abs)
            ver = parse_keywords(keywords, cfg.tag_prefix, verbose)
        except NotThisMethod:
            pass
        else:
            if verbose:
                print("got version from expanded keyword %s" % ver)
            return ver

    try:
        ver = versions_from_file(versionfile_abs)
    except NotThisMethod:
        pass
    else:
        if verbose:
            print("got version from file %s %s" % (versionfile_abs, ver))
        return ver

    pieces_from_vcs = handlers.get("pieces_from_vcs")
    if pieces_from_vcs:
        try:
            pieces = pieces_from_vcs(cfg.tag_prefix, root, verbose)
            ver = render(pieces, cfg.style)
        except NotThisMethod:
            pass
        else:
            if verbose:
                print("got version from VCS %s" % ver)
            return ver

    if cfg.parentdir_prefix:
        try:
            ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
        except NotThisMethod:
            pass
        else:
            if verbose:
                print("got version from parentdir %s" % ver)
            return ver

    if verbose:
        print("unable to compute version")

    return {"version": "0+unknown", "full-revisionid": None,
            "dirty": None, "error": "unable to compute version",
            "date": None}


def get_version():
    """Return just the 'version' string of get_versions()."""
    versions = get_versions()
    return versions["version"]


def get_cmdclass(cmdclass=None):
    """Get the custom setuptools/distutils subclasses used by Versioneer.

    If the package uses a different cmdclass (e.g. one from numpy), it
    should be provide as an argument.
    """
    if "versioneer" in sys.modules:
        del sys.modules["versioneer"]
        # this fixes the "python setup.py develop" case (also 'install' and
        # 'easy_install .'), in which subdependencies of the main project are
        # built (using setup.py bdist_egg) in the same python process. Assume
        # a main project A and a dependency B, which use different versions
        # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
        # sys.modules by the time B's setup.py is executed, causing B to run
        # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
        # sandbox that restores sys.modules to it's pre-build state, so the
        # parent is protected against the child's "import versioneer". By
        # removing ourselves from sys.modules here, before the child build
        # happens, we protect the child from the parent's versioneer too.
# Also see https://github.com/python-versioneer/python-versioneer/issues/52 cmds = {} if cmdclass is None else cmdclass.copy() # we add "version" to both distutils and setuptools try: from setuptools import Command except ImportError: from distutils.core import Command class cmd_version(Command): description = "report generated version string" user_options = [] boolean_options = [] def initialize_options(self): pass def finalize_options(self): pass def run(self): vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) print(" dirty: %s" % vers.get("dirty")) print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py # distutils/install -> distutils/build ->.. # setuptools/bdist_wheel -> distutils/install ->.. # setuptools/bdist_egg -> distutils/install_lib -> build_py # setuptools/install -> bdist_egg ->.. # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc # if .git isn't copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? 
# we override different "build_py" commands for both environments if 'build_py' in cmds: _build_py = cmds['build_py'] elif "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: from distutils.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if 'build_ext' in cmds: _build_ext = cmds['build_ext'] elif "setuptools" in sys.modules: from setuptools.command.build_ext import build_ext as _build_ext else: from distutils.command.build_ext import build_ext as _build_ext class cmd_build_ext(_build_ext): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_ext.run(self) if self.inplace: # build_ext --inplace will only build extensions in # build/lib<..> dir with no _version.py to write to. # As in place builds will already have a _version.py # in the module dir, we do not need to write one. return # now locate _version.py in the new build/ directory and replace # it with an updated value target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. 
# setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION # "product_version": versioneer.get_version(), # ... class cmd_build_exe(_build_exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] if 'py2exe' in sys.modules: # py2exe enabled? from py2exe.distutils_buildexe import py2exe as _py2exe class cmd_py2exe(_py2exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _py2exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments if 'sdist' in cmds: _sdist = cmds['sdist'] elif "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: from distutils.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self): versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir, 
files): root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. """ SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. [versioneer] #VCS = git #style = pep440 #versionfile_source = #versionfile_build = #tag_prefix = #parentdir_prefix = """ OLD_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ INIT_PY_SNIPPET = """ from . 
import {0} __version__ = {0}.get_versions()['version'] """ def do_setup(): """Do main VCS-independent setup function for installing Versioneer.""" root = get_root() try: cfg = get_config_from_root(root) except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) return 1 print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: old = f.read() except OSError: old = "" module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] snippet = INIT_PY_SNIPPET.format(module) if OLD_SNIPPET in old: print(" replacing boilerplate in %s" % ipy) with open(ipy, "w") as f: f.write(old.replace(OLD_SNIPPET, snippet)) elif snippet not in old: print(" appending to %s" % ipy) with open(ipy, "a") as f: f.write(snippet) else: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) ipy = None # Make sure both the top-level "versioneer.py" and versionfile_source # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so # they'll be copied into source distributions. Pip won't be able to # install the package without this. 
manifest_in = os.path.join(root, "MANIFEST.in") simple_includes = set() try: with open(manifest_in, "r") as f: for line in f: if line.startswith("include "): for include in line.split()[1:]: simple_includes.add(include) except OSError: pass # That doesn't cover everything MANIFEST.in can do # (http://docs.python.org/2/distutils/sourcedist.html#commands), so # it might give some false negatives. Appending redundant 'include' # lines is safe, though. if "versioneer.py" not in simple_includes: print(" appending 'versioneer.py' to MANIFEST.in") with open(manifest_in, "a") as f: f.write("include versioneer.py\n") else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: print(" appending versionfile_source ('%s') to MANIFEST.in" % cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: print(" versionfile_source already in MANIFEST.in") # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword # substitution. do_vcs_install(manifest_in, cfg.versionfile_source, ipy) return 0 def scan_setup_py(): """Validate the contents of setup.py against Versioneer's expectations.""" found = set() setters = False errors = 0 with open("setup.py", "r") as f: for line in f.readlines(): if "import versioneer" in line: found.add("import") if "versioneer.get_cmdclass()" in line: found.add("cmdclass") if "versioneer.get_version()" in line: found.add("get_version") if "versioneer.VCS" in line: setters = True if "versioneer.versionfile_source" in line: setters = True if len(found) != 3: print("") print("Your setup.py appears to be missing some important items") print("(but I might be wrong). 
Please make sure it has something") print("roughly like the following:") print("") print(" import versioneer") print(" setup( version=versioneer.get_version(),") print(" cmdclass=versioneer.get_cmdclass(), ...)") print("") errors += 1 if setters: print("You should remove lines like 'versioneer.VCS = ' and") print("'versioneer.versionfile_source = ' . This configuration") print("now lives in setup.cfg, and should be removed from setup.py") print("") errors += 1 return errors if __name__ == "__main__": cmd = sys.argv[1] if cmd == "setup": errors = do_setup() errors += scan_setup_py() if errors: sys.exit(1)