Repository: roskakori/pygount Branch: main Commit: c05d365b2447 Files: 52 Total size: 215.5 KB Directory structure: gitextract_04wbqrob/ ├── .gitattributes ├── .github/ │ └── workflows/ │ └── build.yml ├── .gitignore ├── .idea/ │ ├── .gitignore │ ├── encodings.xml │ ├── inspectionProfiles/ │ │ └── Project_Default.xml │ ├── misc.xml │ ├── modules.xml │ ├── pyProjectModel.xml │ ├── pygount.iml │ └── vcs.xml ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGES.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── docs/ │ ├── api.md │ ├── background.md │ ├── changes.md │ ├── continuous-integration.md │ ├── contributing.md │ ├── index.md │ ├── installation.md │ ├── json.md │ └── usage.md ├── mkdocs.yaml ├── pygount/ │ ├── __init__.py │ ├── analysis.py │ ├── command.py │ ├── common.py │ ├── git_storage.py │ ├── lexers.py │ ├── summary.py │ ├── write.py │ └── xmldialect.py ├── pyproject.toml ├── scripts/ │ ├── build_documentation.sh │ ├── build_movie.sh │ ├── test_coverage.sh │ └── update_dependencies.sh └── tests/ ├── __init__.py ├── _common.py ├── test_analysis.py ├── test_command.py ├── test_common.py ├── test_encoding.py ├── test_git_storage.py ├── test_lexers.py ├── test_summary.py ├── test_write.py └── test_xmldialect.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ * text=auto ================================================ FILE: .github/workflows/build.yml ================================================ # Continuous integration build for pygount. 
name: Build on: [push, pull_request] jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] env: MAIN_PYTHON_VERSION: "3.12" # same as Ubuntu 24 LTS steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install uv uses: astral-sh/setup-uv@v6 with: # NOTE Using the "latest" version of uv is risky, but for the time being uv is updated # regularly, so a specific version would be outdated rather quickly. Once uv goes # version 1.0, this should be changed to something like ">=1 <2". version: "latest" - name: Load cached venv id: cached-uv-dependencies uses: actions/cache@v4 with: path: .venv key: venv-${{ runner.os }}-${{ hashFiles('**/uv.lock') }} - name: Install dependencies if: steps.cached-uv-dependencies.outputs.cache-hit != 'true' run: | uv sync - name: Build pygount package run: | uv build - name: Run the test suite run: | uv run pytest --cov=pygount --cov-branch - name: Build documentation if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }} run: | uv run sh scripts/build_documentation.sh - name: Update coveralls statistics if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | uv run coveralls --service=github check-style: runs-on: ubuntu-latest # Disable pre-commit check on main and production to prevent # pull request merges to fail with don't commit to branch". 
if: github.ref != 'refs/heads/main' steps: - uses: actions/checkout@v4 - name: Set up Python ${{ env.MAIN_PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.MAIN_PYTHON_VERSION }} - name: Install pre-commit run: | pip install pre-commit - name: Load cached pre-commit id: cached-pre-commit uses: actions/cache@v4 with: path: ~/.cache/pre-commit key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} - name: Install pre-commit hooks if: steps.cached-pre-commit.outputs.cache-hit != 'true' run: pre-commit install --install-hooks - name: Check coding style run: pre-commit run --all-files ================================================ FILE: .gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm ### PyCharm ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf # AWS User-specific .idea/**/aws.xml # Generated files .idea/**/contentModel.xml # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # Gradle and Maven with auto-import # When using Gradle or Maven with auto-import, you should exclude module files, # since they will be recreated, and may cause churn. Uncomment if using # auto-import. 
# .idea/artifacts # .idea/compiler.xml # .idea/jarRepositories.xml # .idea/modules.xml # .idea/*.iml # .idea/modules # *.iml # *.ipr # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser ### PyCharm Patch ### # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 # *.iml # modules.xml # .idea/misc.xml # *.ipr # Sonarlint plugin # https://plugins.jetbrains.com/plugin/7973-sonarlint .idea/**/sonarlint/ # SonarQube Plugin # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin .idea/**/sonarIssues.xml # Markdown Navigator plugin # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced .idea/**/markdown-navigator.xml .idea/**/markdown-navigator-enh.xml .idea/**/markdown-navigator/ # Cache file creation bug # See https://youtrack.jetbrains.com/issue/JBR-2257 .idea/$CACHE_FILE$ # CodeStream plugin # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # End of https://www.toptal.com/developers/gitignore/api/python,pycharm # Various .DS_Store .pytest_cache /.idea/ruff.xml /build/ /dist/ /cloc.xml /tests/.temp/ /htmlcov/ ================================================ FILE: .idea/.gitignore ================================================ # Default ignored files /shelf/ /workspace.xml # Editor-based HTTP Client requests /httpRequests/ # Datasource local storage ignored files /dataSources/ /dataSources.local.xml ================================================ FILE: .idea/encodings.xml ================================================ ================================================ FILE: .idea/inspectionProfiles/Project_Default.xml ================================================ ================================================ FILE: .idea/misc.xml ================================================ ================================================ FILE: .idea/modules.xml ================================================ ================================================ FILE: .idea/pyProjectModel.xml ================================================ ================================================ FILE: .idea/pygount.iml ================================================ ================================================ FILE: .idea/vcs.xml ================================================ ================================================ FILE: .pre-commit-config.yaml ================================================ exclude: "^.idea" repos: - repo: 
https://github.com/astral-sh/ruff-pre-commit rev: v0.15.12 hooks: - id: ruff args: ["--fix"] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.1.0 hooks: - id: prettier - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: fix-byte-order-marker - id: trailing-whitespace - id: end-of-file-fixer - id: mixed-line-ending - id: check-added-large-files - id: check-ast - id: check-json - id: check-merge-conflict - id: check-xml - id: check-yaml - id: debug-statements - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: no-commit-to-branch args: ["--branch", "main"] ================================================ FILE: .readthedocs.yaml ================================================ # Settings for "Read the Docs" build. # See . version: 2 build: os: "ubuntu-24.04" tools: python: "3.14" mkdocs: configuration: mkdocs.yaml python: install: - method: uv command: sync groups: - dev ================================================ FILE: CHANGES.md ================================================ # Version history For more information about which versions of pygount included what changes read the [respective chapter of the documentation](https://pygount.readthedocs.io/en/latest/changes/). ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to pygount For more information on building pygount and contributing to it, read the [respective chapter of the documentation](https://pygount.readthedocs.io/en/latest/contributing/). ================================================ FILE: LICENSE.txt ================================================ Copyright (c) 2016-2024, Thomas Aglassinger All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of pygount nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: README.md ================================================ [![PyPI](https://img.shields.io/pypi/v/pygount)](https://pypi.org/project/pygount/) [![Python Versions](https://img.shields.io/pypi/pyversions/pygount.svg)](https://www.python.org/downloads/) [![Build Status](https://github.com/roskakori/pygount/actions/workflows/build.yml/badge.svg)](https://github.com/roskakori/pygount/actions/workflows/build.yml) [![Test Coverage](https://img.shields.io/coveralls/github/roskakori/pygount)](https://coveralls.io/r/roskakori/pygount?branch=main) [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/github/license/roskakori/pygount)](https://opensource.org/licenses/BSD-3-Clause) # pygount Pygount is a command line tool to scan folders for source code files and count the number of source code lines in it. It is similar to tools like [sloccount](https://www.dwheeler.com/sloccount/) and [cloc](https://github.com/AlDanial/cloc) but uses the [pygments](https://pygments.org/) package to analyze the source code and consequently can analyze any [programming language supported by pygments](https://pygments.org/languages/). The name is a combination of pygments and count. Pygount is open source and distributed under the [BSD license](https://opensource.org/licenses/BSD-3-Clause). The source code is available from https://github.com/roskakori/pygount. 
## Quickstart For installation run ```bash $ pip install pygount ``` or use [uv](https://docs.astral.sh/uv/) to run it directly, for example: ```bash $ uvx pygount --version ``` To get a list of line counts for a project stored in a certain folder: ```bash $ pygount ~/projects/example ``` To limit the analysis to certain file types identified by their suffix: ```bash $ pygount --suffix=cfg,py,yml ~/projects/example ``` To get a summary of each programming language with sum counts and percentage: ```bash $ pygount --format=summary ~/projects/example ``` To analyze a remote git repository directly without having to clone it first: ```bash $ pygount --format=summary https://github.com/roskakori/pygount.git ``` You can pass a specific revision at the end of the remote URL: ```bash $ pygount --format=summary https://github.com/roskakori/pygount.git/v1.5.1 ``` This example results in the following summary output: ``` ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━┓ ┃ Language ┃ Files ┃ % ┃ Code ┃ % ┃ Comment ┃ % ┃ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━┩ │ Python │ 18 │ 47.4 │ 2132 │ 63.6 │ 418 │ 12.5 │ │ TOML │ 2 │ 5.3 │ 1204 │ 82.7 │ 1 │ 0.1 │ │ Batchfile │ 1 │ 2.6 │ 24 │ 68.6 │ 1 │ 2.9 │ │ Bash │ 2 │ 5.3 │ 12 │ 80.0 │ 3 │ 20.0 │ │ Makefile │ 1 │ 2.6 │ 9 │ 45.0 │ 7 │ 35.0 │ │ reStructuredText │ 9 │ 23.7 │ 0 │ 0.0 │ 438 │ 50.2 │ │ Markdown │ 3 │ 7.9 │ 0 │ 0.0 │ 53 │ 49.1 │ │ Text only │ 2 │ 5.3 │ 0 │ 0.0 │ 24 │ 82.8 │ ├──────────────────┼───────┼───────┼──────┼──────┼─────────┼──────┤ │ Sum │ 38 │ 100.0 │ 3381 │ 57.4 │ 945 │ 16.1 │ └──────────────────┴───────┴───────┴──────┴──────┴─────────┴──────┘ ``` Plenty of tools can post-process SLOC information, for example the [SLOCCount plug-in](https://wiki.jenkins-ci.org/display/JENKINS/SLOCCount+Plugin) for the [Jenkins](https://jenkins.io/) continuous integration server. 
A popular format for such tools is the XML format used by cloc, which pygount also supports and can store in an output file: ```bash $ pygount --format=cloc-xml --out=cloc.xml ~/projects/example ``` To get a short description of all available command line options use: ```bash $ pygount --help ``` For more information and examples read the documentation chapter on [Usage](https://pygount.readthedocs.io/en/latest/usage/). ## Contributions To report bugs, visit the [issue tracker](https://github.com/roskakori/pygount/issues). In case you want to play with the source code or contribute improvements, see [CONTRIBUTING](https://pygount.readthedocs.io/en/latest/contributing/). ## Version history See [CHANGES](https://pygount.readthedocs.io/en/latest/changes/). ================================================ FILE: docs/api.md ================================================ # API ## Overview Pygount provides a simple API to integrate with other tools. This, however, is currently still a work in progress and subject to change. 
Here's an example of how to analyze one of pygount's own source files: ```pycon >>> from pygount import SourceAnalysis >>> SourceAnalysis.from_file("pygount/analysis.py", "pygount") SourceAnalysis(path='pygount/analysis.py', language='Python', group='pygount', state=analyzed, code_count=509, documentation_count=141, empty_count=117, string_count=23) ``` Information about multiple source files can be summarized using `ProjectSummary`. First, set up the summary: ```pycon >>> from pygount import ProjectSummary >>> project_summary = ProjectSummary() ``` Next, find some files to analyze: ```pycon >>> from glob import glob >>> source_paths = glob("pygount/*.py") + glob("*.md") >>> source_paths ['pygount/command.py', 'pygount/analysis.py', 'pygount/write.py', 'pygount/__init__.py', 'pygount/xmldialect.py', 'pygount/summary.py', 'pygount/common.py', 'pygount/lexers.py', 'README.md', 'CONTRIBUTING.md', 'CHANGES.md'] ``` Then analyze them: ```pycon >>> for source_path in source_paths: ... source_analysis = SourceAnalysis.from_file(source_path, "pygount") ... project_summary.add(source_analysis) ``` Finally, take a look at the information collected, for example, by printing the values of `ProjectSummary.language_to_language_summary_map`: ```pycon >>> for language_summary in project_summary.language_to_language_summary_map.values(): ... print(language_summary) ... LanguageSummary(language='Python', file_count=8, code=1232, documentation=295, empty=331, string=84) LanguageSummary(language='markdown', file_count=3, code=64, documentation=0, empty=29, string=14) ``` ================================================ FILE: docs/background.md ================================================ # Background ## How pygount counts code Pygount primarily counts the physical lines of source code. It begins by using lexers from Pygments, if available. If Pygments doesn't have a suitable lexer, pygount employs its own internal lexers to differentiate between code and comments. 
These include: - Minimalist lexers for m4, VBScript, and WebFOCUS, capable of distinguishing between comments and code. - The Java lexer repurposed for OMG IDL. Additionally, plain text is treated with a separate lexer that considers all lines as comments. Lines consisting solely of comment tokens (and possibly whitespace) are counted as comments. Lines with only whitespace are ignored. All other content is considered code. ## White characters A line containing only "white characters" is also ignored because such characters do not contribute to code complexity in any meaningful way. Currently, white characters are: ``` (),:;[]{} ``` Because of that, pygount tends to report about 5 to 15 percent fewer SLOC for C-like languages than other similar tools. ## No operations For some languages, "no operations" are detected and treated as white space. For example, Python's `pass` or Transact-SQL's `begin` and `end`. As an example, consider this Python code: ```python class SomeError(Exception): """ Some error caused by some issue. """ pass ``` This counts as 1 line of code and 3 lines of comments. The line with `pass` is considered a "no operation" and thus not taken into account. ## Pure string lines Many programming languages support the concept of strings, which typically contain text to be shown to the end user or simple constant values. Similar to white characters and "no operations", in most cases they do not add much to the complexity of the code. Notable exceptions are strings containing code for domain-specific languages, templates, or SQL statements. Pygount currently takes an opinionated approach on how to count pure string lines depending on the output format: - With `--format=summary`, pure string lines are ignored, similar to empty lines. - With `--format` set to `sloccount` or `cloc-xml`, string lines are counted as code, resulting in counts somewhat similar to those of the original tools. 
- With `--format=json` all variants are available as attributes, and you can choose which one you prefer. In hindsight, this is an inconsistency that might warrant a cleanup. See issue [#122](https://github.com/roskakori/pygount/issues/122) for a discussion and issue [#152](https://github.com/roskakori/pygount/issues/152) for a plan on how to clean this up. ## Binary files A file is considered to be binary when all of the following conditions match: 1. The file does not start with a BOM for UTF-8, UTF-16 or UTF-32 (which indicates text files). 2. The initial 8192 bytes contain at least one 0-byte. In this case, pygount assigns it the pseudo language `__binary__` and performs no further analysis. ## Generated files Generated files are recognized either by their content (`--generated`) or name (`--generated-names`). Use `--help` to see the current default patterns. In case you think the standard patterns should be extended, modify `pygount.analysis.DEFAULT_GENERATED_LINE|NAME_PATTERNS_TEXT` and [contribute a pull request](contributing.md). For source code repositories, committing generated files should generally be avoided. Instead, make the generation part of the build process. However, there are valid reasons to include generated files: 1. Package managers generate "lock" files from the package specification to ensure builds use the exact same versions and hashes. For example, "pyproject.toml" and "uv.lock". 2. Generation takes too long, for example, in Flutter projects with many nested sub-packages. 3. Generated files cannot be bootstrapped from scratch because of interdependencies. 4. Cloud tools require certain generated files to be present in the repository. An example would be [ReadTheDocs.org](https://readthedocs.org), which as of May 2025 in combination with [MkDocs](https://www.mkdocs.org/) needs additional dependencies to be specified in a `requirements.txt`. 
Many Python projects specify their dependencies in `pyproject.toml`, which can be used to generate the `requirements.txt`. However, the ReadTheDocs build does not allow easily including such a step, so the path of least resistance is to just include the generated `requirements.txt` file in the repository. ## Comparison with other tools Pygount can analyze more languages than other common tools such as sloccount or cloc because it builds on `pygments`, which provides lexers for hundreds of languages. This also makes it easy to support another language: [Write your own lexer](http://pygments.org/docs/lexerdevelopment/). For certain corner cases, pygount gives more accurate results because it actually lexes the code, unlike other tools that mostly look for comment markers and can get confused when they show up inside strings. In practice, though, this should not make much of a difference. Pygount is slower than most other tools. Partially, this is due to actually lexing instead of just scanning the code. Partially, this is because other tools can use statically compiled languages such as Java or C, which are generally faster than dynamic languages. For many applications though, pygount should be "fast enough", especially when running as an asynchronous step during a continuous integration build. ================================================ FILE: docs/changes.md ================================================ # Changes This chapter describes the changes coming with each new version of pygount. ## Version 3.3.0, 2026-xx-xx Development: - Migrate ReadTheDocs documentation to uv (issue [#221](https://github.com/roskakori/pygount/issues/221)). ## Version 3.2.0, 2026-04-08 - Add detection of SVG as XML dialect (issue [#209](https://github.com/roskakori/pygount/issues/209)). - Fix detection of XML dialect when a `<?xml ...?>` header was present. 
## Version 3.1.1, 2025-02-17 - Update dependencies and drop support for Python 3.9 (issue [#205](https://github.com/roskakori/pygount/issues/205)). ## Version 3.1.0, 2025-05-27 - Add command line option [`--generated-names`](usage.md#-generated-names) to specify which file names should be considered to be generated. The current patterns recognized are somewhat limited, so contributions are welcome. See the section on "[Generated files](background.md#generated-files)" for hints on how to do that (issue [#190](https://github.com/roskakori/pygount/issues/190)). - Change documentation from Sphinx to MkDocs in the hope to avoid it breaking regularly (issue [#191](https://github.com/roskakori/pygount/issues/191)). Development: - Replace `format()` with f-strings (contributed by Ben Allen, issue [#166](https://github.com/roskakori/pygount/issues/166)). - Change sdist archive to include more than just the Python source code. ## Version 3.0.0, 2025-05-23 - Count pure markup files as documentation: (contributed by Tytus Bucholc, issue [#6](https://github.com/roskakori/pygount/issues/6)). 
- Fix silent error on git failing (contributed by Tom De Bièvre, issue [#162](https://github.com/roskakori/pygount/issues/162)) - Transform common project URLs to repository: (contributed by Tom De Bièvre, issue [#164](https://github.com/roskakori/pygount/issues/164)) - Change dependency rules for rich to be more lenient (suggested by Brian McGillion, issue [#193](https://github.com/roskakori/pygount/issues/193)) ## Version 2.0.0, 2025-03-16 - Fix `TypeError` when processing files with a magic encoding comment specifying an unknown encoding and using `--format=json` (contributed by PyHedgehog, issue [#176](https://github.com/roskakori/pygount/issues/176)) - Fix false positives when extracting the encoding from magic coding comments (issue [#184](https://github.com/roskakori/pygount/issues/184)) - Add support for Python 3.13 and later (issue [#174](https://github.com/roskakori/pygount/issues/174)) - Remove temporary directory in the output of a git analysis (contributed by Isabel Beckenbach, issue [#113](https://github.com/roskakori/pygount/issues/113)) - Remove support for Python 3.8 (issue [#158](https://github.com/roskakori/pygount/issues/158)) - Development: Change packaging to uv (issue [#180](https://github.com/roskakori/pygount/issues/180)). - Development: Change linter to ruff and in turn, clean up code (issue [#157](https://github.com/roskakori/pygount/issues/157)). - Development: Change default branch to main (issue [#160](https://github.com/roskakori/pygount/issues/160)). - Removed deprecated code: (contributed by Marco Gambone and Niels Vanden Bussche, issue [#47](https://github.com/roskakori/pygount/issues/47)). ## Version 1.8.0, 2024-05-13 - Add all available counts and percentages to JSON format (issue [#122](https://github.com/roskakori/pygount/issues/122)). In particular, this makes available the `codeCount`, which is similar to the already existing `sourceCount` but does exclude lines that contain only strings. 
You can check their availability by validating that the `formatVersion` is at least 1.1.0. The documentation about "[How pygount counts code](background.md#how-pygount-counts-code)" has more information about the available counts and the ways they are computed. Pygount 2.0 will probably introduce some breaking changes in this area, which can already be previewed and discussed at issue [#152](https://github.com/roskakori/pygount/issues/152). ## Version 1.7.0, 2024-05-13 - Fix analysis with [FIPS](https://en.wikipedia.org/wiki/Federal_Information_Processing_Standards) mode by changing computation of hash for duplicate detection from MD5 to SHA256. As a side effect, reasonably modern machines should receive a (probably unnoticeable) minor performance boost (contributed by Matthew Vine, issue [#137](https://github.com/roskakori/pygount/issues/137)). - Add command line option `--merge-embedded-languages` to merge embedded languages into their base language. For example, "HTML+Django/Jinja" counts as "HTML" (issue [#105](https://github.com/roskakori/pygount/issues/105)). - Add Python 3.12 and make it the main version for CI (issue [#145](https://github.com/roskakori/pygount/issues/145)). ## Version 1.6.1, 2023-07-02 - Fix missing check for seekable file handles (issue [#114](https://github.com/roskakori/pygount/issues/114)). - Fix the ReadTheDocs documentation build by switching to the built-in alabaster Sphinx theme (issue [#116](https://github.com/roskakori/pygount/issues/116)). ## Version 1.6.0, 2023-06-26 - Add support for analysis of remote git URLs in addition to local files (contributed by Rojdi Thomallari, issue [#109](https://github.com/roskakori/pygount/issues/109)). - Remove support for Python 3.7. - Improve API: - Add an option to pass a file handle to `SourceAnalysis.from_file()` (contributed by Dominik George, issue [#100](https://github.com/roskakori/pygount/issues/100)). 
## Version 1.5.1, 2023-01-02 - Remove progress bar for `--format=sloccount` because it resulted in blank lines when running on Windows and could cause interwoven output on Unix (issue [#91](https://github.com/roskakori/pygount/issues/91)). ## Version 1.5.0, 2022-12-30 - Remove support for Python 3.6 and update dependencies (issue [#93](https://github.com/roskakori/pygount/issues/93)). ## Version 1.4.0, 2022-04-09 - Add progress bar during scan phase and improve visual design of `--format=summary` (contributed by Stanislav Zmiev, issue [#73](https://github.com/roskakori/pygount/issues/73)). - Add percentages to API. For example, in addition to `code_count` there is now also `code_percentage`. ## Version 1.3.0, 2022-01-06 - Fix computation of "lines per second", which was a copy and paste of "files per second". - Add JSON as additional output `--format`, see [JSON](json.md) for details (issue [#62](https://github.com/roskakori/pygount/issues/62)). - Add detection of [GitHub community files](https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions) without a suffix as text (issue [#54](https://github.com/roskakori/pygount/issues/54)). - Change the build process to [poetry](https://python-poetry.org/) to change several messy configuration files into a single even more messy configuration file. ## Version 1.2.5, 2021-05-16 - Remove support for Python 3.5. Probably it still works, but there is no easy way to test this anymore because 3.5 reached its end of life a while ago. ## Version 1.2.4, 2020-08-11 - Fix scanning of "." (for current folder), which was skipped entirely (issue [#56](https://github.com/roskakori/pygount/issues/56)). ## Version 1.2.3, 2020-07-05 - Improve detection of text files by trying to guess a lexer for `*.txt` before assuming it is text. This basically fixes the detection of `CMakeLists.txt` as CMake file (issue [#53](https://github.com/roskakori/pygount/issues/53)). 
However, it will only work with some files due to multiple issues with the regular expression Pygments used in versions up to 2.6.1 to detect CMake headers. This should be fixed once pull request [#1491](https://github.com/pygments/pygments/pull/1491) is applied. ## Version 1.2.2, 2020-06-24 - Changed preprocessor statements to count as code, unlike Pygments which treats them as special comments (contributed by nkr0, issue [#51](https://github.com/roskakori/pygount/issues/51)). ## Version 1.2.1, 2020-04-02 - Fix broken links in README on PyPI by moving the documentation to [ReadTheDocs](https://pygount.readthedocs.io/). - Improv API: - Change factory functions to methods and added deprecation warnings: - `source_analysis` → `SourceAnalysis.from_file` - `pseudo_source_analysis` → `SourceAnalysis.from_state` - Change attributes in `SourceAnalysis` to read-only properties. - Rename properties holding counts from `xxx` to `xxx_count`. - Add API reference to documentation. - Add a couple of type hints and assertions. ## Version 1.2.0, 2020-03-30 - Add file count to summary. - Change installation to fail when attempting to install on Python earlier than 3.5. - Improve API: - Change `SourceAnalysis.state` to be a proper enum instead of a string. - Add `ProjectSummary` to summarize multiple files. - Clean up the project: - Change continuous integration from Travis CI to GitHub actions in the hope that the CI build does not automatically break after a while because things constantly change in the CI backend. - Change README format from reStructuredText to Markdown. - Improve badges in README: added a badge for supported Python versions and unified the layout by using . - Remove obsolete development files (for ant, tox etc). ## Version 1.1.0, 2020-03-10 - Fix `--folders-to-skip` and `--names-to-skip` which simply were ignored (contributed by pclausen, issue [#17](https://github.com/roskakori/pygount/issues/17)). 
- Add option `--format=summary` to get a language overview and sum total (based on a contribution by Yuriy Petrovskiy, issue [#16](https://github.com/roskakori/pygount/issues/16)). - Add Python 3.7 and 3.8 to the list of supported versions. - Drop support for Python 3.3 and 3.4, mostly because it became hard to test without going through major hoops. ## Version 1.0.0, 2017-07-04 - Fix confusing warning about XML file `` caused by SAX parser. As a workaround, `` is now replaced by the actual path of the XML file that cannot be parsed. - Add Python 3.6 to the list of supported versions (issue [#14](https://github.com/roskakori/pygount/issues/14)). ## Version 0.9, 2017-05-04 - Fix `AssertionError` when option `--encoding=chardet` was specified. - Change the warning message "no fallback encoding specified, using \" to a debug message because it did not add any interesting information as the encoding actually used is visible in the info message for each file. - Add detection of binary files and exclude them from the analysis. In particular Django model objects (`*.mo`) are not considered Modelica source code anymore (issue [#11](https://github.com/roskakori/pygount/issues/11)). - Add detection of DocBook XML by DTD (issue [#10](https://github.com/roskakori/pygount/issues/10)). - Add support for suffices to indicate PL/SQL files according to [Oracle FAQ entry on file extensions](http://www.orafaq.com/wiki/File_extensions) (issue [#12](https://github.com/roskakori/pygount/issues/12)). - Add possibility to specify a fallback encoding for encoding 'chardet'. Use e.g. `--encoding=chardet;cp1252`. ## Version 0.8, 2016-10-07 - Fix option `--verbose`. Now each analyzed source code results in at least one informational message in the log. - Add detection of duplicates using size and then MD5 code as criteria (issue [#2](https://github.com/roskakori/pygount/issues/2)). Use the option `--duplicates` to still count duplicate source code. 
- Improve detection of programming language, which is now more consistent and yields the same language between Python invocations. ## Version 0.7, 2016-09-28 - Fix that option `--generated` was ignored. - Add support for a couple of languages not supported by `pygments` yet: - m4, VBScript, and WebFOCUS use minimalistic lexers that can distinguish between comments and code. - OMG IDL repurposes the existing Java lexer. - Add detection of certain XML dialects as separate language (issue [#8](https://github.com/roskakori/pygount/issues/8)). ## Version 0.6, 2016-09-26 - Fix that source files could end up as `__error__` if the first non-ASCII characters showed up only after kilobyte 16 and the encoding was not UTF-8. Now pygount attempts to read the whole file as UTF-8 before assuming it actually is UTF-8. - Change lines in plain text files to count as comments (issue [#9](https://github.com/roskakori/pygount/issues/9)). Before pygments treated them as `ResourceBundle`. - Change that empty files have `__empty__` as language (issue [#7](https://github.com/roskakori/pygount/issues/7)). - Extend workaround for [pygments issue #1284](https://bitbucket.org/birkenfeld/pygments-main/issues/1284) to replace any lexer `*+Evoque` by `*`. ## Version 0.5, 2016-09-22 - Add that generated source code is excluded from analysis (issue [#1](https://github.com/roskakori/pygount/issues/1)). Use option `--generated` to specify patterns that indicate generated code. - Add workaround for pygments sometimes detecting the same XML file as XML and other times as XML+Evoque (probably depending on the hash seed). Now XML+Evoque is always changed to XML. - Add `__pycache__` as default `--folders-to-skip`. - Add notes on pseudo languages for source code that cannot be analyzed. ## Version 0.4, 2016-09-11 - Fixed `LookupError` on broken encoding in magic comment (issue [#4](https://github.com/roskakori/pygount/issues/4)). 
- Add options `--folders-to-skip` and `--names-to-skip` to specify which files should be excluded from analysis. - Add comma (`,`) and colon (`:`) to list of "white characters" that do not count as code if there is nothing else in the line. - Improve pattern matching: for all options that according to `--help` take `PATTERNS` you can now specify that the patterns are regular expressions instead of shell patterns (using `[regex]`) and that they should extend the default patterns (using `[...]`). - Improve documentation: added notes on how code is counted and how pygount compares to other similar tools. ## Version 0.3, 2016-08-20 - Fix `@rem` comments in DOS batch files (issue [#3](https://github.com/roskakori/pygount/issues/3)). - Clean up code. ## Version 0.2, 2016-07-10 - Fix that files starting with underscore (e.g. `__init__.py`) were excluded from analysis. - Change `chardet` package to be optional. - Add possibility to specify single files and glob patterns to analyze. - Add that lines containing only certain characters are treated as white space instead of code. Currently, this concerns brackets (`()[]{}`) and semicolon (`;`). - Add that Python's `pass` statement is treated as white space instead of code. - Clean up and (slightly) optimized code. ## Version 0.1, 2016-07-05 - Initial public release. ================================================ FILE: docs/continuous-integration.md ================================================ # Continuous integration Pygount can produce output that can be processed by the [SLOCCount plug-in](https://wiki.jenkins-ci.org/display/JENKINS/SLOCCount+Plugin) for the [Jenkins](https://jenkins.io/) continuous integration server. It's recommended to run pygount as one of the first steps in your build process before any undesired file like compiler targets or generated source code are built. 
An example "Execute shell" build step for Jenkins is: ```bash $ pygount --format=cloc-xml --out cloc.xml --suffix=py --verbose ``` Then add a post-build action "Publish SLOCCount analysis results" and set "SLOCCount report" to "cloc.xml". ================================================ FILE: docs/contributing.md ================================================ # Contributing ## Project setup In case you want to play with the source code or contribute changes, proceed as follows: 1. Check out the project from GitHub: ```bash $ git clone https://github.com/roskakori/pygount.git $ cd pygount ``` 2. Install [uv](https://docs.astral.sh/uv/). 3. Create the virtual environment and install the required packages: ```bash $ uv sync --all-groups ``` 4. Install the pre-commit hook: ```bash $ uv run pre-commit install ``` ## Testing To run the test suite: ```bash $ uv run pytest ``` To build and browse the coverage report in HTML format: ```bash $ sh scripts/test_coverage.sh $ open htmlcov/index.html # macOS only ``` ## Documentation To build the documentation in HTML format: ```bash $ uv run scripts/build_documentation.sh $ open docs/_build/html/index.html # macOS only ``` ## Coding guidelines The code throughout uses a natural naming schema avoiding abbreviations, even for local variables and parameters. Many coding guidelines are automatically enforced (and some even fixed automatically) by the pre-commit hook. If you want to check and clean up the code without performing a commit, run: ```bash $ uv run pre-commit run --all-files ``` In particular, this applies checks from [black](https://black.readthedocs.io/en/stable/), [flake8](https://flake8.pycqa.org/) and [isort](https://pypi.org/project/isort/). ## Publish a new version This section is only relevant for developers with access to the PyPI project. 
To add a new release, first update the `pyproject.toml`: ```toml [project] version = "3.x.x" ``` Next, build the project and run the tests to ensure everything works: ```sh $ rm -rf dist # Remove any files from previous builds. $ uv build $ uv run pytest ``` Then create a tag in the repository: ```sh $ git tag -a -m "Tag version 3.x.x" v3.x.x $ git push --tags ``` Publish the new version on PyPI: ```sh $ uv publish ``` Finally, add a GitHub release based on the tag from above to the [release page](https://github.com/roskakori/pygount/releases). ================================================ FILE: docs/index.md ================================================ # Pygount Pygount is a command line tool to scan folders for source code files and count the number of source code lines in it. It is similar to tools like [sloccount](http://www.dwheeler.com/sloccount/) and [cloc](http://cloc.sourceforge.net/) but uses the [pygments](http://pygments.org/) package to parse the source code and consequently can analyze any [programming language supported by pygments](http://pygments.org/languages/). The name is a combination of "pygments" and "count". Pygount is open source and distributed under the [BSD license](https://opensource.org/licenses/BSD-3-Clause). The source code is available from . ================================================ FILE: docs/installation.md ================================================ # Installation Pygount is available from [PyPI](https://pypi.python.org/pypi/pygount) and can be installed by running: ```bash pip install pygount ``` Using [uv](https://docs.astral.sh/uv/), it can also run directly. For example: ```bash uvx pygount --version ``` ================================================ FILE: docs/json.md ================================================ # JSON The JavaScript objects notation (JSON) is widely used to interchange data. 
Running pygount with `--format=json` is a simple way to provide the results of an analysis for further processing. ## General format The general structure of the resulting JSON is: ```JSON { "formatVersion": "1.1.0", "pygountVersion": "1.8.0", "files": [...], "languages": [...], "runtime": {...}, "summary": {...} } ``` The naming of the entries deliberately uses camel case to conform to the [JSLint](https://www.jslint.com/) guidelines. Both `formatVersion` and `pygountVersion` use [semantic versioning](https://semver.org/). For more information about how this JSON evolved, see `JSON format history`. ### Files With `files` you can access a list of files analyzed, for example: ```JavaScript { "codeCount": 171, "documentationCount": 28, "emptyCount": 56, "group": "pygount", "isCountable": true, "language": "Python", "lineCount": 266, "path": "/tmp/pygount/pygount/write.py", "state": "analyzed", "stateInfo": null, "sourceCount": 182 } ``` The `*Count` fields have the following meaning: - `codeCount`: The number of lines that contains code excluding [Pure string lines](background.md#pure-string-lines) - `documentationCount`: The number of lines containing comments - `emptyCount`: The number of empty lines, which includes "`No operations`" lines - `lineCount`: Basically the number of lines shown in your editor respectively computed by shell commands like `wc -l`, - `sourceCount`: The source lines of code, similar to the traditional SLOC - `stringCount`: The number of `Pure string lines` Here, `sourceCount` is the number of source lines of code (SLOC), `documentationCount` the number of lines containing comments and The `state` can have one of the following values: - analyzed: successfully analyzed - binary: the file is a [binary file](background.md#binary-files) - duplicate: the file is a [duplicate](usage.md#-duplicates) of another - empty: the file is empty (file size = 0) - error: the source could not be parsed; in this case, `stateInfo` contains a message with more 
details - generated: the file has been generated as specified with `--generated` - unknown: pygments does not offer any lexer to analyze the file ### Languages In `languages` the summary for each language is available, for example: ```JSON { "documentationCount": 429, "documentationPercentage": 11.776008783969257, "codeCount": 2332, "codePercentage": 64.01317595388416, "emptyCount": 706, "emptyPercentage": 19.3796321712874, "fileCount": 20, "filePercentage": 48.78048780487805, "isPseudoLanguage": false, "language": "Python", "sourceCount": 2508, "sourcePercentage": 68.84435904474334, "stringCount": 176, "stringPercentage": 4.831183090859182 } ``` ### Summary In `summary` the total counts across the whole project can be accessed, for example: ```JSON { "totalCodeCount": 4366, "totalCodePercentage": 68.38972431077694, "totalDocumentationCount": 463, "totalDocumentationPercentage": 7.25250626566416, "totalEmptyCount": 1275, "totalEmptyPercentage": 19.971804511278197, "totalFileCount": 41, "totalSourceCount": 4646, "totalSourcePercentage": 72.77568922305764, "totalStringCount": 280, "totalStringPercentage": 4.385964912280702 } ``` ### Runtime The `runtime` entry collects general information about how well pygount performed in collecting the information, for example: ```JSON { "durationInSeconds": 0.6333059999999999, "filesPerSecond": 64.73963613166464, "finishedAt": "2024-05-13T16:14:31.977070+00:00", "linesPerSecond": 10080.435050354807, "startedAt": "2024-05-13T16:14:31.343764+00:00" } ``` ## Pretty printing Because the output is concise and consequently mostly illegible for a human reader, you might want to pipe it through a pretty printer. As you already have python installed, the easiest way is: ```sh pygount --format json | python -m json.tool ``` Another alternativ would be [jq](https://stedolan.github.io/jq/): ```sh pygount --format json | jq . 
``` ## JSON format history v1.1.0, pygount 1.8.0 - Add `code_count` and `line_count` v1.0.0, pygount 1.3.0 - Initial version ================================================ FILE: docs/usage.md ================================================ # Usage ## General Run and specify the folder to analyze recursively, for example: ```bash $ pygount ~/development/sometool ``` If you omit the folder, the current folder of your shell is used as a starting point. Apart from folders you can also specify single files and shell patterns (using `?`, `*` and ranges like `[a-z]`). Certain files and folders are automatically excluded from the analysis: - files starting with dot (`.`) or ending in tilda (`~`) - folders starting with dot (`.`) or named `_svn`. ### `--folders-to-skip LIST`, `--names-to-skip LIST` To specify alternative patterns, use `--folders-to-skip` and `--names-to-skip`. Both take a comma separated list of patterns, see below on the pattern syntax. To, for example, also prevent folders starting with two underscores (`_`) from being analyzed, specify `--folders-to-skip=[...],__*`. ### `--suffix LIST` To limit the analysis on certain file types, you can specify a comma separated list of suffixes to take into account, for example `--suffix=py,sql,xml`. ### `--out FILE` By default, the results of the analysis are written to the standard output. To redirect the output to a file, use for example `--out=counts.txt`. To explicitly redirect to the standard output specify `--out=STDOUT`. ### `--format FORMAT` By default, the results of the analysis are written to the standard output in a format similar to sloccount. To redirect the output to a file, use e.g. `--out=counts.txt`. To change the format to an XML file similar to cloc, use `--format=cloc-xml`. To just get a quick grasp of the languages used in a project and their respective importance use `--format=summary` which provides a language overview and a sum total. 
For example, pygount's summary looks like this: ``` Language Files % Code % Comment % ---------------- ----- ------ ---- ------ ------- ------ Python 19 51.35 1924 72.99 322 86.10 reStructuredText 7 18.92 332 12.59 7 1.87 markdown 3 8.11 327 12.41 1 0.27 Batchfile 1 2.70 24 0.91 1 0.27 YAML 1 2.70 11 0.42 2 0.53 Makefile 1 2.70 9 0.34 7 1.87 INI 1 2.70 5 0.19 0 0.00 TOML 1 2.70 4 0.15 0 0.00 Text 3 8.11 0 0.00 34 9.09 ---------------- ----- ------ ---- ------ ------- ------ Sum total 37 2636 374 ``` The summary output is designed for human readers, and the column widths adjust to the data. For further processing the results of pygount, `--format=json` should be the easiest to deal with. For more information, see the chapter on [JSON](json.md). ### `--merge-embedded-languages` Some languages such as HTML or JavaScript allow embedding other languages in their source code. In that case, the source code is assigned to a language that contains both the base and end embedded language in its name, for example: - HTML+Jinja - JavaScript+Lasso If you prefer count all variants of a base language only under its own name, specify `--merge-embedded-languages`. The example above will then show as: - HTML - JavaScript Consequently, multiple different embedded languages will all count for its common base language. ## Remote repositories Additionally to local files, pygount can analyze remote git repositories: ```bash $ pygount https://github.com/roskakori/pygount.git ``` In the background, this creates a shallow clone of the repository in a temporary folder that after the analysis is removed automatically. Therefore, you need to have at read access to the repository. If you want to analyze a specific revision, specify it at the end of the URL: ```bash $ pygount https://github.com/roskakori/pygount.git/v1.6.0 ``` The remote URL supports the git standard protocols: git, HTTP/S and SSH. 
```bash $ pygount git@github.com:username/project.git ``` You can specify multiple repositories, for example, to include both the web application, command line client and docker container of the [Weblate](https://weblate.org/) project: ```bash $ pygount https://github.com/WeblateOrg/weblate.git https://github.com/WeblateOrg/wlc.git https://github.com/WeblateOrg/docker.git ``` And you can even mix local files and remote repositories: ```bash $ pygount ~/projects/some https://github.com/roskakori/pygount.git ``` ## Patterns Some command line arguments take patterns as values. By default, patterns are shell patterns using `*`, `?` and ranges like `[a-z]` as placeholders. Depending on your platform, they are case-sensitive (Unix) or not (macOS, Windows). If a pattern starts with `[regex]` you can specify a comma separated list of regular expressions instead using all the constructs supported by the [Python regular expression syntax](https://docs.python.org/3/library/re.html#regular-expression-syntax). Regular expressions are case-sensitive unless they include a `(?i)` flag. If the first actual pattern is `[...]`, default patterns are included. Without it, defaults are ignored and only the patterns explicitly stated are taken into account. ### `--generated` So for example, to specify that generated code can also contain the German word "generiert" in a case-insensitive way use `--generated="[regex][...](?i).*generiert"`. ### `--generated-names` In addition to the source code, the file name can indicate that a source code is generated. For example, `--generated-names="*.lock,*.g.dart"`. The default already recognizes several standard generated names. ## Counting duplicates ### `--duplicates` By default, pygount prevents multiple source files with exactly the same content to be counted again. For two files to be considered duplicates, the following conditions must be met: 1. Both files have the same size. 2. 
Both files have the same [SHA-256](https://en.wikipedia.org/wiki/SHA-2) hashcode. This allows for efficient detection with a tiny possibility for false positives. However, it also prevents detection of files with only minor differences as duplicates. Examples are files that are identical except for additional white space, empty lines or different line endings. If you still want to count duplicates multiple times, specify `--duplicates`. This will also result in a minor performance gain of the analysis. ## Source code encoding ### --encoding ENCODING\[;FALLBACK\] When reading source code, pygount automatically detects the encoding. It uses a simple algorithm where it recognizes BOM, XML declarations such as: ```xml ``` and "magic" comments such as: ```ruby # encoding: cp1252 # coding: cp1252 # -*- coding: cp1252 -*- ``` If the file does not have an appropriate heading, pygount attempts to read it using UTF-8. If this fails, it reads the file using a fallback encoding (by default [CP1252](https://en.wikipedia.org/wiki/Windows-1252)) and ignores any encoding errors. You can change this behavior using the `--encoding` option: - To keep the automatic analysis and use a different fallback encoding, specify for example `--encoding=automatic;iso-8859-15`. - To use automatic detection based on heuristic, specify `--encoding=chardet`. For this to work, the [chardet](https://pypi.python.org/pypi/chardet) package must be installed, - To use a specific encoding (for all files analyzed), use for example `--encoding=iso-8859-15`. ## Pseudo languages If a source code is not counted, the number of lines is 0 and the language shown is a pseudo language indicating the reason: - `__binary__` - used for `binary`. - `__duplicate__` - the source code duplicate as described at the command line option `--duplicates`. - `__empty__` - the source code is an empty file with a size of 0 bytes. - `__error__` - the source code could not be parsed; for example, due to an I/O error. 
- `__generated__` - the source code is generated according to the command line option `--generated`. - `__unknown__` - pygments does not provide a lexer to parse the source code. ## Other information ### `--verbose` If `--verbose` is specified, pygount logs detailed information about what it is doing. ### `--help` To get a description of all the available command line options, run: ```bash $ pygount --help ``` ### `--version` To get pygount's current version number, run: ```bash $ pygount --version ``` ================================================ FILE: mkdocs.yaml ================================================ site_name: "pygount" site_url: "https://pygount.readthedocs.io/" site_author: "Thomas Aglassinger " site_description: "Documentation of pygount, a tool to count lines of code for hundreds of languages using pygments" repo_url: "https://github.com/roskakori/pygount" theme: name: material features: - navigation.footer markdown_extensions: - attr_list - codehilite - toc: permalink: true nav: - "Overview": "index.md" - "Installation": "installation.md" - "Usage": - "Usage": "usage.md" - "JSON format": "json.md" - "Continuous integration": "continuous-integration.md" - "Background": "background.md" - "API": "api.md" - "Changes": "changes.md" - "Contributing": "contributing.md" validation: nav: omitted_files: warn ================================================ FILE: pygount/__init__.py ================================================ """ Pygount counts lines of source code using pygments lexers. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. 
from importlib.metadata import version from .analysis import DuplicatePool, SourceAnalysis, SourceScanner, SourceState, encoding_for from .common import Error, OptionError from .summary import LanguageSummary, ProjectSummary __version__ = version(__name__) __all__ = [ "DuplicatePool", "Error", "LanguageSummary", "OptionError", "ProjectSummary", "SourceAnalysis", "SourceScanner", "SourceState", "__version__", "encoding_for", ] ================================================ FILE: pygount/analysis.py ================================================ """ Functions to analyze source code and count lines in it. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. import codecs import collections import glob import hashlib import itertools import logging import os import re from collections.abc import Iterator, Sequence from dataclasses import dataclass from enum import Enum from io import SEEK_CUR, BufferedIOBase, IOBase, RawIOBase, TextIOBase from pathlib import Path from re import Pattern from typing import Optional, Union import pygments.lexer import pygments.lexers import pygments.lexers.html import pygments.token import pygments.util import pygount.common import pygount.lexers import pygount.xmldialect from pygount.common import WHITE_SPACE_CHARACTERS, mapped_repr, matching_regex from pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any HTTP_URL_REGEX = re.compile(r"^(https?://)") _ALLOWED_GIT_PLATFORMS = ["github.com", "bitbucket.org", "gitlab.com"] _ALLOWED_GIT_PLATFORM_CHOICES_PATTERN = "|".join(map(re.escape, _ALLOWED_GIT_PLATFORMS)) GIT_REPO_REGEX = re.compile(rf"^(https?://|git@)({_ALLOWED_GIT_PLATFORM_CHOICES_PATTERN})/[^/]+/[^/]+") # Attempt to import chardet. 
try: import chardet.universaldetector _detector = chardet.universaldetector.UniversalDetector() except ImportError: _detector = None has_chardet = bool(_detector) #: Fallback encoding to use if no encoding is specified DEFAULT_FALLBACK_ENCODING = "cp1252" #: Default glob patterns for folders not to analyze. DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT = ", ".join( [ ".?*", "_svn", # Subversion hack for Windows "__pycache__", # Python byte code ] ) #: Pygments token type; we need to define our own type because pygments' ``_TokenType`` is internal. TokenType = type(pygments.token.Token) _BASE_LANGUAGE_REGEX = re.compile(r"^(?P[^+]+)\+[^+].*$") #: BOMs to indicate that a file is a text file even if it contains zero bytes. _TEXT_BOMS = (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8) class SourceState(Enum): """ Possible values for :py:attr:`SourceAnalysis.state`. """ #: successfully analyzed analyzed = 1 #: source code is a binary binary = 2 #: source code is an identical copy of another duplicate = 3 #: source code is empty (file size = 0) empty = 4 #: source could not be parsed error = 5 #: source code has been generated generated = 6 # TODO: 'huge' = auto() # source code exceeds size limit #: pygments does not offer any lexer to analyze the source unknown = 7 #: Default patterns for regular expressions to detect generated code. #: The '(?i)' indicates that the patterns are case-insensitive. DEFAULT_GENERATED_LINE_PATTERNS_TEXT = pygount.common.REGEX_PATTERN_PREFIX + ", ".join( [ r"(?i).*autogenerated", r"(?i).*automatically generated", r"(?i).*do not edit", r"(?i).*generated with the .+ utility", r"(?i).*this is a generated file", r"(?i).*generated automatically", ] ) #: Default patterns for file names that are considered to be generated. DEFAULT_GENERATED_NAME_PATTERNS_TEXT = ", ".join( [ "*.g.dart", # See, for example, "*.lock", # For example, Cargo.lock, poetry.lock, uv.lock. "npm-shrinkwrap.json", # See . 
"go.sum", # See . "package-lock.json", # See . "pnpm-lock.yaml", # See . ] ) #: Default glob patterns for file names not to analyze. DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT = ", ".join([".*", "*~"]) _log = logging.getLogger("pygount") _MARK_TO_NAME_MAP = (("c", "code"), ("d", "documentation"), ("e", "empty"), ("s", "string")) _BOM_TO_ENCODING_MAP = collections.OrderedDict( ( # NOTE: We need an ordered dict due to the overlap between utf-32-le and utf-16-be. (codecs.BOM_UTF8, "utf-8-sig"), (codecs.BOM_UTF32_LE, "utf-32-le"), (codecs.BOM_UTF16_BE, "utf-16-be"), (codecs.BOM_UTF16_LE, "utf-16-le"), (codecs.BOM_UTF32_BE, "utf-32-be"), ) ) _XML_PROLOG_REGEX = re.compile(r'<\?xml\s+.*encoding="(?P[-_.a-zA-Z0-9]+)".*\?>') _MAGIC_COMMENT_LINE_START_REGEXES = [ re.compile(f"^{pattern}\\s*(?P.+)$", re.IGNORECASE) for pattern in [ r"#+", # Python, Ruby r"//+", # C++, Dart, Java, ... r"/\*+", # C etc r"--+", # Ada, SQL, VHDL r";+", # Assembly r"%+", # Latex, MatLab, Prolog r"rem\s", # Basic, Windows batch r"\*+", # Pascal r"\{", # Pascal ] ] _MAGIC_COMMENT_LINE_REMAINDER_REGEXES = [ re.compile(pattern, re.IGNORECASE) for pattern in [ # Covers for example "encoding: cp1252" and "encoding=cp1252". r"(en)?coding\s*[:=]\s*(?P[-_.a-z0-9]+)\b", # Covers for example "-*- coding: cp1252 -*-". r"-\*-\s*coding\s*[:=]\s*(?P[-_.a-z0-9]+)\s*(;.+\s*)?-\*-\s*", ] ] _STANDARD_PLAIN_TEXT_NAME_PATTERNS = ( # Text files for (moribund) gnits standards. "authors", "bugs", "changelog", "copying", "install", "license", "news", "readme", "thanks", # GitHub community recommendations, see # . # By now, in practice most projects use a suffix like "*.md" but some older ones # still might have such files without suffix. "code_of_conduct", "contributing", "support", # Other common text files. "changes", "faq", "readme\\.1st", "read\\.me", "todo", ) _PLAIN_TEXT_PATTERN = "(^" + "$)|(^".join(_STANDARD_PLAIN_TEXT_NAME_PATTERNS) + "$)" #: Regular expression to detect plain text files by name. 
_PLAIN_TEXT_NAME_REGEX = re.compile(_PLAIN_TEXT_PATTERN, re.IGNORECASE) _MARK_UP_NAME_PATTERN = r"^.*\.(md|rst|txt|\d+)$" _MARK_UP_NAME_REGEX = re.compile(_MARK_UP_NAME_PATTERN, re.IGNORECASE) #: Mapping for file suffixes to lexers for which pygments offers no official one. _SUFFIX_TO_FALLBACK_LEXER_MAP = { "fex": pygount.lexers.MinimalisticWebFocusLexer(), "idl": pygount.lexers.IdlLexer(), "m4": pygount.lexers.MinimalisticM4Lexer(), "svg": pygments.lexers.html.XmlLexer(), # TODO#213 Remove SVG hack. "txt": pygount.lexers.PlainTextLexer(), "vbe": pygount.lexers.MinimalisticVBScriptLexer(), "vbs": pygount.lexers.MinimalisticVBScriptLexer(), } for _oracle_suffix in ("pck", "pkb", "pks", "pls"): _SUFFIX_TO_FALLBACK_LEXER_MAP[_oracle_suffix] = pygments.lexers.get_lexer_by_name("plpgsql") @dataclass(frozen=True) class PathData: source_path: str group: str tmp_dir: Optional[str] = None def is_markup_file(source_path: str) -> bool: return _MARK_UP_NAME_REGEX.match(os.path.basename(source_path)) is not None class DuplicatePool: """ A pool that collects information about potential duplicate files. """ def __init__(self): self._size_to_paths_map = {} self._size_and_hash_to_path_map = {} @staticmethod def _hash_for(path_to_hash): buffer_size = 1024 * 1024 sha256_hash = hashlib.sha256() with open(path_to_hash, "rb", buffer_size) as file_to_hash: data = file_to_hash.read(buffer_size) while len(data) >= 1: sha256_hash.update(data) data = file_to_hash.read(buffer_size) return sha256_hash.digest() def duplicate_path(self, source_path: str) -> Optional[str]: """ Path to a duplicate for ``source_path`` or ``None`` if no duplicate exists. Internally information is stored to identify possible future duplicates of ``source_path``. 
""" result = None source_size = os.path.getsize(source_path) paths_with_same_size = self._size_to_paths_map.get(source_size) if paths_with_same_size is None: self._size_to_paths_map[source_size] = [source_path] else: source_hash = DuplicatePool._hash_for(source_path) if len(paths_with_same_size) == 1: # Retrofit the initial path with the same size and its hash. initial_path_with_same_size = paths_with_same_size[0] initial_hash = DuplicatePool._hash_for(initial_path_with_same_size) self._size_and_hash_to_path_map[(source_size, initial_hash)] = initial_path_with_same_size result = self._size_and_hash_to_path_map.get((source_size, source_hash)) self._size_and_hash_to_path_map[(source_size, source_hash)] = source_path return result class SourceAnalysis: """ Results from analyzing a source path. Prefer the factory methods :py:meth:`from_file()` and :py:meth:`from_state` to calling the constructor. """ def __init__( self, path: str, language: str, group: str, code: int, documentation: int, empty: int, string: int, state: SourceState, state_info: Optional[str] = None, ): SourceAnalysis._check_state_info(state, state_info) self._path = path self._language = language self._group = group self._code = code self._documentation = documentation self._empty = empty self._string = string self._state = state self._state_info = state_info @staticmethod def from_state( source_path: str, group: str, state: SourceState, state_info: Optional[str] = None, tmp_dir: Optional[str] = None, ) -> "SourceAnalysis": """ Factory method to create a :py:class:`SourceAnalysis` with all counts set to 0 and everything else according to the specified parameters. 
@staticmethod
def from_file(
    source_path: str,
    group: str,
    encoding: str = "automatic",
    fallback_encoding: str = "cp1252",
    generated_regexes: Optional[list[Pattern]] = None,
    duplicate_pool: Optional[DuplicatePool] = None,
    file_handle: Optional[IOBase] = None,
    merge_embedded_language: bool = False,
    tmp_dir: Optional[str] = None,
    *,
    generated_name_regexes: Optional[list[Pattern]] = None,
) -> "SourceAnalysis":
    """
    Factory method to create a :py:class:`SourceAnalysis` by analyzing
    the source code in ``source_path`` or the open file ``file_handle``.

    :param source_path: path to source code to analyze
    :param group: name of a logical group the source code belongs to, e.g. a
      package.
    :param encoding: encoding according to :func:`encoding_for`
    :param fallback_encoding: fallback encoding according to
      :func:`encoding_for`
    :param generated_regexes: list of regular expressions that, if found
      within the first few lines of a source code, identify it as generated
      source code for which SLOC should not be counted
    :param generated_name_regexes: list of regular expressions; if the base
      file name matches any of them, the file is considered to be generated
      and the SLOC should not be counted
    :param duplicate_pool: a :class:`DuplicatePool` where information about
      possible duplicates is collected, or ``None`` if possible duplicates
      should be counted multiple times.
    :param file_handle: a file-like object, or ``None`` to read and open the file from
      ``source_path``. If the file is open in text mode, it must be opened with the correct
      encoding.
    :param merge_embedded_language: If pygments detects a base and embedded language, the source
      code counts towards the base language. For example, "JavaScript+Lasso" counts as
      "JavaScript".
    :param tmp_dir: If a temporary directory was created, strip it from the path name. This happens
      right now only for git repositories. In this method it is only applied to the path of
      successfully analyzed sources.
    """
    assert encoding is not None

    result = None
    lexer = None
    source_code = None

    # Files whose name alone marks them as generated need not be read at all.
    if generated_name_regexes is not None:
        generated_name_regex = matching_regex(Path(source_path).name, generated_name_regexes)
        if generated_name_regex is not None:
            result = SourceAnalysis.from_state(
                source_path, group, SourceState.generated, state_info=generated_name_regex.pattern
            )

    # Quick checks that only work with an actual file on disk.
    if result is None and file_handle is None:
        source_size = os.path.getsize(source_path)
        if source_size == 0:
            _log.info("%s: is empty", source_path)
            result = SourceAnalysis.from_state(source_path, group, SourceState.empty)
        elif is_binary_file(source_path):
            _log.info("%s: is binary", source_path)
            result = SourceAnalysis.from_state(source_path, group, SourceState.binary)
        elif not has_lexer(source_path):
            _log.info("%s: unknown language", source_path)
            result = SourceAnalysis.from_state(source_path, group, SourceState.unknown)
        elif duplicate_pool is not None:
            # Only look for duplicates among files that would actually be analyzed. This way an
            # already determined state is never overwritten and empty, binary or unknown files
            # do not have to be hashed.
            duplicate_path = duplicate_pool.duplicate_path(source_path)
            if duplicate_path is not None:
                _log.info("%s: is a duplicate of %s", source_path, duplicate_path)
                result = SourceAnalysis.from_state(source_path, group, SourceState.duplicate, duplicate_path)

    # Read the source code, either from the path or from the file handle.
    if result is None:
        try:
            if file_handle is None:
                if encoding in ("automatic", "chardet"):
                    encoding = encoding_for(source_path, encoding, fallback_encoding)
                with open(source_path, encoding=encoding) as source_file:
                    source_code = source_file.read()
            elif not isinstance(file_handle, TextIOBase):
                # Binary file handle: detect the encoding and decode the bytes ourselves.
                if encoding in ("automatic", "chardet"):
                    encoding = encoding_for(source_path, encoding, fallback_encoding, file_handle=file_handle)
                source_code = file_handle.read().decode(encoding)
            else:
                # Text file handle: the caller is responsible for the correct encoding.
                source_code = file_handle.read()
        except (LookupError, OSError, UnicodeError) as error:
            _log.warning("cannot read %s using encoding %s: %s", source_path, encoding, error)
            result = SourceAnalysis.from_state(source_path, group, SourceState.error, str(error))
        if result is None:
            lexer = guess_lexer(source_path, source_code)
            assert lexer is not None

    # Check the first few lines for hints that the source code is generated.
    actual_generated_regexes = (
        generated_regexes
        if generated_regexes is not None
        else pygount.common.regexes_from(DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
    )
    if (result is None) and (len(actual_generated_regexes) != 0):
        number_line_and_regex = matching_number_line_and_regex(
            pygount.common.lines(source_code), actual_generated_regexes
        )
        if number_line_and_regex is not None:
            number, _, regex = number_line_and_regex
            message = f"line {number} matches {regex}"
            _log.info("%s: is generated code because %s", source_path, message)
            result = SourceAnalysis.from_state(source_path, group, SourceState.generated, message)

    # Finally, count the lines by kind.
    if result is None:
        assert lexer is not None
        assert source_code is not None
        language = base_language(lexer.name) if merge_embedded_language else lexer.name
        if ("xml" in language.lower()) or (language == "Genshi"):
            dialect = pygount.xmldialect.xml_dialect(source_path, source_code)
            if dialect is not None:
                language = dialect
        _log.info("%s: analyze as %s using encoding %s", source_path, language, encoding)
        mark_to_count_map = {"c": 0, "d": 0, "e": 0, "s": 0}
        is_markup = is_markup_file(source_path)
        for line_parts in _line_parts(lexer, source_code, is_markup=is_markup):
            # A line counts as the last kind found in it in the order documentation, string,
            # code; lines without any of these marks count as empty.
            mark_to_increment = "e"
            for mark_to_check in ("d", "s", "c"):
                if mark_to_check in line_parts:
                    mark_to_increment = mark_to_check
            mark_to_count_map[mark_to_increment] += 1
        reduced_path = source_path.rsplit(tmp_dir, maxsplit=1)[-1].lstrip(os.sep) if tmp_dir else source_path
        result = SourceAnalysis(
            path=reduced_path,
            language=language,
            group=group,
            code=mark_to_count_map["c"],
            documentation=mark_to_count_map["d"],
            empty=mark_to_count_map["e"],
            string=mark_to_count_map["s"],
            state=SourceState.analyzed,
            state_info=None,
        )

    assert result is not None
    return result
@property
def state_info(self) -> Optional[str]:
    """
    Possible additional information about :py:attr:`state` as text:

    * :py:attr:`SourceState.duplicate`: path to the original source file
      the :py:attr:`path` is a duplicate of
    * :py:attr:`SourceState.error`: the message of the error that prevented
      the analysis
    * :py:attr:`SourceState.generated`: a human-readable explanation why
      the file is considered to be generated

    For all other states this is ``None``; the constructor asserts both
    the type and this relation between state and state info.
    """
    return self._state_info
class SourceScanner:
    """
    Scanner for source code files matching certain conditions.

    The scanner can be used as a context manager; leaving the context calls
    :py:meth:`close()`, which closes all git storages created while scanning.
    """

    def __init__(
        self,
        source_patterns,
        suffixes="*",
        folders_to_skip=None,
        name_to_skip=None,
    ):
        """
        :param source_patterns: glob patterns or git repository URLs to scan
        :param suffixes: patterns for the file suffixes to include
        :param folders_to_skip: regular expressions for folder names to skip,
          or ``None`` to use the default patterns
        :param name_to_skip: regular expressions for file names to skip, or
          ``None`` to use the default patterns
        """
        self._source_patterns = source_patterns
        self._suffixes = pygount.common.regexes_from(suffixes)
        self._folder_regexps_to_skip = (
            folders_to_skip
            if folders_to_skip is not None
            else pygount.common.regexes_from(DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT)
        )
        # NOTE: This has to check name_to_skip, not folders_to_skip; otherwise the names to skip
        # are ignored or become None depending on an unrelated parameter.
        self._name_regexps_to_skip = (
            name_to_skip
            if name_to_skip is not None
            else pygount.common.regexes_from(DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)
        )
        self._git_storages = []

    def close(self):
        """Close all git storages that were created while scanning."""
        for git_storage in self._git_storages:
            git_storage.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        # Do not suppress exceptions raised inside the ``with`` block.
        return False

    @property
    def source_patterns(self):
        return self._source_patterns

    @property
    def suffixes(self) -> list[Pattern]:
        return self._suffixes

    @property
    def folder_regexps_to_skip(self) -> list[Pattern]:
        return self._folder_regexps_to_skip

    @folder_regexps_to_skip.setter
    def folder_regexps_to_skip(self, regexps_or_pattern_text):
        # Same form as the setter for name_regexps_to_skip: replace the list instead of
        # attempting to assign to the read-only ``append`` attribute of the existing list.
        self._folder_regexps_to_skip = pygount.common.regexes_from(
            regexps_or_pattern_text, self.folder_regexps_to_skip
        )

    @property
    def name_regexps_to_skip(self) -> list[Pattern]:
        return self._name_regexps_to_skip

    @name_regexps_to_skip.setter
    def name_regexps_to_skip(self, regexps_or_pattern_text):
        self._name_regexps_to_skip = pygount.common.regexes_from(regexps_or_pattern_text, self.name_regexps_to_skip)

    def _is_path_to_skip(self, name, is_folder) -> bool:
        """
        ``True`` if the folder or file ``name`` (without any path) matches one of the
        respective patterns to skip.
        """
        assert os.sep not in name, f"name={name!r}"
        regexps_to_skip = self._folder_regexps_to_skip if is_folder else self._name_regexps_to_skip
        return any(path_name_to_skip_regex.match(name) is not None for path_name_to_skip_regex in regexps_to_skip)

    def _paths_and_group_to_analyze_in(self, folder, group, tmp_dir) -> Iterator[PathData]:
        """
        Recursively yield the files in ``folder`` that are neither symbolic links nor match a
        skip pattern; all of them are assigned to ``group``.
        """
        assert folder is not None
        assert group is not None

        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            if not os.path.islink(path):
                is_folder = os.path.isdir(path)
                if self._is_path_to_skip(os.path.basename(path), is_folder):
                    _log.debug("skip due to matching skip pattern: %s", path)
                elif is_folder:
                    yield from self._paths_and_group_to_analyze_in(path, group, tmp_dir)
                else:
                    yield PathData(source_path=path, group=group, tmp_dir=tmp_dir)

    def _paths_and_group_to_analyze(self, path_to_analyse_pattern, group=None, tmp_dir=None) -> Iterator[PathData]:
        """
        Yield the files matching the glob pattern ``path_to_analyse_pattern``, descending into
        matching folders. Unless ``group`` is specified, it is derived from the name of the
        folder.
        """
        for path_to_analyse in glob.glob(path_to_analyse_pattern):
            if os.path.islink(path_to_analyse):
                _log.debug("skip link: %s", path_to_analyse)
            else:
                is_folder = os.path.isdir(path_to_analyse)
                if self._is_path_to_skip(os.path.basename(path_to_analyse), is_folder):
                    _log.debug("skip due to matching skip pattern: %s", path_to_analyse)
                else:
                    actual_group = group
                    if is_folder:
                        if actual_group is None:
                            actual_group = os.path.basename(path_to_analyse)
                            if actual_group == "":
                                # Compensate for trailing path separator.
                                actual_group = os.path.basename(os.path.dirname(path_to_analyse))
                        # Descend into the folder the pattern matched, not into the pattern
                        # itself, which might contain wildcards and match several folders.
                        yield from self._paths_and_group_to_analyze_in(path_to_analyse, actual_group, tmp_dir)
                    else:
                        if actual_group is None:
                            actual_group = os.path.dirname(path_to_analyse)
                            if actual_group == "":
                                actual_group = os.path.basename(os.path.dirname(os.path.abspath(path_to_analyse)))
                        yield PathData(source_path=path_to_analyse, group=actual_group, tmp_dir=tmp_dir)

    def _source_paths_and_groups_to_analyze(self, source_patterns_to_analyze) -> list[PathData]:
        """
        Sorted list without duplicates of all files found for ``source_patterns_to_analyze``,
        which can be glob patterns or URLs of git repositories.

        :raise OSError: if a pattern cannot be scanned
        :raise pygount.Error: if a URL does not refer to a git repository
        """
        assert source_patterns_to_analyze is not None
        result = []

        def _process_source_pattern(source_pattern: str):
            remote_url, revision = git_remote_url_and_revision_if_any(source_pattern)
            if remote_url is not None:
                git_storage = GitStorage(remote_url, revision)
                # Remember the storage so close() can clean it up later.
                self._git_storages.append(git_storage)
                git_storage.extract()
                result.extend(
                    self._paths_and_group_to_analyze(git_storage.temp_folder, tmp_dir=git_storage.temp_folder)
                )
            else:
                has_url_prefix = re.match(HTTP_URL_REGEX, source_pattern)
                if has_url_prefix:
                    is_git_url = re.match(GIT_REPO_REGEX, source_pattern) is not None
                    if not is_git_url:
                        raise pygount.Error(
                            f'URL to git repository {source_pattern} must end with ".git" or must match the pattern '
                            f"http(s)://({'|'.join(_ALLOWED_GIT_PLATFORMS)})/<...>/<...>.git. "
                            f"For example: git@github.com:roskakori/pygount.git or "
                            f"https://github.com/roskakori/pygount.git."
                        )
                    # Retry with the suffix ".git" appended to the URL.
                    source_pattern = source_pattern.rstrip("/")
                    _process_source_pattern(source_pattern + ".git")
                else:
                    result.extend(self._paths_and_group_to_analyze(source_pattern))

        # NOTE: We could avoid initializing `source_pattern_to_analyze` here by moving the `try` inside
        #  the loop, but this would incur a performance overhead (ruff's PERF203).
        source_pattern_to_analyze = None
        try:
            for source_pattern_to_analyze in source_patterns_to_analyze:
                _process_source_pattern(source_pattern_to_analyze)
        except OSError as error:
            assert source_pattern_to_analyze is not None
            raise OSError(f'cannot scan "{source_pattern_to_analyze}" for source files: {error}') from error
        result = sorted(set(result), key=lambda data: (data.source_path, data.group))
        return result

    def source_paths(self) -> Iterator[PathData]:
        """
        Paths to source code files matching all the conditions for this scanner.
        """
        source_paths_and_groups_to_analyze = self._source_paths_and_groups_to_analyze(self.source_patterns)

        for path_data in source_paths_and_groups_to_analyze:
            suffix = os.path.splitext(path_data.source_path)[1].lstrip(".")
            is_suffix_to_analyze = any(suffix_regexp.match(suffix) for suffix_regexp in self.suffixes)
            if is_suffix_to_analyze:
                yield path_data
            else:
                _log.info("skip due to suffix: %s", path_data.source_path)
def _delined_tokens(tokens: Iterator[tuple[TokenType, str]]) -> Iterator[tuple[TokenType, str]]:
    """
    Split ``tokens`` at line feeds.

    Each yielded ``(token_type, token_text)`` pair contains at most one line
    feed, which then is the last character of ``token_text``. Tokens with
    empty text are dropped.
    """
    for token_type, token_text in tokens:
        # All parts but the last one were terminated by a line feed in the original text.
        *terminated_lines, unterminated_rest = token_text.split("\n")
        for terminated_line in terminated_lines:
            yield token_type, terminated_line + "\n"
        if unterminated_rest != "":
            yield token_type, unterminated_rest
def encoding_for(
    source_path: str,
    encoding: str = "automatic",
    fallback_encoding: Optional[str] = None,
    file_handle: Optional[Union[BufferedIOBase, RawIOBase]] = None,
) -> str:
    """
    The encoding used by the text file stored in ``source_path`` or, if
    specified, available from the seekable binary ``file_handle``. The
    position of ``file_handle`` is restored after reading from it.

    The algorithm used is:

    * If ``encoding`` is ``'automatic'``, attempt the following:

      1. For an empty file, use UTF-8.
      2. Check BOM for UTF-8, UTF-16 and UTF-32.
      3. Look for a magic heading like ``# -*- coding: cp1252 -*-`` or an
         XML prolog within the first 128 bytes.
      4. If all this fails and ``fallback_encoding`` is specified, use it.
      5. Otherwise, attempt to read the file using UTF-8 and, if this
         fails, use ``DEFAULT_FALLBACK_ENCODING``.

    * If ``encoding`` is ``'chardet'`` use :mod:`chardet` to obtain the
      encoding; if it cannot determine one, continue with steps 4 and 5.
    * For any other ``encoding`` simply use the specified value.

    :raise pygount.Error: if ``file_handle`` is specified but not seekable
    """
    assert encoding is not None

    if encoding == "automatic":
        if file_handle is None:
            with open(source_path, "rb") as source_file:
                heading = source_file.read(128)
        else:
            check_file_handle_is_seekable(file_handle, source_path)
            heading = file_handle.read(128)
            # Rewind by the number of bytes actually read.
            file_handle.seek(-len(heading), SEEK_CUR)
        result = None
        if len(heading) == 0:
            # File is empty, assume a dummy encoding.
            result = "utf-8"
        if result is None:
            result = next(
                (
                    encoding_for_bom
                    for bom, encoding_for_bom in _BOM_TO_ENCODING_MAP.items()
                    if heading[: len(bom)] == bom
                ),
                None,
            )
        if result is None:
            result = encoding_from_header(heading)
    elif encoding == "chardet":
        assert _detector is not None, (
            'without chardet installed, encoding="chardet" must be rejected before calling encoding_for()'
        )
        _detector.reset()
        if file_handle is None:
            with open(source_path, "rb") as source_file:
                lines = source_file.readlines()
        else:
            check_file_handle_is_seekable(file_handle, source_path)
            file_position = file_handle.tell()
            lines = file_handle.readlines()
            file_handle.seek(file_position)
        for line in lines:
            _detector.feed(line)
            if _detector.done:
                break
        # According to the chardet documentation, the detector has to be closed after feeding
        # so it can make its final decision; without this, the result can remain undetermined
        # unless the detector was already done early.
        _detector.close()
        result = _detector.result["encoding"]
        if result is None:
            _log.warning(
                "%s: chardet cannot determine encoding, assuming fallback encoding %s", source_path, fallback_encoding
            )
            result = fallback_encoding
    else:
        # Simply use the specified encoding.
        result = encoding
    if result is None:
        # Encoding 'automatic' or 'chardet' failed to detect anything.
        if fallback_encoding is not None:
            # If defined, use the fallback encoding.
            result = fallback_encoding
        else:
            try:
                # Attempt to read the file as UTF-8.
                if file_handle is None:
                    with open(source_path, encoding="utf-8") as source_file:
                        source_file.read()
                else:
                    check_file_handle_is_seekable(file_handle, source_path)
                    file_position = file_handle.tell()
                    content = file_handle.read()
                    file_handle.seek(file_position)
                    # The file handle is binary, so reading alone never fails on invalid UTF-8;
                    # the bytes have to be decoded explicitly.
                    if content:
                        content.decode("utf-8")
                result = "utf-8"
            except UnicodeDecodeError:
                # UTF-8 did not work out, use the default as last resort.
                result = DEFAULT_FALLBACK_ENCODING
            _log.debug("%s: no fallback encoding specified, using %s", source_path, result)

    assert result is not None
    return result
def guess_lexer(source_path: str, text: str) -> pygments.lexer.Lexer:
    """
    Lexer to use for the source code ``text`` stored in ``source_path``.

    Files with a well known plain text name get a plain text lexer. For all
    other files pygments is asked first; if it does not know a lexer, the
    fallback lexer registered for the file suffix is used. If there is no
    such fallback either, the result is ``None``.
    """
    if is_plain_text(source_path):
        return pygount.lexers.PlainTextLexer()
    try:
        return pygments.lexers.guess_lexer_for_filename(source_path, text)
    except pygments.util.ClassNotFound:
        _, dotted_suffix = os.path.splitext(os.path.basename(source_path))
        return _SUFFIX_TO_FALLBACK_LEXER_MAP.get(dotted_suffix.lstrip("."))
VALID_OUTPUT_FORMATS = ("cloc-xml", "json", "sloccount", "summary")

# Defaults for the command line options.
_DEFAULT_ENCODING = "automatic"
_DEFAULT_OUTPUT_FORMAT = "sloccount"
_DEFAULT_OUTPUT = "STDOUT"
_DEFAULT_SOURCE_PATTERNS = os.curdir
_DEFAULT_SUFFIXES = "*"

# Help texts for the command line options. A "%(default)s" is replaced by
# argparse with the default value of the respective option.
_HELP_ENCODING = (
    'encoding to use when reading source code; use "automatic" to take BOMs, XML '
    "prolog and magic headers into account and fall back to UTF-8 or CP1252 if "
    # The placeholder describes the part after the semicolon, for example "automatic;cp1252".
    'none fits; use "automatic;<fallback>" to specify a different fallback encoding '
    'than CP1252; use "chardet" to let the chardet package determine the encoding; '
    'default: "%(default)s"'
)
_HELP_EPILOG = (
    "SHELL-PATTERN is a pattern using *, ? and ranges like [a-z] as placeholders. "
    "PATTERNS is a comma separated list of SHELL-PATTERN. The prefix [regex] indicated "
    "that the PATTERNS use regular expression syntax. If default values are available, "
    "[...] indicates that the PATTERNS extend the existing default values."
)
_HELP_FORMAT = (
    f"output format, one of: "
    # HACK The chr(34) is necessary because ruff does not preserve the
    #  backslash in '\"'.
    f"{', '.join([chr(34) + output_format + chr(34) for output_format in VALID_OUTPUT_FORMATS])};"
    f' default: "%(default)s"'
)
_HELP_GENERATED = "comma separated list of regular expressions to detect generated code; default: %(default)s"
# Files matching these names ARE treated as generated, consequently the text must not claim
# the opposite.
_HELP_GENERATED_NAMES = (
    "comma separated list of glob patterns for file names to treat as generated. "
    'Use "..." as first entry to append patterns to the default patterns; '
    "default: %(default)s"
)
_HELP_MERGE_EMBEDDED_LANGUAGES = (
    "merge counts for embedded languages into their base language; "
    "for example, HTML+Jinja2 counts as HTML"
)
_HELP_FOLDERS_TO_SKIP = (
    "comma separated list of glob patterns for folder names not to analyze. "
    'Use "..." as first entry to append patterns to the default patterns; '
    "default: %(default)s"
)
def _check_encoding(name, encoding_to_check, alternative_encoding, source=None):
    """
    Check that ``encoding_to_check`` is a valid Python encoding.

    The values ``alternative_encoding``, "chardet" and ``None`` are always
    accepted without further checks.

    :param name: name under which the encoding is known to the user, e.g. 'default encoding'
    :param encoding_to_check: name of the encoding to check, e.g. 'utf-8'
    :param alternative_encoding: special value to accept instead of the name
      of an encoding, e.g. 'automatic'
    :param source: source where the encoding has been set, e.g. option name
    :raise pygount.common.OptionError: if ``encoding_to_check`` is not a valid Python encoding
    """
    assert name is not None

    values_to_accept_unchecked = (alternative_encoding, "chardet", None)
    if encoding_to_check in values_to_accept_unchecked:
        return
    try:
        # Encoding an empty text fails only if the encoding is unknown.
        "".encode(encoding_to_check)
    except LookupError:
        raise pygount.common.OptionError(
            f'{name} is "{encoding_to_check}" but must be "{alternative_encoding}" or a known Python encoding',
            source,
        ) from None
def set_encodings(self, encoding, source=None):
    """
    Set both the default and the fallback encoding from ``encoding``, which
    can take the following forms:

    * "automatic" or "chardet": detect the encoding without a fallback
      encoding
    * "automatic;FALLBACK" or "chardet;FALLBACK": detect the encoding and
      use the text after the first semicolon as fallback encoding
    * anything else: the name of the encoding to use, combined with the
      default fallback encoding of :py:mod:`pygount.analysis`

    :raise pygount.common.OptionError: if chardet is requested but not installed
    """
    detection_mode, semicolon, explicit_fallback = encoding.partition(";")
    if detection_mode == "chardet" and not pygount.analysis.has_chardet:  # pragma: no cover
        raise pygount.common.OptionError('chardet must be installed to set default encoding to "chardet"')

    if detection_mode in ("automatic", "chardet"):
        default_encoding = detection_mode
        # Without a semicolon, there is no fallback encoding at all.
        fallback_encoding = explicit_fallback if semicolon else None
    else:
        default_encoding = encoding
        fallback_encoding = pygount.analysis.DEFAULT_FALLBACK_ENCODING
    self.set_default_encoding(default_encoding, source)
    self.set_fallback_encoding(fallback_encoding, source)
def set_generated_name_regexps(self, regexes_or_pattern_text, source=None):
    """
    Set the regular expressions that mark a file as generated based on its
    name.

    :param regexes_or_pattern_text: patterns as accepted by
      :py:func:`pygount.common.regexes_from`; a leading "[...]" appends the
      patterns to the default patterns for generated names
    :param source: source where the patterns have been set, e.g. option name
    """
    # The defaults to extend must be the ones for generated names, the same
    # ones __init__() and the --generated-names option start with, not the
    # defaults for names to skip.
    self._generated_name_regexps = pygount.common.regexes_from(
        regexes_or_pattern_text, pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT, source
    )
def set_source_patterns(self, glob_patterns_or_text, source=None):
    """
    Set the patterns for source files and folders to scan.

    :param glob_patterns_or_text: either a comma separated text or a sequence
      of patterns; it is converted with :py:func:`pygount.common.as_list`
    :param source: accepted for symmetry with the other setters; not used
      because no validation takes place here
    """
    assert glob_patterns_or_text is not None
    # NOTE: The former ``assert len(self._source_patterns) >= 0`` could never
    # fail, so it has been removed. An empty list of patterns is accepted here.
    self._source_patterns = pygount.common.as_list(glob_patterns_or_text)
"--names-to-skip", "-N", metavar="PATTERNS", default=pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT, help=_HELP_NAMES_TO_SKIP, ) parser.add_argument( "--out", "-o", metavar="FILE", default=_DEFAULT_OUTPUT, help='file to write results to; use "STDOUT" for standard output; default: "%(default)s"', ) parser.add_argument("--suffix", "-s", metavar="PATTERNS", default=_DEFAULT_SUFFIXES, help=_HELP_SUFFIX) parser.add_argument( "source_patterns", metavar="SHELL-PATTERN", nargs="*", default=[os.getcwd()], help="source files and directories to scan; can use glob patterns; default: current directory", ) parser.add_argument("--verbose", "-v", action="store_true", help="explain what is being done") parser.add_argument("--version", action="version", version="%(prog)s " + pygount.__version__) return parser def parsed_args(self, arguments): assert arguments is not None parser = self.argument_parser() args = parser.parse_args(arguments) if args.encoding == "automatic": default_encoding = args.encoding fallback_encoding = None elif args.encoding == "chardet": if not pygount.analysis.has_chardet: # pragma: no cover parser.error("chardet must be installed in order to specify --encoding=chardet") default_encoding = args.encoding fallback_encoding = None else: if args.encoding.startswith("automatic;"): first_encoding_semicolon_index = args.encoding.find(";") default_encoding = args.encoding[:first_encoding_semicolon_index] fallback_encoding = args.encoding[first_encoding_semicolon_index + 1 :] encoding_to_check = ("fallback encoding", fallback_encoding) else: default_encoding = args.encoding fallback_encoding = None encoding_to_check = ("encoding", default_encoding) if encoding_to_check is not None: name, encoding = encoding_to_check try: "".encode(encoding) except LookupError: parser.error(f"{name} specified with --encoding must be a known Python encoding: {encoding}") return args, default_encoding, fallback_encoding def apply_arguments(self, arguments=None): if arguments is None: 
# pragma: no cover arguments = sys.argv[1:] args, default_encoding, fallback_encoding = self.parsed_args(arguments) self.set_default_encoding(default_encoding, "option --encoding") self.set_fallback_encoding(fallback_encoding, "option --encoding") self.set_folders_to_skip(args.folders_to_skip, "option --folders-to-skip") self.set_generated_regexps(args.generated, "option --generated") self.set_generated_name_regexps(args.generated_names, "option --generated-names") self.set_has_duplicates(args.duplicates, "option --duplicates") self.set_has_to_merge_embedded_languages(args.merge_embedded_languages, "option --merge-embedded-languages") self.set_is_verbose(args.verbose, "option --verbose") self.set_names_to_skip(args.names_to_skip, "option --names-to-skip") self.set_output(args.out, "option --out") self.set_output_format(args.format, "option --format") self.set_source_patterns(args.source_patterns, "option PATTERNS") self.set_suffixes(args.suffix, "option --suffix") def execute(self): _log.setLevel(logging.INFO if self.is_verbose else logging.WARNING) with pygount.analysis.SourceScanner( self.source_patterns, self.suffixes, self.folders_to_skip, self.names_to_skip ) as source_scanner: source_paths_and_groups_to_analyze = list(source_scanner.source_paths()) duplicate_pool = pygount.analysis.DuplicatePool() if not self.has_duplicates else None writer_class = _OUTPUT_FORMAT_TO_WRITER_CLASS_MAP[self.output_format] is_stdout = self.output == "STDOUT" target_context_manager = ( contextlib.nullcontext(sys.stdout) if is_stdout else open(self.output, "w", encoding="utf-8", newline="") # noqa: SIM115 ) with ( target_context_manager as target_file, writer_class(target_file) as writer, Progress(disable=not writer.has_to_track_progress, transient=True) as progress, ): try: for path_data in progress.track(source_paths_and_groups_to_analyze): writer.add( pygount.analysis.SourceAnalysis.from_file( path_data.source_path, path_data.group, self.default_encoding, self.fallback_encoding, 
def pygount_command(arguments=None):
    """
    Run pygount with the command line ``arguments`` and log errors instead
    of raising them.

    :param arguments: command line arguments passed on to
      :py:meth:`Command.apply_arguments`
    :return: exit code: 0 if the command finished without error, otherwise 1
    """
    command = Command()
    try:
        command.apply_arguments(arguments)
        command.execute()
    except KeyboardInterrupt:  # pragma: no cover
        _log.error("interrupted as requested by user")
    except (pygount.common.OptionError, OSError) as expected_error:
        # Errors the user can fix; the message is enough.
        _log.error(expected_error)
    except Exception as unexpected_error:
        # Anything else is logged including the stack trace.
        _log.exception(unexpected_error)
    else:
        return 0
    return 1
""" def __init__(self, message, source=None): super().__init__(message) self.option_error_message = (source + ": ") if source is not None else "" self.option_error_message += message def __str__(self): return self.option_error_message def as_list(items_or_text: Union[str, Sequence[str]]) -> list[str]: if isinstance(items_or_text, str): # TODO: Allow to specify comma (,) in text using '[,]'. result = [item.strip() for item in items_or_text.split(",") if item.strip() != ""] else: result = list(items_or_text) return result def regex_from(pattern: Union[str, Pattern], is_shell_pattern=False) -> Pattern: assert pattern is not None if isinstance(pattern, str): result = re.compile(fnmatch.translate(pattern)) if is_shell_pattern else re.compile(pattern) else: result = pattern # Assume pattern already is a compiled regular expression return result def regexes_from( patterns_text: Union[str, Sequence[str], Sequence[Pattern]], default_patterns_text: Optional[Union[str, Sequence[Pattern], Sequence[str]]] = None, source: Optional[str] = None, ) -> list[Pattern]: assert patterns_text is not None result = [] default_regexes = [] try: if isinstance(patterns_text, str): is_shell_pattern = True patterns_text_without_prefixes = patterns_text if patterns_text_without_prefixes.startswith(REGEX_PATTERN_PREFIX): is_shell_pattern = False patterns_text_without_prefixes = patterns_text_without_prefixes[len(REGEX_PATTERN_PREFIX) :] if patterns_text_without_prefixes.startswith(ADDITIONAL_PATTERN): assert default_patterns_text is not None default_regexes = regexes_from(default_patterns_text) patterns_text_without_prefixes = patterns_text_without_prefixes[len(ADDITIONAL_PATTERN) :] patterns = as_list(patterns_text_without_prefixes) result = [regex_from(pattern, is_shell_pattern) for pattern in patterns] else: regexes = list(patterns_text) if len(regexes) >= 1 and regexes[0] is None: default_regexes = regexes_from(default_patterns_text) regexes = regexes[1:] for supposed_regex in regexes: assert 
def lines(text: str) -> Iterator[str]:
    """
    Generator function to yield the lines stored in ``text``, using the line
    feed character as delimiter. The yielded lines do not include the
    delimiter, and an empty part after the final line feed is not yielded.

    This is useful when a regular expression should only match on a per-line
    basis in a memory efficient way.

    ``text`` must not contain any carriage return characters.
    """
    assert text is not None
    assert "\r" not in text

    line_start = 0
    while True:
        line_end = text.find("\n", line_start)
        if line_end == -1:
            break
        yield text[line_start:line_end]
        line_start = line_end + 1

    remainder = text[line_start:]
    if remainder != "":
        yield remainder
def mapped_repr(type_, name_to_value_map) -> str:
    """
    Text for ``__repr__()`` of the form ``TypeName(name1=value1, ...)``.

    :param type_: either the object to represent (typically ``self``) or its
      class; in both cases the name of the class is used
    :param name_to_value_map: maps attribute names to the values to show, in
      the order they should appear
    """
    # Callers pass ``self``, but the parameter name suggests a class. Accept
    # both, so passing a class does not result in "type(...)".
    type_name = type_.__name__ if isinstance(type_, type) else type_.__class__.__name__
    attributes = ", ".join(f"{name}={value}" for name, value in name_to_value_map.items())
    return f"{type_name}({attributes})"
class GitStorage:
    """
    Temporary local storage for a shallow clone of a remote git repository.

    Creating an instance already creates the temporary folder;
    :py:meth:`extract` clones the repository into it and :py:meth:`close`
    removes the folder again.
    """

    def __init__(self, remote_url: str, revision: Optional[str] = None):
        assert remote_url is not None

        self._remote_url = remote_url
        self._revision = revision
        self._temp_folder = mkdtemp()

    @property
    def temp_folder(self) -> str:
        """path of the temporary folder the repository is cloned into"""
        return self._temp_folder

    def extract(self):
        """
        Clone the remote repository into :py:attr:`temp_folder` with a depth
        of 1, using the revision as branch to clone if one has been specified.
        """
        revision_options = [] if self._revision is None else ["--branch", self._revision]
        git.Repo.clone_from(
            self._remote_url,
            self._temp_folder,
            multi_options=["--depth", "1", *revision_options],
        )

    def close(self):
        """Remove the temporary folder, ignoring errors while doing so."""
        shutil.rmtree(self._temp_folder, ignore_errors=True)
""" name = "M4" tokens = { "root": [ (r"(.*)(#.*\n)", pygments.lexer.bygroups(pygments.token.Text, pygments.token.Comment.Single)), (r".*\n", pygments.token.Text), ] } class MinimalisticVBScriptLexer(pygments.lexer.RegexLexer): """ Minimalistic lexer for VBScript that can distinguish between comments and code. """ name = "VBScript" tokens = {"root": [(r"\s*'.*\n", pygments.token.Comment.Single), (r".*\n", pygments.token.Text)]} class MinimalisticWebFocusLexer(pygments.lexer.RegexLexer): """ Minimalistic lexer for WebFOCUS that can distinguish between comments and code. """ name = "WebFOCUS" tokens = {"root": [(r"-\*.*\n", pygments.token.Comment.Single), (r".*\n", pygments.token.Text)]} class PlainTextLexer(pygments.lexer.RegexLexer): """ Simple lexer for plain text that treats every line with non-white space characters as :py:data:`pygments.Token.Comment.Single` and only lines that are empty or contain only white space as :py:data:`pygments.Token.Text`. This way, plaint text files count as documentation. """ name = "Text" tokens = {"root": [(r"\s*\n", pygments.token.Text), (r".+\n", pygments.token.Comment.Single)]} ================================================ FILE: pygount/summary.py ================================================ """ Summaries of analyses of multiple source codes. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. import functools import re from collections.abc import Hashable from .analysis import SourceAnalysis from .common import mapped_repr _PSEUDO_LANGUAGE_REGEX = re.compile("^__[a-z]+__$") @functools.total_ordering class LanguageSummary: """ Summary of a source code counts from multiple files of the same language. 
""" def __init__(self, language: str): self._language = language self._code_count = 0 self._documentation_count = 0 self._empty_count = 0 self._file_count = 0 self._file_percentage = 0.0 self._string_count = 0 self._is_pseudo_language = _PSEUDO_LANGUAGE_REGEX.match(self.language) is not None self._has_up_to_date_percentages = False @property def language(self) -> str: """the language to be summarized""" return self._language @property def code_count(self) -> int: """sum lines of code for this language""" return self._code_count @property def code_percentage(self) -> float: """percentage of lines containing code for this language across entire project""" return _percentage_or_0(self.code_count, self.line_count) def _assert_has_up_to_date_percentages(self): assert self._has_up_to_date_percentages, "update_percentages() must be called first" @property def documentation_count(self) -> int: """sum lines of documentation for this language""" return self._documentation_count @property def documentation_percentage(self) -> float: """percentage of lines containing documentation for this language across entire project""" return _percentage_or_0(self.documentation_count, self.line_count) @property def empty_count(self) -> int: """sum empty lines for this language""" return self._empty_count @property def empty_percentage(self) -> float: """percentage of empty lines for this language across entire project""" return _percentage_or_0(self.empty_count, self.line_count) @property def file_count(self) -> int: """number of source code files for this language""" return self._file_count @property def file_percentage(self) -> float: """percentage of files in project""" self._assert_has_up_to_date_percentages() return self._file_percentage @property def line_count(self) -> int: """sum count of all lines of any kind for this language""" return self.code_count + self.documentation_count + self.empty_count + self.string_count @property def string_count(self) -> int: """sum number of lines 
containing strings for this language""" return self._string_count @property def string_percentage(self) -> float: """percentage of lines containing strings for this language across entire project""" return _percentage_or_0(self.string_count, self.line_count) @property def source_count(self) -> int: """sum number of source lines of code""" return self.code_count + self.string_count @property def source_percentage(self) -> float: """percentage of source lines for code for this language across the entire project""" return _percentage_or_0(self.source_count, self.line_count) @property def is_pseudo_language(self) -> bool: """``True`` if the language is not a real programming language""" return self._is_pseudo_language def sort_key(self) -> Hashable: """sort key to sort multiple languages by importance""" return self.code_count, self.documentation_count, self.string_count, self.empty_count, self.language def __hash__(self): return hash(self.language) def __eq__(self, other): return self.sort_key() == other.sort_key() def __lt__(self, other): return self.sort_key() < other.sort_key() def add(self, source_analysis: SourceAnalysis) -> None: """ Add counts from ``source_analysis`` to total counts for this language. 
""" assert source_analysis is not None assert source_analysis.language == self.language self._has_up_to_date_percentages = False self._file_count += 1 if source_analysis.is_countable: self._code_count += source_analysis.code_count self._documentation_count += source_analysis.documentation_count self._empty_count += source_analysis.empty_count self._string_count += source_analysis.string_count def update_file_percentage(self, project_summary: "ProjectSummary"): self._file_percentage = _percentage_or_0(self.file_count, project_summary.total_file_count) self._has_up_to_date_percentages = True def __repr__(self): name_to_value_map = { "language": f"{self.language!r}", "file_count": self.file_count, } if not self.is_pseudo_language: name_to_value_map.update( { "code_count": self.code_count, "documentation_count": self.documentation_count, "empty_count": self.empty_count, "string_count": self.string_count, } ) return mapped_repr(self, name_to_value_map) def _percentage_or_0(partial_count: int, total_count: int) -> float: assert partial_count >= 0 assert total_count >= 0 return 100 * partial_count / total_count if total_count != 0 else 0.0 class ProjectSummary: """ Summary of source code counts for several languages and files. """ def __init__(self): self._language_to_language_summary_map = {} self._total_code_count = 0 self._total_documentation_count = 0 self._total_empty_count = 0 self._total_string_count = 0 self._total_file_count = 0 self._total_line_count = 0 @property def language_to_language_summary_map(self) -> dict[str, LanguageSummary]: """ A map containing summarized counts for each language added with :py:meth:`add()` so far. 
""" return self._language_to_language_summary_map @property def total_code_count(self) -> int: return self._total_code_count @property def total_code_percentage(self) -> float: return _percentage_or_0(self.total_code_count, self.total_line_count) @property def total_documentation_count(self) -> int: return self._total_documentation_count @property def total_documentation_percentage(self) -> float: return _percentage_or_0(self.total_documentation_count, self.total_line_count) @property def total_empty_count(self) -> int: return self._total_empty_count @property def total_empty_percentage(self) -> float: return _percentage_or_0(self.total_empty_count, self.total_line_count) @property def total_file_count(self) -> int: return self._total_file_count @property def total_line_count(self) -> int: return self._total_line_count @property def total_source_count(self) -> int: return self.total_code_count + self.total_string_count @property def total_source_percentage(self) -> float: return _percentage_or_0(self.total_source_count, self.total_line_count) @property def total_string_count(self) -> int: return self._total_string_count @property def total_string_percentage(self) -> float: return _percentage_or_0(self.total_string_count, self.total_line_count) def add(self, source_analysis: SourceAnalysis) -> None: """ Add counts from ``source_analysis`` to total counts. 
""" self._total_file_count += 1 language_summary = self.language_to_language_summary_map.get(source_analysis.language) if language_summary is None: language_summary = LanguageSummary(source_analysis.language) self.language_to_language_summary_map[source_analysis.language] = language_summary language_summary.add(source_analysis) if source_analysis.is_countable: self._total_code_count += source_analysis.code_count self._total_documentation_count += source_analysis.documentation_count self._total_empty_count += source_analysis.empty_count self._total_line_count += ( source_analysis.code_count + source_analysis.documentation_count + source_analysis.empty_count + source_analysis.string_count ) self._total_string_count += source_analysis.string_count def update_file_percentages(self) -> None: """Update percentages for all languages part of the project.""" for language_summary in self._language_to_language_summary_map.values(): language_summary.update_file_percentage(self) def __repr__(self): return ( f"{self.__class__.__name__}(" f"total_file_count={self.total_file_count}, " f"total_line_count={self.total_line_count}, " f"languages={sorted(self.language_to_language_summary_map.keys())})" ) ================================================ FILE: pygount/write.py ================================================ """ Writers to store the results of a pygount analysis. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. import datetime import json import math import os from xml.etree import ElementTree from rich.console import Console from rich.table import Table import pygount from . import SourceAnalysis from .summary import ProjectSummary #: Version of cloc the --format=cloc-xml pretends to be. 
class BaseWriter:
    """
    Base class for writers that collects a project summary and timing
    statistics for the source analyses added to it.

    Writers can be used as context manager, in which case :py:meth:`close` is
    called on exit.
    """

    def __init__(self, target_stream):
        self._target_stream = target_stream
        try:
            self.target_name = self._target_stream.name
        except AttributeError:
            # Streams without a name, for example io.StringIO.
            self.target_name = ""
        self.project_summary = ProjectSummary()
        self.started_at = self._utc_now()
        self.finished_at = None
        self.files_per_second = 0
        self.lines_per_second = 0
        self.duration = None
        self.duration_in_seconds = 0.0
        self.has_to_track_progress = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        # Do not suppress exceptions raised inside the ``with`` block.
        return False

    def add(self, source_analysis):
        """Add ``source_analysis`` to the project summary."""
        self.project_summary.add(source_analysis)

    def close(self):
        """
        Finish the summary: update the file percentages and compute the
        duration and throughput since the writer was created.
        """
        self.project_summary.update_file_percentages()
        self.finished_at = self._utc_now()
        self.duration = self.finished_at - self.started_at
        # Use at least 1 millisecond to avoid a division by zero below.
        self.duration_in_seconds = max(0.001, self.duration.total_seconds())
        self.lines_per_second = self.project_summary.total_line_count / self.duration_in_seconds
        self.files_per_second = self.project_summary.total_file_count / self.duration_in_seconds

    @staticmethod
    def _utc_now() -> datetime.datetime:
        # After switching to Python 3.11+, we can change this to `now(datetime.UTC)`.
        return datetime.datetime.now(datetime.timezone.utc)
""" def __init__(self, target_stream): super().__init__(target_stream) self._results_element = ElementTree.Element("results") self._header_element = ElementTree.SubElement(self._results_element, "header") ElementTree.SubElement(self._header_element, "cloc_url", text="https://github.com/roskakori/pygount") ElementTree.SubElement(self._header_element, "cloc_version", text=CLOC_VERSION) self._files_element = ElementTree.SubElement(self._results_element, "files") def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is None: # Only write the XML if everything works out. self.close() def add(self, source_analysis: SourceAnalysis): super().add(source_analysis) file_attributes = { "blank": str(source_analysis.empty_count), "code": str(source_analysis.source_count), "comment": str(source_analysis.documentation_count), "language": source_analysis.language, "name": source_analysis.path, } ElementTree.SubElement(self._files_element, "file", attrib=file_attributes) def close(self): super().close() # Add various statistics to
. ElementTree.SubElement(self._header_element, "elapsed_seconds", text=str(self.duration_in_seconds)) ElementTree.SubElement(self._header_element, "n_files", text=str(self.project_summary.total_file_count)) ElementTree.SubElement(self._header_element, "n_lines", text=str(self.project_summary.total_line_count)) ElementTree.SubElement(self._header_element, "files_per_second", text=f"{self.files_per_second:f}") ElementTree.SubElement(self._header_element, "lines_per_second", text=f"{self.lines_per_second:f}") ElementTree.SubElement(self._header_element, "report_file", text=self.target_name) # Add totals to . file_attributes = { "blank": str(self.project_summary.total_empty_count), "code": str(self.project_summary.total_code_count + self.project_summary.total_string_count), "comment": str(self.project_summary.total_documentation_count), } ElementTree.SubElement(self._files_element, "total", attrib=file_attributes) # Write the whole XML file. if self._target_stream.encoding is not None: # Write XML declaration only for files but skip it for io.StringIO. self._target_stream.write(f'') xml_root = ElementTree.ElementTree(self._results_element) xml_root.write(self._target_stream, encoding="unicode", xml_declaration=False) class SummaryWriter(BaseWriter): """ Writer to summarize the analysis per language in a format that can easily be read by humans. 
""" _COLUMNS_WITH_JUSTIFY = ( ("Language", "left"), ("Files", "right"), ("%", "right"), ("Code", "right"), ("%", "right"), ("Comment", "right"), ("%", "right"), ) def close(self): super().close() table = Table() for column, justify in self._COLUMNS_WITH_JUSTIFY: table.add_column(column, justify=justify, overflow="fold") language_summaries = sorted(self.project_summary.language_to_language_summary_map.values(), reverse=True) for index, language_summary in enumerate(language_summaries, start=1): table.add_row( language_summary.language, str(language_summary.file_count), formatted_percentage(language_summary.file_percentage), str(language_summary.code_count), formatted_percentage(language_summary.code_percentage), str(language_summary.documentation_count), formatted_percentage(language_summary.documentation_percentage), end_section=(index == len(language_summaries)), ) table.add_row( "Sum", str(self.project_summary.total_file_count), formatted_percentage(100.0), str(self.project_summary.total_code_count), formatted_percentage(self.project_summary.total_code_percentage), str(self.project_summary.total_documentation_count), formatted_percentage(self.project_summary.total_documentation_percentage), ) Console(file=self._target_stream, soft_wrap=True).print(table) class JsonWriter(BaseWriter): """ Writer JSON output, ideal for further automatic processing. 
""" def __init__(self, target_stream): super().__init__(target_stream) self.source_analyses = [] def add(self, source_analysis: SourceAnalysis): super().add(source_analysis) self.source_analyses.append( { "codeCount": source_analysis.code_count, "documentationCount": source_analysis.documentation_count, "emptyCount": source_analysis.empty_count, "group": source_analysis.group, "isCountable": source_analysis.is_countable, "language": source_analysis.language, "lineCount": source_analysis.line_count, "path": source_analysis.path, "state": source_analysis.state.name, "stateInfo": source_analysis.state_info, "sourceCount": source_analysis.source_count, } ) def close(self): # NOTE: JSON names use camel case to follow JSLint's guidelines, see . super().close() json_map = { "formatVersion": JSON_FORMAT_VERSION, "pygountVersion": pygount.__version__, "files": self.source_analyses, "languages": [ { "documentationCount": language_summary.documentation_count, "documentationPercentage": language_summary.documentation_percentage, "codeCount": language_summary.code_count, "codePercentage": language_summary.code_percentage, "emptyCount": language_summary.empty_count, "emptyPercentage": language_summary.empty_percentage, "fileCount": language_summary.file_count, "filePercentage": language_summary.file_percentage, "isPseudoLanguage": language_summary.is_pseudo_language, "language": language_summary.language, "sourceCount": language_summary.source_count, "sourcePercentage": language_summary.source_percentage, "stringCount": language_summary.string_count, "stringPercentage": language_summary.string_percentage, } for language_summary in self.project_summary.language_to_language_summary_map.values() ], "runtime": { "durationInSeconds": self.duration_in_seconds, "filesPerSecond": self.files_per_second, "finishedAt": self.finished_at.isoformat(), "linesPerSecond": self.lines_per_second, "startedAt": self.started_at.isoformat(), }, "summary": { "totalCodeCount": 
def digit_width(line_count: int) -> int:
    """
    Return the number of decimal digits needed to print ``line_count``.

    The width is taken from the decimal string representation instead of a
    floating point logarithm, so the result stays exact for arbitrarily large
    counts and zero needs no special case.

    :param line_count: a non-negative count
    :return: the number of digits, which is at least 1
    """
    assert line_count >= 0
    return len(str(line_count))


def formatted_percentage(percentage: float) -> str:
    """
    Return ``percentage`` formatted with exactly one digit after the decimal
    point, for example ``"12.3"``.

    :param percentage: a value between 0.0 and 100.0 (both inclusive)
    """
    assert percentage >= 0.0
    assert percentage <= 100.0
    return f"{percentage:.01f}"
class SaxParserDone(Exception):
    """
    Pseudo error to indicate that the Sax parser is done.
    """


class XmlDialectHandler(xml.sax.ContentHandler, xml.sax.handler.DTDHandler):
    """
    SAX handler that derives the XML dialect from element paths and ``xmlns``
    attributes.

    Once a dialect is recognized, or ``max_element_count`` elements have been
    started without a match, the handler raises :py:exc:`SaxParserDone` to
    stop parsing early. The detected dialect (or ``None``) is available in
    the attribute ``dialect``.
    """

    def __init__(self, max_element_count=100):
        super().__init__()
        self.dialect = None
        # Slash separated path of the currently open elements, for example "/book/title".
        self._path = ""
        self._element_count = 0
        self._max_element_count = max_element_count

    def _set_dialect_and_stop_parsing(self, dialect):
        """Remember ``dialect`` and abort parsing by raising ``SaxParserDone``."""
        self.dialect = dialect
        raise SaxParserDone(f"language detected: {dialect}")

    def startElement(self, name, attrs):
        """
        Track the opened element and check whether it reveals the dialect.

        :raises SaxParserDone: if a dialect has been detected or the element
          limit has been reached
        """
        self._element_count += 1
        # Use ">=" instead of "==" so a limit of 0 or less cannot be skipped,
        # which would silently disable the cap.
        if self._element_count >= self._max_element_count:
            raise SaxParserDone(f"no language found after parsing {self._element_count} elements")
        self._path += "/" + name
        xmlns = attrs.get("xmlns", "")
        if (self._path == "/project") and ("name" in attrs):
            self._set_dialect_and_stop_parsing("Ant")
        elif (self._path in ("/book/title", "/chapter/title")) or (xmlns == "http://docbook.org/ns/docbook"):
            self._set_dialect_and_stop_parsing("DocBook XML")
        elif xmlns == "http://xmlns.jcp.org/xml/ns/javaee":
            self._set_dialect_and_stop_parsing("JavaEE XML")
        elif xmlns.startswith("http://maven.apache.org/POM"):
            self._set_dialect_and_stop_parsing("Maven")
        elif xmlns.startswith("http://www.netbeans.org/ns/project/"):
            self._set_dialect_and_stop_parsing("NetBeans Project")

    def endElement(self, name):
        """Remove the closed element, including its leading slash, from the current path."""
        self._path = self._path[: -len(name) - 1]
#10: Remove hack to obtain DTD using a regex instead of a DTDHandler. xml_code_witout_header = without_xml_header(xml_code) dtd_match = _DTD_REGEX.match(xml_code_witout_header) if dtd_match is not None: public_id = dtd_match.group("public_id") for public_id_regex, dialect in _REGEXES_AND_DIALECTS: if public_id_regex.match(public_id): return dialect xml_dialect_handler = XmlDialectHandler() parser = xml.sax.make_parser() parser.setContentHandler(xml_dialect_handler) parser.setFeature(xml.sax.handler.feature_external_ges, False) parser.setFeature(xml.sax.handler.feature_external_pes, False) parser.setFeature(xml.sax.handler.feature_validation, False) try: parser.feed(xml_code) # NOTE: We can only call close() when the parser has finished, # otherwise close() raises a SAXException('parser finished'). parser.close() except SaxParserDone: # Language has been determined or the parser has given up. pass except (ValueError, xml.sax.SAXException) as error: # NOTE: ValueError is raised on unknown url type. error_message = str(error) message_without_path_match = _SAX_MESSAGE_WITHOUT_PATH_PATTERN.match(error_message) if message_without_path_match is not None: # HACK: Replace uninformative sax path like '' with actual XML path. 
def without_xml_header(xml_code: str) -> str:
    """
    Return ``xml_code`` without leading white space and without a leading XML
    declaration (``<?xml ... ?>``).

    If the code starts with ``<?xml`` but the declaration is never closed by
    ``?>``, only the leading white space is removed.
    """
    result = xml_code.lstrip(WHITE_SPACE_CHARACTERS)
    if result.startswith("<?xml"):
        end_if_xml_declaration = result.find("?>")
        if end_if_xml_declaration != -1:
            # Skip the 2 characters of the closing "?>" and any white space after it.
            result = result[end_if_xml_declaration + 2 :].lstrip(WHITE_SPACE_CHARACTERS)
    return result
dev = [ "coveralls>=4,<5", "coverage>=7,<8", "hatchling>=1.27.0", "mkdocs>=1.6,<2", "mkdocs-material>=9", "pytest>=9.0.3", "pytest-cov>=7,<8", "pre-commit>=4,<5", "ruff>=0.15", ] [tool.uv] default-groups = [ "dev", ] [tool.hatch.build.targets.sdist] exclude = [".idea", ".github", ".readthedocs.yaml"] [tool.hatch.build.targets.wheel] packages = ["pygount"] [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [tool.ruff] exclude = [ ".eggs", ".git", ".pytest_cache", ".pytype", ".ruff_cache", ".vscode", "__pypackages__", "_build", "build", "dist", "htmlcov", ] line-length = 120 target-version = "py39" [tool.ruff.lint] ignore = [ # Missing trailing comma → May cause conflicts when used with the formatter. "COM812", # Too many branches "PLR0912", # Too many arguments in function definition "PLR0913", # Too many statements "PLR0915", # Magic value used in comparison "PLR2004", # TODO#89 Enable checks for usage of pathlib. "PTH100", "PTH103", "PTH107", "PTH109", "PTH110", "PTH112", "PTH114", "PTH118", "PTH119", "PTH120", "PTH122", "PTH123", "PTH202", "PTH207", "PTH208", # Unnecessary assign → We regularly use `result = ...; return result` to examine the result in the debugger. "RET504", # TODO#506 Enable RUF012 check for mutable class attributes.
# Mutable class attributes should be annotated with `typing.ClassVar` "RUF012", # Avoid specifying long messages outside the exception class "TRY003", # Abstract `raise` to an inner function "TRY301", ] select = [ # flake8-builtins "A", # flake8-bugbear "B", # flake8-commas "COM", # flake8-comprehensions "C4", # flake8-django "DJ", # flake8-datetimez "DTZ", # pycodestyle "E", # Pyflakes "F", # isort "I", # flake8-no-pep420 "INP", # flake8-gettext "INT", # flake8-logging "LOG", # perflint "PERF", # pygrep-hooks "PGH", # flake8-pie "PIE", # pylint "PL", # flake8-use-pathlib "PTH", # refactor "R", # flake8-raise "RSE", # flake8-return "RET", # ruff specific rules "RUF", # flake8-self "SLF", # flake8-simplify "SIM", # tryceratops "TRY", # flake8-debugger "T10", # flake8-print "T20", # pyupgrade "UP", ] [tool.ruff.lint.isort] known-first-party = ["pygount", "scripts", "tests"] ================================================ FILE: scripts/build_documentation.sh ================================================ #!/bin/sh # Build documentation using MkDocs set -e echo "📖 Building documentation" mkdocs build echo "✅ Successfully built documentation in site/index.html" ================================================ FILE: scripts/build_movie.sh ================================================ #!/bin/sh # Build a gource movie about the development. # # For this to work, use macOS and install the following: # # brew install gource ffmpeg # # See also: set -ex mkdir -p build gource --auto-skip-seconds 1 --file-idle-time 0 --hide dirnames,filenames,mouse --seconds-per-day 1 --title Pygount -1920x1080 --output-ppm-stream - .
| ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libx264 -preset ultrafast -pix_fmt yuv420p -crf 1 -threads 0 -bf 0 /tmp/pygount_movie.mp4 ================================================ FILE: scripts/test_coverage.sh ================================================ #!/bin/sh set -e uv run pytest --cov-reset --cov=pygount --cov-branch --cov-report html echo "To view results run: firefox htmlcov/index.html &" ================================================ FILE: scripts/update_dependencies.sh ================================================ #!/bin/sh # Update requirements files and pre-commit hooks to current versions. set -e echo "🧱 Updating project" uv sync uv lock --upgrade echo "🛠️ Updating pre-commit" uv run pre-commit autoupdate echo "🎉 Successfully updated dependencies" ================================================ FILE: tests/__init__.py ================================================ # Deliberately left empty. ================================================ FILE: tests/_common.py ================================================ """ Common constants and functions used by multiple tests. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. 
class TempFolderTest(unittest.TestCase):
    """
    Base class for tests that need scratch files.

    ``setUp()`` creates the folder ``tests/.temp`` below the project folder
    and ``tearDown()`` removes it again, including everything in it.
    """

    def setUp(self):
        self.tests_temp_folder = os.path.join(PYGOUNT_PROJECT_FOLDER, "tests", ".temp")
        os.makedirs(self.tests_temp_folder, exist_ok=True)

    def create_temp_file(
        self, relative_target_path, content: Union[str, bytes, Sequence[str]], encoding="utf-8", do_create_folder=False
    ):
        """
        Create a file in the temp folder and return its path.

        :param relative_target_path: path of the file relative to the temp folder
        :param content: either the text to write as is, raw bytes to write as
          is, or a sequence of lines, each of which gets a trailing newline
        :param encoding: encoding used for text content; ignored for bytes
        :param do_create_folder: if true, create missing parent folders first
        """
        result = os.path.join(self.tests_temp_folder, relative_target_path)
        if do_create_folder:
            os.makedirs(os.path.dirname(result), exist_ok=True)
        if isinstance(content, bytes):
            # Bytes cannot be written to a text stream, so use binary mode for them.
            with open(result, "wb") as target_file:
                target_file.write(content)
        else:
            with open(result, "w", encoding=encoding) as target_file:
                if isinstance(content, str):
                    target_file.write(content)
                else:
                    for line in content:
                        target_file.write(line)
                        target_file.write("\n")
        return result

    def create_temp_binary_file(self, relative_target_path, content: bytes):
        """Create a file containing the bytes ``content`` in the temp folder and return its path."""
        result = os.path.join(self.tests_temp_folder, relative_target_path)
        with open(result, "wb") as target_file:
            target_file.write(content)
        return result

    def tearDown(self):
        shutil.rmtree(self.tests_temp_folder)
class SourceScannerTest(TempFolderTest):
    """Tests for finding source paths with ``analysis.SourceScanner``."""

    def setUp(self):
        super().setUp()
        self._tests_folder = os.path.dirname(__file__)

    def test_can_find_no_files(self):
        found_paths = list(analysis.SourceScanner([]).source_paths())
        assert found_paths == []

    def test_can_find_any_files(self):
        found_paths = list(analysis.SourceScanner([PYGOUNT_SOURCE_FOLDER]).source_paths())
        assert found_paths != []

    def test_can_find_python_files(self):
        found_paths = list(analysis.SourceScanner([PYGOUNT_SOURCE_FOLDER], "py").source_paths())
        assert found_paths != []
        for found_path in found_paths:
            _, suffix = os.path.splitext(found_path.source_path)
            assert suffix == ".py"

    def test_can_skip_dot_folder(self):
        project_name = "project"
        included_name = "include.py"

        # One source file in a regular folder and one in a folder starting with a dot.
        self.create_temp_file(
            os.path.join(project_name, "include", included_name), "include = 1", do_create_folder=True
        )
        self.create_temp_file(os.path.join(project_name, ".skip", "skip.py"), "skip = 2", do_create_folder=True)

        scanner = analysis.SourceScanner([os.path.join(self.tests_temp_folder, project_name)])
        found_names = [os.path.basename(found_path.source_path) for found_path in scanner.source_paths()]
        assert found_names == [included_name]

    def test_succeeds_on_not_git_extension(self):
        url_lists_without_git_suffix = (
            ["https://github.com/roskakori/pygount/"],
            ["git@github.com:roskakori/pygount"],
        )
        for url_list in url_lists_without_git_suffix:
            with analysis.SourceScanner(url_list) as scanner:
                _ = list(scanner.source_paths())

    def test_fails_on_non_git_urls(self):
        url_lists_without_repository = (
            ["https://no/git/url"],
            ["https://google.com/nogit"],
        )
        for url_list in url_lists_without_repository:
            with (
                analysis.SourceScanner(url_list) as scanner,
                pytest.raises(pygount.Error, match="URL to git repository"),
            ):
                _ = list(scanner.source_paths())

    def test_can_find_python_files_in_dot(self):
        found_paths = list(analysis.SourceScanner(["."], "py").source_paths())
        assert found_paths != []
        for found_path in found_paths:
            _, suffix = os.path.splitext(found_path.source_path)
            assert suffix == ".py"

    def test_can_find_files_from_mixed_cloned_git_remote_url_and_local(self):
        remote_url = "https://github.com/roskakori/pygount.git"
        with analysis.SourceScanner([remote_url, PYGOUNT_SOURCE_FOLDER]) as scanner:
            found_paths = list(scanner.source_paths())
            assert found_paths != []
            first_path, last_path = found_paths[0], found_paths[-1]
            assert first_path.source_path != last_path.source_path
            assert last_path.tmp_dir is not None
def test_can_detect_all_lines_as_documentation_with_markup_enabled():
    # Feed C source code to the "markdown" lexer, for which the helper
    # _line_parts_with_detected_markup() enables markup mode.
    source_lines = [
        "/*",
        " * The classic hello world for C99.",
        " */",
        "#include ",
        "int main(void) {",
        ' puts("Hello, World!");',
        "}",
    ]
    actual_line_parts = _line_parts_with_detected_markup("markdown", source_lines)
    # NOTE(review): This iterates over the elements of the *last* line's set,
    #   which the next assertion requires to be empty, so this check can
    #   never fail. Presumably it was meant to check that all preceding lines
    #   are documentation - confirm the intended expectation.
    assert all(line_part == "d" for line_part in actual_line_parts[-1])
    # The last line ("}") must not have any marks.
    assert actual_line_parts[-1:] == [set()]
class _NonSeekableEmptyBytesIO(BytesIO):
    """
    Dummy object that mimics a non-seekable file handle by always reporting
    that it cannot seek.
    """

    def seekable(self) -> bool:
        return False
test_can_analyze_oracle_sql(self): test_oracle_sql_path = self.create_temp_file( "test_can_analyze_oracle_sql.pls", ["-- Oracle SQL example using an obscure suffix.", "select *", "from some_table;"], ) source_analysis = analysis.SourceAnalysis.from_file(test_oracle_sql_path, "test", encoding="utf-8") assert source_analysis.language.lower().endswith("sql") assert source_analysis.code_count == 2 assert source_analysis.documentation_count == 1 def test_can_analyze_webfocus(self): test_fex_path = self.create_temp_file( "some.fex", ["-* comment", "-type some text", "table file some print * end;"] ) source_analysis = analysis.SourceAnalysis.from_file(test_fex_path, "test", encoding="utf-8") assert source_analysis.language == "WebFOCUS" assert source_analysis.code_count == 2 assert source_analysis.documentation_count == 1 def test_can_analyze_xml_dialect(self): build_xml_path = self.create_temp_file("build.xml", EXAMPLE_ANT_CODE) source_analysis = analysis.SourceAnalysis.from_file(build_xml_path, "test") assert source_analysis.state == analysis.SourceState.analyzed assert source_analysis.language == "Ant" def test_can_analyze_unknown_language(self): unknown_language_path = self.create_temp_file("some.unknown_language", ["some", "lines", "of", "text"]) source_analysis = analysis.SourceAnalysis.from_file(unknown_language_path, "test") assert source_analysis.state == analysis.SourceState.unknown def test_can_detect_binary_source_code(self): binary_path = self.create_temp_binary_file("some_django.mo", b"hello\0world!") source_analysis = analysis.SourceAnalysis.from_file(binary_path, "test", encoding="utf-8") assert source_analysis.state == analysis.SourceState.binary assert source_analysis.code_count == 0 def test_can_analyze_stringio(self): test_path = "imaginary/path/to/file.py" test_code = "from random import randint\n\n# Print a random dice roll\nprint(randint(6))\n" source_analysis = analysis.SourceAnalysis.from_file(test_path, "test", file_handle=StringIO(test_code)) 
assert source_analysis.state == analysis.SourceState.analyzed assert source_analysis.language == "Python" assert source_analysis.code_count == 2 def test_can_analyze_bytesio(self): test_path = "imaginary/path/to/file.py" test_code = b"from random import randint\n\n# Print a random dice roll\nprint(randint(6))\n" source_analysis = analysis.SourceAnalysis.from_file(test_path, "test", file_handle=BytesIO(test_code)) assert source_analysis.state == analysis.SourceState.analyzed assert source_analysis.language == "Python" assert source_analysis.code_count == 2 def test_can_analyze_embedded_language(self): test_html_django_path = self.create_temp_file( "some.html", ["", "{% load i18n %}", ''], ) source_analysis = analysis.SourceAnalysis.from_file(test_html_django_path, "test", encoding="utf-8") assert source_analysis.language.lower() == "html+django/jinja" assert source_analysis.code_count == 3 def test_can_analyze_generated_name(self): test_uv_lock_path = self.create_temp_file("uv.lock", []) source_analysis = analysis.SourceAnalysis.from_file( test_uv_lock_path, "test", generated_name_regexes=pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT), ) assert source_analysis.state == analysis.SourceState.generated def test_can_merge_embedded_language(self): test_html_django_path = self.create_temp_file( "some.html", ["", "{% load i18n %}", ''], ) source_analysis = analysis.SourceAnalysis.from_file( test_html_django_path, "test", encoding="utf-8", merge_embedded_language=True ) assert source_analysis.language.lower() == "html" assert source_analysis.code_count == 3 def test_can_analyze_unknown_magic_comment_encoding(self): test_python_path = self.create_temp_file("some.py", ["# -*- coding: no_such_encoding -*-", "print('hello')"]) source_analysis = analysis.SourceAnalysis.from_file(test_python_path, "test") assert source_analysis.language.lower() == "__error__" assert source_analysis.state_info == "unknown encoding: no_such_encoding" def 
test_fails_on_non_seekable_file_handle_with_encoding_automatic(self): file_handle = _NonSeekableEmptyBytesIO() with pytest.raises(PygountError, match=r".*file handle must be seekable.*"): analysis.SourceAnalysis.from_file("README.md", "test", file_handle=file_handle, encoding="automatic") def test_fails_on_non_seekable_file_handle_with_encoding_chardet(self): file_handle = _NonSeekableEmptyBytesIO() with pytest.raises(PygountError, match=r".*file handle must be seekable.*"): analysis.SourceAnalysis.from_file("README.md", "test", file_handle=file_handle, encoding="chardet") @pytest.mark.parametrize( "suffix, code_count, doc_count, expected_language_lower", [ ("rst", 0, 3, "restructuredtext"), ("md", 0, 3, "markdown"), ("txt", 0, 3, "text only"), ("4", 0, 3, "groff"), ], ) def test_can_analyze_markup_as_plain_documentation( suffix, code_count: int, doc_count: int, expected_language_lower: str ): source_lines = ["", "{% load i18n %}", "", " ", ''] expected_empty_count = 2 expected_documentation_count = len(source_lines) - expected_empty_count with temp_source_file(suffix, source_lines) as test_file: source_analysis = analysis.SourceAnalysis.from_file(test_file.name, "test", encoding="utf-8") assert source_analysis.language.lower() == expected_language_lower assert source_analysis.code_count == 0 assert source_analysis.documentation_count == expected_documentation_count assert source_analysis.empty_count == expected_empty_count def test_can_repr_source_analysis_from_file(): source_analysis = analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed) expected_source_analysis_repr = ( "SourceAnalysis(path='some.py', language='Python', group='some', " "state=analyzed, code_count=1, documentation_count=2, empty_count=3, string_count=4)" ) assert repr(source_analysis) == expected_source_analysis_repr assert repr(source_analysis) == str(source_analysis) def test_can_repr_empty_source_analysis_from_file(): source_analysis = 
def test_can_guess_lexer_for_cmakelists():
    """Check that ``CMakeLists.txt`` yields the CMake lexer despite its ``.txt`` suffix."""
    cmake_lines = (
        "cmake_minimum_required(VERSION 2.6)",
        "project(example)",
        "set(CMAKE_CXX_STANDARD 14)",
        "set(SOURCE_FILES example.cpp)",
        "add_executable(example ${SOURCE_FILES})",
    )
    cmake_lexer = guess_lexer("CMakeLists.txt", "\n".join(cmake_lines))
    assert cmake_lexer is not None
    assert cmake_lexer.name == "CMake"
def test_can_analyze_project_markdown_files():
    """
    Check that the markdown files in the project folder (for example
    ``README.md``) are analyzed as documentation.
    """
    # NOTE: PYGOUNT_PROJECT_FOLDER already is the project root, so the
    #   markdown files have to be collected directly from it and not from its
    #   parent folder.
    markdown_paths = glob.glob(os.path.join(PYGOUNT_PROJECT_FOLDER, "*.md"))
    # Ensure the loop below actually checks something instead of passing vacuously.
    assert markdown_paths != [], f"project folder must contain markdown files: {PYGOUNT_PROJECT_FOLDER}"
    for text_path in markdown_paths:
        source_analysis = analysis.SourceAnalysis.from_file(text_path, "test")
        assert source_analysis.state == analysis.SourceState.analyzed
        assert source_analysis.documentation_count > 0
        assert source_analysis.empty_count > 0
test_has_no_duplicate_in_pygount_source(): duplicate_pool = analysis.DuplicatePool() source_paths = [] for sub_folder_name in ("pygount", "tests"): source_paths.extend( [ os.path.join(PYGOUNT_PROJECT_FOLDER, sub_folder_name, source_name) for source_name in os.listdir(os.path.join(PYGOUNT_PROJECT_FOLDER, sub_folder_name)) ] ) for source_path in source_paths: if source_path.endswith(".py"): duplicate_path = duplicate_pool.duplicate_path(source_path) assert duplicate_path is None, f"{source_path} must not be duplicate of {duplicate_path}" def test_can_compute_base_language(): assert base_language("JavaScript") == "JavaScript" assert base_language("JavaScript+Lasso") == "JavaScript" assert base_language("JavaScript+") == "JavaScript+" # no actual language assert base_language("C++") == "C++" assert base_language("++C") == "++C" # no actual language assert base_language("") == "" # no actual language, but should not crash either class DuplicatePoolTest(TempFolderTest): def test_can_distinguish_different_files(self): some_path = self.create_temp_file(__name__ + "_some", "some") other_path = self.create_temp_file(__name__ + "_other", "other") duplicate_pool = analysis.DuplicatePool() assert duplicate_pool.duplicate_path(some_path) is None assert duplicate_pool.duplicate_path(other_path) is None def test_can_detect_duplicate(self): same_content = "same" original_path = self.create_temp_file("original", same_content) duplicate_path = self.create_temp_file("duplicate", same_content) duplicate_pool = analysis.DuplicatePool() assert duplicate_pool.duplicate_path(original_path) is None assert original_path == duplicate_pool.duplicate_path(duplicate_path) @pytest.mark.parametrize( "suffix, expected_result", [("md", True), ("MD", True), ("mD", True), ("rst", True), ("py", False), ("4", True), ("c", False)], ) def test_can_detect_markup_file(suffix, expected_result): source_path = f"some_file_name.{suffix}" assert is_markup_file(source_path) == expected_result 
================================================ FILE: tests/test_command.py ================================================ """ Tests for pygount command line interface. """ import contextlib # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. import json import os import tempfile from xml.etree import ElementTree import pytest import pygount from pygount import command from pygount.command import VALID_OUTPUT_FORMATS, Command from pygount.common import OptionError from pygount.write import JSON_FORMAT_VERSION from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest class CommandTest(TempFolderTest): def test_fails_on_unknown_output_format(self): unknown_output_format = "no_such_output_format" command = Command() with pytest.raises(OptionError, match=unknown_output_format): command.set_output_format(unknown_output_format) def test_can_set_encoding(self): command = Command() command.set_encodings("automatic;cp1252") assert command.default_encoding == "automatic" assert command.fallback_encoding == "cp1252" def test_can_execute_on_own_code(self): output_path = os.path.join(self.tests_temp_folder, "test_can_execute_on_own_code.txt") with contextlib.suppress(FileNotFoundError): # Ignore missing file as it is going to be recreated. 
os.remove(output_path) command = Command() command.set_output(output_path) command.set_output_format("cloc-xml") command.set_source_patterns(PYGOUNT_SOURCE_FOLDER) command.set_suffixes("py") command.execute() cloc_xml_root = ElementTree.parse(output_path) file_elements = cloc_xml_root.findall("files/file") assert file_elements is not None assert len(file_elements) >= 1 def test_fails_on_broken_regex(self): command = Command() with pytest.raises(OptionError, match=r"^option --generated: cannot parse pattern for regular repression.*"): command.set_generated_regexps("[regex](", "option --generated") def test_can_use_chardet_for_encoding(self): command = Command() command.set_encodings("chardet") command.set_source_patterns(PYGOUNT_SOURCE_FOLDER) command.execute() class PygountCommandTest(TempFolderTest): def test_can_show_help(self): with pytest.raises(SystemExit) as error_info: command.pygount_command(["--help"]) assert error_info.value.code == 0 def test_can_show_version(self): with pytest.raises(SystemExit) as error_info: command.pygount_command(["--version"]) assert error_info.value.code == 0 def test_fails_on_unknown_encoding(self): with pytest.raises(SystemExit) as error_info: command.pygount_command(["--encoding", "no_such_encoding", tempfile.gettempdir()]) assert error_info.value.code == 2 def test_fails_on_unknown_format(self): with pytest.raises(SystemExit) as error_info: command.pygount_command(["--format", "no_such_encoding", tempfile.gettempdir()]) assert error_info.value.code == 2 def test_fails_on_broken_regex_pattern(self): exit_code = command.pygount_command(["--generated", "[regex](", tempfile.gettempdir()]) assert exit_code == 1 def test_can_analyze_pygount_setup_py(self): pygount_setup_py_path = os.path.join(PYGOUNT_PROJECT_FOLDER, "setup.py") exit_code = command.pygount_command(["--verbose", pygount_setup_py_path]) assert exit_code == 0 def test_can_analyze_pygount_source_code(self): exit_code = command.pygount_command(["--verbose", 
PYGOUNT_SOURCE_FOLDER]) assert exit_code == 0 def test_can_detect_generated_code(self): generated_code_path = os.path.join(self.tests_temp_folder, "generated.py") with open(generated_code_path, "w", encoding="utf-8") as generated_code_file: generated_code_file.write( "# Generated with pygount.test_command.PygountCommandTest.test_can_detect_generated_code.\n" ) generated_code_file.write("# Do not edit!\n") generated_code_file.write("print('hello World')\n") cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml") exit_code = command.pygount_command( ["--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, generated_code_path] ) assert exit_code == 0 assert os.path.exists(cloc_xml_path) cloc_xml_root = ElementTree.parse(cloc_xml_path) file_elements = cloc_xml_root.findall("files/file[@language='__generated__']") assert file_elements is not None assert len(file_elements) >= 1 def test_can_detect_generated_code_with_own_pattern(self): generiert_py_path = os.path.join(self.tests_temp_folder, "generiert.py") with open(generiert_py_path, "w", encoding="utf-8") as generiert_py_file: generiert_py_file.write( "# Generiert mit pygount.test_command.PygountCommandTest." 
"test_can_detect_generated_code_with_own_pattern()\n" ) generiert_py_file.write("print('hello World')\n") cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml") exit_code = command.pygount_command( [ "--verbose", "--format=cloc-xml", "--generated=[regex](?i).*generiert", "--out", cloc_xml_path, generiert_py_path, ] ) assert exit_code == 0 assert os.path.exists(cloc_xml_path) cloc_xml_root = ElementTree.parse(cloc_xml_path) file_elements = cloc_xml_root.findall("files/file[@language='__generated__']") assert file_elements is not None assert len(file_elements) >= 1 def test_can_analyze_pygount_source_code_as_cloc_xml(self): cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml") exit_code = command.pygount_command( ["--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, PYGOUNT_SOURCE_FOLDER] ) assert exit_code == 0 assert os.path.exists(cloc_xml_path) cloc_xml_root = ElementTree.parse(cloc_xml_path) file_elements = cloc_xml_root.findall("files/file") assert file_elements is not None assert len(file_elements) >= 1 def test_can_analyze_pygount_source_code_as_json(self): pygount_json_path = os.path.join(self.tests_temp_folder, "pygount.json") exit_code = command.pygount_command( ["--verbose", "--format", "json", "--out", pygount_json_path, PYGOUNT_SOURCE_FOLDER] ) assert exit_code == 0 assert os.path.exists(pygount_json_path) with open(pygount_json_path, encoding="utf-8") as pygount_json_file: json_map = json.load(pygount_json_file) assert json_map.get("pygountVersion") == pygount.__version__ assert json_map.get("formatVersion") == JSON_FORMAT_VERSION assert "files" in json_map assert "languages" in json_map assert "runtime" in json_map assert "summary" in json_map def test_can_detect_duplicates(self): source_code = "# Duplicate source\nprint('duplicate code')\n" original_path = os.path.join(self.tests_temp_folder, "original.py") with open(original_path, "w") as original_file: original_file.write(source_code) duplicate_path = 
os.path.join(self.tests_temp_folder, "duplicate.py") with open(duplicate_path, "w") as duplicate_file: duplicate_file.write(source_code) cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml") exit_code = command.pygount_command( ["--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, original_path, duplicate_path] ) assert exit_code == 0 assert os.path.exists(cloc_xml_path) cloc_xml_root = ElementTree.parse(cloc_xml_path) file_elements = cloc_xml_root.findall("files/file[@language='__duplicate__']") assert file_elements is not None assert len(file_elements) == 1 def test_can_accept_duplicates(self): source_code = "# Duplicate source\nprint('duplicate code')\n" original_path = os.path.join(self.tests_temp_folder, "original.py") with open(original_path, "w") as original_file: original_file.write(source_code) duplicate_path = os.path.join(self.tests_temp_folder, "duplicate.py") with open(duplicate_path, "w") as duplicate_file: duplicate_file.write(source_code) cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml") exit_code = command.pygount_command( ["--duplicates", "--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, original_path, duplicate_path] ) assert exit_code == 0 assert os.path.exists(cloc_xml_path) cloc_xml_root = ElementTree.parse(cloc_xml_path) file_elements = cloc_xml_root.findall("files/file[@language='__duplicate__']") assert file_elements is not None assert len(file_elements) == 0 def test_can_write_all_output_formats(self): for output_format in VALID_OUTPUT_FORMATS: exit_code = command.pygount_command(["--format", output_format, PYGOUNT_SOURCE_FOLDER]) self.assertEqual(exit_code, 0) def test_can_merge_embedded_languages(self): test_html_django_path = self.create_temp_file( "some.html", ["", "{% load i18n %}", ''], ) cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml") exit_code = command.pygount_command( ["--merge-embedded-languages", "--format", "cloc-xml", "--out", cloc_xml_path, test_html_django_path] 
) assert exit_code == 0 assert os.path.exists(cloc_xml_path) cloc_xml_root = ElementTree.parse(cloc_xml_path) file_elements = cloc_xml_root.findall("files/file[@language='HTML']") assert file_elements is not None assert len(file_elements) == 1 ================================================ FILE: tests/test_common.py ================================================ """ Tests for :py:mod:`pygount.common` module. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. import re import pytest import pygount.common from pygount.common import matching_regex def test_can_build_str(): error_without_source = pygount.common.OptionError("test") assert str(error_without_source) == "test" error_with_source = pygount.common.OptionError("test", "some_file.txt") assert str(error_with_source) == "some_file.txt: test" def test_can_match_from_regex(): regex = pygount.common.regex_from(re.compile(r"a\d+b")) assert regex.match("a123b") is not None assert regex.match("ab") is None def test_can_match_from_regex_pattern(): regex = pygount.common.regex_from(r"a\d+b") assert regex.match("a123b") is not None assert regex.match("ab") is None def test_can_match_from_shell_pattern(): regex = pygount.common.regex_from("*a[0-9]?*b*", True) assert regex.match("a123b") is not None assert regex.match("ab") is None def test_can_match_single_regex_from_shell_pattern(): regexes = pygount.common.regexes_from("*.py") assert len(regexes) == 1 assert regexes[0].match("some.py") is not None assert regexes[0].match("some.bat") is None def test_can_match_single_regex(): regexes = pygount.common.regexes_from(pygount.common.REGEX_PATTERN_PREFIX + r"^.+\.py$") assert len(regexes) == 1 assert regexes[0].match("some.py") is not None assert regexes[0].match("some.bat") is None def test_can_match_regex_from_multiple_regex_patterns(): regexes = pygount.common.regexes_from(pygount.common.REGEX_PATTERN_PREFIX + r"x, abc, ^.+\.py$") assert len(regexes) == 3 assert 
regexes[0].match("some.py") is None assert regexes[1].match("some.py") is None assert regexes[2].match("some.py") is not None def test_can_match_regex_from_multiple_default_shell_patterns(): regexes = pygount.common.regexes_from( pygount.common.REGEX_PATTERN_PREFIX + pygount.common.ADDITIONAL_PATTERN + r"x", "abc, *.py" ) assert len(regexes) == 3 assert regexes[0].match("some.py") is None assert regexes[1].match("some.py") is None assert regexes[2].match("some.py") is not None assert regexes[0].match("x") is not None def test_can_represent_text_as_list(): assert pygount.common.as_list("") == [] assert pygount.common.as_list("a") == ["a"] assert pygount.common.as_list("abc,d, e") == ["abc", "d", "e"] assert pygount.common.as_list(",,,,") == [] def test_can_represent_iterable_as_list(): assert pygount.common.as_list([]) == [] assert pygount.common.as_list(["a", 1, None]) == ["a", 1, None] assert pygount.common.as_list(()) == [] assert pygount.common.as_list(range(3)) == [0, 1, 2] @pytest.mark.parametrize( "text,patterns,expected_regex_index", [ ("some", [], -1), ("some", ["some"], 0), ("some", ["other"], -1), ("some", ["other", "some"], 1), ("some", ["s.+"], 0), ("some", [".*T.*"], -1), ], ) def test_can_compute_matching_regex(text: str, patterns: list[str], expected_regex_index: int): regexes = [re.compile(pattern) for pattern in patterns] regex = matching_regex(text, regexes) regex_index = regexes.index(regex) if regex is not None else -1 assert regex_index == expected_regex_index def test_can_convert_empty_text_to_lines(): assert list(pygount.common.lines("")) == [] def test_can_convert_single_letter_to_lines(): assert list(pygount.common.lines("a")) == ["a"] def test_can_convert_single_letter_with_newline_to_lines(): assert list(pygount.common.lines("a\n")) == ["a"] def test_can_convert_multiple_lines(): assert list(pygount.common.lines("a\nbc")) == ["a", "bc"] assert list(pygount.common.lines("a\nbc\n")) == ["a", "bc"] def test_can_convert_empty_lines(): assert 
list(pygount.common.lines("\n\n\n")) == ["", "", ""] def test_can_compute_mapped_repr(): class Dummy: pass assert pygount.common.mapped_repr(Dummy(), {}) == "Dummy()" assert ( pygount.common.mapped_repr(Dummy(), {"some": "such", "other": 1, "whatever": True}) == "Dummy(some=such, other=1, whatever=True)" ) ================================================ FILE: tests/test_encoding.py ================================================ """ Tests for encoding related functions. """ # Copyright (c) 2016-2025, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. from tempfile import NamedTemporaryFile import pytest from pygount.analysis import _BOM_TO_ENCODING_MAP, encoding_for, encoding_from_possible_magic_comment, is_binary_file from ._common import temp_binary_file, temp_source_file _ENCODING_TO_BOM_MAP = {encoding: bom for bom, encoding in _BOM_TO_ENCODING_MAP.items()} _TEST_CODE = "x = '\u00fd \u20ac'" @pytest.mark.parametrize( "ascii_header", [ "# encoding: cp1252", "# coding: cp1252", "# -*- coding: cp1252 -*-", "# eNcOdInG: cp1252", "## encoding: cp1252", "#encoding:cp1252", "# -*- coding: cp1252; mode: python; -*-" # Emacs modeline "/* coding: cp1252 */" # C "{ coding: cp1252 }" # Pascal "REM coding: cp1252", # Basic ], ) def test_can_detect_encoding_from_magic_comments(ascii_header: str): assert encoding_from_possible_magic_comment(ascii_header) == "cp1252" @pytest.mark.parametrize( "ascii_header", [ "", " ", " # encoding: cp1252", # Leading white space "# encoding: !$%&", "-*- coding: cp1252 -*-", # Not a comment "encoding: cp1252", '{"x":"encoding: cp1252"}', ], ) def test_can_ignore_encoding_from_magic_comments(ascii_header: str): assert encoding_from_possible_magic_comment(ascii_header) is None @pytest.mark.parametrize("encoding", _BOM_TO_ENCODING_MAP.values()) def test_can_detect_bom_encodings(encoding: str): _test_can_detect_bom_encoding(encoding) def _test_can_detect_bom_encoding(encoding: str): with NamedTemporaryFile(mode="wb+", 
suffix="txt") as test_file: if encoding != "utf-8-sig": bom = _ENCODING_TO_BOM_MAP[encoding] test_file.write(bom) test_file.write(_TEST_CODE.encode(encoding)) test_file.flush() test_file.seek(0) actual_encoding = encoding_for(test_file.name) assert actual_encoding == encoding @pytest.mark.parametrize("encoding", ["cp1252", "utf-8"]) def test_can_detect_plain_encoding(encoding: str): with temp_source_file("txt", _TEST_CODE, encoding=encoding) as test_file: actual_encoding = encoding_for(test_file.name) assert actual_encoding == encoding def test_can_detect_xml_prolog(): encoding = "iso-8859-15" xml_code = f'{_TEST_CODE}' with temp_source_file("xml", [xml_code], encoding=encoding) as test_file: actual_encoding = encoding_for(test_file.name) assert actual_encoding == encoding def test_can_detect_magic_comment(): encoding = "iso-8859-15" lines = ["#!/usr/bin/python", f"# -*- coding: {encoding} -*-", _TEST_CODE] with temp_source_file("txt", lines, encoding=encoding) as test_file: actual_encoding = encoding_for(test_file.name) assert actual_encoding == encoding def test_can_detect_automatic_encoding_for_empty_source(): with temp_binary_file(b"") as test_file: actual_encoding = encoding_for(test_file.name) assert actual_encoding == "utf-8" def test_can_detect_chardet_encoding(): test_path = __file__ actual_encoding = encoding_for(test_path) assert actual_encoding == "utf-8" def test_can_detect_utf8_when_cp1252_would_fail(): # Write closing double quote in UTF-8, which contains 0x9d, # which fails when read as CP1252. 
content = b"\xe2\x80\x9d" with temp_binary_file(content) as test_file: actual_encoding = encoding_for(test_file.name, encoding="automatic", fallback_encoding=None) assert actual_encoding == "utf-8" actual_encoding = encoding_for(test_file.name, encoding="automatic", fallback_encoding="cp1252") assert actual_encoding == "cp1252" def test_can_use_hardcoded_encoding(): with temp_source_file("txt", "\N{EURO SIGN}", encoding="cp1252") as test_file: test_path = test_file.name actual_encoding = encoding_for(test_path, "utf-8") assert actual_encoding == "utf-8" # Make sure that we cannot actually read the file using the hardcoded but wrong encoding. with open(test_path, encoding=actual_encoding) as broken_test_file, pytest.raises(UnicodeDecodeError): broken_test_file.read() def test_can_detect_binary_with_zero_byte(): with temp_binary_file(b"hello\0world") as binary_file: assert is_binary_file(binary_file.name) def test_can_detect_utf16_as_non_binary(): with NamedTemporaryFile(encoding="utf-16", mode="w+") as utf16_file: utf16_file.write("Hello world!") utf16_file.flush() utf16_file.seek(0) assert not is_binary_file(utf16_file.name) ================================================ FILE: tests/test_git_storage.py ================================================ from pathlib import Path from pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any def test_can_extract_git_remote_url_and_revision_if_any(): assert git_remote_url_and_revision_if_any("hello") == (None, None) assert git_remote_url_and_revision_if_any("git@github.com:roskakori/pygount.git/v1.5.1") == ( "git@github.com:roskakori/pygount.git", "v1.5.1", ) assert git_remote_url_and_revision_if_any("git@github.com:roskakori/pygount.git") == ( "git@github.com:roskakori/pygount.git", None, ) assert git_remote_url_and_revision_if_any("git@github.com:roskakori/pygount.git/") == ( "git@github.com:roskakori/pygount.git", None, ) assert git_remote_url_and_revision_if_any("") == (None, None) def 
test_can_extract_and_close_and_find_files_from_cloned_git_remote_url_with_revision(): remote_url, revision = git_remote_url_and_revision_if_any("https://github.com/roskakori/pygount.git/v0.1") assert remote_url is not None git_storage = GitStorage(remote_url, revision) pyproject_path = Path(git_storage.temp_folder) / "pyproject.toml" readme_path = Path(git_storage.temp_folder) / "README.rst" try: git_storage.extract() assert readme_path.exists() assert not pyproject_path.exists() finally: git_storage.close() assert not readme_path.exists() ================================================ FILE: tests/test_lexers.py ================================================ """ Tests for additional lexers for pygount. """ # Copyright (c) 2016-2024, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. from pygments import token import pygount.lexers def test_can_lex_idl(): lexer = pygount.lexers.IdlLexer() text = "\n".join( [ "/* some", " * comment */", "module HelloApp {", " interface Hello {", " string sayHello(); // Be friendly!", " };", "};", ] ) text_tokens = list(lexer.get_tokens(text)) assert text_tokens == [ (token.Token.Comment.Multiline, "/* some\n * comment */"), (token.Token.Text.Whitespace, "\n"), (token.Token.Keyword.Declaration, "module"), (token.Token.Text, " "), (token.Token.Name.Class, "HelloApp"), (token.Token.Text.Whitespace, " "), (token.Token.Punctuation, "{"), (token.Token.Text.Whitespace, "\n"), (token.Token.Text.Whitespace, " "), (token.Token.Keyword.Declaration, "interface"), (token.Token.Text, " "), (token.Token.Name.Class, "Hello"), (token.Token.Text.Whitespace, " "), (token.Token.Punctuation, "{"), (token.Token.Text.Whitespace, "\n"), (token.Token.Text.Whitespace, " "), (token.Token.Name, "string"), (token.Token.Text.Whitespace, " "), (token.Token.Name.Function, "sayHello"), (token.Token.Punctuation, "("), (token.Token.Punctuation, ")"), (token.Token.Punctuation, ";"), (token.Token.Text.Whitespace, " "), 
(token.Token.Comment.Single, "// Be friendly!"), (token.Token.Text.Whitespace, "\n"), (token.Token.Text.Whitespace, " "), (token.Token.Punctuation, "}"), (token.Token.Punctuation, ";"), (token.Token.Text.Whitespace, "\n"), (token.Token.Punctuation, "}"), (token.Token.Punctuation, ";"), (token.Token.Text.Whitespace, "\n"), ] def test_can_lex_m4(): lexer = pygount.lexers.MinimalisticM4Lexer() text = "" text += "#\n" text += "# comment\n" text += "define(FRUIT, apple) # Healthy stuff!\n" text += "Eat some FRUIT!" text_tokens = list(lexer.get_tokens(text)) assert text_tokens == [ (token.Token.Comment.Single, "#\n"), (token.Token.Comment.Single, "# comment\n"), (token.Token.Text, "define(FRUIT, apple) "), (token.Token.Comment.Single, "# Healthy stuff!\n"), (token.Token.Text, "Eat some FRUIT!\n"), ] def test_can_lex_vbscript(): lexer = pygount.lexers.MinimalisticVBScriptLexer() text = "".join(["' comment\n", 'WScript.Echo "hello world!"']) text_tokens = list(lexer.get_tokens(text)) assert text_tokens == [ (token.Token.Comment.Single, "' comment\n"), (token.Token.Text, 'WScript.Echo "hello world!"\n'), ] def test_can_lex_webfocus(): lexer = pygount.lexers.MinimalisticWebFocusLexer() text = "".join(["-*\n", "-* comment\n", "-set &some='text';\n", "table file some print * end;"]) text_tokens = list(lexer.get_tokens(text)) assert text_tokens == [ (token.Token.Comment.Single, "-*\n"), (token.Token.Comment.Single, "-* comment\n"), (token.Token.Text, "-set &some='text';\n"), (token.Token.Text, "table file some print * end;\n"), ] def test_can_lex_plain_text(): lexer = pygount.lexers.PlainTextLexer() text = "".join( [ "a\n", # line with text "\n", # empty line " \t \n", # line containing only white space " ", # trailing while space line without newline character ] ) text_tokens = list(lexer.get_tokens(text)) assert text_tokens == [(token.Token.Comment.Single, "a\n"), (token.Token.Text, "\n \t \n \n")] ================================================ FILE: tests/test_summary.py 
================================================
"""
Tests to summarize analyses of multiple source codes.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
from pygount.analysis import SourceAnalysis, SourceState
from pygount.summary import LanguageSummary, ProjectSummary


def test_can_repr_language_summary():
    """A summary for a real language shows the file count and all line counts."""
    language_summary = LanguageSummary("Python")
    language_summary.add(SourceAnalysis("some.py", "Python", "some", 2, 3, 4, 5, SourceState.analyzed))
    expected_language_summary_repr = (
        "LanguageSummary(language='Python', file_count=1, "
        "code_count=2, documentation_count=3, empty_count=4, string_count=5)"
    )
    assert repr(language_summary) == expected_language_summary_repr
    assert repr(language_summary) == str(language_summary)


def test_can_repr_pseudo_language_summary():
    """A summary for a pseudo language such as ``__empty__`` shows only the file count."""
    language_summary = LanguageSummary("__empty__")
    language_summary.add(SourceAnalysis("some.py", "__empty__", "some", 0, 0, 0, 0, SourceState.empty))
    expected_language_summary_repr = "LanguageSummary(language='__empty__', file_count=1)"
    assert repr(language_summary) == expected_language_summary_repr
    assert repr(language_summary) == str(language_summary)


def test_can_summarize_project_with_multiple_files_of_same_language():
    source_analyses = (
        SourceAnalysis("some.py", "Python", "some", 300, 70, 4, 2, SourceState.analyzed),
        SourceAnalysis("other.py", "Python", "some", 700, 30, 6, 3, SourceState.analyzed),
    )
    project_summary = ProjectSummary()
    for source_analysis in source_analyses:
        project_summary.add(source_analysis)
    assert set(project_summary.language_to_language_summary_map.keys()) == {"Python"}
    assert project_summary.total_file_count == 2
    assert project_summary.total_code_count == 1000
    assert project_summary.total_documentation_count == 100
    assert project_summary.total_empty_count == 10
    assert project_summary.total_string_count == 5


def test_can_summarize_project_with_multiple_files_of_different_languages():
    source_analyses = (
        SourceAnalysis("some.py", "Python", "some", 1000, 100, 10, 3, SourceState.analyzed),
        SourceAnalysis("some.sh", "Bash", "some", 200, 20, 5, 2, SourceState.analyzed),
    )
    project_summary = ProjectSummary()
    for source_analysis in source_analyses:
        project_summary.add(source_analysis)
    assert set(project_summary.language_to_language_summary_map.keys()) == {"Bash", "Python"}
    assert project_summary.total_file_count == 2
    assert project_summary.total_code_count == 1200
    assert project_summary.total_documentation_count == 120
    assert project_summary.total_empty_count == 15
    assert project_summary.total_string_count == 5
    assert (
        repr(project_summary) == "ProjectSummary(total_file_count=2, total_line_count=1340, languages=['Bash', 'Python'])"
    )


def test_can_summarize_project_with_pseudo_languages():
    """Pseudo languages count as files but add nothing to the line totals."""
    source_analyses = (
        SourceAnalysis("empty.py", "__empty__", "some", 0, 0, 0, 0, SourceState.empty),
        SourceAnalysis("generated.py", "__generated__", "some", 1, 2, 3, 4, SourceState.generated, "generated by test"),
        SourceAnalysis("binary.bin", "__binary__", "some", 0, 0, 0, 0, SourceState.binary),
    )
    expected_languages = {source_analysis.language for source_analysis in source_analyses}
    project_summary = ProjectSummary()
    for source_analysis in source_analyses:
        project_summary.add(source_analysis)
    assert project_summary.total_file_count == 3
    assert set(project_summary.language_to_language_summary_map.keys()) == expected_languages
    # Even the generated file with non-zero counts leaves the totals at 0.
    assert project_summary.total_code_count == 0
    assert project_summary.total_documentation_count == 0
    assert project_summary.total_empty_count == 0
    assert project_summary.total_string_count == 0
    assert repr(project_summary) == (
        "ProjectSummary(total_file_count=3, total_line_count=0, languages=['__binary__', '__empty__', '__generated__'])"
    )


def test_can_repr_empty_project_summary():
    project_summary = ProjectSummary()
    assert repr(project_summary) == "ProjectSummary(total_file_count=0, total_line_count=0, languages=[])"
    assert repr(project_summary) == str(project_summary)


================================================
FILE: tests/test_write.py
================================================
"""
Test to write results of pygount analyses.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import io
import re
import tempfile
from pathlib import Path
from xml.etree import ElementTree

import pytest

from pygount import analysis, write

from ._common import TempFolderTest


def test_can_collect_totals():
    """The base writer accumulates file and line totals and timing figures."""
    source_analyses = (
        analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed, None),
        analysis.SourceAnalysis("other.py", "Python", "some", 10, 20, 30, 40, analysis.SourceState.analyzed, None),
    )
    with (
        tempfile.NamedTemporaryFile("w", encoding="utf-8", prefix="pygount_", suffix=".tmp") as target_stream,
        write.BaseWriter(target_stream) as writer,
    ):
        for source_analysis in source_analyses:
            writer.add(source_analysis)
    assert writer.project_summary.total_file_count == 2
    assert writer.project_summary.total_line_count == 110
    assert writer.duration_in_seconds > 0
    assert writer.lines_per_second > writer.files_per_second


def test_can_write_cloc_xml():
    """The cloc XML output contains one ``files/file`` element per added analysis."""
    source_analyses = (
        analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed, None),
        analysis.SourceAnalysis("other.py", "Python", "some", 10, 20, 30, 40, analysis.SourceState.analyzed, None),
    )
    with io.StringIO() as target_stream:
        with write.ClocXmlWriter(target_stream) as writer:
            for source_analysis in source_analyses:
                writer.add(source_analysis)
        # Read the XML after the writer has been closed but before the stream is.
        xml_data = target_stream.getvalue()
        assert len(xml_data) >= 1
    with io.StringIO(xml_data) as cloc_xml_stream:
        cloc_results_root = ElementTree.parse(cloc_xml_stream)
        file_elements = cloc_results_root.findall("files/file")
        assert file_elements is not None
        assert len(file_elements) == len(source_analyses)


def test_can_compute_digit_width():
    assert write.digit_width(0) == 1
    assert write.digit_width(1) == 1
    assert write.digit_width(9) == 1
    assert write.digit_width(999) == 3
    assert write.digit_width(1000) == 4


_LINE_WORD_REGEX = re.compile(r"[\w\\.]+")  # HACK: For test assume all language names are "\w+".


class _LineData:
    """Columns of one summary line, split into words by ``_LINE_WORD_REGEX``."""

    def __init__(self, line: str):
        line_parts = _LINE_WORD_REGEX.findall(line)
        # Word 0 is the language; the following words alternate count and percentage.
        self.language = line_parts[0]
        self.file_count = int(line_parts[1])
        self.file_percentage = float(line_parts[2])
        self.code_count = int(line_parts[3])
        self.code_percentage = float(line_parts[4])
        self.comment_count = int(line_parts[5])
        self.comment_percentage = float(line_parts[6])


class SummaryWriterTest(TempFolderTest):
    """Tests for :py:class:`write.SummaryWriter`."""

    def test_can_write_summary(self):
        source_analyses = (
            analysis.SourceAnalysis("script.sh", "Bash", "some", 200, 25, 1, 2, analysis.SourceState.analyzed, None),
            analysis.SourceAnalysis("some.py", "Python", "some", 300, 45, 3, 4, analysis.SourceState.analyzed, None),
            analysis.SourceAnalysis("other.py", "Python", "some", 500, 30, 5, 6, analysis.SourceState.analyzed, None),
        )
        lines = self._summary_lines_for(source_analyses)
        assert len(lines) == 8, f"lines={lines}"

        # The test expects Python at line index 3 and Bash at index 4.
        python_data = _LineData(lines[3])
        assert python_data.language == "Python"
        assert python_data.file_count == 2
        assert python_data.file_percentage == pytest.approx(66.7)
        assert python_data.code_count == 800
        assert python_data.code_percentage == pytest.approx(89.6)
        assert python_data.comment_count == 75
        assert python_data.comment_percentage == pytest.approx(8.4)

        bash_data = _LineData(lines[4])
        assert bash_data.language == "Bash"
        assert bash_data.file_count == 1
        assert bash_data.code_count == 200
        assert bash_data.code_percentage == pytest.approx(87.7)
        assert bash_data.comment_count == 25
        assert bash_data.comment_percentage == pytest.approx(11.0)

        # The second to last line holds the totals over all languages.
        sum_total_data = _LineData(lines[-2])
        assert sum_total_data.file_count == 3
        assert sum_total_data.file_percentage == pytest.approx(100.0)
        assert sum_total_data.code_count == 1000
        assert sum_total_data.code_percentage == pytest.approx(89.2)
        assert sum_total_data.comment_count == 100
        assert sum_total_data.comment_percentage == pytest.approx(8.9)

    def _summary_lines_for(self, source_analyses):
        """Write a summary of ``source_analyses`` to a temp file and return its lines."""
        # NOTE: We need to write to a file because the lines containing the
        # actual data are only available during close() at which point they
        # would not be accessible to StringIO.getvalue().
        summary_path = Path(self.tests_temp_folder, "summary.tmp")
        with summary_path.open("w", encoding="utf-8") as summary_file, write.SummaryWriter(summary_file) as writer:
            for source_analysis in source_analyses:
                writer.add(source_analysis)
        return summary_path.read_text("utf-8").splitlines()


================================================
FILE: tests/test_xmldialect.py
================================================
"""
Tests for function to obtain the language dialect used by XML source code.
"""

import pytest

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import pygount.xmldialect
from pygount.xmldialect import without_xml_header

# NOTE(review): the example literals and parametrized XML inputs below contain
# no XML markup, which looks like lost content - confirm against the repository.
EXAMPLE_ANT_CODE = """ """

_EXAMPLE_POM_CODE = """ 4.0.0 com.mycompany.app my-app 1.0-SNAPSHOT jar Maven Quick Start Archetype http://maven.apache.org junit junit 4.8.2 test """

_EXAMPLE_DOCBOOK_DTD_CODE = """ Hello World in Python print('Hello World!') """

_EXAMPLE_SVG_CODE = (
    '\n'
    ''
    ''
    ' '
    ' Hello, world!"
    ""
)


@pytest.mark.parametrize(
    "xml_code,expected",
    [
        ("", ""),
        ("", ""),
        ('', ""),
        (' ', ""),
        (' ', ""),
        ('\n\n\n\n', ""),
    ],
)
def test_can_compute_xml_code_without_header(xml_code: str, expected: str):
    assert without_xml_header(xml_code) == expected


def test_can_detect_ant():
    assert pygount.xmldialect.xml_dialect("", EXAMPLE_ANT_CODE) == "Ant"


def test_can_detect_maven():
    assert pygount.xmldialect.xml_dialect("", _EXAMPLE_POM_CODE) == "Maven"


def test_can_ignore_broken_xml():
    assert pygount.xmldialect.xml_dialect("", "") is None


def test_can_detect_docbook_from_dtd():
    assert pygount.xmldialect.xml_dialect("", _EXAMPLE_DOCBOOK_DTD_CODE) == "DocBook XML"


def test_can_detect_svg_from_dtd():
    assert pygount.xmldialect.xml_dialect("", _EXAMPLE_SVG_CODE) == "SVG XML"