Repository: roskakori/pygount
Branch: main
Commit: c05d365b2447
Files: 52
Total size: 215.5 KB

Directory structure:
gitextract_04wbqrob/

├── .gitattributes
├── .github/
│   └── workflows/
│       └── build.yml
├── .gitignore
├── .idea/
│   ├── .gitignore
│   ├── encodings.xml
│   ├── inspectionProfiles/
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── pyProjectModel.xml
│   ├── pygount.iml
│   └── vcs.xml
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGES.md
├── CONTRIBUTING.md
├── LICENSE.txt
├── README.md
├── docs/
│   ├── api.md
│   ├── background.md
│   ├── changes.md
│   ├── continuous-integration.md
│   ├── contributing.md
│   ├── index.md
│   ├── installation.md
│   ├── json.md
│   └── usage.md
├── mkdocs.yaml
├── pygount/
│   ├── __init__.py
│   ├── analysis.py
│   ├── command.py
│   ├── common.py
│   ├── git_storage.py
│   ├── lexers.py
│   ├── summary.py
│   ├── write.py
│   └── xmldialect.py
├── pyproject.toml
├── scripts/
│   ├── build_documentation.sh
│   ├── build_movie.sh
│   ├── test_coverage.sh
│   └── update_dependencies.sh
└── tests/
    ├── __init__.py
    ├── _common.py
    ├── test_analysis.py
    ├── test_command.py
    ├── test_common.py
    ├── test_encoding.py
    ├── test_git_storage.py
    ├── test_lexers.py
    ├── test_summary.py
    ├── test_write.py
    └── test_xmldialect.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
* text=auto


================================================
FILE: .github/workflows/build.yml
================================================
# Continuous integration build for pygount.
name: Build

on: [push, pull_request]

jobs:
  # Build the package and run the test suite on every supported Python
  # version. Documentation and coverage upload run only once, on the job
  # whose matrix version equals MAIN_PYTHON_VERSION.
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
    env:
      MAIN_PYTHON_VERSION: "3.12" # same as Ubuntu 24 LTS

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          # NOTE Using the "latest" version of uv is risky, but for the time being uv is updated
          #  regularly, so a specific version would be outdated rather quickly. Once uv reaches
          #  version 1.0, this should be changed to something like ">=1 <2".
          version: "latest"
      - name: Load cached venv
        id: cached-uv-dependencies
        uses: actions/cache@v4
        with:
          path: .venv
          # The Python version is part of the key so that each matrix job gets
          # its own virtual environment. Without it, a venv created by one
          # interpreter could be restored into a job meant to test another.
          key: venv-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('**/uv.lock') }}
      - name: Install dependencies
        # Skipped when the venv was restored from the cache.
        if: steps.cached-uv-dependencies.outputs.cache-hit != 'true'
        run: |
          uv sync
      - name: Build pygount package
        run: |
          uv build
      - name: Run the test suite
        run: |
          uv run pytest --cov=pygount --cov-branch
      - name: Build documentation
        # Only needed once per workflow run, so limit it to the main Python version.
        if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}
        run: |
          uv run sh scripts/build_documentation.sh
      - name: Update coveralls statistics
        # Only upload coverage once per workflow run, from the main Python version.
        if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          uv run coveralls --service=github

  # Run all pre-commit hooks (ruff, prettier, generic file checks) on the
  # whole repository.
  check-style:
    runs-on: ubuntu-latest
    # Disable the pre-commit check on main to prevent pull request merges
    # from failing with "don't commit to branch".
    if: github.ref != 'refs/heads/main'
    env:
      # Job-level `env` is not shared between jobs, so the value set in the
      # `build` job is not visible here and has to be defined again.
      # Without it, `env.MAIN_PYTHON_VERSION` evaluates to an empty string.
      MAIN_PYTHON_VERSION: "3.12" # same as Ubuntu 24 LTS
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ env.MAIN_PYTHON_VERSION }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.MAIN_PYTHON_VERSION }}
      - name: Install pre-commit
        run: |
          pip install pre-commit
      - name: Load cached pre-commit
        id: cached-pre-commit
        uses: actions/cache@v4
        with:
          path: ~/.cache/pre-commit
          # Hook environments only change when the pre-commit configuration changes.
          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}
      - name: Install pre-commit hooks
        # Skipped when the hook environments were restored from the cache.
        if: steps.cached-pre-commit.outputs.cache-hit != 'true'
        run: pre-commit install --install-hooks
      - name: Check coding style
        run: pre-commit run --all-files


================================================
FILE: .gitignore
================================================

# Created by https://www.toptal.com/developers/gitignore/api/python,pycharm
# Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn.  Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/

# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/

# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$

# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# End of https://www.toptal.com/developers/gitignore/api/python,pycharm

# Various
.DS_Store
.pytest_cache
/.idea/ruff.xml
/build/
/dist/
/cloc.xml
/tests/.temp/
/htmlcov/


================================================
FILE: .idea/.gitignore
================================================
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml


================================================
FILE: .idea/encodings.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

================================================
FILE: .idea/inspectionProfiles/Project_Default.xml
================================================
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ourVersions">
        <value>
          <list size="5">
            <item index="0" class="java.lang.String" itemvalue="3.7" />
            <item index="1" class="java.lang.String" itemvalue="3.8" />
            <item index="2" class="java.lang.String" itemvalue="3.9" />
            <item index="3" class="java.lang.String" itemvalue="3.10" />
            <item index="4" class="java.lang.String" itemvalue="3.10" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="2">
            <item index="0" class="java.lang.String" itemvalue="coverage" />
            <item index="1" class="java.lang.String" itemvalue="setuptools" />
          </list>
        </value>
      </option>
    </inspection_tool>
  </profile>
</component>

================================================
FILE: .idea/misc.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Poetry (pygount)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="uv (pygount)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
</project>

================================================
FILE: .idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/pygount.iml" filepath="$PROJECT_DIR$/.idea/pygount.iml" />
    </modules>
  </component>
</project>

================================================
FILE: .idea/pyProjectModel.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PyProjectModelSettings">
    <option name="showConfigurationNotification" value="false" />
  </component>
</project>

================================================
FILE: .idea/pygount.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/dist" />
      <excludeFolder url="file://$MODULE_DIR$/.pytest_cache" />
      <excludeFolder url="file://$MODULE_DIR$/.idea/libraries" />
      <excludeFolder url="file://$MODULE_DIR$/pygount.egg-info" />
      <excludeFolder url="file://$MODULE_DIR$/htmlcov" />
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
      <excludeFolder url="file://$MODULE_DIR$/build" />
      <excludeFolder url="file://$MODULE_DIR$/site" />
    </content>
    <orderEntry type="jdk" jdkName="uv (pygount)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" name="R User Library" level="project" />
    <orderEntry type="library" name="R Skeletons" level="application" />
  </component>
  <component name="PackageRequirementsSettings">
    <option name="requirementsPath" value="" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="py.test" />
  </component>
</module>

================================================
FILE: .idea/vcs.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

================================================
FILE: .pre-commit-config.yaml
================================================
exclude: "^.idea"

repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.15.12
    hooks:
      - id: ruff
        args: ["--fix"]
      - id: ruff-format

  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v3.1.0
    hooks:
      - id: prettier

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: fix-byte-order-marker
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: mixed-line-ending
      - id: check-added-large-files
      - id: check-ast
      - id: check-json
      - id: check-merge-conflict
      - id: check-xml
      - id: check-yaml
      - id: debug-statements

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: no-commit-to-branch
        args: ["--branch", "main"]


================================================
FILE: .readthedocs.yaml
================================================
# Settings for "Read the Docs" build.
# See <https://docs.readthedocs.io/>.
version: 2

build:
  os: "ubuntu-24.04"
  tools:
    python: "3.14"

mkdocs:
  configuration: mkdocs.yaml

python:
  install:
    - method: uv
      command: sync
      groups:
        - dev


================================================
FILE: CHANGES.md
================================================
# Version history

For more information about which versions of pygount included what changes
read the
[respective chapter of the documentation](https://pygount.readthedocs.io/en/latest/changes/).


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to pygount

For more information on building pygount and contributing to it, read the
[respective chapter of the documentation](https://pygount.readthedocs.io/en/latest/contributing/).


================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2016-2024, Thomas Aglassinger
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of pygount nor the names of its contributors may be used to
  endorse or promote products derived from this software without specific
  prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: README.md
================================================
[![PyPI](https://img.shields.io/pypi/v/pygount)](https://pypi.org/project/pygount/)
[![Python Versions](https://img.shields.io/pypi/pyversions/pygount.svg)](https://www.python.org/downloads/)
[![Build Status](https://github.com/roskakori/pygount/actions/workflows/build.yml/badge.svg)](https://github.com/roskakori/pygount/actions/workflows/build.yml)
[![Test Coverage](https://img.shields.io/coveralls/github/roskakori/pygount)](https://coveralls.io/r/roskakori/pygount?branch=main)
[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![License](https://img.shields.io/github/license/roskakori/pygount)](https://opensource.org/licenses/BSD-3-Clause)

# pygount

Pygount is a command line tool to scan folders for source code files and
count the number of source code lines in them. It is similar to tools like
[sloccount](https://www.dwheeler.com/sloccount/) and
[cloc](https://github.com/AlDanial/cloc) but uses the
[pygments](https://pygments.org/)
package to analyze the source code and consequently can analyze any
[programming language supported by pygments](https://pygments.org/languages/).

The name is a combination of pygments and count.

Pygount is open source and distributed under the
[BSD license](https://opensource.org/licenses/BSD-3-Clause). The source
code is available from https://github.com/roskakori/pygount.

## Quickstart

For installation run

```bash
$ pip install pygount
```

or use [uv](https://docs.astral.sh/uv/) to run it directly, for example:

```bash
$ uvx pygount --version
```

To get a list of line counts for a project stored in a certain folder:

```bash
$ pygount ~/projects/example
```

To limit the analysis to certain file types identified by their suffix:

```bash
$ pygount --suffix=cfg,py,yml ~/projects/example
```

To get a summary of each programming language with sum counts and percentage:

```bash
$ pygount --format=summary ~/projects/example
```

To analyze a remote git repository directly without having to clone it first:

```bash
$ pygount --format=summary https://github.com/roskakori/pygount.git
```

You can pass a specific revision at the end of the remote URL:

```bash
$ pygount --format=summary https://github.com/roskakori/pygount.git/v1.5.1
```

This example results in the following summary output:

```
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━┓
┃ Language         ┃ Files ┃     % ┃ Code ┃    % ┃ Comment ┃    % ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━┩
│ Python           │    18 │  47.4 │ 2132 │ 63.6 │     418 │ 12.5 │
│ TOML             │     2 │   5.3 │ 1204 │ 82.7 │       1 │  0.1 │
│ Batchfile        │     1 │   2.6 │   24 │ 68.6 │       1 │  2.9 │
│ Bash             │     2 │   5.3 │   12 │ 80.0 │       3 │ 20.0 │
│ Makefile         │     1 │   2.6 │    9 │ 45.0 │       7 │ 35.0 │
│ reStructuredText │     9 │  23.7 │    0 │  0.0 │     438 │ 50.2 │
│ Markdown         │     3 │   7.9 │    0 │  0.0 │      53 │ 49.1 │
│ Text only        │     2 │   5.3 │    0 │  0.0 │      24 │ 82.8 │
├──────────────────┼───────┼───────┼──────┼──────┼─────────┼──────┤
│ Sum              │    38 │ 100.0 │ 3381 │ 57.4 │     945 │ 16.1 │
└──────────────────┴───────┴───────┴──────┴──────┴─────────┴──────┘
```

Plenty of tools can post process SLOC information, for example the
[SLOCCount plug-in](https://wiki.jenkins-ci.org/display/JENKINS/SLOCCount+Plugin)
for the [Jenkins](https://jenkins.io/) continuous integration server.

A popular format for such tools is the XML format used by cloc, which pygount
also supports and can store in an output file:

```bash
$ pygount --format=cloc-xml --out=cloc.xml ~/projects/example
```

To get a short description of all available command line options use:

```bash
$ pygount --help
```

For more information and examples read the documentation chapter on
[Usage](https://pygount.readthedocs.io/en/latest/usage/).

## Contributions

To report bugs, visit the
[issue tracker](https://github.com/roskakori/pygount/issues).

In case you want to play with the source code or contribute improvements, see
[CONTRIBUTING](https://pygount.readthedocs.io/en/latest/contributing/).

## Version history

See [CHANGES](https://pygount.readthedocs.io/en/latest/changes/).


================================================
FILE: docs/api.md
================================================
# API

## Overview

Pygount provides a simple API to integrate with other tools. This, however, is currently still a work in progress and subject to change.

Here's an example of how to analyze one of pygount's own source files:

```pycon
>>> from pygount import SourceAnalysis
>>> SourceAnalysis.from_file("pygount/analysis.py", "pygount")
SourceAnalysis(path='pygount/analysis.py', language='Python', group='pygount', state=analyzed, code_count=509, documentation_count=141, empty_count=117, string_count=23)
```

Information about multiple source files can be summarized using `ProjectSummary`:

First, set up the summary:

```pycon
>>> from pygount import ProjectSummary
>>> project_summary = ProjectSummary()
```

Next, find some files to analyze:

```pycon
>>> from glob import glob
>>> source_paths = glob("pygount/*.py") + glob("*.md")
>>> source_paths
['pygount/command.py', 'pygount/analysis.py', 'pygount/write.py', 'pygount/__init__.py', 'pygount/xmldialect.py', 'pygount/summary.py', 'pygount/common.py', 'pygount/lexers.py', 'README.md', 'CONTRIBUTING.md', 'CHANGES.md']
```

Then analyze them:

```pycon
>>> for source_path in source_paths:
...     source_analysis = SourceAnalysis.from_file(source_path, "pygount")
...     project_summary.add(source_analysis)
```

Finally, take a look at the information collected, for example, by printing the values of `ProjectSummary.language_to_language_summary_map`:

```pycon
>>> for language_summary in project_summary.language_to_language_summary_map.values():
...   print(language_summary)
...
LanguageSummary(language='Python', file_count=8, code=1232, documentation=295, empty=331, string=84)
LanguageSummary(language='markdown', file_count=3, code=64, documentation=0, empty=29, string=14)
```


================================================
FILE: docs/background.md
================================================
# Background

## How pygount counts code

Pygount primarily counts the physical lines of source code. It begins by using lexers from Pygments, if available. If Pygments doesn't have a suitable lexer, pygount employs its own internal lexers to differentiate between code and comments. These include:

- Minimalist lexers for m4, VBScript, and WebFOCUS, capable of distinguishing between comments and code.
- The Java lexer repurposed for OMG IDL.

Additionally, plain text is treated with a separate lexer that considers all lines as comments.

Lines consisting solely of comment tokens and whitespace are counted as comments.

Lines with only whitespace are ignored.

All other content is considered code.

## White characters

A line containing only "white characters" is also ignored because they do not contribute to code complexity in any meaningful way. Currently, white characters are:

```
(),:;[]{}
```

Because of that, pygount tends to report about 5 to 15 percent fewer SLOC for C-like languages than other similar tools.

## No operations

For some languages, "no operations" are detected and treated as white space. For example, Python's `pass` or Transact-SQL's `begin` and `end`.

As an example, consider this Python code:

```python
class SomeError(Exception):
    """
    Some error caused by some issue.
    """
    pass
```

This counts as 1 line of code and 3 lines of comments. The line with `pass` is considered a "no operation" and thus not taken into account.

## Pure string lines

Many programming languages support the concept of strings, which typically contain text to be shown to the end user or simple constant values. Similar to white characters and "no operations", in most cases they do not add much to the complexity of the code. Notable exceptions are strings containing code for domain-specific languages, templates, or SQL statements.

Pygount currently takes an opinionated approach on how to count pure string lines depending on the output format:

- With `--format=summary`, pure string lines are ignored similar to empty lines
- With `--format` set to `sloccount` or `cloc-xml` string lines are counted as code, resulting in somewhat similar counts as the original tools.
- With `--format=json` all variants are available as attributes, and you can choose which one you prefer.

In hindsight, this is an inconsistency that might warrant a cleanup. See issue [#122](https://github.com/roskakori/pygount/issues/122) for a discussion and issue [#152](https://github.com/roskakori/pygount/issues/152) for a plan on how to clean this up.

## Binary files

A file is considered to be binary when all the following
conditions match:

1. The file does not start with a BOM for UTF-8, UTF-16 or UTF-32 (which indicates text files).
2. The initial 8192 bytes contain at least one 0-byte.

In this case, pygount assigns it the pseudo language `__binary__` and performs no further analysis.

## Generated files

Generated files are recognized either by their content (`--generated`) or name (`--generated-names`). Use `--help` to see the current default patterns.

In case you think the standard patterns should be extended, modify `pygount.analysis.DEFAULT_GENERATED_LINE|NAME_PATTERNS_TEXT` and [contribute a pull request](contributing.md).

For source code repositories, committing generated files should generally be avoided. Instead, make the generation part of the build process. However, there are valid reasons to include generated files:

1. Package managers generate "lock" files from the package specification to ensure builds use the exact same versions and hashes. For example, "pyproject.toml" and "uv.lock".
2. Generation takes too long, for example, in Flutter projects with many nested sub-packages.
3. Generated files cannot be bootstrapped from scratch because of interdependencies.
4. Cloud tools require certain generated files to be present in the repository. An example would be [ReadTheDocs.org](https://readthedocs.org), which as of May 2025 in combination with [MkDocs](https://www.mkdocs.org/) needs additional dependencies to be specified in a `requirements.txt`. Many Python projects specify their dependencies in `pyproject.toml`, which can be used to generate the `requirements.txt`. However, the ReadTheDocs build does not allow easily including such a step, so the path of least resistance is to just include the generated `requirements.txt` file in the repository.

## Comparison with other tools

Pygount can analyze more languages than other common tools such as sloccount or cloc because it builds on `pygments`, which provides lexers for hundreds of languages. This also enables supporting another language: [Write your own lexer](http://pygments.org/docs/lexerdevelopment/).

For certain corner cases, pygount gives more accurate results because it actually lexes the code unlike other tools that mostly look for comment markers and can get confused when they show up inside strings. In practice, though, this should not make much of a difference.

Pygount is slower than most other tools. Partially, this is due to actually lexing instead of just scanning the code. Partially, because other tools can use statically compiled languages such as Java or C, which are generally faster than dynamic languages. For many applications though, pygount should be "fast enough", especially when running as an asynchronous step during a continuous integration build.


================================================
FILE: docs/changes.md
================================================
# Changes

This chapter describes the changes coming with each new version of
pygount.

## Version 3.3.0, 2026-xx-xx

Development:

- Migrate ReadTheDocs documentation to uv (issue [#221](https://github.com/roskakori/pygount/issues/221)).

## Version 3.2.0, 2026-04-08

- Add detection of SVG as XML dialect (issue [#209](https://github.com/roskakori/pygount/issues/209)).
- Fix detection of XML dialect when a `<?xml>` header was present.

## Version 3.1.1, 2025-02-17

- Update dependencies and drop support for Python 3.9 (issue [#205](https://github.com/roskakori/pygount/issues/205)).

## Version 3.1.0, 2025-05-27

- Add command line option [`--generated-names`](usage.md#-generated-names) to specify which file names should be considered to be generated. The current patterns recognized are somewhat limited, so contributions are welcome. See the section on "[Generated files](background.md#generated-files)" for hints on how to do that (issue [#190](https://github.com/roskakori/pygount/issues/190)).
- Change documentation from Sphinx to MkDocs in the hope to avoid it breaking regularly (issue [#191](https://github.com/roskakori/pygount/issues/191)).

Development:

- Replace `format()` with f-strings (contributed by Ben Allen, issue [#166](https://github.com/roskakori/pygount/issues/166)).
- Change sdist archive to include more than just the Python source code.

## Version 3.0.0, 2025-05-23

- Count pure markup files as documentation (contributed by Tytus Bucholc, issue [#6](https://github.com/roskakori/pygount/issues/6)).
- Fix silent error on git failing (contributed by Tom De Bièvre, issue [#162](https://github.com/roskakori/pygount/issues/162)).
- Transform common project URLs to repository (contributed by Tom De Bièvre, issue [#164](https://github.com/roskakori/pygount/issues/164)).
- Change dependency rules for rich to be more lenient (suggested by Brian McGillion, issue [#193](https://github.com/roskakori/pygount/issues/193)).

## Version 2.0.0, 2025-03-16

- Fix `TypeError` when processing files with a magic encoding comment specifying an unknown encoding and using `--format=json` (contributed by PyHedgehog, issue [#176](https://github.com/roskakori/pygount/issues/176))
- Fix false positives when extracting the encoding from magic coding comments (issue [#184](https://github.com/roskakori/pygount/issues/184))
- Add support for Python 3.13 and later (issue [#174](https://github.com/roskakori/pygount/issues/174))
- Remove temporary directory in the output of a git analysis (contributed by Isabel Beckenbach, issue [#113](https://github.com/roskakori/pygount/issues/113))
- Remove support for Python 3.8 (issue [#158](https://github.com/roskakori/pygount/issues/158))
- Development: Change packaging to uv (issue [#180](https://github.com/roskakori/pygount/issues/180)).
- Development: Change linter to ruff and in turn, clean up code (issue [#157](https://github.com/roskakori/pygount/issues/157)).
- Development: Change default branch to main (issue [#160](https://github.com/roskakori/pygount/issues/160)).
- Removed deprecated code: (contributed by Marco Gambone and Niels Vanden Bussche, issue [#47](https://github.com/roskakori/pygount/issues/47)).

## Version 1.8.0, 2024-05-13

- Add all available counts and percentages to JSON format (issue [#122](https://github.com/roskakori/pygount/issues/122)).

  In particular, this makes available the `codeCount`, which is similar to the already existing `sourceCount` but does exclude lines that contain only strings. You can check their availability by validating that the `formatVersion` is at least 1.1.0.

  The documentation about "[How pygount counts code](background.md#how-pygount-counts-code)" has more information about the available counts and the ways they are computed.

  Pygount 2.0 will probably introduce some breaking changes in this area, which can already be previewed and discussed at issue [#152](https://github.com/roskakori/pygount/issues/152).

## Version 1.7.0, 2024-05-13

- Fix analysis with [FIPS](https://en.wikipedia.org/wiki/Federal_Information_Processing_Standards) mode by changing computation of hash for duplicate detection from MD5 to SHA256. As a side effect, reasonably modern machines should receive a (probably unnoticeable) minor performance boost (contributed by Matthew Vine, issue [#137](https://github.com/roskakori/pygount/issues/137)).
- Add command line option `--merge-embedded-languages` to merge embedded languages into their base language. For example, "HTML+Django/Jinja" counts as "HTML" (issue [#105](https://github.com/roskakori/pygount/issues/105)).
- Add Python 3.12 and make it the main version for CI (issue [#145](https://github.com/roskakori/pygount/issues/145)).

## Version 1.6.1, 2023-07-02

- Fix missing check for seekable file handles (issue [#114](https://github.com/roskakori/pygount/issues/114)).
- Fix the ReadTheDocs documentation build by switching to the built-in alabaster Sphinx theme (issue [#116](https://github.com/roskakori/pygount/issues/116)).

## Version 1.6.0, 2023-06-26

- Add support for analysis of remote git URLs in addition to local files (contributed by Rojdi Thomallari, issue [#109](https://github.com/roskakori/pygount/issues/109)).
- Removed support for Python 3.7.
- Improve API:
  - Add an option to pass a file handle to `SourceAnalysis.from_file()` (contributed by Dominik George, issue [#100](https://github.com/roskakori/pygount/issues/100)).

## Version 1.5.1, 2023-01-02

- Remove progress bar for `--format=sloccount` because it resulted in blank lines when running on Windows and could cause interwoven output on Unix (issue [#91](https://github.com/roskakori/pygount/issues/91)).

## Version 1.5.0, 2022-12-30

- Remove support for Python 3.6 and update dependencies (issue [#93](https://github.com/roskakori/pygount/issues/93)).

## Version 1.4.0, 2022-04-09

- Add progress bar during scan phase and improve visual design of `--format=summary` (contributed by Stanislav Zmiev, issue [#73](https://github.com/roskakori/pygount/issues/73)).
- Add percentages to API. For example in addition to `code_count` now there also is `code_percentage`.

## Version 1.3.0, 2022-01-06

- Fix computation of "lines per second", which was a copy and paste of "files per second".
- Add JSON as additional output `--format`, see [JSON](json.md) for details (issue [#62](https://github.com/roskakori/pygount/issues/62)).
- Add detection of [GitHub community files](https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions) without a suffix as text (issue [#54](https://github.com/roskakori/pygount/issues/54)).
- Change the build process to [poetry](https://python-poetry.org/) to change several messy configuration files into a single even more messy configuration file.

## Version 1.2.5, 2021-05-16

- Remove support for Python 3.5. Probably it still works, but there is no easy way to test this anymore because 3.5 reached its end of life a while ago.

## Version 1.2.4, 2020-08-11

- Fix scanning of "." (for current folder), which was skipped entirely (issue [#56](https://github.com/roskakori/pygount/issues/56)).

## Version 1.2.3, 2020-07-05

- Improve detection of text files by trying to guess a lexer for `*.txt` before assuming it is text. This basically fixes the detection of `CMakelists.txt` as CMake file (issue [#53](https://github.com/roskakori/pygount/issues/53)). However, it will only work with some files due to multiple issues with the regular expression Pygments used in versions up to 2.6.1 to detect CMake headers. This should be fixed once pull request
  [#1491](https://github.com/pygments/pygments/pull/1491) is applied.

## Version 1.2.2, 2020-06-24

- Changed preprocessor statements to count as code, unlike Pygments which treats them as special comments (contributed by nkr0, issue [#51](https://github.com/roskakori/pygount/issues/51)).

## Version 1.2.1, 2020-04-02

- Fix broken links in README on PyPI by moving the documentation to [ReadTheDocs](https://pygount.readthedocs.io/).
- Improve API:
  - Change factory functions to methods and added deprecation warnings:
    - `source_analysis` → `SourceAnalysis.from_file`
    - `pseudo_source_analysis` → `SourceAnalysis.from_state`
  - Change attributes in `SourceAnalysis` to read-only properties.
  - Rename properties holding counts from `xxx` to `xxx_count`.
  - Add API reference to documentation.
  - Add a couple of type hints and assertions.

## Version 1.2.0, 2020-03-30

- Add file count to summary.
- Change installation to fail when attempting to install on Python earlier than 3.5.
- Improve API:
  - Change `SourceAnalysis.state` to be a proper enum instead of a string.
  - Add `ProjectSummary` to summarize multiple files.
- Clean up the project:
  - Change continuous integration from Travis CI to GitHub actions in the hope that the CI build does not automatically break after a while because things constantly change in the CI backend.
  - Change README format from reStructuredText to Markdown.
  - Improve badges in README: added a badge for supported Python versions and unified the layout by using <https://shields.io>.
  - Remove obsolete development files (for ant, tox etc).

## Version 1.1.0, 2020-03-10

- Fix `--folders-to-skip` and `--names-to-skip` which simply were
  ignored (contributed by pclausen, issue [#17](https://github.com/roskakori/pygount/issues/17)).
- Add option `--format=summary` to get a language overview and sum total (based on a contribution by Yuriy Petrovskiy, issue [#16](https://github.com/roskakori/pygount/issues/16)).
- Add Python 3.7 and 3.8 to the list of supported versions.
- Drop support for Python 3.3 and 3.4, mostly because it became hard to test without going through major hoops.

## Version 1.0.0, 2017-07-04

- Fix confusing warning about XML file `<unknown>` caused by SAX parser. As a workaround, `<unknown>` is now replaced by the actual path of the XML file that cannot be parsed.
- Add Python 3.6 to the list of supported versions (issue [#14](https://github.com/roskakori/pygount/issues/14)).

## Version 0.9, 2017-05-04

- Fix `AssertionError` when option `--encoding=chardet` was specified.
- Change the warning message "no fallback encoding specified, using \<encoding\>" to a debug message because it did not add any interesting information as the encoding actually used is visible in the info message for each file.
- Add detection of binary files and exclude them from the analysis. In particular Django model objects (`*.mo`) are not considered Modelica source code anymore (issue
  [#11](https://github.com/roskakori/pygount/issues/11)).
- Add detection of DocBook XML by DTD (issue [#10](https://github.com/roskakori/pygount/issues/10)).
- Add support for suffixes to indicate PL/SQL files according to [Oracle FAQ entry on file extensions](http://www.orafaq.com/wiki/File_extensions) (issue [#12](https://github.com/roskakori/pygount/issues/12)).
- Add possibility to specify a fallback encoding for encoding 'chardet'. Use e.g. `--encoding=chardet;cp1252`.

## Version 0.8, 2016-10-07

- Fix option `--verbose`. Now each analyzed source code results in at least one informational message in the log.
- Add detection of duplicates using size and then MD5 code as criteria (issue [#2](https://github.com/roskakori/pygount/issues/2)). Use the option `--duplicates` to still count duplicate source code.
- Improve detection of programming language, which is now more consistent and yields the same language between Python invocations.

## Version 0.7, 2016-09-28

- Fix that option `--generated` was ignored.
- Add support for a couple of languages not supported by `pygments` yet:
  - m4, VBScript, and WebFOCUS use minimalistic lexers that can distinguish between comments and code.
  - OMG IDL repurposes the existing Java lexer.
- Add detection of certain XML dialects as separate language (issue [#8](https://github.com/roskakori/pygount/issues/8)).

## Version 0.6, 2016-09-26

- Fix that source files could end up as `__error__` if the first non-ASCII characters showed up only after kilobyte 16 and the encoding was not UTF-8. Now pygount attempts to read the whole file as UTF-8 before assuming it actually is UTF-8.
- Change lines in plain text files to count as comments (issue [#9](https://github.com/roskakori/pygount/issues/9)). Before pygments treated them as `ResourceBundle`.
- Change that empty files have `__empty__` as language (issue [#7](https://github.com/roskakori/pygount/issues/7)).
- Extend workaround for [pygments issue #1284](https://bitbucket.org/birkenfeld/pygments-main/issues/1284) to replace any lexer `*+Evoque` by `*`.

## Version 0.5, 2016-09-22

- Add that generated source code is excluded from analysis (issue [#1](https://github.com/roskakori/pygount/issues/1)). Use option `--generated` to specify patterns that indicate generated code.
- Add workaround for pygments sometimes detecting the same XML file as XML and other times as XML+Evoque (probably depending on the hash seed). Now XML+Evoque is always changed to XML.
- Add `__pycache__` as default `--folders-to-skip`.
- Add notes on pseudo languages for source code that cannot be analyzed.

## Version 0.4, 2016-09-11

- Fixed `LookupError` on broken encoding in magic comment (issue [#4](https://github.com/roskakori/pygount/issues/4)).
- Add options `--folders-to-skip` and `--names-to-skip` to specify which files should be excluded from analysis.
- Add comma (`,`) and colon (`:`) to list of "white characters" that do not count as code if there is nothing else in the line.
- Improve pattern matching: for all options that according to `--help` take `PATTERNS` you can now specify that the patterns are regular expressions instead of shell patterns (using `[regex]`) and that they should extend the default patterns (using `[...]`).
- Improve documentation: added notes on how code is counted and how pygount compares to other similar tools.

## Version 0.3, 2016-08-20

- Fix `@rem` comments in DOS batch files (issue [#3](https://github.com/roskakori/pygount/issues/3)).
- Clean up code.

## Version 0.2, 2016-07-10

- Fix that files starting with underscore (e.g. `__init__.py`) were excluded from analysis.
- Change `chardet` package to be optional.
- Add possibility to specify single files and glob patterns to analyze.
- Add that lines containing only certain characters are treated as white space instead of code. Currently, this concerns brackets (`()[]{}`) and semicolon (`;`).
- Add that Python's `pass` statement is treated as white space instead of code.
- Clean up and (slightly) optimized code.

## Version 0.1, 2016-07-05

- Initial public release.


================================================
FILE: docs/continuous-integration.md
================================================
# Continuous integration

Pygount can produce output that can be processed by the [SLOCCount plug-in](https://wiki.jenkins-ci.org/display/JENKINS/SLOCCount+Plugin) for the [Jenkins](https://jenkins.io/) continuous integration server.

It's recommended to run pygount as one of the first steps in your build process before any undesired files like compiler targets or generated source code are built.

An example "Execute shell" build step for Jenkins is:

```bash
$ pygount --format=cloc-xml --out cloc.xml --suffix=py --verbose
```

Then add a post-build action "Publish SLOCCount analysis results" and set "SLOCCount report" to "cloc.xml".


================================================
FILE: docs/contributing.md
================================================
# Contributing

## Project setup

In case you want to play with the source code or contribute changes, proceed as follows:

1.  Check out the project from GitHub:
    ```bash
    $ git clone https://github.com/roskakori/pygount.git
    $ cd pygount
    ```
2.  Install [uv](https://docs.astral.sh/uv/).
3.  Create the virtual environment and install the required packages:
    ```bash
    $ uv sync --all-groups
    ```
4.  Install the pre-commit hook:
    ```bash
    $ uv run pre-commit install
    ```

## Testing

To run the test suite:

```bash
$ uv run pytest
```

To build and browse the coverage report in HTML format:

```bash
$ sh scripts/test_coverage.sh
$ open htmlcov/index.html  # macOS only
```

## Documentation

To build the documentation in HTML format:

```bash
$ uv run scripts/build_documentation.sh
$ open docs/_build/html/index.html  # macOS only
```

## Coding guidelines

The code throughout uses a natural naming schema avoiding abbreviations, even for local variables and parameters.

Many coding guidelines are automatically enforced (and some even fixed automatically) by the pre-commit hook. If you want to check and clean up the code without performing a commit, run:

```bash
$ uv run pre-commit run --all-files
```

In particular, this applies checks from [black](https://black.readthedocs.io/en/stable/), [flake8](https://flake8.pycqa.org/) and [isort](https://pypi.org/project/isort/).

## Publish a new version

This section is only relevant for developers with access to the PyPI project.

To add a new release, first update the `pyproject.toml`:

```toml
[project]
version = "3.x.x"
```

Next, build the project and run the tests to ensure everything works:

```sh
$ rm -rf dist  # Remove any files from previous builds.
$ uv build
$ uv run pytest
```

Then create a tag in the repository:

```sh
$ git tag -a -m "Tag version 3.x.x" v3.x.x
$ git push --tags
```

Publish the new version on PyPI:

```sh
$ uv publish
```

Finally, add a GitHub release based on the tag from above to the [release page](https://github.com/roskakori/pygount/releases).


================================================
FILE: docs/index.md
================================================
# Pygount

Pygount is a command line tool to scan folders for source code files and count the number of source code lines in them. It is similar to tools like [sloccount](http://www.dwheeler.com/sloccount/) and [cloc](http://cloc.sourceforge.net/) but uses the [pygments](http://pygments.org/) package to parse the source code and consequently can analyze any [programming language supported by pygments](http://pygments.org/languages/).

The name is a combination of "pygments" and "count".

Pygount is open source and distributed under the [BSD license](https://opensource.org/licenses/BSD-3-Clause). The source code is available from <https://github.com/roskakori/pygount>.


================================================
FILE: docs/installation.md
================================================
# Installation

Pygount is available from [PyPI](https://pypi.python.org/pypi/pygount) and can be installed by running:

```bash
pip install pygount
```

Using [uv](https://docs.astral.sh/uv/), it can also be run directly. For example:

```bash
uvx pygount --version
```


================================================
FILE: docs/json.md
================================================
# JSON

The JavaScript Object Notation (JSON) is widely used to interchange data. Running pygount with `--format=json` is a simple way to provide the results of an analysis for further processing.

## General format

The general structure of the resulting JSON is:

```JSON
{
  "formatVersion": "1.1.0",
  "pygountVersion": "1.8.0",
  "files": [...],
  "languages": [...],
  "runtime": {...},
  "summary": {...}
}
```

The naming of the entries deliberately uses camel case to conform to the
[JSLint](https://www.jslint.com/) guidelines.

Both `formatVersion` and `pygountVersion` use [semantic
versioning](https://semver.org/). For more information about how this
JSON evolved, see [JSON format history](#json-format-history).

### Files

With `files` you can access a list of files analyzed, for example:

```JavaScript
{
  "codeCount": 171,
  "documentationCount": 28,
  "emptyCount": 56,
  "group": "pygount",
  "isCountable": true,
  "language": "Python",
  "lineCount": 266,
  "path": "/tmp/pygount/pygount/write.py",
  "state": "analyzed",
  "stateInfo": null,
  "sourceCount": 182
}
```

The `*Count` fields have the following meaning:

- `codeCount`: The number of lines that contain code excluding
  [Pure string lines](background.md#pure-string-lines)
- `documentationCount`: The number of lines containing comments
- `emptyCount`: The number of empty lines, which includes
  "[No operations](background.md#no-operations)" lines
- `lineCount`: Basically the number of lines shown in your editor
  or as computed by shell commands like `wc -l`
- `sourceCount`: The source lines of code, similar to the traditional
  SLOC
- `stringCount`: The number of [Pure string lines](background.md#pure-string-lines)

Here, `sourceCount` is the number of source lines of code (SLOC), `documentationCount` the number of lines containing comments and `emptyCount` the number of empty lines.

The `state` can have one of the following values:

- analyzed: successfully analyzed
- binary: the file is a [binary file](background.md#binary-files)
- duplicate: the file is a [duplicate](usage.md#-duplicates) of another
- empty: the file is empty (file size = 0)
- error: the source could not be parsed; in this case, `stateInfo` contains a message with more details
- generated: the file has been generated as specified with `--generated`
- unknown: pygments does not offer any lexer to analyze the file

### Languages

In `languages` the summary for each language is available, for example:

```JSON
{
  "documentationCount": 429,
  "documentationPercentage": 11.776008783969257,
  "codeCount": 2332,
  "codePercentage": 64.01317595388416,
  "emptyCount": 706,
  "emptyPercentage": 19.3796321712874,
  "fileCount": 20,
  "filePercentage": 48.78048780487805,
  "isPseudoLanguage": false,
  "language": "Python",
  "sourceCount": 2508,
  "sourcePercentage": 68.84435904474334,
  "stringCount": 176,
  "stringPercentage": 4.831183090859182
}
```

### Summary

In `summary` the total counts across the whole project can be accessed, for example:

```JSON
{
  "totalCodeCount": 4366,
  "totalCodePercentage": 68.38972431077694,
  "totalDocumentationCount": 463,
  "totalDocumentationPercentage": 7.25250626566416,
  "totalEmptyCount": 1275,
  "totalEmptyPercentage": 19.971804511278197,
  "totalFileCount": 41,
  "totalSourceCount": 4646,
  "totalSourcePercentage": 72.77568922305764,
  "totalStringCount": 280,
  "totalStringPercentage": 4.385964912280702
}
```

### Runtime

The `runtime` entry collects general information about how well pygount performed in collecting the information, for example:

```JSON
{
  "durationInSeconds": 0.6333059999999999,
  "filesPerSecond": 64.73963613166464,
  "finishedAt": "2024-05-13T16:14:31.977070+00:00",
  "linesPerSecond": 10080.435050354807,
  "startedAt": "2024-05-13T16:14:31.343764+00:00"
}
```

## Pretty printing

Because the output is concise and consequently mostly illegible for a human reader, you might want to pipe it through a pretty printer. As you already have python installed, the easiest way is:

```sh
pygount --format json | python -m json.tool
```

Another alternative would be [jq](https://stedolan.github.io/jq/):

```sh
pygount --format json | jq .
```

## JSON format history

v1.1.0, pygount 1.8.0

- Add `codeCount` and `lineCount`

v1.0.0, pygount 1.3.0

- Initial version


================================================
FILE: docs/usage.md
================================================
# Usage

## General

Run and specify the folder to analyze recursively, for example:

```bash
$ pygount ~/development/sometool
```

If you omit the folder, the current folder of your shell is used as a starting point. Apart from folders you can also specify single files and shell patterns (using `?`, `*` and ranges like `[a-z]`).

Certain files and folders are automatically excluded from the analysis:

- files starting with dot (`.`) or ending in tilde (`~`)
- folders starting with dot (`.`) or named `_svn`.

### `--folders-to-skip LIST`, `--names-to-skip LIST`

To specify alternative patterns, use `--folders-to-skip` and `--names-to-skip`. Both take a comma separated list of patterns, see below on the pattern syntax. To, for example, also prevent folders starting with two underscores (`__`) from being analyzed, specify `--folders-to-skip=[...],__*`.

### `--suffix LIST`

To limit the analysis on certain file types, you can specify a comma separated list of suffixes to take into account, for example `--suffix=py,sql,xml`.

### `--out FILE`

By default, the results of the analysis are written to the standard output. To redirect the output to a file, use for example `--out=counts.txt`.

To explicitly redirect to the standard output specify `--out=STDOUT`.

### `--format FORMAT`

By default, the results of the analysis are written to the standard output in a format similar to sloccount. To redirect the output to a file, use e.g. `--out=counts.txt`. To change the format to an XML file similar to cloc, use `--format=cloc-xml`.

To just get a quick grasp of the languages used in a project and their respective importance use `--format=summary` which provides a language overview and a sum total. For example, pygount's summary looks like this:

```
Language          Files    %     Code    %     Comment    %
----------------  -----  ------  ----  ------  -------  ------
Python               19   51.35  1924   72.99      322   86.10
reStructuredText      7   18.92   332   12.59        7    1.87
markdown              3    8.11   327   12.41        1    0.27
Batchfile             1    2.70    24    0.91        1    0.27
YAML                  1    2.70    11    0.42        2    0.53
Makefile              1    2.70     9    0.34        7    1.87
INI                   1    2.70     5    0.19        0    0.00
TOML                  1    2.70     4    0.15        0    0.00
Text                  3    8.11     0    0.00       34    9.09
----------------  -----  ------  ----  ------  -------  ------
Sum total            37          2636              374
```

The summary output is designed for human readers, and the column widths adjust to the data.

For further processing the results of pygount, `--format=json` should be the easiest to deal with. For more information, see the chapter on [JSON](json.md).

### `--merge-embedded-languages`

Some languages such as HTML or JavaScript allow embedding other languages in their source code. In that case, the source code is assigned to a language that contains both the base and the embedded language in its name, for example:

- HTML+Jinja
- JavaScript+Lasso

If you prefer to count all variants of a base language only under its own name, specify `--merge-embedded-languages`. The example above will then show as:

- HTML
- JavaScript

Consequently, multiple different embedded languages will all count for their common base language.

## Remote repositories

Additionally to local files, pygount can analyze remote git repositories:

```bash
$ pygount https://github.com/roskakori/pygount.git
```

In the background, this creates a shallow clone of the repository in a temporary folder that is removed automatically after the analysis.

Therefore, you need to have at least read access to the repository.

If you want to analyze a specific revision, specify it at the end of the URL:

```bash
$ pygount https://github.com/roskakori/pygount.git/v1.6.0
```

The remote URL supports the git standard protocols: git, HTTP/S and SSH.

```bash
$ pygount git@github.com:username/project.git
```

You can specify multiple repositories, for example, to include both the web application, command line client and docker container of the [Weblate](https://weblate.org/) project:

```bash
$  pygount https://github.com/WeblateOrg/weblate.git https://github.com/WeblateOrg/wlc.git  https://github.com/WeblateOrg/docker.git
```

And you can even mix local files and remote repositories:

```bash
$ pygount ~/projects/some https://github.com/roskakori/pygount.git
```

## Patterns

Some command line arguments take patterns as values.

By default, patterns are shell patterns using `*`, `?` and ranges like `[a-z]` as placeholders. Depending on your platform, they are case-sensitive (Unix) or not (macOS, Windows).

If a pattern starts with `[regex]` you can specify a comma separated list of regular expressions instead using all the constructs supported by the [Python regular expression
syntax](https://docs.python.org/3/library/re.html#regular-expression-syntax). Regular expressions are case-sensitive unless they include a `(?i)` flag.

If the first actual pattern is `[...]`, default patterns are included. Without it, defaults are ignored and only the patterns explicitly stated are taken into account.

### `--generated`

So for example, to specify that generated code can also contain the German word "generiert" in a case-insensitive way use `--generated="[regex][...](?i).*generiert"`.

### `--generated-names`

In addition to the source code, the file name can indicate that a source code is generated. For example, `--generated-names="*.lock,*.g.dart"`.

The default already recognizes several standard generated names.

## Counting duplicates

### `--duplicates`

By default, pygount prevents multiple source files with exactly the same
content from being counted more than once.

For two files to be considered duplicates, the following conditions must be met:

1.  Both files have the same size.
2.  Both files have the same [SHA-256](https://en.wikipedia.org/wiki/SHA-2) hashcode.

This allows for efficient detection with a tiny possibility for false positives.

However, it also prevents detection of files with only minor differences as duplicates. Examples are files that are identical except for additional white space, empty lines or different line endings.

If you still want to count duplicates multiple times, specify `--duplicates`. This will also result in a minor performance gain of the analysis.

## Source code encoding

### --encoding ENCODING\[;FALLBACK\]

When reading source code, pygount automatically detects the encoding. It uses a simple algorithm where it recognizes BOM, XML declarations such as:

```xml
<?xml encoding='cp1252'?>
```

and "magic" comments such as:

```ruby
# encoding: cp1252
# coding: cp1252
# -*- coding: cp1252 -*-
```

If the file does not have an appropriate heading, pygount attempts to read it using UTF-8. If this fails, it reads the file using a fallback encoding (by default [CP1252](https://en.wikipedia.org/wiki/Windows-1252)) and ignores any encoding errors.

You can change this behavior using the `--encoding` option:

- To keep the automatic analysis and use a different fallback encoding, specify for example `--encoding=automatic;iso-8859-15`.
- To use automatic detection based on heuristics, specify `--encoding=chardet`. For this to work, the [chardet](https://pypi.python.org/pypi/chardet)
  package must be installed.
- To use a specific encoding (for all files analyzed), use for example `--encoding=iso-8859-15`.

## Pseudo languages

If a source code is not counted, the number of lines is 0 and the language shown is a pseudo language indicating the reason:

- `__binary__` - the source code is a binary file.
- `__duplicate__` - the source code is a duplicate as described at the
  command line option `--duplicates`.
- `__empty__` - the source code is an empty file with a size of 0 bytes.
- `__error__` - the source code could not be parsed; for example, due to an I/O
  error.
- `__generated__` - the source code is generated according to the
  command line option `--generated`.
- `__unknown__` - pygments does not provide a lexer to parse the source
  code.

## Other information

### `--verbose`

If `--verbose` is specified, pygount logs detailed information about what it is doing.

### `--help`

To get a description of all the available command line options, run:

```bash
$ pygount --help
```

### `--version`

To get pygount's current version number, run:

```bash
$ pygount --version
```


================================================
FILE: mkdocs.yaml
================================================
site_name: "pygount"
site_url: "https://pygount.readthedocs.io/"
site_author: "Thomas Aglassinger <roskakori@users.sourceforge.net>"
site_description: "Documentation of pygount, a tool to count lines of code for hundreds of languages using pygments"

repo_url: "https://github.com/roskakori/pygount"

theme:
  name: material
  features:
    - navigation.footer

markdown_extensions:
  - attr_list
  - codehilite
  - toc:
      permalink: true

nav:
  - "Overview": "index.md"
  - "Installation": "installation.md"
  - "Usage":
      - "Usage": "usage.md"
      - "JSON format": "json.md"
      - "Continuous integration": "continuous-integration.md"
      - "Background": "background.md"
  - "API": "api.md"
  - "Changes": "changes.md"
  - "Contributing": "contributing.md"

validation:
  nav:
    omitted_files: warn


================================================
FILE: pygount/__init__.py
================================================
"""
Pygount counts lines of source code using pygments lexers.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
from importlib.metadata import version

from .analysis import DuplicatePool, SourceAnalysis, SourceScanner, SourceState, encoding_for
from .common import Error, OptionError
from .summary import LanguageSummary, ProjectSummary

__version__ = version(__name__)

__all__ = [
    "DuplicatePool",
    "Error",
    "LanguageSummary",
    "OptionError",
    "ProjectSummary",
    "SourceAnalysis",
    "SourceScanner",
    "SourceState",
    "__version__",
    "encoding_for",
]


================================================
FILE: pygount/analysis.py
================================================
"""
Functions to analyze source code and count lines in it.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import codecs
import collections
import glob
import hashlib
import itertools
import logging
import os
import re
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from enum import Enum
from io import SEEK_CUR, BufferedIOBase, IOBase, RawIOBase, TextIOBase
from pathlib import Path
from re import Pattern
from typing import Optional, Union

import pygments.lexer
import pygments.lexers
import pygments.lexers.html
import pygments.token
import pygments.util

import pygount.common
import pygount.lexers
import pygount.xmldialect
from pygount.common import WHITE_SPACE_CHARACTERS, mapped_repr, matching_regex
from pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any

#: Matches source patterns that start with an HTTP or HTTPS scheme.
HTTP_URL_REGEX = re.compile(r"^(https?://)")
#: Hosts for which an HTTP(S) repository URL is accepted even without a ".git" suffix.
_ALLOWED_GIT_PLATFORMS = ["github.com", "bitbucket.org", "gitlab.com"]
#: Regex alternation of the allowed hosts, escaped so that "." only matches a literal dot.
_ALLOWED_GIT_PLATFORM_CHOICES_PATTERN = "|".join(map(re.escape, _ALLOWED_GIT_PLATFORMS))
#: Matches the beginning of a repository address on one of the allowed hosts:
#: a scheme (or "git@"), the host, a "/" and two non-empty path segments.
GIT_REPO_REGEX = re.compile(rf"^(https?://|git@)({_ALLOWED_GIT_PLATFORM_CHOICES_PATTERN})/[^/]+/[^/]+")

# Attempt to import chardet.
try:
    import chardet.universaldetector

    _detector = chardet.universaldetector.UniversalDetector()
except ImportError:
    # chardet could not be imported, so no detector is available.
    _detector = None
#: Whether a chardet detector is available.
has_chardet = bool(_detector)

#: Fallback encoding to use if no encoding is specified
DEFAULT_FALLBACK_ENCODING = "cp1252"

#: Default glob patterns for folders not to analyze.
DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT = ", ".join(
    [
        ".?*",
        "_svn",  # Subversion hack for Windows
        "__pycache__",  # Python byte code
    ]
)

#: Pygments token type; we need to define our own type because pygments' ``_TokenType`` is internal.
TokenType = type(pygments.token.Token)

_BASE_LANGUAGE_REGEX = re.compile(r"^(?P<base_language>[^+]+)\+[^+].*$")

#: BOMs to indicate that a file is a text file even if it contains zero bytes.
_TEXT_BOMS = (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8)


class SourceState(Enum):
    """
    Possible values for :py:attr:`SourceAnalysis.state`.

    For every state except ``analyzed``, :py:meth:`SourceAnalysis.from_state`
    reports the pseudo language ``__<name>__`` built from the member name.
    """

    #: successfully analyzed
    analyzed = 1
    #: source code is a binary
    binary = 2
    #: source code is an identical copy of another
    duplicate = 3
    #: source code is empty (file size = 0)
    empty = 4
    #: source could not be parsed
    error = 5
    #: source code has been generated
    generated = 6
    # TODO: 'huge' = auto()  # source code exceeds size limit
    #: pygments does not offer any lexer to analyze the source
    unknown = 7


#: Default patterns for regular expressions to detect generated code.
#: The '(?i)' indicates that the patterns are case-insensitive.
DEFAULT_GENERATED_LINE_PATTERNS_TEXT = pygount.common.REGEX_PATTERN_PREFIX + ", ".join(
    [
        r"(?i).*autogenerated",
        r"(?i).*automatically generated",
        r"(?i).*do not edit",
        r"(?i).*generated with the .+ utility",
        r"(?i).*this is a generated file",
        r"(?i).*generated automatically",
    ]
)

#: Default patterns for file names that are considered to be generated.
DEFAULT_GENERATED_NAME_PATTERNS_TEXT = ", ".join(
    [
        "*.g.dart",  # See, for example, <https://codewithandrea.com/articles/dart-flutter-code-generation/>
        "*.lock",  # For example, Cargo.lock, poetry.lock, uv.lock.
        "npm-shrinkwrap.json",  # See <https://docs.npmjs.com/cli/v11/configuring-npm/npm-shrinkwrap-json>.
        "go.sum",  # See <https://go.dev/ref/mod#go-sum-files>.
        "package-lock.json",  # See <https://docs.npmjs.com/cli/v11/configuring-npm/package-lock-json>.
        "pnpm-lock.yaml",  # See <https://pnpm.io/cli/install>.
    ]
)

#: Default glob patterns for file names not to analyze.
DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT = ", ".join([".*", "*~"])

_log = logging.getLogger("pygount")

_MARK_TO_NAME_MAP = (("c", "code"), ("d", "documentation"), ("e", "empty"), ("s", "string"))
#: Mapping from a byte order mark (BOM) to the name of the encoding it indicates.
_BOM_TO_ENCODING_MAP = collections.OrderedDict(
    (
        # NOTE: We need an ordered dict due to the overlap between utf-32-le and utf-16-le:
        #  the UTF-32 LE BOM starts with the same two bytes as the UTF-16 LE BOM, so the
        #  longer one is listed first.
        (codecs.BOM_UTF8, "utf-8-sig"),
        (codecs.BOM_UTF32_LE, "utf-32-le"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
        (codecs.BOM_UTF32_BE, "utf-32-be"),
    )
)
_XML_PROLOG_REGEX = re.compile(r'<\?xml\s+.*encoding="(?P<encoding>[-_.a-zA-Z0-9]+)".*\?>')
_MAGIC_COMMENT_LINE_START_REGEXES = [
    re.compile(f"^{pattern}\\s*(?P<remainder>.+)$", re.IGNORECASE)
    for pattern in [
        r"#+",  # Python, Ruby
        r"//+",  # C++, Dart, Java, ...
        r"/\*+",  # C etc
        r"--+",  # Ada, SQL, VHDL
        r";+",  # Assembly
        r"%+",  # Latex, MatLab, Prolog
        r"rem\s",  # Basic, Windows batch
        r"\*+",  # Pascal
        r"\{",  # Pascal
    ]
]
_MAGIC_COMMENT_LINE_REMAINDER_REGEXES = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in [
        # Covers for example "encoding: cp1252" and "encoding=cp1252".
        r"(en)?coding\s*[:=]\s*(?P<encoding>[-_.a-z0-9]+)\b",
        # Covers for example "-*- coding: cp1252 -*-".
        r"-\*-\s*coding\s*[:=]\s*(?P<encoding>[-_.a-z0-9]+)\s*(;.+\s*)?-\*-\s*",
    ]
]

_STANDARD_PLAIN_TEXT_NAME_PATTERNS = (
    # Text files for (moribund) gnits standards.
    "authors",
    "bugs",
    "changelog",
    "copying",
    "install",
    "license",
    "news",
    "readme",
    "thanks",
    # GitHub community recommendations, see
    # <https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions>.
    # By now, in practice most projects use a suffix like "*.md" but some older ones
    # still might have such files without suffix.
    "code_of_conduct",
    "contributing",
    "support",
    # Other common text files.
    "changes",
    "faq",
    "readme\\.1st",
    "read\\.me",
    "todo",
)
_PLAIN_TEXT_PATTERN = "(^" + "$)|(^".join(_STANDARD_PLAIN_TEXT_NAME_PATTERNS) + "$)"
#: Regular expression to detect plain text files by name.
_PLAIN_TEXT_NAME_REGEX = re.compile(_PLAIN_TEXT_PATTERN, re.IGNORECASE)

_MARK_UP_NAME_PATTERN = r"^.*\.(md|rst|txt|\d+)$"
_MARK_UP_NAME_REGEX = re.compile(_MARK_UP_NAME_PATTERN, re.IGNORECASE)

#: Mapping for file suffixes to lexers for which pygments offers no official one.
_SUFFIX_TO_FALLBACK_LEXER_MAP = {
    "fex": pygount.lexers.MinimalisticWebFocusLexer(),
    "idl": pygount.lexers.IdlLexer(),
    "m4": pygount.lexers.MinimalisticM4Lexer(),
    "svg": pygments.lexers.html.XmlLexer(),  # TODO#213 Remove SVG hack.
    "txt": pygount.lexers.PlainTextLexer(),
    "vbe": pygount.lexers.MinimalisticVBScriptLexer(),
    "vbs": pygount.lexers.MinimalisticVBScriptLexer(),
}
for _oracle_suffix in ("pck", "pkb", "pks", "pls"):
    _SUFFIX_TO_FALLBACK_LEXER_MAP[_oracle_suffix] = pygments.lexers.get_lexer_by_name("plpgsql")


@dataclass(frozen=True)
class PathData:
    """
    A source file found by :py:class:`SourceScanner` together with the group
    it belongs to.
    """

    #: Path of the source file.
    source_path: str
    #: Name of the logical group the source file belongs to.
    group: str
    #: Temporary folder the file lives in, if any; the scanner sets this for
    #: files from cloned git repositories.
    tmp_dir: Optional[str] = None


def is_markup_file(source_path: str) -> bool:
    """
    ``True`` if the base name of ``source_path`` matches the name pattern for
    mark up files.
    """
    file_name = os.path.basename(source_path)
    name_match = _MARK_UP_NAME_REGEX.match(file_name)
    return name_match is not None


class DuplicatePool:
    """
    A pool that collects information about potential duplicate files.

    Two files are considered duplicates if they have the same size and the
    same SHA-256 hash. Hashes are computed lazily: a file is only hashed once
    a second file with the same size shows up.
    """

    def __init__(self):
        # Maps a file size to the paths seen so far that have this size.
        self._size_to_paths_map = {}
        # Maps (size, SHA-256 digest) to the first path seen with this content.
        self._size_and_hash_to_path_map = {}

    @staticmethod
    def _hash_for(path_to_hash):
        """
        SHA-256 digest of the contents of ``path_to_hash``, read in chunks so
        large files do not have to fit in memory.
        """
        buffer_size = 1024 * 1024
        sha256_hash = hashlib.sha256()
        with open(path_to_hash, "rb", buffer_size) as file_to_hash:
            while data := file_to_hash.read(buffer_size):
                sha256_hash.update(data)
        return sha256_hash.digest()

    def duplicate_path(self, source_path: str) -> Optional[str]:
        """
        Path to a duplicate for ``source_path`` or ``None`` if no duplicate exists.

        The path returned is the first file seen with the same content.

        Internally information is stored to identify possible future duplicates of
        ``source_path``.

        :raises OSError: if ``source_path`` cannot be accessed or read
        """
        source_size = os.path.getsize(source_path)
        paths_with_same_size = self._size_to_paths_map.get(source_size)
        if paths_with_same_size is None:
            # First file with this size: remember it but defer hashing until
            # another file with the same size makes a comparison necessary.
            self._size_to_paths_map[source_size] = [source_path]
            return None

        source_hash = DuplicatePool._hash_for(source_path)
        if len(paths_with_same_size) == 1:
            # Retrofit the initial path with the same size and its hash. Because
            # the list grows below, this happens only once per size.
            initial_path_with_same_size = paths_with_same_size[0]
            initial_hash = DuplicatePool._hash_for(initial_path_with_same_size)
            self._size_and_hash_to_path_map[(source_size, initial_hash)] = initial_path_with_same_size
        paths_with_same_size.append(source_path)

        size_and_hash = (source_size, source_hash)
        result = self._size_and_hash_to_path_map.get(size_and_hash)
        if result is None:
            # Keep the first path as the original so later duplicates all
            # refer to the same file.
            self._size_and_hash_to_path_map[size_and_hash] = source_path
        return result


class SourceAnalysis:
    """
    Results from analyzing a source path.

    Prefer the factory methods :py:meth:`from_file()` and :py:meth:`from_state` to
    calling the constructor.
    """

    def __init__(
        self,
        path: str,
        language: str,
        group: str,
        code: int,
        documentation: int,
        empty: int,
        string: int,
        state: SourceState,
        state_info: Optional[str] = None,
    ):
        """
        Store the results of an analysis as they are.

        :param path: path of the analyzed source
        :param language: name of the language or pseudo language
        :param group: name of the logical group the source belongs to
        :param code: number of lines containing code
        :param documentation: number of lines containing documentation
        :param empty: number of empty lines
        :param string: number of lines containing only strings
        :param state: state of the analysis
        :param state_info: additional information about ``state``; must be set
          exactly for the states checked in :py:meth:`_check_state_info`
        """
        SourceAnalysis._check_state_info(state, state_info)
        self._path = path
        self._language = language
        self._group = group
        self._code = code
        self._documentation = documentation
        self._empty = empty
        self._string = string
        self._state = state
        self._state_info = state_info

    @staticmethod
    def from_state(
        source_path: str,
        group: str,
        state: SourceState,
        state_info: Optional[str] = None,
        tmp_dir: Optional[str] = None,
    ) -> "SourceAnalysis":
        """
        Factory method to create a :py:class:`SourceAnalysis` with all counts
        set to 0 and everything else according to the specified parameters.

        :param source_path: path of the source the state applies to
        :param group: name of the logical group the source belongs to
        :param state: any state except :py:attr:`SourceState.analyzed`
        :param state_info: additional information required for some states,
          see :py:meth:`_check_state_info`
        :param tmp_dir: if set, everything up to and including the last
          occurrence of it is removed from ``source_path``
        """
        assert source_path is not None
        assert group is not None
        assert state != SourceState.analyzed, "use from_file() for analyzable sources"
        SourceAnalysis._check_state_info(state, state_info)
        reduced_path = source_path.rsplit(tmp_dir, maxsplit=1)[-1].lstrip(os.sep) if tmp_dir else source_path
        return SourceAnalysis(
            path=reduced_path,
            # Non-analyzed sources get a pseudo language derived from the state.
            language=f"__{state.name}__",
            group=group,
            code=0,
            documentation=0,
            empty=0,
            string=0,
            state=state,
            state_info=state_info,
        )

    @staticmethod
    def _check_state_info(state: SourceState, state_info: Optional[str]):
        assert state_info is None or isinstance(state_info, str), (
            f"state_info must be be None or str but is: {state_info!r}"
        )

        states_that_require_state_info = [SourceState.duplicate, SourceState.error, SourceState.generated]
        assert (state in states_that_require_state_info) == (state_info is not None), (
            f"state={state} and state_info={state_info} "
            f"but state_info must be specified for the following states: {states_that_require_state_info}"
        )

    @staticmethod
    def from_file(
        source_path: str,
        group: str,
        encoding: str = "automatic",
        fallback_encoding: str = "cp1252",
        generated_regexes: Optional[list[Pattern]] = None,
        duplicate_pool: Optional[DuplicatePool] = None,
        file_handle: Optional[IOBase] = None,
        merge_embedded_language: bool = False,
        tmp_dir: Optional[str] = None,
        *,
        generated_name_regexes: Optional[list[Pattern]] = None,
    ) -> "SourceAnalysis":
        """
        Factory method to create a :py:class:`SourceAnalysis` by analyzing
        the source code in ``source_path`` or the open file ``file_handle``.

        :param source_path: path to source code to analyze
        :param group: name of a logical group the source code belongs to, e.g. a
          package.
        :param encoding: encoding according to :func:`encoding_for`
        :param fallback_encoding: fallback encoding according to
          :func:`encoding_for`
        :param generated_regexes: list of regular expressions that, if found within the first few
          lines of a source code, identify it as generated source code for which SLOC should not
          be counted
        :param generated_name_regexes: list of regular expressions that, if the base file name
          matches, mark the file as generated so the SLOC are not counted
        :param duplicate_pool: a :class:`DuplicatePool` where information about possible duplicates is
          collected, or ``None`` if possible duplicates should be counted multiple times. It is only
          consulted for files read from ``source_path`` that are not empty, binary or of an unknown
          language.
        :param file_handle: a file-like object, or ``None`` to read and open the file from
          ``source_path``. If the file is open in text mode, it must be opened with the correct
          encoding.
        :param merge_embedded_language: If pygments detects a base and embedded language, the source
          code counts towards the base language. For example, "JavaScript+Lasso" counts as
          "JavaScript".
        :param tmp_dir: If a temporary directory was created, strip it from the path name. This happens
          right now only for git repositories.
        """
        assert encoding is not None

        result = None
        lexer = None
        source_code = None
        if generated_name_regexes is not None:
            generated_name_regex = matching_regex(Path(source_path).name, generated_name_regexes)
            if generated_name_regex is not None:
                result = SourceAnalysis.from_state(
                    source_path,
                    group,
                    SourceState.generated,
                    state_info=generated_name_regex.pattern,
                    tmp_dir=tmp_dir,
                )
        if result is None and file_handle is None:
            # These checks access the file system, so they only make sense if
            # the source is actually read from ``source_path``.
            source_size = os.path.getsize(source_path)
            if source_size == 0:
                _log.info("%s: is empty", source_path)
                result = SourceAnalysis.from_state(source_path, group, SourceState.empty, tmp_dir=tmp_dir)
            elif is_binary_file(source_path):
                _log.info("%s: is binary", source_path)
                result = SourceAnalysis.from_state(source_path, group, SourceState.binary, tmp_dir=tmp_dir)
            elif not has_lexer(source_path):
                _log.info("%s: unknown language", source_path)
                result = SourceAnalysis.from_state(source_path, group, SourceState.unknown, tmp_dir=tmp_dir)
            elif duplicate_pool is not None:
                # Only countable sources take part in duplicate detection; this
                # keeps a more specific state from being overridden.
                duplicate_path = duplicate_pool.duplicate_path(source_path)
                if duplicate_path is not None:
                    _log.info("%s: is a duplicate of %s", source_path, duplicate_path)
                    result = SourceAnalysis.from_state(
                        source_path, group, SourceState.duplicate, duplicate_path, tmp_dir=tmp_dir
                    )
        if result is None:
            try:
                if file_handle is None:
                    if encoding in ("automatic", "chardet"):
                        encoding = encoding_for(source_path, encoding, fallback_encoding)
                    with open(source_path, encoding=encoding) as source_file:
                        source_code = source_file.read()
                elif not isinstance(file_handle, TextIOBase):
                    # Binary handle: detect the encoding if necessary and decode manually.
                    if encoding in ("automatic", "chardet"):
                        encoding = encoding_for(source_path, encoding, fallback_encoding, file_handle=file_handle)
                    source_code = file_handle.read().decode(encoding)
                else:
                    # Text handle: the caller already took care of the encoding.
                    source_code = file_handle.read()
            except (LookupError, OSError, UnicodeError) as error:
                _log.warning("cannot read %s using encoding %s: %s", source_path, encoding, error)
                result = SourceAnalysis.from_state(
                    source_path, group, SourceState.error, str(error), tmp_dir=tmp_dir
                )
            if result is None:
                lexer = guess_lexer(source_path, source_code)
                assert lexer is not None
        if result is None:
            actual_generated_regexes = (
                generated_regexes
                if generated_regexes is not None
                else pygount.common.regexes_from(DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
            )
            if len(actual_generated_regexes) != 0:
                number_line_and_regex = matching_number_line_and_regex(
                    pygount.common.lines(source_code), actual_generated_regexes
                )
                if number_line_and_regex is not None:
                    number, _, regex = number_line_and_regex
                    message = f"line {number} matches {regex}"
                    _log.info("%s: is generated code because %s", source_path, message)
                    result = SourceAnalysis.from_state(
                        source_path, group, SourceState.generated, message, tmp_dir=tmp_dir
                    )
        if result is None:
            assert lexer is not None
            assert source_code is not None
            language = base_language(lexer.name) if merge_embedded_language else lexer.name
            if ("xml" in language.lower()) or (language == "Genshi"):
                dialect = pygount.xmldialect.xml_dialect(source_path, source_code)
                if dialect is not None:
                    language = dialect
            _log.info("%s: analyze as %s using encoding %s", source_path, language, encoding)
            mark_to_count_map = {"c": 0, "d": 0, "e": 0, "s": 0}
            is_markup = is_markup_file(source_path)
            for line_parts in _line_parts(lexer, source_code, is_markup=is_markup):
                # The last mark found wins, so a line counts as code in
                # preference to string, string in preference to documentation,
                # and as empty only if it has none of these marks.
                mark_to_increment = "e"
                for mark_to_check in ("d", "s", "c"):
                    if mark_to_check in line_parts:
                        mark_to_increment = mark_to_check
                mark_to_count_map[mark_to_increment] += 1
            reduced_path = source_path.rsplit(tmp_dir, maxsplit=1)[-1].lstrip(os.sep) if tmp_dir else source_path
            result = SourceAnalysis(
                path=reduced_path,
                language=language,
                group=group,
                code=mark_to_count_map["c"],
                documentation=mark_to_count_map["d"],
                empty=mark_to_count_map["e"],
                string=mark_to_count_map["s"],
                state=SourceState.analyzed,
                state_info=None,
            )

        assert result is not None
        return result

    @property
    def path(self) -> str:
        """
        Path of the source as passed to the constructor; the factory methods
        remove a temporary folder prefix from it if ``tmp_dir`` is specified.
        """
        return self._path

    @property
    def language(self) -> str:
        """
        The programming language the analyzed source code is written in; if
        :py:attr:`state` does not equal :py:attr:`SourceState.analyzed` this
        will be a pseudo language.
        """
        return self._language

    @property
    def group(self) -> str:
        """
        Group the source code belongs to; this can be any text useful to group
        the files later on. It is perfectly valid to put all files in the same
        group.

        (Note: this property is mostly there for compatibility with the
        original SLOCCount.)
        """
        return self._group

    @property
    def code_count(self) -> int:
        """number of lines containing code"""
        return self._code

    @property
    def documentation_count(self) -> int:
        """number of lines containing documentation (resp. comments)"""
        return self._documentation

    @property
    def empty_count(self) -> int:
        """
        number of empty lines, including lines containing only white space,
        white characters or white code words

        See also: :py:func:`white_characters`, :py:func:`white_code_words`
        """
        return self._empty

    @property
    def line_count(self) -> int:
        """number of total lines, which is what your text editor or ``wc -l``
        would show
        """
        return self.code_count + self.documentation_count + self.empty_count + self.string_count

    @property
    def string_count(self) -> int:
        """number of lines containing only strings but no other code"""
        return self._string

    @property
    def source_count(self) -> int:
        """number of source lines of code (the sum of code_count and string_count)"""
        return self.code_count + self.string_count

    @property
    def state(self) -> SourceState:
        """
        The state of the analysis after parsing the source file.
        """
        return self._state

    @property
    def state_info(self) -> Optional[Union[str, Exception]]:
        """
        Possible additional information about :py:attr:`state`:

        * :py:attr:`SourceState.duplicate`: path to the original source file
          the :py:attr:`path` is a duplicate of
        * :py:attr:`SourceState.error`: the message of the exception causing
          the error
        * :py:attr:`SourceState.generated`: a human-readable explanation why
          the file is considered to be generated

        The constructor only accepts ``None`` or a ``str`` for this value.
        """
        return self._state_info

    @property
    def is_countable(self) -> bool:
        """
        ``True`` if source counts can be counted towards a total.
        """
        return self.state in (SourceState.analyzed, SourceState.duplicate)

    def __repr__(self):
        """
        Representation with path, language, group and state; the counts are
        only included for analyzed sources and ``state_info`` only if set.
        """
        names_and_values = [
            ("path", repr(self.path)),
            ("language", repr(self.language)),
            ("group", repr(self.group)),
            ("state", self.state.name),
        ]
        if self.state == SourceState.analyzed:
            count_names = ("code_count", "documentation_count", "empty_count", "string_count")
            names_and_values.extend((count_name, getattr(self, count_name)) for count_name in count_names)
        state_info = self.state_info
        if state_info is not None:
            names_and_values.append(("state_info", repr(state_info)))
        return mapped_repr(self, dict(names_and_values))


class SourceScanner:
    """
    Scanner for source code files matching certain conditions.
    """

    def __init__(
        self,
        source_patterns,
        suffixes="*",
        folders_to_skip=None,
        name_to_skip=None,
    ):
        """
        :param source_patterns: patterns for local paths or URLs of git
          repositories to scan
        :param suffixes: patterns for file suffixes to analyze, passed to
          :py:func:`pygount.common.regexes_from`
        :param folders_to_skip: regular expressions for names of folders to
          skip, or ``None`` to use the defaults
        :param name_to_skip: regular expressions for names of files to skip,
          or ``None`` to use the defaults
        """
        self._source_patterns = source_patterns
        self._suffixes = pygount.common.regexes_from(suffixes)
        self._folder_regexps_to_skip = (
            folders_to_skip
            if folders_to_skip is not None
            else pygount.common.regexes_from(DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT)
        )
        # Decide on the file name patterns independently of ``folders_to_skip``;
        # coupling the two either ignored ``name_to_skip`` or stored ``None``.
        self._name_regexps_to_skip = (
            name_to_skip
            if name_to_skip is not None
            else pygount.common.regexes_from(DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)
        )
        # Git storages created while scanning; released by close().
        self._git_storages = []

    def close(self):
        for git_storage in self._git_storages:
            git_storage.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    @property
    def source_patterns(self):
        return self._source_patterns

    @property
    def suffixes(self) -> list[Pattern]:
        return self._suffixes

    @property
    def folder_regexps_to_skip(self) -> list[Pattern]:
        return self._folder_regexps_to_skip

    @folder_regexps_to_skip.setter
    def folder_regexps_to_skip(self, regexps_or_pattern_text):
        """
        Replace the folder skip patterns with the result of
        :py:func:`pygount.common.regexes_from`, passing the current patterns
        as second argument, just like the setter for ``name_regexps_to_skip``.
        """
        # Assign the attribute itself; assigning to the list's ``append``
        # attribute raises an AttributeError.
        self._folder_regexps_to_skip = pygount.common.regexes_from(
            regexps_or_pattern_text, self.folder_regexps_to_skip
        )

    @property
    def name_regexps_to_skip(self) -> list[Pattern]:
        return self._name_regexps_to_skip

    @name_regexps_to_skip.setter
    def name_regexps_to_skip(self, regexps_or_pattern_text):
        self._name_regexps_to_skip = pygount.common.regexes_from(regexps_or_pattern_text, self.name_regexps_to_skip)

    def _is_path_to_skip(self, name, is_folder) -> bool:
        assert os.sep not in name, f"name={name!r}"
        regexps_to_skip = self._folder_regexps_to_skip if is_folder else self._name_regexps_to_skip
        return any(path_name_to_skip_regex.match(name) is not None for path_name_to_skip_regex in regexps_to_skip)

    def _paths_and_group_to_analyze_in(self, folder, group, tmp_dir) -> PathData:
        assert folder is not None
        assert group is not None

        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            if not os.path.islink(path):
                is_folder = os.path.isdir(path)
                if self._is_path_to_skip(os.path.basename(path), is_folder):
                    _log.debug("skip due to matching skip pattern: %s", path)
                elif is_folder:
                    yield from self._paths_and_group_to_analyze_in(path, group, tmp_dir)
                else:
                    yield PathData(source_path=path, group=group, tmp_dir=tmp_dir)

    def _paths_and_group_to_analyze(self, path_to_analyse_pattern, group=None, tmp_dir=None) -> Iterator[PathData]:
        for path_to_analyse in glob.glob(path_to_analyse_pattern):
            if os.path.islink(path_to_analyse):
                _log.debug("skip link: %s", path_to_analyse)
            else:
                is_folder = os.path.isdir(path_to_analyse)
                if self._is_path_to_skip(os.path.basename(path_to_analyse), is_folder):
                    _log.debug("skip due to matching skip pattern: %s", path_to_analyse)
                else:
                    actual_group = group
                    if is_folder:
                        if actual_group is None:
                            actual_group = os.path.basename(path_to_analyse)
                            if actual_group == "":
                                # Compensate for trailing path separator.
                                actual_group = os.path.basename(os.path.dirname(path_to_analyse))
                        yield from self._paths_and_group_to_analyze_in(path_to_analyse_pattern, actual_group, tmp_dir)
                    else:
                        if actual_group is None:
                            actual_group = os.path.dirname(path_to_analyse)
                            if actual_group == "":
                                actual_group = os.path.basename(os.path.dirname(os.path.abspath(path_to_analyse)))
                        yield PathData(source_path=path_to_analyse, group=actual_group, tmp_dir=tmp_dir)

    def _source_paths_and_groups_to_analyze(self, source_patterns_to_analyze) -> list[PathData]:
        assert source_patterns_to_analyze is not None

        result = []

        def _process_source_pattern(source_pattern: str):
            remote_url, revision = git_remote_url_and_revision_if_any(source_pattern)
            if remote_url is not None:
                git_storage = GitStorage(remote_url, revision)
                self._git_storages.append(git_storage)
                git_storage.extract()
                result.extend(
                    self._paths_and_group_to_analyze(git_storage.temp_folder, tmp_dir=git_storage.temp_folder)
                )
            else:
                has_url_prefix = re.match(HTTP_URL_REGEX, source_pattern)
                if has_url_prefix:
                    is_git_url = re.match(GIT_REPO_REGEX, source_pattern_to_analyze) is not None
                    if not is_git_url:
                        raise pygount.Error(
                            f'URL to git repository {source_pattern} must end with ".git" or must match the pattern '
                            f"http(s)://({'|'.join(_ALLOWED_GIT_PLATFORMS)})/<...>/<...>.git. "
                            f"For example: git@github.com:roskakori/pygount.git or "
                            f"https://github.com/roskakori/pygount.git."
                        )
                    source_pattern = source_pattern.rstrip("/")
                    _process_source_pattern(source_pattern + ".git")
                else:
                    result.extend(self._paths_and_group_to_analyze(source_pattern_to_analyze))

        # NOTE: We could avoid initializing `source_pattern_to_analyze` here by moving the `try` inside
        #  the loop, but this would incor a performance overhead (ruff's PERF203).
        source_pattern_to_analyze = None
        try:
            for source_pattern_to_analyze in source_patterns_to_analyze:
                _process_source_pattern(source_pattern_to_analyze)
        except OSError as error:
            assert source_pattern_to_analyze is not None
            raise OSError(f'cannot scan "{source_pattern_to_analyze}" for source files: {error}') from error
        result = sorted(set(result), key=lambda data: (data.source_path, data.group))
        return result

    def source_paths(self) -> Iterator[PathData]:
        """
        Paths to source code files matching all the conditions for this scanner.
        """
        source_paths_and_groups_to_analyze = self._source_paths_and_groups_to_analyze(self.source_patterns)

        for path_data in source_paths_and_groups_to_analyze:
            suffix = os.path.splitext(path_data.source_path)[1].lstrip(".")
            is_suffix_to_analyze = any(suffix_regexp.match(suffix) for suffix_regexp in self.suffixes)
            if is_suffix_to_analyze:
                yield path_data
            else:
                _log.info("skip due to suffix: %s", path_data.source_path)


_LANGUAGE_TO_WHITE_WORDS_MAP = {"batchfile": {"@"}, "python": {"pass"}, "sql": {"begin", "end"}}
for _language in _LANGUAGE_TO_WHITE_WORDS_MAP:
    assert _language.islower()


def matching_number_line_and_regex(
    source_lines: Iterator[str], generated_regexes: Sequence[Pattern], max_line_count: int = 15
) -> Optional[tuple[int, str, Pattern]]:
    """
    Find the first of the initial source lines that indicates the source code
    is generated.

    :param source_lines: lines of text to scan
    :param generated_regexes: regular expressions of which one must match a
        line (from its start) to indicate the source code is generated; for
        a line several of them match, the one listed first wins
    :param max_line_count: maximum number of lines to scan
    :return: a tuple of the form ``(number, line, regex)`` where ``number``
        starts with 0, or ``None`` if none of the scanned lines matches any
        of the ``generated_regexes``
    """
    lines_to_scan = itertools.islice(source_lines, max_line_count)
    for number, line in enumerate(lines_to_scan):
        for generated_regex in generated_regexes:
            if generated_regex.match(line):
                # Stop at the first hit without consuming any further lines.
                return number, line, generated_regex
    return None


def white_characters(language_id: str) -> str:
    """
    Punctuation that is treated like white space in case a line contains
    nothing else.

    The result currently is the same for every language; ``language_id`` is
    only checked to be a lower case text.
    """
    assert language_id is not None and language_id.islower()
    punctuation_only_characters = "(),:;[]{}"
    return punctuation_only_characters


def white_code_words(language_id: str) -> set[str]:
    """
    Words of the language ``language_id`` that are not counted as code in
    case they are the only word in a line. For languages without such words
    the result is an empty set.
    """
    assert language_id is not None
    assert language_id.islower()
    try:
        return _LANGUAGE_TO_WHITE_WORDS_MAP[language_id]
    except KeyError:
        return set()


def _delined_tokens(tokens: Iterator[tuple[TokenType, str]]) -> Iterator[tuple[TokenType, str]]:
    """
    Split tokens that span multiple lines into one token per line.

    Each resulting token has the type of the token it originates from. Its
    text ends right after a newline (``"\\n"``) except possibly for the last
    part of the original text. Parts that are empty are omitted, so a token
    with an empty text yields nothing.
    """
    for token_type, token_text in tokens:
        # Walk through the text using indices instead of repeatedly copying
        # the not yet processed remainder, which would take quadratic time
        # for tokens with many lines such as long block comments.
        line_start_index = 0
        newline_index = token_text.find("\n")
        while newline_index != -1:
            line_end_index = newline_index + 1
            yield token_type, token_text[line_start_index:line_end_index]
            line_start_index = line_end_index
            newline_index = token_text.find("\n", line_start_index)
        if line_start_index < len(token_text):
            yield token_type, token_text[line_start_index:]


def _pythonized_comments(tokens: Iterator[tuple[TokenType, str]]) -> Iterator[TokenType]:
    """
    Same as ``tokens`` except that string tokens following a colon (`:`) are
    reported as comments.

    Only white space and comments may sit between the colon and the string.
    The start of ``tokens`` is treated as if it followed a colon.
    """
    is_after_colon = True
    for token_type, result_token_text in tokens:
        if is_after_colon and (token_type in pygments.token.String):
            # Report the string as comment; the state remains unchanged so
            # directly following string tokens are converted, too.
            yield pygments.token.Comment, result_token_text
            continue
        if result_token_text == ":":
            is_after_colon = True
        elif (
            token_type not in pygments.token.Comment
            and len(result_token_text.rstrip(WHITE_SPACE_CHARACTERS)) != 0
        ):
            # Anything but comments and white space ends the "after colon" state.
            is_after_colon = False
        yield token_type, result_token_text


def _line_parts(lexer: pygments.lexer.Lexer, text: str, is_markup: bool = False) -> Iterator[set[str]]:
    """
    Yield for each line of ``text`` the set of marks describing what the
    line contains according to the tokens of ``lexer``:

    * ``"d"``: documentation, meaning comments other than preprocessor
      statements; with ``is_markup`` also anything that would count as code
    * ``"s"``: string
    * ``"c"``: code (unless ``is_markup``)

    A line with only white space, white characters or a white word results
    in an empty set.
    """
    marks_of_current_line = set()
    line_tokens = _delined_tokens(lexer.get_tokens(text))
    if lexer.name == "Python":
        line_tokens = _pythonized_comments(line_tokens)
    language_id = lexer.name.lower()
    ignorable_characters = " \f\n\r\t" + white_characters(language_id)
    ignorable_words = white_code_words(language_id)
    code_mark = "d" if is_markup else "c"
    # NOTE: Pygments treats preprocessor statements as special comments.
    preprocessor_token_types = (pygments.token.Comment.Preproc, pygments.token.Comment.PreprocFile)
    for token_type, token_text in line_tokens:
        if token_type in pygments.token.Comment and token_type not in preprocessor_token_types:
            mark = "d"  # 'documentation'
        elif token_type in pygments.token.String:
            mark = "s"  # 'string'
        elif (token_text.strip() in ignorable_words) or (token_text.rstrip(ignorable_characters) == ""):
            mark = None
        else:
            mark = code_mark
        if mark is not None:
            marks_of_current_line.add(mark)
        if token_text.endswith("\n"):
            yield marks_of_current_line
            marks_of_current_line = set()
    # Report a possible last line that is not terminated by a newline.
    if marks_of_current_line:
        yield marks_of_current_line


def check_file_handle_is_seekable(file_handle: Optional[Union[BufferedIOBase, RawIOBase]], source_path: str):
    """
    Ensure that ``file_handle`` supports random access.

    :param source_path: path to mention in the error message
    :raise pygount.Error: if ``file_handle`` is not seekable
    """
    if file_handle.seekable():
        return
    raise pygount.Error(f"cannot determine encoding: file handle must be seekable: {source_path}")


def encoding_for(
    source_path: str,
    encoding: str = "automatic",
    fallback_encoding: Optional[str] = None,
    file_handle: Optional[Union[BufferedIOBase, RawIOBase]] = None,
) -> str:
    """
    The encoding used by the text file stored in ``source_path``.

    The algorithm used is:

    * If ``encoding`` is ``'automatic'``, inspect the first 128 bytes:

      1. An empty file is assumed to be UTF-8.
      2. Check for a BOM.
      3. Look for a magic heading like ``# -*- coding: cp1252 -*-`` or an
         XML prolog.

    * If ``encoding`` is ``'chardet'``, use the chardet detector to obtain
      the encoding.
    * For any other ``encoding`` simply use the specified value.

    If ``'automatic'`` or ``'chardet'`` cannot detect anything, use
    ``fallback_encoding`` if it has been specified. Otherwise, use UTF-8
    provided that the whole file can be decoded with it, or else
    ``DEFAULT_FALLBACK_ENCODING``.

    :param source_path: path of the file to examine; with a ``file_handle``
        it is only used for messages
    :param file_handle: optional seekable binary file to read the data from
        instead of opening ``source_path``; after the call, it is positioned
        where it was before
    :raise pygount.Error: if ``file_handle`` needs to be read but is not
        seekable
    """
    assert encoding is not None

    if encoding == "automatic":
        if file_handle is None:
            with open(source_path, "rb") as source_file:
                heading = source_file.read(128)
        else:
            check_file_handle_is_seekable(file_handle, source_path)
            heading = file_handle.read(128)
            # Move back to where the file was before reading the heading.
            file_handle.seek(-len(heading), SEEK_CUR)
        result = None
        if len(heading) == 0:
            # File is empty, assume a dummy encoding.
            result = "utf-8"
        if result is None:
            result = next(
                (
                    encoding_for_bom
                    for bom, encoding_for_bom in _BOM_TO_ENCODING_MAP.items()
                    if heading[: len(bom)] == bom
                ),
                None,
            )
        if result is None:
            result = encoding_from_header(heading)
    elif encoding == "chardet":
        assert _detector is not None, (
            'without chardet installed, encoding="chardet" must be rejected before calling encoding_for()'
        )
        _detector.reset()
        if file_handle is None:
            with open(source_path, "rb") as source_file:
                lines = source_file.readlines()
        else:
            check_file_handle_is_seekable(file_handle, source_path)
            file_position = file_handle.tell()
            lines = file_handle.readlines()
            file_handle.seek(file_position)
        for line in lines:
            _detector.feed(line)
            if _detector.done:
                break
        # NOTE(review): if _detector is a chardet UniversalDetector, its
        #  documentation recommends calling close() before reading the
        #  result - confirm whether this is needed here.
        result = _detector.result["encoding"]
        if result is None:
            _log.warning(
                "%s: chardet cannot determine encoding, assuming fallback encoding %s", source_path, fallback_encoding
            )
            result = fallback_encoding
    else:
        # Simply use the specified encoding.
        result = encoding
    if result is None:
        # Encoding 'automatic' or 'chardet' failed to detect anything.
        if fallback_encoding is not None:
            # If defined, use the fallback encoding.
            result = fallback_encoding
        else:
            try:
                # Attempt to read the file as UTF-8.
                if file_handle is None:
                    with open(source_path, encoding="utf-8") as source_file:
                        source_file.read()
                else:
                    check_file_handle_is_seekable(file_handle, source_path)
                    file_position = file_handle.tell()
                    try:
                        content = file_handle.read()
                    finally:
                        file_handle.seek(file_position)
                    # The file handle delivers bytes, so they have to be
                    # decoded explicitly in order to detect data that are
                    # not valid UTF-8.
                    content.decode("utf-8")
                result = "utf-8"
            except UnicodeDecodeError:
                # UTF-8 did not work out, use the default as last resort.
                result = DEFAULT_FALLBACK_ENCODING
            _log.debug("%s: no fallback encoding specified, using %s", source_path, result)

    assert result is not None
    return result


def encoding_from_header(header: bytes) -> Optional[str]:
    """
    The encoding declared in ``header`` by a magic comment or, if there is
    none, by an XML prolog; ``None`` if neither can be found.
    """
    # Bytes outside of ASCII are irrelevant for the declarations looked for,
    # so they are simply replaced instead of causing an error.
    ascii_header = header.decode("ascii", errors="replace")
    magic_comment_encoding = encoding_from_possible_magic_comment(ascii_header)
    if magic_comment_encoding is not None:
        return magic_comment_encoding
    return encoding_from_possible_xml_prolog(ascii_header)


def encoding_from_possible_magic_comment(ascii_header: str) -> Optional[str]:
    """
    The first encoding declared by a magic comment in ``ascii_header`` or
    ``None`` if there is no such comment.
    """
    for magic_comment_encoding in _magic_comment_encodings(ascii_header):
        return magic_comment_encoding
    return None


def _magic_comment_encodings(ascii_header: str) -> Iterator[str]:
    """
    Yield the encodings declared by magic comments in the first two lines of
    ``ascii_header``.

    A line has to match one of ``_MAGIC_COMMENT_LINE_START_REGEXES``, and its
    ``remainder`` group has to match one of
    ``_MAGIC_COMMENT_LINE_REMAINDER_REGEXES``, which provides the encoding.
    """
    for header_line in ascii_header.split("\n")[:2]:
        for line_start_regex in _MAGIC_COMMENT_LINE_START_REGEXES:
            line_start_match = re.match(line_start_regex, header_line)
            if line_start_match is None:
                continue
            remainder = line_start_match.group("remainder")
            for remainder_regex in _MAGIC_COMMENT_LINE_REMAINDER_REGEXES:
                encoding_match = remainder_regex.match(remainder)
                if encoding_match is not None:
                    yield encoding_match.group("encoding")


def encoding_from_possible_xml_prolog(ascii_header: str) -> Optional[str]:
    """
    The encoding declared by an XML prolog at the beginning of
    ``ascii_header`` or ``None`` if ``_XML_PROLOG_REGEX`` does not match.

    Before matching, every form feed, line feed, carriage return and
    vertical tab is replaced by a blank so that the header can be matched
    like a single line.
    """
    # NOTE: ``str.replace("\f\n\r\v", " ")`` would only replace this exact
    #  sequence of four characters, leaving all actual line breaks in place,
    #  hence each character is translated separately.
    header_line = ascii_header.translate(str.maketrans("\f\n\r\v", "    "))
    xml_prolog_match = _XML_PROLOG_REGEX.match(header_line)
    return xml_prolog_match.group("encoding") if xml_prolog_match is not None else None


def is_binary_file(source_path: str) -> bool:
    """
    ``True`` if the first 8192 bytes of the file in ``source_path`` contain
    a zero byte without starting with one of the ``_TEXT_BOMS``.
    """
    with open(source_path, "rb") as source_file:
        initial_bytes = source_file.read(8192)
    if initial_bytes.startswith(tuple(_TEXT_BOMS)):
        # Text files with such a BOM may legitimately contain zero bytes.
        return False
    return b"\0" in initial_bytes


def is_plain_text(source_path):
    """
    The match of ``_PLAIN_TEXT_NAME_REGEX`` with the file name part of
    ``source_path``, or ``None`` if it does not match. Consequently, the
    result can be used as a truth value.
    """
    file_name = os.path.basename(source_path)
    return _PLAIN_TEXT_NAME_REGEX.match(file_name)


def has_lexer(source_path: str) -> bool:
    """
    Quick initial check if there is a lexer for ``source_path`` based on
    the file name only, without the need to call
    :py:func:`pygments.lexers.guess_lexer_for_filename()`, which requires
    the content of the source file.

    Suffixes unknown to pygments still have a lexer if they are part of
    ``_SUFFIX_TO_FALLBACK_LEXER_MAP``.
    """
    if pygments.lexers.find_lexer_class_for_filename(source_path):
        return True
    file_name = os.path.basename(source_path)
    suffix = os.path.splitext(file_name)[1].lstrip(".")
    return suffix in _SUFFIX_TO_FALLBACK_LEXER_MAP


def guess_lexer(source_path: str, text: str) -> pygments.lexer.Lexer:
    """
    The lexer to use for the source code ``text`` stored in ``source_path``.

    Plain text files use :py:class:`pygount.lexers.PlainTextLexer`. For
    anything else, pygments has to guess; if it cannot find a lexer, the
    result is the entry for the suffix in ``_SUFFIX_TO_FALLBACK_LEXER_MAP``,
    which is ``None`` for suffixes it does not contain.
    """
    if is_plain_text(source_path):
        return pygount.lexers.PlainTextLexer()
    try:
        return pygments.lexers.guess_lexer_for_filename(source_path, text)
    except pygments.util.ClassNotFound:
        file_name = os.path.basename(source_path)
        suffix = os.path.splitext(file_name)[1].lstrip(".")
        return _SUFFIX_TO_FALLBACK_LEXER_MAP.get(suffix)


def base_language(language: str) -> str:
    """
    The ``base_language`` group of ``_BASE_LANGUAGE_REGEX`` in case it
    matches ``language``, otherwise ``language`` unchanged.
    """
    base_language_match = _BASE_LANGUAGE_REGEX.match(language)
    if base_language_match is None:
        return language
    return base_language_match.group("base_language")


================================================
FILE: pygount/command.py
================================================
"""
Command line interface for pygount.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import argparse
import contextlib
import logging
import os
import sys

from rich.progress import Progress

import pygount
import pygount.analysis
import pygount.common
import pygount.write

#: Valid formats for option --format.
VALID_OUTPUT_FORMATS = ("cloc-xml", "json", "sloccount", "summary")

# Defaults for the command line options.
_DEFAULT_ENCODING = "automatic"
_DEFAULT_OUTPUT_FORMAT = "sloccount"
_DEFAULT_OUTPUT = "STDOUT"
_DEFAULT_SOURCE_PATTERNS = os.curdir
_DEFAULT_SUFFIXES = "*"

# Help texts for the command line options. Argparse replaces "%(default)s"
# by the default value of the respective option.
_HELP_ENCODING = '''encoding to use when reading source code; use "automatic"
 to take BOMs, XML prolog and magic headers into account and fall back to
 UTF-8 or CP1252 if none fits; use "automatic;<fallback>" to specify a
 different fallback encoding than CP1252; use "chardet" to let the chardet
 package determine the encoding; default: "%(default)s"'''

_HELP_EPILOG = """SHELL-PATTERN is a pattern using *, ? and ranges like [a-z]
 as placeholders. PATTERNS is a comma separated list of SHELL-PATTERN. The
 prefix [regex] indicates that the PATTERNS use regular expression syntax. If
 default values are available, [...] indicates that the PATTERNS extend the
 existing default values."""

_HELP_FORMAT = (
    f"output format, one of: "
    # HACK The chr(34) is necessary because ruff does not preserve the
    #  backslash in '\"'.
    f"{', '.join([chr(34) + output_format + chr(34) for output_format in VALID_OUTPUT_FORMATS])};"
    f' default: "%(default)s"'
)

_HELP_GENERATED = """comma separated list of regular expressions to detect
 generated code; default: %(default)s"""

_HELP_GENERATED_NAMES = """comma separated list of glob patterns for file names
 to treat as generated. Use "[...]" as first entry to append patterns to the
 default patterns; default: %(default)s"""

_HELP_MERGE_EMBEDDED_LANGUAGES = """merge counts for embedded languages into
 their base language; for example, HTML+Jinja2 counts as HTML"""

_HELP_FOLDERS_TO_SKIP = """comma separated list of glob patterns for folder
 names not to analyze. Use "[...]" as first entry to append patterns to the
 default patterns; default: %(default)s"""

_HELP_NAMES_TO_SKIP = """comma separated list of glob patterns for file names
 not to analyze. Use "[...]" as first entry to append patterns to the default
 patterns; default: %(default)s"""

_HELP_SUFFIX = '''limit analysis on files matching any suffix in comma
 separated LIST; shell patterns are possible; example: "py,sql"; default:
 "%(default)s"'''

# Maps each value for option --format to the writer class that
# Command.execute() instantiates in order to write the results.
_OUTPUT_FORMAT_TO_WRITER_CLASS_MAP = {
    "cloc-xml": pygount.write.ClocXmlWriter,
    "json": pygount.write.JsonWriter,
    "sloccount": pygount.write.LineWriter,
    "summary": pygount.write.SummaryWriter,
}
# Ensure at import time that there is exactly one writer for each valid format.
assert set(VALID_OUTPUT_FORMATS) == set(_OUTPUT_FORMAT_TO_WRITER_CLASS_MAP.keys())

# Logger whose level Command.execute() adjusts depending on the verbosity.
_log = logging.getLogger("pygount")


def _check_encoding(name, encoding_to_check, alternative_encoding, source=None):
    """
    Check that ``encoding_to_check`` is a valid Python encoding or one of the
    special values that are always accepted: ``alternative_encoding``,
    ``"chardet"`` and ``None``.

    :param name: name under which the encoding is known to the user, e.g. 'default encoding'
    :param encoding_to_check: name of the encoding to check, e.g. 'utf-8'
    :param alternative_encoding: special value to accept even though it is no
        Python encoding, e.g. 'automatic'
    :param source: source where the encoding has been set, e.g. option name
    :raise pygount.common.OptionError: if ``encoding_to_check`` is neither a
        special value nor a valid Python encoding
    """
    assert name is not None

    is_special_value = encoding_to_check in (alternative_encoding, "chardet", None)
    if is_special_value:
        return
    try:
        # Encoding an empty text is a cheap way to find out if Python knows
        # the encoding.
        "".encode(encoding_to_check)
    except LookupError:
        raise pygount.common.OptionError(
            f'{name} is "{encoding_to_check}" but must be "{alternative_encoding}" or a known Python encoding',
            source,
        ) from None


class Command:
    """
    Command interface for pygount, where options starting with defaults can
    gradually be set and finally :py:meth:`execute()`.

    Each option has a read-only property and a ``set_...()`` method. The
    optional ``source`` parameter of the setters describes where a value
    comes from, for example ``"option --format"``; setters that validate
    their value pass it on to :py:class:`pygount.common.OptionError`.
    """

    def __init__(self):
        self.set_encodings(_DEFAULT_ENCODING)
        self._folders_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT)
        self._generated_line_regexs = pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
        self._generated_name_regexps = pygount.common.regexes_from(
            pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT
        )
        self._has_duplicates = False
        self._has_summary = False
        self._has_to_merge_embedded_languages = False
        self._is_verbose = False
        self._names_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)
        self._output = _DEFAULT_OUTPUT
        self._output_format = _DEFAULT_OUTPUT_FORMAT
        self._source_patterns = _DEFAULT_SOURCE_PATTERNS
        self._suffixes = pygount.common.regexes_from(_DEFAULT_SUFFIXES)

    def set_encodings(self, encoding, source=None):
        """
        Set both the default and the fallback encoding from ``encoding``,
        which can take the following forms:

        * ``"automatic"`` or ``"chardet"``: detect the encoding without any
          fallback encoding.
        * ``"automatic;<fallback>"`` or ``"chardet;<fallback>"``: detect the
          encoding and use ``<fallback>`` as fallback encoding.
        * Anything else: use it as default encoding with
          ``pygount.analysis.DEFAULT_FALLBACK_ENCODING`` as fallback.

        :raise pygount.common.OptionError: if chardet is requested but not
            installed, or if an encoding is unknown to Python
        """
        encoding_is_chardet = (encoding == "chardet") or (encoding.startswith("chardet;"))
        if encoding_is_chardet and not pygount.analysis.has_chardet:  # pragma: no cover
            raise pygount.common.OptionError('chardet must be installed to set default encoding to "chardet"')
        if encoding in ("automatic", "chardet"):
            default_encoding = encoding
            fallback_encoding = None
        elif encoding.startswith(("automatic;", "chardet;")):
            first_encoding_semicolon_index = encoding.find(";")
            default_encoding = encoding[:first_encoding_semicolon_index]
            fallback_encoding = encoding[first_encoding_semicolon_index + 1 :]
        else:
            default_encoding = encoding
            fallback_encoding = pygount.analysis.DEFAULT_FALLBACK_ENCODING
        self.set_default_encoding(default_encoding, source)
        self.set_fallback_encoding(fallback_encoding, source)

    @property
    def default_encoding(self):
        return self._default_encoding

    def set_default_encoding(self, default_encoding, source=None):
        _check_encoding("default encoding", default_encoding, "automatic", source)
        self._default_encoding = default_encoding

    @property
    def fallback_encoding(self):
        return self._fallback_encoding

    def set_fallback_encoding(self, fallback_encoding, source=None):
        _check_encoding("fallback encoding", fallback_encoding, "automatic", source)
        self._fallback_encoding = fallback_encoding

    @property
    def folders_to_skip(self):
        return self._folders_to_skip

    def set_folders_to_skip(self, regexes_or_patterns_text, source=None):
        self._folders_to_skip = pygount.common.regexes_from(
            regexes_or_patterns_text, pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT, source
        )

    @property
    def generated_regexps(self):
        return self._generated_line_regexs

    def set_generated_regexps(self, regexes_or_patterns_text, source=None):
        self._generated_line_regexs = pygount.common.regexes_from(
            regexes_or_patterns_text, pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT, source
        )

    @property
    def generated_name_regexps(self):
        return self._generated_name_regexps

    def set_generated_name_regexps(self, regexes_or_pattern_text, source=None):
        # Patterns that ask to extend the defaults must extend the defaults
        # for generated names, the same ones __init__() and the option
        # --generated-names use, and not the defaults for names to skip.
        self._generated_name_regexps = pygount.common.regexes_from(
            regexes_or_pattern_text, pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT, source
        )

    @property
    def has_duplicates(self):
        return self._has_duplicates

    def set_has_duplicates(self, has_duplicates, source=None):
        self._has_duplicates = bool(has_duplicates)

    @property
    def has_to_merge_embedded_languages(self):
        return self._has_to_merge_embedded_languages

    def set_has_to_merge_embedded_languages(self, has_to_merge_embedded_languages, source=None):
        self._has_to_merge_embedded_languages = bool(has_to_merge_embedded_languages)

    @property
    def is_verbose(self):
        return self._is_verbose

    def set_is_verbose(self, is_verbose, source=None):
        self._is_verbose = bool(is_verbose)

    @property
    def names_to_skip(self):
        return self._names_to_skip

    def set_names_to_skip(self, regexes_or_pattern_text, source=None):
        self._names_to_skip = pygount.common.regexes_from(
            regexes_or_pattern_text, pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT, source
        )

    @property
    def output(self):
        return self._output

    def set_output(self, output, source=None):
        assert output is not None
        self._output = output

    @property
    def output_format(self):
        return self._output_format

    def set_output_format(self, output_format, source=None):
        assert output_format is not None
        if output_format not in VALID_OUTPUT_FORMATS:
            raise pygount.common.OptionError(
                f"format is {output_format} but must be one of: {VALID_OUTPUT_FORMATS}", source
            )
        self._output_format = output_format

    @property
    def source_patterns(self):
        return self._source_patterns

    def set_source_patterns(self, glob_patterns_or_text, source=None):
        assert glob_patterns_or_text is not None
        self._source_patterns = pygount.common.as_list(glob_patterns_or_text)

    @property
    def suffixes(self):
        return self._suffixes

    def set_suffixes(self, regexes_or_patterns_text, source=None):
        assert regexes_or_patterns_text is not None
        self._suffixes = pygount.common.regexes_from(regexes_or_patterns_text, _DEFAULT_SUFFIXES, source)

    def argument_parser(self):
        """
        A new :py:class:`argparse.ArgumentParser` for the command line
        options of pygount.
        """
        parser = argparse.ArgumentParser(description="count source lines of code", epilog=_HELP_EPILOG)
        parser.add_argument("--duplicates", "-d", action="store_true", help="analyze duplicate files")
        parser.add_argument("--encoding", "-e", default=_DEFAULT_ENCODING, help=_HELP_ENCODING)
        parser.add_argument(
            "--folders-to-skip",
            "-F",
            metavar="PATTERNS",
            default=pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT,
            help=_HELP_FOLDERS_TO_SKIP,
        )
        parser.add_argument(
            "--format",
            "-f",
            metavar="FORMAT",
            choices=VALID_OUTPUT_FORMATS,
            default=_DEFAULT_OUTPUT_FORMAT,
            help=_HELP_FORMAT,
        )
        parser.add_argument(
            "--generated",
            "-g",
            metavar="PATTERNS",
            default=pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT,
            help=_HELP_GENERATED,
        )
        parser.add_argument(
            "--generated-names",
            "-G",
            metavar="PATTERNS",
            default=pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT,
            help=_HELP_GENERATED_NAMES,
        )
        parser.add_argument(
            "--merge-embedded-languages",
            "-m",
            action="store_true",
            help=_HELP_MERGE_EMBEDDED_LANGUAGES,
        )
        parser.add_argument(
            "--names-to-skip",
            "-N",
            metavar="PATTERNS",
            default=pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT,
            help=_HELP_NAMES_TO_SKIP,
        )
        parser.add_argument(
            "--out",
            "-o",
            metavar="FILE",
            default=_DEFAULT_OUTPUT,
            help='file to write results to; use "STDOUT" for standard output; default: "%(default)s"',
        )
        parser.add_argument("--suffix", "-s", metavar="PATTERNS", default=_DEFAULT_SUFFIXES, help=_HELP_SUFFIX)
        parser.add_argument(
            "source_patterns",
            metavar="SHELL-PATTERN",
            nargs="*",
            default=[os.getcwd()],
            help="source files and directories to scan; can use glob patterns; default: current directory",
        )
        parser.add_argument("--verbose", "-v", action="store_true", help="explain what is being done")
        parser.add_argument("--version", action="version", version="%(prog)s " + pygount.__version__)
        return parser

    def parsed_args(self, arguments):
        """
        Parse the command line ``arguments`` and split the value of option
        --encoding into default and fallback encoding.

        The forms accepted for --encoding are "automatic", "chardet",
        "automatic;<fallback>", "chardet;<fallback>" (consistent with
        :py:meth:`set_encodings`) and the name of any encoding known to
        Python. Invalid values are reported using the ``error()`` method of
        the argument parser.

        :return: a tuple ``(args, default_encoding, fallback_encoding)``
            where ``fallback_encoding`` is ``None`` unless it has been
            specified using one of the ``"...;<fallback>"`` forms
        """
        assert arguments is not None

        parser = self.argument_parser()
        args = parser.parse_args(arguments)
        encoding_name, fallback_separator, fallback_text = args.encoding.partition(";")
        if encoding_name == "chardet" and not pygount.analysis.has_chardet:  # pragma: no cover
            parser.error("chardet must be installed in order to specify --encoding=chardet")
        if encoding_name in ("automatic", "chardet"):
            default_encoding = encoding_name
            if fallback_separator:
                fallback_encoding = fallback_text
                encoding_to_check = ("fallback encoding", fallback_encoding)
            else:
                fallback_encoding = None
                encoding_to_check = None
        else:
            # Use the whole value so that a semicolon in anything but the
            # special forms above is reported as unknown encoding.
            default_encoding = args.encoding
            fallback_encoding = None
            encoding_to_check = ("encoding", default_encoding)
        if encoding_to_check is not None:
            name, encoding = encoding_to_check
            try:
                "".encode(encoding)
            except LookupError:
                parser.error(f"{name} specified with --encoding must be a known Python encoding: {encoding}")
        return args, default_encoding, fallback_encoding

    def apply_arguments(self, arguments=None):
        """
        Set all options from the command line ``arguments``, which default
        to the ones passed to the current process.
        """
        if arguments is None:  # pragma: no cover
            arguments = sys.argv[1:]
        args, default_encoding, fallback_encoding = self.parsed_args(arguments)
        self.set_default_encoding(default_encoding, "option --encoding")
        self.set_fallback_encoding(fallback_encoding, "option --encoding")
        self.set_folders_to_skip(args.folders_to_skip, "option --folders-to-skip")
        self.set_generated_regexps(args.generated, "option --generated")
        self.set_generated_name_regexps(args.generated_names, "option --generated-names")
        self.set_has_duplicates(args.duplicates, "option --duplicates")
        self.set_has_to_merge_embedded_languages(args.merge_embedded_languages, "option --merge-embedded-languages")
        self.set_is_verbose(args.verbose, "option --verbose")
        self.set_names_to_skip(args.names_to_skip, "option --names-to-skip")
        self.set_output(args.out, "option --out")
        self.set_output_format(args.format, "option --format")
        self.set_source_patterns(args.source_patterns, "option PATTERNS")
        self.set_suffixes(args.suffix, "option --suffix")

    def execute(self):
        """
        Analyze the source files matching the current options and write the
        results to :py:attr:`output` using the writer for
        :py:attr:`output_format`.
        """
        _log.setLevel(logging.INFO if self.is_verbose else logging.WARNING)
        with pygount.analysis.SourceScanner(
            self.source_patterns, self.suffixes, self.folders_to_skip, self.names_to_skip
        ) as source_scanner:
            source_paths_and_groups_to_analyze = list(source_scanner.source_paths())
            # Without a duplicate pool, duplicates are analyzed like any other file.
            duplicate_pool = pygount.analysis.DuplicatePool() if not self.has_duplicates else None
            writer_class = _OUTPUT_FORMAT_TO_WRITER_CLASS_MAP[self.output_format]
            is_stdout = self.output == "STDOUT"
            # Standard output must remain open, so it is wrapped in a context
            # manager that does nothing on exit.
            target_context_manager = (
                contextlib.nullcontext(sys.stdout)
                if is_stdout
                else open(self.output, "w", encoding="utf-8", newline="")  # noqa: SIM115
            )
            with (
                target_context_manager as target_file,
                writer_class(target_file) as writer,
                Progress(disable=not writer.has_to_track_progress, transient=True) as progress,
            ):
                try:
                    for path_data in progress.track(source_paths_and_groups_to_analyze):
                        writer.add(
                            pygount.analysis.SourceAnalysis.from_file(
                                path_data.source_path,
                                path_data.group,
                                self.default_encoding,
                                self.fallback_encoding,
                                generated_regexes=self._generated_line_regexs,
                                generated_name_regexes=self._generated_name_regexps,
                                duplicate_pool=duplicate_pool,
                                merge_embedded_language=self.has_to_merge_embedded_languages,
                                tmp_dir=path_data.tmp_dir,
                            )
                        )
                finally:
                    progress.stop()


def pygount_command(arguments=None):
    """
    Run pygount with the command line ``arguments`` and log possible errors.

    :return: the exit code: 0 if everything worked out, otherwise 1
    """
    command = Command()
    try:
        command.apply_arguments(arguments)
        command.execute()
    except KeyboardInterrupt:  # pragma: no cover
        _log.error("interrupted as requested by user")
    except (pygount.common.OptionError, OSError) as error:
        # Errors the user can fix are reported without a stack trace.
        _log.error(error)
    except Exception as error:
        _log.exception(error)
    else:
        return 0

    return 1


def main():  # pragma: no cover
    """
    Entry point for the command line: set up logging, run pygount with the
    arguments of the current process and exit with the resulting code.
    """
    logging.basicConfig(level=logging.WARNING)
    exit_code = pygount_command()
    sys.exit(exit_code)


if __name__ == "__main__":  # pragma: no cover
    main()


================================================
FILE: pygount/common.py
================================================
"""
Common classes and functions for pygount.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import fnmatch
import functools
import inspect
import re
import typing
import warnings
from collections.abc import Iterator, Sequence
from re import Pattern
from typing import Optional, Union

#: Characters that count as white space.
WHITE_SPACE_CHARACTERS = " \f\n\r\t"

#: Pseudo pattern to indicate that the remaining patterns are an addition to the default patterns.
ADDITIONAL_PATTERN = "[...]"

#: Prefix for pattern strings that describe a regular expression instead of a shell pattern.
REGEX_PATTERN_PREFIX = "[regex]"

#: Type of compiled regular expressions.
_REGEX_TYPE = re.Pattern


class Error(Exception):
    """
    Base error raised when a pygount run cannot proceed as intended.
    """


class OptionError(Error):
    """
    Error raised when the value passed to a command line option has to be
    fixed by the user.

    The text representation is ``message`` prefixed by ``source`` and a
    colon, provided that a ``source`` has been specified.
    """

    def __init__(self, message, source=None):
        super().__init__(message)
        source_prefix = "" if source is None else source + ": "
        self.option_error_message = source_prefix + message

    def __str__(self):
        return self.option_error_message


def as_list(items_or_text: Union[str, Sequence[str]]) -> list[str]:
    """
    A list of the items in ``items_or_text``.

    A text is split at commas; the resulting items have surrounding white
    space removed, and empty items are dropped. Any other sequence is simply
    copied to a list.
    """
    if not isinstance(items_or_text, str):
        return list(items_or_text)
    # TODO: Allow to specify comma (,) in text using '[,]'.
    stripped_items = (item.strip() for item in items_or_text.split(","))
    return [item for item in stripped_items if item != ""]


def regex_from(pattern: Union[str, Pattern], is_shell_pattern=False) -> Pattern:
    """
    A compiled regular expression for ``pattern``.

    A text is compiled either as shell pattern (if ``is_shell_pattern``) or
    as regular expression. Anything else is assumed to already be a compiled
    regular expression and returned unchanged.
    """
    assert pattern is not None
    if not isinstance(pattern, str):
        return pattern
    regex_text = fnmatch.translate(pattern) if is_shell_pattern else pattern
    return re.compile(regex_text)


def regexes_from(
    patterns_text: Union[str, Sequence[str], Sequence[Pattern]],
    default_patterns_text: Optional[Union[str, Sequence[Pattern], Sequence[str]]] = None,
    source: Optional[str] = None,
) -> list[Pattern]:
    """
    A list of compiled regular expressions for ``patterns_text``.

    If ``patterns_text`` is a text, it is a comma separated list of shell
    patterns. The following optional prefixes change this, in this order:

    1. ``REGEX_PATTERN_PREFIX``: the patterns are regular expressions
       instead of shell patterns.
    2. ``ADDITIONAL_PATTERN``: the patterns extend the ones from
       ``default_patterns_text`` instead of replacing them.

    Otherwise ``patterns_text`` has to be a sequence of compiled regular
    expressions. If its first item is ``None``, the remaining items extend
    the ones from ``default_patterns_text``.

    In both cases, the regular expressions for the defaults are appended
    after the ones from ``patterns_text``.

    :param source: source where the patterns have been set, e.g. option
        name; used for error messages
    :raise OptionError: if a pattern is a broken regular expression
    """
    assert patterns_text is not None

    result = []
    default_regexes = []
    try:
        if isinstance(patterns_text, str):
            is_shell_pattern = True
            patterns_text_without_prefixes = patterns_text
            if patterns_text_without_prefixes.startswith(REGEX_PATTERN_PREFIX):
                is_shell_pattern = False
                patterns_text_without_prefixes = patterns_text_without_prefixes[len(REGEX_PATTERN_PREFIX) :]
            if patterns_text_without_prefixes.startswith(ADDITIONAL_PATTERN):
                assert default_patterns_text is not None
                default_regexes = regexes_from(default_patterns_text)
                patterns_text_without_prefixes = patterns_text_without_prefixes[len(ADDITIONAL_PATTERN) :]

            patterns = as_list(patterns_text_without_prefixes)
            result = [regex_from(pattern, is_shell_pattern) for pattern in patterns]
        else:
            regexes = list(patterns_text)
            if len(regexes) >= 1 and regexes[0] is None:
                default_regexes = regexes_from(default_patterns_text)
                regexes = regexes[1:]
            for supposed_regex in regexes:
                assert isinstance(supposed_regex, _REGEX_TYPE), (
                    f"patterns_text must be a text or a sequence of regular expressions but contains: {supposed_regex}"
                )
            result.extend(regexes)
    except re.error as error:
        raise OptionError(f"cannot parse pattern for regular expression: {error}", source) from None
    result.extend(default_regexes)
    return result


def matching_regex(text: str, regexes: list[typing.Pattern]) -> Optional[typing.Pattern]:
    """
    The first entry of ``regexes`` that matches at the beginning of ``text``
    or ``None`` if no entry does.
    """
    for candidate in regexes:
        if candidate.match(text):
            return candidate
    return None


def lines(text: str) -> Iterator[str]:
    """
    Generator function to yield the lines of ``text``, using ``'\n'`` as the
    only delimiter. The delimiter itself is not part of the yielded lines.
    A trailing part without a final ``'\n'`` is yielded only if it is not
    empty. Lines are sliced one at a time instead of splitting the whole
    text up front.
    """
    assert text is not None
    assert "\r" not in text
    line_start = 0
    while True:
        line_end = text.find("\n", line_start)
        if line_end == -1:
            break
        yield text[line_start:line_end]
        line_start = line_end + 1
    remainder = text[line_start:]
    if remainder:
        yield remainder


def deprecated(reason: Optional[str]):  # pragma: no cover
    """
    Decorator to mark functions or classes as deprecated. Calling the
    decorated object emits a :py:exc:`DeprecationWarning` before delegating
    to the original.

    It can be used with a reason::

        @deprecated("please, use another function")
        def old_function(x, y):
            pass

    or without one::

        @deprecated
        def old_function(x, y):
            pass

    Anything else passed as ``reason`` raises a :py:exc:`TypeError`.

    Source: https://stackoverflow.com/questions/2536307/decorators-in-the-python-standard-lib-deprecated-specifically
    """

    def warning_wrapper_for(target, reason_suffix):
        # Shared by both usages; only the end of the message differs.
        class_or_func = "class" if inspect.isclass(target) else "function"

        @functools.wraps(target)
        def wrapper(*args, **kwargs):
            # Temporarily force the warning to show, then restore the default filter.
            warnings.simplefilter("always", DeprecationWarning)
            warnings.warn(
                f"Call to deprecated {class_or_func} {target.__name__}{reason_suffix}.",
                category=DeprecationWarning,
                stacklevel=2,
            )
            warnings.simplefilter("default", DeprecationWarning)
            return target(*args, **kwargs)

        return wrapper

    if isinstance(reason, str):
        # Used as ``@deprecated("some reason")``: return the actual decorator.
        def decorator(func1):
            return warning_wrapper_for(func1, f" ({reason})")

        return decorator

    if inspect.isclass(reason) or inspect.isfunction(reason):
        # Used as plain ``@deprecated``: ``reason`` is the decorated object itself.
        return warning_wrapper_for(reason, "")

    raise TypeError(repr(type(reason)))


def mapped_repr(type_, name_to_value_map) -> str:
    """
    Text of the form ``ClassName(name1=value1, name2=value2)`` where
    ``ClassName`` is the name of the class of ``type_`` and the names and
    values are taken from ``name_to_value_map`` in iteration order.
    """
    arguments = [f"{name}={value}" for name, value in name_to_value_map.items()]
    return f"{type_.__class__.__name__}({', '.join(arguments)})"


================================================
FILE: pygount/git_storage.py
================================================
import re
import shutil
from tempfile import mkdtemp
from typing import Optional

import git

#: Regular expression to detect a git URL with an optional tag or branch.
#: The named group ``remote_url`` spans everything up to and including
#: ``.git``; the optional named group ``revision`` holds the text after it,
#: without the separating ``/``.
# from https://stackoverflow.com/questions/2514859/regular-expression-for-git-repository server-name
_GIT_URL_REGEX = re.compile(
    r"(?P<remote_url>((git|ssh|http(s)?)|(git@[\w.-]+))(:(//)?)([\w.@:/\-~]+)(\.git))(/)?(?P<revision>[\w./\-]+)?"
)


def git_remote_url_and_revision_if_any(git_url: str) -> tuple[Optional[str], Optional[str]]:
    """
    The remote URL and revision found at the beginning of ``git_url``.

    The result is ``(None, None)`` if ``git_url`` does not match
    ``_GIT_URL_REGEX``. If it matches but has no revision part, the revision
    is ``None``.
    """
    assert git_url is not None
    url_match = _GIT_URL_REGEX.match(git_url)
    if url_match is None:
        return None, None
    return url_match.group("remote_url"), url_match.group("revision")


class GitStorage:
    """
    Temporary local folder to clone a remote git repository into.

    The folder is created when the storage is constructed and removed by
    :py:meth:`close()`. The storage can also be used as context manager, in
    which case the folder is removed when the ``with`` block is left, even
    if an error occurred::

        with GitStorage(remote_url, revision) as storage:
            storage.extract()
            ...  # work with storage.temp_folder
    """

    def __init__(self, remote_url: str, revision: Optional[str] = None):
        """
        :param remote_url: URL of the repository to clone
        :param revision: passed to ``git clone --branch`` if not ``None``
        """
        assert remote_url is not None
        self._remote_url = remote_url
        self._revision = revision
        self._temp_folder = mkdtemp()

    def __enter__(self) -> "GitStorage":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        self.close()
        # Never suppress an exception raised inside the ``with`` block.
        return False

    @property
    def temp_folder(self) -> str:
        """path of the temporary folder the repository is cloned into"""
        return self._temp_folder

    def extract(self):
        """
        Clone the remote repository into :py:attr:`temp_folder`, limiting
        the history to a depth of 1.
        """
        multi_options = ["--depth", "1"]
        if self._revision is not None:
            multi_options.extend(["--branch", self._revision])
        git.Repo.clone_from(self._remote_url, self._temp_folder, multi_options=multi_options)

    def close(self):
        """
        Remove the temporary folder and its contents, ignoring errors. It
        is safe to call this multiple times.
        """
        shutil.rmtree(self._temp_folder, ignore_errors=True)


================================================
FILE: pygount/lexers.py
================================================
"""
Additional lexers for pygount that fill gaps left by :py:mod:`pygments`.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import pygments.lexer
import pygments.lexers
import pygments.token
import pygments.util


class IdlLexer(pygments.lexers.JavaLexer):
    """
    Lexer for OMG Interface Definition Language (IDL) that simply uses the
    existing Java lexer to find comments. While this is useless for syntax
    highlighting, it is good enough for counting lines.
    """

    # Only the name and the file name patterns differ from the Java lexer.
    name = "IDL"
    filenames = ["*.idl"]


class MinimalisticM4Lexer(pygments.lexer.RegexLexer):
    """
    Minimalistic lexer for the m4 macro processor that only tells comments
    apart from code. A comment mark other than ``#`` (as set by redefining
    it in m4) is not recognized.
    """

    name = "M4"
    tokens = {
        "root": [
            # Line containing "#": the part before it is text, the rest is a comment.
            (
                r"(.*)(#.*\n)",
                pygments.lexer.bygroups(pygments.token.Text, pygments.token.Comment.Single),
            ),
            # Any other line is text.
            (r".*\n", pygments.token.Text),
        ]
    }


class MinimalisticVBScriptLexer(pygments.lexer.RegexLexer):
    """
    Minimalistic lexer for VBScript that only tells comments apart from
    code.
    """

    name = "VBScript"
    tokens = {
        "root": [
            # Line starting with optional white space followed by an apostrophe.
            (r"\s*'.*\n", pygments.token.Comment.Single),
            # Any other line is text.
            (r".*\n", pygments.token.Text),
        ]
    }


class MinimalisticWebFocusLexer(pygments.lexer.RegexLexer):
    """
    Minimalistic lexer for WebFOCUS that only tells comments apart from
    code.
    """

    name = "WebFOCUS"
    tokens = {
        "root": [
            # Line starting with "-*".
            (r"-\*.*\n", pygments.token.Comment.Single),
            # Any other line is text.
            (r".*\n", pygments.token.Text),
        ]
    }


class PlainTextLexer(pygments.lexer.RegexLexer):
    """
    Simple lexer for plain text. Lines that are empty or contain only white
    space are :py:data:`pygments.Token.Text`, all other lines are
    :py:data:`pygments.Token.Comment.Single`.

    This way, plain text files count as documentation.
    """

    name = "Text"
    tokens = {
        "root": [
            # Empty line or line with only white space.
            (r"\s*\n", pygments.token.Text),
            # Line with at least one character.
            (r".+\n", pygments.token.Comment.Single),
        ]
    }


================================================
FILE: pygount/summary.py
================================================
"""
Summaries of analyses of multiple source codes.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import functools
import re
from collections.abc import Hashable

from .analysis import SourceAnalysis
from .common import mapped_repr

#: Regular expression for names of pseudo languages: one or more lower case
#: ASCII letters enclosed in double underscores.
_PSEUDO_LANGUAGE_REGEX = re.compile("^__[a-z]+__$")


@functools.total_ordering
class LanguageSummary:
    """
    Summary of source code counts from multiple files of the same language.

    Counts are accumulated with :py:meth:`add()`. All ``*_percentage``
    properties except :py:attr:`file_percentage` are relative to
    :py:attr:`line_count` of this language. :py:attr:`file_percentage` is
    relative to the number of files in the project and requires a prior call
    to :py:meth:`update_file_percentage()`.

    Summaries are ordered by :py:meth:`sort_key()`.
    """

    def __init__(self, language: str):
        self._language = language
        self._code_count = 0
        self._documentation_count = 0
        self._empty_count = 0
        self._file_count = 0
        self._file_percentage = 0.0
        self._string_count = 0
        self._is_pseudo_language = _PSEUDO_LANGUAGE_REGEX.match(self.language) is not None
        self._has_up_to_date_percentages = False

    @property
    def language(self) -> str:
        """the language to be summarized"""
        return self._language

    @property
    def code_count(self) -> int:
        """sum lines of code for this language"""
        return self._code_count

    @property
    def code_percentage(self) -> float:
        """percentage of lines of this language containing code"""
        return _percentage_or_0(self.code_count, self.line_count)

    def _assert_has_up_to_date_percentages(self):
        assert self._has_up_to_date_percentages, "update_percentages() must be called first"

    @property
    def documentation_count(self) -> int:
        """sum lines of documentation for this language"""
        return self._documentation_count

    @property
    def documentation_percentage(self) -> float:
        """percentage of lines of this language containing documentation"""
        return _percentage_or_0(self.documentation_count, self.line_count)

    @property
    def empty_count(self) -> int:
        """sum empty lines for this language"""
        return self._empty_count

    @property
    def empty_percentage(self) -> float:
        """percentage of lines of this language that are empty"""
        return _percentage_or_0(self.empty_count, self.line_count)

    @property
    def file_count(self) -> int:
        """number of source code files for this language"""
        return self._file_count

    @property
    def file_percentage(self) -> float:
        """
        percentage of files in project; only available after
        :py:meth:`update_file_percentage()` has been called since the last
        :py:meth:`add()`
        """
        self._assert_has_up_to_date_percentages()
        return self._file_percentage

    @property
    def line_count(self) -> int:
        """sum count of all lines of any kind for this language"""
        return self.code_count + self.documentation_count + self.empty_count + self.string_count

    @property
    def string_count(self) -> int:
        """sum number of lines containing strings for this language"""
        return self._string_count

    @property
    def string_percentage(self) -> float:
        """percentage of lines of this language containing strings"""
        return _percentage_or_0(self.string_count, self.line_count)

    @property
    def source_count(self) -> int:
        """sum number of source lines of code, meaning code and string lines"""
        return self.code_count + self.string_count

    @property
    def source_percentage(self) -> float:
        """percentage of lines of this language that are source lines of code"""
        return _percentage_or_0(self.source_count, self.line_count)

    @property
    def is_pseudo_language(self) -> bool:
        """``True`` if the language is not a real programming language"""
        return self._is_pseudo_language

    def sort_key(self) -> Hashable:
        """sort key to sort multiple languages by importance"""
        return self.code_count, self.documentation_count, self.string_count, self.empty_count, self.language

    def __hash__(self):
        return hash(self.language)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with an AttributeError on sort_key().
        if not isinstance(other, LanguageSummary):
            return NotImplemented
        return self.sort_key() == other.sort_key()

    def __lt__(self, other):
        if not isinstance(other, LanguageSummary):
            return NotImplemented
        return self.sort_key() < other.sort_key()

    def add(self, source_analysis: SourceAnalysis) -> None:
        """
        Add counts from ``source_analysis`` to total counts for this language.

        The file always counts towards :py:attr:`file_count`; line counts
        are only added if ``source_analysis`` is countable.
        """
        assert source_analysis is not None
        assert source_analysis.language == self.language

        self._has_up_to_date_percentages = False
        self._file_count += 1
        if source_analysis.is_countable:
            self._code_count += source_analysis.code_count
            self._documentation_count += source_analysis.documentation_count
            self._empty_count += source_analysis.empty_count
            self._string_count += source_analysis.string_count

    def update_file_percentage(self, project_summary: "ProjectSummary"):
        """
        Recompute :py:attr:`file_percentage` relative to the total number of
        files in ``project_summary``.
        """
        self._file_percentage = _percentage_or_0(self.file_count, project_summary.total_file_count)
        self._has_up_to_date_percentages = True

    def __repr__(self):
        name_to_value_map = {
            "language": f"{self.language!r}",
            "file_count": self.file_count,
        }
        if not self.is_pseudo_language:
            # Line counts are only included for real languages.
            name_to_value_map.update(
                {
                    "code_count": self.code_count,
                    "documentation_count": self.documentation_count,
                    "empty_count": self.empty_count,
                    "string_count": self.string_count,
                }
            )
        return mapped_repr(self, name_to_value_map)


def _percentage_or_0(partial_count: int, total_count: int) -> float:
    """
    Percentage of ``partial_count`` relative to ``total_count``, or 0.0 if
    ``total_count`` is 0. Both counts must not be negative.
    """
    assert partial_count >= 0
    assert total_count >= 0
    if total_count == 0:
        return 0.0
    return 100 * partial_count / total_count


class ProjectSummary:
    """
    Summary of source code counts for several languages and files.

    Counts are accumulated with :py:meth:`add()`. The ``total_*_percentage``
    properties are relative to :py:attr:`total_line_count`.
    """

    def __init__(self):
        self._language_to_language_summary_map = {}
        self._total_file_count = 0
        self._total_line_count = 0
        self._total_code_count = 0
        self._total_documentation_count = 0
        self._total_empty_count = 0
        self._total_string_count = 0

    @property
    def language_to_language_summary_map(self) -> dict[str, LanguageSummary]:
        """
        A map containing summarized counts for each language added with :py:meth:`add()` so far.
        """
        return self._language_to_language_summary_map

    @property
    def total_code_count(self) -> int:
        return self._total_code_count

    @property
    def total_code_percentage(self) -> float:
        return _percentage_or_0(self.total_code_count, self.total_line_count)

    @property
    def total_documentation_count(self) -> int:
        return self._total_documentation_count

    @property
    def total_documentation_percentage(self) -> float:
        return _percentage_or_0(self.total_documentation_count, self.total_line_count)

    @property
    def total_empty_count(self) -> int:
        return self._total_empty_count

    @property
    def total_empty_percentage(self) -> float:
        return _percentage_or_0(self.total_empty_count, self.total_line_count)

    @property
    def total_file_count(self) -> int:
        return self._total_file_count

    @property
    def total_line_count(self) -> int:
        return self._total_line_count

    @property
    def total_source_count(self) -> int:
        return self.total_code_count + self.total_string_count

    @property
    def total_source_percentage(self) -> float:
        return _percentage_or_0(self.total_source_count, self.total_line_count)

    @property
    def total_string_count(self) -> int:
        return self._total_string_count

    @property
    def total_string_percentage(self) -> float:
        return _percentage_or_0(self.total_string_count, self.total_line_count)

    def add(self, source_analysis: SourceAnalysis) -> None:
        """
        Add counts from ``source_analysis`` to total counts.

        The file always counts towards :py:attr:`total_file_count` and the
        summary of its language; line counts are only added to the totals if
        ``source_analysis`` is countable.
        """
        self._total_file_count += 1
        language = source_analysis.language
        summaries = self.language_to_language_summary_map
        try:
            language_summary = summaries[language]
        except KeyError:
            # First file of this language.
            language_summary = LanguageSummary(language)
            summaries[language] = language_summary
        language_summary.add(source_analysis)

        if not source_analysis.is_countable:
            return

        code_count = source_analysis.code_count
        documentation_count = source_analysis.documentation_count
        empty_count = source_analysis.empty_count
        string_count = source_analysis.string_count
        self._total_code_count += code_count
        self._total_documentation_count += documentation_count
        self._total_empty_count += empty_count
        self._total_string_count += string_count
        self._total_line_count += code_count + documentation_count + empty_count + string_count

    def update_file_percentages(self) -> None:
        """Update percentages for all languages part of the project."""
        for summary_of_language in self._language_to_language_summary_map.values():
            summary_of_language.update_file_percentage(self)

    def __repr__(self):
        languages = sorted(self.language_to_language_summary_map.keys())
        return (
            f"{self.__class__.__name__}("
            f"total_file_count={self.total_file_count}, "
            f"total_line_count={self.total_line_count}, "
            f"languages={languages})"
        )


================================================
FILE: pygount/write.py
================================================
"""
Writers to store the results of a pygount analysis.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import datetime
import json
import math
import os
from xml.etree import ElementTree

from rich.console import Console
from rich.table import Table

import pygount

from . import SourceAnalysis
from .summary import ProjectSummary

#: Version of cloc the --format=cloc-xml pretends to be.
CLOC_VERSION = "1.60"

#: Version of the JSON output format, stored as "formatVersion" by :py:class:`JsonWriter`.
JSON_FORMAT_VERSION = "1.1.0"


class BaseWriter:
    """
    Base class for writers that collect source analyses in a
    :py:class:`ProjectSummary` and, when closed, compute runtime statistics.

    Writers can be used as context manager, which calls :py:meth:`close()`
    when leaving the ``with`` block.
    """

    def __init__(self, target_stream):
        self._target_stream = target_stream
        # Streams without a name, for example io.StringIO, get a placeholder.
        self.target_name = getattr(target_stream, "name", "<io>")
        self.project_summary = ProjectSummary()
        self.has_to_track_progress = True
        self.started_at = self._utc_now()
        self.finished_at = None
        self.duration = None
        self.duration_in_seconds = 0.0
        self.files_per_second = 0
        self.lines_per_second = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def add(self, source_analysis):
        """Add ``source_analysis`` to the project summary."""
        self.project_summary.add(source_analysis)

    def close(self):
        """
        Update the file percentages of the project summary and compute the
        duration and throughput since the writer was created.
        """
        summary = self.project_summary
        summary.update_file_percentages()
        self.finished_at = self._utc_now()
        elapsed = self.finished_at - self.started_at
        self.duration = elapsed
        elapsed_seconds = elapsed.microseconds * 1e-6 + elapsed.seconds + elapsed.days * 3600 * 24
        # Use at least a millisecond to avoid a division by zero below.
        self.duration_in_seconds = max(0.001, elapsed_seconds)
        self.lines_per_second = summary.total_line_count / self.duration_in_seconds
        self.files_per_second = summary.total_file_count / self.duration_in_seconds

    @staticmethod
    def _utc_now() -> datetime.datetime:
        # After switching to Python 3.11+, we can change this to `now(datetime.UTC)`.
        return datetime.datetime.now(datetime.timezone.utc)


class LineWriter(BaseWriter):
    """
    Writer that writes one line of text for each source code, consisting of
    the tab separated number of source lines (code and strings), language,
    group and path.
    """

    def __init__(self, target_stream):
        super().__init__(target_stream)
        self.has_to_track_progress = False

    def add(self, source_analysis):
        source_line_count = source_analysis.code_count + source_analysis.string_count
        self._target_stream.write(
            f"{source_line_count}\t{source_analysis.language}\t{source_analysis.group}\t{source_analysis.path}"
            + os.linesep
        )


class ClocXmlWriter(BaseWriter):
    """
    Writer for XML output similar to what cloc produces with the options
    --by-file --xml. This kind of output can be processed by Jenkins'
    SLOCCount plug-in.

    The XML is built in memory and written to the target stream by
    :py:meth:`close()`.
    """

    # NOTE(review): ElementTree.SubElement() stores extra keyword arguments
    # as XML attributes, so the ``text=...`` values in this class end up as
    # attributes named "text", not as element text. Existing output is kept
    # as is; confirm whether consumers rely on it before changing it.

    def __init__(self, target_stream):
        super().__init__(target_stream)
        self._results_element = ElementTree.Element("results")
        self._header_element = ElementTree.SubElement(self._results_element, "header")
        for tag, value in (
            ("cloc_url", "https://github.com/roskakori/pygount"),
            ("cloc_version", CLOC_VERSION),
        ):
            ElementTree.SubElement(self._header_element, tag, text=value)
        self._files_element = ElementTree.SubElement(self._results_element, "files")

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            # Only write the XML if everything works out.
            self.close()

    def add(self, source_analysis: SourceAnalysis):
        super().add(source_analysis)
        ElementTree.SubElement(
            self._files_element,
            "file",
            attrib={
                "blank": str(source_analysis.empty_count),
                "code": str(source_analysis.source_count),
                "comment": str(source_analysis.documentation_count),
                "language": source_analysis.language,
                "name": source_analysis.path,
            },
        )

    def close(self):
        super().close()
        summary = self.project_summary

        # Add various statistics to <header>.
        header_statistics = (
            ("elapsed_seconds", str(self.duration_in_seconds)),
            ("n_files", str(summary.total_file_count)),
            ("n_lines", str(summary.total_line_count)),
            ("files_per_second", f"{self.files_per_second:f}"),
            ("lines_per_second", f"{self.lines_per_second:f}"),
            ("report_file", self.target_name),
        )
        for tag, value in header_statistics:
            ElementTree.SubElement(self._header_element, tag, text=value)

        # Add totals to <files>.
        ElementTree.SubElement(
            self._files_element,
            "total",
            attrib={
                "blank": str(summary.total_empty_count),
                "code": str(summary.total_code_count + summary.total_string_count),
                "comment": str(summary.total_documentation_count),
            },
        )

        # Write the whole XML file.
        target_encoding = self._target_stream.encoding
        if target_encoding is not None:
            # Write XML declaration only for files but skip it for io.StringIO.
            self._target_stream.write(f'<?xml version="1.0" encoding="{target_encoding}"?>')
        ElementTree.ElementTree(self._results_element).write(
            self._target_stream, encoding="unicode", xml_declaration=False
        )


class SummaryWriter(BaseWriter):
    """
    Writer that prints a table with one row per language and a final row
    with the sums, intended to be read by humans.
    """

    _COLUMNS_WITH_JUSTIFY = (
        ("Language", "left"),
        ("Files", "right"),
        ("%", "right"),
        ("Code", "right"),
        ("%", "right"),
        ("Comment", "right"),
        ("%", "right"),
    )

    def close(self):
        super().close()
        summary = self.project_summary

        table = Table()
        for title, alignment in self._COLUMNS_WITH_JUSTIFY:
            table.add_column(title, justify=alignment, overflow="fold")

        # Sort languages in descending order of their sort key.
        language_summaries = sorted(summary.language_to_language_summary_map.values(), reverse=True)
        last_index = len(language_summaries) - 1
        for index, language_summary in enumerate(language_summaries):
            table.add_row(
                language_summary.language,
                str(language_summary.file_count),
                formatted_percentage(language_summary.file_percentage),
                str(language_summary.code_count),
                formatted_percentage(language_summary.code_percentage),
                str(language_summary.documentation_count),
                formatted_percentage(language_summary.documentation_percentage),
                # Separate the last language from the sums below.
                end_section=(index == last_index),
            )
        table.add_row(
            "Sum",
            str(summary.total_file_count),
            formatted_percentage(100.0),
            str(summary.total_code_count),
            formatted_percentage(summary.total_code_percentage),
            str(summary.total_documentation_count),
            formatted_percentage(summary.total_documentation_percentage),
        )
        console = Console(file=self._target_stream, soft_wrap=True)
        console.print(table)


class JsonWriter(BaseWriter):
    """
    Writer for JSON output, ideal for further automatic processing.

    The data for each file are collected by :py:meth:`add()`; the JSON
    document is written to the target stream by :py:meth:`close()`.
    """

    # NOTE: JSON names use camel case to follow JSLint's guidelines, see <https://www.jslint.com/>.

    def __init__(self, target_stream):
        super().__init__(target_stream)
        self.source_analyses = []

    def add(self, source_analysis: SourceAnalysis):
        super().add(source_analysis)
        file_entry = {
            "codeCount": source_analysis.code_count,
            "documentationCount": source_analysis.documentation_count,
            "emptyCount": source_analysis.empty_count,
            "group": source_analysis.group,
            "isCountable": source_analysis.is_countable,
            "language": source_analysis.language,
            "lineCount": source_analysis.line_count,
            "path": source_analysis.path,
            "state": source_analysis.state.name,
            "stateInfo": source_analysis.state_info,
            "sourceCount": source_analysis.source_count,
        }
        self.source_analyses.append(file_entry)

    def close(self):
        super().close()
        summary = self.project_summary

        def language_entry(language_summary):
            return {
                "documentationCount": language_summary.documentation_count,
                "documentationPercentage": language_summary.documentation_percentage,
                "codeCount": language_summary.code_count,
                "codePercentage": language_summary.code_percentage,
                "emptyCount": language_summary.empty_count,
                "emptyPercentage": language_summary.empty_percentage,
                "fileCount": language_summary.file_count,
                "filePercentage": language_summary.file_percentage,
                "isPseudoLanguage": language_summary.is_pseudo_language,
                "language": language_summary.language,
                "sourceCount": language_summary.source_count,
                "sourcePercentage": language_summary.source_percentage,
                "stringCount": language_summary.string_count,
                "stringPercentage": language_summary.string_percentage,
            }

        language_entries = [
            language_entry(language_summary)
            for language_summary in summary.language_to_language_summary_map.values()
        ]
        runtime_entry = {
            "durationInSeconds": self.duration_in_seconds,
            "filesPerSecond": self.files_per_second,
            "finishedAt": self.finished_at.isoformat(),
            "linesPerSecond": self.lines_per_second,
            "startedAt": self.started_at.isoformat(),
        }
        summary_entry = {
            "totalCodeCount": summary.total_code_count,
            "totalCodePercentage": summary.total_code_percentage,
            "totalDocumentationCount": summary.total_documentation_count,
            "totalDocumentationPercentage": summary.total_documentation_percentage,
            "totalEmptyCount": summary.total_empty_count,
            "totalEmptyPercentage": summary.total_empty_percentage,
            "totalFileCount": summary.total_file_count,
            "totalSourceCount": summary.total_source_count,
            "totalSourcePercentage": summary.total_source_percentage,
            "totalStringCount": summary.total_string_count,
            "totalStringPercentage": summary.total_string_percentage,
        }
        json_map = {
            "formatVersion": JSON_FORMAT_VERSION,
            "pygountVersion": pygount.__version__,
            "files": self.source_analyses,
            "languages": language_entries,
            "runtime": runtime_entry,
            "summary": summary_entry,
        }
        json.dump(json_map, self._target_stream)


def digit_width(line_count: int) -> int:
    """
    Number of decimal digits needed to show ``line_count``, which must not
    be negative. For 0 this is 1.
    """
    assert line_count >= 0
    # Count digits with exact integer arithmetic. The floating point
    # logarithm is off by one for large values, for example 10**15.
    return len(str(line_count))


def formatted_percentage(percentage: float) -> str:
    """
    Text for ``percentage`` with one digit after the decimal point. The
    percentage must be between 0.0 and 100.0.
    """
    assert percentage >= 0.0
    assert percentage <= 100.0
    result = f"{percentage:.01f}"
    return result


================================================
FILE: pygount/xmldialect.py
================================================
"""
Function to obtain the language dialect used by XML source code.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import logging
import re
import xml.sax

from pygount.common import WHITE_SPACE_CHARACTERS

# TODO #10: Replace regex for DTD by working DTD handler.
#: Regular expression to obtain DTD. The group ``name`` holds the name
#: after ``<!DOCTYPE`` and the group ``public_id`` the quoted text after
#: ``PUBLIC``.
_DTD_REGEX = re.compile(r'<!DOCTYPE\s+(?P<name>[a-zA-Z][a-zA-Z-]*)\s+PUBLIC\s+"(?P<public_id>.+)"')
#: Pairs of a regex pattern for the public ID of a DTD and the name of the
#: dialect it indicates.
_REGEX_PATTERNS_AND_DIALECTS = (
    (".*DocBook.*", "DocBook XML"),
    (".+ SVG .+", "SVG XML"),
)
#: Same pairs as above but with the patterns compiled to regular expressions.
_REGEXES_AND_DIALECTS = [(re.compile(pattern), dialect) for pattern, dialect in _REGEX_PATTERNS_AND_DIALECTS]
# Sanity check on import: every pair has a pattern and a non-blank dialect.
for public_id_regex, dialect in _REGEX_PATTERNS_AND_DIALECTS:
    assert public_id_regex is not None
    assert dialect is not None
    assert dialect.strip() != ""
#: Regex to detect Sax error messages with uninformative paths like '<unknown>'.
#: The group ``message_without_path`` holds the rest of the message,
#: starting at the ``:line:column`` part.
_SAX_MESSAGE_WITHOUT_PATH_PATTERN = re.compile(r"^<.+>(?P<message_without_path>:\d+:\d+.+)")

_log = logging.getLogger("pygount")


class SaxParserDone(Exception):
    """
    Pseudo error to indicate that the Sax parser is done, either because
    the dialect has been detected or because the parser has given up.
    """


class XmlDialectHandler(xml.sax.ContentHandler, xml.sax.handler.DTDHandler):
    """
    Sax handler that detects the dialect of an XML document from the path
    and attributes of its elements.

    Once a dialect has been detected, it is stored in ``dialect`` and
    parsing is stopped by raising :py:exc:`SaxParserDone`. The same error
    is raised without setting ``dialect`` when the number of elements seen
    reaches ``max_element_count``.
    """

    def __init__(self, max_element_count=100):
        super().__init__()
        self.dialect = None
        # Path of the current element, for example "/book/title".
        self._path = ""
        self._element_count = 0
        self._max_element_count = max_element_count

    def _set_dialect_and_stop_parsing(self, dialect):
        self.dialect = dialect
        raise SaxParserDone(f"language detected: {dialect}")

    def startElement(self, name, attrs):
        self._element_count += 1
        if self._element_count == self._max_element_count:
            raise SaxParserDone(f"no language found after parsing {self._element_count} elements")
        self._path += "/" + name
        xmlns = attrs.get("xmlns", "")

        # The first matching rule wins.
        detected_dialect = None
        if self._path == "/project" and "name" in attrs:
            detected_dialect = "Ant"
        elif self._path in ("/book/title", "/chapter/title") or xmlns == "http://docbook.org/ns/docbook":
            detected_dialect = "DocBook XML"
        elif xmlns == "http://xmlns.jcp.org/xml/ns/javaee":
            detected_dialect = "JavaEE XML"
        elif xmlns.startswith("http://maven.apache.org/POM"):
            detected_dialect = "Maven"
        elif xmlns.startswith("http://www.netbeans.org/ns/project/"):
            detected_dialect = "NetBeans Project"

        if detected_dialect is not None:
            self._set_dialect_and_stop_parsing(detected_dialect)

    def endElement(self, name):
        # Remove the trailing "/name" from the current path.
        length_to_remove = len(name) + 1
        self._path = self._path[:-length_to_remove]


def xml_dialect(xml_path, xml_code):
    """
    The dialect of the XML document ``xml_code`` or ``None`` if it cannot be
    determined.

    First the public ID of a DTD at the beginning of the document (after an
    optional XML header) is checked against the known patterns. If this
    yields nothing, the document is parsed with a :py:class:`XmlDialectHandler`.

    Errors while parsing are logged as warnings; ``xml_path`` is only used
    for these log messages.
    """
    # TODO #10: Remove hack to obtain DTD using a regex instead of a DTDHandler.
    dtd_match = _DTD_REGEX.match(without_xml_header(xml_code))
    if dtd_match is not None:
        public_id = dtd_match.group("public_id")
        dialect_from_dtd = next(
            (
                known_dialect
                for known_public_id_regex, known_dialect in _REGEXES_AND_DIALECTS
                if known_public_id_regex.match(public_id)
            ),
            None,
        )
        if dialect_from_dtd is not None:
            return dialect_from_dtd

    handler = XmlDialectHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    for feature_to_disable in (
        xml.sax.handler.feature_external_ges,
        xml.sax.handler.feature_external_pes,
        xml.sax.handler.feature_validation,
    ):
        parser.setFeature(feature_to_disable, False)
    try:
        parser.feed(xml_code)
        # NOTE: We can only call close() when the parser has finished,
        # otherwise close() raises a SAXException('parser finished').
        parser.close()
    except SaxParserDone:
        # Language has been determined or the parser has given up.
        pass
    except (ValueError, xml.sax.SAXException) as error:
        # NOTE: ValueError is raised on unknown url type.
        error_message = str(error)
        uninformative_path_match = _SAX_MESSAGE_WITHOUT_PATH_PATTERN.match(error_message)
        if uninformative_path_match is not None:
            # HACK: Replace uninformative sax path like '<unknown>' with actual XML path.
            error_message = xml_path + uninformative_path_match.group("message_without_path")
        _log.warning(error_message)
    except OSError as error:
        _log.warning("%s: cannot analyze XML dialect: %s", xml_path, error)
    return handler.dialect


def without_xml_header(xml_code: str) -> str:
    """
    Return ``xml_code`` without leading white space and without a leading
    ``<?xml ... ?>`` declaration (including the white space following it).

    If the code starts with ``<?xml`` but has no closing ``?>``, only the
    leading white space is removed.
    """
    stripped_code = xml_code.lstrip(WHITE_SPACE_CHARACTERS)
    if not stripped_code.startswith("<?xml"):
        return stripped_code
    _, declaration_end, after_declaration = stripped_code.partition("?>")
    if not declaration_end:
        # Unterminated declaration: leave it in place.
        return stripped_code
    return after_declaration.lstrip(WHITE_SPACE_CHARACTERS)


================================================
FILE: pyproject.toml
================================================
[project]
name = "pygount"
version = "3.3.0"
description = "count source lines of code (SLOC) using pygments"
authors = [{ name = "Thomas Aglassinger", email = "roskakori@users.sourceforge.net" }]
requires-python = ">=3.10, <4"
readme = "README.md"
license = "BSD-3-Clause"
keywords = [
    "code analysis",
    "count",
    "SLOC",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: BSD License",
    "Natural Language :: English",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Topic :: Software Development",
]
dependencies = [
    "chardet>=5,<6",
    "gitpython~=3.1",
    "pygments>=2,<3",
    "rich>=14",
]

[project.urls]
Homepage = "https://github.com/roskakori/pygount"
Repository = "https://github.com/roskakori/pygount.git"
Documentation = "https://pygount.readthedocs.io"
"Issue Tracker" = "https://github.com/roskakori/pygount/issues"
Changes = "https://pygount.readthedocs.io/en/latest/changes.html"

[project.scripts]
pygount = "pygount.command:main"

[tool.pytest.ini_options]
minversion = "9.0"
addopts = [
    "-rA"
]
testpaths = [
    "tests",
]

[dependency-groups]
dev = [
    "coveralls>=4,<5",
    "coverage>=7,<8",
    "hatchling>=1.27.0",
    "mkdocs>=1.6,<2",
    "mkdocs-material>=9",
    "pytest>=9.0.3",
    "pytest-cov>=7,<8",
    "pre-commit>=4,<5",
    "ruff>=0.15",
]

[tool.uv]
default-groups = [
    "dev",
]

[tool.hatch.build.targets.sdist]
exclude = [".idea", ".github", ".readthedocs.yaml"]

[tool.hatch.build.targets.wheel]
packages = ["pygount"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.ruff]
exclude = [
    ".eggs",
    ".git",
    ".pytest_cache",
    ".pytype",
    ".ruff_cache",
    ".vscode",
    "__pypackages__",
    "_build",
    "build",
    "dist",
    "htmlcov",
]
line-length = 120
target-version = "py39"

[tool.ruff.lint]
ignore = [
    # Missing trailing comma → May cause conflicts when used with the formatter.
    "COM812",
    # Too many branches
    "PLR0912",
    # Too many arguments in function definition
    "PLR0913",
    # Too many statements
    "PLR0915",
    # Magic value used in comparison
    "PLR2004",
    # TODO#89 Enable checks for usage of pathlib.
    "PTH100",
    "PTH103",
    "PTH107",
    "PTH109",
    "PTH110",
    "PTH112",
    "PTH114",
    "PTH118",
    "PTH119",
    "PTH120",
    "PTH122",
    "PTH123",
    "PTH202",
    "PTH207",
    "PTH208",
    # Unnecessary assign → We regularly use `result = ...; return result` to examine the result in the debugger.
    "RET504",
    # TODO#506 Enable RUF012 check for mutable class attributes.
    # Mutable class attributes should be annotated with `typing.ClassVar`
    "RUF012",
    # Avoid specifying long messages outside the exception class
    "TRY003",
    # Abstract `raise` to an inner function
    "TRY301",
]
select = [
    # flake8-builtins
    "A",
    # flake8-bugbear
    "B",
    # flake8-commas
    "COM",
    # flake8-comprehensions
    "C4",
    # flake8-django
    "DJ",
    # flake8-datetimez
    "DTZ",
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # isort
    "I",
    # flake8-no-pep420
    "INP",
    #  flake8-gettext
    "INT",
    # flake8-logging
    "LOG",
    # perflint
    "PERF",
    # pygrep-hooks
    "PGH",
    # flake8-pie
    "PIE",
    # pylint
    "PL",
    # flake8-use-pathlib
    "PTH",
    # refactor
    "R",
    # flake8-raise
    "RSE",
    # flake8-return
    "RET",
    # ruff specific rules
    "RUF",
    # flake8-self
    "SLF",
    # flake8-simplify
    "SIM",
    # tryceratops
    "TRY",
    # flake8-debugger
    "T10",
    # flake8-print
    "T20",
    # pyupgrade
    "UP",
]

[tool.ruff.lint.isort]
known-first-party = ["pygount", "scripts", "tests"]


================================================
FILE: scripts/build_documentation.sh
================================================
#!/bin/sh
# Build the documentation using MkDocs.
# Abort on the first failing command.
set -e
echo "📖 Building documentation"
mkdocs build
echo "✅ Successfully built documentation in site/index.html"


================================================
FILE: scripts/build_movie.sh
================================================
#!/bin/sh
# Build a gource movie about the development.
#
# For this to work, use macOS and install the following:
#
#   brew install gource ffmpeg
#
# See also: <https://www.ekreative.com/blog/producing-your-own-git-repository-animated-visualization-video/>
#
# NOTE(review): The "build" folder is created below, but the movie is written
# to /tmp/pygount_movie.mp4 - confirm which location is intended.
#
# Abort on the first failing command and echo each command before running it.
set -ex
mkdir -p build
# Pipe the frames rendered by gource as a PPM stream into ffmpeg, which encodes them as H.264.
gource --auto-skip-seconds 1 --file-idle-time 0 --hide dirnames,filenames,mouse --seconds-per-day 1 --title Pygount -1920x1080 --output-ppm-stream - . | ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libx264 -preset ultrafast -pix_fmt yuv420p -crf 1 -threads 0 -bf 0 /tmp/pygount_movie.mp4


================================================
FILE: scripts/test_coverage.sh
================================================
#!/bin/sh
# Run the test suite with branch coverage for the pygount package and write
# an HTML coverage report.
# Abort on the first failing command.
set -e
uv run pytest --cov-reset --cov=pygount --cov-branch --cov-report html
echo "To view results run: firefox htmlcov/index.html &"


================================================
FILE: scripts/update_dependencies.sh
================================================
#!/bin/sh
# Update the locked project dependencies and the pre-commit hooks to current versions.
# Abort on the first failing command.
set -e
echo "🧱 Updating project"
uv sync
uv lock --upgrade
echo "🛠️ Updating pre-commit"
uv run pre-commit autoupdate
echo "🎉 Successfully updated dependencies"


================================================
FILE: tests/__init__.py
================================================
# Deliberately left empty.


================================================
FILE: tests/_common.py
================================================
"""
Common constants and functions used by multiple tests.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import os
import shutil
import unittest
from collections.abc import Iterator, Sequence
from contextlib import contextmanager
from tempfile import NamedTemporaryFile
from typing import IO, TextIO, Union

PYGOUNT_PROJECT_FOLDER = os.path.dirname(os.path.dirname(__file__))
PYGOUNT_SOURCE_FOLDER = os.path.join(PYGOUNT_PROJECT_FOLDER, "pygount")


class TempFolderTest(unittest.TestCase):
    """
    Base class for tests that need scratch files.

    ``setUp()`` creates the folder ``tests/.temp`` inside the project folder
    and stores its path in ``tests_temp_folder``; ``tearDown()`` removes the
    folder including everything in it.
    """

    def setUp(self):
        self.tests_temp_folder = os.path.join(PYGOUNT_PROJECT_FOLDER, "tests", ".temp")
        os.makedirs(self.tests_temp_folder, exist_ok=True)

    def create_temp_file(
        self, relative_target_path, content: Union[str, bytes, Sequence[str]], encoding="utf-8", do_create_folder=False
    ):
        """
        Create a file in the temp folder and return its full path.

        :param relative_target_path: path of the file relative to ``tests_temp_folder``
        :param content: a ``str`` written as is using ``encoding``, ``bytes`` written
            unchanged in binary mode, or a sequence of lines where each line is
            written followed by a newline
        :param encoding: encoding used for text content; ignored for ``bytes``
        :param do_create_folder: if true, create missing parent folders of the target
        """
        result = os.path.join(self.tests_temp_folder, relative_target_path)
        if do_create_folder:
            os.makedirs(os.path.dirname(result), exist_ok=True)
        if isinstance(content, bytes):
            # A text stream rejects bytes with a TypeError, so binary content
            # needs a binary stream.
            with open(result, "wb") as target_file:
                target_file.write(content)
        else:
            with open(result, "w", encoding=encoding) as target_file:
                if isinstance(content, str):
                    target_file.write(content)
                else:
                    for line in content:
                        target_file.write(line)
                        target_file.write("\n")
        return result

    def create_temp_binary_file(self, relative_target_path, content: bytes):
        """Write ``content`` unchanged to a file in the temp folder and return its full path."""
        result = os.path.join(self.tests_temp_folder, relative_target_path)
        with open(result, "wb") as target_file:
            target_file.write(content)
        return result

    def tearDown(self):
        shutil.rmtree(self.tests_temp_folder)


@contextmanager
def temp_binary_file(data: bytes) -> Iterator[IO]:
    """
    Yield a named temporary file with the suffix ``.bin`` that contains
    ``data`` and is positioned at its start. The file is removed when the
    context is left.
    """
    binary_file = NamedTemporaryFile(mode="wb+", suffix=".bin")
    with binary_file:
        binary_file.write(data)
        # Make the content visible to readers that open the file by name.
        binary_file.flush()
        binary_file.seek(0)
        yield binary_file


@contextmanager
def temp_source_file(suffix: str, lines: list[str], *, encoding: str = "utf-8") -> Iterator[TextIO]:
    """
    Yield a named temporary text file with the suffix ``.{suffix}`` that
    contains ``lines`` joined by newlines (without a trailing newline) and is
    positioned at its start. The file is removed when the context is left.
    """
    source_file = NamedTemporaryFile(encoding=encoding, mode="w+", suffix=f".{suffix}")
    with source_file:
        source_file.write("\n".join(lines))
        # Make the content visible to readers that open the file by name.
        source_file.flush()
        source_file.seek(0)
        yield source_file


================================================
FILE: tests/test_analysis.py
================================================
"""
Tests for pygount source code analysis.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import glob
import os
import unittest
from io import BytesIO, StringIO

import pytest
from pygments import lexers, token

import pygount
from pygount import Error as PygountError
from pygount import analysis, common
from pygount.analysis import (
    _delined_tokens,
    _line_parts,
    _pythonized_comments,
    base_language,
    guess_lexer,
    is_markup_file,
)

from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest, temp_source_file
from .test_xmldialect import EXAMPLE_ANT_CODE


class SourceScannerTest(TempFolderTest):
    """
    Tests for ``analysis.SourceScanner`` and the paths yielded by its ``source_paths()``.

    Several tests pass URLs of remote git repositories to the scanner.
    """

    def setUp(self):
        super().setUp()
        self._tests_folder = os.path.dirname(__file__)

    # A scanner without any source patterns yields no paths.
    def test_can_find_no_files(self):
        scanner = analysis.SourceScanner([])
        actual_paths = list(scanner.source_paths())
        assert actual_paths == []

    def test_can_find_any_files(self):
        scanner = analysis.SourceScanner([PYGOUNT_SOURCE_FOLDER])
        actual_paths = list(scanner.source_paths())
        assert actual_paths != []

    # Passing "py" as second argument must limit the result to paths ending in ".py".
    def test_can_find_python_files(self):
        scanner = analysis.SourceScanner([PYGOUNT_SOURCE_FOLDER], "py")
        actual_paths = list(scanner.source_paths())
        assert actual_paths != []
        for path_data in actual_paths:
            actual_suffix = os.path.splitext(path_data.source_path)[1]
            assert actual_suffix == ".py"

    # Files in a folder whose name starts with a dot (".skip") must not show up in the result.
    def test_can_skip_dot_folder(self):
        project_folder_name = "project"
        project_folder = os.path.join(self.tests_temp_folder, project_folder_name)
        name_to_include = "include.py"
        relative_path_to_include = os.path.join(project_folder_name, "include", name_to_include)
        self.create_temp_file(relative_path_to_include, "include = 1", do_create_folder=True)
        relative_path_to_skip = os.path.join(project_folder_name, ".skip", "skip.py")
        self.create_temp_file(relative_path_to_skip, "skip = 2", do_create_folder=True)

        scanner = analysis.SourceScanner([project_folder])
        scanned_names = [os.path.basename(path_data.source_path) for path_data in scanner.source_paths()]
        assert scanned_names == [name_to_include]

    # Repository URLs without a ".git" suffix must be scanned without raising an error.
    def test_succeeds_on_not_git_extension(self):
        non_repo_urls = [["https://github.com/roskakori/pygount/"], ["git@github.com:roskakori/pygount"]]
        for non_repo_url in non_repo_urls:
            with analysis.SourceScanner(non_repo_url) as scanner:
                _ = list(scanner.source_paths())

    # URLs that do not refer to a git repository must raise a pygount.Error mentioning "URL to git repository".
    def test_fails_on_non_git_urls(self):
        non_repo_urls = [["https://no/git/url"], ["https://google.com/nogit"]]
        for non_repo_url in non_repo_urls:
            with (
                analysis.SourceScanner(non_repo_url) as scanner,
                pytest.raises(pygount.Error, match="URL to git repository"),
            ):
                _ = list(scanner.source_paths())

    # Same as test_can_find_python_files() but scanning the current folder given as ".".
    def test_can_find_python_files_in_dot(self):
        scanner = analysis.SourceScanner(["."], "py")
        actual_paths = list(scanner.source_paths())
        assert actual_paths != []
        for path_data in actual_paths:
            actual_suffix = os.path.splitext(path_data.source_path)[1]
            assert actual_suffix == ".py"

    # A remote git URL and a local folder can be combined in a single scan.
    def test_can_find_files_from_mixed_cloned_git_remote_url_and_local(self):
        git_remote_url = "https://github.com/roskakori/pygount.git"
        with analysis.SourceScanner([git_remote_url, PYGOUNT_SOURCE_FOLDER]) as scanner:
            actual_paths = list(scanner.source_paths())
            assert actual_paths != []
            assert actual_paths[0].source_path != actual_paths[-1].source_path
            # NOTE(review): presumably tmp_dir is the folder the remote repository was cloned to - confirm.
            assert actual_paths[-1].tmp_dir is not None


class AnalysisTest(unittest.TestCase):
    """Tests for the token and line part helpers of ``pygount.analysis``."""

    def test_can_deline_tokens(self):
        # Each comment text must be split after every newline it contains.
        comment_texts_and_expected_parts = [
            ("# a", ["# a"]),
            ("# a\n#  b", ["# a\n", "#  b"]),
            ("# a\n#  b\n", ["# a\n", "#  b\n"]),
            ("# a\n#  b\n # c\n", ["# a\n", "#  b\n", " # c\n"]),
        ]
        for comment_text, expected_parts in comment_texts_and_expected_parts:
            actual_tokens = list(_delined_tokens([(token.Comment, comment_text)]))
            assert actual_tokens == [(token.Comment, expected_part) for expected_part in expected_parts]

    def test_can_compute_python_line_parts(self):
        lexer = lexers.get_lexer_by_name("python")
        assert list(_line_parts(lexer, "#")) == [{"d"}]
        assert list(_line_parts(lexer, "s = 'x'  # x")) == [{"c", "d", "s"}]

    def test_can_detect_white_text(self):
        lexer = lexers.get_lexer_by_name("python")
        for white_code in ("{[()]};", "pass"):
            assert list(_line_parts(lexer, white_code)) == [set()]

    def test_can_convert_python_strings_to_comments(self):
        source_code = '#!/bin/python\n"Some tool."\n#(C) by me\ndef x():\n    "Some function"\n    return 1'
        lexer = lexers.get_lexer_by_name("python")
        pythonized_tokens = list(_pythonized_comments(_delined_tokens(lexer.get_tokens(source_code))))
        for token_type, _ in pythonized_tokens:
            assert token_type not in token.String

    def test_can_analyze_python(self):
        python_lines = [
            '"Some tool."',
            "#!/bin/python",
            "#(C) by me",
            "def x():",
            '    "Some function"',
            '    return "abc"',
        ]
        assert _line_parts_with_detected_markup("python", python_lines) == [
            {"d"},
            {"d"},
            {"d"},
            {"c"},
            {"d"},
            {"c", "s"},
        ]

    def test_can_analyze_c(self):
        c_lines = [
            "/*",
            " * The classic hello world for C99.",
            " */",
            "#include <stdio.h>",
            "int main(void) {",
            '   puts("Hello, World!");',
            "}",
        ]
        assert _line_parts_with_detected_markup("c", c_lines) == [
            {"d"},
            {"d"},
            {"d"},
            {"c"},
            {"c"},
            {"c", "s"},
            set(),
        ]


def test_can_detect_all_lines_as_documentation_with_markup_enabled():
    """Analyze C source lines with the markdown lexer and markup handling enabled."""
    source_lines = [
        "/*",
        " * The classic hello world for C99.",
        " */",
        "#include <stdio.h>",
        "int main(void) {",
        '   puts("Hello, World!");',
        "}",
    ]
    actual_line_parts = _line_parts_with_detected_markup("markdown", source_lines)
    # NOTE(review): This iterates over the elements of the last line's set, which
    # the next assertion requires to be empty, so this check can never fail.
    # Presumably the lines before the last one (`[:-1]`) were meant to be
    # compared with {"d"} - confirm against `_line_parts`.
    assert all(line_part == "d" for line_part in actual_line_parts[-1])
    assert actual_line_parts[-1:] == [set()]


def _line_parts_with_detected_markup(lexer_name: str, source_lines: list[str]) -> list[set[str]]:
    """
    Return the line parts of ``source_lines`` joined by newlines, lexed with
    the pygments lexer named ``lexer_name``. Markup handling is enabled for the
    lexer names of Markdown, reStructuredText and groff listed below.
    """
    markup_lexer_names = ("markdown", "md", "restructuredtext", "rst", "rest", "groff")
    return list(
        _line_parts(
            lexers.get_lexer_by_name(lexer_name),
            "\n".join(source_lines),
            is_markup=lexer_name in markup_lexer_names,
        )
    )


class _NonSeekableEmptyBytesIO(BytesIO):
    # Class to create a 'dummy object that mimics a non-seekable file handle'
    def seekable(self) -> bool:
        return False


class FileAnalysisTest(TempFolderTest):
    """Tests for ``analysis.SourceAnalysis.from_file()`` using files in the temp folder or in-memory streams."""

    # The file is written as cp1252 but analyzed as UTF-8; the resulting error
    # info is expected to mention the offending byte 0x80.
    def test_can_analyze_encoding_error(self):
        test_path = self.create_temp_file("encoding_error.py", 'print("\N{EURO SIGN}")', encoding="cp1252")
        source_analysis = analysis.SourceAnalysis.from_file(test_path, "test", encoding="utf-8")
        assert source_analysis.language == "__error__"
        assert source_analysis.state == analysis.SourceState.error
        assert "0x80" in str(source_analysis.state_info)

    # Both "rem" and "@rem" lines count as documentation.
    def test_can_detect_silent_dos_batch_remarks(self):
        test_bat_path = self.create_temp_file(
            "test_can_detect_silent_dos_batch_remarks.bat",
            ["rem normal comment", "@rem silent comment", "echo some code"],
        )
        source_analysis = analysis.SourceAnalysis.from_file(test_bat_path, "test", encoding="utf-8")
        assert source_analysis.language == "Batchfile"
        assert source_analysis.code_count == 1
        assert source_analysis.documentation_count == 2

    # Text that resembles a "coding:" magic comment inside JSON data must not prevent the analysis.
    # NOTE: Despite its name, `test_bat_path` refers to a JSON file here.
    def test_can_ignore_almost_magic_comment(self):
        test_bat_path = self.create_temp_file(
            "test_can_ignore_almost_magic_comment.json",
            ['{"x":"coding:no_such_coding"'],
        )
        source_analysis = analysis.SourceAnalysis.from_file(test_bat_path, "test")
        assert source_analysis.language.lower() == "json"
        assert source_analysis.code_count == 1
        assert source_analysis.documentation_count == 0

    # An unknown encoding obtained from a magic comment and passed on explicitly results in an error state.
    def test_fails_on_unknown_magic_encoding_comment(self):
        test_path = self.create_temp_file(
            "test_fails_on_unknown_magic_encoding_comment.py", ["# -*- coding: no_such_encoding -*-", 'print("hello")']
        )
        no_such_encoding = analysis.encoding_for(test_path)
        assert no_such_encoding == "no_such_encoding"
        source_analysis = analysis.SourceAnalysis.from_file(test_path, "test", encoding=no_such_encoding)
        assert source_analysis.language == "__error__"
        assert source_analysis.state == analysis.SourceState.error
        assert "unknown encoding" in str(source_analysis.state_info)

    def test_can_analyze_oracle_sql(self):
        test_oracle_sql_path = self.create_temp_file(
            "test_can_analyze_oracle_sql.pls",
            ["-- Oracle SQL example using an obscure suffix.", "select *", "from some_table;"],
        )
        source_analysis = analysis.SourceAnalysis.from_file(test_oracle_sql_path, "test", encoding="utf-8")
        assert source_analysis.language.lower().endswith("sql")
        assert source_analysis.code_count == 2
        assert source_analysis.documentation_count == 1

    def test_can_analyze_webfocus(self):
        test_fex_path = self.create_temp_file(
            "some.fex", ["-* comment", "-type some text", "table file some print * end;"]
        )
        source_analysis = analysis.SourceAnalysis.from_file(test_fex_path, "test", encoding="utf-8")
        assert source_analysis.language == "WebFOCUS"
        assert source_analysis.code_count == 2
        assert source_analysis.documentation_count == 1

    # The reported language of an XML file is its dialect, here "Ant".
    def test_can_analyze_xml_dialect(self):
        build_xml_path = self.create_temp_file("build.xml", EXAMPLE_ANT_CODE)
        source_analysis = analysis.SourceAnalysis.from_file(build_xml_path, "test")
        assert source_analysis.state == analysis.SourceState.analyzed
        assert source_analysis.language == "Ant"

    def test_can_analyze_unknown_language(self):
        unknown_language_path = self.create_temp_file("some.unknown_language", ["some", "lines", "of", "text"])
        source_analysis = analysis.SourceAnalysis.from_file(unknown_language_path, "test")
        assert source_analysis.state == analysis.SourceState.unknown

    # The content contains a 0 byte.
    def test_can_detect_binary_source_code(self):
        binary_path = self.create_temp_binary_file("some_django.mo", b"hello\0world!")
        source_analysis = analysis.SourceAnalysis.from_file(binary_path, "test", encoding="utf-8")
        assert source_analysis.state == analysis.SourceState.binary
        assert source_analysis.code_count == 0

    # With a `file_handle`, the path does not have to exist.
    def test_can_analyze_stringio(self):
        test_path = "imaginary/path/to/file.py"
        test_code = "from random import randint\n\n# Print a random dice roll\nprint(randint(6))\n"
        source_analysis = analysis.SourceAnalysis.from_file(test_path, "test", file_handle=StringIO(test_code))
        assert source_analysis.state == analysis.SourceState.analyzed
        assert source_analysis.language == "Python"
        assert source_analysis.code_count == 2

    # Same as test_can_analyze_stringio() but with a binary stream.
    def test_can_analyze_bytesio(self):
        test_path = "imaginary/path/to/file.py"
        test_code = b"from random import randint\n\n# Print a random dice roll\nprint(randint(6))\n"
        source_analysis = analysis.SourceAnalysis.from_file(test_path, "test", file_handle=BytesIO(test_code))
        assert source_analysis.state == analysis.SourceState.analyzed
        assert source_analysis.language == "Python"
        assert source_analysis.code_count == 2

    # By default, the language name includes the embedded template language.
    def test_can_analyze_embedded_language(self):
        test_html_django_path = self.create_temp_file(
            "some.html",
            ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
        )
        source_analysis = analysis.SourceAnalysis.from_file(test_html_django_path, "test", encoding="utf-8")
        assert source_analysis.language.lower() == "html+django/jinja"
        assert source_analysis.code_count == 3

    # The name "uv.lock" is expected to match the default patterns for names of generated files.
    def test_can_analyze_generated_name(self):
        test_uv_lock_path = self.create_temp_file("uv.lock", [])
        source_analysis = analysis.SourceAnalysis.from_file(
            test_uv_lock_path,
            "test",
            generated_name_regexes=pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT),
        )
        assert source_analysis.state == analysis.SourceState.generated

    # With `merge_embedded_language`, only the base language "html" is reported.
    def test_can_merge_embedded_language(self):
        test_html_django_path = self.create_temp_file(
            "some.html",
            ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
        )
        source_analysis = analysis.SourceAnalysis.from_file(
            test_html_django_path, "test", encoding="utf-8", merge_embedded_language=True
        )
        assert source_analysis.language.lower() == "html"
        assert source_analysis.code_count == 3

    # Without an explicit encoding, an unknown encoding in the magic comment results in an error state.
    def test_can_analyze_unknown_magic_comment_encoding(self):
        test_python_path = self.create_temp_file("some.py", ["# -*- coding: no_such_encoding -*-", "print('hello')"])
        source_analysis = analysis.SourceAnalysis.from_file(test_python_path, "test")
        assert source_analysis.language.lower() == "__error__"
        assert source_analysis.state_info == "unknown encoding: no_such_encoding"

    def test_fails_on_non_seekable_file_handle_with_encoding_automatic(self):
        file_handle = _NonSeekableEmptyBytesIO()

        with pytest.raises(PygountError, match=r".*file handle must be seekable.*"):
            analysis.SourceAnalysis.from_file("README.md", "test", file_handle=file_handle, encoding="automatic")

    def test_fails_on_non_seekable_file_handle_with_encoding_chardet(self):
        file_handle = _NonSeekableEmptyBytesIO()

        with pytest.raises(PygountError, match=r".*file handle must be seekable.*"):
            analysis.SourceAnalysis.from_file("README.md", "test", file_handle=file_handle, encoding="chardet")


@pytest.mark.parametrize(
    "suffix, code_count, doc_count, expected_language_lower",
    [
        ("rst", 0, 3, "restructuredtext"),
        ("md", 0, 3, "markdown"),
        ("txt", 0, 3, "text only"),
        ("4", 0, 3, "groff"),
    ],
)
def test_can_analyze_markup_as_plain_documentation(
    suffix, code_count: int, doc_count: int, expected_language_lower: str
):
    """
    Analyze the same HTML template lines stored with different markup suffixes.

    Every non-empty line must count as documentation, no matter what the
    content looks like. The expected counts are taken from the parameters.
    """
    source_lines = ["<!DOCTYPE html>", "{% load i18n %}", "", "  ", '<html lang="{{ language_code }}" />']
    # The lines "" and "  " count as empty.
    expected_empty_count = 2
    # Ensure the parametrized counts stay consistent with the source lines above.
    assert code_count + doc_count + expected_empty_count == len(source_lines)
    with temp_source_file(suffix, source_lines) as test_file:
        source_analysis = analysis.SourceAnalysis.from_file(test_file.name, "test", encoding="utf-8")
        assert source_analysis.language.lower() == expected_language_lower
        assert source_analysis.code_count == code_count
        assert source_analysis.documentation_count == doc_count
        assert source_analysis.empty_count == expected_empty_count


def test_can_repr_source_analysis_from_file():
    """The repr of an analyzed source includes all counts, and ``str()`` yields the same text."""
    analyzed_source = analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed)
    actual_repr = repr(analyzed_source)
    assert actual_repr == (
        "SourceAnalysis(path='some.py', language='Python', group='some', "
        "state=analyzed, code_count=1, documentation_count=2, empty_count=3, string_count=4)"
    )
    assert actual_repr == str(analyzed_source)


def test_can_repr_empty_source_analysis_from_file():
    """The repr of an empty source omits the counts, and ``str()`` yields the same text."""
    empty_source = analysis.SourceAnalysis("some.py", "__empty__", "some", 0, 0, 0, 0, analysis.SourceState.empty)
    actual_repr = repr(empty_source)
    assert actual_repr == "SourceAnalysis(path='some.py', language='__empty__', group='some', state=empty)"
    assert actual_repr == str(empty_source)


def test_can_repr_error_source_analysis_from_file():
    """The repr of a failed analysis shows the state info instead of counts, and ``str()`` yields the same text."""
    failed_source = analysis.SourceAnalysis(
        "some.py", "__error__", "some", 0, 0, 0, 0, analysis.SourceState.error, "error details"
    )
    actual_repr = repr(failed_source)
    assert actual_repr == (
        "SourceAnalysis(path='some.py', language='__error__', group='some', state=error, state_info='error details')"
    )
    assert actual_repr == str(failed_source)


def test_can_guess_lexer_for_python():
    """A path with the suffix ``.py`` must result in the "Python" lexer."""
    python_lexer = guess_lexer("some.py", "pass")
    assert python_lexer is not None
    assert python_lexer.name == "Python"


def test_can_guess_lexer_for_plain_text():
    """The name "README.1st" must result in the "Text" lexer."""
    text_lexer = guess_lexer("README.1st", "hello!")
    assert text_lexer is not None
    assert text_lexer.name == "Text"


def test_can_guess_lexer_for_cmakelists():
    """A "CMakeLists.txt" must result in the "CMake" lexer despite its ``.txt`` suffix."""
    cmake_code = (
        "cmake_minimum_required(VERSION 2.6)\n"
        "project(example)\n"
        "set(CMAKE_CXX_STANDARD 14)\n"
        "set(SOURCE_FILES example.cpp)\n"
        "add_executable(example ${SOURCE_FILES})"
    )
    cmake_lexer = guess_lexer("CMakeLists.txt", cmake_code)
    assert cmake_lexer is not None
    assert cmake_lexer.name == "CMake"


class GeneratedCodeTest(TempFolderTest):
    """Tests for detecting generated source code by matching its lines against regexes."""

    _STANDARD_SOURCE_LINES = [
        "#!/bin/python3",
        "    # Example code for",
        "    # generated source code.",
        '    print("I\'m generated!")',
        "    ",
    ]
    _STANDARD_GENERATED_REGEXES = common.regexes_from(
        common.REGEX_PATTERN_PREFIX + ".*some,.*other,.*generated,.*print"
    )

    def test_can_detect_non_generated_code(self):
        # This very test file must not match any of the default patterns.
        default_regexes = common.regexes_from(analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT)
        with open(__file__, encoding="utf-8") as this_file:
            match_info = analysis.matching_number_line_and_regex(this_file, default_regexes)
        assert match_info is None

    def test_can_detect_generated_code(self):
        source_lines = self._STANDARD_SOURCE_LINES
        generated_regexes = self._STANDARD_GENERATED_REGEXES
        match_info = analysis.matching_number_line_and_regex(source_lines, generated_regexes)
        assert match_info is not None
        actual_number, actual_line, actual_regex = match_info
        # The first match is the line at index 2 together with the regex at index 2.
        assert actual_number == 2
        assert actual_line == source_lines[2]
        assert actual_regex == generated_regexes[2]

    def test_can_not_detect_generated_code_with_late_comment(self):
        # With the additional argument 2, the matching line at index 2 is not found anymore.
        match_info = analysis.matching_number_line_and_regex(
            self._STANDARD_SOURCE_LINES, self._STANDARD_GENERATED_REGEXES, 2
        )
        assert match_info is None

    def test_can_analyze_generated_code_with_own_pattern(self):
        sql_path = self.create_temp_file(
            "generated.sql",
            ["-- Generiert mit Hau-Ruck-Franz-Deutsch.", "select * from sauerkraut;"],
        )
        own_regexes = common.regexes_from("[regex](?i).*generiert")
        source_analysis = analysis.SourceAnalysis.from_file(sql_path, "test", generated_regexes=own_regexes)
        assert source_analysis.state == analysis.SourceState.generated


class SizeTest(TempFolderTest):
    """Tests for source files with a special size."""

    def test_can_detect_empty_source_code(self):
        # A file with 0 bytes has the state "empty" and no code.
        zero_byte_path = self.create_temp_binary_file("empty.py", b"")
        empty_analysis = analysis.SourceAnalysis.from_file(zero_byte_path, "test", encoding="utf-8")
        assert empty_analysis.state == analysis.SourceState.empty
        assert empty_analysis.code_count == 0


def test_can_analyze_project_markdown_files():
    """Analyze the Markdown files in the project folder; each must contain documentation and empty lines."""
    # NOTE: PYGOUNT_PROJECT_FOLDER already is the folder containing "pygount" and
    # "tests". Taking its parent folder would scan outside of the project, where
    # typically no Markdown files exist, and the loop below would never run.
    markdown_paths = glob.glob(os.path.join(PYGOUNT_PROJECT_FOLDER, "*.md"))
    assert markdown_paths, f"project folder must contain markdown files: {PYGOUNT_PROJECT_FOLDER}"
    for text_path in markdown_paths:
        source_analysis = analysis.SourceAnalysis.from_file(text_path, "test")
        assert source_analysis.state == analysis.SourceState.analyzed
        assert source_analysis.documentation_count > 0
        assert source_analysis.empty_count > 0


def test_has_no_duplicate_in_pygount_source():
    """No Python file directly in the folders "pygount" and "tests" may be a duplicate of another one."""
    pool = analysis.DuplicatePool()
    python_paths = [
        source_path
        for sub_folder_name in ("pygount", "tests")
        for source_path in (
            os.path.join(PYGOUNT_PROJECT_FOLDER, sub_folder_name, source_name)
            for source_name in os.listdir(os.path.join(PYGOUNT_PROJECT_FOLDER, sub_folder_name))
        )
        if source_path.endswith(".py")
    ]
    for python_path in python_paths:
        original_path = pool.duplicate_path(python_path)
        assert original_path is None, f"{python_path} must not be duplicate of {original_path}"


def test_can_compute_base_language():
    """The base language is the part before a "+" that separates two actual language names."""
    languages_and_expected_base_languages = [
        ("JavaScript", "JavaScript"),
        ("JavaScript+Lasso", "JavaScript"),
        ("JavaScript+", "JavaScript+"),  # no actual language
        ("C++", "C++"),
        ("++C", "++C"),  # no actual language
        ("", ""),  # no actual language, but should not crash either
    ]
    for language, expected_base_language in languages_and_expected_base_languages:
        assert base_language(language) == expected_base_language


class DuplicatePoolTest(TempFolderTest):
    """Tests for ``analysis.DuplicatePool``."""

    def test_can_distinguish_different_files(self):
        # Files with different content are no duplicates of each other.
        pool = analysis.DuplicatePool()
        for name_suffix, content in (("_some", "some"), ("_other", "other")):
            path = self.create_temp_file(__name__ + name_suffix, content)
            assert pool.duplicate_path(path) is None

    def test_can_detect_duplicate(self):
        # The second file with the same content is reported as duplicate of the first one.
        first_path = self.create_temp_file("original", "same")
        second_path = self.create_temp_file("duplicate", "same")
        pool = analysis.DuplicatePool()
        assert pool.duplicate_path(first_path) is None
        assert first_path == pool.duplicate_path(second_path)


@pytest.mark.parametrize(
    "suffix, expected_result",
    [
        ("md", True),
        ("MD", True),
        ("mD", True),
        ("rst", True),
        ("py", False),
        ("4", True),
        ("c", False),
    ],
)
def test_can_detect_markup_file(suffix, expected_result):
    """Markup files are detected by their suffix, no matter the letter case."""
    assert is_markup_file(f"some_file_name.{suffix}") == expected_result


================================================
FILE: tests/test_command.py
================================================
"""
Tests for pygount command line interface.
"""

import contextlib

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import json
import os
import tempfile
from xml.etree import ElementTree

import pytest

import pygount
from pygount import command
from pygount.command import VALID_OUTPUT_FORMATS, Command
from pygount.common import OptionError
from pygount.write import JSON_FORMAT_VERSION

from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest


class CommandTest(TempFolderTest):
    """Tests that drive a ``Command`` instance directly through its setters and ``execute()``."""

    def test_fails_on_unknown_output_format(self):
        """An unknown output format is rejected with an error mentioning its name."""
        bogus_format = "no_such_output_format"
        cmd = Command()
        with pytest.raises(OptionError, match=bogus_format):
            cmd.set_output_format(bogus_format)

    def test_can_set_encoding(self):
        """``"default;fallback"`` is split into default and fallback encoding."""
        cmd = Command()
        cmd.set_encodings("automatic;cp1252")
        actual_encodings = (cmd.default_encoding, cmd.fallback_encoding)
        assert actual_encodings[0] == "automatic"
        assert actual_encodings[1] == "cp1252"

    def test_can_execute_on_own_code(self):
        """Analyzing the pygount sources as cloc XML yields at least one ``file`` element."""
        target_path = os.path.join(self.tests_temp_folder, "test_can_execute_on_own_code.txt")
        try:
            os.remove(target_path)
        except FileNotFoundError:
            # Ignore missing file as it is going to be recreated.
            pass
        cmd = Command()
        cmd.set_output(target_path)
        cmd.set_output_format("cloc-xml")
        cmd.set_source_patterns(PYGOUNT_SOURCE_FOLDER)
        cmd.set_suffixes("py")
        cmd.execute()
        found_file_elements = ElementTree.parse(target_path).findall("files/file")
        assert found_file_elements is not None
        assert len(found_file_elements) >= 1

    def test_fails_on_broken_regex(self):
        """A pattern that cannot be compiled is reported together with the option it was set for."""
        expected_message_regex = r"^option --generated: cannot parse pattern for regular repression.*"
        cmd = Command()
        with pytest.raises(OptionError, match=expected_message_regex):
            cmd.set_generated_regexps("[regex](", "option --generated")

    def test_can_use_chardet_for_encoding(self):
        """Executing with the ``chardet`` encoding must simply not raise."""
        cmd = Command()
        cmd.set_encodings("chardet")
        cmd.set_source_patterns(PYGOUNT_SOURCE_FOLDER)
        cmd.execute()


class PygountCommandTest(TempFolderTest):
    """
    Tests for ``command.pygount_command()``, which takes a list of command
    line arguments and either returns an exit code or raises ``SystemExit``.
    """

    def _write_source(self, name, source_code):
        """
        Write ``source_code`` to a file called ``name`` in the temporary test
        folder and return its path.

        The file is always written as UTF-8 so the test does not depend on
        the locale of the machine running it.
        """
        source_path = os.path.join(self.tests_temp_folder, name)
        with open(source_path, "w", encoding="utf-8") as source_file:
            source_file.write(source_code)
        return source_path

    def _write_original_and_duplicate(self):
        """
        Write two Python source files with identical content and return their
        paths as ``(original_path, duplicate_path)``.
        """
        source_code = "# Duplicate source\nprint('duplicate code')\n"
        original_path = self._write_source("original.py", source_code)
        duplicate_path = self._write_source("duplicate.py", source_code)
        return original_path, duplicate_path

    def _cloc_xml_file_elements(self, cloc_xml_path, language=None):
        """
        Return the ``file`` elements of the cloc XML report stored in
        ``cloc_xml_path``.

        If ``language`` is set, only elements whose ``language`` attribute
        equals it are returned. The report must exist, otherwise the calling
        test fails.
        """
        assert os.path.exists(cloc_xml_path)
        file_xpath = "files/file" if language is None else f"files/file[@language='{language}']"
        file_elements = ElementTree.parse(cloc_xml_path).findall(file_xpath)
        assert file_elements is not None
        return file_elements

    def test_can_show_help(self):
        with pytest.raises(SystemExit) as error_info:
            command.pygount_command(["--help"])
        assert error_info.value.code == 0

    def test_can_show_version(self):
        with pytest.raises(SystemExit) as error_info:
            command.pygount_command(["--version"])
        assert error_info.value.code == 0

    def test_fails_on_unknown_encoding(self):
        with pytest.raises(SystemExit) as error_info:
            command.pygount_command(["--encoding", "no_such_encoding", tempfile.gettempdir()])
        assert error_info.value.code == 2

    def test_fails_on_unknown_format(self):
        with pytest.raises(SystemExit) as error_info:
            command.pygount_command(["--format", "no_such_format", tempfile.gettempdir()])
        assert error_info.value.code == 2

    def test_fails_on_broken_regex_pattern(self):
        exit_code = command.pygount_command(["--generated", "[regex](", tempfile.gettempdir()])
        assert exit_code == 1

    def test_can_analyze_pygount_setup_py(self):
        # NOTE(review): The repository tree lists a pyproject.toml but no setup.py in the
        # project folder; confirm that this test still analyzes an existing file.
        pygount_setup_py_path = os.path.join(PYGOUNT_PROJECT_FOLDER, "setup.py")
        exit_code = command.pygount_command(["--verbose", pygount_setup_py_path])
        assert exit_code == 0

    def test_can_analyze_pygount_source_code(self):
        exit_code = command.pygount_command(["--verbose", PYGOUNT_SOURCE_FOLDER])
        assert exit_code == 0

    def test_can_detect_generated_code(self):
        generated_code_path = self._write_source(
            "generated.py",
            "# Generated with pygount.test_command.PygountCommandTest.test_can_detect_generated_code.\n"
            "# Do not edit!\n"
            "print('hello World')\n",
        )
        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
        exit_code = command.pygount_command(
            ["--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, generated_code_path]
        )
        assert exit_code == 0
        file_elements = self._cloc_xml_file_elements(cloc_xml_path, "__generated__")
        assert len(file_elements) >= 1

    def test_can_detect_generated_code_with_own_pattern(self):
        generiert_py_path = self._write_source(
            "generiert.py",
            "# Generiert mit pygount.test_command.PygountCommandTest."
            "test_can_detect_generated_code_with_own_pattern()\n"
            "print('hello World')\n",
        )
        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
        exit_code = command.pygount_command(
            [
                "--verbose",
                "--format=cloc-xml",
                "--generated=[regex](?i).*generiert",
                "--out",
                cloc_xml_path,
                generiert_py_path,
            ]
        )
        assert exit_code == 0
        file_elements = self._cloc_xml_file_elements(cloc_xml_path, "__generated__")
        assert len(file_elements) >= 1

    def test_can_analyze_pygount_source_code_as_cloc_xml(self):
        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
        exit_code = command.pygount_command(
            ["--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, PYGOUNT_SOURCE_FOLDER]
        )
        assert exit_code == 0
        file_elements = self._cloc_xml_file_elements(cloc_xml_path)
        assert len(file_elements) >= 1

    def test_can_analyze_pygount_source_code_as_json(self):
        pygount_json_path = os.path.join(self.tests_temp_folder, "pygount.json")
        exit_code = command.pygount_command(
            ["--verbose", "--format", "json", "--out", pygount_json_path, PYGOUNT_SOURCE_FOLDER]
        )
        assert exit_code == 0
        assert os.path.exists(pygount_json_path)
        with open(pygount_json_path, encoding="utf-8") as pygount_json_file:
            json_map = json.load(pygount_json_file)
        assert json_map.get("pygountVersion") == pygount.__version__
        assert json_map.get("formatVersion") == JSON_FORMAT_VERSION
        for expected_key in ("files", "languages", "runtime", "summary"):
            assert expected_key in json_map

    def test_can_detect_duplicates(self):
        original_path, duplicate_path = self._write_original_and_duplicate()
        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
        exit_code = command.pygount_command(
            ["--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, original_path, duplicate_path]
        )
        assert exit_code == 0
        file_elements = self._cloc_xml_file_elements(cloc_xml_path, "__duplicate__")
        assert len(file_elements) == 1

    def test_can_accept_duplicates(self):
        original_path, duplicate_path = self._write_original_and_duplicate()
        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
        exit_code = command.pygount_command(
            ["--duplicates", "--verbose", "--format", "cloc-xml", "--out", cloc_xml_path, original_path, duplicate_path]
        )
        assert exit_code == 0
        # With --duplicates, no file must be reported under the duplicate pseudo language.
        file_elements = self._cloc_xml_file_elements(cloc_xml_path, "__duplicate__")
        assert len(file_elements) == 0

    def test_can_write_all_output_formats(self):
        for output_format in VALID_OUTPUT_FORMATS:
            exit_code = command.pygount_command(["--format", output_format, PYGOUNT_SOURCE_FOLDER])
            assert exit_code == 0, f"output_format={output_format}"

    def test_can_merge_embedded_languages(self):
        test_html_django_path = self.create_temp_file(
            "some.html",
            ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
        )
        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
        exit_code = command.pygount_command(
            ["--merge-embedded-languages", "--format", "cloc-xml", "--out", cloc_xml_path, test_html_django_path]
        )
        assert exit_code == 0
        file_elements = self._cloc_xml_file_elements(cloc_xml_path, "HTML")
        assert len(file_elements) == 1


================================================
FILE: tests/test_common.py
================================================
"""
Tests for :py:mod:`pygount.common` module.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import re

import pytest

import pygount.common
from pygount.common import matching_regex


def test_can_build_str():
    """``str()`` of an ``OptionError`` is the message, prefixed with the source if one was given."""
    assert str(pygount.common.OptionError("test")) == "test"
    assert str(pygount.common.OptionError("test", "some_file.txt")) == "some_file.txt: test"


def test_can_match_from_regex():
    """``regex_from()`` accepts an already compiled regular expression."""
    compiled_pattern = re.compile(r"a\d+b")
    matcher = pygount.common.regex_from(compiled_pattern)
    assert matcher.match("a123b") is not None
    assert matcher.match("ab") is None


def test_can_match_from_regex_pattern():
    """``regex_from()`` accepts a regular expression given as text."""
    matcher = pygount.common.regex_from(r"a\d+b")
    for text, must_match in (("a123b", True), ("ab", False)):
        assert (matcher.match(text) is not None) == must_match


def test_can_match_from_shell_pattern():
    """``regex_from()`` called with ``True`` as second argument accepts a shell pattern."""
    shell_pattern = "*a[0-9]?*b*"
    matcher = pygount.common.regex_from(shell_pattern, True)
    assert matcher.match("a123b") is not None
    assert matcher.match("ab") is None


def test_can_match_single_regex_from_shell_pattern():
    """A single shell pattern results in exactly one regular expression."""
    matchers = pygount.common.regexes_from("*.py")
    assert len(matchers) == 1
    python_matcher = matchers[0]
    assert python_matcher.match("some.py") is not None
    assert python_matcher.match("some.bat") is None


def test_can_match_single_regex():
    """A single pattern marked with the regex prefix results in exactly one regular expression."""
    prefixed_pattern = pygount.common.REGEX_PATTERN_PREFIX + r"^.+\.py$"
    matchers = pygount.common.regexes_from(prefixed_pattern)
    assert len(matchers) == 1
    python_matcher = matchers[0]
    assert python_matcher.match("some.py") is not None
    assert python_matcher.match("some.bat") is None


def test_can_match_regex_from_multiple_regex_patterns():
    """Comma separated regex patterns result in one regular expression each, in the given order."""
    prefixed_patterns = pygount.common.REGEX_PATTERN_PREFIX + r"x, abc, ^.+\.py$"
    matchers = pygount.common.regexes_from(prefixed_patterns)
    assert len(matchers) == 3
    # Only the last pattern is supposed to match a Python file name.
    for matcher in matchers[:2]:
        assert matcher.match("some.py") is None
    assert matchers[2].match("some.py") is not None


def test_can_match_regex_from_multiple_default_shell_patterns():
    """A pattern marked as additional is combined with the regexes of the default shell patterns."""
    additional_pattern = pygount.common.REGEX_PATTERN_PREFIX + pygount.common.ADDITIONAL_PATTERN + r"x"
    default_patterns = "abc, *.py"
    matchers = pygount.common.regexes_from(additional_pattern, default_patterns)
    assert len(matchers) == 3
    for matcher in matchers[:2]:
        assert matcher.match("some.py") is None
    assert matchers[2].match("some.py") is not None
    assert matchers[0].match("x") is not None


def test_can_represent_text_as_list():
    """Comma separated text becomes a list of its stripped, non-empty items."""
    expected_items_by_text = [
        ("", []),
        ("a", ["a"]),
        ("abc,d, e", ["abc", "d", "e"]),
        (",,,,", []),
    ]
    for text, expected_items in expected_items_by_text:
        assert pygount.common.as_list(text) == expected_items


def test_can_represent_iterable_as_list():
    """Iterables other than text become a list with the same items."""
    expected_items_by_iterable = [
        ([], []),
        (["a", 1, None], ["a", 1, None]),
        ((), []),
        (range(3), [0, 1, 2]),
    ]
    for iterable, expected_items in expected_items_by_iterable:
        assert pygount.common.as_list(iterable) == expected_items


@pytest.mark.parametrize(
    "text,patterns,expected_regex_index",
    [
        ("some", [], -1),
        ("some", ["some"], 0),
        ("some", ["other"], -1),
        ("some", ["other", "some"], 1),
        ("some", ["s.+"], 0),
        ("some", [".*T.*"], -1),
    ],
)
def test_can_compute_matching_regex(text: str, patterns: list[str], expected_regex_index: int):
    """
    Check which of the regexes compiled from ``patterns`` is reported by
    ``matching_regex()`` for ``text``; an index of -1 stands for "none".
    """
    compiled_regexes = list(map(re.compile, patterns))
    matched_regex = matching_regex(text, compiled_regexes)
    if matched_regex is None:
        actual_regex_index = -1
    else:
        actual_regex_index = compiled_regexes.index(matched_regex)
    assert actual_regex_index == expected_regex_index


def test_can_convert_empty_text_to_lines():
    """Empty text has no lines at all."""
    actual_lines = [*pygount.common.lines("")]
    assert actual_lines == []


def test_can_convert_single_letter_to_lines():
    """Text without a newline is a single line."""
    actual_lines = [*pygount.common.lines("a")]
    assert actual_lines == ["a"]


def test_can_convert_single_letter_with_newline_to_lines():
    """A trailing newline does not add another line."""
    actual_lines = [*pygount.common.lines("a\n")]
    assert actual_lines == ["a"]


def test_can_convert_multiple_lines():
    """Two lines are split the same way with and without a trailing newline."""
    for text in ("a\nbc", "a\nbc\n"):
        assert list(pygount.common.lines(text)) == ["a", "bc"]


def test_can_convert_empty_lines():
    """Each newline on its own counts as one empty line."""
    actual_lines = [*pygount.common.lines("\n\n\n")]
    assert actual_lines == ["", "", ""]


def test_can_compute_mapped_repr():
    """``mapped_repr()`` renders the class name followed by the mapped ``name=value`` pairs."""

    class Dummy:
        pass

    empty_repr = pygount.common.mapped_repr(Dummy(), {})
    assert empty_repr == "Dummy()"

    name_to_value_map = {"some": "such", "other": 1, "whatever": True}
    filled_repr = pygount.common.mapped_repr(Dummy(), name_to_value_map)
    assert filled_repr == "Dummy(some=such, other=1, whatever=True)"


================================================
FILE: tests/test_encoding.py
================================================
"""
Tests for encoding related functions.
"""

# Copyright (c) 2016-2025, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
from tempfile import NamedTemporaryFile

import pytest

from pygount.analysis import _BOM_TO_ENCODING_MAP, encoding_for, encoding_from_possible_magic_comment, is_binary_file

from ._common import temp_binary_file, temp_source_file

# Inverse of ``_BOM_TO_ENCODING_MAP``: look up the BOM bytes to write for an encoding name.
_ENCODING_TO_BOM_MAP = {encoding: bom for bom, encoding in _BOM_TO_ENCODING_MAP.items()}
# Small source code sample containing the non ASCII characters U+00FD and U+20AC.
_TEST_CODE = "x = '\u00fd \u20ac'"


# NOTE: Every header needs its own trailing comma. Without it, Python silently
# concatenates adjacent string literals into a single test case, so the
# headers after the first one would never be checked on their own.
@pytest.mark.parametrize(
    "ascii_header",
    [
        "# encoding: cp1252",
        "# coding: cp1252",
        "# -*- coding: cp1252 -*-",
        "# eNcOdInG: cp1252",
        "## encoding: cp1252",
        "#encoding:cp1252",
        "# -*- coding: cp1252; mode: python; -*-",  # Emacs modeline
        "/* coding: cp1252 */",  # C
        "{ coding: cp1252 }",  # Pascal
        "REM coding: cp1252",  # Basic
    ],
)
def test_can_detect_encoding_from_magic_comments(ascii_header: str):
    """Check that the encoding ``cp1252`` is extracted from the magic comment in ``ascii_header``."""
    assert encoding_from_possible_magic_comment(ascii_header) == "cp1252"


@pytest.mark.parametrize(
    "ascii_header",
    (
        "",
        "    ",
        " # encoding: cp1252",  # Leading white space
        "# encoding: !$%&",
        "-*- coding: cp1252 -*-",  # Not a comment
        "encoding: cp1252",
        '{"x":"encoding: cp1252"}',
    ),
)
def test_can_ignore_encoding_from_magic_comments(ascii_header: str):
    """Check that no encoding is extracted from an ``ascii_header`` without a valid magic comment."""
    detected_encoding = encoding_from_possible_magic_comment(ascii_header)
    assert detected_encoding is None


@pytest.mark.parametrize("encoding", _BOM_TO_ENCODING_MAP.values())
def test_can_detect_bom_encodings(encoding: str):
    """Run the BOM detection check once for every encoding that has a known BOM."""
    _test_can_detect_bom_encoding(encoding=encoding)


def _test_can_detect_bom_encoding(encoding: str):
    """
    Write the test code in ``encoding`` to a temporary file and check that
    ``encoding_for()`` reports the same encoding for it.
    """
    needs_explicit_bom = encoding != "utf-8-sig"
    with NamedTemporaryFile(mode="wb+", suffix="txt") as bom_file:
        if needs_explicit_bom:
            bom_file.write(_ENCODING_TO_BOM_MAP[encoding])
        bom_file.write(_TEST_CODE.encode(encoding))
        # Make sure the data are on disk before the file is read again by name.
        bom_file.flush()
        bom_file.seek(0)
        detected_encoding = encoding_for(bom_file.name)
    assert detected_encoding == encoding


@pytest.mark.parametrize("encoding", ["cp1252", "utf-8"])
def test_can_detect_plain_encoding(encoding: str):
    """Check that the encoding of a source file without any encoding hint is detected."""
    with temp_source_file("txt", _TEST_CODE, encoding=encoding) as plain_file:
        detected_encoding = encoding_for(plain_file.name)
        assert detected_encoding == encoding


def test_can_detect_xml_prolog():
    """The encoding declared in an XML prolog is used as the encoding of the file."""
    encoding = "iso-8859-15"
    xml_lines = [f'<?xml encoding="{encoding}" standalone="yes"?><some>{_TEST_CODE}</some>']
    with temp_source_file("xml", xml_lines, encoding=encoding) as xml_file:
        detected_encoding = encoding_for(xml_file.name)
    assert detected_encoding == encoding


def test_can_detect_magic_comment():
    """The encoding declared in a magic comment after the shebang line is used for the file."""
    encoding = "iso-8859-15"
    source_lines = [
        "#!/usr/bin/python",
        f"# -*- coding: {encoding} -*-",
        _TEST_CODE,
    ]
    with temp_source_file("txt", source_lines, encoding=encoding) as source_file:
        detected_encoding = encoding_for(source_file.name)
    assert detected_encoding == encoding


def test_can_detect_automatic_encoding_for_empty_source():
    """An empty file is reported as UTF-8."""
    no_content = b""
    with temp_binary_file(no_content) as empty_file:
        detected_encoding = encoding_for(empty_file.name)
    assert detected_encoding == "utf-8"


def test_can_detect_chardet_encoding():
    """The encoding detected for this test module itself is UTF-8."""
    assert encoding_for(__file__) == "utf-8"


def test_can_detect_utf8_when_cp1252_would_fail():
    """Check the automatically detected encoding with and without a CP1252 fallback."""
    # Closing double quote in UTF-8; it contains the byte 0x9d,
    # which fails when read as CP1252.
    closing_quote_in_utf8 = b"\xe2\x80\x9d"
    expected_encoding_by_fallback = [(None, "utf-8"), ("cp1252", "cp1252")]
    with temp_binary_file(closing_quote_in_utf8) as quote_file:
        for fallback_encoding, expected_encoding in expected_encoding_by_fallback:
            detected_encoding = encoding_for(
                quote_file.name, encoding="automatic", fallback_encoding=fallback_encoding
            )
            assert detected_encoding == expected_encoding


def test_can_use_hardcoded_encoding():
    """An explicitly requested encoding is reported even if the file cannot be read with it."""
    with temp_source_file("txt", "\N{EURO SIGN}", encoding="cp1252") as cp1252_file:
        cp1252_path = cp1252_file.name
        reported_encoding = encoding_for(cp1252_path, "utf-8")
        assert reported_encoding == "utf-8"
        # Make sure that we cannot actually read the file using the hardcoded but wrong encoding.
        with open(cp1252_path, encoding=reported_encoding) as unreadable_file:
            with pytest.raises(UnicodeDecodeError):
                unreadable_file.read()


def test_can_detect_binary_with_zero_byte():
    """A file containing a zero byte is considered binary."""
    content_with_zero_byte = b"hello\0world"
    with temp_binary_file(content_with_zero_byte) as zero_byte_file:
        assert is_binary_file(zero_byte_file.name)


def test_can_detect_utf16_as_non_binary():
    """Text written as UTF-16 is not considered binary."""
    with NamedTemporaryFile(encoding="utf-16", mode="w+") as text_file:
        text_file.write("Hello world!")
        # Make sure the text is on disk before the file is read again by name.
        text_file.flush()
        text_file.seek(0)
        looks_binary = is_binary_file(text_file.name)
        assert not looks_binary


================================================
FILE: tests/test_git_storage.py
================================================
from pathlib import Path

from pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any


def test_can_extract_git_remote_url_and_revision_if_any():
    """Check the ``(remote_url, revision)`` pairs extracted from several kinds of text."""
    expected_result_by_text = [
        ("hello", (None, None)),
        ("git@github.com:roskakori/pygount.git/v1.5.1", ("git@github.com:roskakori/pygount.git", "v1.5.1")),
        ("git@github.com:roskakori/pygount.git", ("git@github.com:roskakori/pygount.git", None)),
        ("git@github.com:roskakori/pygount.git/", ("git@github.com:roskakori/pygount.git", None)),
        ("", (None, None)),
    ]
    for text, expected_result in expected_result_by_text:
        assert git_remote_url_and_revision_if_any(text) == expected_result


def test_can_extract_and_close_and_find_files_from_cloned_git_remote_url_with_revision():
    """
    Extract revision ``v0.1`` of the pygount repository and check which files
    exist in the temporary folder before and after closing the storage.
    """
    remote_url, revision = git_remote_url_and_revision_if_any("https://github.com/roskakori/pygount.git/v0.1")
    assert remote_url is not None
    storage = GitStorage(remote_url, revision)
    extracted_folder = Path(storage.temp_folder)
    readme_path = extracted_folder / "README.rst"
    pyproject_path = extracted_folder / "pyproject.toml"
    try:
        storage.extract()
        assert readme_path.exists()
        assert not pyproject_path.exists()
    finally:
        storage.close()
    # Closing the storage is expected to remove the extracted files.
    assert not readme_path.exists()


================================================
FILE: tests/test_lexers.py
================================================
"""
Tests for additional lexers for pygount.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.

from pygments import token

import pygount.lexers


def test_can_lex_idl():
    """Check the exact token stream ``IdlLexer`` produces for a small IDL module."""
    # Short names for the token types used below keep the expected stream readable.
    comment_multiline = token.Token.Comment.Multiline
    comment_single = token.Token.Comment.Single
    declaration = token.Token.Keyword.Declaration
    name = token.Token.Name
    name_class = token.Token.Name.Class
    name_function = token.Token.Name.Function
    punctuation = token.Token.Punctuation
    plain_text = token.Token.Text
    whitespace = token.Token.Text.Whitespace

    idl_lines = (
        "/* some",
        " * comment */",
        "module HelloApp {",
        "  interface Hello {",
        "    string sayHello(); // Be friendly!",
        "  };",
        "};",
    )
    actual_tokens = list(pygount.lexers.IdlLexer().get_tokens("\n".join(idl_lines)))
    expected_tokens = [
        (comment_multiline, "/* some\n * comment */"),
        (whitespace, "\n"),
        (declaration, "module"),
        (plain_text, " "),
        (name_class, "HelloApp"),
        (whitespace, " "),
        (punctuation, "{"),
        (whitespace, "\n"),
        (whitespace, "  "),
        (declaration, "interface"),
        (plain_text, " "),
        (name_class, "Hello"),
        (whitespace, " "),
        (punctuation, "{"),
        (whitespace, "\n"),
        (whitespace, "    "),
        (name, "string"),
        (whitespace, " "),
        (name_function, "sayHello"),
        (punctuation, "("),
        (punctuation, ")"),
        (punctuation, ";"),
        (whitespace, " "),
        (comment_single, "// Be friendly!"),
        (whitespace, "\n"),
        (whitespace, "  "),
        (punctuation, "}"),
        (punctuation, ";"),
        (whitespace, "\n"),
        (punctuation, "}"),
        (punctuation, ";"),
        (whitespace, "\n"),
    ]
    assert actual_tokens == expected_tokens


def test_can_lex_m4():
    """Check that ``MinimalisticM4Lexer`` separates ``#`` comments from the remaining text."""
    m4_source = "".join(
        [
            "#\n",
            "# comment\n",
            "define(FRUIT, apple) # Healthy stuff!\n",
            "Eat some FRUIT!",
        ]
    )
    actual_tokens = list(pygount.lexers.MinimalisticM4Lexer().get_tokens(m4_source))
    expected_tokens = [
        (token.Token.Comment.Single, "#\n"),
        (token.Token.Comment.Single, "# comment\n"),
        (token.Token.Text, "define(FRUIT, apple) "),
        (token.Token.Comment.Single, "# Healthy stuff!\n"),
        (token.Token.Text, "Eat some FRUIT!\n"),
    ]
    assert actual_tokens == expected_tokens


def test_can_lex_vbscript():
    """Check that ``MinimalisticVBScriptLexer`` separates a ``'`` comment line from code."""
    vbscript_source = "' comment\n" + 'WScript.Echo "hello world!"'
    actual_tokens = list(pygount.lexers.MinimalisticVBScriptLexer().get_tokens(vbscript_source))
    expected_tokens = [
        (token.Token.Comment.Single, "' comment\n"),
        (token.Token.Text, 'WScript.Echo "hello world!"\n'),
    ]
    assert actual_tokens == expected_tokens


def test_can_lex_webfocus():
    """Check that ``MinimalisticWebFocusLexer`` separates ``-*`` comment lines from code."""
    webfocus_lines = [
        "-*\n",
        "-* comment\n",
        "-set &some='text';\n",
        "table file some print * end;",
    ]
    actual_tokens = list(pygount.lexers.MinimalisticWebFocusLexer().get_tokens("".join(webfocus_lines)))
    expected_tokens = [
        (token.Token.Comment.Single, "-*\n"),
        (token.Token.Comment.Single, "-* comment\n"),
        (token.Token.Text, "-set &some='text';\n"),
        (token.Token.Text, "table file some print * end;\n"),
    ]
    assert actual_tokens == expected_tokens


def test_can_lex_plain_text():
    """Check the tokens ``PlainTextLexer`` yields for lines with text and lines with only white space."""
    plain_lines = [
        "a\n",  # line with text
        "\n",  # empty line
        " \t \n",  # line containing only white space
        "  ",  # trailing white space line without newline character
    ]
    actual_tokens = list(pygount.lexers.PlainTextLexer().get_tokens("".join(plain_lines)))
    expected_tokens = [
        (token.Token.Comment.Single, "a\n"),
        (token.Token.Text, "\n \t \n  \n"),
    ]
    assert actual_tokens == expected_tokens


================================================
FILE: tests/test_summary.py
================================================
"""
Tests to summarize analyses of multiple source codes.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
from pygount.analysis import SourceAnalysis, SourceState
from pygount.summary import LanguageSummary, ProjectSummary


def test_can_repr_language_summary():
    """``repr()`` and ``str()`` of a language summary show the language and all its counts."""
    python_summary = LanguageSummary("Python")
    python_summary.add(SourceAnalysis("some.py", "Python", "some", 2, 3, 4, 5, SourceState.analyzed))
    actual_repr = repr(python_summary)
    assert actual_repr == (
        "LanguageSummary(language='Python', file_count=1, "
        "code_count=2, documentation_count=3, empty_count=4, string_count=5)"
    )
    assert repr(python_summary) == str(python_summary)


def test_can_repr_pseudo_language_summary():
    """``repr()`` and ``str()`` of a pseudo language summary show only language and file count."""
    empty_summary = LanguageSummary("__empty__")
    empty_summary.add(SourceAnalysis("some.py", "__empty__", "some", 0, 0, 0, 0, SourceState.empty))
    actual_repr = repr(empty_summary)
    assert actual_repr == "LanguageSummary(language='__empty__', file_count=1)"
    assert repr(empty_summary) == str(empty_summary)


def test_can_summarize_project_with_multiple_files_of_same_language():
    """Counts of two Python files are added up under a single language."""
    summary = ProjectSummary()
    summary.add(SourceAnalysis("some.py", "Python", "some", 300, 70, 4, 2, SourceState.analyzed))
    summary.add(SourceAnalysis("other.py", "Python", "some", 700, 30, 6, 3, SourceState.analyzed))

    summarized_languages = set(summary.language_to_language_summary_map.keys())
    assert summarized_languages == {"Python"}
    assert summary.total_file_count == 2
    assert summary.total_code_count == 1000
    assert summary.total_documentation_count == 100
    assert summary.total_empty_count == 10
    assert summary.total_string_count == 5


def test_can_summarize_project_with_multiple_files_of_different_languages():
    """Counts of a Python and a Bash file are added up while both languages are kept apart."""
    summary = ProjectSummary()
    summary.add(SourceAnalysis("some.py", "Python", "some", 1000, 100, 10, 3, SourceState.analyzed))
    summary.add(SourceAnalysis("some.sh", "Bash", "some", 200, 20, 5, 2, SourceState.analyzed))

    summarized_languages = set(summary.language_to_language_summary_map.keys())
    assert summarized_languages == {"Bash", "Python"}
    assert summary.total_file_count == 2
    assert summary.total_code_count == 1200
    assert summary.total_documentation_count == 120
    assert summary.total_empty_count == 15
    assert summary.total_string_count == 5

    expected_repr = "ProjectSummary(total_file_count=2, total_line_count=1340, languages=['Bash', 'Python'])"
    assert repr(summary) == expected_repr


def test_can_summarize_project_with_pseudo_languages():
    """Files of pseudo languages count as files but do not contribute to any line count."""
    pseudo_analyses = [
        SourceAnalysis("empty.py", "__empty__", "some", 0, 0, 0, 0, SourceState.empty),
        SourceAnalysis("generated.py", "__generated__", "some", 1, 2, 3, 4, SourceState.generated, "generated by test"),
        SourceAnalysis("binary.bin", "__binary__", "some", 0, 0, 0, 0, SourceState.binary),
    ]
    expected_languages = set()
    for pseudo_analysis in pseudo_analyses:
        expected_languages.add(pseudo_analysis.language)

    summary = ProjectSummary()
    for pseudo_analysis in pseudo_analyses:
        summary.add(pseudo_analysis)

    assert summary.total_file_count == 3
    assert set(summary.language_to_language_summary_map.keys()) == expected_languages
    assert summary.total_code_count == 0
    assert summary.total_documentation_count == 0
    assert summary.total_empty_count == 0
    assert summary.total_string_count == 0

    expected_repr = (
        "ProjectSummary(total_file_count=3, total_line_count=0, languages=['__binary__', '__empty__', '__generated__'])"
    )
    assert repr(summary) == expected_repr


def test_can_repr_empty_project_summary():
    """``repr()`` and ``str()`` of a project summary without any files show zero counts."""
    empty_summary = ProjectSummary()
    expected_repr = "ProjectSummary(total_file_count=0, total_line_count=0, languages=[])"
    assert repr(empty_summary) == expected_repr
    assert repr(empty_summary) == str(empty_summary)


================================================
FILE: tests/test_write.py
================================================
"""
Test to write results of pygount analyses.
"""

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import io
import re
import tempfile
from pathlib import Path
from xml.etree import ElementTree

import pytest

from pygount import analysis, write

from ._common import TempFolderTest


def test_can_collect_totals():
    """A ``BaseWriter`` collects file and line totals and runtime statistics of the added analyses."""
    python_analyses = [
        analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed, None),
        analysis.SourceAnalysis("other.py", "Python", "some", 10, 20, 30, 40, analysis.SourceState.analyzed, None),
    ]
    with tempfile.NamedTemporaryFile("w", encoding="utf-8", prefix="pygount_", suffix=".tmp") as target_stream:
        with write.BaseWriter(target_stream) as writer:
            for python_analysis in python_analyses:
                writer.add(python_analysis)
    summary = writer.project_summary
    assert summary.total_file_count == 2
    assert summary.total_line_count == 110
    assert writer.duration_in_seconds > 0
    assert writer.lines_per_second > writer.files_per_second


def test_can_write_cloc_xml():
    """The XML written by ``ClocXmlWriter`` can be parsed and has one ``file`` element per analysis."""
    python_analyses = [
        analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed, None),
        analysis.SourceAnalysis("other.py", "Python", "some", 10, 20, 30, 40, analysis.SourceState.analyzed, None),
    ]
    with io.StringIO() as xml_target:
        with write.ClocXmlWriter(xml_target) as writer:
            for python_analysis in python_analyses:
                writer.add(python_analysis)
        # Fetch the XML before the stream is closed and its content is lost.
        written_xml = xml_target.getvalue()
        assert len(written_xml) >= 1
    with io.StringIO(written_xml) as xml_source:
        cloc_root = ElementTree.parse(xml_source)
    found_file_elements = cloc_root.findall("files/file")
    assert found_file_elements is not None
    assert len(found_file_elements) == len(python_analyses)


def test_can_compute_digit_width():
    """``digit_width`` returns the expected width for boundary values such as 0, 9, 999 and 1000."""
    expected_width_by_value = {0: 1, 1: 1, 9: 1, 999: 3, 1000: 4}
    for value, expected_width in expected_width_by_value.items():
        assert write.digit_width(value) == expected_width


# Matches runs of word characters, backslashes and dots. _LineData uses it to
# split a summary line into the language name and the numeric cells.
_LINE_WORD_REGEX = re.compile(r"[\w\\.]+")  # HACK: For test assume all language names are "\w+".


class _LineData:
    """Typed access to the cells of one summary line: a language followed by count/percentage pairs."""

    # Attribute name and converter for each cell after the language, in line order.
    _NUMERIC_CELLS = (
        ("file_count", int),
        ("file_percentage", float),
        ("code_count", int),
        ("code_percentage", float),
        ("comment_count", int),
        ("comment_percentage", float),
    )

    def __init__(self, line: str):
        cells = _LINE_WORD_REGEX.findall(line)
        self.language = cells[0]
        for cell_index, (attribute_name, convert) in enumerate(self._NUMERIC_CELLS, start=1):
            setattr(self, attribute_name, convert(cells[cell_index]))


class SummaryWriterTest(TempFolderTest):
    """Tests for the summary text that ``write.SummaryWriter`` produces."""

    def test_can_write_summary(self):
        """Check the language lines and the totals line for two Python files and one Bash file."""
        source_analyses = (
            analysis.SourceAnalysis("script.sh", "Bash", "some", 200, 25, 1, 2, analysis.SourceState.analyzed, None),
            analysis.SourceAnalysis("some.py", "Python", "some", 300, 45, 3, 4, analysis.SourceState.analyzed, None),
            analysis.SourceAnalysis("other.py", "Python", "some", 500, 30, 5, 6, analysis.SourceState.analyzed, None),
        )
        lines = self._summary_lines_for(source_analyses)
        assert len(lines) == 8, f"lines={lines}"

        # The Python line is expected before the Bash line.
        python_data = _LineData(lines[3])
        assert python_data.language == "Python"
        assert python_data.file_count == 2
        assert python_data.file_percentage == pytest.approx(66.7)
        assert python_data.code_count == 800
        assert python_data.code_percentage == pytest.approx(89.6)
        assert python_data.comment_count == 75
        assert python_data.comment_percentage == pytest.approx(8.4)

        bash_data = _LineData(lines[4])
        assert bash_data.language == "Bash"
        assert bash_data.file_count == 1
        # One of three files; mirrors the file percentage checks of the other lines.
        assert bash_data.file_percentage == pytest.approx(33.3)
        assert bash_data.code_count == 200
        assert bash_data.code_percentage == pytest.approx(87.7)
        assert bash_data.comment_count == 25
        assert bash_data.comment_percentage == pytest.approx(11.0)

        # The second to last line holds the sums across all languages.
        sum_total_data = _LineData(lines[-2])
        assert sum_total_data.file_count == 3
        assert sum_total_data.file_percentage == pytest.approx(100.0)
        assert sum_total_data.code_count == 1000
        assert sum_total_data.code_percentage == pytest.approx(89.2)
        assert sum_total_data.comment_count == 100
        assert sum_total_data.comment_percentage == pytest.approx(8.9)

    def _summary_lines_for(self, source_analyses):
        """
        Feed ``source_analyses`` to a ``write.SummaryWriter`` that writes to a
        file in the temporary test folder and return the lines of that file.
        """
        # NOTE: We need to write to a file because the lines containing the
        # actual data are only available during close() at which point they
        # would not be accessible to StringIO.getvalue().
        summary_path = Path(self.tests_temp_folder, "summary.tmp")
        with summary_path.open("w", encoding="utf-8") as summary_file, write.SummaryWriter(summary_file) as writer:
            for source_analysis in source_analyses:
                writer.add(source_analysis)
        return summary_path.read_text("utf-8").splitlines()


================================================
FILE: tests/test_xmldialect.py
================================================
"""
Tests for function to obtain the language dialect used by XML source code.
"""

import pytest

# Copyright (c) 2016-2024, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import pygount.xmldialect
from pygount.xmldialect import without_xml_header

# XML document with a plain <project> root element containing a <target>;
# test_can_detect_ant() expects it to be reported as "Ant".
EXAMPLE_ANT_CODE = """<project name="hello">
    <target name="hello">
        <echo message="Hello world!" />
    </target>
</project>
"""

# XML document whose <project> root element declares the Maven POM namespace;
# test_can_detect_maven() expects it to be reported as "Maven".
_EXAMPLE_POM_CODE = """<project
  xmlns="http://maven.apache.org/POM/4.0.0"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.mycompany.app</groupId>
  <artifactId>my-app</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>Maven Quick Start Archetype</name>
  <url>http://maven.apache.org</url>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8.2</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>"""

# XML document without an XML header that starts with a DOCTYPE naming the
# DocBook DTD; test_can_detect_docbook_from_dtd() expects "DocBook XML".
_EXAMPLE_DOCBOOK_DTD_CODE = """<!DOCTYPE example PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
    "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">
<example><title>Hello World in Python</title>
<programlisting>
print('Hello World!')
</programlisting>
</example>
"""

# SVG document with both an XML header and a DOCTYPE naming the SVG DTD;
# test_can_detect_svg_from_dtd() expects "SVG XML". Only the header fragment
# ends with a newline, the remaining fragments are joined into a single line.
_EXAMPLE_SVG_CODE = (
    '<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n'
    '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">'
    '<svg xmlns="http://www.w3.org/2000/svg" width="320" height="120" viewBox="0 0 320 120">'
    '  <rect width="100%" height="100%" fill="white"/>'
    '  <text x="50%" y="50%" text-anchor="middle" dominant-baseline="middle" '
    '   font-family="sans-serif" font-size="32" fill="black"'
    "  >Hello, world!</text>"
    "</svg>"
)


@pytest.mark.parametrize(
    "xml_code,expected",
    [
        # No XML header at all: the code is returned unchanged.
        ("<some/>", "<some/>"),
        # XML header with optional whitespace or line breaks around it.
        ('<?xml version="1.0"?><some/>', "<some/>"),
        ('  <?xml version="1.0"?><some/>', "<some/>"),
        ('<?xml version="1.0"?>  <some/>', "<some/>"),
        ('\n\n<?xml version="1.0"?>\n\n<some/>', "<some/>"),
    ],
)
def test_can_compute_xml_code_without_header(xml_code: str, expected: str):
    """``without_xml_header`` drops an XML header and the whitespace around it, leaving the root element."""
    assert without_xml_header(xml_code) == expected


def test_can_detect_ant():
    """The Ant example is reported as the "Ant" dialect."""
    detected_dialect = pygount.xmldialect.xml_dialect("<ant>", EXAMPLE_ANT_CODE)
    assert detected_dialect == "Ant"


def test_can_detect_maven():
    """The POM example is reported as the "Maven" dialect."""
    detected_dialect = pygount.xmldialect.xml_dialect("<maven>", _EXAMPLE_POM_CODE)
    assert detected_dialect == "Maven"


def test_can_ignore_broken_xml():
    """XML with mismatched tags yields no dialect."""
    broken_xml_code = "<some></other>"
    detected_dialect = pygount.xmldialect.xml_dialect("<broken>", broken_xml_code)
    assert detected_dialect is None


def test_can_detect_docbook_from_dtd():
    """The example with a DocBook DOCTYPE is reported as "DocBook XML"."""
    detected_dialect = pygount.xmldialect.xml_dialect("<docbook-dtd>", _EXAMPLE_DOCBOOK_DTD_CODE)
    assert detected_dialect == "DocBook XML"


def test_can_detect_svg_from_dtd():
    """The example with an SVG DOCTYPE is reported as "SVG XML"."""
    detected_dialect = pygount.xmldialect.xml_dialect("<svg>", _EXAMPLE_SVG_CODE)
    assert detected_dialect == "SVG XML"