[
  {
    "path": ".gitattributes",
    "content": "* text=auto\n"
  },
  {
    "path": ".github/workflows/build.yml",
    "content": "# Continuous integration build for pygount.\nname: Build\n\non: [push, pull_request]\n\njobs:\n  build:\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        python-version: [\"3.9\", \"3.10\", \"3.11\", \"3.12\", \"3.13\"]\n    env:\n      MAIN_PYTHON_VERSION: \"3.12\" # same as Ubuntu 24 LTS\n\n    steps:\n      - uses: actions/checkout@v4\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v5\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install uv\n        uses: astral-sh/setup-uv@v6\n        with:\n          # NOTE Using the \"latest\" version of uv is risky, but for the time being uv is updated\n          #  regularly, so a specific version would be outdated rather quickly. Once uv goes\n          #  version 1.0, this should be changed to something like \">=1 <2\".\n          version: \"latest\"\n      - name: Load cached venv\n        id: cached-uv-dependencies\n        uses: actions/cache@v4\n        with:\n          path: .venv\n          key: venv-${{ runner.os }}-${{ hashFiles('**/uv.lock') }}\n      - name: Install dependencies\n        if: steps.cached-uv-dependencies.outputs.cache-hit != 'true'\n        run: |\n          uv sync\n      - name: Build pygount package\n        run: |\n          uv build\n      - name: Run the test suite\n        run: |\n          uv run pytest --cov=pygount --cov-branch\n      - name: Build documentation\n        if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}\n        run: |\n          uv run sh scripts/build_documentation.sh\n      - name: Update coveralls statistics\n        if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n        run: |\n          uv run coveralls --service=github\n\n  check-style:\n    runs-on: ubuntu-latest\n    # Disable pre-commit check on main and production to prevent\n    # pull request merges 
to fail with \"don't commit to branch\".\n    if: github.ref != 'refs/heads/main'\n    steps:\n      - uses: actions/checkout@v4\n      - name: Set up Python ${{ env.MAIN_PYTHON_VERSION }}\n        uses: actions/setup-python@v5\n        with:\n          python-version: ${{ env.MAIN_PYTHON_VERSION }}\n      - name: Install pre-commit\n        run: |\n          pip install pre-commit\n      - name: Load cached pre-commit\n        id: cached-pre-commit\n        uses: actions/cache@v4\n        with:\n          path: ~/.cache/pre-commit\n          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}\n      - name: Install pre-commit hooks\n        if: steps.cached-pre-commit.outputs.cache-hit != 'true'\n        run: pre-commit install --install-hooks\n      - name: Check coding style\n        run: pre-commit run --all-files\n"
  },
  {
    "path": ".gitignore",
    "content": "\n# Created by https://www.toptal.com/developers/gitignore/api/python,pycharm\n# Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm\n\n### PyCharm ###\n# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider\n# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839\n\n# User-specific stuff\n.idea/**/workspace.xml\n.idea/**/tasks.xml\n.idea/**/usage.statistics.xml\n.idea/**/dictionaries\n.idea/**/shelf\n\n# AWS User-specific\n.idea/**/aws.xml\n\n# Generated files\n.idea/**/contentModel.xml\n\n# Sensitive or high-churn files\n.idea/**/dataSources/\n.idea/**/dataSources.ids\n.idea/**/dataSources.local.xml\n.idea/**/sqlDataSources.xml\n.idea/**/dynamic.xml\n.idea/**/uiDesigner.xml\n.idea/**/dbnavigator.xml\n\n# Gradle\n.idea/**/gradle.xml\n.idea/**/libraries\n\n# Gradle and Maven with auto-import\n# When using Gradle or Maven with auto-import, you should exclude module files,\n# since they will be recreated, and may cause churn.  
Uncomment if using\n# auto-import.\n# .idea/artifacts\n# .idea/compiler.xml\n# .idea/jarRepositories.xml\n# .idea/modules.xml\n# .idea/*.iml\n# .idea/modules\n# *.iml\n# *.ipr\n\n# CMake\ncmake-build-*/\n\n# Mongo Explorer plugin\n.idea/**/mongoSettings.xml\n\n# File-based project format\n*.iws\n\n# IntelliJ\nout/\n\n# mpeltonen/sbt-idea plugin\n.idea_modules/\n\n# JIRA plugin\natlassian-ide-plugin.xml\n\n# Cursive Clojure plugin\n.idea/replstate.xml\n\n# Crashlytics plugin (for Android Studio and IntelliJ)\ncom_crashlytics_export_strings.xml\ncrashlytics.properties\ncrashlytics-build.properties\nfabric.properties\n\n# Editor-based Rest Client\n.idea/httpRequests\n\n# Android studio 3.1+ serialized cache file\n.idea/caches/build_file_checksums.ser\n\n### PyCharm Patch ###\n# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721\n\n# *.iml\n# modules.xml\n# .idea/misc.xml\n# *.ipr\n\n# Sonarlint plugin\n# https://plugins.jetbrains.com/plugin/7973-sonarlint\n.idea/**/sonarlint/\n\n# SonarQube Plugin\n# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin\n.idea/**/sonarIssues.xml\n\n# Markdown Navigator plugin\n# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced\n.idea/**/markdown-navigator.xml\n.idea/**/markdown-navigator-enh.xml\n.idea/**/markdown-navigator/\n\n# Cache file creation bug\n# See https://youtrack.jetbrains.com/issue/JBR-2257\n.idea/$CACHE_FILE$\n\n# CodeStream plugin\n# https://plugins.jetbrains.com/plugin/12206-codestream\n.idea/codestream.xml\n\n### Python ###\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller 
builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# End of https://www.toptal.com/developers/gitignore/api/python,pycharm\n\n# Various\n.DS_Store\n.pytest_cache\n/.idea/ruff.xml\n/build/\n/dist/\n/cloc.xml\n/tests/.temp/\n/htmlcov/\n"
  },
  {
    "path": ".idea/.gitignore",
    "content": "# Default ignored files\n/shelf/\n/workspace.xml\n# Editor-based HTTP Client requests\n/httpRequests/\n# Datasource local storage ignored files\n/dataSources/\n/dataSources.local.xml\n"
  },
  {
    "path": ".idea/encodings.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"Encoding\" addBOMForNewFiles=\"with NO BOM\" />\n</project>"
  },
  {
    "path": ".idea/inspectionProfiles/Project_Default.xml",
    "content": "<component name=\"InspectionProjectProfileManager\">\n  <profile version=\"1.0\">\n    <option name=\"myName\" value=\"Project Default\" />\n    <inspection_tool class=\"PyCompatibilityInspection\" enabled=\"true\" level=\"WARNING\" enabled_by_default=\"true\">\n      <option name=\"ourVersions\">\n        <value>\n          <list size=\"5\">\n            <item index=\"0\" class=\"java.lang.String\" itemvalue=\"3.7\" />\n            <item index=\"1\" class=\"java.lang.String\" itemvalue=\"3.8\" />\n            <item index=\"2\" class=\"java.lang.String\" itemvalue=\"3.9\" />\n            <item index=\"3\" class=\"java.lang.String\" itemvalue=\"3.10\" />\n            <item index=\"4\" class=\"java.lang.String\" itemvalue=\"3.10\" />\n          </list>\n        </value>\n      </option>\n    </inspection_tool>\n    <inspection_tool class=\"PyPackageRequirementsInspection\" enabled=\"true\" level=\"WARNING\" enabled_by_default=\"true\">\n      <option name=\"ignoredPackages\">\n        <value>\n          <list size=\"2\">\n            <item index=\"0\" class=\"java.lang.String\" itemvalue=\"coverage\" />\n            <item index=\"1\" class=\"java.lang.String\" itemvalue=\"setuptools\" />\n          </list>\n        </value>\n      </option>\n    </inspection_tool>\n  </profile>\n</component>"
  },
  {
    "path": ".idea/misc.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"Black\">\n    <option name=\"sdkName\" value=\"Poetry (pygount)\" />\n  </component>\n  <component name=\"ProjectRootManager\" version=\"2\" project-jdk-name=\"uv (pygount)\" project-jdk-type=\"Python SDK\" />\n  <component name=\"PythonCompatibilityInspectionAdvertiser\">\n    <option name=\"version\" value=\"3\" />\n  </component>\n</project>"
  },
  {
    "path": ".idea/modules.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectModuleManager\">\n    <modules>\n      <module fileurl=\"file://$PROJECT_DIR$/.idea/pygount.iml\" filepath=\"$PROJECT_DIR$/.idea/pygount.iml\" />\n    </modules>\n  </component>\n</project>"
  },
  {
    "path": ".idea/pyProjectModel.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"PyProjectModelSettings\">\n    <option name=\"showConfigurationNotification\" value=\"false\" />\n  </component>\n</project>"
  },
  {
    "path": ".idea/pygount.iml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n  <component name=\"NewModuleRootManager\">\n    <content url=\"file://$MODULE_DIR$\">\n      <excludeFolder url=\"file://$MODULE_DIR$/dist\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/.pytest_cache\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/.idea/libraries\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/pygount.egg-info\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/htmlcov\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/.venv\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/build\" />\n      <excludeFolder url=\"file://$MODULE_DIR$/site\" />\n    </content>\n    <orderEntry type=\"jdk\" jdkName=\"uv (pygount)\" jdkType=\"Python SDK\" />\n    <orderEntry type=\"sourceFolder\" forTests=\"false\" />\n    <orderEntry type=\"library\" name=\"R User Library\" level=\"project\" />\n    <orderEntry type=\"library\" name=\"R Skeletons\" level=\"application\" />\n  </component>\n  <component name=\"PackageRequirementsSettings\">\n    <option name=\"requirementsPath\" value=\"\" />\n  </component>\n  <component name=\"TestRunnerService\">\n    <option name=\"PROJECT_TEST_RUNNER\" value=\"py.test\" />\n  </component>\n</module>"
  },
  {
    "path": ".idea/vcs.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"VcsDirectoryMappings\">\n    <mapping directory=\"$PROJECT_DIR$\" vcs=\"Git\" />\n  </component>\n</project>"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "exclude: \"^.idea\"\n\nrepos:\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: v0.15.12\n    hooks:\n      - id: ruff\n        args: [\"--fix\"]\n      - id: ruff-format\n\n  - repo: https://github.com/pre-commit/mirrors-prettier\n    rev: v3.1.0\n    hooks:\n      - id: prettier\n\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v6.0.0\n    hooks:\n      - id: fix-byte-order-marker\n      - id: trailing-whitespace\n      - id: end-of-file-fixer\n      - id: mixed-line-ending\n      - id: check-added-large-files\n      - id: check-ast\n      - id: check-json\n      - id: check-merge-conflict\n      - id: check-xml\n      - id: check-yaml\n      - id: debug-statements\n\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v6.0.0\n    hooks:\n      - id: no-commit-to-branch\n        args: [\"--branch\", \"main\"]\n"
  },
  {
    "path": ".readthedocs.yaml",
    "content": "# Settings for \"Read the Docs\" build.\n# See <https://docs.readthedocs.io/>.\nversion: 2\n\nbuild:\n  os: \"ubuntu-24.04\"\n  tools:\n    python: \"3.14\"\n\nmkdocs:\n  configuration: mkdocs.yaml\n\npython:\n  install:\n    - method: uv\n      command: sync\n      groups:\n        - dev\n"
  },
  {
    "path": "CHANGES.md",
    "content": "# Version history\n\nFor more information about which versions of pygount included what changes\nread the\n[respective chapter of the documentation](https://pygount.readthedocs.io/en/latest/changes/).\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to pygount\n\nFor more information on building pygount and contributing to it, read the\n[respective chapter of the documentation](https://pygount.readthedocs.io/en/latest/contributing/).\n"
  },
  {
    "path": "LICENSE.txt",
    "content": "Copyright (c) 2016-2024, Thomas Aglassinger\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice, this\n  list of conditions and the following disclaimer.\n\n* Redistributions in binary form must reproduce the above copyright notice,\n  this list of conditions and the following disclaimer in the documentation\n  and/or other materials provided with the distribution.\n\n* Neither the name of pygount nor the names of its contributors may be used to\n  endorse or promote products derived from this software without specific\n  prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "README.md",
    "content": "[![PyPI](https://img.shields.io/pypi/v/pygount)](https://pypi.org/project/pygount/)\n[![Python Versions](https://img.shields.io/pypi/pyversions/pygount.svg)](https://www.python.org/downloads/)\n[![Build Status](https://github.com/roskakori/pygount/actions/workflows/build.yml/badge.svg)](https://github.com/roskakori/pygount/actions/workflows/build.yml)\n[![Test Coverage](https://img.shields.io/coveralls/github/roskakori/pygount)](https://coveralls.io/r/roskakori/pygount?branch=main)\n[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n[![License](https://img.shields.io/github/license/roskakori/pygount)](https://opensource.org/licenses/BSD-3-Clause)\n\n# pygount\n\nPygount is a command line tool to scan folders for source code files and\ncount the number of source code lines in it. It is similar to tools like\n[sloccount](https://www.dwheeler.com/sloccount/) and\n[cloc](https://github.com/AlDanial/cloc) but uses the\n[pygments](https://pygments.org/)\npackage to analyze the source code and consequently can analyze any\n[programming language supported by pygments](https://pygments.org/languages/).\n\nThe name is a combination of pygments and count.\n\nPygount is open source and distributed under the\n[BSD license](https://opensource.org/licenses/BSD-3-Clause). 
The source\ncode is available from https://github.com/roskakori/pygount.\n\n## Quickstart\n\nFor installation run\n\n```bash\n$ pip install pygount\n```\n\nor use [uv](https://docs.astral.sh/uv/) to run it directly, for example:\n\n```bash\n$ uvx pygount --version\n```\n\nTo get a list of line counts for a projects stored in a certain folder:\n\n```bash\n$ pygount ~/projects/example\n```\n\nTo limit the analysis to certain file types identified by their suffix:\n\n```bash\n$ pygount --suffix=cfg,py,yml ~/projects/example\n```\n\nTo get a summary of each programming language with sum counts and percentage:\n\n```bash\n$ pygount --format=summary ~/projects/example\n```\n\nTo analyze a remote git repository directly without having to clone it first:\n\n```bash\n$ pygount --format=summary https://github.com/roskakori/pygount.git\n```\n\nYou can pass a specific revision at the end of the remote URL:\n\n```bash\n$ pygount --format=summary https://github.com/roskakori/pygount.git/v1.5.1\n```\n\nThis example results in the following summary output:\n\n```\n┏━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━┓\n┃ Language         ┃ Files ┃     % ┃ Code ┃    % ┃ Comment ┃    % ┃\n┡━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━┩\n│ Python           │    18 │  47.4 │ 2132 │ 63.6 │     418 │ 12.5 │\n│ TOML             │     2 │   5.3 │ 1204 │ 82.7 │       1 │  0.1 │\n│ Batchfile        │     1 │   2.6 │   24 │ 68.6 │       1 │  2.9 │\n│ Bash             │     2 │   5.3 │   12 │ 80.0 │       3 │ 20.0 │\n│ Makefile         │     1 │   2.6 │    9 │ 45.0 │       7 │ 35.0 │\n│ reStructuredText │     9 │  23.7 │    0 │  0.0 │     438 │ 50.2 │\n│ Markdown         │     3 │   7.9 │    0 │  0.0 │      53 │ 49.1 │\n│ Text only        │     2 │   5.3 │    0 │  0.0 │      24 │ 82.8 │\n├──────────────────┼───────┼───────┼──────┼──────┼─────────┼──────┤\n│ Sum              │    38 │ 100.0 │ 3381 │ 57.4 │     945 │ 16.1 
│\n└──────────────────┴───────┴───────┴──────┴──────┴─────────┴──────┘\n```\n\nPlenty of tools can post process SLOC information, for example the\n[SLOCCount plug-in](https://wiki.jenkins-ci.org/display/JENKINS/SLOCCount+Plugin)\nfor the [Jenkins](https://jenkins.io/) continuous integration server.\n\nA popular format for such tools is the XML format used by cloc, which pygount\nalso supports and can store in an output file:\n\n```bash\n$ pygount --format=cloc-xml --out=cloc.xml ~/projects/example\n```\n\nTo get a short description of all available command line options use:\n\n```bash\n$ pygount --help\n```\n\nFor more information and examples read the documentation chapter on\n[Usage](https://pygount.readthedocs.io/en/latest/usage/).\n\n## Contributions\n\nTo report bugs, visit the\n[issue tracker](https://github.com/roskakori/pygount/issues).\n\nIn case you want to play with the source code or contribute improvements, see\n[CONTRIBUTING](https://pygount.readthedocs.io/en/latest/contributing/).\n\n## Version history\n\nSee [CHANGES](https://pygount.readthedocs.io/en/latest/changes/).\n"
  },
  {
    "path": "docs/api.md",
    "content": "# API\n\n## Overview\n\nPygount provides a simple API to integrate with other tools. This, however, is currently still a work in progress and subject to change.\n\nHere's an example on how to analyze one of pygount's own source codes:\n\n```pycon\n>>> from pygount import SourceAnalysis\n>>> SourceAnalysis.from_file(\"pygount/analysis.py\", \"pygount\")\nSourceAnalysis(path='pygount/analysis.py', language='Python', group='pygount', state=analyzed, code_count=509, documentation_count=141, empty_count=117, string_count=23)\n```\n\nInformation about multiple source files can be summarized using `ProjectSummary`:\n\nFirst, set up the summary:\n\n```pycon\n>>> from pygount import ProjectSummary\n>>> project_summary = ProjectSummary()\n```\n\nNext, find some files to analyze:\n\n```pycon\n>>> from glob import glob\n>>> source_paths = glob(\"pygount/*.py\") + glob(\"*.md\")\n>>> source_paths\n['pygount/command.py', 'pygount/analysis.py', 'pygount/write.py', 'pygount/__init__.py', 'pygount/xmldialect.py', 'pygount/summary.py', 'pygount/common.py', 'pygount/lexers.py', 'README.md', 'CONTRIBUTING.md', 'CHANGES.md']\n```\n\nThen analyze them:\n\n```pycon\n>>> for source_path in source_paths:\n...     source_analysis = SourceAnalysis.from_file(source_path, \"pygount\")\n...     project_summary.add(source_analysis)\n```\n\nFinally, take a look at the information collected, for example, by printing the values of `ProjectSummary.language_to_language_summary_map`:\n\n```pycon\n>>> for language_summary in project_summary.language_to_language_summary_map.values():\n...   print(language_summary)\n...\nLanguageSummary(language='Python', file_count=8, code=1232, documentation=295, empty=331, string=84)\nLanguageSummary(language='markdown', file_count=3, code=64, documentation=0, empty=29, string=14)\n```\n"
  },
  {
    "path": "docs/background.md",
    "content": "# Background\n\n## How pygount counts code\n\nPygount primarily counts the physical lines of source code. It begins by using lexers from Pygments, if available. If Pygments doesn't have a suitable lexer, pygount employs its own internal lexers to differentiate between code and comments. These include:\n\n- Minimalist lexers for m4, VBScript, and WebFOCUS, capable of distinguishing between comments and code.\n- The Java lexer repurposed for OMG IDL.\n\nAdditionally, plain text is treated with a separate lexer that considers all lines as comments.\n\nLines consisting solely of comment tokens or whitespace are counted as comments.\n\nLines with only whitespace are ignored.\n\nAll other content is considered code.\n\n## White characters\n\nA line containing only \"white characters\" is also ignored because they do not contribute to code complexity in any meaningful way. Currently, white characters are:\n\n```\n(),:;[]{}\n```\n\nBecause of that, pygount tends to report about 5 to 15 percent fewer SLOC for C-like languages than other similar tools.\n\n## No operations\n\nFor some languages, \"no operations\" are detected and treated as white space. For example, Python's `pass` or Transact-SQL's `begin` and `end`.\n\nAs an example, consider this Python code:\n\n```python\nclass SomeError(Exception):\n    \"\"\"\n    Some error caused by some issue.\n    \"\"\"\n    pass\n```\n\nThis counts as 1 line of code and 3 lines of comments. The line with `pass` is considered a \"no operation\" and thus not taken into account.\n\n## Pure string lines\n\nMany programming languages support the concept of strings, which typically often contain text to be shown to the end user or simple constant values. Similar to white character and \"no operations\" in most cases, they do not add much to the complexity of the code. 
Notable exceptions are strings containing code for domain-specific languages, templates, or SQL statements.\n\nPygount currently takes an opinionated approach on how to count pure string lines depending on the output format:\n\n- With `--format=summary`, pure string lines are ignored similar to empty lines\n- With `--format` set to `sloccount` or `cloc-xml` string lines are counted as code, resulting in somewhat similar counts as the original tools.\n- With `--format=json` all variants are available as attributes, and you can choose which one you prefer.\n\nIn hindsight, this is an inconsistency that might warrant a cleanup. See issue [#122](https://github.com/roskakori/pygount/issues/122) for a discussion and issue [#152](https://github.com/roskakori/pygount/issues/152) for a plan on how to clean this up.\n\n## Binary files\n\nA file is considered to be binary when all the following\nconditions match:\n\n1. The file does not start with a BOM for UTF-8, UTF-16 or UTF-32 (which indicates text files).\n2. The initial 8192 bytes contain at least one 0-byte.\n\nIn this case, pygount assigns it the pseudo language `__binary__` and performs no further analysis.\n\n## Generated files\n\nGenerated files are recognized either by their content (`--generated`) or name (`--generated-names`). Use `--help` to see the current default patterns.\n\nIn case you think the standard patterns should be extended, modify `pygount.analysis.DEFAULT_GENERATED_LINE|NAME_PATTERNS_TEXT` and [contribute a pull request](contributing.md).\n\nFor source code repositories, committing generated files should generally be avoided. Instead, make the generation part of the build process. However, there are valid reasons to include generated files:\n\n1. Package managers generate \"lock\" files from the package specification to ensure builds use the exact same versions and hashes. For example, \"pyproject.toml\" and \"uv.lock\".\n2. 
Generation takes too long, for example, in Flutter projects with many nested sub-packages.\n3. Generated files cannot be bootstrapped from scratch because of interdependencies.\n4. Cloud tools require certain generated files to be present in the repository. An example would be [ReadTheDocs.org](https://readthedocs.org), which as of May 2025 in combination with [MkDocs](https://www.mkdocs.org/) needs additional dependencies to be specified in a `requirements.txt`. Many Python projects specify their dependencies in `pyproject.toml`, which can be used to generate the `requirements.txt`. However, the ReadTheDocs build does not allow easily including such a step, so the path of least resistance is to just include the generated `requirements.txt` file in the repository.\n\n## Comparison with other tools\n\nPygount can analyze more languages than other common tools such as sloccount or cloc because it builds on `pygments`, which provides lexers for hundreds of languages. This also makes it easy to support another language: [Write your own lexer](http://pygments.org/docs/lexerdevelopment/).\n\nFor certain corner cases, pygount gives more accurate results because it actually lexes the code unlike other tools that mostly look for comment markers and can get confused when they show up inside strings. In practice, though, this should not make much of a difference.\n\nPygount is slower than most other tools. Partially, this is due to actually lexing instead of just scanning the code. Partially, because other tools can use statically compiled languages such as Java or C, which are generally faster than dynamic languages. For many applications though, pygount should be \"fast enough\", especially when running as an asynchronous step during a continuous integration build.\n"
  },
  {
    "path": "docs/changes.md",
    "content": "# Changes\n\nThis chapter describes the changes coming with each new version of\npygount.\n\n## Version 3.3.0, 2026-xx-xx\n\nDevelopment:\n\n- Migrate ReadTheDocs documentation to uv (issue [#221](https://github.com/roskakori/pygount/issues/221)).\n\n## Version 3.2.0, 2026-04-08\n\n- Add detection of SVG as XML dialect (issue [#209](https://github.com/roskakori/pygount/issues/209)).\n- Fix detection of XML dialect when a `<?xml>` header was present.\n\n## Version 3.1.1, 2025-02-17\n\n- Update dependencies and drop support for Python 3.9 (issue [#205](https://github.com/roskakori/pygount/issues/205)).\n\n## Version 3.1.0, 2025-05-27\n\n- Add command line option [`--generated-names`](usage.md#-generated-names) to specify which file names should be considered to be generated. The current patterns recognized are somewhat limited, so contributions are welcome. See the section on \"[Generated files](background.md#generated-files)\" for hints on how to do that (issue [#190](https://github.com/roskakori/pygount/issues/190)).\n- Change documentation from Sphinx to MkDocs in the hope to avoid it breaking regularly (issue [#191](https://github.com/roskakori/pygount/issues/191)).\n\nDevelopment:\n\n- Replace `format()` with f-strings (contributed by Ben Allen, issue [#166](https://github.com/roskakori/pygount/issues/166)).\n- Change sdist archive to include more than just the Python source code.\n\n## Version 3.0.0, 2025-05-23\n\n- Count pure markup files as documentation: (contributed by Tytus Bucholc, issue [#6](https://github.com/roskakori/pygount/issues/6)).\n- Fix silent error on git failing (contributed by Tom De Bièvre, issue [#162](https://github.com/roskakori/pygount/issues/162))\n- Transform common project URLs to repository: (contributed by Tom De Bièvre, issue [#164](https://github.com/roskakori/pygount/issues/164))\n- Change dependency rules for rich to be more lenient (suggested by Brian McGillion, issue 
[#193](https://github.com/roskakori/pygount/issues/193))\n\n## Version 2.0.0, 2025-03-16\n\n- Fix `TypeError` when processing files with a magic encoding comment specifying an unknown encoding and using `--format=json` (contributed by PyHedgehog, issue [#176](https://github.com/roskakori/pygount/issues/176))\n- Fix false positives when extracting the encoding from magic coding comments (issue [#184](https://github.com/roskakori/pygount/issues/184))\n- Add support for Python 3.13 and later (issue [#174](https://github.com/roskakori/pygount/issues/174))\n- Remove temporary directory in the output of a git analysis (contributed by Isabel Beckenbach, issue [#113](https://github.com/roskakori/pygount/issues/113))\n- Remove support for Python 3.8 (issue [#158](https://github.com/roskakori/pygount/issues/158))\n- Development: Change packaging to uv (issue [#180](https://github.com/roskakori/pygount/issues/180)).\n- Development: Change linter to ruff and in turn, clean up code (issue [#157](https://github.com/roskakori/pygount/issues/157)).\n- Development: Change default branch to main (issue [#160](https://github.com/roskakori/pygount/issues/160)).\n- Removed deprecated code: (contributed by Marco Gambone and Niels Vanden Bussche, issue [#47](https://github.com/roskakori/pygount/issues/47)).\n\n## Version 1.8.0, 2024-05-13\n\n- Add all available counts and percentages to JSON format (issue [#122](https://github.com/roskakori/pygount/issues/122)).\n\n  In particular, this makes available the `codeCount`, which is similar to the already existing `sourceCount` but does exclude lines that contain only strings. 
You can check their availability by validating that the `formatVersion` is at least 1.1.0.\n\n  The documentation about \"[How pygount counts code](background.md#how-pygount-counts-code)\" has more information about the available counts and the ways they are computed.\n\n  Pygount 2.0 will probably introduce some breaking changes in this area, which can already be previewed and discussed at issue [#152](https://github.com/roskakori/pygount/issues/152).\n\n## Version 1.7.0, 2024-05-13\n\n- Fix analysis with [FIPS](https://en.wikipedia.org/wiki/Federal_Information_Processing_Standards) mode by changing computation of hash for duplicate detection from MD5 to SHA256. As a side effect, reasonably modern machines should receive a (probably unnoticeable) minor performance boost (contributed by Matthew Vine, issue [#137](https://github.com/roskakori/pygount/issues/137)).\n- Add command line option `--merge-embedded-languages` to merge embedded languages into their base language. For example, \"HTML+Django/Jinja\" counts as \"HTML\" (issue [#105](https://github.com/roskakori/pygount/issues/105)).\n- Add Python 3.12 and make it the main version for CI (issue [#145](https://github.com/roskakori/pygount/issues/145)).\n\n## Version 1.6.1, 2023-07-02\n\n- Fix missing check for seekable file handles (issue [#114](https://github.com/roskakori/pygount/issues/114)).\n- Fix the ReadTheDocs documentation build by switching to the built-in alabaster Sphinx theme (issue [#116](https://github.com/roskakori/pygount/issues/116)).\n\n## Version 1.6.0, 2023-06-26\n\n- Add support for analysis of remote git URLs in addition to local files (contributed by Rojdi Thomallari, issue [#109](https://github.com/roskakori/pygount/issues/109)).\n- Removed support for Python 3.7.\n- Improve API:\n  - Add an option to pass a file handle to `SourceAnalysis.from_file()` (contributed by Dominik George, issue [#100](https://github.com/roskakori/pygount/issues/100)).\n\n## Version 1.5.1, 2023-01-02\n\n- Remove progress bar for `--format=sloccount` 
because it resulted into blank lines when running on Windows and could cause interwoven output on Unix (issue [#91](https://github.com/roskakori/pygount/issues/91)).\n\n## Version 1.5.0, 2022-12-30\n\n- Remove support for Python 3.6 and update dependencies (issue [#93](https://github.com/roskakori/pygount/issues/93)).\n\n## Version 1.4.0, 2022-04-09\n\n- Add progress bar during scan phase and improve visual design of `--format=summary` (contributed by Stanislav Zmiev, issue [#73](https://github.com/roskakori/pygount/issues/73)).\n- Add percentages to API. For example in addition to `code_count` now there also is `code_percentage`.\n\n## Version 1.3.0, 2022-01-06\n\n- Fix computation of \"lines per second\", which was a copy and paste of \"files per second\".\n- Add JSON as additional output `--format`, see [JSON](json.md) for details (issue [#62](https://github.com/roskakori/pygount/issues/62)).\n- Add detection of [GitHub community files](https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions) without a suffix as text (issue [#54](https://github.com/roskakori/pygount/issues/54)).\n- Change the build process to [poetry](https://python-poetry.org/) to change several messy configuration files into a single even more messy configuration file.\n\n## Version 1.2.5, 2021-05-16\n\n- Remove support for Python 3.5. Probably it still works, but there is no easy way to test this anymore because 3.5 reached its end of life a while ago.\n\n## Version 1.2.4, 2020-08-11\n\n- Fix scanning of \".\" (for current folder), which was skipped entirely (issue [#56](https://github.com/roskakori/pygount/issues/56)).\n\n## Version 1.2.3, 2020-07-05\n\n- Improve detection of text files by trying to guess a lexer for `*.txt` before assuming it is text. This basically fixes the detection of `CMakelists.txt` as CMake file [#53](https://github.com/roskakori/pygount/issues/53)). 
However, it will only work with some files due to multiple issues with the regular expression Pygments used in versions up to 2.6.1 to detect CMake headers. This should be fixed once pull request\n  [#1491](https://github.com/pygments/pygments/pull/1491) is applied.\n\n## Version 1.2.2, 2020-06-24\n\n- Changed preprocessor statements to count as code, unlike Pygments which treats them as special comments (contributed by nkr0, issue [#51](https://github.com/roskakori/pygount/issues/51)).\n\n## Version 1.2.1, 2020-04-02\n\n- Fix broken links in README on PyPI by moving the documentation to [ReadTheDocs](https://pygount.readthedocs.io/).\n- Improve API:\n  - Change factory functions to methods and add deprecation warnings:\n    - `source_analysis` → `SourceAnalysis.from_file`\n    - `pseudo_source_analysis` → `SourceAnalysis.from_state`\n  - Change attributes in `SourceAnalysis` to read-only properties.\n  - Rename properties holding counts from `xxx` to `xxx_count`.\n  - Add API reference to documentation.\n  - Add a couple of type hints and assertions.\n\n## Version 1.2.0, 2020-03-30\n\n- Add file count to summary.\n- Change installation to fail when attempting to install on Python earlier than 3.5.\n- Improve API:\n  - Change `SourceAnalysis.state` to be a proper enum instead of a string.\n  - Add `ProjectSummary` to summarize multiple files.\n- Clean up the project:\n  - Change continuous integration from Travis CI to GitHub actions in the hope that the CI build does not automatically break after a while because things constantly change in the CI backend.\n  - Change README format from reStructuredText to Markdown.\n  - Improve badges in README: added a badge for supported Python versions and unified the layout by using <https://shields.io>.\n  - Remove obsolete development files (for ant, tox etc).\n\n## Version 1.1.0, 2020-03-10\n\n- Fix `--folders-to-skip` and `--names-to-skip` which simply were\n  ignored (contributed by pclausen, issue 
[#17](https://github.com/roskakori/pygount/issues/17)).\n- Add option `--format=summary` to get a language overview and sum total (based on a contribution by Yuriy Petrovskiy, issue [#16](https://github.com/roskakori/pygount/issues/16)).\n- Add Python 3.7 and 3.8 to the list of supported versions.\n- Drop support for Python 3.3 and 3.4, mostly because it became hard to test without going through major hoops.\n\n## Version 1.0.0, 2017-07-04\n\n- Fix confusing warning about XML file `<unknown>` caused by SAX parser. As a workaround, `<unknown>` is now replaced by the actual path of the XML file that cannot be parsed.\n- Add Python 3.6 to the list of supported versions (issue [#14](https://github.com/roskakori/pygount/issues/14)).\n\n## Version 0.9, 2017-05-04\n\n- Fix `AssertionError` when option `--encoding=chardet` was specified.\n- Change the warning message \"no fallback encoding specified, using \\<encoding\\>\" to a debug message because it did not add any interesting information as the encoding actually used is visible in the info message for each file.\n- Add detection of binary files and exclude them from the analysis. In particular Django model objects (`*.mo`) are not considered Modelica source code anymore (issue\n  [#11](https://github.com/roskakori/pygount/issues/11)).\n- Add detection of DocBook XML by DTD (issue [#10](https://github.com/roskakori/pygount/issues/10)).\n- Add support for suffices to indicate PL/SQL files according to [Oracle FAQ entry on file extensions](http://www.orafaq.com/wiki/File_extensions) (issue [#12](https://github.com/roskakori/pygount/issues/12)).\n- Add possibility to specify a fallback encoding for encoding 'chardet'. Use e.g. `--encoding=chardet;cp1252`.\n\n## Version 0.8, 2016-10-07\n\n- Fix option `--verbose`. 
Now each analyzed source code results in at least one informational message in the log.\n- Add detection of duplicates using size and then MD5 code as criteria (issue [#2](https://github.com/roskakori/pygount/issues/2)). Use the option `--duplicates` to still count duplicate source code.\n- Improve detection of programming language, which is now more consistent and yields the same language between Python invocations.\n\n## Version 0.7, 2016-09-28\n\n- Fix that option `--generated` was ignored.\n- Add support for a couple of languages not supported by `pygments` yet:\n  - m4, VBScript, and WebFOCUS use minimalistic lexers that can distinguish between comments and code.\n  - OMG IDL repurposes the existing Java lexer.\n- Add detection of certain XML dialects as separate language (issue [#8](https://github.com/roskakori/pygount/issues/8)).\n\n## Version 0.6, 2016-09-26\n\n- Fix that source files could end up as `__error__` if the first non-ASCII characters showed up only after kilobyte 16 and the encoding was not UTF-8. Now pygount attempts to read the whole file as UTF-8 before assuming it actually is UTF-8.\n- Change lines in plain text files to count as comments (issue [#9](https://github.com/roskakori/pygount/issues/9)). Before pygments treated them as `ResourceBundle`.\n- Change that empty files have `__empty__` as language (issue [#7](https://github.com/roskakori/pygount/issues/7)).\n- Extend workaround for [pygments issue #1284](https://bitbucket.org/birkenfeld/pygments-main/issues/1284) to replace any lexer `*+Evoque` by `*`.\n\n## Version 0.5, 2016-09-22\n\n- Add that generated source code is excluded from analysis (issue [#1](https://github.com/roskakori/pygount/issues/1)). Use option `--generated` to specify patterns that indicate generated code.\n- Add workaround for pygments sometimes detecting the same XML file as XML and other times as XML+Evoque (probably depending on the hash seed). 
Now XML+Evoque is always changed to XML.\n- Add `__pycache__` as default `--folders-to-skip`.\n- Add notes on pseudo languages for source code that cannot be analyzed.\n\n## Version 0.4, 2016-09-11\n\n- Fixed `LookupError` on broken encoding in magic comment (issue [#4](https://github.com/roskakori/pygount/issues/4)).\n- Add options `--folders-to-skip` and `--names-to-skip` to specify which files should be excluded from analysis.\n- Add comma (`,`) and colon (`:`) to list of \"white characters\" that do not count as code if there is nothing else in the line.\n- Improve pattern matching: for all options that according to `--help` take `PATTERNS` you can now specify that the patterns are regular expressions instead of shell patterns (using `[regex]`) and that they should extend the default patterns (using `[...]`).\n- Improve documentation: added notes on how code is counted and how pygount compares to other similar tools.\n\n## Version 0.3, 2016-08-20\n\n- Fix `@rem` comments in DOS batch files (issue [#3](https://github.com/roskakori/pygount/issues/3)).\n- Clean up code.\n\n## Version 0.2, 2016-07-10\n\n- Fix that files starting with underscore (e.g. `__init__.py`) were excluded from analysis.\n- Change `chardet` package to be optional.\n- Add possibility to specify single files and glob patterns to analyze.\n- Add that lines containing only certain characters are treated as white space instead of code. Currently, this concerns brackets (`()[]{}`) and semicolon (`;`).\n- Add that Python's `pass` statement is treated as white space instead of code.\n- Clean up and (slightly) optimized code.\n\n## Version 0.1, 2016-07-05\n\n- Initial public release.\n"
  },
  {
    "path": "docs/continuous-integration.md",
    "content": "# Continuous integration\n\nPygount can produce output that can be processed by the [SLOCCount plug-in](https://wiki.jenkins-ci.org/display/JENKINS/SLOCCount+Plugin) for the [Jenkins](https://jenkins.io/) continuous integration server.\n\nIt's recommended to run pygount as one of the first steps in your build process before any undesired files like compiler targets or generated source code are built.\n\nAn example \"Execute shell\" build step for Jenkins is:\n\n```bash\n$ pygount --format=cloc-xml --out cloc.xml --suffix=py --verbose\n```\n\nThen add a post-build action \"Publish SLOCCount analysis results\" and set \"SLOCCount report\" to \"cloc.xml\".\n"
  },
  {
    "path": "docs/contributing.md",
    "content": "# Contributing\n\n## Project setup\n\nIn case you want to play with the source code or contribute changes, proceed as follows:\n\n1.  Check out the project from GitHub:\n    ```bash\n    $ git clone https://github.com/roskakori/pygount.git\n    $ cd pygount\n    ```\n2.  Install [uv](https://docs.astral.sh/uv/).\n3.  Create the virtual environment and install the required packages:\n    ```bash\n    $ uv sync --all-groups\n    ```\n4.  Install the pre-commit hook:\n    ```bash\n    $ uv run pre-commit install\n    ```\n\n## Testing\n\nTo run the test suite:\n\n```bash\n$ uv run pytest\n```\n\nTo build and browse the coverage report in HTML format:\n\n```bash\n$ sh scripts/test_coverage.sh\n$ open htmlcov/index.html  # macOS only\n```\n\n## Documentation\n\nTo build the documentation in HTML format:\n\n```bash\n$ uv run scripts/build_documentation.sh\n$ open docs/_build/html/index.html  # macOS only\n```\n\n## Coding guidelines\n\nThe code throughout uses a natural naming schema avoiding abbreviations, even for local variables and parameters.\n\nMany coding guidelines are automatically enforced (and some even fixed automatically) by the pre-commit hook. 
If you want to check and clean up the code without performing a commit, run:\n\n```bash\n$ uv run pre-commit run --all-files\n```\n\nIn particular, this applies checks from [black](https://black.readthedocs.io/en/stable/), [flake8](https://flake8.pycqa.org/) and [isort](https://pypi.org/project/isort/).\n\n## Publish a new version\n\nThis section is only relevant for developers with access to the PyPI project.\n\nTo add a new release, first update the `pyproject.toml`:\n\n```toml\n[project]\nversion = \"3.x.x\"\n```\n\nNext, build the project and run the tests to ensure everything works:\n\n```sh\n$ rm -rf dist  # Remove any files from previous builds.\n$ uv build\n$ uv run pytest\n```\n\nThen create a tag in the repository:\n\n```sh\n$ git tag -a -m \"Tag version 3.x.x\" v3.x.x\n$ git push --tags\n```\n\nPublish the new version on PyPI:\n\n```sh\n$ uv publish\n```\n\nFinally, add a GitHub release based on the tag from above to the [release page](https://github.com/roskakori/pygount/releases).\n"
  },
  {
    "path": "docs/index.md",
    "content": "# Pygount\n\nPygount is a command line tool to scan folders for source code files and count the number of source code lines in them. It is similar to tools like [sloccount](http://www.dwheeler.com/sloccount/) and [cloc](http://cloc.sourceforge.net/) but uses the [pygments](http://pygments.org/) package to parse the source code and consequently can analyze any [programming language supported by pygments](http://pygments.org/languages/).\n\nThe name is a combination of \"pygments\" and \"count\".\n\nPygount is open source and distributed under the [BSD license](https://opensource.org/licenses/BSD-3-Clause). The source code is available from <https://github.com/roskakori/pygount>.\n"
  },
  {
    "path": "docs/installation.md",
    "content": "# Installation\n\nPygount is available from [PyPI](https://pypi.python.org/pypi/pygount) and can be installed by running:\n\n```bash\npip install pygount\n```\n\nUsing [uv](https://docs.astral.sh/uv/), it can also run directly. For example:\n\n```bash\nuvx pygount --version\n```\n"
  },
  {
    "path": "docs/json.md",
    "content": "# JSON\n\nThe JavaScript Object Notation (JSON) is widely used to interchange data. Running pygount with `--format=json` is a simple way to provide the results of an analysis for further processing.\n\n## General format\n\nThe general structure of the resulting JSON is:\n\n```JSON\n{\n  \"formatVersion\": \"1.1.0\",\n  \"pygountVersion\": \"1.8.0\",\n  \"files\": [...],\n  \"languages\": [...],\n  \"runtime\": {...},\n  \"summary\": {...}\n}\n```\n\nThe naming of the entries deliberately uses camel case to conform to the\n[JSLint](https://www.jslint.com/) guidelines.\n\nBoth `formatVersion` and `pygountVersion` use [semantic\nversioning](https://semver.org/). For more information about how this\nJSON evolved, see `JSON format history`.\n\n### Files\n\nWith `files` you can access a list of files analyzed, for example:\n\n```JavaScript\n{\n  \"codeCount\": 171,\n  \"documentationCount\": 28,\n  \"emptyCount\": 56,\n  \"group\": \"pygount\",\n  \"isCountable\": true,\n  \"language\": \"Python\",\n  \"lineCount\": 266,\n  \"path\": \"/tmp/pygount/pygount/write.py\",\n  \"state\": \"analyzed\",\n  \"stateInfo\": null,\n  \"sourceCount\": 182\n}\n```\n\nThe `*Count` fields have the following meaning:\n\n- `codeCount`: The number of lines that contain code excluding\n  [Pure string lines](background.md#pure-string-lines)\n- `documentationCount`: The number of lines containing comments\n- `emptyCount`: The number of empty lines, which includes\n  \"`No operations`\" lines\n- `lineCount`: Basically the number of lines shown in your editor\n  respectively computed by shell commands like `wc -l`\n- `sourceCount`: The source lines of code, similar to the traditional\n  SLOC\n- `stringCount`: The number of `Pure string lines`\n\nHere, `sourceCount` is the number of source lines of code (SLOC) and `documentationCount` the number of lines containing comments.\n\nThe `state` can have one of the following values:\n\n- analyzed: successfully analyzed\n- binary: 
the file is a [binary file](background.md#binary-files)\n- duplicate: the file is a [duplicate](usage.md#-duplicates) of another\n- empty: the file is empty (file size = 0)\n- error: the source could not be parsed; in this case, `stateInfo` contains a message with more details\n- generated: the file has been generated as specified with `--generated`\n- unknown: pygments does not offer any lexer to analyze the file\n\n### Languages\n\nIn `languages` the summary for each language is available, for example:\n\n```JSON\n{\n  \"documentationCount\": 429,\n  \"documentationPercentage\": 11.776008783969257,\n  \"codeCount\": 2332,\n  \"codePercentage\": 64.01317595388416,\n  \"emptyCount\": 706,\n  \"emptyPercentage\": 19.3796321712874,\n  \"fileCount\": 20,\n  \"filePercentage\": 48.78048780487805,\n  \"isPseudoLanguage\": false,\n  \"language\": \"Python\",\n  \"sourceCount\": 2508,\n  \"sourcePercentage\": 68.84435904474334,\n  \"stringCount\": 176,\n  \"stringPercentage\": 4.831183090859182\n}\n```\n\n### Summary\n\nIn `summary` the total counts across the whole project can be accessed, for example:\n\n```JSON\n{\n  \"totalCodeCount\": 4366,\n  \"totalCodePercentage\": 68.38972431077694,\n  \"totalDocumentationCount\": 463,\n  \"totalDocumentationPercentage\": 7.25250626566416,\n  \"totalEmptyCount\": 1275,\n  \"totalEmptyPercentage\": 19.971804511278197,\n  \"totalFileCount\": 41,\n  \"totalSourceCount\": 4646,\n  \"totalSourcePercentage\": 72.77568922305764,\n  \"totalStringCount\": 280,\n  \"totalStringPercentage\": 4.385964912280702\n}\n```\n\n### Runtime\n\nThe `runtime` entry collects general information about how well pygount performed in collecting the information, for example:\n\n```JSON\n{\n  \"durationInSeconds\": 0.6333059999999999,\n  \"filesPerSecond\": 64.73963613166464,\n  \"finishedAt\": \"2024-05-13T16:14:31.977070+00:00\",\n  \"linesPerSecond\": 10080.435050354807,\n  \"startedAt\": \"2024-05-13T16:14:31.343764+00:00\"\n}\n```\n\n## Pretty 
printing\n\nBecause the output is concise and consequently mostly illegible for a human reader, you might want to pipe it through a pretty printer. As you already have Python installed, the easiest way is:\n\n```sh\npygount --format json | python -m json.tool\n```\n\nAnother alternative would be [jq](https://stedolan.github.io/jq/):\n\n```sh\npygount --format json | jq .\n```\n\n## JSON format history\n\nv1.1.0, pygount 1.8.0\n\n- Add `codeCount` and `lineCount`\n\nv1.0.0, pygount 1.3.0\n\n- Initial version\n"
  },
  {
    "path": "docs/usage.md",
    "content": "# Usage\n\n## General\n\nRun and specify the folder to analyze recursively, for example:\n\n```bash\n$ pygount ~/development/sometool\n```\n\nIf you omit the folder, the current folder of your shell is used as a starting point. Apart from folders you can also specify single files and shell patterns (using `?`, `*` and ranges like `[a-z]`).\n\nCertain files and folders are automatically excluded from the analysis:\n\n- files starting with dot (`.`) or ending in tilde (`~`)\n- folders starting with dot (`.`) or named `_svn` or `__pycache__`.\n\n### `--folders-to-skip LIST`, `--names-to-skip LIST`\n\nTo specify alternative patterns, use `--folders-to-skip` and `--names-to-skip`. Both take a comma separated list of patterns, see below on the pattern syntax. To, for example, also prevent folders starting with two underscores (`__`) from being analyzed, specify `--folders-to-skip=[...],__*`.\n\n### `--suffix LIST`\n\nTo limit the analysis to certain file types, you can specify a comma separated list of suffixes to take into account, for example `--suffix=py,sql,xml`.\n\n### `--out FILE`\n\nBy default, the results of the analysis are written to the standard output. To redirect the output to a file, use for example `--out=counts.txt`.\n\nTo explicitly redirect to the standard output specify `--out=STDOUT`.\n\n### `--format FORMAT`\n\nBy default, the results of the analysis are written to the standard output in a format similar to sloccount. To redirect the output to a file, use e.g. `--out=counts.txt`. To change the format to an XML file similar to cloc, use `--format=cloc-xml`.\n\nTo just get a quick grasp of the languages used in a project and their respective importance use `--format=summary` which provides a language overview and a sum total. 
For example, pygount's summary looks like this:\n\n```\nLanguage          Files    %     Code    %     Comment    %\n----------------  -----  ------  ----  ------  -------  ------\nPython               19   51.35  1924   72.99      322   86.10\nreStructuredText      7   18.92   332   12.59        7    1.87\nmarkdown              3    8.11   327   12.41        1    0.27\nBatchfile             1    2.70    24    0.91        1    0.27\nYAML                  1    2.70    11    0.42        2    0.53\nMakefile              1    2.70     9    0.34        7    1.87\nINI                   1    2.70     5    0.19        0    0.00\nTOML                  1    2.70     4    0.15        0    0.00\nText                  3    8.11     0    0.00       34    9.09\n----------------  -----  ------  ----  ------  -------  ------\nSum total            37          2636              374\n```\n\nThe summary output is designed for human readers, and the column widths adjust to the data.\n\nFor further processing the results of pygount, `--format=json` should be the easiest to deal with. For more information, see the chapter on [JSON](json.md).\n\n### `--merge-embedded-languages`\n\nSome languages such as HTML or JavaScript allow embedding other languages in their source code. In that case, the source code is assigned to a language that contains both the base and end embedded language in its name, for example:\n\n- HTML+Jinja\n- JavaScript+Lasso\n\nIf you prefer count all variants of a base language only under its own name, specify `--merge-embedded-languages`. 
The example above will then show as:\n\n- HTML\n- JavaScript\n\nConsequently, multiple different embedded languages will all count for their common base language.\n\n## Remote repositories\n\nIn addition to local files, pygount can analyze remote git repositories:\n\n```bash\n$ pygount https://github.com/roskakori/pygount.git\n```\n\nIn the background, this creates a shallow clone of the repository in a temporary folder that after the analysis is removed automatically.\n\nTherefore, you need to have at least read access to the repository.\n\nIf you want to analyze a specific revision, specify it at the end of the URL:\n\n```bash\n$ pygount https://github.com/roskakori/pygount.git/v1.6.0\n```\n\nThe remote URL supports the git standard protocols: git, HTTP/S and SSH.\n\n```bash\n$ pygount git@github.com:username/project.git\n```\n\nYou can specify multiple repositories, for example, to include both the web application, command line client and docker container of the [Weblate](https://weblate.org/) project:\n\n```bash\n$  pygount https://github.com/WeblateOrg/weblate.git https://github.com/WeblateOrg/wlc.git  https://github.com/WeblateOrg/docker.git\n```\n\nAnd you can even mix local files and remote repositories:\n\n```bash\n$ pygount ~/projects/some https://github.com/roskakori/pygount.git\n```\n\n## Patterns\n\nSome command line arguments take patterns as values.\n\nBy default, patterns are shell patterns using `*`, `?` and ranges like `[a-z]` as placeholders. Depending on your platform, they are case-sensitive (Unix) or not (macOS, Windows).\n\nIf a pattern starts with `[regex]` you can specify a comma separated list of regular expressions instead using all the constructs supported by the [Python regular expression\nsyntax](https://docs.python.org/3/library/re.html#regular-expression-syntax). Regular expressions are case-sensitive unless they include a `(?i)` flag.\n\nIf the first actual pattern is `[...]`, default patterns are included. 
Without it, defaults are ignored and only the patterns explicitly stated are taken into account.\n\n### `--generated`\n\nSo for example, to specify that generated code can also contain the German word \"generiert\" in a case-insensitive way use `--generated=\"[regex][...](?i).*generiert\"`.\n\n### `--generated-names`\n\nIn addition to the source code, the file name can indicate that a source code is generated. For example, `--generated-names=\"*.lock,*.g.dart\"`.\n\nThe default already recognizes several standard generated names.\n\n## Counting duplicates\n\n### `--duplicates`\n\nBy default, pygount prevents multiple source files with exactly the same\ncontent to be counted again.\n\nFor two files to be considered duplicates, the following conditions must be met:\n\n1.  Both files have the same size.\n2.  Both files have the same [SHA-256](https://en.wikipedia.org/wiki/SHA-2) hashcode.\n\nThis allows for efficient detection with a tiny possibility for false positives.\n\nHowever, it also prevents detection of files with only minor differences as duplicates. Examples are files that are identical except for additional white space, empty lines or different line endings.\n\nIf you still want to count duplicates multiple times, specify `--duplicates`. This will also result in a minor performance gain of the analysis.\n\n## Source code encoding\n\n### --encoding ENCODING\\[;FALLBACK\\]\n\nWhen reading source code, pygount automatically detects the encoding. It uses a simple algorithm where it recognizes BOM, XML declarations such as:\n\n```xml\n<?xml encoding='cp1252'?>\n```\n\nand \"magic\" comments such as:\n\n```ruby\n# encoding: cp1252\n# coding: cp1252\n# -*- coding: cp1252 -*-\n```\n\nIf the file does not have an appropriate heading, pygount attempts to read it using UTF-8. 
If this fails, it reads the file using a fallback encoding (by default [CP1252](https://en.wikipedia.org/wiki/Windows-1252)) and ignores any encoding errors.\n\nYou can change this behavior using the `--encoding` option:\n\n- To keep the automatic analysis and use a different fallback encoding, specify for example `--encoding=automatic;iso-8859-15`.\n- To use automatic detection based on heuristics, specify `--encoding=chardet`. For this to work, the [chardet](https://pypi.python.org/pypi/chardet)\n  package must be installed.\n- To use a specific encoding (for all files analyzed), use for example `--encoding=iso-8859-15`.\n\n## Pseudo languages\n\nIf a source code is not counted, the number of lines is 0 and the language shown is a pseudo language indicating the reason:\n\n- `__binary__` - the source code is a binary file.\n- `__duplicate__` - the source code is a duplicate as described at the\n  command line option `--duplicates`.\n- `__empty__` - the source code is an empty file with a size of 0 bytes.\n- `__error__` - the source code could not be parsed; for example, due to an I/O\n  error.\n- `__generated__` - the source code is generated according to the\n  command line option `--generated`.\n- `__unknown__` - pygments does not provide a lexer to parse the source\n  code.\n\n## Other information\n\n### `--verbose`\n\nIf `--verbose` is specified, pygount logs detailed information about what it is doing.\n\n### `--help`\n\nTo get a description of all the available command line options, run:\n\n```bash\n$ pygount --help\n```\n\n### `--version`\n\nTo get pygount's current version number, run:\n\n```bash\n$ pygount --version\n```\n"
  },
  {
    "path": "mkdocs.yaml",
    "content": "site_name: \"pygount\"\nsite_url: \"https://pygount.readthedocs.io/\"\nsite_author: \"Thomas Aglassinger <roskakori@users.sourceforge.net>\"\nsite_description: \"Documentation of pygount, a tool to count lines of code for hundreds of languages using pygments\"\n\nrepo_url: \"https://github.com/roskakori/pygount\"\n\ntheme:\n  name: material\n  features:\n    - navigation.footer\n\nmarkdown_extensions:\n  - attr_list\n  - codehilite\n  - toc:\n      permalink: true\n\nnav:\n  - \"Overview\": \"index.md\"\n  - \"Installation\": \"installation.md\"\n  - \"Usage\":\n      - \"Usage\": \"usage.md\"\n      - \"JSON format\": \"json.md\"\n      - \"Continuous integration\": \"continuous-integration.md\"\n      - \"Background\": \"background.md\"\n  - \"API\": \"api.md\"\n  - \"Changes\": \"changes.md\"\n  - \"Contributing\": \"contributing.md\"\n\nvalidation:\n  nav:\n    omitted_files: warn\n"
  },
  {
    "path": "pygount/__init__.py",
    "content": "\"\"\"\nPygount counts lines of source code using pygments lexers.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nfrom importlib.metadata import version\n\nfrom .analysis import DuplicatePool, SourceAnalysis, SourceScanner, SourceState, encoding_for\nfrom .common import Error, OptionError\nfrom .summary import LanguageSummary, ProjectSummary\n\n__version__ = version(__name__)\n\n__all__ = [\n    \"DuplicatePool\",\n    \"Error\",\n    \"LanguageSummary\",\n    \"OptionError\",\n    \"ProjectSummary\",\n    \"SourceAnalysis\",\n    \"SourceScanner\",\n    \"SourceState\",\n    \"__version__\",\n    \"encoding_for\",\n]\n"
  },
  {
    "path": "pygount/analysis.py",
    "content": "\"\"\"\nFunctions to analyze source code and count lines in it.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport codecs\nimport collections\nimport glob\nimport hashlib\nimport itertools\nimport logging\nimport os\nimport re\nfrom collections.abc import Iterator, Sequence\nfrom dataclasses import dataclass\nfrom enum import Enum\nfrom io import SEEK_CUR, BufferedIOBase, IOBase, RawIOBase, TextIOBase\nfrom pathlib import Path\nfrom re import Pattern\nfrom typing import Optional, Union\n\nimport pygments.lexer\nimport pygments.lexers\nimport pygments.lexers.html\nimport pygments.token\nimport pygments.util\n\nimport pygount.common\nimport pygount.lexers\nimport pygount.xmldialect\nfrom pygount.common import WHITE_SPACE_CHARACTERS, mapped_repr, matching_regex\nfrom pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any\n\nHTTP_URL_REGEX = re.compile(r\"^(https?://)\")\n_ALLOWED_GIT_PLATFORMS = [\"github.com\", \"bitbucket.org\", \"gitlab.com\"]\n_ALLOWED_GIT_PLATFORM_CHOICES_PATTERN = \"|\".join(map(re.escape, _ALLOWED_GIT_PLATFORMS))\nGIT_REPO_REGEX = re.compile(rf\"^(https?://|git@)({_ALLOWED_GIT_PLATFORM_CHOICES_PATTERN})/[^/]+/[^/]+\")\n\n# Attempt to import chardet.\ntry:\n    import chardet.universaldetector\n\n    _detector = chardet.universaldetector.UniversalDetector()\nexcept ImportError:\n    _detector = None\nhas_chardet = bool(_detector)\n\n#: Fallback encoding to use if no encoding is specified\nDEFAULT_FALLBACK_ENCODING = \"cp1252\"\n\n#: Default glob patterns for folders not to analyze.\nDEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT = \", \".join(\n    [\n        \".?*\",\n        \"_svn\",  # Subversion hack for Windows\n        \"__pycache__\",  # Python byte code\n    ]\n)\n\n#: Pygments token type; we need to define our own type because pygments' ``_TokenType`` is internal.\nTokenType = type(pygments.token.Token)\n\n_BASE_LANGUAGE_REGEX = 
re.compile(r\"^(?P<base_language>[^+]+)\\+[^+].*$\")\n\n#: BOMs to indicate that a file is a text file even if it contains zero bytes.\n_TEXT_BOMS = (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8)\n\n\nclass SourceState(Enum):\n    \"\"\"\n    Possible values for :py:attr:`SourceAnalysis.state`.\n    \"\"\"\n\n    #: successfully analyzed\n    analyzed = 1\n    #: source code is a binary\n    binary = 2\n    #: source code is an identical copy of another\n    duplicate = 3\n    #: source code is empty (file size = 0)\n    empty = 4\n    #: source could not be parsed\n    error = 5\n    #: source code has been generated\n    generated = 6\n    # TODO: 'huge' = auto()  # source code exceeds size limit\n    #: pygments does not offer any lexer to analyze the source\n    unknown = 7\n\n\n#: Default patterns for regular expressions to detect generated code.\n#: The '(?i)' indicates that the patterns are case-insensitive.\nDEFAULT_GENERATED_LINE_PATTERNS_TEXT = pygount.common.REGEX_PATTERN_PREFIX + \", \".join(\n    [\n        r\"(?i).*autogenerated\",\n        r\"(?i).*automatically generated\",\n        r\"(?i).*do not edit\",\n        r\"(?i).*generated with the .+ utility\",\n        r\"(?i).*this is a generated file\",\n        r\"(?i).*generated automatically\",\n    ]\n)\n\n#: Default patterns for file names that are considered to be generated.\nDEFAULT_GENERATED_NAME_PATTERNS_TEXT = \", \".join(\n    [\n        \"*.g.dart\",  # See, for example, <https://codewithandrea.com/articles/dart-flutter-code-generation/>\n        \"*.lock\",  # For example, Cargo.lock, poetry.lock, uv.lock.\n        \"npm-shrinkwrap.json\",  # See <https://docs.npmjs.com/cli/v11/configuring-npm/npm-shrinkwrap-json>.\n        \"go.sum\",  # See <https://go.dev/ref/mod#go-sum-files>.\n        \"package-lock.json\",  # See <https://docs.npmjs.com/cli/v11/configuring-npm/package-lock-json>.\n        \"pnpm-lock.yaml\",  # See 
<https://pnpm.io/cli/install>.\n    ]\n)\n\n#: Default glob patterns for file names not to analyze.\nDEFAULT_NAME_PATTERNS_TO_SKIP_TEXT = \", \".join([\".*\", \"*~\"])\n\n_log = logging.getLogger(\"pygount\")\n\n_MARK_TO_NAME_MAP = ((\"c\", \"code\"), (\"d\", \"documentation\"), (\"e\", \"empty\"), (\"s\", \"string\"))\n_BOM_TO_ENCODING_MAP = collections.OrderedDict(\n    (\n        # NOTE: We need an ordered dict due to the overlap between utf-32-le and utf-16-be.\n        (codecs.BOM_UTF8, \"utf-8-sig\"),\n        (codecs.BOM_UTF32_LE, \"utf-32-le\"),\n        (codecs.BOM_UTF16_BE, \"utf-16-be\"),\n        (codecs.BOM_UTF16_LE, \"utf-16-le\"),\n        (codecs.BOM_UTF32_BE, \"utf-32-be\"),\n    )\n)\n_XML_PROLOG_REGEX = re.compile(r'<\\?xml\\s+.*encoding=\"(?P<encoding>[-_.a-zA-Z0-9]+)\".*\\?>')\n_MAGIC_COMMENT_LINE_START_REGEXES = [\n    re.compile(f\"^{pattern}\\\\s*(?P<remainder>.+)$\", re.IGNORECASE)\n    for pattern in [\n        r\"#+\",  # Python, Ruby\n        r\"//+\",  # C++, Dart, Java, ...\n        r\"/\\*+\",  # C etc\n        r\"--+\",  # Ada, SQL, VHDL\n        r\";+\",  # Assembly\n        r\"%+\",  # Latex, MatLab, Prolog\n        r\"rem\\s\",  # Basic, Windows batch\n        r\"\\*+\",  # Pascal\n        r\"\\{\",  # Pascal\n    ]\n]\n_MAGIC_COMMENT_LINE_REMAINDER_REGEXES = [\n    re.compile(pattern, re.IGNORECASE)\n    for pattern in [\n        # Covers for example \"encoding: cp1252\" and \"encoding=cp1252\".\n        r\"(en)?coding\\s*[:=]\\s*(?P<encoding>[-_.a-z0-9]+)\\b\",\n        # Covers for example \"-*- coding: cp1252 -*-\".\n        r\"-\\*-\\s*coding\\s*[:=]\\s*(?P<encoding>[-_.a-z0-9]+)\\s*(;.+\\s*)?-\\*-\\s*\",\n    ]\n]\n\n_STANDARD_PLAIN_TEXT_NAME_PATTERNS = (\n    # Text files for (moribund) gnits standards.\n    \"authors\",\n    \"bugs\",\n    \"changelog\",\n    \"copying\",\n    \"install\",\n    \"license\",\n    \"news\",\n    \"readme\",\n    \"thanks\",\n    # GitHub community recommendations, see\n    # 
<https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions>.\n    # By now, in practice most projects use a suffix like \"*.md\" but some older ones\n    # still might have such files without suffix.\n    \"code_of_conduct\",\n    \"contributing\",\n    \"support\",\n    # Other common text files.\n    \"changes\",\n    \"faq\",\n    \"readme\\\\.1st\",\n    \"read\\\\.me\",\n    \"todo\",\n)\n_PLAIN_TEXT_PATTERN = \"(^\" + \"$)|(^\".join(_STANDARD_PLAIN_TEXT_NAME_PATTERNS) + \"$)\"\n#: Regular expression to detect plain text files by name.\n_PLAIN_TEXT_NAME_REGEX = re.compile(_PLAIN_TEXT_PATTERN, re.IGNORECASE)\n\n_MARK_UP_NAME_PATTERN = r\"^.*\\.(md|rst|txt|\\d+)$\"\n_MARK_UP_NAME_REGEX = re.compile(_MARK_UP_NAME_PATTERN, re.IGNORECASE)\n\n#: Mapping for file suffixes to lexers for which pygments offers no official one.\n_SUFFIX_TO_FALLBACK_LEXER_MAP = {\n    \"fex\": pygount.lexers.MinimalisticWebFocusLexer(),\n    \"idl\": pygount.lexers.IdlLexer(),\n    \"m4\": pygount.lexers.MinimalisticM4Lexer(),\n    \"svg\": pygments.lexers.html.XmlLexer(),  # TODO#213 Remove SVG hack.\n    \"txt\": pygount.lexers.PlainTextLexer(),\n    \"vbe\": pygount.lexers.MinimalisticVBScriptLexer(),\n    \"vbs\": pygount.lexers.MinimalisticVBScriptLexer(),\n}\nfor _oracle_suffix in (\"pck\", \"pkb\", \"pks\", \"pls\"):\n    _SUFFIX_TO_FALLBACK_LEXER_MAP[_oracle_suffix] = pygments.lexers.get_lexer_by_name(\"plpgsql\")\n\n\n@dataclass(frozen=True)\nclass PathData:\n    source_path: str\n    group: str\n    tmp_dir: Optional[str] = None\n\n\ndef is_markup_file(source_path: str) -> bool:\n    return _MARK_UP_NAME_REGEX.match(os.path.basename(source_path)) is not None\n\n\nclass DuplicatePool:\n    \"\"\"\n    A pool that collects information about potential duplicate files.\n    \"\"\"\n\n    def __init__(self):\n        self._size_to_paths_map = {}\n        self._size_and_hash_to_path_map = {}\n\n    @staticmethod\n    def _hash_for(path_to_hash):\n        
buffer_size = 1024 * 1024\n        sha256_hash = hashlib.sha256()\n        with open(path_to_hash, \"rb\", buffer_size) as file_to_hash:\n            data = file_to_hash.read(buffer_size)\n            while len(data) >= 1:\n                sha256_hash.update(data)\n                data = file_to_hash.read(buffer_size)\n        return sha256_hash.digest()\n\n    def duplicate_path(self, source_path: str) -> Optional[str]:\n        \"\"\"\n        Path to a duplicate for ``source_path`` or ``None`` if no duplicate exists.\n\n        Internally information is stored to identify possible future duplicates of\n        ``source_path``.\n        \"\"\"\n        result = None\n        source_size = os.path.getsize(source_path)\n        paths_with_same_size = self._size_to_paths_map.get(source_size)\n        if paths_with_same_size is None:\n            self._size_to_paths_map[source_size] = [source_path]\n        else:\n            source_hash = DuplicatePool._hash_for(source_path)\n            if len(paths_with_same_size) == 1:\n                # Retrofit the initial path with the same size and its hash.\n                initial_path_with_same_size = paths_with_same_size[0]\n                initial_hash = DuplicatePool._hash_for(initial_path_with_same_size)\n                self._size_and_hash_to_path_map[(source_size, initial_hash)] = initial_path_with_same_size\n            result = self._size_and_hash_to_path_map.get((source_size, source_hash))\n            self._size_and_hash_to_path_map[(source_size, source_hash)] = source_path\n        return result\n\n\nclass SourceAnalysis:\n    \"\"\"\n    Results from analyzing a source path.\n\n    Prefer the factory methods :py:meth:`from_file()` and :py:meth:`from_state` to\n    calling the constructor.\n    \"\"\"\n\n    def __init__(\n        self,\n        path: str,\n        language: str,\n        group: str,\n        code: int,\n        documentation: int,\n        empty: int,\n        string: int,\n        state: 
SourceState,\n        state_info: Optional[str] = None,\n    ):\n        SourceAnalysis._check_state_info(state, state_info)\n        self._path = path\n        self._language = language\n        self._group = group\n        self._code = code\n        self._documentation = documentation\n        self._empty = empty\n        self._string = string\n        self._state = state\n        self._state_info = state_info\n\n    @staticmethod\n    def from_state(\n        source_path: str,\n        group: str,\n        state: SourceState,\n        state_info: Optional[str] = None,\n        tmp_dir: Optional[str] = None,\n    ) -> \"SourceAnalysis\":\n        \"\"\"\n        Factory method to create a :py:class:`SourceAnalysis` with all counts\n        set to 0 and everything else according to the specified parameters.\n        \"\"\"\n        assert source_path is not None\n        assert group is not None\n        assert state != SourceState.analyzed, \"use from() for analyzable sources\"\n        SourceAnalysis._check_state_info(state, state_info)\n        reduced_path = source_path.rsplit(tmp_dir, maxsplit=1)[-1].lstrip(os.sep) if tmp_dir else source_path\n        return SourceAnalysis(\n            path=reduced_path,\n            language=f\"__{state.name}__\",\n            group=group,\n            code=0,\n            documentation=0,\n            empty=0,\n            string=0,\n            state=state,\n            state_info=state_info,\n        )\n\n    @staticmethod\n    def _check_state_info(state: SourceState, state_info: Optional[str]):\n        assert state_info is None or isinstance(state_info, str), (\n            f\"state_info must be be None or str but is: {state_info!r}\"\n        )\n\n        states_that_require_state_info = [SourceState.duplicate, SourceState.error, SourceState.generated]\n        assert (state in states_that_require_state_info) == (state_info is not None), (\n            f\"state={state} and state_info={state_info} \"\n            
f\"but state_info must be specified for the following states: {states_that_require_state_info}\"\n        )\n\n    @staticmethod\n    def from_file(\n        source_path: str,\n        group: str,\n        encoding: str = \"automatic\",\n        fallback_encoding: str = \"cp1252\",\n        generated_regexes: Optional[list[Pattern]] = None,\n        duplicate_pool: Optional[DuplicatePool] = None,\n        file_handle: Optional[IOBase] = None,\n        merge_embedded_language: bool = False,\n        tmp_dir: Optional[str] = None,\n        *,\n        generated_name_regexes: Optional[list[Pattern]] = None,\n    ) -> \"SourceAnalysis\":\n        \"\"\"\n        Factory method to create a :py:class:`SourceAnalysis` by analyzing\n        the source code in ``source_path`` or the open file ``file_handle``.\n\n        :param source_path: path to source code to analyze\n        :param group: name of a logical group the source code belongs to, e.g. a\n          package.\n        :param encoding: encoding according to :func:`encoding_for`\n        :param fallback_encoding: fallback encoding according to\n          :func:`encoding_for`\n        :param generated_regexes: list of regular expression that if found within the first few lines\n          if a source code identify is as generated source code for which SLOC should not be counted\n        :param generated_name_regexes: list of regular expression that if the base file name matches,\n          the file is considered to be generated and the SLOC should not be counted\n        :param duplicate_pool: a :class:`DuplicatePool` where information about possible duplicates is\n          collected, or ``None`` if possible duplicates should be counted multiple times.\n        :param file_handle: a file-like object, or ``None`` to read and open the file from\n          ``source_path``. 
If the file is open in text mode, it must be opened with the correct\n          encoding.\n        :param merge_embedded_language: If pygments detects a base and embedded language, the source\n          code counts towards the base language. For example, \"JavaScript+Lasso\" counts as\n          \"JavaScript\".\n        :param tmp_dir: If a temporary directory was created, strip it from the path name. This happens\n          right now only for git repositories.\n        \"\"\"\n        assert encoding is not None\n\n        result = None\n        lexer = None\n        source_code = None\n        if generated_name_regexes is not None:\n            generated_name_regex = matching_regex(Path(source_path).name, generated_name_regexes)\n            if generated_name_regex is not None:\n                result = SourceAnalysis.from_state(\n                    source_path, group, SourceState.generated, state_info=generated_name_regex.pattern\n                )\n        if result is None and file_handle is None:\n            source_size = os.path.getsize(source_path)\n            if source_size == 0:\n                _log.info(\"%s: is empty\", source_path)\n                result = SourceAnalysis.from_state(source_path, group, SourceState.empty)\n            elif is_binary_file(source_path):\n                _log.info(\"%s: is binary\", source_path)\n                result = SourceAnalysis.from_state(source_path, group, SourceState.binary)\n            elif not has_lexer(source_path):\n                _log.info(\"%s: unknown language\", source_path)\n                result = SourceAnalysis.from_state(source_path, group, SourceState.unknown)\n        if duplicate_pool is not None:\n            duplicate_path = duplicate_pool.duplicate_path(source_path)\n            if duplicate_path is not None:\n                _log.info(\"%s: is a duplicate of %s\", source_path, duplicate_path)\n                result = SourceAnalysis.from_state(source_path, group, SourceState.duplicate, 
duplicate_path)\n        if result is None:\n            try:\n                if file_handle is None:\n                    if encoding in (\"automatic\", \"chardet\"):\n                        encoding = encoding_for(source_path, encoding, fallback_encoding)\n                    with open(source_path, encoding=encoding) as source_file:\n                        source_code = source_file.read()\n                elif not isinstance(file_handle, TextIOBase):\n                    if encoding in (\"automatic\", \"chardet\"):\n                        encoding = encoding_for(source_path, encoding, fallback_encoding, file_handle=file_handle)\n                    source_code = file_handle.read().decode(encoding)\n                else:\n                    source_code = file_handle.read()\n            except (LookupError, OSError, UnicodeError) as error:\n                _log.warning(\"cannot read %s using encoding %s: %s\", source_path, encoding, error)\n                result = SourceAnalysis.from_state(source_path, group, SourceState.error, str(error))\n            if result is None:\n                lexer = guess_lexer(source_path, source_code)\n                assert lexer is not None\n        actual_generated_regexes = (\n            generated_regexes\n            if generated_regexes is not None\n            else pygount.common.regexes_from(DEFAULT_GENERATED_LINE_PATTERNS_TEXT)\n        )\n        if (result is None) and (len(actual_generated_regexes) != 0):\n            number_line_and_regex = matching_number_line_and_regex(\n                pygount.common.lines(source_code), actual_generated_regexes\n            )\n            if number_line_and_regex is not None:\n                number, _, regex = number_line_and_regex\n                message = f\"line {number} matches {regex}\"\n                _log.info(\"%s: is generated code because %s\", source_path, message)\n                result = SourceAnalysis.from_state(source_path, group, SourceState.generated, 
message)\n        if result is None:\n            assert lexer is not None\n            assert source_code is not None\n            language = base_language(lexer.name) if merge_embedded_language else lexer.name\n            if (\"xml\" in language.lower()) or (language == \"Genshi\"):\n                dialect = pygount.xmldialect.xml_dialect(source_path, source_code)\n                if dialect is not None:\n                    language = dialect\n            _log.info(\"%s: analyze as %s using encoding %s\", source_path, language, encoding)\n            mark_to_count_map = {\"c\": 0, \"d\": 0, \"e\": 0, \"s\": 0}\n            is_markup = is_markup_file(source_path)\n            for line_parts in _line_parts(lexer, source_code, is_markup=is_markup):\n                mark_to_increment = \"e\"\n                for mark_to_check in (\"d\", \"s\", \"c\"):\n                    if mark_to_check in line_parts:\n                        mark_to_increment = mark_to_check\n                mark_to_count_map[mark_to_increment] += 1\n            reduced_path = source_path.rsplit(tmp_dir, maxsplit=1)[-1].lstrip(os.sep) if tmp_dir else source_path\n            result = SourceAnalysis(\n                path=reduced_path,\n                language=language,\n                group=group,\n                code=mark_to_count_map[\"c\"],\n                documentation=mark_to_count_map[\"d\"],\n                empty=mark_to_count_map[\"e\"],\n                string=mark_to_count_map[\"s\"],\n                state=SourceState.analyzed,\n                state_info=None,\n            )\n\n        assert result is not None\n        return result\n\n    @property\n    def path(self) -> str:\n        return self._path\n\n    @property\n    def language(self) -> str:\n        \"\"\"\n        The programming language the analyzed source code is written in; if\n        :py:attr:`state` does not equal :py:attr:`SourceState.analyzed` this\n        will be a pseudo language.\n        \"\"\"\n      
  return self._language\n\n    @property\n    def group(self) -> str:\n        \"\"\"\n        Group the source code belongs to; this can be any text useful to group\n        the files later on. It is perfectly valid to put all files in the same\n        group.\n\n        (Note: this property is mostly there for compatibility with the\n        original SLOCCount.)\n        \"\"\"\n        return self._group\n\n    @property\n    def code_count(self) -> int:\n        \"\"\"number of lines containing code\"\"\"\n        return self._code\n\n    @property\n    def documentation_count(self) -> int:\n        \"\"\"number of lines containing documentation (resp. comments)\"\"\"\n        return self._documentation\n\n    @property\n    def empty_count(self) -> int:\n        \"\"\"\n        number of empty lines, including lines containing only white space,\n        white characters or white code words\n\n        See also: :py:func:`white_characters`, :py:func:`white_code_words`\n        \"\"\"\n        return self._empty\n\n    @property\n    def line_count(self) -> int:\n        \"\"\"number of total lines, which is what your text editor or `wc -l`\n        would show\n        \"\"\"\n        return self.code_count + self.documentation_count + self.empty_count + self.string_count\n\n    @property\n    def string_count(self) -> int:\n        \"\"\"number of lines containing only strings but no other code\"\"\"\n        return self._string\n\n    @property\n    def source_count(self) -> int:\n        \"\"\"number of source lines of code (the sum of code_count and string_count)\"\"\"\n        return self.code_count + self.string_count\n\n    @property\n    def state(self) -> SourceState:\n        \"\"\"\n        The state of the analysis after parsing the source file.\n        \"\"\"\n        return self._state\n\n    @property\n    def state_info(self) -> Optional[Union[str, Exception]]:\n        \"\"\"\n        Possible additional information about :py:attr:`state`:\n\n     
   * :py:attr:`SourceState.duplicate`: path to the original source file\n          the :py:attr:`path` is a duplicate of\n        * :py:attr:`SourceState.error`: the :py:exc:`Exception` causing the\n          error\n        * :py:attr:`SourceState.generated`: a human-readable explanation why\n          the file is considered to be generated\n        \"\"\"\n        return self._state_info\n\n    @property\n    def is_countable(self) -> bool:\n        \"\"\"\n        ``True`` if source counts can be counted towards a total.\n        \"\"\"\n        return self.state in (SourceState.analyzed, SourceState.duplicate)\n\n    def __repr__(self):\n        name_to_value_map = {\n            \"path\": repr(self.path),\n            \"language\": repr(self.language),\n            \"group\": repr(self.group),\n            \"state\": self.state.name,\n        }\n        if self.state == SourceState.analyzed:\n            name_to_value_map.update(\n                {\n                    \"code_count\": self.code_count,\n                    \"documentation_count\": self.documentation_count,\n                    \"empty_count\": self.empty_count,\n                    \"string_count\": self.string_count,\n                }\n            )\n        if self.state_info is not None:\n            name_to_value_map[\"state_info\"] = repr(self.state_info)\n        return mapped_repr(self, name_to_value_map)\n\n\nclass SourceScanner:\n    \"\"\"\n    Scanner for source code files matching certain conditions.\n    \"\"\"\n\n    def __init__(\n        self,\n        source_patterns,\n        suffixes=\"*\",\n        folders_to_skip=None,\n        name_to_skip=None,\n    ):\n        self._source_patterns = source_patterns\n        self._suffixes = pygount.common.regexes_from(suffixes)\n        self._folder_regexps_to_skip = (\n            folders_to_skip\n            if folders_to_skip is not None\n            else pygount.common.regexes_from(DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT)\n        )\n  
      self._name_regexps_to_skip = (\n            name_to_skip\n            if folders_to_skip is not None\n            else pygount.common.regexes_from(DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)\n        )\n        self._git_storages = []\n\n    def close(self):\n        for git_storage in self._git_storages:\n            git_storage.close()\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n        return False\n\n    @property\n    def source_patterns(self):\n        return self._source_patterns\n\n    @property\n    def suffixes(self) -> list[Pattern]:\n        return self._suffixes\n\n    @property\n    def folder_regexps_to_skip(self) -> list[Pattern]:\n        return self._folder_regexps_to_skip\n\n    @folder_regexps_to_skip.setter\n    def folder_regexps_to_skip(self, regexps_or_pattern_text):\n        self._folder_regexps_to_skip.append = pygount.common.regexes_from(\n            regexps_or_pattern_text, self.folder_regexps_to_skip\n        )\n\n    @property\n    def name_regexps_to_skip(self) -> list[Pattern]:\n        return self._name_regexps_to_skip\n\n    @name_regexps_to_skip.setter\n    def name_regexps_to_skip(self, regexps_or_pattern_text):\n        self._name_regexps_to_skip = pygount.common.regexes_from(regexps_or_pattern_text, self.name_regexps_to_skip)\n\n    def _is_path_to_skip(self, name, is_folder) -> bool:\n        assert os.sep not in name, f\"name={name!r}\"\n        regexps_to_skip = self._folder_regexps_to_skip if is_folder else self._name_regexps_to_skip\n        return any(path_name_to_skip_regex.match(name) is not None for path_name_to_skip_regex in regexps_to_skip)\n\n    def _paths_and_group_to_analyze_in(self, folder, group, tmp_dir) -> PathData:\n        assert folder is not None\n        assert group is not None\n\n        for name in os.listdir(folder):\n            path = os.path.join(folder, name)\n            if not os.path.islink(path):\n                
is_folder = os.path.isdir(path)\n                if self._is_path_to_skip(os.path.basename(path), is_folder):\n                    _log.debug(\"skip due to matching skip pattern: %s\", path)\n                elif is_folder:\n                    yield from self._paths_and_group_to_analyze_in(path, group, tmp_dir)\n                else:\n                    yield PathData(source_path=path, group=group, tmp_dir=tmp_dir)\n\n    def _paths_and_group_to_analyze(self, path_to_analyse_pattern, group=None, tmp_dir=None) -> Iterator[PathData]:\n        for path_to_analyse in glob.glob(path_to_analyse_pattern):\n            if os.path.islink(path_to_analyse):\n                _log.debug(\"skip link: %s\", path_to_analyse)\n            else:\n                is_folder = os.path.isdir(path_to_analyse)\n                if self._is_path_to_skip(os.path.basename(path_to_analyse), is_folder):\n                    _log.debug(\"skip due to matching skip pattern: %s\", path_to_analyse)\n                else:\n                    actual_group = group\n                    if is_folder:\n                        if actual_group is None:\n                            actual_group = os.path.basename(path_to_analyse)\n                            if actual_group == \"\":\n                                # Compensate for trailing path separator.\n                                actual_group = os.path.basename(os.path.dirname(path_to_analyse))\n                        yield from self._paths_and_group_to_analyze_in(path_to_analyse_pattern, actual_group, tmp_dir)\n                    else:\n                        if actual_group is None:\n                            actual_group = os.path.dirname(path_to_analyse)\n                            if actual_group == \"\":\n                                actual_group = os.path.basename(os.path.dirname(os.path.abspath(path_to_analyse)))\n                        yield PathData(source_path=path_to_analyse, group=actual_group, tmp_dir=tmp_dir)\n\n    def 
_source_paths_and_groups_to_analyze(self, source_patterns_to_analyze) -> list[PathData]:\n        assert source_patterns_to_analyze is not None\n\n        result = []\n\n        def _process_source_pattern(source_pattern: str):\n            remote_url, revision = git_remote_url_and_revision_if_any(source_pattern)\n            if remote_url is not None:\n                git_storage = GitStorage(remote_url, revision)\n                self._git_storages.append(git_storage)\n                git_storage.extract()\n                result.extend(\n                    self._paths_and_group_to_analyze(git_storage.temp_folder, tmp_dir=git_storage.temp_folder)\n                )\n            else:\n                has_url_prefix = re.match(HTTP_URL_REGEX, source_pattern)\n                if has_url_prefix:\n                    is_git_url = re.match(GIT_REPO_REGEX, source_pattern_to_analyze) is not None\n                    if not is_git_url:\n                        raise pygount.Error(\n                            f'URL to git repository {source_pattern} must end with \".git\" or must match the pattern '\n                            f\"http(s)://({'|'.join(_ALLOWED_GIT_PLATFORMS)})/<...>/<...>.git. 
\"\n                            f\"For example: git@github.com:roskakori/pygount.git or \"\n                            f\"https://github.com/roskakori/pygount.git.\"\n                        )\n                    source_pattern = source_pattern.rstrip(\"/\")\n                    _process_source_pattern(source_pattern + \".git\")\n                else:\n                    result.extend(self._paths_and_group_to_analyze(source_pattern_to_analyze))\n\n        # NOTE: We could avoid initializing `source_pattern_to_analyze` here by moving the `try` inside\n        #  the loop, but this would incor a performance overhead (ruff's PERF203).\n        source_pattern_to_analyze = None\n        try:\n            for source_pattern_to_analyze in source_patterns_to_analyze:\n                _process_source_pattern(source_pattern_to_analyze)\n        except OSError as error:\n            assert source_pattern_to_analyze is not None\n            raise OSError(f'cannot scan \"{source_pattern_to_analyze}\" for source files: {error}') from error\n        result = sorted(set(result), key=lambda data: (data.source_path, data.group))\n        return result\n\n    def source_paths(self) -> Iterator[PathData]:\n        \"\"\"\n        Paths to source code files matching all the conditions for this scanner.\n        \"\"\"\n        source_paths_and_groups_to_analyze = self._source_paths_and_groups_to_analyze(self.source_patterns)\n\n        for path_data in source_paths_and_groups_to_analyze:\n            suffix = os.path.splitext(path_data.source_path)[1].lstrip(\".\")\n            is_suffix_to_analyze = any(suffix_regexp.match(suffix) for suffix_regexp in self.suffixes)\n            if is_suffix_to_analyze:\n                yield path_data\n            else:\n                _log.info(\"skip due to suffix: %s\", path_data.source_path)\n\n\n_LANGUAGE_TO_WHITE_WORDS_MAP = {\"batchfile\": {\"@\"}, \"python\": {\"pass\"}, \"sql\": {\"begin\", \"end\"}}\nfor _language in 
_LANGUAGE_TO_WHITE_WORDS_MAP:\n    assert _language.islower()\n\n\ndef matching_number_line_and_regex(\n    source_lines: Iterator[str], generated_regexes: Sequence[Pattern], max_line_count: int = 15\n) -> Optional[tuple[int, str, Pattern]]:\n    \"\"\"\n    The first line and its number (starting with 0) in the source code that\n    indicated that the source code is generated.\n    :param source_lines: lines of text to scan\n    :param generated_regexes: regular expressions a line must match to indicate\n        the source code is generated.\n    :param max_line_count: maximum number of lines to scan\n    :return: a tuple of the form ``(number, line, regex)`` or ``None`` if the\n        source lines do not match any ``generated_regexes``.\n    \"\"\"\n    initial_numbers_and_lines = enumerate(itertools.islice(source_lines, max_line_count))\n    matching_number_line_and_regexps = (\n        (number, line, matching_regex)\n        for number, line in initial_numbers_and_lines\n        for matching_regex in generated_regexes\n        if matching_regex.match(line)\n    )\n    possible_first_matching_number_line_and_regexp = list(itertools.islice(matching_number_line_and_regexps, 1))\n    result = (\n        possible_first_matching_number_line_and_regexp[0] if possible_first_matching_number_line_and_regexp else None\n    )\n    return result\n\n\ndef white_characters(language_id: str) -> str:\n    \"\"\"\n    Characters that count as white space if they are the only characters in a\n    line.\n    \"\"\"\n    assert language_id is not None\n    assert language_id.islower()\n    return \"(),:;[]{}\"\n\n\ndef white_code_words(language_id: str) -> set[str]:\n    \"\"\"\n    Words that do not count as code if it is the only word in a line.\n    \"\"\"\n    assert language_id is not None\n    assert language_id.islower()\n    return _LANGUAGE_TO_WHITE_WORDS_MAP.get(language_id, set())\n\n\ndef _delined_tokens(tokens: Iterator[tuple[TokenType, str]]) -> 
Iterator[TokenType]:\n    for token_type, token_text in tokens:\n        remaining_token_text = token_text\n        newline_index = remaining_token_text.find(\"\\n\")\n        while newline_index != -1:\n            yield token_type, remaining_token_text[: newline_index + 1]\n            remaining_token_text = remaining_token_text[newline_index + 1 :]\n            newline_index = remaining_token_text.find(\"\\n\")\n        if remaining_token_text != \"\":\n            yield token_type, remaining_token_text\n\n\ndef _pythonized_comments(tokens: Iterator[tuple[TokenType, str]]) -> Iterator[TokenType]:\n    \"\"\"\n    Similar to tokens but converts strings after a colon (`:`) to comments.\n    \"\"\"\n    is_after_colon = True\n    for token_type, result_token_text in tokens:\n        if is_after_colon and (token_type in pygments.token.String):\n            result_token_type = pygments.token.Comment\n        else:\n            result_token_type = token_type\n            if result_token_text == \":\":\n                is_after_colon = True\n            elif token_type not in pygments.token.Comment:\n                is_whitespace = len(result_token_text.rstrip(WHITE_SPACE_CHARACTERS)) == 0\n                if not is_whitespace:\n                    is_after_colon = False\n        yield result_token_type, result_token_text\n\n\ndef _line_parts(lexer: pygments.lexer.Lexer, text: str, is_markup: bool = False) -> Iterator[set[str]]:\n    line_marks = set()\n    tokens = _delined_tokens(lexer.get_tokens(text))\n    if lexer.name == \"Python\":\n        tokens = _pythonized_comments(tokens)\n    language_id = lexer.name.lower()\n    white_text = \" \\f\\n\\r\\t\" + white_characters(language_id)\n    white_words = white_code_words(language_id)\n    for token_type, token_text in tokens:\n        # NOTE: Pygments treats preprocessor statements as special comments.\n        is_actual_comment = token_type in pygments.token.Comment and token_type not in (\n            
pygments.token.Comment.Preproc,\n            pygments.token.Comment.PreprocFile,\n        )\n        if is_actual_comment:\n            line_marks.add(\"d\")  # 'documentation'\n        elif token_type in pygments.token.String:\n            line_marks.add(\"s\")  # 'string'\n        else:\n            is_white_text = (token_text.strip() in white_words) or (token_text.rstrip(white_text) == \"\")\n            if not is_white_text:\n                line_mark = \"d\" if is_markup else \"c\"\n                line_marks.add(line_mark)\n        if token_text.endswith(\"\\n\"):\n            yield line_marks\n            line_marks = set()\n    if len(line_marks) >= 1:\n        yield line_marks\n\n\ndef check_file_handle_is_seekable(file_handle: Optional[Union[BufferedIOBase, RawIOBase]], source_path: str):\n    if not file_handle.seekable():\n        raise pygount.Error(f\"cannot determine encoding: file handle must be seekable: {source_path}\")\n\n\ndef encoding_for(\n    source_path: str,\n    encoding: str = \"automatic\",\n    fallback_encoding: Optional[str] = None,\n    file_handle: Optional[Union[BufferedIOBase, RawIOBase]] = None,\n) -> str:\n    \"\"\"\n    The encoding used by the text file stored in ``source_path``.\n\n    The algorithm used is:\n\n    * If ``encoding`` is ``'automatic'``, attempt the following:\n\n      1. Check BOM for UTF-8, UTF-16 and UTF-32.\n      2. Look for XML prolog or magic heading like ``# -*- coding: cp1252 -*-``\n      3. Read the file using UTF-8.\n      4. 
If all this fails, use the ``fallback_encoding`` and ignore any\n         further encoding errors.\n\n    * If ``encoding`` is ``'chardet`` use :mod:`chardet` to obtain the encoding.\n    * For any other ``encoding`` simply use the specified value.\n    \"\"\"\n    assert encoding is not None\n\n    if encoding == \"automatic\":\n        if file_handle is None:\n            with open(source_path, \"rb\") as source_file:\n                heading = source_file.read(128)\n        else:\n            check_file_handle_is_seekable(file_handle, source_path)\n            heading = file_handle.read(128)\n            file_handle.seek(-len(heading), SEEK_CUR)\n        result = None\n        if len(heading) == 0:\n            # File is empty, assume a dummy encoding.\n            result = \"utf-8\"\n        if result is None:\n            result = next(\n                (\n                    encoding_for_bom\n                    for bom, encoding_for_bom in _BOM_TO_ENCODING_MAP.items()\n                    if heading[: len(bom)] == bom\n                ),\n                None,\n            )\n        if result is None:\n            result = encoding_from_header(heading)\n    elif encoding == \"chardet\":\n        assert _detector is not None, (\n            'without chardet installed, encoding=\"chardet\" must be rejected before calling encoding_for()'\n        )\n        _detector.reset()\n        if file_handle is None:\n            with open(source_path, \"rb\") as source_file:\n                lines = source_file.readlines()\n        else:\n            check_file_handle_is_seekable(file_handle, source_path)\n            file_position = file_handle.tell()\n            lines = file_handle.readlines()\n            file_handle.seek(file_position)\n        for line in lines:\n            _detector.feed(line)\n            if _detector.done:\n                break\n        result = _detector.result[\"encoding\"]\n        if result is None:\n            _log.warning(\n           
     \"%s: chardet cannot determine encoding, assuming fallback encoding %s\", source_path, fallback_encoding\n            )\n            result = fallback_encoding\n    else:\n        # Simply use the specified encoding.\n        result = encoding\n    if result is None:\n        # Encoding 'automatic' or 'chardet' failed to detect anything.\n        if fallback_encoding is not None:\n            # If defined, use the fallback encoding.\n            result = fallback_encoding\n        else:\n            try:\n                # Attempt to read the file as UTF-8.\n                if file_handle is None:\n                    with open(source_path, encoding=\"utf-8\") as source_file:\n                        source_file.read()\n                else:\n                    check_file_handle_is_seekable(file_handle, source_path)\n                    file_position = file_handle.tell()\n                    file_handle.read()\n                    file_handle.seek(file_position)\n                result = \"utf-8\"\n            except UnicodeDecodeError:\n                # UTF-8 did not work out, use the default as last resort.\n                result = DEFAULT_FALLBACK_ENCODING\n            _log.debug(\"%s: no fallback encoding specified, using %s\", source_path, result)\n\n    assert result is not None\n    return result\n\n\ndef encoding_from_header(header: bytes) -> Optional[str]:\n    ascii_header = header.decode(\"ascii\", errors=\"replace\")\n    result = encoding_from_possible_magic_comment(ascii_header)\n    if result is None:\n        result = encoding_from_possible_xml_prolog(ascii_header)\n    return result\n\n\ndef encoding_from_possible_magic_comment(ascii_header: str) -> Optional[str]:\n    return next(_magic_comment_encodings(ascii_header), None)\n\n\ndef _magic_comment_encodings(ascii_header: str) -> Iterator[str]:\n    header_lines = ascii_header.split(\"\\n\")[:2]\n    for header_line in header_lines:\n        for magic_line_start_regex in 
_MAGIC_COMMENT_LINE_START_REGEXES:\n            magic_line_start_match = re.match(magic_line_start_regex, header_line)\n            if magic_line_start_match is not None:\n                remainder = magic_line_start_match.group(\"remainder\")\n                for magic_coding_comment_regex in _MAGIC_COMMENT_LINE_REMAINDER_REGEXES:\n                    result = magic_coding_comment_regex.match(remainder)\n                    if result is not None:\n                        yield result.group(\"encoding\")\n\n\ndef encoding_from_possible_xml_prolog(ascii_header: str) -> Optional[str]:\n    header_line = ascii_header.replace(\"\\f\\n\\r\\v\", \" \")\n    xml_prolog_match = _XML_PROLOG_REGEX.match(header_line)\n    return xml_prolog_match.group(\"encoding\") if xml_prolog_match is not None else None\n\n\ndef is_binary_file(source_path: str) -> bool:\n    with open(source_path, \"rb\") as source_file:\n        initial_bytes = source_file.read(8192)\n    return not any(initial_bytes.startswith(bom) for bom in _TEXT_BOMS) and b\"\\0\" in initial_bytes\n\n\ndef is_plain_text(source_path):\n    return _PLAIN_TEXT_NAME_REGEX.match(os.path.basename(source_path))\n\n\ndef has_lexer(source_path: str) -> bool:\n    \"\"\"\n    Initial quick check if there is a lexer for ``source_path``. 
This removes\n    the need for calling :py:func:`pygments.lexers.guess_lexer_for_filename()`\n    which fully reads the source file.\n    \"\"\"\n    result = bool(pygments.lexers.find_lexer_class_for_filename(source_path))\n    if not result:\n        suffix = os.path.splitext(os.path.basename(source_path))[1].lstrip(\".\")\n        result = suffix in _SUFFIX_TO_FALLBACK_LEXER_MAP\n    return result\n\n\ndef guess_lexer(source_path: str, text: str) -> pygments.lexer.Lexer:\n    if is_plain_text(source_path):\n        result = pygount.lexers.PlainTextLexer()\n    else:\n        try:\n            result = pygments.lexers.guess_lexer_for_filename(source_path, text)\n        except pygments.util.ClassNotFound:\n            suffix = os.path.splitext(os.path.basename(source_path))[1].lstrip(\".\")\n            result = _SUFFIX_TO_FALLBACK_LEXER_MAP.get(suffix)\n    return result\n\n\ndef base_language(language: str) -> str:\n    base_language_match = _BASE_LANGUAGE_REGEX.match(language)\n    return language if base_language_match is None else base_language_match.group(\"base_language\")\n"
  },
  {
    "path": "pygount/command.py",
    "content": "\"\"\"\nCommand line interface for pygount.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport argparse\nimport contextlib\nimport logging\nimport os\nimport sys\n\nfrom rich.progress import Progress\n\nimport pygount\nimport pygount.analysis\nimport pygount.common\nimport pygount.write\n\n#: Valid formats for option --format.\nVALID_OUTPUT_FORMATS = (\"cloc-xml\", \"json\", \"sloccount\", \"summary\")\n\n_DEFAULT_ENCODING = \"automatic\"\n_DEFAULT_OUTPUT_FORMAT = \"sloccount\"\n_DEFAULT_OUTPUT = \"STDOUT\"\n_DEFAULT_SOURCE_PATTERNS = os.curdir\n_DEFAULT_SUFFIXES = \"*\"\n\n_HELP_ENCODING = '''encoding to use when reading source code; use \"automatic\"\n to take BOMs, XML prolog and magic headers into account and fall back to\n UTF-8 or CP1252 if none fits; use \"automatic;<fallback>\" to specify a\n different fallback encoding than CP1252; use \"chardet\" to let the chardet\n package determine the encoding; default: \"%(default)s\"'''\n\n_HELP_EPILOG = \"\"\"SHELL-PATTERN is a pattern using *, ? and ranges like [a-z]\n as placeholders. PATTERNS is a comma separated list of SHELL-PATTERN. The\n prefix [regex] indicated that the PATTERNS use regular expression syntax. If\n default values are available, [...] indicates that the PATTERNS extend the\n existing default values.\"\"\"\n\n_HELP_FORMAT = (\n    f\"output format, one of: \"\n    # HACK The chr(34) is necessary because ruff does not preserve the\n    #  backslash in '\\\"'.\n    f\"{', '.join([chr(34) + output_format + chr(34) for output_format in VALID_OUTPUT_FORMATS])};\"\n    f' default: \"%(default)s\"'\n)\n\n_HELP_GENERATED = \"\"\"comma separated list of regular expressions to detect\n generated code; default: %(default)s\"\"\"\n\n_HELP_GENERATED_NAMES = \"\"\"comma separated list of glob patterns for file names\n not to treat as generated. 
Use \"...\" as first entry to append patterns to the default\n patterns; default: %(default)s\"\"\"\n\n_HELP_MERGE_EMBEDDED_LANGUAGES = \"\"\"merge counts for embedded languages into\n their base language; for example, HTML+Jinja2 counts as HTML\"\"\"\n\n_HELP_FOLDERS_TO_SKIP = \"\"\"comma separated list of glob patterns for folder\n names not to analyze. Use \"...\" as first entry to append patterns to the\n default patterns; default: %(default)s\"\"\"\n\n_HELP_NAMES_TO_SKIP = \"\"\"comma separated list of glob patterns for file names\n not to analyze. Use \"...\" as first entry to append patterns to the default\n patterns; default: %(default)s\"\"\"\n\n_HELP_SUFFIX = '''limit analysis on files matching any suffix in comma\n separated LIST; shell patterns are possible; example: \"py,sql\"; default:\n \"%(default)s\"'''\n\n_OUTPUT_FORMAT_TO_WRITER_CLASS_MAP = {\n    \"cloc-xml\": pygount.write.ClocXmlWriter,\n    \"json\": pygount.write.JsonWriter,\n    \"sloccount\": pygount.write.LineWriter,\n    \"summary\": pygount.write.SummaryWriter,\n}\nassert set(VALID_OUTPUT_FORMATS) == set(_OUTPUT_FORMAT_TO_WRITER_CLASS_MAP.keys())\n\n_log = logging.getLogger(\"pygount\")\n\n\ndef _check_encoding(name, encoding_to_check, alternative_encoding, source=None):\n    \"\"\"\n    Check that ``encoding`` is a valid Python encoding\n    :param name: name under which the encoding is known to the user, e.g. 'default encoding'\n    :param encoding_to_check: name of the encoding to check, e.g. 'utf-8'\n    :param source: source where the encoding has been set, e.g. 
option name\n    :raise pygount.common.OptionError if ``encoding`` is not a valid Python encoding\n    \"\"\"\n    assert name is not None\n\n    if encoding_to_check not in (alternative_encoding, \"chardet\", None):\n        try:\n            \"\".encode(encoding_to_check)\n        except LookupError:\n            raise pygount.common.OptionError(\n                f'{name} is \"{encoding_to_check}\" but must be \"{alternative_encoding}\" or a known Python encoding',\n                source,\n            ) from None\n\n\nclass Command:\n    \"\"\"\n    Command interface for pygount, where options starting with defaults can\n    gradually be set and finally :py:meth:`execute()`.\n    \"\"\"\n\n    def __init__(self):\n        self.set_encodings(_DEFAULT_ENCODING)\n        self._folders_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT)\n        self._generated_line_regexs = pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT)\n        self._generated_name_regexps = pygount.common.regexes_from(\n            pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT\n        )\n        self._has_duplicates = False\n        self._has_summary = False\n        self._has_to_merge_embedded_languages = False\n        self._is_verbose = False\n        self._names_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)\n        self._output = _DEFAULT_OUTPUT\n        self._output_format = _DEFAULT_OUTPUT_FORMAT\n        self._source_patterns = _DEFAULT_SOURCE_PATTERNS\n        self._suffixes = pygount.common.regexes_from(_DEFAULT_SUFFIXES)\n\n    def set_encodings(self, encoding, source=None):\n        encoding_is_chardet = (encoding == \"chardet\") or (encoding.startswith(\"chardet;\"))\n        if encoding_is_chardet and not pygount.analysis.has_chardet:  # pragma: no cover\n            raise pygount.common.OptionError('chardet must be installed to set default encoding to 
\"chardet\"')\n        if encoding in (\"automatic\", \"chardet\"):\n            default_encoding = encoding\n            fallback_encoding = None\n        elif encoding.startswith((\"automatic;\", \"chardet;\")):\n            first_encoding_semicolon_index = encoding.find(\";\")\n            default_encoding = encoding[:first_encoding_semicolon_index]\n            fallback_encoding = encoding[first_encoding_semicolon_index + 1 :]\n        else:\n            default_encoding = encoding\n            fallback_encoding = pygount.analysis.DEFAULT_FALLBACK_ENCODING\n        self.set_default_encoding(default_encoding, source)\n        self.set_fallback_encoding(fallback_encoding, source)\n\n    @property\n    def default_encoding(self):\n        return self._default_encoding\n\n    def set_default_encoding(self, default_encoding, source=None):\n        _check_encoding(\"default encoding\", default_encoding, \"automatic\", source)\n        self._default_encoding = default_encoding\n\n    @property\n    def fallback_encoding(self):\n        return self._fallback_encoding\n\n    def set_fallback_encoding(self, fallback_encoding, source=None):\n        _check_encoding(\"fallback encoding\", fallback_encoding, \"automatic\", source)\n        self._fallback_encoding = fallback_encoding\n\n    @property\n    def folders_to_skip(self):\n        return self._folders_to_skip\n\n    def set_folders_to_skip(self, regexes_or_patterns_text, source=None):\n        self._folders_to_skip = pygount.common.regexes_from(\n            regexes_or_patterns_text, pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT, source\n        )\n\n    @property\n    def generated_regexps(self):\n        return self._generated_line_regexs\n\n    def set_generated_regexps(self, regexes_or_patterns_text, source=None):\n        self._generated_line_regexs = pygount.common.regexes_from(\n            regexes_or_patterns_text, pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT, source\n        )\n\n    
@property\n    def generated_name_regexps(self):\n        return self._generated_name_regexps\n\n    def set_generated_name_regexps(self, regexes_or_pattern_text, source=None):\n        self._generated_name_regexps = pygount.common.regexes_from(\n            regexes_or_pattern_text, pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT, source\n        )\n\n    @property\n    def has_duplicates(self):\n        return self._has_duplicates\n\n    def set_has_duplicates(self, has_duplicates, source=None):\n        self._has_duplicates = bool(has_duplicates)\n\n    @property\n    def has_to_merge_embedded_languages(self):\n        return self._has_to_merge_embedded_languages\n\n    def set_has_to_merge_embedded_languages(self, has_to_merge_embedded_languages, source=None):\n        self._has_to_merge_embedded_languages = bool(has_to_merge_embedded_languages)\n\n    @property\n    def is_verbose(self):\n        return self._is_verbose\n\n    def set_is_verbose(self, is_verbose, source=None):\n        self._is_verbose = bool(is_verbose)\n\n    @property\n    def names_to_skip(self):\n        return self._names_to_skip\n\n    def set_names_to_skip(self, regexes_or_pattern_text, source=None):\n        self._names_to_skip = pygount.common.regexes_from(\n            regexes_or_pattern_text, pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT, source\n        )\n\n    @property\n    def output(self):\n        return self._output\n\n    def set_output(self, output, source=None):\n        assert output is not None\n        self._output = output\n\n    @property\n    def output_format(self):\n        return self._output_format\n\n    def set_output_format(self, output_format, source=None):\n        assert output_format is not None\n        if output_format not in VALID_OUTPUT_FORMATS:\n            raise pygount.common.OptionError(\n                f\"format is {output_format} but must be one of: {VALID_OUTPUT_FORMATS}\", source\n            )\n        self._output_format = 
output_format\n\n    @property\n    def source_patterns(self):\n        return self._source_patterns\n\n    def set_source_patterns(self, glob_patterns_or_text, source=None):\n        assert glob_patterns_or_text is not None\n        self._source_patterns = pygount.common.as_list(glob_patterns_or_text)\n        assert len(self._source_patterns) >= 0\n\n    @property\n    def suffixes(self):\n        return self._suffixes\n\n    def set_suffixes(self, regexes_or_patterns_text, source=None):\n        assert regexes_or_patterns_text is not None\n        self._suffixes = pygount.common.regexes_from(regexes_or_patterns_text, _DEFAULT_SUFFIXES, source)\n\n    def argument_parser(self):\n        parser = argparse.ArgumentParser(description=\"count source lines of code\", epilog=_HELP_EPILOG)\n        parser.add_argument(\"--duplicates\", \"-d\", action=\"store_true\", help=\"analyze duplicate files\")\n        parser.add_argument(\"--encoding\", \"-e\", default=_DEFAULT_ENCODING, help=_HELP_ENCODING)\n        parser.add_argument(\n            \"--folders-to-skip\",\n            \"-F\",\n            metavar=\"PATTERNS\",\n            default=pygount.analysis.DEFAULT_FOLDER_PATTERNS_TO_SKIP_TEXT,\n            help=_HELP_FOLDERS_TO_SKIP,\n        )\n        parser.add_argument(\n            \"--format\",\n            \"-f\",\n            metavar=\"FORMAT\",\n            choices=VALID_OUTPUT_FORMATS,\n            default=_DEFAULT_OUTPUT_FORMAT,\n            help=_HELP_FORMAT,\n        )\n        parser.add_argument(\n            \"--generated\",\n            \"-g\",\n            metavar=\"PATTERNS\",\n            default=pygount.analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT,\n            help=_HELP_GENERATED,\n        )\n        parser.add_argument(\n            \"--generated-names\",\n            \"-G\",\n            metavar=\"PATTERNS\",\n            default=pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT,\n            help=_HELP_GENERATED_NAMES,\n        )\n        
parser.add_argument(\n            \"--merge-embedded-languages\",\n            \"-m\",\n            action=\"store_true\",\n            help=_HELP_MERGE_EMBEDDED_LANGUAGES,\n        )\n        parser.add_argument(\n            \"--names-to-skip\",\n            \"-N\",\n            metavar=\"PATTERNS\",\n            default=pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT,\n            help=_HELP_NAMES_TO_SKIP,\n        )\n        parser.add_argument(\n            \"--out\",\n            \"-o\",\n            metavar=\"FILE\",\n            default=_DEFAULT_OUTPUT,\n            help='file to write results to; use \"STDOUT\" for standard output; default: \"%(default)s\"',\n        )\n        parser.add_argument(\"--suffix\", \"-s\", metavar=\"PATTERNS\", default=_DEFAULT_SUFFIXES, help=_HELP_SUFFIX)\n        parser.add_argument(\n            \"source_patterns\",\n            metavar=\"SHELL-PATTERN\",\n            nargs=\"*\",\n            default=[os.getcwd()],\n            help=\"source files and directories to scan; can use glob patterns; default: current directory\",\n        )\n        parser.add_argument(\"--verbose\", \"-v\", action=\"store_true\", help=\"explain what is being done\")\n        parser.add_argument(\"--version\", action=\"version\", version=\"%(prog)s \" + pygount.__version__)\n        return parser\n\n    def parsed_args(self, arguments):\n        assert arguments is not None\n\n        parser = self.argument_parser()\n        args = parser.parse_args(arguments)\n        if args.encoding == \"automatic\":\n            default_encoding = args.encoding\n            fallback_encoding = None\n        elif args.encoding == \"chardet\":\n            if not pygount.analysis.has_chardet:  # pragma: no cover\n                parser.error(\"chardet must be installed in order to specify --encoding=chardet\")\n            default_encoding = args.encoding\n            fallback_encoding = None\n        else:\n            if 
args.encoding.startswith(\"automatic;\"):\n                first_encoding_semicolon_index = args.encoding.find(\";\")\n                default_encoding = args.encoding[:first_encoding_semicolon_index]\n                fallback_encoding = args.encoding[first_encoding_semicolon_index + 1 :]\n                encoding_to_check = (\"fallback encoding\", fallback_encoding)\n            else:\n                default_encoding = args.encoding\n                fallback_encoding = None\n                encoding_to_check = (\"encoding\", default_encoding)\n            if encoding_to_check is not None:\n                name, encoding = encoding_to_check\n                try:\n                    \"\".encode(encoding)\n                except LookupError:\n                    parser.error(f\"{name} specified with --encoding must be a known Python encoding: {encoding}\")\n        return args, default_encoding, fallback_encoding\n\n    def apply_arguments(self, arguments=None):\n        if arguments is None:  # pragma: no cover\n            arguments = sys.argv[1:]\n        args, default_encoding, fallback_encoding = self.parsed_args(arguments)\n        self.set_default_encoding(default_encoding, \"option --encoding\")\n        self.set_fallback_encoding(fallback_encoding, \"option --encoding\")\n        self.set_folders_to_skip(args.folders_to_skip, \"option --folders-to-skip\")\n        self.set_generated_regexps(args.generated, \"option --generated\")\n        self.set_generated_name_regexps(args.generated_names, \"option --generated-names\")\n        self.set_has_duplicates(args.duplicates, \"option --duplicates\")\n        self.set_has_to_merge_embedded_languages(args.merge_embedded_languages, \"option --merge-embedded-languages\")\n        self.set_is_verbose(args.verbose, \"option --verbose\")\n        self.set_names_to_skip(args.names_to_skip, \"option --names-to-skip\")\n        self.set_output(args.out, \"option --out\")\n        self.set_output_format(args.format, 
\"option --format\")\n        self.set_source_patterns(args.source_patterns, \"option PATTERNS\")\n        self.set_suffixes(args.suffix, \"option --suffix\")\n\n    def execute(self):\n        _log.setLevel(logging.INFO if self.is_verbose else logging.WARNING)\n        with pygount.analysis.SourceScanner(\n            self.source_patterns, self.suffixes, self.folders_to_skip, self.names_to_skip\n        ) as source_scanner:\n            source_paths_and_groups_to_analyze = list(source_scanner.source_paths())\n            duplicate_pool = pygount.analysis.DuplicatePool() if not self.has_duplicates else None\n            writer_class = _OUTPUT_FORMAT_TO_WRITER_CLASS_MAP[self.output_format]\n            is_stdout = self.output == \"STDOUT\"\n            target_context_manager = (\n                contextlib.nullcontext(sys.stdout)\n                if is_stdout\n                else open(self.output, \"w\", encoding=\"utf-8\", newline=\"\")  # noqa: SIM115\n            )\n            with (\n                target_context_manager as target_file,\n                writer_class(target_file) as writer,\n                Progress(disable=not writer.has_to_track_progress, transient=True) as progress,\n            ):\n                try:\n                    for path_data in progress.track(source_paths_and_groups_to_analyze):\n                        writer.add(\n                            pygount.analysis.SourceAnalysis.from_file(\n                                path_data.source_path,\n                                path_data.group,\n                                self.default_encoding,\n                                self.fallback_encoding,\n                                generated_regexes=self._generated_line_regexs,\n                                generated_name_regexes=self._generated_name_regexps,\n                                duplicate_pool=duplicate_pool,\n                                merge_embedded_language=self.has_to_merge_embedded_languages,\n        
                        tmp_dir=path_data.tmp_dir,\n                            )\n                        )\n                finally:\n                    progress.stop()\n\n\ndef pygount_command(arguments=None):\n    result = 1\n    command = Command()\n    try:\n        command.apply_arguments(arguments)\n        command.execute()\n        result = 0\n    except KeyboardInterrupt:  # pragma: no cover\n        _log.error(\"interrupted as requested by user\")\n    except (pygount.common.OptionError, OSError) as error:\n        _log.error(error)\n    except Exception as error:\n        _log.exception(error)\n\n    return result\n\n\ndef main():  # pragma: no cover\n    logging.basicConfig(level=logging.WARNING)\n    sys.exit(pygount_command())\n\n\nif __name__ == \"__main__\":  # pragma: no cover\n    main()\n"
  },
  {
    "path": "pygount/common.py",
    "content": "\"\"\"\nCommon classes and functions for pygount.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport fnmatch\nimport functools\nimport inspect\nimport re\nimport typing\nimport warnings\nfrom collections.abc import Iterator, Sequence\nfrom re import Pattern\nfrom typing import Optional, Union\n\nWHITE_SPACE_CHARACTERS = \" \\f\\n\\r\\t\"\n\n#: Pseudo pattern to indicate that the remaining pattern are an addition to the default patterns.\nADDITIONAL_PATTERN = \"[...]\"\n\n#: Prefix to use for pattern strings to describe a regular expression instead of a shell pattern.\nREGEX_PATTERN_PREFIX = \"[regex]\"\n\n_REGEX_TYPE = type(re.compile(\"\"))\n\n\nclass Error(Exception):\n    \"\"\"\n    Error to indicate that something went wrong during a pygount run.\n    \"\"\"\n\n\nclass OptionError(Error):\n    \"\"\"\n    Error to indicate that a value passed to a command line option must be\n    fixed.\n    \"\"\"\n\n    def __init__(self, message, source=None):\n        super().__init__(message)\n        self.option_error_message = (source + \": \") if source is not None else \"\"\n        self.option_error_message += message\n\n    def __str__(self):\n        return self.option_error_message\n\n\ndef as_list(items_or_text: Union[str, Sequence[str]]) -> list[str]:\n    if isinstance(items_or_text, str):\n        # TODO: Allow to specify comma (,) in text using '[,]'.\n        result = [item.strip() for item in items_or_text.split(\",\") if item.strip() != \"\"]\n    else:\n        result = list(items_or_text)\n    return result\n\n\ndef regex_from(pattern: Union[str, Pattern], is_shell_pattern=False) -> Pattern:\n    assert pattern is not None\n    if isinstance(pattern, str):\n        result = re.compile(fnmatch.translate(pattern)) if is_shell_pattern else re.compile(pattern)\n    else:\n        result = pattern  # Assume pattern already is a compiled regular expression\n    return 
result\n\n\ndef regexes_from(\n    patterns_text: Union[str, Sequence[str], Sequence[Pattern]],\n    default_patterns_text: Optional[Union[str, Sequence[Pattern], Sequence[str]]] = None,\n    source: Optional[str] = None,\n) -> list[Pattern]:\n    assert patterns_text is not None\n\n    result = []\n    default_regexes = []\n    try:\n        if isinstance(patterns_text, str):\n            is_shell_pattern = True\n            patterns_text_without_prefixes = patterns_text\n            if patterns_text_without_prefixes.startswith(REGEX_PATTERN_PREFIX):\n                is_shell_pattern = False\n                patterns_text_without_prefixes = patterns_text_without_prefixes[len(REGEX_PATTERN_PREFIX) :]\n            if patterns_text_without_prefixes.startswith(ADDITIONAL_PATTERN):\n                assert default_patterns_text is not None\n                default_regexes = regexes_from(default_patterns_text)\n                patterns_text_without_prefixes = patterns_text_without_prefixes[len(ADDITIONAL_PATTERN) :]\n\n            patterns = as_list(patterns_text_without_prefixes)\n            result = [regex_from(pattern, is_shell_pattern) for pattern in patterns]\n        else:\n            regexes = list(patterns_text)\n            if len(regexes) >= 1 and regexes[0] is None:\n                default_regexes = regexes_from(default_patterns_text)\n                regexes = regexes[1:]\n            for supposed_regex in regexes:\n                assert isinstance(supposed_regex, _REGEX_TYPE), (\n                    f\"patterns_text must a text or sequence or regular expressions but contains: {supposed_regex}\"\n                )\n            result.extend(regexes)\n    except re.error as error:\n        raise OptionError(f\"cannot parse pattern for regular repression: {error}\", source) from None\n    result.extend(default_regexes)\n    return result\n\n\ndef matching_regex(text: str, regexes: list[typing.Pattern]) -> Optional[typing.Pattern]:\n    return next((regex 
for regex in regexes if regex.match(text)), None)\n\n\ndef lines(text: str) -> Iterator[str]:\n    \"\"\"\n    Generator function to yield lines (delimited with ``'\\n'``) stored in\n    ``text``. This is useful when a regular expression should only match on a\n    per-line basis in a memory efficient way.\n    \"\"\"\n    assert text is not None\n    assert \"\\r\" not in text\n    previous_newline_index = 0\n    newline_index = text.find(\"\\n\")\n    while newline_index != -1:\n        yield text[previous_newline_index:newline_index]\n        previous_newline_index = newline_index + 1\n        newline_index = text.find(\"\\n\", previous_newline_index)\n    last_line = text[previous_newline_index:]\n    if last_line != \"\":\n        yield last_line\n\n\ndef deprecated(reason: Optional[str]):  # pragma: no cover\n    \"\"\"\n    Decorator to mark functions as deprecated and log a warning in case it is called.\n\n    Source: https://stackoverflow.com/questions/2536307/decorators-in-the-python-standard-lib-deprecated-specifically\n    \"\"\"\n\n    if isinstance(reason, str):\n        # The @deprecated is used with a 'reason'.\n        #\n        # .. 
code-block:: python\n        #\n        #    @deprecated(\"please, use another function\")\n        #    def old_function(x, y):\n        #      pass\n\n        def decorator(func1):\n            class_or_func = \"class\" if inspect.isclass(func1) else \"function\"\n\n            @functools.wraps(func1)\n            def new_func1(*args, **kwargs):\n                warnings.simplefilter(\"always\", DeprecationWarning)\n                warnings.warn(\n                    f\"Call to deprecated {class_or_func} {func1.__name__} ({reason}).\",\n                    category=DeprecationWarning,\n                    stacklevel=2,\n                )\n                warnings.simplefilter(\"default\", DeprecationWarning)\n                return func1(*args, **kwargs)\n\n            return new_func1\n\n        return decorator\n\n    if inspect.isclass(reason) or inspect.isfunction(reason):\n        # The @deprecated is used without any 'reason'.\n        #\n        # .. code-block:: python\n        #\n        #    @deprecated\n        #    def old_function(x, y):\n        #      pass\n\n        func2 = reason\n        class_or_func = \"class\" if inspect.isclass(func2) else \"function\"\n\n        @functools.wraps(func2)\n        def new_func2(*args, **kwargs):\n            warnings.simplefilter(\"always\", DeprecationWarning)\n            warnings.warn(\n                f\"Call to deprecated {class_or_func} {func2.__name__}.\",\n                category=DeprecationWarning,\n                stacklevel=2,\n            )\n            warnings.simplefilter(\"default\", DeprecationWarning)\n            return func2(*args, **kwargs)\n\n        return new_func2\n    raise TypeError(repr(type(reason)))\n\n\ndef mapped_repr(type_, name_to_value_map) -> str:\n    result = \", \".join(f\"{name}={value}\" for name, value in name_to_value_map.items())\n    result = f\"{type_.__class__.__name__}({result})\"\n    return result\n"
  },
  {
    "path": "pygount/git_storage.py",
    "content": "import re\nimport shutil\nfrom tempfile import mkdtemp\nfrom typing import Optional\n\nimport git\n\n#: Regular expression to detect git url with the optional tag or branch\n# from https://stackoverflow.com/questions/2514859/regular-expression-for-git-repository server-name\n_GIT_URL_REGEX = re.compile(\n    r\"(?P<remote_url>((git|ssh|http(s)?)|(git@[\\w.-]+))(:(//)?)([\\w.@:/\\-~]+)(\\.git))(/)?(?P<revision>[\\w./\\-]+)?\"\n)\n\n\ndef git_remote_url_and_revision_if_any(git_url: str) -> tuple[Optional[str], Optional[str]]:\n    assert git_url is not None\n    git_url_match = _GIT_URL_REGEX.match(git_url)\n    return (\n        (None, None) if git_url_match is None else (git_url_match.group(\"remote_url\"), git_url_match.group(\"revision\"))\n    )\n\n\nclass GitStorage:\n    def __init__(self, remote_url: str, revision: Optional[str] = None):\n        assert remote_url is not None\n        self._remote_url = remote_url\n        self._revision = revision\n        self._temp_folder = mkdtemp()\n\n    @property\n    def temp_folder(self) -> str:\n        return self._temp_folder\n\n    def extract(self):\n        multi_options = [\"--depth\", \"1\"]\n        if self._revision is not None:\n            multi_options.extend([\"--branch\", self._revision])\n        git.Repo.clone_from(self._remote_url, self._temp_folder, multi_options=multi_options)\n\n    def close(self):\n        shutil.rmtree(self._temp_folder, ignore_errors=True)\n"
  },
  {
    "path": "pygount/lexers.py",
    "content": "\"\"\"\nAdditional lexers for pygount that fill gaps left by :py:mod:`pygments`.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport pygments.lexer\nimport pygments.lexers\nimport pygments.token\nimport pygments.util\n\n\nclass IdlLexer(pygments.lexers.JavaLexer):\n    \"\"\"\n    Lexer for OMG Interface Definition Language (IDL) that simply uses the\n    existing Java lexer to find comments. While this is useless for syntax\n    highlighting it is good enough for counting lines.\n    \"\"\"\n\n    name = \"IDL\"\n    filenames = [\"*.idl\"]\n\n\nclass MinimalisticM4Lexer(pygments.lexer.RegexLexer):\n    \"\"\"\n    Minimalistic lexer for m4 macro processor that can distinguish between\n    comments and code. It does not recognize a redefined comment mark though.\n    \"\"\"\n\n    name = \"M4\"\n    tokens = {\n        \"root\": [\n            (r\"(.*)(#.*\\n)\", pygments.lexer.bygroups(pygments.token.Text, pygments.token.Comment.Single)),\n            (r\".*\\n\", pygments.token.Text),\n        ]\n    }\n\n\nclass MinimalisticVBScriptLexer(pygments.lexer.RegexLexer):\n    \"\"\"\n    Minimalistic lexer for VBScript that can distinguish between comments and\n    code.\n    \"\"\"\n\n    name = \"VBScript\"\n    tokens = {\"root\": [(r\"\\s*'.*\\n\", pygments.token.Comment.Single), (r\".*\\n\", pygments.token.Text)]}\n\n\nclass MinimalisticWebFocusLexer(pygments.lexer.RegexLexer):\n    \"\"\"\n    Minimalistic lexer for WebFOCUS that can distinguish between comments and\n    code.\n    \"\"\"\n\n    name = \"WebFOCUS\"\n    tokens = {\"root\": [(r\"-\\*.*\\n\", pygments.token.Comment.Single), (r\".*\\n\", pygments.token.Text)]}\n\n\nclass PlainTextLexer(pygments.lexer.RegexLexer):\n    \"\"\"\n    Simple lexer for plain text that treats every line with non-white space\n    characters as :py:data:`pygments.Token.Comment.Single` and only lines\n    that are empty or contain only 
white space as\n    :py:data:`pygments.Token.Text`.\n\n    This way, plain text files count as documentation.\n    \"\"\"\n\n    name = \"Text\"\n    tokens = {\"root\": [(r\"\\s*\\n\", pygments.token.Text), (r\".+\\n\", pygments.token.Comment.Single)]}\n"
  },
  {
    "path": "pygount/summary.py",
    "content": "\"\"\"\nSummaries of analyses of multiple source codes.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport functools\nimport re\nfrom collections.abc import Hashable\n\nfrom .analysis import SourceAnalysis\nfrom .common import mapped_repr\n\n_PSEUDO_LANGUAGE_REGEX = re.compile(\"^__[a-z]+__$\")\n\n\n@functools.total_ordering\nclass LanguageSummary:\n    \"\"\"\n    Summary of a source code counts from multiple files of the same language.\n    \"\"\"\n\n    def __init__(self, language: str):\n        self._language = language\n        self._code_count = 0\n        self._documentation_count = 0\n        self._empty_count = 0\n        self._file_count = 0\n        self._file_percentage = 0.0\n        self._string_count = 0\n        self._is_pseudo_language = _PSEUDO_LANGUAGE_REGEX.match(self.language) is not None\n        self._has_up_to_date_percentages = False\n\n    @property\n    def language(self) -> str:\n        \"\"\"the language to be summarized\"\"\"\n        return self._language\n\n    @property\n    def code_count(self) -> int:\n        \"\"\"sum lines of code for this language\"\"\"\n        return self._code_count\n\n    @property\n    def code_percentage(self) -> float:\n        \"\"\"percentage of lines containing code for this language across entire project\"\"\"\n        return _percentage_or_0(self.code_count, self.line_count)\n\n    def _assert_has_up_to_date_percentages(self):\n        assert self._has_up_to_date_percentages, \"update_percentages() must be called first\"\n\n    @property\n    def documentation_count(self) -> int:\n        \"\"\"sum lines of documentation for this language\"\"\"\n        return self._documentation_count\n\n    @property\n    def documentation_percentage(self) -> float:\n        \"\"\"percentage of lines containing documentation for this language across entire project\"\"\"\n        return 
_percentage_or_0(self.documentation_count, self.line_count)\n\n    @property\n    def empty_count(self) -> int:\n        \"\"\"sum empty lines for this language\"\"\"\n        return self._empty_count\n\n    @property\n    def empty_percentage(self) -> float:\n        \"\"\"percentage of empty lines for this language across entire project\"\"\"\n        return _percentage_or_0(self.empty_count, self.line_count)\n\n    @property\n    def file_count(self) -> int:\n        \"\"\"number of source code files for this language\"\"\"\n        return self._file_count\n\n    @property\n    def file_percentage(self) -> float:\n        \"\"\"percentage of files in project\"\"\"\n        self._assert_has_up_to_date_percentages()\n        return self._file_percentage\n\n    @property\n    def line_count(self) -> int:\n        \"\"\"sum count of all lines of any kind for this language\"\"\"\n        return self.code_count + self.documentation_count + self.empty_count + self.string_count\n\n    @property\n    def string_count(self) -> int:\n        \"\"\"sum number of lines containing strings for this language\"\"\"\n        return self._string_count\n\n    @property\n    def string_percentage(self) -> float:\n        \"\"\"percentage of lines containing strings for this language across entire project\"\"\"\n        return _percentage_or_0(self.string_count, self.line_count)\n\n    @property\n    def source_count(self) -> int:\n        \"\"\"sum number of source lines of code\"\"\"\n        return self.code_count + self.string_count\n\n    @property\n    def source_percentage(self) -> float:\n        \"\"\"percentage of source lines for code for this language across the entire project\"\"\"\n        return _percentage_or_0(self.source_count, self.line_count)\n\n    @property\n    def is_pseudo_language(self) -> bool:\n        \"\"\"``True`` if the language is not a real programming language\"\"\"\n        return self._is_pseudo_language\n\n    def sort_key(self) -> Hashable:\n    
    \"\"\"sort key to sort multiple languages by importance\"\"\"\n        return self.code_count, self.documentation_count, self.string_count, self.empty_count, self.language\n\n    def __hash__(self):\n        return hash(self.language)\n\n    def __eq__(self, other):\n        return self.sort_key() == other.sort_key()\n\n    def __lt__(self, other):\n        return self.sort_key() < other.sort_key()\n\n    def add(self, source_analysis: SourceAnalysis) -> None:\n        \"\"\"\n        Add counts from ``source_analysis`` to total counts for this language.\n        \"\"\"\n        assert source_analysis is not None\n        assert source_analysis.language == self.language\n\n        self._has_up_to_date_percentages = False\n        self._file_count += 1\n        if source_analysis.is_countable:\n            self._code_count += source_analysis.code_count\n            self._documentation_count += source_analysis.documentation_count\n            self._empty_count += source_analysis.empty_count\n            self._string_count += source_analysis.string_count\n\n    def update_file_percentage(self, project_summary: \"ProjectSummary\"):\n        self._file_percentage = _percentage_or_0(self.file_count, project_summary.total_file_count)\n        self._has_up_to_date_percentages = True\n\n    def __repr__(self):\n        name_to_value_map = {\n            \"language\": f\"{self.language!r}\",\n            \"file_count\": self.file_count,\n        }\n        if not self.is_pseudo_language:\n            name_to_value_map.update(\n                {\n                    \"code_count\": self.code_count,\n                    \"documentation_count\": self.documentation_count,\n                    \"empty_count\": self.empty_count,\n                    \"string_count\": self.string_count,\n                }\n            )\n        return mapped_repr(self, name_to_value_map)\n\n\ndef _percentage_or_0(partial_count: int, total_count: int) -> float:\n    assert partial_count >= 0\n  
  assert total_count >= 0\n    return 100 * partial_count / total_count if total_count != 0 else 0.0\n\n\nclass ProjectSummary:\n    \"\"\"\n    Summary of source code counts for several languages and files.\n    \"\"\"\n\n    def __init__(self):\n        self._language_to_language_summary_map = {}\n        self._total_code_count = 0\n        self._total_documentation_count = 0\n        self._total_empty_count = 0\n        self._total_string_count = 0\n        self._total_file_count = 0\n        self._total_line_count = 0\n\n    @property\n    def language_to_language_summary_map(self) -> dict[str, LanguageSummary]:\n        \"\"\"\n        A map containing summarized counts for each language added with :py:meth:`add()` so far.\n        \"\"\"\n        return self._language_to_language_summary_map\n\n    @property\n    def total_code_count(self) -> int:\n        return self._total_code_count\n\n    @property\n    def total_code_percentage(self) -> float:\n        return _percentage_or_0(self.total_code_count, self.total_line_count)\n\n    @property\n    def total_documentation_count(self) -> int:\n        return self._total_documentation_count\n\n    @property\n    def total_documentation_percentage(self) -> float:\n        return _percentage_or_0(self.total_documentation_count, self.total_line_count)\n\n    @property\n    def total_empty_count(self) -> int:\n        return self._total_empty_count\n\n    @property\n    def total_empty_percentage(self) -> float:\n        return _percentage_or_0(self.total_empty_count, self.total_line_count)\n\n    @property\n    def total_file_count(self) -> int:\n        return self._total_file_count\n\n    @property\n    def total_line_count(self) -> int:\n        return self._total_line_count\n\n    @property\n    def total_source_count(self) -> int:\n        return self.total_code_count + self.total_string_count\n\n    @property\n    def total_source_percentage(self) -> float:\n        return 
_percentage_or_0(self.total_source_count, self.total_line_count)\n\n    @property\n    def total_string_count(self) -> int:\n        return self._total_string_count\n\n    @property\n    def total_string_percentage(self) -> float:\n        return _percentage_or_0(self.total_string_count, self.total_line_count)\n\n    def add(self, source_analysis: SourceAnalysis) -> None:\n        \"\"\"\n        Add counts from ``source_analysis`` to total counts.\n        \"\"\"\n        self._total_file_count += 1\n        language_summary = self.language_to_language_summary_map.get(source_analysis.language)\n        if language_summary is None:\n            language_summary = LanguageSummary(source_analysis.language)\n            self.language_to_language_summary_map[source_analysis.language] = language_summary\n        language_summary.add(source_analysis)\n\n        if source_analysis.is_countable:\n            self._total_code_count += source_analysis.code_count\n            self._total_documentation_count += source_analysis.documentation_count\n            self._total_empty_count += source_analysis.empty_count\n            self._total_line_count += (\n                source_analysis.code_count\n                + source_analysis.documentation_count\n                + source_analysis.empty_count\n                + source_analysis.string_count\n            )\n            self._total_string_count += source_analysis.string_count\n\n    def update_file_percentages(self) -> None:\n        \"\"\"Update percentages for all languages part of the project.\"\"\"\n        for language_summary in self._language_to_language_summary_map.values():\n            language_summary.update_file_percentage(self)\n\n    def __repr__(self):\n        return (\n            f\"{self.__class__.__name__}(\"\n            f\"total_file_count={self.total_file_count}, \"\n            f\"total_line_count={self.total_line_count}, \"\n            
f\"languages={sorted(self.language_to_language_summary_map.keys())})\"\n        )\n"
  },
  {
    "path": "pygount/write.py",
    "content": "\"\"\"\nWriters to store the results of a pygount analysis.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport datetime\nimport json\nimport math\nimport os\nfrom xml.etree import ElementTree\n\nfrom rich.console import Console\nfrom rich.table import Table\n\nimport pygount\n\nfrom . import SourceAnalysis\nfrom .summary import ProjectSummary\n\n#: Version of cloc the --format=cloc-xml pretends to be.\nCLOC_VERSION = \"1.60\"\n\nJSON_FORMAT_VERSION = \"1.1.0\"\n\n\nclass BaseWriter:\n    def __init__(self, target_stream):\n        self._target_stream = target_stream\n        try:\n            self.target_name = self._target_stream.name\n        except AttributeError:\n            self.target_name = \"<io>\"\n        self.project_summary = ProjectSummary()\n        self.started_at = self._utc_now()\n        self.finished_at = None\n        self.files_per_second = 0\n        self.lines_per_second = 0\n        self.duration = None\n        self.duration_in_seconds = 0.0\n        self.has_to_track_progress = True\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n        return False\n\n    def add(self, source_analysis):\n        self.project_summary.add(source_analysis)\n\n    def close(self):\n        self.project_summary.update_file_percentages()\n        self.finished_at = self._utc_now()\n        self.duration = self.finished_at - self.started_at\n        self.duration_in_seconds = max(\n            0.001, self.duration.microseconds * 1e-6 + self.duration.seconds + self.duration.days * 3600 * 24\n        )\n        self.lines_per_second = self.project_summary.total_line_count / self.duration_in_seconds\n        self.files_per_second = self.project_summary.total_file_count / self.duration_in_seconds\n\n    @staticmethod\n    def _utc_now() -> datetime.datetime:\n        # After switching to Python 
3.11+, we can change this to `now(datetime.UTC)`.\n        return datetime.datetime.now(datetime.timezone.utc)\n\n\nclass LineWriter(BaseWriter):\n    \"\"\"\n    Writer that simply writes a line of text for each source code.\n    \"\"\"\n\n    def __init__(self, target_stream):\n        super().__init__(target_stream)\n        self.has_to_track_progress = False\n\n    def add(self, source_analysis):\n        source_line_count = source_analysis.code_count + source_analysis.string_count\n        line_to_write = (\n            f\"{source_line_count}\\t{source_analysis.language}\\t{source_analysis.group}\\t{source_analysis.path}\"\n        )\n        self._target_stream.write(line_to_write + os.linesep)\n\n\nclass ClocXmlWriter(BaseWriter):\n    \"\"\"\n    Writer that writes XML output similar to cloc when called with options\n    --by-file --xml. This kind of output can be processed by Jenkins' SLOCCount\n    plug-in.\n    \"\"\"\n\n    def __init__(self, target_stream):\n        super().__init__(target_stream)\n        self._results_element = ElementTree.Element(\"results\")\n        self._header_element = ElementTree.SubElement(self._results_element, \"header\")\n        ElementTree.SubElement(self._header_element, \"cloc_url\", text=\"https://github.com/roskakori/pygount\")\n        ElementTree.SubElement(self._header_element, \"cloc_version\", text=CLOC_VERSION)\n        self._files_element = ElementTree.SubElement(self._results_element, \"files\")\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        if exc_type is None:\n            # Only write the XML if everything works out.\n            self.close()\n\n    def add(self, source_analysis: SourceAnalysis):\n        super().add(source_analysis)\n        file_attributes = {\n            \"blank\": str(source_analysis.empty_count),\n            \"code\": str(source_analysis.source_count),\n            \"comment\": str(source_analysis.documentation_count),\n            \"language\": 
source_analysis.language,\n            \"name\": source_analysis.path,\n        }\n        ElementTree.SubElement(self._files_element, \"file\", attrib=file_attributes)\n\n    def close(self):\n        super().close()\n        # Add various statistics to <header>.\n        ElementTree.SubElement(self._header_element, \"elapsed_seconds\", text=str(self.duration_in_seconds))\n        ElementTree.SubElement(self._header_element, \"n_files\", text=str(self.project_summary.total_file_count))\n        ElementTree.SubElement(self._header_element, \"n_lines\", text=str(self.project_summary.total_line_count))\n        ElementTree.SubElement(self._header_element, \"files_per_second\", text=f\"{self.files_per_second:f}\")\n        ElementTree.SubElement(self._header_element, \"lines_per_second\", text=f\"{self.lines_per_second:f}\")\n        ElementTree.SubElement(self._header_element, \"report_file\", text=self.target_name)\n\n        # Add totals to <files>.\n        file_attributes = {\n            \"blank\": str(self.project_summary.total_empty_count),\n            \"code\": str(self.project_summary.total_code_count + self.project_summary.total_string_count),\n            \"comment\": str(self.project_summary.total_documentation_count),\n        }\n        ElementTree.SubElement(self._files_element, \"total\", attrib=file_attributes)\n\n        # Write the whole XML file.\n        if self._target_stream.encoding is not None:\n            # Write XML declaration only for files but skip it for io.StringIO.\n            self._target_stream.write(f'<?xml version=\"1.0\" encoding=\"{self._target_stream.encoding}\"?>')\n        xml_root = ElementTree.ElementTree(self._results_element)\n        xml_root.write(self._target_stream, encoding=\"unicode\", xml_declaration=False)\n\n\nclass SummaryWriter(BaseWriter):\n    \"\"\"\n    Writer to summarize the analysis per language in a format that can easily\n    be read by humans.\n    \"\"\"\n\n    _COLUMNS_WITH_JUSTIFY = (\n        
(\"Language\", \"left\"),\n        (\"Files\", \"right\"),\n        (\"%\", \"right\"),\n        (\"Code\", \"right\"),\n        (\"%\", \"right\"),\n        (\"Comment\", \"right\"),\n        (\"%\", \"right\"),\n    )\n\n    def close(self):\n        super().close()\n\n        table = Table()\n        for column, justify in self._COLUMNS_WITH_JUSTIFY:\n            table.add_column(column, justify=justify, overflow=\"fold\")\n\n        language_summaries = sorted(self.project_summary.language_to_language_summary_map.values(), reverse=True)\n        for index, language_summary in enumerate(language_summaries, start=1):\n            table.add_row(\n                language_summary.language,\n                str(language_summary.file_count),\n                formatted_percentage(language_summary.file_percentage),\n                str(language_summary.code_count),\n                formatted_percentage(language_summary.code_percentage),\n                str(language_summary.documentation_count),\n                formatted_percentage(language_summary.documentation_percentage),\n                end_section=(index == len(language_summaries)),\n            )\n        table.add_row(\n            \"Sum\",\n            str(self.project_summary.total_file_count),\n            formatted_percentage(100.0),\n            str(self.project_summary.total_code_count),\n            formatted_percentage(self.project_summary.total_code_percentage),\n            str(self.project_summary.total_documentation_count),\n            formatted_percentage(self.project_summary.total_documentation_percentage),\n        )\n        Console(file=self._target_stream, soft_wrap=True).print(table)\n\n\nclass JsonWriter(BaseWriter):\n    \"\"\"\n    Writer JSON output, ideal for further automatic processing.\n    \"\"\"\n\n    def __init__(self, target_stream):\n        super().__init__(target_stream)\n        self.source_analyses = []\n\n    def add(self, source_analysis: SourceAnalysis):\n        
super().add(source_analysis)\n        self.source_analyses.append(\n            {\n                \"codeCount\": source_analysis.code_count,\n                \"documentationCount\": source_analysis.documentation_count,\n                \"emptyCount\": source_analysis.empty_count,\n                \"group\": source_analysis.group,\n                \"isCountable\": source_analysis.is_countable,\n                \"language\": source_analysis.language,\n                \"lineCount\": source_analysis.line_count,\n                \"path\": source_analysis.path,\n                \"state\": source_analysis.state.name,\n                \"stateInfo\": source_analysis.state_info,\n                \"sourceCount\": source_analysis.source_count,\n            }\n        )\n\n    def close(self):\n        # NOTE: JSON names use camel case to follow JSLint's guidelines, see <https://www.jslint.com/>.\n        super().close()\n        json_map = {\n            \"formatVersion\": JSON_FORMAT_VERSION,\n            \"pygountVersion\": pygount.__version__,\n            \"files\": self.source_analyses,\n            \"languages\": [\n                {\n                    \"documentationCount\": language_summary.documentation_count,\n                    \"documentationPercentage\": language_summary.documentation_percentage,\n                    \"codeCount\": language_summary.code_count,\n                    \"codePercentage\": language_summary.code_percentage,\n                    \"emptyCount\": language_summary.empty_count,\n                    \"emptyPercentage\": language_summary.empty_percentage,\n                    \"fileCount\": language_summary.file_count,\n                    \"filePercentage\": language_summary.file_percentage,\n                    \"isPseudoLanguage\": language_summary.is_pseudo_language,\n                    \"language\": language_summary.language,\n                    \"sourceCount\": language_summary.source_count,\n                    
\"sourcePercentage\": language_summary.source_percentage,\n                    \"stringCount\": language_summary.string_count,\n                    \"stringPercentage\": language_summary.string_percentage,\n                }\n                for language_summary in self.project_summary.language_to_language_summary_map.values()\n            ],\n            \"runtime\": {\n                \"durationInSeconds\": self.duration_in_seconds,\n                \"filesPerSecond\": self.files_per_second,\n                \"finishedAt\": self.finished_at.isoformat(),\n                \"linesPerSecond\": self.lines_per_second,\n                \"startedAt\": self.started_at.isoformat(),\n            },\n            \"summary\": {\n                \"totalCodeCount\": self.project_summary.total_code_count,\n                \"totalCodePercentage\": self.project_summary.total_code_percentage,\n                \"totalDocumentationCount\": self.project_summary.total_documentation_count,\n                \"totalDocumentationPercentage\": self.project_summary.total_documentation_percentage,\n                \"totalEmptyCount\": self.project_summary.total_empty_count,\n                \"totalEmptyPercentage\": self.project_summary.total_empty_percentage,\n                \"totalFileCount\": self.project_summary.total_file_count,\n                \"totalSourceCount\": self.project_summary.total_source_count,\n                \"totalSourcePercentage\": self.project_summary.total_source_percentage,\n                \"totalStringCount\": self.project_summary.total_string_count,\n                \"totalStringPercentage\": self.project_summary.total_string_percentage,\n            },\n        }\n        json.dump(json_map, self._target_stream)\n\n\ndef digit_width(line_count: int) -> int:\n    assert line_count >= 0\n    return math.ceil(math.log10(line_count + 1)) if line_count != 0 else 1\n\n\ndef formatted_percentage(percentage: float) -> str:\n    assert percentage >= 0.0\n    assert 
percentage <= 100.0\n    return f\"{percentage:.01f}\"\n"
  },
  {
    "path": "pygount/xmldialect.py",
    "content": "\"\"\"\nFunction to obtain the language dialect used by XML source code.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport logging\nimport re\nimport xml.sax\n\nfrom pygount.common import WHITE_SPACE_CHARACTERS\n\n# TODO #10: Replace regex for DTD by working DTD handler.\n#: Regular expression to obtain DTD.\n_DTD_REGEX = re.compile(r'<!DOCTYPE\\s+(?P<name>[a-zA-Z][a-zA-Z-]*)\\s+PUBLIC\\s+\"(?P<public_id>.+)\"')\n_REGEX_PATTERNS_AND_DIALECTS = (\n    (\".*DocBook.*\", \"DocBook XML\"),\n    (\".+ SVG .+\", \"SVG XML\"),\n)\n_REGEXES_AND_DIALECTS = [(re.compile(pattern), dialect) for pattern, dialect in _REGEX_PATTERNS_AND_DIALECTS]\nfor public_id_regex, dialect in _REGEX_PATTERNS_AND_DIALECTS:\n    assert public_id_regex is not None\n    assert dialect is not None\n    assert dialect.strip() != \"\"\n#: Regex to detect Sax error messages with uninformative paths like '<unknown>'.\n_SAX_MESSAGE_WITHOUT_PATH_PATTERN = re.compile(r\"^<.+>(?P<message_without_path>:\\d+:\\d+.+)\")\n\n_log = logging.getLogger(\"pygount\")\n\n\nclass SaxParserDone(Exception):\n    \"\"\"\n    Pseudo error to indicate that the Sax parser ist done.\n    \"\"\"\n\n\nclass XmlDialectHandler(xml.sax.ContentHandler, xml.sax.handler.DTDHandler):\n    def __init__(self, max_element_count=100):\n        super().__init__()\n        self.dialect = None\n        self._path = \"\"\n        self._element_count = 0\n        self._max_element_count = max_element_count\n\n    def _set_dialect_and_stop_parsing(self, dialect):\n        self.dialect = dialect\n        raise SaxParserDone(f\"language detected: {dialect}\")\n\n    def startElement(self, name, attrs):\n        self._element_count += 1\n        if self._element_count == self._max_element_count:\n            raise SaxParserDone(f\"no language found after parsing {self._element_count} elements\")\n        self._path += \"/\" + name\n        xmlns = 
attrs.get(\"xmlns\", \"\")\n        if (self._path == \"/project\") and (\"name\" in attrs):\n            self._set_dialect_and_stop_parsing(\"Ant\")\n        elif (self._path in (\"/book/title\", \"/chapter/title\")) or (xmlns == \"http://docbook.org/ns/docbook\"):\n            self._set_dialect_and_stop_parsing(\"DocBook XML\")\n        elif xmlns == \"http://xmlns.jcp.org/xml/ns/javaee\":\n            self._set_dialect_and_stop_parsing(\"JavaEE XML\")\n        elif xmlns.startswith(\"http://maven.apache.org/POM\"):\n            self._set_dialect_and_stop_parsing(\"Maven\")\n        elif xmlns.startswith(\"http://www.netbeans.org/ns/project/\"):\n            self._set_dialect_and_stop_parsing(\"NetBeans Project\")\n\n    def endElement(self, name):\n        self._path = self._path[: -len(name) - 1]\n\n\ndef xml_dialect(xml_path, xml_code):\n    # TODO #10: Remove hack to obtain DTD using a regex instead of a DTDHandler.\n    xml_code_witout_header = without_xml_header(xml_code)\n    dtd_match = _DTD_REGEX.match(xml_code_witout_header)\n    if dtd_match is not None:\n        public_id = dtd_match.group(\"public_id\")\n        for public_id_regex, dialect in _REGEXES_AND_DIALECTS:\n            if public_id_regex.match(public_id):\n                return dialect\n\n    xml_dialect_handler = XmlDialectHandler()\n    parser = xml.sax.make_parser()\n    parser.setContentHandler(xml_dialect_handler)\n    parser.setFeature(xml.sax.handler.feature_external_ges, False)\n    parser.setFeature(xml.sax.handler.feature_external_pes, False)\n    parser.setFeature(xml.sax.handler.feature_validation, False)\n    try:\n        parser.feed(xml_code)\n        # NOTE: We can only call close() when the parser has finished,\n        # otherwise close() raises a SAXException('parser finished').\n        parser.close()\n    except SaxParserDone:\n        # Language has been determined or the parser has given up.\n        pass\n    except (ValueError, xml.sax.SAXException) as error:\n     
   # NOTE: ValueError is raised on unknown url type.\n        error_message = str(error)\n        message_without_path_match = _SAX_MESSAGE_WITHOUT_PATH_PATTERN.match(error_message)\n        if message_without_path_match is not None:\n            # HACK: Replace uninformative sax path like '<unknown>' with actual XML path.\n            error_message = xml_path + message_without_path_match.group(\"message_without_path\")\n        _log.warning(error_message)\n    except OSError as error:\n        _log.warning(\"%s: cannot analyze XML dialect: %s\", xml_path, error)\n    return xml_dialect_handler.dialect\n\n\ndef without_xml_header(xml_code: str) -> str:\n    result = xml_code.lstrip(WHITE_SPACE_CHARACTERS)\n    if result.startswith(\"<?xml\"):\n        end_if_xml_declaration = result.find(\"?>\")\n        if end_if_xml_declaration != -1:\n            result = result[end_if_xml_declaration + 2 :].lstrip(WHITE_SPACE_CHARACTERS)\n    return result\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"pygount\"\nversion = \"3.3.0\"\ndescription = \"count source lines of code (SLOC) using pygments\"\nauthors = [{ name = \"Thomas Aglassinger\", email = \"roskakori@users.sourceforge.net\" }]\nrequires-python = \">=3.10, <4\"\nreadme = \"README.md\"\nlicense = \"BSD-3-Clause\"\nkeywords = [\n    \"code analysis\",\n    \"count\",\n    \"SLOC\",\n]\nclassifiers = [\n    \"Development Status :: 5 - Production/Stable\",\n    \"Environment :: Console\",\n    \"Intended Audience :: Developers\",\n    \"License :: OSI Approved :: BSD License\",\n    \"Natural Language :: English\",\n    \"Operating System :: OS Independent\",\n    \"Programming Language :: Python :: 3 :: Only\",\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n    \"Programming Language :: Python :: 3.14\",\n    \"Topic :: Software Development\",\n]\ndependencies = [\n    \"chardet>=5,<6\",\n    \"gitpython~=3.1\",\n    \"pygments>=2,<3\",\n    \"rich>=14\",\n]\n\n[project.urls]\nHomepage = \"https://github.com/roskakori/pygount\"\nRepository = \"https://github.com/roskakori/pygount.git\"\nDocumentation = \"https://pygount.readthedocs.io\"\n\"Issue Tracker\" = \"https://github.com/roskakori/pygount/issues\"\nChanges = \"https://pygount.readthedocs.io/en/latest/changes.html\"\n\n[project.scripts]\npygount = \"pygount.command:main\"\n\n[tool.pytest.ini_options]\nminversion = \"9.0\"\naddopts = [\n    \"-rA\"\n]\ntestpaths = [\n    \"tests\",\n]\n\n[dependency-groups]\ndev = [\n    \"coveralls>=4,<5\",\n    \"coverage>=7,<8\",\n    \"hatchling>=1.27.0\",\n    \"mkdocs>=1.6,<2\",\n    \"mkdocs-material>=9\",\n    \"pytest>=9.0.3\",\n    \"pytest-cov>=7,<8\",\n    \"pre-commit>=4,<5\",\n    \"ruff>=0.15\",\n]\n\n[tool.uv]\ndefault-groups = [\n    \"dev\",\n]\n\n[tool.hatch.build.targets.sdist]\nexclude = [\".idea\", \".github\", 
\".readthedocs.yaml\"]\n\n[tool.hatch.build.targets.wheel]\npackages = [\"pygount\"]\n\n[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.ruff]\nexclude = [\n    \".eggs\",\n    \".git\",\n    \".pytest_cache\",\n    \".pytype\",\n    \".ruff_cache\",\n    \".vscode\",\n    \"__pypackages__\",\n    \"_build\",\n    \"build\",\n    \"dist\",\n    \"htmlcov\",\n]\nline-length = 120\ntarget-version = \"py39\"\n\n[tool.ruff.lint]\nignore = [\n    # Missing trailing comma → May cause conflicts when used with the formatter.\n    \"COM812\",\n    # Too many branches\n    \"PLR0912\",\n    # Too many arguments in function definition\n    \"PLR0913\",\n    # Too many statements\n    \"PLR0915\",\n    # Magic value used in comparison\n    \"PLR2004\",\n    # TODO#89 Enable checks for usage of pathlib.\n    \"PTH100\",\n    \"PTH103\",\n    \"PTH107\",\n    \"PTH109\",\n    \"PTH110\",\n    \"PTH112\",\n    \"PTH114\",\n    \"PTH118\",\n    \"PTH119\",\n    \"PTH120\",\n    \"PTH122\",\n    \"PTH123\",\n    \"PTH202\",\n    \"PTH207\",\n    \"PTH208\",\n    # Unneccesarry assign → We regularly use `result = ...; return result` to examine the result in the debugger.\n    \"RET504\",\n    # TODO#506 Enable RUF012 check for mutable class attributes.\n    # Mutable class attributes should be annotated with `typing.ClassVar`\n    \"RUF012\",\n    # Avoid specifying long messages outside the exception class\n    \"TRY003\",\n    # Abstract `raise` to an inner function\n    \"TRY301\",\n]\nselect = [\n    # flake8-builtins\n    \"A\",\n    # flake8-bugbear\n    \"B\",\n    # flake8-commas\n    \"COM\",\n    # flake8-comprehensions\n    \"C4\",\n    # flake8-django\n    \"DJ\",\n    # flake8-datetimez\n    \"DTZ\",\n    # pycodestyle\n    \"E\",\n    # Pyflakes\n    \"F\",\n    # isort\n    \"I\",\n    # flake8-no-pep420\n    \"INP\",\n    #  flake8-gettext\n    \"INT\",\n    # flake8-logging\n    \"LOG\",\n    # perflint\n    \"PERF\",\n    # 
pygrep-hooks\n    \"PGH\",\n    # flake8-pie\n    \"PIE\",\n    # pylint\n    \"PL\",\n    # flake8-use-pathlib\n    \"PTH\",\n    # refactor\n    \"R\",\n    # flake8-raise\n    \"RSE\",\n    # flake8-return\n    \"RET\",\n    # ruff specific rules\n    \"RUF\",\n    # flake8-self\n    \"SLF\",\n    # flake8-simplify\n    \"SIM\",\n    # tryceratops\n    \"TRY\",\n    # flake8-debugger\n    \"T10\",\n    # flake8-print\n    \"T20\",\n    # pyupgrade\n    \"UP\",\n]\n\n[tool.ruff.lint.isort]\nknown-first-party = [\"pygount\", \"scripts\", \"tests\"]\n"
  },
  {
    "path": "scripts/build_documentation.sh",
    "content": "#!/bin/sh\n# Build documentation using Sphinx\nset -e\necho \"📖 Building documentation\"\nmkdocs build\necho \"✅ Successfully built documentation in site/index.html\"\n"
  },
  {
    "path": "scripts/build_movie.sh",
    "content": "#!/bin/sh\n# Build a gource movie about the development.\n#\n# For this to work, use macOS and install the following:\n#\n#   brew gource ffmpeg\n#\n# See also: <https://www.ekreative.com/blog/producing-your-own-git-repository-animated-visualization-video/>\nset -ex\nmkdir -p build\ngource --auto-skip-seconds 1 --file-idle-time 0 --hide dirnames,filenames,mouse --seconds-per-day 1 --title Pygount -1920x1080 --output-ppm-stream - . | ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libx264 -preset ultrafast -pix_fmt yuv420p -crf 1 -threads 0 -bf 0 /tmp/pygount_movie.mp4\n"
  },
  {
    "path": "scripts/test_coverage.sh",
    "content": "#!/bin/sh\nset -e\nuv run pytest --cov-reset --cov=pygount --cov-branch --cov-report html\necho \"To view results run: firefox htmlcov/index.html &\"\n"
  },
  {
    "path": "scripts/update_dependencies.sh",
    "content": "#!/bin/sh\n# Update requirements files and pre-commit hooks to current versions.\nset -e\necho \"🧱 Updating project\"\nuv sync\nuv lock --upgrade\necho \"🛠️ Updating pre-commit\"\nuv run pre-commit autoupdate\necho \"🎉 Successfully updated dependencies\"\n"
  },
  {
    "path": "tests/__init__.py",
    "content": "# Deliberately left empty.\n"
  },
  {
    "path": "tests/_common.py",
    "content": "\"\"\"\nCommon constants and functions used by multiple tests.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport os\nimport shutil\nimport unittest\nfrom collections.abc import Iterator, Sequence\nfrom contextlib import contextmanager\nfrom tempfile import NamedTemporaryFile\nfrom typing import IO, TextIO, Union\n\nPYGOUNT_PROJECT_FOLDER = os.path.dirname(os.path.dirname(__file__))\nPYGOUNT_SOURCE_FOLDER = os.path.join(PYGOUNT_PROJECT_FOLDER, \"pygount\")\n\n\nclass TempFolderTest(unittest.TestCase):\n    def setUp(self):\n        self.tests_temp_folder = os.path.join(PYGOUNT_PROJECT_FOLDER, \"tests\", \".temp\")\n        os.makedirs(self.tests_temp_folder, exist_ok=True)\n\n    def create_temp_file(\n        self, relative_target_path, content: Union[str, bytes, Sequence[str]], encoding=\"utf-8\", do_create_folder=False\n    ):\n        result = os.path.join(self.tests_temp_folder, relative_target_path)\n        if do_create_folder:\n            os.makedirs(os.path.dirname(result), exist_ok=True)\n        with open(result, \"w\", encoding=encoding) as target_file:\n            if isinstance(content, (str, bytes)):\n                target_file.write(content)\n            else:\n                for line in content:\n                    target_file.write(line)\n                    target_file.write(\"\\n\")\n        return result\n\n    def create_temp_binary_file(self, relative_target_path, content: bytes):\n        result = os.path.join(self.tests_temp_folder, relative_target_path)\n        with open(result, \"wb\") as target_file:\n            target_file.write(content)\n        return result\n\n    def tearDown(self):\n        shutil.rmtree(self.tests_temp_folder)\n\n\n@contextmanager\ndef temp_binary_file(data: bytes) -> Iterator[IO]:\n    with NamedTemporaryFile(mode=\"wb+\", suffix=\".bin\") as result:\n        result.write(data)\n        result.flush()\n        
result.seek(0)\n        yield result\n\n\n@contextmanager\ndef temp_source_file(suffix: str, lines: list[str], *, encoding: str = \"utf-8\") -> Iterator[TextIO]:\n    with NamedTemporaryFile(encoding=encoding, mode=\"w+\", suffix=f\".{suffix}\") as result:\n        result.write(\"\\n\".join(lines))\n        result.flush()\n        result.seek(0)\n        yield result\n"
  },
  {
    "path": "tests/test_analysis.py",
    "content": "\"\"\"\nTests for pygount source code analysis.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport glob\nimport os\nimport unittest\nfrom io import BytesIO, StringIO\n\nimport pytest\nfrom pygments import lexers, token\n\nimport pygount\nfrom pygount import Error as PygountError\nfrom pygount import analysis, common\nfrom pygount.analysis import (\n    _delined_tokens,\n    _line_parts,\n    _pythonized_comments,\n    base_language,\n    guess_lexer,\n    is_markup_file,\n)\n\nfrom ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest, temp_source_file\nfrom .test_xmldialect import EXAMPLE_ANT_CODE\n\n\nclass SourceScannerTest(TempFolderTest):\n    def setUp(self):\n        super().setUp()\n        self._tests_folder = os.path.dirname(__file__)\n\n    def test_can_find_no_files(self):\n        scanner = analysis.SourceScanner([])\n        actual_paths = list(scanner.source_paths())\n        assert actual_paths == []\n\n    def test_can_find_any_files(self):\n        scanner = analysis.SourceScanner([PYGOUNT_SOURCE_FOLDER])\n        actual_paths = list(scanner.source_paths())\n        assert actual_paths != []\n\n    def test_can_find_python_files(self):\n        scanner = analysis.SourceScanner([PYGOUNT_SOURCE_FOLDER], \"py\")\n        actual_paths = list(scanner.source_paths())\n        assert actual_paths != []\n        for path_data in actual_paths:\n            actual_suffix = os.path.splitext(path_data.source_path)[1]\n            assert actual_suffix == \".py\"\n\n    def test_can_skip_dot_folder(self):\n        project_folder_name = \"project\"\n        project_folder = os.path.join(self.tests_temp_folder, project_folder_name)\n        name_to_include = \"include.py\"\n        relative_path_to_include = os.path.join(project_folder_name, \"include\", name_to_include)\n        self.create_temp_file(relative_path_to_include, \"include = 1\", 
do_create_folder=True)\n        relative_path_to_skip = os.path.join(project_folder_name, \".skip\", \"skip.py\")\n        self.create_temp_file(relative_path_to_skip, \"skip = 2\", do_create_folder=True)\n\n        scanner = analysis.SourceScanner([project_folder])\n        scanned_names = [os.path.basename(path_data.source_path) for path_data in scanner.source_paths()]\n        assert scanned_names == [name_to_include]\n\n    def test_succeeds_on_not_git_extension(self):\n        non_repo_urls = [[\"https://github.com/roskakori/pygount/\"], [\"git@github.com:roskakori/pygount\"]]\n        for non_repo_url in non_repo_urls:\n            with analysis.SourceScanner(non_repo_url) as scanner:\n                _ = list(scanner.source_paths())\n\n    def test_fails_on_non_git_urls(self):\n        non_repo_urls = [[\"https://no/git/url\"], [\"https://google.com/nogit\"]]\n        for non_repo_url in non_repo_urls:\n            with (\n                analysis.SourceScanner(non_repo_url) as scanner,\n                pytest.raises(pygount.Error, match=\"URL to git repository\"),\n            ):\n                _ = list(scanner.source_paths())\n\n    def test_can_find_python_files_in_dot(self):\n        scanner = analysis.SourceScanner([\".\"], \"py\")\n        actual_paths = list(scanner.source_paths())\n        assert actual_paths != []\n        for path_data in actual_paths:\n            actual_suffix = os.path.splitext(path_data.source_path)[1]\n            assert actual_suffix == \".py\"\n\n    def test_can_find_files_from_mixed_cloned_git_remote_url_and_local(self):\n        git_remote_url = \"https://github.com/roskakori/pygount.git\"\n        with analysis.SourceScanner([git_remote_url, PYGOUNT_SOURCE_FOLDER]) as scanner:\n            actual_paths = list(scanner.source_paths())\n            assert actual_paths != []\n            assert actual_paths[0].source_path != actual_paths[-1].source_path\n            assert actual_paths[-1].tmp_dir is not None\n\n\nclass 
AnalysisTest(unittest.TestCase):\n    def test_can_deline_tokens(self):\n        assert list(_delined_tokens([(token.Comment, \"# a\")])) == [(token.Comment, \"# a\")]\n        assert list(_delined_tokens([(token.Comment, \"# a\\n#  b\")])) == [\n            (token.Comment, \"# a\\n\"),\n            (token.Comment, \"#  b\"),\n        ]\n        assert list(_delined_tokens([(token.Comment, \"# a\\n#  b\\n\")])) == [\n            (token.Comment, \"# a\\n\"),\n            (token.Comment, \"#  b\\n\"),\n        ]\n        assert list(_delined_tokens([(token.Comment, \"# a\\n#  b\\n # c\\n\")])) == [\n            (token.Comment, \"# a\\n\"),\n            (token.Comment, \"#  b\\n\"),\n            (token.Comment, \" # c\\n\"),\n        ]\n\n    def test_can_compute_python_line_parts(self):\n        python_lexer = lexers.get_lexer_by_name(\"python\")\n        assert list(_line_parts(python_lexer, \"#\")) == [set(\"d\")]\n        assert list(_line_parts(python_lexer, \"s = 'x'  # x\")) == [set(\"cds\")]\n\n    def test_can_detect_white_text(self):\n        python_lexer = lexers.get_lexer_by_name(\"python\")\n        assert list(_line_parts(python_lexer, \"{[()]};\")) == [set()]\n        assert list(_line_parts(python_lexer, \"pass\")) == [set()]\n\n    def test_can_convert_python_strings_to_comments(self):\n        source_code = '#!/bin/python\\n\"Some tool.\"\\n#(C) by me\\ndef x():\\n    \"Some function\"\\n    return 1'\n        python_lexer = lexers.get_lexer_by_name(\"python\")\n        python_tokens = python_lexer.get_tokens(source_code)\n        for token_type, _ in list(_pythonized_comments(_delined_tokens(python_tokens))):\n            assert token_type not in token.String\n\n    def test_can_analyze_python(self):\n        source_lines = [\n            '\"Some tool.\"',\n            \"#!/bin/python\",\n            \"#(C) by me\",\n            \"def x():\",\n            '    \"Some function\"',\n            '    return \"abc\"',\n        ]\n        
actual_line_parts = _line_parts_with_detected_markup(\"python\", source_lines)\n        expected_line_parts = [{\"d\"}, {\"d\"}, {\"d\"}, {\"c\"}, {\"d\"}, {\"c\", \"s\"}]\n        assert actual_line_parts == expected_line_parts\n\n    def test_can_analyze_c(self):\n        source_lines = [\n            \"/*\",\n            \" * The classic hello world for C99.\",\n            \" */\",\n            \"#include <stdio.h>\",\n            \"int main(void) {\",\n            '   puts(\"Hello, World!\");',\n            \"}\",\n        ]\n        actual_line_parts = _line_parts_with_detected_markup(\"c\", source_lines)\n        expected_line_parts = [{\"d\"}, {\"d\"}, {\"d\"}, {\"c\"}, {\"c\"}, {\"c\", \"s\"}, set()]\n        assert actual_line_parts == expected_line_parts\n\n\ndef test_can_detect_all_lines_as_documentation_with_markup_enabled():\n    source_lines = [\n        \"/*\",\n        \" * The classic hello world for C99.\",\n        \" */\",\n        \"#include <stdio.h>\",\n        \"int main(void) {\",\n        '   puts(\"Hello, World!\");',\n        \"}\",\n    ]\n    actual_line_parts = _line_parts_with_detected_markup(\"markdown\", source_lines)\n    assert all(line_part == \"d\" for line_part in actual_line_parts[-1])\n    assert actual_line_parts[-1:] == [set()]\n\n\ndef _line_parts_with_detected_markup(lexer_name: str, source_lines: list[str]) -> list[set[str]]:\n    lexer = lexers.get_lexer_by_name(lexer_name)\n    is_markup = lexer_name in [\"markdown\", \"md\", \"restructuredtext\", \"rst\", \"rest\", \"groff\"]\n    source_code = \"\\n\".join(source_lines)\n    return list(_line_parts(lexer, source_code, is_markup=is_markup))\n\n\nclass _NonSeekableEmptyBytesIO(BytesIO):\n    # Class to create a 'dummy object that mimics a non-seekable file handle'\n    def seekable(self) -> bool:\n        return False\n\n\nclass FileAnalysisTest(TempFolderTest):\n    def test_can_analyze_encoding_error(self):\n        test_path = 
self.create_temp_file(\"encoding_error.py\", 'print(\"\\N{EURO SIGN}\")', encoding=\"cp1252\")\n        source_analysis = analysis.SourceAnalysis.from_file(test_path, \"test\", encoding=\"utf-8\")\n        assert source_analysis.language == \"__error__\"\n        assert source_analysis.state == analysis.SourceState.error\n        assert \"0x80\" in str(source_analysis.state_info)\n\n    def test_can_detect_silent_dos_batch_remarks(self):\n        test_bat_path = self.create_temp_file(\n            \"test_can_detect_silent_dos_batch_remarks.bat\",\n            [\"rem normal comment\", \"@rem silent comment\", \"echo some code\"],\n        )\n        source_analysis = analysis.SourceAnalysis.from_file(test_bat_path, \"test\", encoding=\"utf-8\")\n        assert source_analysis.language == \"Batchfile\"\n        assert source_analysis.code_count == 1\n        assert source_analysis.documentation_count == 2\n\n    def test_can_ignore_almost_magic_comment(self):\n        test_bat_path = self.create_temp_file(\n            \"test_can_ignore_almost_magic_comment.json\",\n            ['{\"x\":\"coding:no_such_coding\"'],\n        )\n        source_analysis = analysis.SourceAnalysis.from_file(test_bat_path, \"test\")\n        assert source_analysis.language.lower() == \"json\"\n        assert source_analysis.code_count == 1\n        assert source_analysis.documentation_count == 0\n\n    def test_fails_on_unknown_magic_encoding_comment(self):\n        test_path = self.create_temp_file(\n            \"test_fails_on_unknown_magic_encoding_comment.py\", [\"# -*- coding: no_such_encoding -*-\", 'print(\"hello\")']\n        )\n        no_such_encoding = analysis.encoding_for(test_path)\n        assert no_such_encoding == \"no_such_encoding\"\n        source_analysis = analysis.SourceAnalysis.from_file(test_path, \"test\", encoding=no_such_encoding)\n        assert source_analysis.language == \"__error__\"\n        assert source_analysis.state == analysis.SourceState.error\n       
 assert \"unknown encoding\" in str(source_analysis.state_info)\n\n    def test_can_analyze_oracle_sql(self):\n        test_oracle_sql_path = self.create_temp_file(\n            \"test_can_analyze_oracle_sql.pls\",\n            [\"-- Oracle SQL example using an obscure suffix.\", \"select *\", \"from some_table;\"],\n        )\n        source_analysis = analysis.SourceAnalysis.from_file(test_oracle_sql_path, \"test\", encoding=\"utf-8\")\n        assert source_analysis.language.lower().endswith(\"sql\")\n        assert source_analysis.code_count == 2\n        assert source_analysis.documentation_count == 1\n\n    def test_can_analyze_webfocus(self):\n        test_fex_path = self.create_temp_file(\n            \"some.fex\", [\"-* comment\", \"-type some text\", \"table file some print * end;\"]\n        )\n        source_analysis = analysis.SourceAnalysis.from_file(test_fex_path, \"test\", encoding=\"utf-8\")\n        assert source_analysis.language == \"WebFOCUS\"\n        assert source_analysis.code_count == 2\n        assert source_analysis.documentation_count == 1\n\n    def test_can_analyze_xml_dialect(self):\n        build_xml_path = self.create_temp_file(\"build.xml\", EXAMPLE_ANT_CODE)\n        source_analysis = analysis.SourceAnalysis.from_file(build_xml_path, \"test\")\n        assert source_analysis.state == analysis.SourceState.analyzed\n        assert source_analysis.language == \"Ant\"\n\n    def test_can_analyze_unknown_language(self):\n        unknown_language_path = self.create_temp_file(\"some.unknown_language\", [\"some\", \"lines\", \"of\", \"text\"])\n        source_analysis = analysis.SourceAnalysis.from_file(unknown_language_path, \"test\")\n        assert source_analysis.state == analysis.SourceState.unknown\n\n    def test_can_detect_binary_source_code(self):\n        binary_path = self.create_temp_binary_file(\"some_django.mo\", b\"hello\\0world!\")\n        source_analysis = analysis.SourceAnalysis.from_file(binary_path, \"test\", 
encoding=\"utf-8\")\n        assert source_analysis.state == analysis.SourceState.binary\n        assert source_analysis.code_count == 0\n\n    def test_can_analyze_stringio(self):\n        test_path = \"imaginary/path/to/file.py\"\n        test_code = \"from random import randint\\n\\n# Print a random dice roll\\nprint(randint(6))\\n\"\n        source_analysis = analysis.SourceAnalysis.from_file(test_path, \"test\", file_handle=StringIO(test_code))\n        assert source_analysis.state == analysis.SourceState.analyzed\n        assert source_analysis.language == \"Python\"\n        assert source_analysis.code_count == 2\n\n    def test_can_analyze_bytesio(self):\n        test_path = \"imaginary/path/to/file.py\"\n        test_code = b\"from random import randint\\n\\n# Print a random dice roll\\nprint(randint(6))\\n\"\n        source_analysis = analysis.SourceAnalysis.from_file(test_path, \"test\", file_handle=BytesIO(test_code))\n        assert source_analysis.state == analysis.SourceState.analyzed\n        assert source_analysis.language == \"Python\"\n        assert source_analysis.code_count == 2\n\n    def test_can_analyze_embedded_language(self):\n        test_html_django_path = self.create_temp_file(\n            \"some.html\",\n            [\"<!DOCTYPE html>\", \"{% load i18n %}\", '<html lang=\"{{ language_code }}\" />'],\n        )\n        source_analysis = analysis.SourceAnalysis.from_file(test_html_django_path, \"test\", encoding=\"utf-8\")\n        assert source_analysis.language.lower() == \"html+django/jinja\"\n        assert source_analysis.code_count == 3\n\n    def test_can_analyze_generated_name(self):\n        test_uv_lock_path = self.create_temp_file(\"uv.lock\", [])\n        source_analysis = analysis.SourceAnalysis.from_file(\n            test_uv_lock_path,\n            \"test\",\n            generated_name_regexes=pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_NAME_PATTERNS_TEXT),\n        )\n        assert 
source_analysis.state == analysis.SourceState.generated\n\n    def test_can_merge_embedded_language(self):\n        test_html_django_path = self.create_temp_file(\n            \"some.html\",\n            [\"<!DOCTYPE html>\", \"{% load i18n %}\", '<html lang=\"{{ language_code }}\" />'],\n        )\n        source_analysis = analysis.SourceAnalysis.from_file(\n            test_html_django_path, \"test\", encoding=\"utf-8\", merge_embedded_language=True\n        )\n        assert source_analysis.language.lower() == \"html\"\n        assert source_analysis.code_count == 3\n\n    def test_can_analyze_unknown_magic_comment_encoding(self):\n        test_python_path = self.create_temp_file(\"some.py\", [\"# -*- coding: no_such_encoding -*-\", \"print('hello')\"])\n        source_analysis = analysis.SourceAnalysis.from_file(test_python_path, \"test\")\n        assert source_analysis.language.lower() == \"__error__\"\n        assert source_analysis.state_info == \"unknown encoding: no_such_encoding\"\n\n    def test_fails_on_non_seekable_file_handle_with_encoding_automatic(self):\n        file_handle = _NonSeekableEmptyBytesIO()\n\n        with pytest.raises(PygountError, match=r\".*file handle must be seekable.*\"):\n            analysis.SourceAnalysis.from_file(\"README.md\", \"test\", file_handle=file_handle, encoding=\"automatic\")\n\n    def test_fails_on_non_seekable_file_handle_with_encoding_chardet(self):\n        file_handle = _NonSeekableEmptyBytesIO()\n\n        with pytest.raises(PygountError, match=r\".*file handle must be seekable.*\"):\n            analysis.SourceAnalysis.from_file(\"README.md\", \"test\", file_handle=file_handle, encoding=\"chardet\")\n\n\n@pytest.mark.parametrize(\n    \"suffix, code_count, doc_count, expected_language_lower\",\n    [\n        (\"rst\", 0, 3, \"restructuredtext\"),\n        (\"md\", 0, 3, \"markdown\"),\n        (\"txt\", 0, 3, \"text only\"),\n        (\"4\", 0, 3, \"groff\"),\n    ],\n)\ndef 
test_can_analyze_markup_as_plain_documentation(\n    suffix, code_count: int, doc_count: int, expected_language_lower: str\n):\n    source_lines = [\"<!DOCTYPE html>\", \"{% load i18n %}\", \"\", \"  \", '<html lang=\"{{ language_code }}\" />']\n    expected_empty_count = 2\n    expected_documentation_count = len(source_lines) - expected_empty_count\n    with temp_source_file(suffix, source_lines) as test_file:\n        source_analysis = analysis.SourceAnalysis.from_file(test_file.name, \"test\", encoding=\"utf-8\")\n        assert source_analysis.language.lower() == expected_language_lower\n        assert source_analysis.code_count == 0\n        assert source_analysis.documentation_count == expected_documentation_count\n        assert source_analysis.empty_count == expected_empty_count\n\n\ndef test_can_repr_source_analysis_from_file():\n    source_analysis = analysis.SourceAnalysis(\"some.py\", \"Python\", \"some\", 1, 2, 3, 4, analysis.SourceState.analyzed)\n    expected_source_analysis_repr = (\n        \"SourceAnalysis(path='some.py', language='Python', group='some', \"\n        \"state=analyzed, code_count=1, documentation_count=2, empty_count=3, string_count=4)\"\n    )\n    assert repr(source_analysis) == expected_source_analysis_repr\n    assert repr(source_analysis) == str(source_analysis)\n\n\ndef test_can_repr_empty_source_analysis_from_file():\n    source_analysis = analysis.SourceAnalysis(\"some.py\", \"__empty__\", \"some\", 0, 0, 0, 0, analysis.SourceState.empty)\n    expected_source_analysis_repr = \"SourceAnalysis(path='some.py', language='__empty__', group='some', state=empty)\"\n    assert repr(source_analysis) == expected_source_analysis_repr\n    assert repr(source_analysis) == str(source_analysis)\n\n\ndef test_can_repr_error_source_analysis_from_file():\n    source_analysis = analysis.SourceAnalysis(\n        \"some.py\", \"__error__\", \"some\", 0, 0, 0, 0, analysis.SourceState.error, \"error details\"\n    )\n    
expected_source_analysis_repr = (\n        \"SourceAnalysis(path='some.py', language='__error__', group='some', state=error, state_info='error details')\"\n    )\n    assert repr(source_analysis) == expected_source_analysis_repr\n    assert repr(source_analysis) == str(source_analysis)\n\n\ndef test_can_guess_lexer_for_python():\n    lexer = guess_lexer(\"some.py\", \"pass\")\n    assert lexer is not None\n    assert lexer.name == \"Python\"\n\n\ndef test_can_guess_lexer_for_plain_text():\n    lexer = guess_lexer(\"README.1st\", \"hello!\")\n    assert lexer is not None\n    assert lexer.name == \"Text\"\n\n\ndef test_can_guess_lexer_for_cmakelists():\n    source_code = \"\\n\".join(\n        [\n            \"cmake_minimum_required(VERSION 2.6)\",\n            \"project(example)\",\n            \"set(CMAKE_CXX_STANDARD 14)\",\n            \"set(SOURCE_FILES example.cpp)\",\n            \"add_executable(example ${SOURCE_FILES})\",\n        ]\n    )\n    lexer = guess_lexer(\"CMakeLists.txt\", source_code)\n    assert lexer is not None\n    assert lexer.name == \"CMake\"\n\n\nclass GeneratedCodeTest(TempFolderTest):\n    _STANDARD_SOURCE_LINES = [\n        \"#!/bin/python3\",\n        \"    # Example code for\",\n        \"    # generated source code.\",\n        '    print(\"I\\'m generated!\")',\n        \"    \",\n    ]\n    _STANDARD_GENERATED_REGEXES = common.regexes_from(\n        common.REGEX_PATTERN_PREFIX + \".*some,.*other,.*generated,.*print\"\n    )\n\n    def test_can_detect_non_generated_code(self):\n        default_generated_regexes = common.regexes_from(analysis.DEFAULT_GENERATED_LINE_PATTERNS_TEXT)\n        with open(__file__, encoding=\"utf-8\") as source_file:\n            matching_line_number_and_regex = analysis.matching_number_line_and_regex(\n                source_file, default_generated_regexes\n            )\n        assert matching_line_number_and_regex is None\n\n    def test_can_detect_generated_code(self):\n        
matching_number_line_and_regex = analysis.matching_number_line_and_regex(\n            GeneratedCodeTest._STANDARD_SOURCE_LINES, GeneratedCodeTest._STANDARD_GENERATED_REGEXES\n        )\n        assert matching_number_line_and_regex is not None\n        matching_number, matching_line, matching_regex = matching_number_line_and_regex\n        assert matching_number == 2\n        assert matching_line == GeneratedCodeTest._STANDARD_SOURCE_LINES[2]\n        assert matching_regex == GeneratedCodeTest._STANDARD_GENERATED_REGEXES[2]\n\n    def test_can_not_detect_generated_code_with_late_comment(self):\n        non_matching_number_line_and_regex = analysis.matching_number_line_and_regex(\n            GeneratedCodeTest._STANDARD_SOURCE_LINES, GeneratedCodeTest._STANDARD_GENERATED_REGEXES, 2\n        )\n        assert non_matching_number_line_and_regex is None\n\n    def test_can_analyze_generated_code_with_own_pattern(self):\n        lines = [\"-- Generiert mit Hau-Ruck-Franz-Deutsch.\", \"select * from sauerkraut;\"]\n        generated_sql_path = self.create_temp_file(\"generated.sql\", lines)\n        source_analysis = analysis.SourceAnalysis.from_file(\n            generated_sql_path, \"test\", generated_regexes=common.regexes_from(\"[regex](?i).*generiert\")\n        )\n        assert source_analysis.state == analysis.SourceState.generated\n\n\nclass SizeTest(TempFolderTest):\n    def test_can_detect_empty_source_code(self):\n        empty_py_path = self.create_temp_binary_file(\"empty.py\", b\"\")\n        source_analysis = analysis.SourceAnalysis.from_file(empty_py_path, \"test\", encoding=\"utf-8\")\n        assert source_analysis.state == analysis.SourceState.empty\n        assert source_analysis.code_count == 0\n\n\ndef test_can_analyze_project_markdown_files():\n    project_root_folder = os.path.dirname(PYGOUNT_PROJECT_FOLDER)\n    for text_path in glob.glob(os.path.join(project_root_folder, \"*.md\")):\n        source_analysis = 
analysis.SourceAnalysis.from_file(text_path, \"test\")\n        assert source_analysis.state == analysis.SourceState.analyzed\n        assert source_analysis.documentation_count > 0\n        assert source_analysis.empty_count > 0\n\n\ndef test_has_no_duplicate_in_pygount_source():\n    duplicate_pool = analysis.DuplicatePool()\n    source_paths = []\n    for sub_folder_name in (\"pygount\", \"tests\"):\n        source_paths.extend(\n            [\n                os.path.join(PYGOUNT_PROJECT_FOLDER, sub_folder_name, source_name)\n                for source_name in os.listdir(os.path.join(PYGOUNT_PROJECT_FOLDER, sub_folder_name))\n            ]\n        )\n    for source_path in source_paths:\n        if source_path.endswith(\".py\"):\n            duplicate_path = duplicate_pool.duplicate_path(source_path)\n            assert duplicate_path is None, f\"{source_path} must not be duplicate of {duplicate_path}\"\n\n\ndef test_can_compute_base_language():\n    assert base_language(\"JavaScript\") == \"JavaScript\"\n    assert base_language(\"JavaScript+Lasso\") == \"JavaScript\"\n    assert base_language(\"JavaScript+\") == \"JavaScript+\"  # no actual language\n    assert base_language(\"C++\") == \"C++\"\n    assert base_language(\"++C\") == \"++C\"  # no actual language\n    assert base_language(\"\") == \"\"  # no actual language, but should not crash either\n\n\nclass DuplicatePoolTest(TempFolderTest):\n    def test_can_distinguish_different_files(self):\n        some_path = self.create_temp_file(__name__ + \"_some\", \"some\")\n        other_path = self.create_temp_file(__name__ + \"_other\", \"other\")\n        duplicate_pool = analysis.DuplicatePool()\n        assert duplicate_pool.duplicate_path(some_path) is None\n        assert duplicate_pool.duplicate_path(other_path) is None\n\n    def test_can_detect_duplicate(self):\n        same_content = \"same\"\n        original_path = self.create_temp_file(\"original\", same_content)\n        duplicate_path = 
self.create_temp_file(\"duplicate\", same_content)\n        duplicate_pool = analysis.DuplicatePool()\n        assert duplicate_pool.duplicate_path(original_path) is None\n        assert original_path == duplicate_pool.duplicate_path(duplicate_path)\n\n\n@pytest.mark.parametrize(\n    \"suffix, expected_result\",\n    [(\"md\", True), (\"MD\", True), (\"mD\", True), (\"rst\", True), (\"py\", False), (\"4\", True), (\"c\", False)],\n)\ndef test_can_detect_markup_file(suffix, expected_result):\n    source_path = f\"some_file_name.{suffix}\"\n    assert is_markup_file(source_path) == expected_result\n"
  },
  {
    "path": "tests/test_command.py",
    "content": "\"\"\"\nTests for pygount command line interface.\n\"\"\"\n\nimport contextlib\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport json\nimport os\nimport tempfile\nfrom xml.etree import ElementTree\n\nimport pytest\n\nimport pygount\nfrom pygount import command\nfrom pygount.command import VALID_OUTPUT_FORMATS, Command\nfrom pygount.common import OptionError\nfrom pygount.write import JSON_FORMAT_VERSION\n\nfrom ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest\n\n\nclass CommandTest(TempFolderTest):\n    def test_fails_on_unknown_output_format(self):\n        unknown_output_format = \"no_such_output_format\"\n        command = Command()\n        with pytest.raises(OptionError, match=unknown_output_format):\n            command.set_output_format(unknown_output_format)\n\n    def test_can_set_encoding(self):\n        command = Command()\n        command.set_encodings(\"automatic;cp1252\")\n        assert command.default_encoding == \"automatic\"\n        assert command.fallback_encoding == \"cp1252\"\n\n    def test_can_execute_on_own_code(self):\n        output_path = os.path.join(self.tests_temp_folder, \"test_can_execute_on_own_code.txt\")\n        with contextlib.suppress(FileNotFoundError):  # Ignore missing file as it is going to be recreated.\n            os.remove(output_path)\n        command = Command()\n        command.set_output(output_path)\n        command.set_output_format(\"cloc-xml\")\n        command.set_source_patterns(PYGOUNT_SOURCE_FOLDER)\n        command.set_suffixes(\"py\")\n        command.execute()\n        cloc_xml_root = ElementTree.parse(output_path)\n        file_elements = cloc_xml_root.findall(\"files/file\")\n        assert file_elements is not None\n        assert len(file_elements) >= 1\n\n    def test_fails_on_broken_regex(self):\n        command = Command()\n        with pytest.raises(OptionError, match=r\"^option 
--generated: cannot parse pattern for regular repression.*\"):\n            command.set_generated_regexps(\"[regex](\", \"option --generated\")\n\n    def test_can_use_chardet_for_encoding(self):\n        command = Command()\n        command.set_encodings(\"chardet\")\n        command.set_source_patterns(PYGOUNT_SOURCE_FOLDER)\n        command.execute()\n\n\nclass PygountCommandTest(TempFolderTest):\n    def test_can_show_help(self):\n        with pytest.raises(SystemExit) as error_info:\n            command.pygount_command([\"--help\"])\n        assert error_info.value.code == 0\n\n    def test_can_show_version(self):\n        with pytest.raises(SystemExit) as error_info:\n            command.pygount_command([\"--version\"])\n        assert error_info.value.code == 0\n\n    def test_fails_on_unknown_encoding(self):\n        with pytest.raises(SystemExit) as error_info:\n            command.pygount_command([\"--encoding\", \"no_such_encoding\", tempfile.gettempdir()])\n        assert error_info.value.code == 2\n\n    def test_fails_on_unknown_format(self):\n        with pytest.raises(SystemExit) as error_info:\n            command.pygount_command([\"--format\", \"no_such_encoding\", tempfile.gettempdir()])\n        assert error_info.value.code == 2\n\n    def test_fails_on_broken_regex_pattern(self):\n        exit_code = command.pygount_command([\"--generated\", \"[regex](\", tempfile.gettempdir()])\n        assert exit_code == 1\n\n    def test_can_analyze_pygount_setup_py(self):\n        pygount_setup_py_path = os.path.join(PYGOUNT_PROJECT_FOLDER, \"setup.py\")\n        exit_code = command.pygount_command([\"--verbose\", pygount_setup_py_path])\n        assert exit_code == 0\n\n    def test_can_analyze_pygount_source_code(self):\n        exit_code = command.pygount_command([\"--verbose\", PYGOUNT_SOURCE_FOLDER])\n        assert exit_code == 0\n\n    def test_can_detect_generated_code(self):\n        generated_code_path = os.path.join(self.tests_temp_folder, 
\"generated.py\")\n        with open(generated_code_path, \"w\", encoding=\"utf-8\") as generated_code_file:\n            generated_code_file.write(\n                \"# Generated with pygount.test_command.PygountCommandTest.test_can_detect_generated_code.\\n\"\n            )\n            generated_code_file.write(\"# Do not edit!\\n\")\n            generated_code_file.write(\"print('hello World')\\n\")\n        cloc_xml_path = os.path.join(self.tests_temp_folder, \"cloc.xml\")\n        exit_code = command.pygount_command(\n            [\"--verbose\", \"--format\", \"cloc-xml\", \"--out\", cloc_xml_path, generated_code_path]\n        )\n        assert exit_code == 0\n        assert os.path.exists(cloc_xml_path)\n        cloc_xml_root = ElementTree.parse(cloc_xml_path)\n        file_elements = cloc_xml_root.findall(\"files/file[@language='__generated__']\")\n        assert file_elements is not None\n        assert len(file_elements) >= 1\n\n    def test_can_detect_generated_code_with_own_pattern(self):\n        generiert_py_path = os.path.join(self.tests_temp_folder, \"generiert.py\")\n        with open(generiert_py_path, \"w\", encoding=\"utf-8\") as generiert_py_file:\n            generiert_py_file.write(\n                \"# Generiert mit pygount.test_command.PygountCommandTest.\"\n                \"test_can_detect_generated_code_with_own_pattern()\\n\"\n            )\n            generiert_py_file.write(\"print('hello World')\\n\")\n        cloc_xml_path = os.path.join(self.tests_temp_folder, \"cloc.xml\")\n        exit_code = command.pygount_command(\n            [\n                \"--verbose\",\n                \"--format=cloc-xml\",\n                \"--generated=[regex](?i).*generiert\",\n                \"--out\",\n                cloc_xml_path,\n                generiert_py_path,\n            ]\n        )\n        assert exit_code == 0\n        assert os.path.exists(cloc_xml_path)\n        cloc_xml_root = ElementTree.parse(cloc_xml_path)\n        
file_elements = cloc_xml_root.findall(\"files/file[@language='__generated__']\")\n        assert file_elements is not None\n        assert len(file_elements) >= 1\n\n    def test_can_analyze_pygount_source_code_as_cloc_xml(self):\n        cloc_xml_path = os.path.join(self.tests_temp_folder, \"cloc.xml\")\n        exit_code = command.pygount_command(\n            [\"--verbose\", \"--format\", \"cloc-xml\", \"--out\", cloc_xml_path, PYGOUNT_SOURCE_FOLDER]\n        )\n        assert exit_code == 0\n        assert os.path.exists(cloc_xml_path)\n        cloc_xml_root = ElementTree.parse(cloc_xml_path)\n        file_elements = cloc_xml_root.findall(\"files/file\")\n        assert file_elements is not None\n        assert len(file_elements) >= 1\n\n    def test_can_analyze_pygount_source_code_as_json(self):\n        pygount_json_path = os.path.join(self.tests_temp_folder, \"pygount.json\")\n        exit_code = command.pygount_command(\n            [\"--verbose\", \"--format\", \"json\", \"--out\", pygount_json_path, PYGOUNT_SOURCE_FOLDER]\n        )\n        assert exit_code == 0\n        assert os.path.exists(pygount_json_path)\n        with open(pygount_json_path, encoding=\"utf-8\") as pygount_json_file:\n            json_map = json.load(pygount_json_file)\n        assert json_map.get(\"pygountVersion\") == pygount.__version__\n        assert json_map.get(\"formatVersion\") == JSON_FORMAT_VERSION\n        assert \"files\" in json_map\n        assert \"languages\" in json_map\n        assert \"runtime\" in json_map\n        assert \"summary\" in json_map\n\n    def test_can_detect_duplicates(self):\n        source_code = \"# Duplicate source\\nprint('duplicate code')\\n\"\n        original_path = os.path.join(self.tests_temp_folder, \"original.py\")\n        with open(original_path, \"w\") as original_file:\n            original_file.write(source_code)\n        duplicate_path = os.path.join(self.tests_temp_folder, \"duplicate.py\")\n        with open(duplicate_path, 
\"w\") as duplicate_file:\n            duplicate_file.write(source_code)\n        cloc_xml_path = os.path.join(self.tests_temp_folder, \"cloc.xml\")\n        exit_code = command.pygount_command(\n            [\"--verbose\", \"--format\", \"cloc-xml\", \"--out\", cloc_xml_path, original_path, duplicate_path]\n        )\n        assert exit_code == 0\n        assert os.path.exists(cloc_xml_path)\n        cloc_xml_root = ElementTree.parse(cloc_xml_path)\n        file_elements = cloc_xml_root.findall(\"files/file[@language='__duplicate__']\")\n        assert file_elements is not None\n        assert len(file_elements) == 1\n\n    def test_can_accept_duplicates(self):\n        source_code = \"# Duplicate source\\nprint('duplicate code')\\n\"\n        original_path = os.path.join(self.tests_temp_folder, \"original.py\")\n        with open(original_path, \"w\") as original_file:\n            original_file.write(source_code)\n        duplicate_path = os.path.join(self.tests_temp_folder, \"duplicate.py\")\n        with open(duplicate_path, \"w\") as duplicate_file:\n            duplicate_file.write(source_code)\n        cloc_xml_path = os.path.join(self.tests_temp_folder, \"cloc.xml\")\n        exit_code = command.pygount_command(\n            [\"--duplicates\", \"--verbose\", \"--format\", \"cloc-xml\", \"--out\", cloc_xml_path, original_path, duplicate_path]\n        )\n        assert exit_code == 0\n        assert os.path.exists(cloc_xml_path)\n        cloc_xml_root = ElementTree.parse(cloc_xml_path)\n        file_elements = cloc_xml_root.findall(\"files/file[@language='__duplicate__']\")\n        assert file_elements is not None\n        assert len(file_elements) == 0\n\n    def test_can_write_all_output_formats(self):\n        for output_format in VALID_OUTPUT_FORMATS:\n            exit_code = command.pygount_command([\"--format\", output_format, PYGOUNT_SOURCE_FOLDER])\n            self.assertEqual(exit_code, 0)\n\n    def test_can_merge_embedded_languages(self):\n    
    test_html_django_path = self.create_temp_file(\n            \"some.html\",\n            [\"<!DOCTYPE html>\", \"{% load i18n %}\", '<html lang=\"{{ language_code }}\" />'],\n        )\n        cloc_xml_path = os.path.join(self.tests_temp_folder, \"cloc.xml\")\n        exit_code = command.pygount_command(\n            [\"--merge-embedded-languages\", \"--format\", \"cloc-xml\", \"--out\", cloc_xml_path, test_html_django_path]\n        )\n        assert exit_code == 0\n        assert os.path.exists(cloc_xml_path)\n        cloc_xml_root = ElementTree.parse(cloc_xml_path)\n        file_elements = cloc_xml_root.findall(\"files/file[@language='HTML']\")\n        assert file_elements is not None\n        assert len(file_elements) == 1\n"
  },
  {
    "path": "tests/test_common.py",
    "content": "\"\"\"\nTests for :py:mod:`pygount.common` module.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport re\n\nimport pytest\n\nimport pygount.common\nfrom pygount.common import matching_regex\n\n\ndef test_can_build_str():\n    error_without_source = pygount.common.OptionError(\"test\")\n    assert str(error_without_source) == \"test\"\n\n    error_with_source = pygount.common.OptionError(\"test\", \"some_file.txt\")\n    assert str(error_with_source) == \"some_file.txt: test\"\n\n\ndef test_can_match_from_regex():\n    regex = pygount.common.regex_from(re.compile(r\"a\\d+b\"))\n    assert regex.match(\"a123b\") is not None\n    assert regex.match(\"ab\") is None\n\n\ndef test_can_match_from_regex_pattern():\n    regex = pygount.common.regex_from(r\"a\\d+b\")\n    assert regex.match(\"a123b\") is not None\n    assert regex.match(\"ab\") is None\n\n\ndef test_can_match_from_shell_pattern():\n    regex = pygount.common.regex_from(\"*a[0-9]?*b*\", True)\n    assert regex.match(\"a123b\") is not None\n    assert regex.match(\"ab\") is None\n\n\ndef test_can_match_single_regex_from_shell_pattern():\n    regexes = pygount.common.regexes_from(\"*.py\")\n    assert len(regexes) == 1\n    assert regexes[0].match(\"some.py\") is not None\n    assert regexes[0].match(\"some.bat\") is None\n\n\ndef test_can_match_single_regex():\n    regexes = pygount.common.regexes_from(pygount.common.REGEX_PATTERN_PREFIX + r\"^.+\\.py$\")\n    assert len(regexes) == 1\n    assert regexes[0].match(\"some.py\") is not None\n    assert regexes[0].match(\"some.bat\") is None\n\n\ndef test_can_match_regex_from_multiple_regex_patterns():\n    regexes = pygount.common.regexes_from(pygount.common.REGEX_PATTERN_PREFIX + r\"x, abc, ^.+\\.py$\")\n    assert len(regexes) == 3\n    assert regexes[0].match(\"some.py\") is None\n    assert regexes[1].match(\"some.py\") is None\n    assert regexes[2].match(\"some.py\") 
is not None\n\n\ndef test_can_match_regex_from_multiple_default_shell_patterns():\n    regexes = pygount.common.regexes_from(\n        pygount.common.REGEX_PATTERN_PREFIX + pygount.common.ADDITIONAL_PATTERN + r\"x\", \"abc, *.py\"\n    )\n    assert len(regexes) == 3\n    assert regexes[0].match(\"some.py\") is None\n    assert regexes[1].match(\"some.py\") is None\n    assert regexes[2].match(\"some.py\") is not None\n    assert regexes[0].match(\"x\") is not None\n\n\ndef test_can_represent_text_as_list():\n    assert pygount.common.as_list(\"\") == []\n    assert pygount.common.as_list(\"a\") == [\"a\"]\n    assert pygount.common.as_list(\"abc,d, e\") == [\"abc\", \"d\", \"e\"]\n    assert pygount.common.as_list(\",,,,\") == []\n\n\ndef test_can_represent_iterable_as_list():\n    assert pygount.common.as_list([]) == []\n    assert pygount.common.as_list([\"a\", 1, None]) == [\"a\", 1, None]\n    assert pygount.common.as_list(()) == []\n    assert pygount.common.as_list(range(3)) == [0, 1, 2]\n\n\n@pytest.mark.parametrize(\n    \"text,patterns,expected_regex_index\",\n    [\n        (\"some\", [], -1),\n        (\"some\", [\"some\"], 0),\n        (\"some\", [\"other\"], -1),\n        (\"some\", [\"other\", \"some\"], 1),\n        (\"some\", [\"s.+\"], 0),\n        (\"some\", [\".*T.*\"], -1),\n    ],\n)\ndef test_can_compute_matching_regex(text: str, patterns: list[str], expected_regex_index: int):\n    regexes = [re.compile(pattern) for pattern in patterns]\n    regex = matching_regex(text, regexes)\n    regex_index = regexes.index(regex) if regex is not None else -1\n    assert regex_index == expected_regex_index\n\n\ndef test_can_convert_empty_text_to_lines():\n    assert list(pygount.common.lines(\"\")) == []\n\n\ndef test_can_convert_single_letter_to_lines():\n    assert list(pygount.common.lines(\"a\")) == [\"a\"]\n\n\ndef test_can_convert_single_letter_with_newline_to_lines():\n    assert list(pygount.common.lines(\"a\\n\")) == [\"a\"]\n\n\ndef 
test_can_convert_multiple_lines():\n    assert list(pygount.common.lines(\"a\\nbc\")) == [\"a\", \"bc\"]\n    assert list(pygount.common.lines(\"a\\nbc\\n\")) == [\"a\", \"bc\"]\n\n\ndef test_can_convert_empty_lines():\n    assert list(pygount.common.lines(\"\\n\\n\\n\")) == [\"\", \"\", \"\"]\n\n\ndef test_can_compute_mapped_repr():\n    class Dummy:\n        pass\n\n    assert pygount.common.mapped_repr(Dummy(), {}) == \"Dummy()\"\n    assert (\n        pygount.common.mapped_repr(Dummy(), {\"some\": \"such\", \"other\": 1, \"whatever\": True})\n        == \"Dummy(some=such, other=1, whatever=True)\"\n    )\n"
  },
  {
    "path": "tests/test_encoding.py",
    "content": "\"\"\"\nTests for encoding related functions.\n\"\"\"\n\n# Copyright (c) 2016-2025, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nfrom tempfile import NamedTemporaryFile\n\nimport pytest\n\nfrom pygount.analysis import _BOM_TO_ENCODING_MAP, encoding_for, encoding_from_possible_magic_comment, is_binary_file\n\nfrom ._common import temp_binary_file, temp_source_file\n\n_ENCODING_TO_BOM_MAP = {encoding: bom for bom, encoding in _BOM_TO_ENCODING_MAP.items()}\n_TEST_CODE = \"x = '\\u00fd \\u20ac'\"\n\n\n@pytest.mark.parametrize(\n    \"ascii_header\",\n    [\n        \"# encoding: cp1252\",\n        \"# coding: cp1252\",\n        \"# -*- coding: cp1252 -*-\",\n        \"# eNcOdInG: cp1252\",\n        \"## encoding: cp1252\",\n        \"#encoding:cp1252\",\n        \"# -*- coding: cp1252; mode: python; -*-\"  # Emacs modeline\n        \"/* coding: cp1252 */\"  # C\n        \"{ coding: cp1252 }\"  # Pascal\n        \"REM coding: cp1252\",  # Basic\n    ],\n)\ndef test_can_detect_encoding_from_magic_comments(ascii_header: str):\n    assert encoding_from_possible_magic_comment(ascii_header) == \"cp1252\"\n\n\n@pytest.mark.parametrize(\n    \"ascii_header\",\n    [\n        \"\",\n        \"    \",\n        \" # encoding: cp1252\",  # Leading white space\n        \"# encoding: !$%&\",\n        \"-*- coding: cp1252 -*-\",  # Not a comment\n        \"encoding: cp1252\",\n        '{\"x\":\"encoding: cp1252\"}',\n    ],\n)\ndef test_can_ignore_encoding_from_magic_comments(ascii_header: str):\n    assert encoding_from_possible_magic_comment(ascii_header) is None\n\n\n@pytest.mark.parametrize(\"encoding\", _BOM_TO_ENCODING_MAP.values())\ndef test_can_detect_bom_encodings(encoding: str):\n    _test_can_detect_bom_encoding(encoding)\n\n\ndef _test_can_detect_bom_encoding(encoding: str):\n    with NamedTemporaryFile(mode=\"wb+\", suffix=\"txt\") as test_file:\n        if encoding != \"utf-8-sig\":\n            bom = 
_ENCODING_TO_BOM_MAP[encoding]\n            test_file.write(bom)\n        test_file.write(_TEST_CODE.encode(encoding))\n        test_file.flush()\n        test_file.seek(0)\n        actual_encoding = encoding_for(test_file.name)\n    assert actual_encoding == encoding\n\n\n@pytest.mark.parametrize(\"encoding\", [\"cp1252\", \"utf-8\"])\ndef test_can_detect_plain_encoding(encoding: str):\n    with temp_source_file(\"txt\", _TEST_CODE, encoding=encoding) as test_file:\n        actual_encoding = encoding_for(test_file.name)\n        assert actual_encoding == encoding\n\n\ndef test_can_detect_xml_prolog():\n    encoding = \"iso-8859-15\"\n    xml_code = f'<?xml encoding=\"{encoding}\" standalone=\"yes\"?><some>{_TEST_CODE}</some>'\n    with temp_source_file(\"xml\", [xml_code], encoding=encoding) as test_file:\n        actual_encoding = encoding_for(test_file.name)\n    assert actual_encoding == encoding\n\n\ndef test_can_detect_magic_comment():\n    encoding = \"iso-8859-15\"\n    lines = [\"#!/usr/bin/python\", f\"# -*- coding: {encoding} -*-\", _TEST_CODE]\n    with temp_source_file(\"txt\", lines, encoding=encoding) as test_file:\n        actual_encoding = encoding_for(test_file.name)\n    assert actual_encoding == encoding\n\n\ndef test_can_detect_automatic_encoding_for_empty_source():\n    with temp_binary_file(b\"\") as test_file:\n        actual_encoding = encoding_for(test_file.name)\n    assert actual_encoding == \"utf-8\"\n\n\ndef test_can_detect_chardet_encoding():\n    test_path = __file__\n    actual_encoding = encoding_for(test_path)\n    assert actual_encoding == \"utf-8\"\n\n\ndef test_can_detect_utf8_when_cp1252_would_fail():\n    # Write closing double quote in UTF-8, which contains 0x9d,\n    # which fails when read as CP1252.\n    content = b\"\\xe2\\x80\\x9d\"\n    with temp_binary_file(content) as test_file:\n        actual_encoding = encoding_for(test_file.name, encoding=\"automatic\", fallback_encoding=None)\n        assert actual_encoding == 
\"utf-8\"\n        actual_encoding = encoding_for(test_file.name, encoding=\"automatic\", fallback_encoding=\"cp1252\")\n        assert actual_encoding == \"cp1252\"\n\n\ndef test_can_use_hardcoded_encoding():\n    with temp_source_file(\"txt\", \"\\N{EURO SIGN}\", encoding=\"cp1252\") as test_file:\n        test_path = test_file.name\n        actual_encoding = encoding_for(test_path, \"utf-8\")\n        assert actual_encoding == \"utf-8\"\n        # Make sure that we cannot actually read the file using the hardcoded but wrong encoding.\n        with open(test_path, encoding=actual_encoding) as broken_test_file, pytest.raises(UnicodeDecodeError):\n            broken_test_file.read()\n\n\ndef test_can_detect_binary_with_zero_byte():\n    with temp_binary_file(b\"hello\\0world\") as binary_file:\n        assert is_binary_file(binary_file.name)\n\n\ndef test_can_detect_utf16_as_non_binary():\n    with NamedTemporaryFile(encoding=\"utf-16\", mode=\"w+\") as utf16_file:\n        utf16_file.write(\"Hello world!\")\n        utf16_file.flush()\n        utf16_file.seek(0)\n        assert not is_binary_file(utf16_file.name)\n"
  },
  {
    "path": "tests/test_git_storage.py",
    "content": "from pathlib import Path\n\nfrom pygount.git_storage import GitStorage, git_remote_url_and_revision_if_any\n\n\ndef test_can_extract_git_remote_url_and_revision_if_any():\n    assert git_remote_url_and_revision_if_any(\"hello\") == (None, None)\n    assert git_remote_url_and_revision_if_any(\"git@github.com:roskakori/pygount.git/v1.5.1\") == (\n        \"git@github.com:roskakori/pygount.git\",\n        \"v1.5.1\",\n    )\n    assert git_remote_url_and_revision_if_any(\"git@github.com:roskakori/pygount.git\") == (\n        \"git@github.com:roskakori/pygount.git\",\n        None,\n    )\n    assert git_remote_url_and_revision_if_any(\"git@github.com:roskakori/pygount.git/\") == (\n        \"git@github.com:roskakori/pygount.git\",\n        None,\n    )\n    assert git_remote_url_and_revision_if_any(\"\") == (None, None)\n\n\ndef test_can_extract_and_close_and_find_files_from_cloned_git_remote_url_with_revision():\n    remote_url, revision = git_remote_url_and_revision_if_any(\"https://github.com/roskakori/pygount.git/v0.1\")\n    assert remote_url is not None\n    git_storage = GitStorage(remote_url, revision)\n    pyproject_path = Path(git_storage.temp_folder) / \"pyproject.toml\"\n    readme_path = Path(git_storage.temp_folder) / \"README.rst\"\n    try:\n        git_storage.extract()\n        assert readme_path.exists()\n        assert not pyproject_path.exists()\n    finally:\n        git_storage.close()\n    assert not readme_path.exists()\n"
  },
  {
    "path": "tests/test_lexers.py",
    "content": "\"\"\"\nTests for additional lexers for pygount.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\n\nfrom pygments import token\n\nimport pygount.lexers\n\n\ndef test_can_lex_idl():\n    lexer = pygount.lexers.IdlLexer()\n    text = \"\\n\".join(\n        [\n            \"/* some\",\n            \" * comment */\",\n            \"module HelloApp {\",\n            \"  interface Hello {\",\n            \"    string sayHello(); // Be friendly!\",\n            \"  };\",\n            \"};\",\n        ]\n    )\n    text_tokens = list(lexer.get_tokens(text))\n    assert text_tokens == [\n        (token.Token.Comment.Multiline, \"/* some\\n * comment */\"),\n        (token.Token.Text.Whitespace, \"\\n\"),\n        (token.Token.Keyword.Declaration, \"module\"),\n        (token.Token.Text, \" \"),\n        (token.Token.Name.Class, \"HelloApp\"),\n        (token.Token.Text.Whitespace, \" \"),\n        (token.Token.Punctuation, \"{\"),\n        (token.Token.Text.Whitespace, \"\\n\"),\n        (token.Token.Text.Whitespace, \"  \"),\n        (token.Token.Keyword.Declaration, \"interface\"),\n        (token.Token.Text, \" \"),\n        (token.Token.Name.Class, \"Hello\"),\n        (token.Token.Text.Whitespace, \" \"),\n        (token.Token.Punctuation, \"{\"),\n        (token.Token.Text.Whitespace, \"\\n\"),\n        (token.Token.Text.Whitespace, \"    \"),\n        (token.Token.Name, \"string\"),\n        (token.Token.Text.Whitespace, \" \"),\n        (token.Token.Name.Function, \"sayHello\"),\n        (token.Token.Punctuation, \"(\"),\n        (token.Token.Punctuation, \")\"),\n        (token.Token.Punctuation, \";\"),\n        (token.Token.Text.Whitespace, \" \"),\n        (token.Token.Comment.Single, \"// Be friendly!\"),\n        (token.Token.Text.Whitespace, \"\\n\"),\n        (token.Token.Text.Whitespace, \"  \"),\n        (token.Token.Punctuation, \"}\"),\n        (token.Token.Punctuation, \";\"),\n        (token.Token.Text.Whitespace, \"\\n\"),\n        (token.Token.Punctuation, \"}\"),\n        (token.Token.Punctuation, \";\"),\n        (token.Token.Text.Whitespace, \"\\n\"),\n    ]\n\n\ndef test_can_lex_m4():\n    lexer = pygount.lexers.MinimalisticM4Lexer()\n    text = \"\"\n    text += \"#\\n\"\n    text += \"# comment\\n\"\n    text += \"define(FRUIT, apple) # Healthy stuff!\\n\"\n    text += \"Eat some FRUIT!\"\n    text_tokens = list(lexer.get_tokens(text))\n    assert text_tokens == [\n        (token.Token.Comment.Single, \"#\\n\"),\n        (token.Token.Comment.Single, \"# comment\\n\"),\n        (token.Token.Text, \"define(FRUIT, apple) \"),\n        (token.Token.Comment.Single, \"# Healthy stuff!\\n\"),\n        (token.Token.Text, \"Eat some FRUIT!\\n\"),\n    ]\n\n\ndef test_can_lex_vbscript():\n    lexer = pygount.lexers.MinimalisticVBScriptLexer()\n    text = \"\".join([\"' comment\\n\", 'WScript.Echo \"hello world!\"'])\n    text_tokens = list(lexer.get_tokens(text))\n    assert text_tokens == [\n        (token.Token.Comment.Single, \"' comment\\n\"),\n        (token.Token.Text, 'WScript.Echo \"hello world!\"\\n'),\n    ]\n\n\ndef test_can_lex_webfocus():\n    lexer = pygount.lexers.MinimalisticWebFocusLexer()\n    text = \"\".join([\"-*\\n\", \"-* comment\\n\", \"-set &some='text';\\n\", \"table file some print * end;\"])\n    text_tokens = list(lexer.get_tokens(text))\n    assert text_tokens == [\n        (token.Token.Comment.Single, \"-*\\n\"),\n        (token.Token.Comment.Single, \"-* comment\\n\"),\n        (token.Token.Text, \"-set &some='text';\\n\"),\n        (token.Token.Text, \"table file some print * end;\\n\"),\n    ]\n\n\ndef test_can_lex_plain_text():\n    lexer = pygount.lexers.PlainTextLexer()\n    text = \"\".join(\n        [\n            \"a\\n\",  # line with text\n            \"\\n\",  # empty line\n            \" \\t \\n\",  # line containing only white space\n            \"  \",  # trailing white space line without newline character\n        ]\n    )\n    text_tokens = list(lexer.get_tokens(text))\n    assert text_tokens == [(token.Token.Comment.Single, \"a\\n\"), (token.Token.Text, \"\\n \\t \\n  \\n\")]\n"
  },
  {
    "path": "tests/test_summary.py",
    "content": "\"\"\"\nTests to summarize analyses of multiple source codes.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nfrom pygount.analysis import SourceAnalysis, SourceState\nfrom pygount.summary import LanguageSummary, ProjectSummary\n\n\ndef test_can_repr_language_summary():\n    language_summary = LanguageSummary(\"Python\")\n    language_summary.add(SourceAnalysis(\"some.py\", \"Python\", \"some\", 2, 3, 4, 5, SourceState.analyzed))\n    expected_language_summary_repr = (\n        \"LanguageSummary(language='Python', file_count=1, \"\n        \"code_count=2, documentation_count=3, empty_count=4, string_count=5)\"\n    )\n    assert repr(language_summary) == expected_language_summary_repr\n    assert repr(language_summary) == str(language_summary)\n\n\ndef test_can_repr_pseudo_language_summary():\n    language_summary = LanguageSummary(\"__empty__\")\n    language_summary.add(SourceAnalysis(\"some.py\", \"__empty__\", \"some\", 0, 0, 0, 0, SourceState.empty))\n    expected_language_summary_repr = \"LanguageSummary(language='__empty__', file_count=1)\"\n    assert repr(language_summary) == expected_language_summary_repr\n    assert repr(language_summary) == str(language_summary)\n\n\ndef test_can_summarize_project_with_multiple_files_of_same_language():\n    source_analyses = (\n        SourceAnalysis(\"some.py\", \"Python\", \"some\", 300, 70, 4, 2, SourceState.analyzed),\n        SourceAnalysis(\"other.py\", \"Python\", \"some\", 700, 30, 6, 3, SourceState.analyzed),\n    )\n\n    project_summary = ProjectSummary()\n    for source_analysis in source_analyses:\n        project_summary.add(source_analysis)\n\n    assert set(project_summary.language_to_language_summary_map.keys()) == {\"Python\"}\n    assert project_summary.total_file_count == 2\n    assert project_summary.total_code_count == 1000\n    assert project_summary.total_documentation_count == 100\n    assert 
project_summary.total_empty_count == 10\n    assert project_summary.total_string_count == 5\n\n\ndef test_can_summarize_project_with_multiple_files_of_different_languages():\n    source_analyses = (\n        SourceAnalysis(\"some.py\", \"Python\", \"some\", 1000, 100, 10, 3, SourceState.analyzed),\n        SourceAnalysis(\"some.sh\", \"Bash\", \"some\", 200, 20, 5, 2, SourceState.analyzed),\n    )\n\n    project_summary = ProjectSummary()\n    for source_analysis in source_analyses:\n        project_summary.add(source_analysis)\n\n    assert set(project_summary.language_to_language_summary_map.keys()) == {\"Bash\", \"Python\"}\n    assert project_summary.total_file_count == 2\n    assert project_summary.total_code_count == 1200\n    assert project_summary.total_documentation_count == 120\n    assert project_summary.total_empty_count == 15\n    assert project_summary.total_string_count == 5\n\n    assert (\n        repr(project_summary)\n        == \"ProjectSummary(total_file_count=2, total_line_count=1340, languages=['Bash', 'Python'])\"\n    )\n\n\ndef test_can_summarize_project_with_pseudo_languages():\n    source_analyses = (\n        SourceAnalysis(\"empty.py\", \"__empty__\", \"some\", 0, 0, 0, 0, SourceState.empty),\n        SourceAnalysis(\"generated.py\", \"__generated__\", \"some\", 1, 2, 3, 4, SourceState.generated, \"generated by test\"),\n        SourceAnalysis(\"binary.bin\", \"__binary__\", \"some\", 0, 0, 0, 0, SourceState.binary),\n    )\n    expected_languages = {source_analysis.language for source_analysis in source_analyses}\n\n    project_summary = ProjectSummary()\n    for source_analysis in source_analyses:\n        project_summary.add(source_analysis)\n\n    assert project_summary.total_file_count == 3\n    assert set(project_summary.language_to_language_summary_map.keys()) == expected_languages\n    assert project_summary.total_code_count == 0\n    assert project_summary.total_documentation_count == 0\n    assert 
project_summary.total_empty_count == 0\n    assert project_summary.total_string_count == 0\n\n    assert repr(project_summary) == (\n        \"ProjectSummary(total_file_count=3, total_line_count=0, languages=['__binary__', '__empty__', '__generated__'])\"\n    )\n\n\ndef test_can_repr_empty_project_summary():\n    project_summary = ProjectSummary()\n    assert repr(project_summary) == \"ProjectSummary(total_file_count=0, total_line_count=0, languages=[])\"\n    assert repr(project_summary) == str(project_summary)\n"
  },
  {
    "path": "tests/test_write.py",
    "content": "\"\"\"\nTest to write results of pygount analyses.\n\"\"\"\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport io\nimport re\nimport tempfile\nfrom pathlib import Path\nfrom xml.etree import ElementTree\n\nimport pytest\n\nfrom pygount import analysis, write\n\nfrom ._common import TempFolderTest\n\n\ndef test_can_collect_totals():\n    source_analyses = (\n        analysis.SourceAnalysis(\"some.py\", \"Python\", \"some\", 1, 2, 3, 4, analysis.SourceState.analyzed, None),\n        analysis.SourceAnalysis(\"other.py\", \"Python\", \"some\", 10, 20, 30, 40, analysis.SourceState.analyzed, None),\n    )\n    with (\n        tempfile.NamedTemporaryFile(\"w\", encoding=\"utf-8\", prefix=\"pygount_\", suffix=\".tmp\") as target_stream,\n        write.BaseWriter(target_stream) as writer,\n    ):\n        for source_analysis in source_analyses:\n            writer.add(source_analysis)\n    assert writer.project_summary.total_file_count == 2\n    assert writer.project_summary.total_line_count == 110\n    assert writer.duration_in_seconds > 0\n    assert writer.lines_per_second > writer.files_per_second\n\n\ndef test_can_write_cloc_xml():\n    source_analyses = (\n        analysis.SourceAnalysis(\"some.py\", \"Python\", \"some\", 1, 2, 3, 4, analysis.SourceState.analyzed, None),\n        analysis.SourceAnalysis(\"other.py\", \"Python\", \"some\", 10, 20, 30, 40, analysis.SourceState.analyzed, None),\n    )\n    with io.StringIO() as target_stream:\n        with write.ClocXmlWriter(target_stream) as writer:\n            for source_analysis in source_analyses:\n                writer.add(source_analysis)\n        xml_data = target_stream.getvalue()\n        assert len(xml_data) >= 1\n    with io.StringIO(xml_data) as cloc_xml_stream:\n        cloc_results_root = ElementTree.parse(cloc_xml_stream)\n    file_elements = cloc_results_root.findall(\"files/file\")\n    assert file_elements is not None\n  
  assert len(file_elements) == len(source_analyses)\n\n\ndef test_can_compute_digit_width():\n    assert write.digit_width(0) == 1\n    assert write.digit_width(1) == 1\n    assert write.digit_width(9) == 1\n    assert write.digit_width(999) == 3\n    assert write.digit_width(1000) == 4\n\n\n_LINE_WORD_REGEX = re.compile(r\"[\\w\\\\.]+\")  # HACK: For test assume all language names are \"\\w+\".\n\n\nclass _LineData:\n    def __init__(self, line: str):\n        line_parts = _LINE_WORD_REGEX.findall(line)\n        self.language = line_parts[0]\n        self.file_count = int(line_parts[1])\n        self.file_percentage = float(line_parts[2])\n        self.code_count = int(line_parts[3])\n        self.code_percentage = float(line_parts[4])\n        self.comment_count = int(line_parts[5])\n        self.comment_percentage = float(line_parts[6])\n\n\nclass SummaryWriterTest(TempFolderTest):\n    def test_can_write_summary(self):\n        source_analyses = (\n            analysis.SourceAnalysis(\"script.sh\", \"Bash\", \"some\", 200, 25, 1, 2, analysis.SourceState.analyzed, None),\n            analysis.SourceAnalysis(\"some.py\", \"Python\", \"some\", 300, 45, 3, 4, analysis.SourceState.analyzed, None),\n            analysis.SourceAnalysis(\"other.py\", \"Python\", \"some\", 500, 30, 5, 6, analysis.SourceState.analyzed, None),\n        )\n        lines = self._summary_lines_for(source_analyses)\n        assert len(lines) == 8, f\"lines={lines}\"\n\n        python_data = _LineData(lines[3])\n        assert python_data.language == \"Python\"\n        assert python_data.file_count == 2\n        assert python_data.file_percentage == pytest.approx(66.7)\n        assert python_data.code_count == 800\n        assert python_data.code_percentage == pytest.approx(89.6)\n        assert python_data.comment_count == 75\n        assert python_data.comment_percentage == pytest.approx(8.4)\n\n        bash_data = _LineData(lines[4])\n        assert bash_data.language == \"Bash\"\n        
assert bash_data.file_count == 1\n        assert bash_data.code_count == 200\n        assert bash_data.code_percentage == pytest.approx(87.7)\n        assert bash_data.comment_count == 25\n        assert bash_data.comment_percentage == pytest.approx(11.0)\n\n        sum_total_data = _LineData(lines[-2])\n        assert sum_total_data.file_count == 3\n        assert sum_total_data.file_percentage == pytest.approx(100.0)\n        assert sum_total_data.code_count == 1000\n        assert sum_total_data.code_percentage == pytest.approx(89.2)\n        assert sum_total_data.comment_count == 100\n        assert sum_total_data.comment_percentage == pytest.approx(8.9)\n\n    def _summary_lines_for(self, source_analyses):\n        # NOTE: We need to write to a file because the lines containing the\n        # actual data are only available during close() at which point they\n        # would not be accessible to StringIO.getvalue().\n        summary_path = Path(self.tests_temp_folder, \"summary.tmp\")\n        with summary_path.open(\"w\", encoding=\"utf-8\") as summary_file, write.SummaryWriter(summary_file) as writer:\n            for source_analysis in source_analyses:\n                writer.add(source_analysis)\n        return summary_path.read_text(\"utf-8\").splitlines()\n"
  },
  {
    "path": "tests/test_xmldialect.py",
    "content": "\"\"\"\nTests for function to obtain the language dialect used by XML source code.\n\"\"\"\n\nimport pytest\n\n# Copyright (c) 2016-2024, Thomas Aglassinger.\n# All rights reserved. Distributed under the BSD License.\nimport pygount.xmldialect\nfrom pygount.xmldialect import without_xml_header\n\nEXAMPLE_ANT_CODE = \"\"\"<project name=\"hello\">\n    <target name=\"hello\">\n        <echo message=\"Hello world!\" />\n    </target>\n</project>\n\"\"\"\n\n_EXAMPLE_POM_CODE = \"\"\"<project\n  xmlns=\"http://maven.apache.org/POM/4.0.0\"\n  xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n  xsi:schemaLocation=\"http://maven.apache.org/POM/4.0.0\n  http://maven.apache.org/xsd/maven-4.0.0.xsd\">\n  <modelVersion>4.0.0</modelVersion>\n\n  <groupId>com.mycompany.app</groupId>\n  <artifactId>my-app</artifactId>\n  <version>1.0-SNAPSHOT</version>\n  <packaging>jar</packaging>\n\n  <name>Maven Quick Start Archetype</name>\n  <url>http://maven.apache.org</url>\n\n  <dependencies>\n    <dependency>\n      <groupId>junit</groupId>\n      <artifactId>junit</artifactId>\n      <version>4.8.2</version>\n      <scope>test</scope>\n    </dependency>\n  </dependencies>\n</project>\"\"\"\n\n_EXAMPLE_DOCBOOK_DTD_CODE = \"\"\"<!DOCTYPE example PUBLIC \"-//OASIS//DTD DocBook XML V4.1.2//EN\"\n    \"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd\">\n<example><title>Hello World in Python</title>\n<programlisting>\nprint('Hello World!')\n</programlisting>\n</example>\n\"\"\"\n\n_EXAMPLE_SVG_CODE = (\n    '<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\\n'\n    '<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">'\n    '<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"320\" height=\"120\" viewBox=\"0 0 320 120\">'\n    '  <rect width=\"100%\" height=\"100%\" fill=\"white\"/>'\n    '  <text x=\"50%\" y=\"50%\" text-anchor=\"middle\" dominant-baseline=\"middle\" '\n    '   
font-family=\"sans-serif\" font-size=\"32\" fill=\"black\"'\n    \"  >Hello, world!</text>\"\n    \"</svg>\"\n)\n\n\n@pytest.mark.parametrize(\n    \"xml_code,expected\",\n    [\n        (\"<some/>\", \"<some/>\"),\n        (\"<some/>\", \"<some/>\"),\n        ('<?xml version=\"1.0\"?><some/>', \"<some/>\"),\n        ('  <?xml version=\"1.0\"?><some/>', \"<some/>\"),\n        ('<?xml version=\"1.0\"?>  <some/>', \"<some/>\"),\n        ('\\n\\n<?xml version=\"1.0\"?>\\n\\n<some/>', \"<some/>\"),\n    ],\n)\ndef test_can_compute_xml_code_without_header(xml_code: str, expected: str):\n    assert without_xml_header(xml_code) == expected\n\n\ndef test_can_detect_ant():\n    assert pygount.xmldialect.xml_dialect(\"<ant>\", EXAMPLE_ANT_CODE) == \"Ant\"\n\n\ndef test_can_detect_maven():\n    assert pygount.xmldialect.xml_dialect(\"<maven>\", _EXAMPLE_POM_CODE) == \"Maven\"\n\n\ndef test_can_ignore_broken_xml():\n    assert pygount.xmldialect.xml_dialect(\"<broken>\", \"<some></other>\") is None\n\n\ndef test_can_detect_docbook_from_dtd():\n    assert pygount.xmldialect.xml_dialect(\"<docbook-dtd>\", _EXAMPLE_DOCBOOK_DTD_CODE) == \"DocBook XML\"\n\n\ndef test_can_detect_svg_from_dtd():\n    assert pygount.xmldialect.xml_dialect(\"<svg>\", _EXAMPLE_SVG_CODE) == \"SVG XML\"\n"
  }
]