Repository: Samsung/CredSweeper
Branch: main
Commit: 4fef4bedba2e
Files: 387
Total size: 16.1 MB
Directory structure:
gitextract_f9me649i/
├── LICENSE
├── README.md
├── SECURITY.md
├── action.yml
├── credsweeper/
│ ├── __init__.py
│ ├── __main__.py
│ ├── app.py
│ ├── common/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── keyword_checklist.py
│ │ ├── keyword_checklist.txt
│ │ ├── keyword_pattern.py
│ │ └── morpheme_checklist.txt
│ ├── config/
│ │ ├── __init__.py
│ │ └── config.py
│ ├── credentials/
│ │ ├── __init__.py
│ │ ├── augment_candidates.py
│ │ ├── candidate.py
│ │ ├── candidate_group_generator.py
│ │ ├── candidate_key.py
│ │ ├── credential_manager.py
│ │ └── line_data.py
│ ├── deep_scanner/
│ │ ├── __init__.py
│ │ ├── abstract_scanner.py
│ │ ├── byte_scanner.py
│ │ ├── bzip2_scanner.py
│ │ ├── crx_scanner.py
│ │ ├── csv_scanner.py
│ │ ├── deb_scanner.py
│ │ ├── deep_scanner.py
│ │ ├── docx_scanner.py
│ │ ├── eml_scanner.py
│ │ ├── encoder_scanner.py
│ │ ├── gzip_scanner.py
│ │ ├── html_scanner.py
│ │ ├── jclass_scanner.py
│ │ ├── jks_scanner.py
│ │ ├── lang_scanner.py
│ │ ├── lzma_scanner.py
│ │ ├── mxfile_scanner.py
│ │ ├── patch_scanner.py
│ │ ├── pdf_scanner.py
│ │ ├── pkcs_scanner.py
│ │ ├── png_scanner.py
│ │ ├── pptx_scanner.py
│ │ ├── rpm_scanner.py
│ │ ├── rtf_scanner.py
│ │ ├── sqlite3_scanner.py
│ │ ├── strings_scanner.py
│ │ ├── tar_scanner.py
│ │ ├── tmx_scanner.py
│ │ ├── xlsx_scanner.py
│ │ ├── xml_scanner.py
│ │ ├── zip_scanner.py
│ │ └── zlib_scanner.py
│ ├── file_handler/
│ │ ├── __init__.py
│ │ ├── abstract_provider.py
│ │ ├── analysis_target.py
│ │ ├── byte_content_provider.py
│ │ ├── content_provider.py
│ │ ├── data_content_provider.py
│ │ ├── descriptor.py
│ │ ├── diff_content_provider.py
│ │ ├── file_path_extractor.py
│ │ ├── files_provider.py
│ │ ├── patches_provider.py
│ │ ├── string_content_provider.py
│ │ ├── struct_content_provider.py
│ │ └── text_content_provider.py
│ ├── filters/
│ │ ├── __init__.py
│ │ ├── filter.py
│ │ ├── group/
│ │ │ ├── __init__.py
│ │ │ ├── general_keyword.py
│ │ │ ├── general_pattern.py
│ │ │ ├── group.py
│ │ │ ├── password_keyword.py
│ │ │ ├── token_pattern.py
│ │ │ ├── url_credentials_group.py
│ │ │ ├── weird_base36_token.py
│ │ │ └── weird_base64_token.py
│ │ ├── line_git_binary_check.py
│ │ ├── line_specific_key_check.py
│ │ ├── line_uue_part_check.py
│ │ ├── value_allowlist_check.py
│ │ ├── value_array_dictionary_check.py
│ │ ├── value_atlassian_token_check.py
│ │ ├── value_azure_token_check.py
│ │ ├── value_base32_data_check.py
│ │ ├── value_base64_data_check.py
│ │ ├── value_base64_encoded_pem_check.py
│ │ ├── value_base64_key_check.py
│ │ ├── value_base64_part_check.py
│ │ ├── value_basic_auth_check.py
│ │ ├── value_blocklist_check.py
│ │ ├── value_camel_case_check.py
│ │ ├── value_dictionary_keyword_check.py
│ │ ├── value_discord_bot_check.py
│ │ ├── value_entropy_base32_check.py
│ │ ├── value_entropy_base36_check.py
│ │ ├── value_entropy_base64_check.py
│ │ ├── value_entropy_base_check.py
│ │ ├── value_file_path_check.py
│ │ ├── value_github_check.py
│ │ ├── value_grafana_check.py
│ │ ├── value_grafana_service_check.py
│ │ ├── value_hex_number_check.py
│ │ ├── value_jfrog_token_check.py
│ │ ├── value_json_web_key_check.py
│ │ ├── value_json_web_token_check.py
│ │ ├── value_last_word_check.py
│ │ ├── value_length_check.py
│ │ ├── value_method_check.py
│ │ ├── value_morphemes_check.py
│ │ ├── value_not_allowed_pattern_check.py
│ │ ├── value_not_part_encoded_check.py
│ │ ├── value_number_check.py
│ │ ├── value_pattern_check.py
│ │ ├── value_sealed_secret_check.py
│ │ ├── value_search_check.py
│ │ ├── value_similarity_check.py
│ │ ├── value_split_keyword_check.py
│ │ ├── value_string_type_check.py
│ │ ├── value_token_base32_check.py
│ │ ├── value_token_base36_check.py
│ │ ├── value_token_base64_check.py
│ │ ├── value_token_base_check.py
│ │ └── value_token_check.py
│ ├── logger/
│ │ ├── __init__.py
│ │ └── logger.py
│ ├── main.py
│ ├── ml_model/
│ │ ├── __init__.py
│ │ ├── features/
│ │ │ ├── __init__.py
│ │ │ ├── entropy_evaluation.py
│ │ │ ├── feature.py
│ │ │ ├── file_extension.py
│ │ │ ├── has_html_tag.py
│ │ │ ├── is_secret_numeric.py
│ │ │ ├── length_of_attribute.py
│ │ │ ├── morpheme_dense.py
│ │ │ ├── rule_name.py
│ │ │ ├── rule_severity.py
│ │ │ ├── search_in_attribute.py
│ │ │ ├── word_in.py
│ │ │ ├── word_in_path.py
│ │ │ ├── word_in_postamble.py
│ │ │ ├── word_in_preamble.py
│ │ │ ├── word_in_transition.py
│ │ │ ├── word_in_value.py
│ │ │ └── word_in_variable.py
│ │ ├── ml_config.json
│ │ ├── ml_model.onnx
│ │ └── ml_validator.py
│ ├── py.typed
│ ├── rules/
│ │ ├── __init__.py
│ │ ├── config.yaml
│ │ └── rule.py
│ ├── scanner/
│ │ ├── __init__.py
│ │ ├── scan_type/
│ │ │ ├── __init__.py
│ │ │ ├── multi_pattern.py
│ │ │ ├── pem_key_pattern.py
│ │ │ ├── scan_type.py
│ │ │ └── single_pattern.py
│ │ └── scanner.py
│ ├── secret/
│ │ ├── config.json
│ │ └── log.yaml
│ └── utils/
│ ├── __init__.py
│ ├── hop_stat.py
│ ├── pem_key_detector.py
│ └── util.py
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── howto/
│ │ └── how-to-contribute.md
│ ├── make.bat
│ ├── requirements.txt
│ └── source/
│ ├── api.rst
│ ├── apps_config.rst
│ ├── conf.py
│ ├── credsweeper.common.rst
│ ├── credsweeper.config.rst
│ ├── credsweeper.credentials.rst
│ ├── credsweeper.deep_scanner.rst
│ ├── credsweeper.file_handler.rst
│ ├── credsweeper.filters.group.rst
│ ├── credsweeper.filters.rst
│ ├── credsweeper.logger.rst
│ ├── credsweeper.ml_model.features.rst
│ ├── credsweeper.ml_model.rst
│ ├── credsweeper.rst
│ ├── credsweeper.rules.rst
│ ├── credsweeper.scanner.rst
│ ├── credsweeper.scanner.scan_type.rst
│ ├── credsweeper.utils.rst
│ ├── develop.rst
│ ├── guide.rst
│ ├── how_to_contribute.rst
│ ├── index.rst
│ ├── install.rst
│ ├── overall_architecture.rst
│ └── rules_config.rst
├── experiment/
│ ├── README.md
│ ├── __init__.py
│ ├── data_loader.py
│ ├── evaluate_model.py
│ ├── features.py
│ ├── hyperparameters.py
│ ├── log_callback.py
│ ├── main.py
│ ├── main.sh
│ ├── ml_model.py
│ ├── model_config_preprocess.py
│ ├── plot.py
│ ├── prepare_data.py
│ ├── requirements.txt
│ ├── tf2onnx/
│ │ └── tf2onnx.sh
│ ├── tools/
│ │ ├── base64_test.py
│ │ ├── entropy_test.py
│ │ ├── morpheme_test.py
│ │ └── strength_test.py
│ └── train.py
├── fuzz/
│ ├── README.md
│ ├── __main__.py
│ ├── auxilary.py
│ ├── coveraging.sh
│ ├── fuzzing.sh
│ ├── minimizing.sh
│ ├── re-fuzzing.sh
│ ├── reducing.sh
│ └── requirements.txt
├── pyproject.toml
├── pytest.ini
├── requirements.txt
└── tests/
├── README.md
├── __init__.py
├── common/
│ ├── __init__.py
│ ├── test_confidence.py
│ ├── test_keyword_checklist.py
│ ├── test_keyword_pattern.py
│ ├── test_regex.py
│ └── test_severity.py
├── config/
│ ├── __init__.py
│ └── test_config.py
├── conftest.py
├── credentials/
│ ├── __init__.py
│ ├── test_augment_candidates.py
│ ├── test_credential_manager.py
│ └── test_line_data.py
├── data/
│ ├── __init__.py
│ ├── depth_3_pedantic.json
│ ├── doc.json
│ ├── no_filters_no_ml.json
│ ├── no_ml.json
│ └── output.json
├── deep_scanner/
│ ├── __init__.py
│ ├── test_abstract_scanner.py
│ ├── test_bzip2_scanner.py
│ ├── test_crx_scanner.py
│ ├── test_csv_scanner.py
│ ├── test_deb_scanner.py
│ ├── test_deep_scanner.py
│ ├── test_eml_scanner.py
│ ├── test_encoder_scanner.py
│ ├── test_gzip_scanner.py
│ ├── test_html_scanner.py
│ ├── test_jclass_scanner.py
│ ├── test_jks_scanner.py
│ ├── test_lzma_scanner.py
│ ├── test_mxfile_scanner.py
│ ├── test_pdf_scanner.py
│ ├── test_png_scanner.py
│ ├── test_rtf_scanner.py
│ ├── test_sqlite3_scanner.py
│ ├── test_strings_scanner.py
│ ├── test_struct_scanner.py
│ ├── test_tar_scanner.py
│ ├── test_tmx_scanner.py
│ ├── test_xml_scanner.py
│ ├── test_zip_scanner.py
│ └── test_zlib_scanner.py
├── file_handler/
│ ├── __init__.py
│ ├── test_byte_content_provider.py
│ ├── test_data_content_provider.py
│ ├── test_diff_content_provider.py
│ ├── test_file_path_extractor.py
│ ├── test_files_provider.py
│ ├── test_patches_provider.py
│ ├── test_string_content_provider.py
│ ├── test_struct_content_provider.py
│ ├── test_text_content_provider.py
│ ├── zip_bomb_1.py
│ └── zip_bomb_2.py
├── filters/
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_line_git_binary_check.py
│ ├── test_line_specific_key_check.py
│ ├── test_line_uue_part_check.py
│ ├── test_value_allowlist_check.py
│ ├── test_value_array_dictionary_check.py
│ ├── test_value_atlassian_token_check.py
│ ├── test_value_azure_token_check.py
│ ├── test_value_base32_data_check.py
│ ├── test_value_base64_data_check.py
│ ├── test_value_base64_key_check.py
│ ├── test_value_base64_part_check.py
│ ├── test_value_basic_auth_check.py
│ ├── test_value_blocklist_check.py
│ ├── test_value_camel_case_check.py
│ ├── test_value_dictionary_keyword_check.py
│ ├── test_value_entropy_base32_check.py
│ ├── test_value_entropy_base36_check.py
│ ├── test_value_entropy_base64_check.py
│ ├── test_value_file_path_check.py
│ ├── test_value_github_check.py
│ ├── test_value_grafana_check.py
│ ├── test_value_grafana_service_check.py
│ ├── test_value_hex_number_check.py
│ ├── test_value_json_web_key_check.py
│ ├── test_value_json_web_token_check.py
│ ├── test_value_last_word_check.py
│ ├── test_value_length_check.py
│ ├── test_value_method_check.py
│ ├── test_value_morphemes_check.py
│ ├── test_value_not_allowed_pattern.py
│ ├── test_value_not_part_encoded.py
│ ├── test_value_number_check.py
│ ├── test_value_pattern_check.py
│ ├── test_value_sealed_secret_check.py
│ ├── test_value_search_check.py
│ ├── test_value_similarity_check.py
│ ├── test_value_split_keyword_check.py
│ ├── test_value_string_type_check.py
│ ├── test_value_token_base32_check.py
│ ├── test_value_token_base36_check.py
│ ├── test_value_token_base64_check.py
│ └── test_value_token_check.py
├── ml_model/
│ ├── __init__.py
│ ├── test_features.py
│ └── test_ml_validator.py
├── rules/
│ ├── __init__.py
│ ├── common.py
│ ├── test_api.py
│ ├── test_auth.py
│ ├── test_aws_key.py
│ ├── test_aws_multi.py
│ ├── test_aws_mws_key.py
│ ├── test_credential.py
│ ├── test_dynatrace_api_token.py
│ ├── test_facebook_key.py
│ ├── test_firebase_domain.py
│ ├── test_github_classic_token.py
│ ├── test_github_fine_granted_token.py
│ ├── test_google_api_key.py
│ ├── test_google_multi.py
│ ├── test_google_oauth_key.py
│ ├── test_instagram_access_token.py
│ ├── test_jwt.py
│ ├── test_key.py
│ ├── test_mailchimp_key.py
│ ├── test_nonce.py
│ ├── test_password.py
│ ├── test_paypal_key.py
│ ├── test_pem_key.py
│ ├── test_picatic_key.py
│ ├── test_pypi_api_token.py
│ ├── test_rule.py
│ ├── test_salt.py
│ ├── test_secret.py
│ ├── test_sendgrid_api_key_token.py
│ ├── test_shopify_token.py
│ ├── test_slack_token.py
│ ├── test_slack_webhook.py
│ ├── test_square_access_token.py
│ ├── test_telegram_bot_api_token.py
│ ├── test_token.py
│ └── test_url_credentials.py
├── scanner/
│ ├── __init__.py
│ └── scan_type/
│ ├── __init__.py
│ ├── test_multipattern.py
│ └── test_pem_key_pattern.py
├── test_app.py
├── test_doc.py
├── test_git.py
├── test_main.py
├── test_utils/
│ ├── __init__.py
│ └── dummy_line_data.py
└── utils/
├── __init__.py
├── test_hop_stat.py
└── test_util.py
================================================
FILE CONTENTS
================================================
================================================
FILE: LICENSE
================================================
Copyright (c) 2021 SAMSUNG
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# CredSweeper
[](https://github.com/Samsung/CredSweeper/releases)
[](https://credsweeper.readthedocs.io/en/latest/?badge=latest)
[](LICENSE)
[](https://pypi.org/project/credsweeper/)
[](https://badge.fury.io/py/credsweeper)
[](https://github.com/Samsung/CredSweeper/actions/workflows/test.yml)
[](https://codecov.io/gh/Samsung/CredSweeper)
[](https://bestpractices.coreinfrastructure.org/projects/6055)
[](https://api.securityscorecards.dev/projects/github.com/Samsung/CredSweeper)
- [CredSweeper](#credsweeper)
- [Introduction](#introduction)
- [How To Use](#how-to-use)
- [Main Requirements](#main-requirements)
- [Installation](#installation)
- [Run](#run)
- [Config](#config)
- [Develop](#develop)
- [Tests](#tests)
- [Benchmark](#benchmark)
- [Overall Architecture](#overall-architecture)
- [Retrain Model](#retrain-model)
- [License](#license)
- [How to Get Involved](#how-to-get-involved)
- [Project Roles](#project-roles)
- [Contributor](#contributor)
- [Maintainer](#maintainer)
- [How to Contact](#how-to-contact)
## Introduction
CredSweeper is an advanced credential detection tool designed to identify exposed
credentials such as passwords, API keys, tokens, and other sensitive information
across source code, configuration files, documents, and binary assets.
CredSweeper scans regular files, embedded data in containers, and files added in Git commits.
The tool combines pattern-based detection, machine learning–based validation, and
deep file inspection to deliver comprehensive and accurate security scanning for
modern codebases and repositories.
**Key Capabilities:**
- Credential detection in source code, configuration files, documents, and archives
- False positive reduction using algorithmic filters and machine learning
- Scanning of compressed files, documents, and binary formats
- Git repository analysis and diff scanning
Full documentation can be found here: https://credsweeper.readthedocs.io/en/latest/
## How To Use
### Main Requirements
- Python 3.10, 3.11, 3.12, 3.13, 3.14
### Installation
Details [here](https://credsweeper.readthedocs.io/en/latest/install.html).
```bash
pip install credsweeper
```
### Run
[How to use](https://credsweeper.readthedocs.io/en/latest/guide.html).
Run CredSweeper:
```bash
python -m credsweeper --path tests/samples/password.gradle --save-json output.json
```
### JSON Output
```json
[
{
"rule": "Password",
"severity": "high",
"confidence": "moderate",
"ml_probability": 0.993,
"line_data_list": [
{
"line": "password = \"cackle!\"",
"line_num": 1,
"path": "./tests/samples/password.gradle",
"info": "",
"variable": "password",
"variable_start": 0,
"variable_end": 8,
"value": "cackle!",
"value_start": 12,
"value_end": 19,
"entropy": 2.52164
}
]
}
]
```
### Config
[credsweeper/secret/config.json](credsweeper/secret/config.json) - Configuration file for pre-processing of CredSweeper. For more details please check [here](https://credsweeper.readthedocs.io/en/latest/overall_architecture.html#pre-processing).
You can set the `pattern`, `extension` and `path` you want to exclude from scanning as below.
```json
{
"exclude": {
"pattern": [
"AKIA[0-9A-Z]{9}EXAMPLE",
...
],
"extension": [
"gif",
"jpg",
...
],
"path": [
"/.git/",
"/openssl/",
...
]
},
...
}
```
And you can also set `source_ext`, `source_quote_ext`, `find_by_ext_list`, `check_for_literals`, `line_data_output`, and `candidate_output` as below.
- `source_ext`: List of extensions for scanning categorized as source files.
- `source_quote_ext`: List of extensions for scanning categorized as source files that use quotes.
- `find_by_ext_list`: List of file extensions for which a file is reported based on its extension alone.
- `check_for_literals`: Boolean flag that controls whether a line is checked for a string literal declaration.
- `line_data_output`: List of attributes of [line_data](credsweeper/credentials/line_data.py) for output.
- `candidate_output`: List of attributes of [candidate](credsweeper/credentials/candidate.py) for output.
```json
{
...
"source_ext": [
".py",
".cpp",
...
],
"source_quote_ext": [
".py",
".cpp",
...
],
"find_by_ext_list": [
".pem",
".cer",
...
],
"check_for_literals": true,
"line_data_output": [
"line",
"line_num",
...
],
"candidate_output": [
"rule",
"severity",
...
]
}
```
[credsweeper/rules/config.yaml](credsweeper/rules/config.yaml) - Configuration file for setting Rule. For more details please check [here](https://credsweeper.readthedocs.io/en/latest/overall_architecture.html#rule).
```yaml
- name: Credential
severity: medium
confidence: moderate
type: keyword
values:
- credential
filter_type: GeneralKeyword
use_ml: true
min_line_len: 18
required_substrings:
- credential
target:
- code
```
## Develop
### Tests
Run all tests with random order:
```bash
python -m pytest --cov=credsweeper --cov-report=term-missing --random-order --random-order-bucket=global -s tests/
```
### Benchmark
We have a dataset for testing credential scanners called [CredData](https://github.com/Samsung/CredData).
If you want to test CredSweeper with this dataset please check [here](https://github.com/Samsung/CredData/blob/main/README.md#benchmark).
## Overall Architecture
To check overall architecture of CredSweeper please check [here](https://credsweeper.readthedocs.io/en/latest/overall_architecture.html).
## Retrain Model
If you want to check how the model was trained, or retrain it on your own data, please refer to the [experiment](experiment/README.md) folder.
## License
The CredSweeper is an Open Source project released under the terms of [MIT License](https://opensource.org/licenses/mit-license.php).
## How to Get Involved
In addition to developing under an Open Source license, the project follows an Open Source Development approach,
welcoming everyone to participate, contribute, and engage with each other through the project.
### Project Roles
The project recognizes the following formal roles: Contributor and Maintainer.
Informally, the community may organize itself and grant additional rights and responsibilities to the necessary people to achieve its goals.
#### Contributor
A Contributor is anyone who wishes to contribute to the project, at any level. Contributors are granted the rights to:
- Contribute code, documentation, translations, artwork, samples, etc.
- Report defects (bugs) and suggestions for enhancement.
- Participate in the process of reviewing contributions by others.
If you want to participate in the project development, check out the [how to contribute guideline](./docs/howto/how-to-contribute.md) in advance.
Contributors who show dedication and skill are rewarded with additional rights and responsibilities.
Their opinions weigh more when decisions are made, in a fully meritocratic fashion.
#### Maintainer
A Maintainer is a Contributor who is also responsible for knowing, directing and anticipating the needs of a given Module.
As such, Maintainers have the right to set the overall organization of the source code in the Module,
and the right to participate in the decision-making. Maintainers are required to review the contributor’s requests and decide whether to accept or not.
| Name | E-Mail |
|------------------------------------------------|------------------------|
| [Jaeku Yun](https://github.com/silentearth) | jk0113.yun@samsung.com |
| [Shinhyung Choi](https://github.com/csh519) | sh519.choi@samsung.com |
| [Roman Babenko](https://github.com/babenek) | r.babenko@samsung.com |
| [Yuliia Tatarinova](https://github.com/Yullia) | yuliia.t@samsung.com |
## How to Contact
Please post questions, [issues, or suggestions in issues](https://github.com/Samsung/CredSweeper/issues). This is the best way to communicate with the developers.
================================================
FILE: SECURITY.md
================================================
# Security Policy
## Supported Versions
| Version | Supported |
|---------|--------------------|
| 1.15.x | :white_check_mark: |
| <1.15.x | :x: |
## Reporting a Vulnerability
Please use [issues](https://github.com/Samsung/CredSweeper/issues) to report any security issue.
================================================
FILE: action.yml
================================================
# Composite GitHub Action: sets up Python, installs CredSweeper from PyPI and scans the given path.
name: "CredSweeper action"
description: "CredSweeper checks files"
author: "r.babenko@samsung.com"
branding:
icon: "terminal"
color: "gray-dark"
# Action inputs. `hashed` and `error` carry the literal command line flags
# that the "Run CredSweeper" step forwards to credsweeper.
inputs:
python_version:
description: "Python Version. 3.10 - default"
default: "3.10"
required: false
path:
description: "Path to scan"
required: true
report:
description: "CredSweeper report in JSON format"
default: "output.json"
required: false
hashed:
description: "Report output is hashed by default"
default: "--hashed"
required: false
error:
description: "Exit with an error code if credentials are detected"
default: "--error"
required: false
runs:
using: "composite"
steps:
# Writes the received `error`, `report` and `path` inputs together with the runner PATH to the job log.
# `hashed` is exported to the step environment but is not part of the echoed text.
- name: DEBUG
shell: bash
env:
path: ${{ inputs.path }}
report: ${{ inputs.report }}
error: ${{ inputs.error }}
hashed: ${{ inputs.hashed }}
run: echo "print ('@@@ $error @@@ $report @@@ $path @@@ $PATH @@@')"
# The setup action is pinned to a commit hash; the trailing comment records the matching release.
- name: Setup Python
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 - 2025.01.28
with:
python-version: ${{ inputs.python_version }}
- name: Install CredSweeper
shell: bash
run: python -m pip install credsweeper
# Inputs reach the command through environment variables and are expanded inside double quotes.
- name: Run CredSweeper
shell: bash
env:
path: ${{ inputs.path }}
report: ${{ inputs.report }}
error: ${{ inputs.error }}
hashed: ${{ inputs.hashed }}
run: python -m credsweeper --banner --log INFO --no-color --no-stdout "$error" "$hashed" --save-json "$report" --path "$path"
================================================
FILE: credsweeper/__init__.py
================================================
from credsweeper.app import CredSweeper
from credsweeper.common.constants import ThresholdPreset, Severity, Confidence
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
from credsweeper.file_handler.text_content_provider import TextContentProvider
from credsweeper.ml_model.ml_validator import MlValidator
# Names exported by the package; every entry except "__version__" is imported above.
# The bare trailing '#' after each item presumably keeps one name per line under the code formatter.
__all__ = [
"ByteContentProvider", #
"Confidence", #
"ContentProvider", #
"CredSweeper", #
"DataContentProvider", #
"DiffContentProvider", #
"MlValidator", #
"Severity", #
"StringContentProvider", #
"TextContentProvider", #
"ThresholdPreset", #
"__version__"
]
# Package version string; it is listed in __all__ above and defined here.
__version__ = "1.15.7"
================================================
FILE: credsweeper/__main__.py
================================================
import sys
from credsweeper.main import main
# Executed only when the module is run as a script: the value returned by main() becomes the exit status.
if __name__ == "__main__":
sys.exit(main())
================================================
FILE: credsweeper/app.py
================================================
import json
import logging
import multiprocessing
import signal
from pathlib import Path
from typing import Any, List, Optional, Union, Dict, Sequence, Tuple
import pandas as pd
from colorama import Style
# Directory of credsweeper sources MUST be placed before imports to avoid circular import error
APP_PATH = Path(__file__).resolve().parent
from credsweeper.scanner.scanner import Scanner
from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType, DEFAULT_ENCODING
from credsweeper.config.config import Config
from credsweeper.credentials.candidate import Candidate
from credsweeper.credentials.candidate_key import CandidateKey
from credsweeper.credentials.credential_manager import CredentialManager
from credsweeper.deep_scanner.deep_scanner import DeepScanner
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.file_handler.abstract_provider import AbstractProvider
from credsweeper.ml_model.ml_validator import MlValidator
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class CredSweeper:
"""Advanced credential analyzer base class.
Parameters:
credential_manager: CredSweeper credential manager object
scanner: CredSweeper scanner object
pool_count: number of pools used to run multiprocessing scanning
config: dictionary variable, stores analyzer features
json_filename: string variable, credential candidates export filename
"""
def __init__(
        self,
        rule_path: Union[None, str, Path] = None,
        config_path: Optional[str] = None,
        json_filename: Union[None, str, Path] = None,
        xlsx_filename: Union[None, str, Path] = None,
        stdout: bool = False,
        color: bool = False,
        hashed: bool = False,
        subtext: bool = False,
        sort_output: bool = False,
        use_filters: bool = True,
        pool_count: int = 1,
        ml_batch_size: Optional[int] = None,
        ml_threshold: Union[int, float, ThresholdPreset] = ThresholdPreset.medium,
        ml_config: Union[None, str, Path] = None,
        ml_model: Union[None, str, Path] = None,
        ml_providers: Optional[str] = None,
        find_by_ext: bool = False,
        pedantic: bool = False,
        depth: int = 0,
        doc: bool = False,
        severity: Union[Severity, str] = Severity.INFO,
        size_limit: Optional[str] = None,
        exclude_lines: Optional[List[str]] = None,
        exclude_values: Optional[List[str]] = None,
        thrifty: bool = False,
        log_level: Optional[str] = None) -> None:
    """Initialize Advanced credential scanner.

    Args:
        rule_path: optional path of the rule config file, handed over to the Scanner
        config_path: optional str variable, path of CredSweeper config file
            default built-in config is used if None
        json_filename: optional path to save the result as json
        xlsx_filename: optional path to save the result as xlsx
        stdout: print results to stdout
        color: print concise results to stdout with colorization
        hashed: use hash of line, value and variable instead of plain text
        subtext: use subtext of line near variable-value like it is performed in ML
        sort_output: boolean flag kept on the instance; presumably enables sorting of the exported results
        use_filters: boolean variable, specifying the need of rule filters
        pool_count: int value, number of parallel processes to use; values below 1 are raised to 1
        ml_batch_size: int value, size of the batch for model inference; 16 is used for None or a non-positive value
        ml_threshold: float or string value to specify threshold for the ml model
        ml_config: str or Path to set custom config of ml model
        ml_model: str or Path to set custom ml model
        ml_providers: str - comma separated list with providers
        find_by_ext: boolean - files will be reported by extension
        pedantic: boolean - scan all files
        depth: int - how deep container files will be scanned
        doc: boolean - document-specific scanning
        severity: Severity - minimum severity level of rule
        size_limit: optional string integer or human-readable format to skip oversize files
        exclude_lines: lines to omit in scan. Will be added to the lines already in config
        exclude_values: values to omit in scan. Will be added to the values already in config
        thrifty: free provider resources after scan to reduce memory consumption
        log_level: str - level for pool initializer according logging levels (UPPERCASE)

    Raises:
        RuntimeError: the given severity is not resolved by Severity.get

    """
    self.pool_count: int = max(1, int(pool_count))

    minimal_severity = Severity.get(severity)
    if not minimal_severity:
        allowed_levels = ' | '.join([level.value for level in Severity])
        raise RuntimeError(f"Severity level provided: {severity} -- must be one of: {allowed_levels}")

    # the configuration has to exist before the scanners, which are built on top of it
    self.config = Config(
        self._get_config_dict(
            config_path=config_path,
            use_filters=use_filters,
            find_by_ext=find_by_ext,
            pedantic=pedantic,
            depth=depth,
            doc=doc,
            severity=minimal_severity,
            size_limit=size_limit,
            exclude_lines=exclude_lines,
            exclude_values=exclude_values,
        ))
    self.scanner = Scanner(self.config, rule_path)
    self.deep_scanner = DeepScanner(self.config, self.scanner)
    self.credential_manager = CredentialManager()

    # report destinations
    self.json_filename: Union[None, str, Path] = json_filename
    self.xlsx_filename: Union[None, str, Path] = xlsx_filename
    self.stdout = stdout

    # report appearance
    self.color = color
    self.hashed = hashed
    self.subtext = subtext
    self.sort_output = sort_output

    # ML settings; the validator itself is created on first access of the ml_validator property
    if ml_batch_size and 0 < ml_batch_size:
        self.ml_batch_size = ml_batch_size
    else:
        self.ml_batch_size = 16
    self.ml_threshold = ml_threshold
    self.ml_config = ml_config
    self.ml_model = ml_model
    self.ml_providers = ml_providers
    self.__ml_validator: Optional[MlValidator] = None

    self.__thrifty = thrifty
    self.__log_level = log_level
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@staticmethod
def _get_config_path(config_path: Optional[str]) -> Path:
if config_path:
return Path(config_path)
return APP_PATH / "secret" / "config.json"
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def _get_config_dict(
        self,  #
        config_path: Optional[str],  #
        use_filters: bool,  #
        find_by_ext: bool,  #
        pedantic: bool,  #
        depth: int,  #
        doc: bool,  #
        severity: Severity,  #
        size_limit: Optional[str],  #
        exclude_lines: Optional[List[str]],  #
        exclude_values: Optional[List[str]]) -> Dict[str, Any]:
    """Load the config file and overlay the values given to the constructor.

    Args:
        config_path: path of the config file; the built-in one is loaded when it is empty
        use_filters: stored under the "use_filters" key
        find_by_ext: stored under the "find_by_ext" key
        pedantic: stored under the "pedantic" key
        depth: stored under the "depth" key
        doc: stored under the "doc" key
        severity: its ``value`` is stored under the "severity" key
        size_limit: stored under the "size_limit" key
        exclude_lines: appended to config["exclude"]["lines"] unless None
        exclude_values: appended to config["exclude"]["values"] unless None

    Returns:
        the loaded config with the overlaid values

    """
    settings = Util.json_load(self._get_config_path(config_path))
    overrides = (
        ("use_filters", use_filters),
        ("find_by_ext", find_by_ext),
        ("size_limit", size_limit),
        ("pedantic", pedantic),
        ("depth", depth),
        ("doc", doc),
        ("severity", severity.value),
    )
    for key, value in overrides:
        settings[key] = value
    # extra exclusions extend the lists from the file instead of replacing them
    for section, extras in (("lines", exclude_lines), ("values", exclude_values)):
        if extras is not None:
            merged = settings["exclude"].get(section, []) + extras
            settings["exclude"][section] = merged
    return settings  # type: ignore
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def _use_ml_validation(self) -> bool:
if isinstance(self.ml_threshold, int) and 0 == self.ml_threshold:
logger.info("ML validation is disabled")
return False
if not self.credential_manager.candidates:
logger.info("Skip ML validation because no candidates were found")
return False
for i in self.credential_manager.candidates:
if i.use_ml:
# any() or all() is not used to speedup
return True
logger.info("Skip ML validation because no candidates support it")
return False
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@property
def ml_validator(self) -> MlValidator:
    """Return the MlValidator of the instance, creating it on the first access.

    Raises:
        RuntimeError: the validator is still falsy after the creation attempt

    """
    validator = self.__ml_validator
    if not validator:
        # built lazily from the ML settings stored by the constructor
        validator = MlValidator(
            threshold=self.ml_threshold,  #
            ml_config=self.ml_config,  #
            ml_model=self.ml_model,  #
            ml_providers=self.ml_providers,  #
        )
        self.__ml_validator = validator
    if not validator:
        raise RuntimeError("MlValidator was not initialized!")
    return validator
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@staticmethod
def pool_initializer(log_kwargs) -> None:
"""Ignore SIGINT in child processes."""
logging.basicConfig(**log_kwargs)
signal.signal(signal.SIGINT, signal.SIG_IGN)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def run(self, content_provider: AbstractProvider) -> int:
    """Run an analysis of 'content_provider' object.

    Args:
        content_provider: provider of the targets to scan; a falsy value (e.g. None) means nothing to scan

    Returns:
        0 when there is nothing to scan, otherwise the value of credential_manager.len_credentials()

    """
    file_extractors: Sequence[ContentProvider] = \
        content_provider.get_scannable_files(self.config) if content_provider else []
    if not file_extractors:
        # a missing provider has no 'paths' attribute - report zero paths instead of raising AttributeError
        paths_number = len(getattr(content_provider, "paths", None) or ())
        logger.info("No scannable targets for %s paths", paths_number)
        return 0
    self.scan(file_extractors)
    self.post_processing()
    # PatchesProvider has the attribute. Circular import error appears with using the isinstance
    change_type = getattr(content_provider, "change_type", None)
    self.export_results(change_type)
    return self.credential_manager.len_credentials()
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def scan(self, content_providers: Sequence[ContentProvider]) -> None:
    """Scan the files given in "content_providers".

    The multi-job scan is chosen only when more than one job is configured
    (pool_count) and there is more than one provider; in every other case
    the single-job scan is used.

    Args:
        content_providers: file objects to scan
    """
    use_pool = 1 < self.pool_count and 1 < len(content_providers)
    scan_method = self.__multi_jobs_scan if use_pool else self.__single_job_scan
    scan_method(content_providers)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def __single_job_scan(self, content_providers: Sequence[ContentProvider]) -> None:
    """Scan all providers in the calling process and store the found candidates"""
    logger.info("Scan for %s providers", len(content_providers))
    self.credential_manager.set_credentials(self.files_scan(content_providers))
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def __multi_jobs_scan(self, content_providers: Sequence[ContentProvider]) -> None:
    """Scan the providers in a pool of spawned worker processes.

    The providers are split into as many interleaved slices as there are
    workers; each slice is handled by files_scan and the returned candidates
    are added to the credential manager as soon as a slice is completed.
    On KeyboardInterrupt the pool is terminated and the exception is re-raised.
    """
    # use this separation to satisfy YAPF formatter
    yapfix = "%(asctime)s | %(levelname)s | %(processName)s:%(threadName)s | %(filename)s:%(lineno)s | %(message)s"
    log_kwargs = {"format": yapfix}
    log_level = self.__log_level
    if isinstance(log_level, str):
        # a level was given: the custom name has to be registered before it can be used
        if "SILENCE" == log_level:
            logging.addLevelName(60, "SILENCE")
        log_kwargs["level"] = log_level
    jobs = min(self.pool_count, len(content_providers))
    logger.info("Scan in %s processes for %s providers", jobs, len(content_providers))
    # slice number 'offset' takes every jobs-th provider starting from 'offset'
    slices = (content_providers[offset::jobs] for offset in range(jobs))
    with multiprocessing.get_context("spawn").Pool(
            processes=jobs,  #
            initializer=CredSweeper.pool_initializer,  #
            initargs=(log_kwargs, ),  #
    ) as pool:
        try:
            for scan_results in pool.imap_unordered(self.files_scan, slices):
                for cred in scan_results:
                    self.credential_manager.add_credential(cred)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            raise
        pool.close()
        pool.join()
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def files_scan(self, content_providers: Sequence[ContentProvider]) -> List[Candidate]:
    """Scan one sequence of providers and gather all candidates.

    Args:
        content_providers: providers to scan one after another

    Return:
        candidates of all providers in scanning order
    """
    collected: List[Candidate] = []
    for content_provider in content_providers:
        found = self.file_scan(content_provider)
        if self.__thrifty:
            # thrifty mode: free() is called on the provider right after its scan
            content_provider.free()
        collected += found
    logger.info("Completed: processed %s providers with %s candidates", len(content_providers), len(collected))
    return collected
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def file_scan(self, content_provider: ContentProvider) -> List[Candidate]:
    """Scan a single content provider.

    Args:
        content_provider: content provider object to scan

    Return:
        list of credential candidates from scanned file
    """
    logger.debug("Start scan file: %s %s", content_provider.file_path, content_provider.info)
    if FilePathExtractor.is_find_by_ext_file(self.config, content_provider.file_type):
        # the extension is suspicious by itself: the content is not scanned, a fake candidate is reported
        return [
            Candidate.get_dummy_candidate(
                self.config,  #
                content_provider.file_path,  #
                content_provider.file_type,  #
                content_provider.info,  #
                FilePathExtractor.FIND_BY_EXT_RULE)
        ]
    if self.config.depth or self.config.doc:
        # deep scan with possible data representation
        return self.deep_scanner.scan(content_provider, self.config.depth, self.config.size_limit)
    if content_provider.file_type in self.config.exclude_containers:
        # excluded container types are not scanned by the regular scanner
        return []
    # regular file scanning
    return self.scanner.scan(content_provider)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def post_processing(self) -> None:
    """Purge duplicated candidates and, when applicable, filter the rest with ML validation.

    Groups without any candidate that requires ML are kept unchanged.
    In the remaining groups a candidate that requires ML is kept only when
    the validator confirms its group (the group probability is stored in the
    candidate), and a candidate that does not require ML is always kept.
    """
    purged = self.credential_manager.purge_duplicates()
    if purged:
        logger.info("Purged %s duplicates", purged)
    if not self._use_ml_validation():
        return
    logger.info("Grouping %s candidates", len(self.credential_manager.candidates))
    kept: List[Candidate] = []
    ml_groups: List[Tuple[CandidateKey, List[Candidate]]] = []
    for key, members in self.credential_manager.group_credentials().items():
        if any(member.use_ml for member in members):
            # analyze the whole group with ML if any candidate in it requires ML
            ml_groups.append((key, members))
        else:
            # none of the candidates requires ML
            kept.extend(members)
    if not ml_groups:
        # ml_validator is not touched here, so it is not created without need
        logger.info("Skipping ML validation due not applicable")
    else:
        logger.info("Run ML Validation for %s groups", len(ml_groups))
        is_cred, probability = self.ml_validator.validate_groups(ml_groups, self.ml_batch_size)
        for index, (_, members) in enumerate(ml_groups):
            for member in members:
                if not member.use_ml:
                    kept.append(member)
                elif is_cred[index]:
                    member.ml_probability = probability[index]
                    kept.append(member)
    self.credential_manager.set_credentials(kept)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def export_results(self, change_type: Optional[DiffRowType] = None) -> None:
    """
    Write the credentials to every configured output: JSON file, XLSX file, colored console, plain console.

    Args:
        change_type: flag to know which file should be created for a patch

    """
    is_patch = isinstance(change_type, DiffRowType)
    credentials = self.credential_manager.get_credentials()
    logger.info("Exporting %s credentials", len(credentials))

    if self.sort_output:

        def _sort_key(item):
            first_line = item.line_data_list[0]
            return (
                first_line.path,  #
                first_line.line_num,  #
                item.severity,  #
                item.rule_name,  #
                first_line.value_start,  #
                first_line.value_end  #
            )

        credentials.sort(key=_sort_key)

    if self.json_filename:
        json_path = Path(self.json_filename)
        if is_patch:
            # add suffix for appropriated reports to create two files for the patch scan
            json_path = json_path.with_suffix(f".{change_type.value}{json_path.suffix}")
        with open(json_path, 'w', encoding=DEFAULT_ENCODING) as f:
            # items are written one by one to reduce total memory usage in case of huge data
            f.write('[\n')
            for index, credential in enumerate(credentials):
                if index:
                    f.write(",\n")
                f.write(json.dumps(credential.to_json(hashed=self.hashed, subtext=self.subtext), indent=4))
            f.write("\n]")

    if self.xlsx_filename:
        rows = [
            row  #
            for credential in credentials  #
            for row in credential.to_dict_list(hashed=self.hashed, subtext=self.subtext)
        ]
        df = pd.DataFrame(data=rows)
        if not is_patch:
            df.to_excel(self.xlsx_filename, sheet_name="report", index=False)
        elif Path(self.xlsx_filename).exists():
            # the file exists already: only the sheet named after the change type is replaced
            with pd.ExcelWriter(self.xlsx_filename, mode='a', engine="openpyxl",
                                if_sheet_exists="replace") as writer:
                df.to_excel(writer, sheet_name=change_type.value, index=False)
        else:
            df.to_excel(self.xlsx_filename, sheet_name=change_type.value, index=False)

    if self.color:
        for credential in credentials:
            if isinstance(credential.ml_probability, float):
                ml_probability_info = f" {credential.ml_probability:.6f}"
            else:
                ml_probability_info = ""
            for line_data in credential.line_data_list:
                # bright rule name and path or info
                location = f" {line_data.info or line_data.path}:{line_data.line_num}{ml_probability_info}"
                print(Style.BRIGHT + credential.rule_name + location + Style.RESET_ALL)
                print(line_data.get_colored_line(hashed=self.hashed, subtext=self.subtext))

    if self.stdout:
        for credential in credentials:
            print(credential.to_str(hashed=self.hashed, subtext=self.subtext))
================================================
FILE: credsweeper/common/__init__.py
================================================
from credsweeper.common.keyword_checklist import KeywordChecklist
# a module-level instance is used instead of a singleton class; this also makes testing easier
static_keyword_checklist = KeywordChecklist()
================================================
FILE: credsweeper/common/constants.py
================================================
import string
import typing
from enum import Enum
from typing import Optional, Union
class Severity(Enum):
    """Severity of candidate

    The members are ordered from the least to the most severe:
    INFO < LOW < MEDIUM < HIGH < CRITICAL
    """
    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"

    def __lt__(self, other) -> bool:
        """Return True if 'self' is less severe than 'other'.

        NotImplemented is returned for an operand that is not a Severity, so
        that every member treats a foreign object in the same way.
        """
        if not isinstance(other, Severity):
            return NotImplemented
        ascending = (Severity.INFO, Severity.LOW, Severity.MEDIUM, Severity.HIGH, Severity.CRITICAL)
        return ascending.index(self) < ascending.index(other)

    @staticmethod
    def get(severity: Union[str, "Severity"]) -> Optional["Severity"]:
        """returns Severity value from string or None

        Args:
            severity: a Severity member or its name; the name is stripped and matched case-insensitively

        Return:
            the matching member or None for an unknown name or an unsupported type
        """
        if isinstance(severity, Severity):
            return severity
        if isinstance(severity, str):
            return Severity.__members__.get(severity.strip().upper())
        return None
class Confidence(Enum):
    """Confidence of candidate

    The members are ordered from the least to the most confident:
    WEAK < MODERATE < STRONG
    """
    STRONG = "strong"
    MODERATE = "moderate"
    WEAK = "weak"

    def __lt__(self, other) -> bool:
        """Return True if 'self' is less confident than 'other'.

        NotImplemented is returned for an operand that is not a Confidence, so
        that every member treats a foreign object in the same way.
        """
        if not isinstance(other, Confidence):
            return NotImplemented
        ascending = (Confidence.WEAK, Confidence.MODERATE, Confidence.STRONG)
        return ascending.index(self) < ascending.index(other)

    @staticmethod
    def get(confidence: Union[str, "Confidence"]) -> Optional["Confidence"]:
        """returns Confidence value from string or None

        Args:
            confidence: a Confidence member or its name; the name is stripped and matched case-insensitively

        Return:
            the matching member or None for an unknown name or an unsupported type
        """
        if isinstance(confidence, Confidence):
            return confidence
        if isinstance(confidence, str):
            return Confidence.__members__.get(confidence.strip().upper())
        return None
# alphanumeric part shared by the Base64 alphabets below: upper-case letters, lower-case letters, digits
BASE64COMMON = string.ascii_uppercase + string.ascii_lowercase + string.digits


class Chars(Enum):
    """Character sets of encoding dictionaries; the value of each member is a string with the allowed characters"""

    # Base16 (hexadecimal) digits in both cases
    HEX_CHARS = string.digits + "ABCDEF" + "abcdef"
    # upper-case hexadecimal digits and the dash used in UUID
    UUID_UPPER_CHARS = string.digits + "ABCDEF" + "-"
    # lower-case hexadecimal digits and the dash used in UUID
    UUID_LOWER_CHARS = string.digits + "abcdef" + "-"
    # Base16 (hexadecimal) digits, upper-case only
    BASE16UPPER = string.digits + "ABCDEF"
    # Base16 (hexadecimal) digits, lower-case only
    BASE16LOWER = string.digits + "abcdef"
    # 32 characters of Base32 encoding
    BASE32_CHARS = string.ascii_uppercase + "234567"
    # 36 characters of Base36 encoding
    BASE36_CHARS = string.digits + string.ascii_lowercase
    # base62 set https://en.wikipedia.org/wiki/Base62
    BASE62_CHARS = string.digits + string.ascii_uppercase + string.ascii_lowercase
    # Base64, URL- and filename-safe variant
    BASE64URL_CHARS = BASE64COMMON + "-_"
    # Base64, URL- and filename-safe variant with the padding sign
    BASE64URLPAD_CHARS = BASE64COMMON + "-_" + "="
    # Base64, standard variant
    BASE64STD_CHARS = BASE64COMMON + "+/"
    # Base64, standard variant with the padding sign
    BASE64STDPAD_CHARS = BASE64COMMON + "+/" + "="
    # digits, letters and punctuation - no whitespaces
    ASCII_VISIBLE = string.digits + string.ascii_letters + string.punctuation
    # everything from string.printable
    ASCII_PRINTABLE = string.printable
class GroupType(Enum):
    """Kind of a group; the Group constructor uses it to load a predefined set of filters"""

    KEYWORD = "keyword"
    PATTERN = "pattern"
    # stands for an empty set of filters
    DEFAULT = "default"
class RuleType(Enum):
    """Kind of a rule"""

    # the pattern is combined with a predefined structure
    KEYWORD = "keyword"
    # the patterns are used as-is and all of them must be found in the target (line)
    PATTERN = "pattern"
    # a single value, detects the PEM format with a specific scanner
    PEM_KEY = "pem_key"
    # after the first pattern is found the second one is searched in adjoining lines
    MULTI = "multi"
class ThresholdPreset(Enum):
    """Named threshold presets which let the user pick a precision/recall balance without a number"""

    lowest = "lowest"
    low = "low"
    medium = "medium"
    high = "high"
    highest = "highest"
class DiffRowType(Enum):
    """Type of a diff row: the row was either added or deleted"""

    ADDED = "added"
    DELETED = "deleted"
# named tuple with two integer fields: 'start' and 'end'
StartEnd = typing.NamedTuple("StartEnd", [("start", int), ("end", int)])
# minimal lengths; presumably applied to a matched variable, separator and value - confirm at the call sites
MIN_VARIABLE_LENGTH = 1
MIN_SEPARATOR_LENGTH = 1
MIN_VALUE_LENGTH = 4
# a line which is longer than this limit is scanned by overlapping chunks instead of as a whole
MAX_LINE_LENGTH = 8000
# the size for overlapping chunks must be less than MAX_LINE_LENGTH
CHUNK_SIZE = 4000
OVERLAP_SIZE = 1000
# CHUNK_SIZE minus OVERLAP_SIZE = 3000
CHUNK_STEP_SIZE = CHUNK_SIZE - OVERLAP_SIZE
# ML hunk size: limits the size of a variable or a value and of the substring taken near the value
ML_HUNK = 64
# codec names according to https://docs.python.org/3/library/codecs.html
UTF_8 = "utf_8"
LATIN_1 = "latin_1"
ASCII = "ascii"
# 16-bits codecs may be detected during decoding
UTF_16_LE = "utf_16_le"
UTF_16_BE = "utf_16_be"
DEFAULT_ENCODING = UTF_8
# LATIN_1 has to be placed at end to apply binary file detection
AVAILABLE_ENCODINGS = [UTF_8, LATIN_1]
# to limit memory usage in case of recursive scan
# 1 << 30 == 1073741824 (1 GiB if the unit is bytes - TODO confirm the unit)
RECURSIVE_SCAN_LIMITATION = 1 << 30
# default value for config and ValuePatternCheck
DEFAULT_PATTERN_LEN = 4
# PEM x509 patterns
PEM_BEGIN_PATTERN = "-----BEGIN"
PEM_END_PATTERN = "-----END"
# similar to min_line_len in rule_template - no real credential is expected in data shorter than 8 bytes
MIN_DATA_LEN = 8
================================================
FILE: credsweeper/common/keyword_checklist.py
================================================
from functools import cached_property
from typing import Set, List
from credsweeper.app import APP_PATH
class KeywordChecklist:
    """KeywordsChecklist contains words 3 or more letters length

    The keywords and the morphemes are read from KEYWORD_PATH and
    MORPHEME_PATH when an object is created.
    """
    __keyword_list: List[str]
    __keyword_set: Set[str]
    __morpheme_set: Set[str]
    KEYWORD_PATH = APP_PATH / "common" / "keyword_checklist.txt"
    MORPHEME_PATH = APP_PATH / "common" / "morpheme_checklist.txt"

    def __init__(self) -> None:
        # used suggested text read style. split() is preferred because it strips 0x0A on end the file
        # the keyword file is read only once - the list and the set are built from the same words
        keywords = self.KEYWORD_PATH.read_text().split()
        self.__keyword_set = set(keywords)
        # the longest keywords go first; the sort is stable, so words of equal length keep the file order
        self.__keyword_list = sorted(keywords, key=len, reverse=True)
        # The list of morphemes can be combined to form words.
        # The value is considered a variable if at least two exist.
        self.__morpheme_set = set(self.MORPHEME_PATH.read_text().split())

    @cached_property
    def keyword_set(self) -> Set[str]:
        """Get set with keywords"""
        return self.__keyword_set

    @cached_property
    def keyword_list(self) -> List[str]:
        """Get list with keywords in descended order of length"""
        return self.__keyword_list

    @cached_property
    def keyword_len(self) -> int:
        """Length of keyword_set"""
        return len(self.__keyword_set)

    @cached_property
    def morpheme_set(self) -> Set[str]:
        """Get set with morphemes.

        Return:
            Set of strings read from MORPHEME_PATH

        """
        return self.__morpheme_set

    @cached_property
    def morpheme_len(self) -> int:
        """Length of morpheme_set"""
        return len(self.__morpheme_set)

    def check_morphemes(self, line_lower: str, threshold: int) -> bool:
        """Checks whether the number of morphemes found in the line exceeds the threshold.

        Args:
            line_lower: input line - MUST be in lower
            threshold: number of morphemes which is still not enough

        Return:
            True - if number of morphemes exceeds the threshold

        """
        matches = 0
        for morpheme in self.morpheme_set:
            if morpheme in line_lower:
                matches += 1
                # stop at once when the limit is exceeded - the rest of the set is not checked
                if threshold < matches:
                    return True
        return False
================================================
FILE: credsweeper/common/keyword_checklist.txt
================================================
1234
abort
about
above
absolute
abstract
accent
accept
access
account
action
active
activity
actor
actual
added
adding
additional
address
adjust
advise
after
again
agent
alert
alias
algori
allow
alpha
already
always
amount
analyses
analyze
anchor
android
animated
animation
another
anony
apache
api
appearance
apple
application
apply
are
argc
args
argv
argument
array
arrow
article
ascii
aside
assembly
asset
assert
assign
associated
association
atomic
attachment
attribute
audio
author
authen
automatically
available
avatar
avoid
await
awesome
aws
backdrop
background
backward
badge
banner
based
basic
beans
because
before
begin
behind
being
below
between
beware
binary
binding
binds
blah
black
blank
bless
block
boost
bool
border
bottle
bottom
bound
brain
branch
brand
break
breeze
brief
broker
browse
buffer
build
bundle
button
byte
cache
calendar
callback
called
caller
calls
camel
cancel
cannot
canvas
capacity
capab
carat
carousel
cascade
cases
catalog
catch
categories
category
cause
center
certificate
chain
change
channel
chapter
character
chart
check
chevron
child
choices
chomp
choose
chosen
chrome
chunk
circle
clang
class
clean
clear
click
client
clock
clone
close
closure
cloud
cocoa
coding
collapse
collect
color
column
command
comment
commit
common
compact
compare
compilation
complete
completion
component
components
compute
condensed
condition
config
confirm
connect
consists
console
constant
constraints
consumer
contact
contain
content
context
continue
control
convenience
convert
copy
cookie
coordinator
corner
correct
could
count
course
cover
create
creature
credential
cron
criteria
croak
cross
cubic
curl
current
custom
danger
darken
dashboard
dashed
data
declaration
declared
decod
decoration
default
deferred
define
definition
delay
delegate
delete
delivery
delta
demo
dependency
dependent
depth
describe
description
designer
desktop
destination
destroy
detail
development
device
devise
diagnostic
dictionary
different
digest
direct
disable
dismiss
dispatch
display
disposable
dispose
disposing
distance
distribute
distribution
doctrine
document
domain
dotted
double
download
draft
driver
dumps
duration
during
dword
dynamic
easing
eclipse
editing
editor
effect
either
elastic
element
email
empty
enable
encod
encrypt
engine
enrollment
ensure
entity
entries
entry
environment
equal
equals
erase
error
event
example
except
exclude
execute
exist
expand
expect
explode
expir
export
exposed
expression
extend
extension
external
extra
faces
factory
failed
failure
false
family
feature
federate
feedback
fetch
field
figure
file
files
filename
filter
finagle
final
finish
first
fixed
fixture
flags
flash
float
floor
fluid
flush
focus
folder
follow
footer
force
format
forms
formula
forum
forward
found
fragment
frame
freeze
friend
fulfill
function
furnished
future
gallery
gateway
generate
generator
generic
geometry
getter
get(
given
github
gitlab
global
graphics
green
group
grunt
guard
handle
header
heading
height
hello
helper
hidden
highlight
history
holders
hooks
horizontal
hours
hover
http
html
icons
ignore
image
immediately
immutable
implemented
import
include
index
indicator
inference
infinite
info
inherit
inherited
initial
inject
inner
input
insert
inside
inspect
install
instance
instead
intent
interaction
intercept
interface
internal
interrupt
intro
invalid
inverted
invoke
isolate
issue
item
iterat
itself
java
justified
justify
key
label
labels
lambda
language
large
launch
layer
layout
leader
least
legend
length
letter
level
library
light
limit
linear
lines
links
linux
listener
little
loaded
loading
loads
local
location
logger
login
logon
loose
lower
machine
makes
manage
mapping
marathon
margin
mark
master
match
material
matrix
maximum
means
measure
media
medium
member
memory
message
meteor
method
methods
metro
middle
might
minus
minutes
missing
mixed
mobile
model
modified
module
moment
month
mount
mouse
multiple
mutating
name
native
navigation
needed
needs
network
neutral
neutron
never
nexus
nodes
none
normal
notes
nothing
notice
notification
null
number
oauth
object
oblique
observe
observer
occurs
offline
offset
often
openssl
operation
operator
option
oracle
orange
orbit
order
orientation
origin
organis
other
outer
outline
overflow
override
overview
owner
package
packet
padding
pager
pages
palette
panel
paper
param
parent
parse
partial
parts
passed
passing
passcode
passphrase
password
patch
paths
pattern
pause
peer
payload
payment
pending
people
percent
perform
performance
persistence
person
perspective
phone
picker
pills
pipeline
pixels
place
placement
plain
platform
player
point
pool
policy
portal
portfolio
position
possible
posts
power
precedence
preference
prefix
preparation
prepare
presence
present
pressed
preview
previous
price
primary
print
priority
private
problem
process
produce
product
profile
program
progress
project
promise
properties
property
props
protected
protocol
prototype
provide
proxy
public
publish
purchase
purple
queri
query
question
queue
radio
radius
rails
raise
raises
random
range
react
reader
readonly
readme
ready
really
realm
reason
reboot
receive
recommended
record
recreated
redirect
reference
reflect
refresh
regenerated
region
regist
reject
related
relation
relative
release
reload
remarks
remote
remove
render
repeat
replace
replica
reply
report
repository
representing
request
requests
require
rescue
reserved
reset
resolution
resolve
resource
response
responsible
responsive
restart
restriction
result
resume
retain
return
reveal
reverse
right
ripple
roles
rotate
round
route
rudder
rules
runner
running
sample
scale
scanner
scene
scenario
scope
score
screen
script
scroll
sealed
search
second
secret
section
secure
security
segue
select
sender
sending
sequel
sequence
series
serial
server
service
session
setting
setter
setup
sha256
sha1
sha2
sha224
sha512
shadow
shallow
shape
share
shift
short
should
showing
shown
shutdown
sidebar
signature
sign
similar
simple
since
single
sites
size
sizing
sleep
slice
slick
slide
small
smart
snapshot
social
socket
solid
sorted
source
space
spaces
spacing
spark
speak
special
specific
specified
specify
specs
speed
spell
spinner
split
spray
square
stack
start
stash
state
static
stats
status
steps
sticky
storage
store
strategy
stream
stretch
strict
string
strip
stroke
strong
struct
stubs
student
stuff
style
subject
submit
subscriptions
subtitle
success
suite
summary
super
support
swift
swing
switch
symbol
synchronized
synthesize
system
table
tablet
target
tasks
teacher
team
temp
terms
test
texture
their
theme
there
these
thick
those
thread
three
thrift
through
throw
thrown
throws
thumb
thumbs
ticket
timeline
timer
times
timing
title
today
token
tools
topic
total
touch
trace
track
trait
trans
tagword
triangle
trigger
true
trust
trying
tween
type
typically
uint
unavailable
under
uniform
union
unique
universe
unknown
unless
unlock
unsigned
unstable
until
update
upload
used
username
using
usually
valid
value
variable
variant
vector
verbose
verify
version
vertical
video
views
virtual
visibility
visible
visit
volatile
void
volume
wallet
warning
watch
waves
weight
whatever
where
whether
which
while
white
width
window
with
within
without
world
would
wrapper
write
written
xxxxx
yellow
yield
your
zeros
.json
.xml
================================================
FILE: credsweeper/common/keyword_pattern.py
================================================
import re
class KeywordPattern:
"""Pattern set of keyword types"""
# NOTE(review): in this copy the group names after `(?P` are missing although the patterns refer to
# groups by name (e.g. `(?(directive)`, `(?P=value_leftquote)`, `(?(esq)`, `(?(get)`, `(?(url_esc)`,
# `(?(wrap)`), and the line which continues `right_quote` runs straight into a method signature.
# It looks like text was lost - confirm every pattern against the repository before relying on it.
# optional directive prefix: #define / %define, `define` which is followed by '(', %global, or the word `set`
directive = r"(?P(?:" \
r"(?:[#%]define|define(?=(\s|\\{1,8}[tnr])*\()|%global)" \
r"(?:\s?\(|\s|\\{1,8}[tnr]){1,8}|\bset(?=\b|\w*(\s|\\{1,8}[tnr])*\()" \
r"))?"
# left part of the key: skips escaped n/r/t, %XX or \u00XX escapes and whitespaces,
# then starts the name which may begin with up to 8 quote characters
key_left = r"(?:\\[nrt]|(\\\\*u00|%)[0-9a-f]{2}|\s)*" \
r"(?P(([\"'`]{1,8}[^:=\"'`}<>\\/&?]*|[^:=\"'`}<>\s()\\/&?;,%]*)"
# keyword will be inserted here
# right part of the key: up to 80 more characters of the name, then optional closing quotes
# (plain quotes, HTML entities, %XX or \u00XX escapes)
key_right = r"[^%:=\"'`<>({?!&;\n]{0,80}" \
r")" \
r"(&(quot|apos|#3[49]);|(\\\\*u00|%)[0-9a-f]{2}|[\"'`])*" \
r")" #
# separator between the key and the value: ':' (also ':' + word + '='), '=', '=>', '!=', '!==', '==', '===',
# '=~' or '%3d'; when a directive was matched also ',', tab, whitespace or '(' are accepted
separator = r"(?(directive)|(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*)" \
r"(?P:(\s[a-z]{3,9}[?]?\s)?=|:(?!:)|=(>|>|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=" \
r"|(?(directive)(,|\\t|\s|\((?!\))){1,80}|%3d))" \
r"(\s|\\{1,8}[tnr])*"
# might be curly, square or parenthesis with words before
# a `.get(` / `getenv(` / `os.getenv(` call with a quoted first argument is recognised here too
wrap = r"(?P(" \
r"((\s|\\{1,8}[tnr]|new|byte|char|string|\[\]){1,8})?" \
r"(?P([_a-z][0-9a-z_.\[\]]*\.)get|(os\.)?getenv)?" \
r"([0-9a-z_.]|::|-(>|>))*" \
r"\s*" \
r"(\[(?!\])|\((?!\))|\{(?!\}))" \
r"(\s|\\{1,8}[tnr])*" \
r"(?(get)('[^']{1,31}'|\"[^\"]{1,31}\")\s*(,|\)\s*or)\s*|)" \
r"([0-9a-z_]{1,32}\s*[:=]\s*)?" \
r"){1,8})?"
# optional string-literal prefix (b, r, br, rb, u, t, f, rf, fr, l, @) which must be followed by a quote
string_prefix = r"(((b|r|br|rb|u|t|f|rf|fr|l|@)(?=(\\*[\"'`])))?"
# optional opening quote: 1-4 quote characters, each may be preceded by backslashes or be an HTML entity
left_quote = r"(?P((?P\\{1,8})?([\"'`]|&(quot|apos|#3[49]);)){1,4}))?"
# Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey|ssws|ntlm|token)\s)?"
# the value: 4..8000 characters of a quoted text up to the opening quote sequence or of an unquoted text
# without whitespaces and quotes; alternatively a text enclosed in <>, () or {}; when a wrap was matched -
# at least 16 characters without closing brackets
value = r"(?P" \
r"(?(value_leftquote)" \
r"(" \
r"(?!(?P=value_leftquote))" \
r"(?(esq)((?!(?P=esq)([\"'`]|&(quot|apos|#3[49]);)).)|((?!(?P=value_leftquote)).)))" \
r"|" \
r"(?!&(quot|apos|#3[49]);)" \
r"(\\{1,8}([ tnr]|[^\s\"'`])" \
r"|" \
r"(?P%[0-9a-f]{2})" \
r"|" \
r"(?(url_esc)[^\s\"'`,;\\&]|[^\s\"'`,;\\])" \
r")" \
r"){4,8000}" \
r"|" \
r"(<[^>]{4,8000}>)" \
r"|" \
r"(\$?\({1,3}[^)]{4,8000}\){1,3})" \
r"|" \
r"(\$?\{{1,3}[^}]{4,8000}\}{1,3})" \
r"|" \
r"(?(wrap)(?(value_leftquote)(?!\\(?P=value_leftquote))|[^\]\)\}]){16,8000})" \
r")" #
# the closing part is conditional: it applies only if the group `value_leftquote` was matched
# NOTE(review): the next line looks truncated (pattern and method signature are merged) - restore from the repository
right_quote = r"(?(value_leftquote)" \
r"(?P(? re.Pattern:
"""Returns compiled regex pattern"""
# the parts are joined in matching order; the given keyword goes between the left and the right part of the key
expression = ''.join([ #
cls.directive, #
cls.key_left, #
fr"(?P{keyword})", # named group required
cls.key_right, #
cls.separator, #
cls.wrap, #
cls.string_prefix, #
cls.left_quote, #
cls.auth_keywords, #
cls.value, #
cls.right_quote, #
])
# the letter case is ignored and '.' matches a line feed too
return re.compile(expression, flags=re.IGNORECASE | re.DOTALL)
================================================
FILE: credsweeper/common/morpheme_checklist.txt
================================================
../
.com
.org
/bin
/dev
/etc
/lib
/mnt
/opt
/sbin
/srv
/tmp
/usr
/var
000
111
14159265
18284590
222
333
444
555
65358979
666
71828182
777
80211
888
999
_ack_
_arg
_cbc
_cfg
_clk
_con
_cpu
_dbg_
_dev
_dir
_div
_dma
_drv_
_env
_err
_eth
_ext
_fig
_fmt_
_ghz
_i2c_
_id_
_if
_in
_io_
_irq
_is
_it
_jpg
_khz
_lan
_led_
_mem
_mhz
_mux
_num
_on
_op_
_or_
_pcm_
_pin
_pre
_pro
_pwr
_ram
_reg
_req
_ret
_rev
_rgb
_rsa_
_rw_
_rx_
_sdr_
_src
_to
_tx_
_un
_up
_val
_vol
_wap
_wep
_wpa
_x64
abel
abilit
able
ably
abort
above
absolut
abstra
academ
acce
acon
activ
actor
actual
actur
adapt
add
ader
adjust
admi
adver
advise
advisor
aes256
affect
after
aggre
agno
aight
aign
akeup
alert
algo
alias
alice
align
aling
all
alpha
alter
altit
amazon
ample
anali
analy
ance
anchor
anci
ancy
and
anguage
angular
anima
anomaly
antenna
anth
anti
any
apache
api
app
aram
arch
are
arg_
argc
args
argv
arian
arker
arpa
array
arro
art
ascii
ash
asia
asic
ask
assembl
assert
assoc
asure
asyn
ately
athon
atic
atil
ating
atlas
atomic
ator
attach
attack
attend
attr
atus
audio
audit
auri
auten
auth
auto
aux
avail
avatar
aver
awesom
axis
azure
back
badge
balanc
bank
bann
bar
bas
batch
batt
beac
beans
beat
beef
begin
behav
behind
being
belo
benutz
best
bias
big
bill
bin/
binar
bind
bio
bipol
bit
bixby
black
blan
bless
blic
blish
blob
blood
blue
board
bob
body
book
bool
boost
boot
boss
bot
boun
box
branch
break
breeze
bridge
brief
brit
bro
bssid
buck
buf
bugs
build
builtin
bular
bulk
bull
bund
burst
bus
butor
button
byte
cache
calen
camel
camp
can
capab
capac
cape
captu
carat
card
carri
carry
cascade
case
cast
catch
categor
cative
cbc_
ccele
ccept
ccess
ceed
celebr
cell
cenar
cense
cent
cert
cessor
cfg_
chacha
chain
change
channel
chant
chapter
char
check
chevron
child
chin
chip
choices
chomp
choose
chosen
chrom
chron
chunk
ciat
cilla
cinema
circle
cirrus
city
cket
claims
clan
class
clean
clear
click
clien
clip
clk_
close
closure
cloud
clud
clus
cmd
cocoa
code
codi
cogn
collaps
collect
color
column
comb
comi
comm
compa
compet
compil
compl
compo
compr
conc
conden
conf
connect
consist
console
const
contact
contai
conten
continu
contra
contri
contro
conven
conver
cookie
coord
copy
core
corn
correct
correl
corres
corru
cost
could
count
course
court
cove
cpu_
crac
creat
cred
cript
crit
croak
cron
cross
crypt
crystal
ctive
ctrl
cubic
cue
cultur
cumulat
curr
curs
custom
cut
cyan
cycle
daily
danger
darken
darwin
das
data
date
davinci
day
dead
debug
decimal
decod
def
delay
dele
deliv
delta
demo
denc
dens
dent
depen
deploy
depo
depth
derive
desc
desired
desktop
dest
detach
detai
detect
dev/
dev_
develop
device
devise
diag
dial
dicat
dict
did
dif
dig
dimen
ding
diod
dir_
direct
disab
disc
disk
dismi
dispos
dissoc
dist
ditor
dity
div_
divid
dma_
dock
docs
doct
does
dog
dot
double
doubt
draft
dragon
drift
drive
droid
drop
dul
dummy
dump
dup
durin
dust
dvb
dynamic
dynamo
eadbee
easin
easy
ecdhe
ecdsa
ecret
ected
ector
ectron
eded
edge
edit
edium
eeprom
effect
egory
elect
eless
emai
emi
empty
enabl
ence
enclave
encod
encryp
ency
ende
eness
engine
ength
enhanc
ensure
ente
entit
entr
enum
env_
equal
erase
erial
ericsson
err_
error
erse
ersi
ertise
esam
esses
estima
esult
etc/
eth_
etic
eting
eutron
eval
evan
event
exam
excee
except
exclu
exist
exit
expan
expe
expir
expl
expo
expr
ext_
exten
exter
extra
exynos
face
fact
fail
false
famil
far
fast
fault
favor
featu
fee
ferr
fetch
fied
field
fifo
fig_
figur
file
fill
filter
finagle
final
find
fine
fire
firm
first
fix
flas
flat
fleet
flick
flix
float
flood
floor
fluent
fluid
flush
focus
foo
for
fossil
foun
fpga
frac
frame
free
freq
friend
from
front
frozen
fujitsu
fulf
full
func
furn
futu
gain
game
gang
gate
gative
gauss
gen
geo
gest
get
ghbor
ghz_
gian
ging
git
given
global
gobble
good
google
grab
grace
gram
grant
graph
grave
gray
greater
green
gregat
gregor
gress
grid
gro
grpc
guard
guest
guid
guish
ha1-
ha1_
ha2-
ha256
ha2_
ha394
ha512
hack
half
hard
has
have
having
havior
hdmi
head
health
hear
height
hello
help
herm
heroku
hetero
hex
hiber
hidden
hierar
high
histo
hola
home
hook
horizon
host
houn
hours
html
http
hub
human
humid
hybrid
iabl
ical
icon
id_rsa
iden
idle
ieee
ient
if_
ificat
ignore
illega
ilor
image
imated
imer
impact
imple
improve
in_
inclu
incom
indemni
index
indic
indiv
iness
info
infra
ingle
ings
ingular
inherit
ini
injec
inn
insert
insig
instead
int
inval
invent
inver
invoke
ion
ipv4
ipv6
iron
irq_
is_
ished
iso_
isolat
issue
it_
item
iter
ities
iting
itiv
ivate
ixed
ixtu
ixup
ized
izer
jabber
java
ject
jira
jitsi
job
join
journal
jpeg
jpg_
json
jump
justif
kafka
kerberos
kernel
key
khz_
kill
kind
kinesis
kirk
know
knox
kris
lab
lag
lambda
lan_
lang
large
larval
last
late
latit
lative
launch
layer
lazy
lead
leaf
least
leek
left
legacy
legal
lend
leng
lens
let
level
lexeme
lexic
lianc
liant
lib/
library
licens
lies
life
lift
light
lim
line
lingu
link
linux
list
lite
little
lity
live
lled
llup
lness
load
local
lock
log
long
look
loop
loose
lost
low
luate
lysis
mac
magic
mail
main
maker
makes
manage
manual
manuf
map
marat
margin
mark
mary
master
match
mater
matrix
max
mber
mbin
mbler
mean
measur
medi
medus
meet
mem_
memb
memo
ment
menu
merc
merge
messag
meta
meteor
method
metr
mhz_
micha
micro
middle
might
migrat
millis
min
mirror
misc
miss
mit
mix
mmon
mmun
mnt/
mobile
mock
mode
modi
modu
monitor
month
morp
mory
mote
motor
mount
move
mpeg
multi
mutat
mute
mux_
nalyz
name
nary
nates
nativ
nced
ncept
ncies
ndom
ndow
ned
need
neigh
neo4j
ner
net
neutr
never
new
next
nexus
nielsen
ning
nipp
nish
nism
node
non
nope
norm
not
nsive
ntal
nter
nting
null
num_
numb
numer
nuous
nvram
obj
oblique
occur
ocean
ocess
oder
off
often
oken
oker
old
olygon
on_
oncat
one
onfig
only
ookup
open
opt/
opted
opti
oracle
orbi
order
ordinar
ores
organ
ories
origin
orithm
ormat
orph
otorola
ottle
ound
ously
out
over
own
pack
page
pair
pale
panel
par
pass
patch
path
patte
paw
pci
pcmcia
peer
penalt
pend
people
per
pets
phore
photon
phrase
phys
pick
pills
ping
pipe
pixel
pkcs1
pkcs8
place
plain
plan
play
plex
plic
plod
plor
plug
plus
poin
polar
polic
poll
poly
pond
pons
pool
poon
pop
port
pose
posit
possib
post
poun
power
pre_
pred
prefi
prese
press
prev
price
prim
princip
prior
priv
pro_
probe
problem
proc
prod
prof
prog
proj
promise
prompt
prop
prote
proto
provi
prox
pseudo
pster
psycho
pub
pull
purcha
purple
push
put
pwr_
python
qos
quantum
queri
query
queue
quick
quota
quote
rabbi
rack
radar
radeon
radio
radius
rage
rails
rain
raise
ram_
rammar
range
rank
ransit
rate
rati
raw
rcept
rchite
rchive
reached
react
read
real
reason
receive
recipe
recog
recom
record
redact
redir
redisson
refer
reflect
refresh
reg_
regexp
regio
regist
regs
regul
rejec
relat
release
reli
remar
remo
rend
rent
repeat
repl
repo
repre
req_
request
require
resiz
resolv
resp
resul
ret_
retai
retriev
return
rev_
revea
revel
reven
rever
revisio
revoke
rgb_
rick
ride
right
rimar
rime
rine
ring
ripple
rish
risk
ritte
rity
river
rize
road
role
roll
room
root
ropo
rose
rotat
rotocol
rottl
rough
roun
roup
row
rroga
rrupt
rticle
rudder
rule
run
rxtx
sabl
sage
salt
same
sampl
sams
saves
savi
scala
scale
scali
scen
sched
schem
scipl
scont
scope
scram
screen
scret
scri
scro
seal
searc
seccomp
second
secre
sect
secur
seed
seek
seen
segue
sein
self
sema
send
sens
sent
seque
seria
series
serv
sessio
set
sever
sex
sha1
sha2
sha3
sha5
shadow
shape
shift
ship
shoot
short
shot
should
show
shut
sian
sible
side
sight
sign
similar
simpl
simul
since
sine
sing
sip
sites
size
sizi
skip
slack
slas
slave
sleep
slice
slick
slide
slot
smar
smooth
snap
sness
sniff
snip
social
sock
soft
solid
solve
some
sony
sort
soun
source
space
spacing
speak
spec
speed
spell
spent
spin
split
spot
spray
sql
src_
srv/
ssh
ssl
stack
stan
star
stas
stat
stdin
steer
stem
sten
step
stic
sting
ston
stop
stor
strai
stream
stren
stretch
strob
stroke
strong
struct
stubs
stude
studio
stuff
style
sub
succee
succes
such
suffi
suite
sum
sun
supe
supp
surro
suspe
swap
swift
swing
switch
swizz
symbol
sync
synth
sys
tabl
tag
tail
tain
tape
tate
tative
teacher
teams
tech
tele
tell
temp
tent
tera
term
ternal
tery
test
text
than
that
the
thick
thing
this
thor
those
threat
three
thrift
thro
thumb
tial
tick
tics
tifier
time
timi
tio
tish
title
titud
tizen
tmp/
to_
tod
toke
tolera
tomcat
too
topic
tory
torial
total
touch
tour
trace
tract
traffic
trait
tral
trans
treat
trial
triang
tribut
tric
tries
trigger
trip
trol
trouble
troy
true
trust
try
tter
tune
tuni
tunnel
ture
tween
twenty
twitt
txrx
txt
type
typo
ultima
under
unfo
unic
unio
unique
unit
univ
unless
unpre
until
unzip
up_
updat
upgrade
url
usa
usb
use
usin
usr/
uster
util
val_
valid
valu
var/
vari
vault
vect
veeva
vendor
verbose
verify
vers
vert
very
video
view
viol
virtual
visibl
visit
visual
vita
vocab
voice
void
vol_
volat
volume
vuln
wait
wake
wan
wap_
ward
warm
warn
watch
wave
way
weak
web
week
weight
well
wep_
when
where
which
while
white
wide
widge
width
will
wind
wire
with
wlan
wood
word
work
world
wort
would
wow
wpa_
wrap
writ
wrong
x64_
xpect
xxx
year
yello
yield
you
zeppelin
zero
zigbee
zing
zona
zorro
================================================
FILE: credsweeper/config/__init__.py
================================================
================================================
FILE: credsweeper/config/config.py
================================================
import re
from typing import Dict, List, Optional, Set, Any
from humanfriendly import parse_size
from credsweeper.common.constants import Severity, DEFAULT_PATTERN_LEN
from credsweeper.utils.util import Util
class Config:
    """Scan settings supplied by the user.

    The constructor copies every option of the configuration dictionary into an attribute.
    """

    # regular expressions for paths; they are combined into `not_allowed_path_pattern`
    NOT_ALLOWED_PATH = [
        ".*\\.min\\.js", ".*message.*\\.properties", ".*locale.*\\.properties", ".*makefile.*", ".*package-lock\\.json",
        ".*package\\.json", ".*\\.css", ".*\\.scss"
    ]

    def __init__(self, config: Dict[str, Any]) -> None:
        """Read all options from the dictionary.

        Args:
            config: configuration dictionary. "lines" and "values" of the "exclude" section,
                "severity" and "pattern_len" are optional; every other key read below is required

        """
        exclude: Dict[str, Any] = config["exclude"]
        self.exclude_patterns: List[re.Pattern] = [re.compile(item) for item in exclude["pattern"]]
        self.exclude_paths: List[str] = exclude["path"]
        self.exclude_containers: List[str] = exclude["containers"]
        self.exclude_documents: List[str] = exclude["documents"]
        self.exclude_extensions: List[str] = exclude["extension"]
        self.exclude_lines: Set[str] = set(exclude.get("lines", []))
        self.exclude_values: Set[str] = set(exclude.get("values", []))

        self.source_extensions: List[str] = config["source_ext"]
        self.source_quote_ext: List[str] = config["source_quote_ext"]
        self.find_by_ext_list: List[str] = config["find_by_ext_list"]
        self.bruteforce_list: List[str] = config["bruteforce_list"]
        self.check_for_literals: bool = config["check_for_literals"]

        # one case-insensitive expression built from all NOT_ALLOWED_PATH items
        combined_paths = Util.get_regex_combine_or(self.NOT_ALLOWED_PATH)
        self.not_allowed_path_pattern = re.compile(f"{combined_paths}", flags=re.IGNORECASE)

        self.use_filters: bool = config["use_filters"]
        self.line_data_output: List[str] = config["line_data_output"]
        self.candidate_output: List[str] = config["candidate_output"]
        self.find_by_ext: bool = config["find_by_ext"]

        # the limit is given in a human friendly form or is None when there is no limit
        raw_size_limit = config["size_limit"]
        self.size_limit: Optional[int] = None if raw_size_limit is None else parse_size(raw_size_limit)

        self.pedantic: bool = bool(config["pedantic"])
        self.depth: int = int(config["depth"])
        self.doc: bool = config["doc"]
        self.severity: Severity = Severity.get(config.get("severity"))
        self.max_url_cred_value_length: int = int(config["max_url_cred_value_length"])
        self.max_password_value_length: int = int(config["max_password_value_length"])

        # Trim exclude patterns from space like characters
        self.exclude_lines = {item.strip() for item in self.exclude_lines}
        self.exclude_values = {item.strip() for item in self.exclude_values}

        self.pattern_len = config.get("pattern_len", DEFAULT_PATTERN_LEN)
================================================
FILE: credsweeper/credentials/__init__.py
================================================
================================================
FILE: credsweeper/credentials/augment_candidates.py
================================================
from typing import List
from credsweeper.credentials.candidate import Candidate
def augment_candidates(candidates: List[Candidate], new_candidates: List[Candidate]):
    """
    Extend candidates with those new_candidates that bring at least one unseen line data value

    The set of already known values is collected once, before any new candidate is appended.

    Args:
        candidates: [IN/OUT] list of candidates to be augmented
        new_candidates: [IN] list with new candidates

    """
    if not new_candidates:
        return

    known_values = {item.value for known in candidates for item in known.line_data_list}

    for fresh in new_candidates:
        # a single value which was not seen before is enough to accept the whole candidate
        if any(item.value not in known_values for item in fresh.line_data_list):
            candidates.append(fresh)
================================================
FILE: credsweeper/credentials/candidate.py
================================================
import copy
import re
from json.encoder import py_encode_basestring_ascii
from typing import Any, Dict, List, Optional
from credsweeper.common.constants import Severity, Confidence
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
class Candidate:
    """A finding that may be a credential.

    Keeps the list of LineData together with attributes taken from the Rule object and the config

    Parameters:
        line_data_list: List of LineData
        patterns: Regular expressions that can be used for detection
        rule_name: Name of Rule
        severity: critical/high/medium/low
        confidence: strong/moderate/weak
        config: user configs
        use_ml: Whether the candidate should be validated with ML. If not - ml_probability is set None

    """

    # pattern of the dummy candidate - it matches at the start of any line
    DUMMY_PATTERN = re.compile(r"^")

    def __init__(self,
                 line_data_list: List[LineData],
                 patterns: List[re.Pattern],
                 rule_name: str,
                 severity: Severity,
                 config: Optional[Config] = None,
                 use_ml: bool = False,
                 confidence: Confidence = Confidence.MODERATE) -> None:
        self.line_data_list = line_data_list
        self.patterns = patterns
        self.rule_name = rule_name
        self.severity = severity
        self.confidence = confidence
        self.config = config
        self.use_ml = use_ml
        # None - ML is not applicable or not processed yet; float - the ml decision above ml_threshold
        # Note: -1.0 is possible too for some activation functions in ml model, so let avoid negative values
        self.ml_probability: Optional[float] = None

    def compare(self, other: 'Candidate') -> bool:
        """Tell whether both candidates describe the same final result

        Rule attributes are checked first, then the line data are compared pairwise.
        """
        same_attributes = self.rule_name == other.rule_name \
            and self.severity == other.severity \
            and self.confidence == other.confidence \
            and self.use_ml == other.use_ml \
            and self.ml_probability == other.ml_probability \
            and len(self.line_data_list) == len(other.line_data_list)
        if not same_attributes:
            return False
        # the comparison stops at the first pair which differs
        return all(mine.compare(theirs) for mine, theirs in zip(self.line_data_list, other.line_data_list))

    @staticmethod
    def _encode(value: Any) -> Any:
        """Return a string as an ASCII-only JSON string literal; other values are returned as is

        Args:
            value: Any type of value to be encoded

        """
        return py_encode_basestring_ascii(value) if isinstance(value, str) else value

    def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
        """Text form of the candidate; subtext and hashed are passed to every line data"""
        fields = (
            f"rule: {self.rule_name}",
            f"severity: {self.severity.value}",
            f"confidence: {self.confidence.value}",
            f"ml_probability: {self.ml_probability}",
            f"line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]",
        )
        return " | ".join(fields)

    def __str__(self):
        return self.to_str()

    def __repr__(self):
        return self.to_str(subtext=True)

    def to_json(self, hashed: bool, subtext: bool) -> Dict:
        """Convert credential candidate object to dictionary.

        When a config is set, only the keys listed in its candidate_output are reported.

        Return:
            Dictionary object generated from current credential candidate

        """
        full_output = {
            "patterns": [pattern.pattern for pattern in self.patterns],
            "rule": self.rule_name,
            "severity": self.severity.value,
            "confidence": self.confidence.value,
            "use_ml": self.use_ml,
            "ml_probability": self.ml_probability,
            # put the array to end to make json more readable
            "line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
        }
        if self.config is None:
            return full_output
        allowed_keys = self.config.candidate_output
        return {name: item for name, item in full_output.items() if name in allowed_keys}

    def to_dict_list(self, hashed: bool, subtext: bool) -> List[dict]:
        """Convert credential candidate object to List[dict].

        Every line data dictionary is extended with the candidate level fields, then string items are encoded.

        Return:
            List[dict] object generated from current credential candidate

        """
        json_output = self.to_json(hashed, subtext)
        # candidate level fields are everything except the list of line data
        shared_fields = copy.deepcopy(json_output)
        shared_fields.pop("line_data_list")
        rows = []
        for row in json_output["line_data_list"]:
            row.update(shared_fields)
            for name in row.keys():
                row[name] = self._encode(row[name])
            rows.append(row)
        return rows

    @classmethod
    def get_dummy_candidate(cls, config: Config, file_path: str, file_type: str, info: str, rule_name: str):
        """Create dummy instance to use in searching file by extension"""
        dummy_line = LineData(config, '', -1, 0, file_path, file_type, info, cls.DUMMY_PATTERN)
        return cls(  #
            line_data_list=[dummy_line],  #
            patterns=[cls.DUMMY_PATTERN],  #
            rule_name=rule_name,  #
            severity=Severity.INFO,  #
            config=config,  #
            confidence=Confidence.WEAK)
================================================
FILE: credsweeper/credentials/candidate_group_generator.py
================================================
from typing import Dict, List, Tuple
from credsweeper.credentials.candidate import Candidate
from credsweeper.credentials.candidate_key import CandidateKey
class CandidateGroupGenerator:
    """Dictionary-like container: a CandidateKey is mapped to the list of candidates of the group"""

    def __init__(self) -> None:
        # the assignment is done with the property setter
        self.grouped_candidates = {}

    @property
    def grouped_candidates(self) -> Dict[CandidateKey, List[Candidate]]:
        """the dictionary with all groups"""
        return self._grouped_candidates

    @grouped_candidates.setter
    def grouped_candidates(self, grouped_candidates: Dict[CandidateKey, List[Candidate]]) -> None:
        """replace the dictionary with all groups"""
        self._grouped_candidates = grouped_candidates

    def __contains__(self, key: CandidateKey) -> bool:
        groups = self.grouped_candidates
        return key in groups

    def __getitem__(self, key) -> List[Candidate]:
        groups = self.grouped_candidates
        return groups[key]

    def __setitem__(self, key: CandidateKey, value: List[Candidate]) -> None:
        groups = self.grouped_candidates
        groups[key] = value

    def __len__(self) -> int:
        groups = self.grouped_candidates
        return len(groups)

    def items(self) -> List[Tuple[CandidateKey, List[Candidate]]]:
        """snapshot list of (key, candidates) pairs"""
        return [*self.grouped_candidates.items()]
================================================
FILE: credsweeper/credentials/candidate_key.py
================================================
from typing import Tuple
from credsweeper.credentials.line_data import LineData
class CandidateKey:
    """Class used to identify credential candidates.

    The key is built from the file path, the line number and the start/end positions of the value, so
    candidates which detected the same value span on the same line of the same file have equal keys.
    The text of the line is kept only for the representation and does not take part in comparison.
    """

    def __init__(self, line_data: LineData):
        """Take path, line number and value positions from the line data

        Args:
            line_data: the object with path, line_num, value_start, value_end and line attributes

        """
        self.path: str = line_data.path
        self.line_num: int = line_data.line_num
        self.value_start: int = line_data.value_start
        self.value_end: int = line_data.value_end
        self.key: Tuple[str, int, int, int] = (self.path, self.line_num, self.value_start, self.value_end)
        self.__line = line_data.line

    def __hash__(self):
        return hash(self.key)

    def __eq__(self, other):
        if not isinstance(other, CandidateKey):
            # let Python try the reflected operation and fall back to identity instead of
            # raising AttributeError for objects without the `key` attribute (e.g. None)
            return NotImplemented
        return self.key == other.key

    def __ne__(self, other):
        return not bool(self == other)

    def __repr__(self) -> str:
        return f"{self.key}:{self.__line}"
================================================
FILE: credsweeper/credentials/credential_manager.py
================================================
import logging
from multiprocessing import Manager
from typing import List, Dict, Tuple
from credsweeper.credentials.candidate import Candidate
from credsweeper.credentials.candidate_group_generator import CandidateGroupGenerator, CandidateKey
logger = logging.getLogger(__name__)
class CredentialManager:
    """The manager allows you to store, add and delete separate credential candidates."""

    def __init__(self) -> None:
        # A plain list is used directly. The former `list(Manager().list())` started a multiprocessing
        # manager server process only to copy its empty proxy into a regular list.
        self.candidates: List[Candidate] = []

    def clear_credentials(self) -> None:
        """Clear credential candidates stored in the manager."""
        self.candidates.clear()

    def len_credentials(self) -> int:
        """Get number of credential candidates stored in the manager.

        Return:
            Non-negative integer

        """
        return len(self.candidates)

    def get_credentials(self) -> List[Candidate]:
        """Get all credential candidates stored in the manager.

        Return:
            The internal list (not a copy) with all Candidate objects stored in manager

        """
        return self.candidates

    def set_credentials(self, candidates: List[Candidate]) -> None:
        """Replace all current credential candidates of the manager with the given list.

        Args:
            candidates: List with candidates to replace current candidates in the manager

        """
        self.candidates = candidates

    def add_credential(self, candidate: Candidate) -> None:
        """Add credential candidate to the manager.

        Args:
            candidate: credential candidate to be added

        """
        self.candidates.append(candidate)

    def remove_credential(self, candidate: Candidate) -> None:
        """Remove credential candidate from the manager.

        Args:
            candidate: credential candidate to be removed

        Raises:
            ValueError: the candidate is not stored in the manager

        """
        self.candidates.remove(candidate)

    def purge_duplicates(self) -> int:
        """Purge duplicates candidates which may appear in overlaps during long line scan.

        Candidates are duplicates when the rule name, path, info, line position and all positions of
        the first line data are the same. The first of the duplicates is kept. A warning is logged
        when a dropped candidate has the same key but does not compare equal to the kept one.

        Returns:
            number of removed duplicates

        """
        candidates_dict: Dict[Tuple[str, str, str, int, int, int, int, int, int, int], Candidate] = {}
        before = len(self.candidates)
        for i in self.candidates:
            ld = i.line_data_list[0]
            candidate_key = (
                i.rule_name,  #
                ld.path,  #
                ld.info,  #
                ld.line_pos,  #
                ld.variable_start,  #
                ld.variable_end,  #
                ld.separator_start,  #
                ld.separator_end,  #
                ld.value_start,  #
                ld.value_end)
            if candidate_key in candidates_dict:
                # check precisely - compare with the values
                candidate_dict = candidates_dict[candidate_key]
                if not candidate_dict.compare(i):
                    ld_ = candidate_dict.line_data_list[0]
                    logger.warning("Check %s and %s", (ld_.variable, ld_.value), (ld.variable, ld.value))
            else:
                candidates_dict[candidate_key] = i
        self.candidates = list(candidates_dict.values())
        after = len(self.candidates)
        return before - after

    def group_credentials(self) -> CandidateGroupGenerator:
        """Join candidates that reference same secret value in the same line.

        Only the first LineData of every candidate is used to build the group key.

        Return:
            Groups of credential candidates, each group is identified with a CandidateKey

        """
        groups = CandidateGroupGenerator()
        for credential_candidate in self.get_credentials():
            for line_data in credential_candidate.line_data_list[:1]:
                # Match by file path+line num+value position. The position is required, so two
                # different credentials of a line still be processed independently
                candidate_key = CandidateKey(line_data)
                if candidate_key in groups:
                    groups[candidate_key].append(credential_candidate)
                else:
                    groups[candidate_key] = [credential_candidate]
        return groups
================================================
FILE: credsweeper/credentials/line_data.py
================================================
import contextlib
import hashlib
import re
import string
from functools import cached_property
from typing import Any, Dict, Optional, Tuple
from colorama import Fore, Style
from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, StartEnd, ML_HUNK
from credsweeper.config.config import Config
from credsweeper.utils.util import Util
class LineData:
    """Object to treat and store scanned line related data.

    Parameters:
        key: Optional[str] = None
        line: string variable, line
        line_num: int variable, number of line in file
        path: string variable, path to file
        file_type: string variable, extension of file '.txt'
        info: additional info about how the data was detected
        pattern: regex pattern, detected pattern in line
        separator: optional string variable, separators between variable and value
        separator_start: optional variable, separator position start
        value: optional string variable, detected value in line
        variable: optional string variable, detected variable in line

    """

    # symbols which are accepted as quotation marks of a value
    quotation_marks = ('"', "'", '`')
    # prefixes of a stripped line which are checked in is_comment()
    # NOTE(review): the last item looks like the tail of a regular expression, not like a comment prefix, and
    # `bash_param_split` which is used in clean_bash_parameters() is not defined among these attributes.
    # Presumably a text between angle brackets was lost when the file was extracted - verify with the origin.
    comment_starts = ("//", "* ", "# ", "/*", "|\\w+?\\>|\\&)")
    # escaped line ending inside a text: 1..8 backslashes which are followed by `n` or `r`
    line_endings = re.compile(r"\\{1,8}[nr]")
    # https://en.wikipedia.org/wiki/Percent-encoding
    url_percent_split = re.compile(r"%(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)", flags=re.IGNORECASE)
    # the same character codes, written as \u00XX or \u000000XX escapes
    url_unicode_split = re.compile(r"\\u00(0000)?(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)",
                                   flags=re.IGNORECASE)
    # some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
    # \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
    # three characters which are checked right before `://` in check_url_part()
    url_scheme_part_regex = re.compile(r"[0-9A-Za-z.-]{3}")
    url_chars_not_allowed_pattern = re.compile(r'[\s"<>\[\]^~`{|}]')
    # a value which continues with `&name=value` or `;name=value`
    url_value_pattern = re.compile(r'[^\s&;"<>\[\]^~`{|}]+[&;][^\s=;"<>\[\]^~`{|}]{3,80}=[^\s;&="<>\[\]^~`{|}]{1,80}')
    # set of characters (not a regex) which are stripped from both sides of a variable in sanitize_variable()
    variable_strip_pattern = string.whitespace + """,'"-;"""
    # every position has the value until a match is applied in initialize()
    INITIAL_WRONG_POSITION = -3
    # the position is used in initialize() when the span of a group cannot be obtained from the match
    EXCEPTION_POSITION = -2
def __init__(
        self,  #
        config: Config,  #
        line: str,  #
        line_pos: int,  #
        line_num: int,  #
        path: str,  #
        file_type: str,  #
        info: str,  #
        pattern: re.Pattern,  #
        match_obj: Optional[re.Match] = None) -> None:
    """Store the scanned line with its origin and fill the match related fields

    Args:
        config: user configs
        line: text of the line
        line_pos: position of the line (stored as is)
        line_num: number of the line in the file
        path: path to the file
        file_type: extension of the file
        info: additional info about how the data was detected
        pattern: regex pattern to be applied to the line
        match_obj: optional ready match; when it is not a match object, the pattern is applied in initialize()

    """
    # the origin of the line
    self.config = config
    self.line: str = line
    self.line_pos: int = line_pos
    self.line_num: int = line_num
    self.path: str = path
    self.file_type: str = file_type
    self.info: str = info
    self.pattern: re.Pattern = pattern

    # do not store match object due it cannot be pickled with multiprocessing
    # all start - end positions of matched object are wrong until initialize() sets them
    self.value_start = self.value_end = LineData.INITIAL_WRONG_POSITION
    self.separator_start: int = LineData.INITIAL_WRONG_POSITION
    self.separator_end: int = LineData.INITIAL_WRONG_POSITION
    self.variable_start = self.variable_end = LineData.INITIAL_WRONG_POSITION

    # texts of the matched groups
    self.key: Optional[str] = None
    self.separator: Optional[str] = None
    self.value: Optional[str] = None
    self.variable: Optional[str] = None
    self.value_leftquote: Optional[str] = None
    self.value_rightquote: Optional[str] = None

    # is set when variable & value are in URL for any source type
    self.url_part = False
    self.wrap = None
    self._3d_escaped_separator = False

    self.initialize(match_obj)
    # the line is very useful for debug breakpoint
    pass  # pylint: disable=W0107
def compare(self, other: 'LineData') -> bool:
    """Tell whether both objects point to the same variable and value at the same place

    The text of the whole line is not compared.
    """
    compared_fields = ("path", "info", "line_num", "value_start", "variable", "value")
    # the check stops at the first field which differs
    return all(getattr(self, name) == getattr(other, name) for name in compared_fields)
def initialize(self, match_obj: Optional[re.Match] = None) -> None:
    """Fill the fields of the object from the named groups of a match

    When the argument is not a match object, the pattern is applied to the line (up to MAX_LINE_LENGTH).
    Nothing is changed when there is no match.
    """
    if not isinstance(match_obj, re.Match) and isinstance(self.pattern, re.Pattern):
        match_obj = self.pattern.search(self.line, endpos=MAX_LINE_LENGTH)
    if match_obj is None:
        return

    def read_group(name: str) -> Any:
        # any failure (e.g. the pattern has no such group) gives None
        try:
            return match_obj.group(name)
        except Exception:  # pylint: disable=W0703
            return None

    def read_span(name: str) -> Tuple[int, int]:
        # any failure (e.g. the pattern has no such group) gives the exception position
        try:
            found = match_obj.span(name)
            return found[0], found[1]
        except Exception:  # pylint: disable=W0703
            return LineData.EXCEPTION_POSITION, LineData.EXCEPTION_POSITION

    self.key = read_group("keyword")

    self.separator = read_group("separator")
    self.separator_start, self.separator_end = read_span("separator")

    self.value = read_group("value")
    self.value_start, self.value_end = read_span("value")

    self.variable = read_group("variable")
    self.variable_start, self.variable_end = read_span("variable")

    self.value_leftquote = read_group("value_leftquote")
    self.value_rightquote = read_group("value_rightquote")
    self.wrap = read_group("wrap")

    # percent encoded '=' in url
    self._3d_escaped_separator = bool(self.separator) and "%3D" == self.separator.upper()

    self.sanitize_value()
    self.sanitize_variable()
def sanitize_value(self):
    """Clean found value from extra artifacts. Correct positions if changed.

    The first step works only when the regex caught no quotes: paired typographic quotes are removed
    from both sides of the value and plain quotes are set instead of them.
    The second step works only when a variable exists and the value is not well quoted: the value is
    cleaned with the clean_*_parameters() methods and the positions are moved to the cleaned value.
    """
    # process the quotation workaround before cached properties invocation
    if not self.value_leftquote and not self.value_rightquote:
        while self.value:
            first_symbol_code = ord(self.value[0])
            last_symbol_code = ord(self.value[-1])
            # U+2018..U+201B are typographic single quotation marks
            if 0x2018 <= first_symbol_code <= 0x201B and 0x2018 <= last_symbol_code <= 0x201B:
                self.value_leftquote = self.value_rightquote = "'"
                self.value = self.value[:-1]
                self.value_end -= 1
                self.value = self.value[1:]
                self.value_start += 1
            # U+201C..U+201F are typographic double quotation marks
            elif 0x201C <= first_symbol_code <= 0x201F and 0x201C <= last_symbol_code <= 0x201F:
                self.value_leftquote = self.value_rightquote = '"'
                self.value = self.value[1:]
                self.value_start += 1
                self.value = self.value[:-1]
                self.value_end -= 1
            else:
                # no more paired typographic quotes around the value
                break
    if self.variable and self.value and not self.is_well_quoted_value:
        # sanitize is actual step for keyword pattern only
        # keep the value as it was caught to find the offset of the cleaned one
        _value = self.value
        self.clean_url_parameters()
        self.clean_bash_parameters()
        self.clean_toml_parameters()
        self.clean_tag_parameters()
        # positions are corrected only when they are valid and the value became shorter
        if 0 <= self.value_start and 0 <= self.value_end and len(self.value) < len(_value):
            # offset of the cleaned value inside the caught value
            start = _value.find(self.value)
            self.value_start += start
            self.value_end = self.value_start + len(self.value)
def check_url_part(self) -> bool:
    """Determines whether value is part of url like line

    The result is stored in self.url_part and returned.
    """
    line_before_value = self.line[:self.value_start]
    # position of the last found "://" before the value or -1 if there is no one
    url_pos = -1
    find_pos = 0
    while find_pos < self.value_start:
        # find rightmost pattern
        find_pos = line_before_value.find("://", find_pos)
        if -1 == find_pos:
            break
        url_pos = find_pos
        find_pos += 3
    # whether the line has url start pattern
    self.url_part = 3 <= url_pos
    # three characters right before "://" must be allowed by url_scheme_part_regex
    self.url_part &= bool(self.url_scheme_part_regex.match(line_before_value, pos=url_pos - 3, endpos=url_pos))
    # there must be no forbidden characters between "://" and the value
    self.url_part &= not self.url_chars_not_allowed_pattern.search(line_before_value, pos=url_pos + 3)
    # the variable goes right after `?` or `&` like a query parameter
    self.url_part |= self.line[self.variable_start - 1] in "?&" if 0 < self.variable_start else False
    # the value itself continues with the next `name=value` pair
    self.url_part |= bool(self.url_value_pattern.match(self.value))
    # the separator is percent encoded '='
    self.url_part |= self._3d_escaped_separator
    return self.url_part
def clean_url_parameters(self) -> None:
    """Cut 'query parameters' of a URL from the variable and the value.

    When the line seems to be a URL, the variable is reduced to the text after the last of & ? ;
    and the value is reduced to the text before the first of & ; # (or their escaped forms).
    """
    # skip sanitize in case of URL credential rule - the regex is mature enough
    if not self.check_url_part() or self.variable.endswith("://"):
        return
    # all checks have passed - line before the value may be a URL
    name = self.variable
    for delimiter in ('&', '?', ';'):
        # right most part after the delimiter or the whole text if there is no delimiter
        name = name.rpartition(delimiter)[2]
    self.variable = name

    content = self.value
    for delimiter in ('&', ';', '#'):
        # left most part before the delimiter
        content = content.partition(delimiter)[0]
    self.value = content
    self.value = self.url_unicode_split.split(self.value)[0]
    if self._3d_escaped_separator:
        self.value = self.url_percent_split.split(self.value)[0]
def clean_bash_parameters(self) -> None:
    """Cut the value at bash special characters or at an escaped line ending.

    The first cut is applied only when the variable looks like a CLI argument (starts with `-`).
    The second cut is applied to a value without spaces which has escaped \\n or \\r inside.
    """
    if self.variable.startswith("-"):
        # If variable name starts with `-` (usual case for args in CLI)
        # and value can be split by bash special characters
        pieces = self.bash_param_split.split(self.value)
        if 1 < len(pieces):
            self.value = pieces[0]
    has_escaped_line_ending = "\\n" in self.value or "\\r" in self.value
    if has_escaped_line_ending and ' ' not in self.value:
        pieces = self.line_endings.split(self.value)
        if 1 < len(pieces):
            self.value = pieces[0]
def clean_toml_parameters(self) -> None:
    """Drop closing brackets which were caught at the end of the value (TOML format, bash).

    A trailing `}`, `]` or `)` is removed while the value has no opening bracket of the kind
    and the line before the value has more opening than closing brackets of the kind.
    """
    if self.value_start and 0 <= self.value_start:
        line_before_value = self.line[:self.value_start]
    else:
        line_before_value = ""
    opening_of = {'}': '{', ']': '[', ')': '('}
    while self.value:
        closing = self.value[-1]
        opening = opening_of.get(closing)
        if opening is None or opening in self.value:
            # the value ends with another symbol, or the bracket was opened inside the value
            break
        if line_before_value.count(opening) <= line_before_value.count(closing):
            # nothing is left open before the value
            break
        # full match does not reasonable to implement due open character may be in other line
        self.value = self.value[:-1]
def clean_tag_parameters(self) -> None:
    """Remove closing tag from value if the opened is somewhere before in line"""
    # NOTE(review): the body below looks truncated - presumably the text between angle brackets was lost
    # when the file was extracted: the argument of rfind() is empty and the `if` branch has only the rest
    # of a comment. Restore the method from the original file before any change of the code.
    cleaning_required = self.value and self.value.endswith('>')
    while cleaning_required:
        closing_tag_pos = self.value.rfind("")
        if 0 <= closing_tag_pos:
            # use `')
        else:
            break
def sanitize_variable(self) -> None:
    """Strip spaces, dashes, quotations and other artifacts around the variable. Correct position.

    The cleaning is repeated until the length of the variable stops changing.
    """
    caught_variable = self.variable
    previous_len = 0
    while self.variable and previous_len != len(self.variable):
        previous_len = len(self.variable)
        cleaned = self.variable.strip(self.variable_strip_pattern)
        if cleaned.endswith('\\'):
            cleaned = cleaned[:-1]
        if cleaned.startswith('{') and '}' in self.line[self.variable_end:]:
            # TOML case
            cleaned = cleaned[1:]
        self.variable = cleaned
    became_shorter = caught_variable and len(self.variable) < len(caught_variable)
    if became_shorter and 0 <= self.variable_start and 0 <= self.variable_end:
        # offset of the cleaned variable inside the caught one
        self.variable_start += caught_variable.find(self.variable)
        self.variable_end = self.variable_start + len(self.variable)
def is_comment(self) -> bool:
    """Check whether the stripped line begins with one of the comment prefixes.

    Return:
        True if line is a comment, False otherwise

    """
    # str.startswith tests all prefixes of a tuple at once
    return self.line.strip().startswith(tuple(self.comment_starts))
@cached_property
def is_well_quoted_value(self) -> bool:
    """Well quoted value - means the value has been quoted or has line wrap

    The result is cached, so it is calculated with the fields which the object has at the first access.
    """
    result = False
    if self.value_leftquote and self.value_rightquote:
        if self.value_leftquote == self.value_rightquote:
            # regex caught well
            return True
        # reduce the left group to a single quotation mark
        if 1 == len(self.value_leftquote):
            leftquote = self.value_leftquote
        else:
            # right side symbol should be a quote
            leftquote = self.value_leftquote[-1]
            if leftquote not in self.quotation_marks:
                leftquote = ""
        # reduce the right group to a single quotation mark
        if 1 == len(self.value_rightquote):
            rightquote = self.value_rightquote
        else:
            # clean \ sign in escaping text
            for q in self.value_rightquote:
                if q in self.quotation_marks:
                    # the first quotation mark of the group is used
                    rightquote = q
                    break
            else:
                # the group has no quotation mark
                rightquote = ""
        result = bool(leftquote) and (  #
            bool(rightquote) and (leftquote == rightquote)  # normal case
            or '\\' == self.value_rightquote and '\\' == self.line[-1]  # line wrap
        )
    elif self.value_leftquote:
        # only the left quote was caught - the value may be continued in other lines
        result = (  #
            ('\\' == self.value_rightquote or '\\' == self.value[-1]) and '\\' == self.line[-1]  # line wrap
            or '.php' == self.file_type  # php may use multiline string
            or 3 == self.value_leftquote.count('"') or 3 == self.value_leftquote.count("'")  # python multiline
        )
    return result
@cached_property
def is_quoted(self) -> bool:
    """Check if variable and value are inside a quoted string.

    The first quotation mark of the line before the variable is compared with
    the first quotation mark of the line after the value.

    Return:
        True if candidate in a quoted string, False otherwise

    """
    marks = ('"', "'", '`')
    opening = None
    if 0 < self.variable_start:
        opening = next((symbol for symbol in self.line[:self.variable_start] if symbol in marks), None)
    closing = None
    if len(self.line) > self.value_end:
        closing = next((symbol for symbol in self.line[self.value_end:] if symbol in marks), None)
    return bool(opening) and bool(closing) and opening == closing
def is_source_file(self) -> bool:
    """Check whether the extension of the path is one of the configured source code extensions.

    Return:
        True if file is source file, False otherwise (also when there is no path)

    """
    return bool(self.path) and Util.get_extension(self.path) in self.config.source_extensions
def is_source_file_with_quotes(self) -> bool:
    """Check whether the file type is one of the configured types which quote string literals.

    The extension of the path is used when the file type is empty.

    Return:
        True if file require quotation, False otherwise

    """
    extension = self.file_type
    if not extension:
        extension = Util.get_extension(self.path)
    if not extension:
        return False
    return extension in self.config.source_quote_ext
@staticmethod
def get_hash_or_subtext(
        text: Optional[str],  #
        hashed: bool,  #
        cut_pos: Optional[StartEnd] = None,  #
) -> Optional[str]:
    """Give the hash of a not empty text or its "beauty" subtext if required

    Args:
        text: str - input string
        hashed: bool - whether the text will be hashed and returned
        cut_pos: Optional[StartEnd] - start, end positions which text must be kept in output

    Return:
        sha256 hash in hex representation of input text with UTF-8 encodings
        or
        subtext from start to end, or original text as is (an empty text or None is always returned as is)

    """
    if not text:
        return text
    if hashed:
        return hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest()
    if cut_pos is None:
        return text
    if 2 * ML_HUNK < cut_pos.end - cut_pos.start:
        # subtext positions exceed the limit
        return text[cut_pos.start:cut_pos.end]
    strip_text = text.strip()
    if 2 * ML_HUNK >= len(strip_text):
        # stripped text length meets the limit
        return strip_text
    # the center is shifted with the number of stripped leading symbols
    offset = len(text) - len(text.lstrip())
    center = (cut_pos.end + cut_pos.start - offset) >> 1
    return Util.subtext(strip_text, center, ML_HUNK)
def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
    """Represent line_data with subtext or|and hashed values

    Args:
        subtext: whether only the part of the line around the found data is shown
        hashed: whether the value and the line are replaced with their hashes

    Return:
        text with path, line number, value and line

    """
    cut_pos = None
    if subtext:
        # variable_start is negative when the pattern caught no variable - then the shown part begins
        # from the value, as it is done in to_json(); a negative start spoils the slice of the line
        cut_start = self.variable_start if 0 <= self.variable_start else self.value_start
        cut_pos = StartEnd(cut_start, self.value_end)
    return f"path: {self.path}" \
           f" | line_num: {self.line_num}" \
           f" | value: '{self.get_hash_or_subtext(self.value, hashed)}'" \
           f" | line: '{self.get_hash_or_subtext(self.line, hashed, cut_pos)}'"

def __str__(self):
    """Full text representation"""
    return self.to_str()

def __repr__(self):
    """Text representation with a part of the line only"""
    return self.to_str(subtext=True)
def to_json(self, hashed: bool, subtext: bool) -> Dict:
    """Convert line data object to dictionary.

    Only the keys which are listed in line_data_output of the config are reported.

    Args:
        hashed: whether line, info, variable and value are replaced with their hashes
        subtext: whether only the part of the line around the found data is reported

    Return:
        Dictionary object generated from current line data

    """
    cut_pos = None
    if subtext:
        # the part begins from the variable or from the value when there is no variable
        cut_start = self.variable_start if 0 <= self.variable_start else self.value_start
        cut_pos = StartEnd(cut_start, self.value_end)
    entropy = round(Util.get_shannon_entropy(self.value), 5) if isinstance(self.value, str) else None
    full_output = {
        "key": self.key,
        "line": self.get_hash_or_subtext(self.line, hashed, cut_pos),
        "line_num": self.line_num,
        "path": self.path,
        # info may contain variable name - so let it be hashed if requested
        "info": self.get_hash_or_subtext(self.info, hashed),
        "pattern": self.pattern.pattern,
        "variable": self.get_hash_or_subtext(self.variable, hashed),
        "variable_start": self.variable_start,
        "variable_end": self.variable_end,
        "separator": self.separator,
        "separator_start": self.separator_start,
        "separator_end": self.separator_end,
        "value": self.get_hash_or_subtext(self.value, hashed),
        "value_start": self.value_start,
        "value_end": self.value_end,
        "entropy": entropy,
        "value_leftquote": self.value_leftquote,
        "value_rightquote": self.value_rightquote,
    }
    allowed_keys = self.config.line_data_output
    return {name: item for name, item in full_output.items() if name in allowed_keys}
def get_colored_line(self, hashed: bool, subtext: bool = False) -> str:
    """Represents the LineData with a value, separator, and variable color formatting.

    Args:
        hashed: when True, the result of get_hash_or_subtext for the line is returned in a single color
        subtext: when True, only a hunk of the line around the value is returned

    Return:
        The line with color escape sequences
    """

    def paint(text: str, start: int, end: int, color: str) -> str:
        # wrap text[start:end] with the color and the style reset
        return text[:start] + color + text[start:end] + Style.RESET_ALL + text[end:]

    if hashed:
        # return colored hash
        cut_pos = StartEnd(self.value_start, self.value_end) if subtext else None
        return Fore.LIGHTGREEN_EX + self.get_hash_or_subtext(self.line, hashed, cut_pos) + Style.RESET_ALL

    # at least, value must present
    line = paint(self.line, self.value_start, self.value_end, Fore.LIGHTYELLOW_EX)

    # separator may be missing
    if 0 <= self.separator_start < self.separator_end <= self.value_start:
        line = paint(line, self.separator_start, self.separator_end, Fore.LIGHTGREEN_EX)

    # variable may be missing
    with_separator = 0 <= self.separator_start \
        and 0 <= self.variable_start < self.variable_end <= self.separator_end <= self.value_start
    before_value = 0 <= self.variable_start < self.variable_end <= self.value_start
    if with_separator or before_value:
        line = paint(line, self.variable_start, self.variable_end, Fore.LIGHTBLUE_EX)

    if subtext:
        # display part of the text, centered around the start of the value, style reset at the end as a fallback
        inserted = len(line) - len(self.line)
        line = f"{Util.subtext(line, self.value_start + inserted, ML_HUNK)}{Style.RESET_ALL}"
    return line
================================================
FILE: credsweeper/deep_scanner/__init__.py
================================================
================================================
FILE: credsweeper/deep_scanner/abstract_scanner.py
================================================
import contextlib
import datetime
import logging
from abc import abstractmethod, ABC
from typing import List, Optional, Tuple, Any, Generator
from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION, MIN_DATA_LEN, DEFAULT_ENCODING, UTF_8, \
MIN_VALUE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.augment_candidates import augment_candidates
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.descriptor import Descriptor
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.file_handler.string_content_provider import StringContentProvider
from credsweeper.file_handler.struct_content_provider import StructContentProvider
from credsweeper.file_handler.text_content_provider import TextContentProvider
from credsweeper.scanner.scanner import Scanner
logger = logging.getLogger(__name__)
class AbstractScanner(ABC):
"""Base abstract class for all recursive scanners"""
@property
@abstractmethod
def config(self) -> Config:
"""Abstract property to be defined in DeepScanner: the Config object used during the scan"""
raise NotImplementedError(__name__)
@property
@abstractmethod
def scanner(self) -> Scanner:
"""Abstract property to be defined in DeepScanner: the Scanner applied to prepared content providers"""
raise NotImplementedError(__name__)
@abstractmethod
def data_scan(
self, #
data_provider: DataContentProvider, #
depth: int, #
recursive_limit_size: int) -> Optional[List[Candidate]]:
"""Abstract method to be defined in a concrete scanner
Args:
data_provider: DataContentProvider with the data to be scanned
depth: maximal level of recursion
recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
Return:
List of candidates, or None which deep_scan_with_fallback treats as not recognised content
"""
raise NotImplementedError(__name__)
@staticmethod
@abstractmethod
def get_deep_scanners(data: bytes, descriptor: Descriptor, depth: int) -> Tuple[List[Any], List[Any]]:
"""Returns possible scanners for the data depending on its content, and fallback scanners
Return:
Tuple of two lists: deep scanners and fallback scanners;
deep_scan_with_fallback uses the second list only when no deep scanner returned a result
"""
raise NotImplementedError(__name__)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def recursive_scan(
        self,  #
        data_provider: DataContentProvider,  #
        depth: int = 0,  #
        recursive_limit_size: int = 0) -> List[Candidate]:
    """Recursive function to scan files which might be containers like ZIP archives

    Args:
        data_provider: DataContentProvider object may be a container
        depth: maximal level of recursion; a negative value stops the scan
        recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack

    Return:
        List of candidates; empty when the recursion bottom is reached or the data is too small
    """
    candidates: List[Candidate] = []

    if depth < 0:
        # break recursion if maximal depth is reached
        logger.debug("Bottom reached %s recursive_limit_size:%d", data_provider.file_path, recursive_limit_size)
        return candidates
    depth -= 1

    if len(data_provider.data) < MIN_DATA_LEN:
        # break recursion for minimal data size
        logger.debug("Too small data: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data),
                     depth, recursive_limit_size, data_provider.file_path, data_provider.info)
        return candidates

    logger.debug("Start data_scan: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data), depth,
                 recursive_limit_size, data_provider.file_path, data_provider.info)

    if not FilePathExtractor.is_find_by_ext_file(self.config, data_provider.file_type):
        found = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size)
        augment_candidates(candidates, found)
        return candidates

    # Skip scanning file and makes fake candidate due the extension is suspicious
    candidates.append(
        Candidate.get_dummy_candidate(self.config, data_provider.file_path, data_provider.file_type,
                                      data_provider.info, FilePathExtractor.FIND_BY_EXT_RULE))
    return candidates
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@staticmethod
def key_value_combination(structure: dict) -> Generator[Tuple[Any, Any], None, None]:
"""Combine items by `key` and `value` from a dictionary for augmentation
{..., "key": "api_key", "value": "XXXXXXX", ...} -> ("api_key", "XXXXXXX")
"""
for key_id in ("key", "KEY", "Key"):
if key_id in structure:
struct_key = structure.get(key_id)
break
else:
struct_key = None
if isinstance(struct_key, bytes):
# sqlite table may produce bytes for `key`
with contextlib.suppress(UnicodeError):
struct_key = struct_key.decode(UTF_8)
# only str type is common used for the augmentation
if struct_key and isinstance(struct_key, str):
for value_id in ("value", "VALUE", "Value"):
if value_id in structure:
struct_value = structure.get(value_id)
if struct_value and isinstance(struct_value, (str, bytes)):
yield struct_key, struct_value
# break in successful case
break
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@staticmethod
def structure_processing(structure: Any) -> Generator[Tuple[Any, Any], None, None]:
"""Yields pair `key, value` from given structure if applicable"""
if isinstance(structure, dict):
# transform dictionary to list
for key, value in structure.items():
if not value:
# skip empty values
continue
if isinstance(value, (list, tuple)):
if 1 == len(value):
# simplify some structures like YAML when single item in new line is a value
yield key, value[0]
continue
# all other data will be precessed in next code
yield key, value
yield from AbstractScanner.key_value_combination(structure)
elif isinstance(structure, (list, tuple)):
# enumerate the items to fit for return structure
for key, value in enumerate(structure):
yield key, value
else:
logger.warning("Not supported type:%s val:%s", str(type(structure)), repr(structure))
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def structure_scan(
self, #
struct_provider: StructContentProvider, #
depth: int, #
recursive_limit_size: int) -> List[Candidate]:
"""Recursive function to scan structured data
Pairs from structure_processing() are dispatched by the type of the value:
dict, list and tuple go to structure_scan, bytes and str go to recursive_scan as raw data.
Lines `key = value` are collected for str keys accepted by scanner.keywords_required_substrings_check
and are scanned with the scanner after the loop.
Args:
struct_provider: StructContentProvider object with the structure to be scanned
depth: maximal level of recursion
recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
Return:
List of candidates found in the structure
"""
candidates: List[Candidate] = []
logger.debug("Start struct_scan: depth=%d, limit=%d, path=%s, info=%s", depth, recursive_limit_size,
struct_provider.file_path, struct_provider.info)
if 0 > depth:
# break recursion if maximal depth is reached
logger.debug("Bottom reached %s recursive_limit_size:%d", struct_provider.file_path, recursive_limit_size)
return candidates
# the decreased depth is passed to every nested scan below
depth -= 1
augmented_lines_for_keyword_rules = []
for key, value in AbstractScanner.structure_processing(struct_provider.struct):
# a keyword rule may be applicable for `key` (str only) and `value` (str, bytes)
keyword_match = bool(isinstance(key, str) and self.scanner.keywords_required_substrings_check(key.lower()))
if isinstance(value, (dict, list, tuple)) and value:
# recursive scan for not empty structured `value`
val_struct_provider = StructContentProvider(struct=value,
file_path=struct_provider.file_path,
file_type=struct_provider.file_type,
info=f"{struct_provider.info}|STRUCT:{key}")
new_candidates = self.structure_scan(val_struct_provider, depth, recursive_limit_size)
candidates.extend(new_candidates)
elif isinstance(value, bytes):
# recursive data scan
if MIN_DATA_LEN <= len(value):
bytes_struct_provider = DataContentProvider(data=value,
file_path=struct_provider.file_path,
file_type=struct_provider.file_type,
info=f"{struct_provider.info}|BYTES:{key}")
# the size limit is reduced by the size of the data passed down
new_limit = recursive_limit_size - len(value)
new_candidates = self.recursive_scan(bytes_struct_provider, depth, new_limit)
candidates.extend(new_candidates)
if keyword_match and MIN_VALUE_LENGTH <= len(value):
augmented_lines_for_keyword_rules.append(f"{key} = {repr(value)}")
elif isinstance(value, str):
# recursive text scan with transformation into bytes
stripped_value = value.strip()
if MIN_DATA_LEN <= len(stripped_value):
# recursive scan only for data which may be decoded at least
with contextlib.suppress(UnicodeError):
data = stripped_value.encode(encoding=DEFAULT_ENCODING, errors='strict')
str_struct_provider = DataContentProvider(data=data,
file_path=struct_provider.file_path,
file_type=struct_provider.file_type,
info=f"{struct_provider.info}|STRING:{key}")
# the size limit is reduced by the size of the encoded text
new_limit = recursive_limit_size - len(str_struct_provider.data)
new_candidates = self.recursive_scan(str_struct_provider, depth, new_limit)
candidates.extend(new_candidates)
if keyword_match and MIN_VALUE_LENGTH <= len(stripped_value):
augmented_lines_for_keyword_rules.append(f"{key} = {repr(stripped_value)}")
elif not value or isinstance(value, (int, float, datetime.date, datetime.datetime)):
# skip useless types
pass
else:
logger.warning("Not supported type:%s value(%s)", str(type(value)), str(value))
if augmented_lines_for_keyword_rules:
# all collected `key = value` lines are scanned together as one text
str_provider = StringContentProvider(augmented_lines_for_keyword_rules,
file_path=struct_provider.file_path,
file_type=struct_provider.file_type,
info=f"{struct_provider.info}|KEYWORD")
new_candidates = self.scanner.scan(str_provider)
augment_candidates(candidates, new_candidates)
return candidates
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def deep_scan_with_fallback(self, data_provider: DataContentProvider, depth: int,
                            recursive_limit_size: int) -> List[Candidate]:
    """Scans with deep scanners and fallback scanners if possible

    Every deep scanner from get_deep_scanners is applied. Fallback scanners are tried
    only when none of the deep scanners returned a result, and only until the first
    of them returns a result.

    Args:
        data_provider: DataContentProvider with raw data
        depth: maximal level of recursion
        recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack

    Returns: list with candidates
    """
    candidates: List[Candidate] = []
    primary_scanners, reserve_scanners = self.get_deep_scanners(data_provider.data, data_provider.descriptor,
                                                                depth)
    recognised = False
    for scan_class in primary_scanners:
        result = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
        if result is not None:
            # None means the scanner did not recognise the content type
            augment_candidates(candidates, result)
            recognised = True
    if recognised:
        # a deep scan is successful, so fallback is not necessary
        return candidates

    for scan_class in reserve_scanners:
        result = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
        if result is not None:
            augment_candidates(candidates, result)
            # use only first successful fallback scanner
            break
    return candidates
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def scan(self,
         content_provider: ContentProvider,
         depth: int,
         recursive_limit_size: Optional[int] = None) -> List[Candidate]:
    """Initial scan method to launch recursive scan.

    Text and byte content providers are scanned by their data. A diff content provider is
    scanned with the scanner first and, when its first diff line is bytes, that line is deep scanned too.
    Other providers are reported with a warning.

    Args:
        content_provider: ContentProvider that might contain raw data
        depth: maximal level of recursion
        recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack,
            RECURSIVE_SCAN_LIMITATION is used when the value is not int
    """
    if not isinstance(recursive_limit_size, int):
        recursive_limit_size = RECURSIVE_SCAN_LIMITATION
    candidates: List[Candidate] = []
    data: Optional[bytes] = None

    if isinstance(content_provider, (TextContentProvider, ByteContentProvider)):
        # Feature to scan files which might be containers
        data = content_provider.data
        info = f"FILE:{content_provider.file_path}"
    elif isinstance(content_provider, DiffContentProvider) and content_provider.diff:
        candidates = self.scanner.scan(content_provider)
        # Feature to scan binary diffs
        first_line = content_provider.diff[0].get("line")
        # the check for legal fix mypy issue
        if isinstance(first_line, bytes):
            data = first_line
        info = f"DIFF:{content_provider.file_path}"
    else:
        logger.warning("Content provider %s does not support deep scan", type(content_provider))
        info = "NA"

    if not data:
        return candidates

    data_provider = DataContentProvider(data=data,
                                        file_path=content_provider.file_path,
                                        file_type=content_provider.file_type,
                                        info=content_provider.info or info)
    deep_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size - len(data))
    augment_candidates(candidates, deep_candidates)
    return candidates
================================================
FILE: credsweeper/deep_scanner/byte_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
from credsweeper.file_handler.data_content_provider import DataContentProvider
logger = logging.getLogger(__name__)
class ByteScanner(AbstractScanner, ABC):
"""Implements plain data scanning"""
def data_scan(
        self,  #
        data_provider: DataContentProvider,  #
        depth: int,  #
        recursive_limit_size: int) -> Optional[List[Candidate]]:
    """Tries to represent data as plain text with splitting by lines and scan as text lines

    Args:
        data_provider: DataContentProvider with the raw data
        depth: not used by this scanner
        recursive_limit_size: not used by this scanner

    Return:
        Result of the scanner for the data wrapped into ByteContentProvider
    """
    raw_info = f"{data_provider.info}|RAW"
    raw_provider = ByteContentProvider(content=data_provider.data,
                                       file_path=data_provider.file_path,
                                       file_type=data_provider.file_type,
                                       info=raw_info)
    raw_candidates = self.scanner.scan(raw_provider)
    return raw_candidates
================================================
FILE: credsweeper/deep_scanner/bzip2_scanner.py
================================================
import bz2
import logging
from abc import ABC
from pathlib import Path
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class Bzip2Scanner(AbstractScanner, ABC):
"""Implements bzip2 scanning"""
@staticmethod
def match(data: bytes | bytearray) -> bool:
"""According https://en.wikipedia.org/wiki/Bzip2"""
if data.startswith(b"\x42\x5A\x68") and 10 <= len(data) \
and 0x31 <= data[3] <= 0x39 \
and 4 == data.find(b"\x31\x41\x59\x26\x53\x59", 4, 10):
return True
return False
def data_scan(
        self,  #
        data_provider: DataContentProvider,  #
        depth: int,  #
        recursive_limit_size: int) -> Optional[List[Candidate]]:
    """Extracts data from bzip2 archive and launches data_scan

    The extension ".bz2" is removed from the path of the decompressed data.

    Return:
        Candidates of the recursive scan, or None when any exception was raised (it is logged as a warning)
    """
    try:
        archive_path = Path(data_provider.file_path)
        inner_path = archive_path.as_posix()
        if ".bz2" == archive_path.suffix:
            inner_path = inner_path[:-4]
        decompressed = bz2.decompress(data_provider.data)
        inner_provider = DataContentProvider(data=decompressed,
                                             file_path=inner_path,
                                             file_type=Util.get_extension(inner_path),
                                             info=f"{data_provider.info}|BZIP2:{archive_path}")
        rest_limit = recursive_limit_size - len(inner_provider.data)
        return self.recursive_scan(inner_provider, depth, rest_limit)
    except Exception as bzip2_exc:
        logger.warning("%s:%s", data_provider.file_path, bzip2_exc)
    return None
================================================
FILE: credsweeper/deep_scanner/crx_scanner.py
================================================
import logging
import struct
from abc import ABC
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
logger = logging.getLogger(__name__)
class CrxScanner(AbstractScanner, ABC):
"""Implements CRX files scanning with cut-off prefix"""
@staticmethod
def match(data: bytes | bytearray) -> bool:
"""Returns True if prefix match"""
if data.startswith((b"Cr24\x02\x00\x00\x00", b"Cr24\x03\x00\x00\x00")) and 32 < len(data):
return True
return False
@staticmethod
def zip_extract(data: bytes) -> bytes:
"""Extracts zip payload after signature block"""
pubkey_length = struct.unpack(" Optional[List[Candidate]]:
"""Tries cut-off header and use ZIP payload"""
try:
zip_data = CrxScanner.zip_extract(data_provider.data)
zip_content_provider = DataContentProvider(data=zip_data,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|CRX")
new_limit = recursive_limit_size - len(zip_data)
crx_candidates = self.recursive_scan(zip_content_provider, depth, new_limit)
return crx_candidates
except Exception as exc:
logger.warning(exc)
return None
================================================
FILE: credsweeper/deep_scanner/csv_scanner.py
================================================
import csv
import io
import logging
import re
from abc import ABC
from typing import List, Optional, Dict, Any
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.struct_content_provider import StructContentProvider
logger = logging.getLogger(__name__)
class CsvScanner(AbstractScanner, ABC):
"""Implements CSV scanning"""
sniffer = csv.Sniffer()
# do not use space as separator to avoid hallucinations
DELIMITERS = ",;\t|\x1F"
CSV_PATTERN = re.compile(b"[^\r\n]{1,8000}[,;\t|\x1F][^\r\n]{1,8000}")
@staticmethod
def match(data: bytes | bytearray) -> bool:
    """Check if data MAY be in CSV format

    Only the first line is checked: its end is searched within MAX_LINE_LENGTH bytes
    and the line must match CSV_PATTERN.
    """
    line_end = data.find(b'\n', 0, MAX_LINE_LENGTH)
    if line_end < 0:
        # classic Mac OS format
        line_end = data.find(b'\r', 0, MAX_LINE_LENGTH)
    if line_end < 0:
        return False
    return CsvScanner.CSV_PATTERN.match(data, pos=0, endpos=line_end) is not None
@classmethod
def get_structure(cls, text: str) -> List[Dict[str, Any]]:
    """Reads a text as CSV standard with guessed dialect

    Args:
        text: CSV text; the first line end must be found within MAX_LINE_LENGTH symbols

    Return:
        List of rows, each row is a dictionary with a column name as a key

    Raises:
        ValueError: no line end found or a row has different number of columns
    """
    # windows style \r\n is probed first, then unix style \n
    for eol_symbol, line_terminator in (('\r', "\r\n"), ('\n', "\n")):
        first_line_end = text.find(eol_symbol, 0, MAX_LINE_LENGTH)
        if 0 <= first_line_end:
            break
    else:
        raise ValueError(f"No suitable line end found in {MAX_LINE_LENGTH} symbols")

    # the delimiter is guessed with the first line only
    dialect = cls.sniffer.sniff(text[:first_line_end], delimiters=cls.DELIMITERS)
    reader = csv.DictReader(io.StringIO(text),
                            delimiter=dialect.delimiter,
                            lineterminator=line_terminator,
                            strict=True)
    # check the constant columns number for all rows
    fields_number = sum(1 for x in reader.fieldnames if x is not None)
    rows = []
    for row in reader:
        if not isinstance(row, dict):
            raise ValueError(f"ERROR: wrong row '{row}'")
        # None means no separator used
        has_missing_cell = any(x is None for x in row.values())
        if len(row) != fields_number or has_missing_cell:
            raise ValueError(f"Different columns number in row '{row}' - mismatch {fields_number}")
        rows.append(row)
    return rows
def data_scan(
        self,  #
        data_provider: DataContentProvider,  #
        depth: int,  #
        recursive_limit_size: int) -> Optional[List[Candidate]]:
    """Tries to scan each row as structure with column name in key

    Return:
        Candidates of the structure scan, or None when there are no rows
        or any exception was raised (it is logged with debug level)
    """
    try:
        rows = self.get_structure(data_provider.text)
        if rows:
            rows_provider = StructContentProvider(struct=rows,
                                                  file_path=data_provider.file_path,
                                                  file_type=data_provider.file_type,
                                                  info=f"{data_provider.info}|CSV")
            rest_limit = recursive_limit_size - sum(len(x) for x in rows)
            return self.structure_scan(rows_provider, depth, rest_limit)
    except Exception as csv_exc:
        logger.debug("%s:%s", data_provider.file_path, csv_exc)
    return None
================================================
FILE: credsweeper/deep_scanner/deb_scanner.py
================================================
import logging
import struct
from abc import ABC
from typing import List, Optional, Generator, Tuple
from credsweeper.common.constants import MIN_DATA_LEN, UTF_8
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class DebScanner(AbstractScanner, ABC):
"""Implements deb (ar) scanning"""
__header_size = 60
@staticmethod
def match(data: bytes | bytearray) -> bool:
"""According https://en.wikipedia.org/wiki/Deb_(file_format)"""
if data.startswith(b"!\n"):
return True
return False
@staticmethod
def walk_deb(data: bytes) -> Generator[Tuple[int, str, bytes], None, None]:
    """Processes sequence of DEB archive and yields offset, name and data

    Args:
        data: whole content of the archive including the global signature

    Yields:
        offset of the member data, member name (surrounding spaces and trailing '/' removed)
        and the member data; a member is skipped when its size is not bigger than MIN_DATA_LEN
        or exceeds the rest of the data

    Raises:
        ValueError: the size field of a member header is not a number or is negative
    """
    offset = 8  # size of the global signature b"!<arch>\n"
    data_limit = len(data) - DebScanner.__header_size
    while offset <= data_limit:
        header = data[offset:offset + DebScanner.__header_size]
        offset += DebScanner.__header_size
        # basic header structure: only the name and the size fields are used
        _name, _, _size, __ = struct.unpack('16s32s10s2s', header)
        file_size = int(_size)
        if 0 > file_size:
            # a negative size would move the offset backward and may loop forever on crafted data
            raise ValueError(f"Negative member size {file_size} before offset 0x{offset:x}")
        if MIN_DATA_LEN < file_size <= len(data) - offset:
            _data = data[offset:offset + file_size]
            yield offset, _name.decode(encoding=UTF_8).strip().rstrip('/'), _data
        # an odd size is followed by one extra byte
        offset += file_size if 0 == 1 & file_size else file_size + 1
def data_scan(
        self,  #
        data_provider: DataContentProvider,  #
        depth: int,  #
        recursive_limit_size: int) -> Optional[List[Candidate]]:
    """Extracts data file from .ar (debian) archive and launches data_scan

    Return:
        Candidates of all members yielded by walk_deb,
        or None when any exception was raised (it is logged as a warning)
    """
    try:
        candidates: List[Candidate] = []
        for offset, name, member in DebScanner.walk_deb(data_provider.data):
            member_provider = DataContentProvider(data=member,
                                                  file_path=f"{data_provider.file_path}",
                                                  file_type=Util.get_extension(name),
                                                  info=f"{data_provider.info}|DEB:0x{offset:x}:{name}")
            rest_limit = recursive_limit_size - len(member)
            candidates.extend(self.recursive_scan(member_provider, depth, rest_limit))
        return candidates
    except Exception as exc:
        logger.warning(exc)
    return None
================================================
FILE: credsweeper/deep_scanner/deep_scanner.py
================================================
import logging
import re
from typing import List, Any, Tuple, Union, Dict
from credsweeper.common.constants import MIN_DATA_LEN
from credsweeper.config.config import Config
from credsweeper.deep_scanner.byte_scanner import ByteScanner
from credsweeper.deep_scanner.bzip2_scanner import Bzip2Scanner
from credsweeper.deep_scanner.crx_scanner import CrxScanner
from credsweeper.deep_scanner.csv_scanner import CsvScanner
from credsweeper.deep_scanner.deb_scanner import DebScanner
from credsweeper.deep_scanner.docx_scanner import DocxScanner
from credsweeper.deep_scanner.eml_scanner import EmlScanner
from credsweeper.deep_scanner.encoder_scanner import EncoderScanner
from credsweeper.deep_scanner.gzip_scanner import GzipScanner
from credsweeper.deep_scanner.html_scanner import HtmlScanner
from credsweeper.deep_scanner.jclass_scanner import JclassScanner
from credsweeper.deep_scanner.jks_scanner import JksScanner
from credsweeper.deep_scanner.lang_scanner import LangScanner
from credsweeper.deep_scanner.lzma_scanner import LzmaScanner
from credsweeper.deep_scanner.mxfile_scanner import MxfileScanner
from credsweeper.deep_scanner.patch_scanner import PatchScanner
from credsweeper.deep_scanner.pdf_scanner import PdfScanner
from credsweeper.deep_scanner.pkcs_scanner import PkcsScanner
from credsweeper.deep_scanner.png_scanner import PngScanner
from credsweeper.deep_scanner.pptx_scanner import PptxScanner
from credsweeper.deep_scanner.rpm_scanner import RpmScanner
from credsweeper.deep_scanner.rtf_scanner import RtfScanner
from credsweeper.deep_scanner.sqlite3_scanner import Sqlite3Scanner
from credsweeper.deep_scanner.strings_scanner import StringsScanner
from credsweeper.deep_scanner.tar_scanner import TarScanner
from credsweeper.deep_scanner.tmx_scanner import TmxScanner
from credsweeper.deep_scanner.xlsx_scanner import XlsxScanner
from credsweeper.deep_scanner.xml_scanner import XmlScanner
from credsweeper.deep_scanner.zip_scanner import ZipScanner
from credsweeper.deep_scanner.zlib_scanner import ZlibScanner
from credsweeper.file_handler.descriptor import Descriptor
from credsweeper.scanner.scanner import Scanner
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class DeepScanner(
ByteScanner, #
Bzip2Scanner, #
CrxScanner, #
CsvScanner, #
DocxScanner, #
EncoderScanner, #
GzipScanner, #
HtmlScanner, #
JclassScanner, #
JksScanner, #
LangScanner, #
LzmaScanner, #
MxfileScanner, #
EmlScanner, #
PatchScanner, #
PdfScanner, #
PkcsScanner, #
PngScanner, #
PptxScanner, #
RtfScanner, #
RpmScanner, #
Sqlite3Scanner, #
StringsScanner, #
TarScanner, #
DebScanner, #
XmlScanner, #
XlsxScanner, #
ZipScanner, #
ZlibScanner, #
): # yapf: disable
"""Advanced scanner with recursive exploring of data"""
def __init__(self, config: Config, scanner: Scanner) -> None:
"""Initialize Advanced credential scanner.
Args:
config: Config object, returned by the `config` property
scanner: CredSweeper scanner object, returned by the `scanner` property
"""
self.__config = config
self.__scanner = scanner
@property
def config(self) -> Config:
"""Config object passed to the constructor"""
return self.__config
@property
def scanner(self) -> Scanner:
"""Scanner object passed to the constructor"""
return self.__scanner
# manually crafted dict to detect a media format with first byte, prefix and optionally pattern
MEDIA_PATTERNS: Dict[int, List[Tuple[bytes, re.Pattern]]] = {
0x00: [
# JPEG2000
(b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A", None),
# ICO
(b"\x00\x00\x01\x00", None),
# TTF
(b"\x00\x01\x00\x00\x00", None),
# 3gp
(b"\x00\x00\x00", re.compile(b"\x00\x00\x00.ftyp3g")),
# GITCRYPT is not a media but added to use pedantic scan for strings and reduce extra warnings
(b"\x00GITCRYPT\x00", None),
],
0x1A: [
# Matroska
(b"\x1A\x45\xDF\xA3", None),
],
0x7F: [
# ELF signature - to quick pass for strings scanner
(b"\x7FELF", re.compile(b"\x7FELF[\x01\x02][\x01\x02]\x01[\x00-\x12]"))
],
0x89: [
# PNG - can store text chunks inside
(b"\x89PNG\x0D\x0A\x1A\x0A", None),
],
0xFF: [
# JPEG or MPEG-1 Layer 3
(b"\xFF", re.compile(b"\xFF(\xD8\xFF[\xDB\xEE\xE1\xE0\x51]|[\xFB\xF3\xF2])")),
],
ord('8'): [
# PSD
(b"8BPS\x00\x01\x00\x00\x00\x00\x00\x00", None),
# PSB
(b"8BPS\x00\x02\x00\x00\x00\x00\x00\x00", None),
],
ord('B'): [
# BMP
(b"BM", re.compile(b"BM.{2}\x00{4}")),
],
ord('G'): [
# GIF
(b"GIF8", re.compile(b"GIF8[79]a[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")
),
],
ord('I'): [
# TIFF little endian
(b"II", re.compile(b"II[+*]\x00[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
# ID3v2.3 tag for various media (e.g. MP3)
(b"ID3\x03\x00\x00\x00", None),
],
ord('M'): [
# TIFF big endian
(b"MM", re.compile(b"MM\x00[+*][^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
ord('O'): [
# OGG
(b"OggS", re.compile(b"OggS[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
# OpenType font file
(b"OTTO\x00",
re.compile(b"OTTO\x00[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
ord('R'): [
# RIFF variants (RIFF, RIFX)
(b"RIF",
re.compile(b"RIF[FX].{4}[ 0-9A-Za-z]{4}"
b"[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
ord('X'): [
# Macromedia
(b"XFIR",
re.compile(b"XFIR.{4}[ 0-9A-Za-z]{4}"
b"[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
ord('f'): [
# mp4
(b"ftyp",
re.compile(b"ftyp(isom|MSNV)[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
ord('g'): [
# gimp
(b"gimp xcf",
re.compile(b"gimp xcf (file|v001|v002)\x00"
b"[^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
ord('w'): [
# WOFF 1.0, 2.0
(b"wOF", re.compile(b"wOF[2F][^\x00-\x08\x0C\x0E\x1F\x80-\xFF]{0,4096}[\x00-\x08\x0C\x0E\x1F\x80-\xFF]")),
],
}
@staticmethod
def is_media(data: Union[bytes, bytearray]) -> bool:
"""Returns True if well-known media format found"""
if patterns := DeepScanner.MEDIA_PATTERNS.get(data[0]):
for prefix, pattern in patterns:
# use prefix for speed-up total search
if prefix and data.startswith(prefix) and (pattern is None or pattern.match(data)):
return True
return False
@staticmethod
def get_deep_scanners(data: bytes, descriptor: Descriptor, depth: int) -> Tuple[List[Any], List[Any]]:
"""Returns possible scanners for the data depending on its content, and fallback scanners
The first matched branch of the chain below defines both lists;
several scanners are added only while 0 < depth.
Args:
data: raw content to be recognised
descriptor: Descriptor of the content; its extension and info are used
depth: level of recursion available for the selected scanners
Return:
Tuple of two lists with scanner classes: deep scanners and fallback scanners;
both are empty for empty, too short or not bytes-like data
"""
deep_scanners: List[Any] = []
fallback_scanners: List[Any] = []
if not data or not isinstance(data, (bytes, bytearray)) or len(data) < MIN_DATA_LEN:
# Guard clause: reject empty or invalid input data early
pass
elif ZipScanner.match(data):
if 0 < depth:
deep_scanners.append(ZipScanner)
# probably, there might be a docx, xlsx and so on.
# It might be scanned with text representation in third-party libraries.
if descriptor.extension in (".xlsx", ".ods"):
deep_scanners.append(XlsxScanner)
else:
fallback_scanners.append(XlsxScanner)
if ".docx" == descriptor.extension:
deep_scanners.append(DocxScanner)
else:
fallback_scanners.append(DocxScanner)
if ".pptx" == descriptor.extension:
deep_scanners.append(PptxScanner)
else:
fallback_scanners.append(PptxScanner)
# data is not recognised by ZipScanner but XlsxScanner accepts it
elif XlsxScanner.match(data):
if ".xls" == descriptor.extension:
deep_scanners.append(XlsxScanner)
else:
fallback_scanners.append(XlsxScanner)
elif Bzip2Scanner.match(data):
if 0 < depth:
deep_scanners.append(Bzip2Scanner)
elif LzmaScanner.match(data):
if 0 < depth:
deep_scanners.append(LzmaScanner)
elif TarScanner.match(data):
if 0 < depth:
deep_scanners.append(TarScanner)
elif DebScanner.match(data):
if 0 < depth:
deep_scanners.append(DebScanner)
elif GzipScanner.match(data):
if 0 < depth:
deep_scanners.append(GzipScanner)
elif PdfScanner.match(data):
deep_scanners.append(PdfScanner)
elif PngScanner.match(data):
deep_scanners.append(PngScanner)
elif RpmScanner.match(data):
if 0 < depth:
deep_scanners.append(RpmScanner)
elif JclassScanner.match(data):
deep_scanners.append(JclassScanner)
elif JksScanner.match(data):
deep_scanners.append(JksScanner)
elif Sqlite3Scanner.match(data):
if 0 < depth:
deep_scanners.append(Sqlite3Scanner)
elif PkcsScanner.match(data):
deep_scanners.append(PkcsScanner)
elif CrxScanner.match(data):
if 0 < depth:
deep_scanners.append(CrxScanner)
elif RtfScanner.match(data):
deep_scanners.append(RtfScanner)
fallback_scanners.append(ByteScanner)
# XML-like data: a more specific scanner (HTML, mxfile, TMX) is selected when it matches
elif XmlScanner.match(data):
if HtmlScanner.match(data):
deep_scanners.append(HtmlScanner)
deep_scanners.append(XmlScanner)
fallback_scanners.append(ByteScanner)
elif MxfileScanner.match(data):
deep_scanners.append(MxfileScanner)
deep_scanners.append(XmlScanner)
fallback_scanners.append(ByteScanner)
elif TmxScanner.match(data):
deep_scanners.append(TmxScanner)
fallback_scanners.append(XmlScanner)
fallback_scanners.append(ByteScanner)
else:
deep_scanners.append(XmlScanner)
fallback_scanners.append(ByteScanner)
elif EmlScanner.match(data):
if descriptor.extension in (".eml", ".mht"):
deep_scanners.append(EmlScanner)
else:
if 0 < depth:
# a formal patch looks like an eml
deep_scanners.append(PatchScanner)
fallback_scanners.append(EmlScanner)
fallback_scanners.append(ByteScanner)
elif DeepScanner.is_media(data):
# only StringsScanner may be applied for the formats effective
if 0 < depth:
fallback_scanners.append(StringsScanner)
elif not Util.is_binary(data):
# keep ByteScanner first to apply real value position if possible
deep_scanners.append(ByteScanner)
if 0 < depth:
deep_scanners.append(PatchScanner)
deep_scanners.append(LangScanner)
if CsvScanner.match(data):
deep_scanners.append(CsvScanner)
if EncoderScanner.match(data):
deep_scanners.append(EncoderScanner)
if ZlibScanner.match(data):
deep_scanners.append(ZlibScanner)
# binary data which is not recognised by any check above
else:
if 0 < depth:
if ZlibScanner.match(data):
deep_scanners.append(ZlibScanner)
fallback_scanners.append(StringsScanner)
else:
deep_scanners.append(StringsScanner)
if not descriptor.info.endswith("|BASE64"):
logger.warning("Cannot apply a deep scanner for type %s prefix %s %d", descriptor, repr(data[:32]),
len(data))
return deep_scanners, fallback_scanners
================================================
FILE: credsweeper/deep_scanner/docx_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
import docx
from docx.document import Document
from docx.oxml import CT_P, CT_Tbl, CT_SectPr, CT_TcPr
from docx.section import Section, _Header, _Footer
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from lxml.etree import _Element
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class DocxScanner(AbstractScanner, ABC):
    """Deep scanner for DOCX: gathers the document text and scans it as plain lines"""

    @staticmethod
    def _iter_block_items(block):
        """Recursively yields the text-bearing items found inside ``block``.

        A paragraph is yielded as is. Header/footer objects give the cells of their
        tables first and their own paragraphs afterwards. A section is unfolded into
        its header and footer. A document body or a table cell is walked child by
        child, with nested tables flattened cell by cell.

        Raises:
            ValueError: when ``block`` is none of the container types listed above.
        """
        if isinstance(block, Paragraph):
            yield block
        elif isinstance(block, (_Header, _Footer)):
            for inner_table in block.tables:
                for inner_row in inner_table.rows:
                    for inner_cell in inner_row.cells:
                        yield from DocxScanner._iter_block_items(inner_cell)
            yield from block.paragraphs
        elif isinstance(block, Section):
            yield from DocxScanner._iter_block_items(block.header)
            yield from DocxScanner._iter_block_items(block.footer)
        else:
            if isinstance(block, Document):
                container = block.element.body
            elif isinstance(block, _Cell):
                container = block._tc  # pylint: disable=W0212
            else:
                raise ValueError(f"unrecognised:{type(block)}")
            for node in container.iterchildren():
                if isinstance(node, CT_P):
                    yield Paragraph(node, block)
                elif isinstance(node, CT_Tbl):
                    nested_table = Table(node, block)
                    for nested_row in nested_table.rows:
                        for nested_cell in nested_row.cells:
                            yield from DocxScanner._iter_block_items(nested_cell)
                elif isinstance(node, (CT_TcPr, CT_SectPr)):
                    # cell / section properties carry configuration only
                    continue
                elif isinstance(node, _Element):
                    # any other xml element is handed over as is
                    yield node
                else:
                    logger.warning("Unknown:%s", type(node))

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Extracts text lines from the DOCX data and scans them.

        Body lines keep their document order. Header and footer lines of all sections
        are de-duplicated, sorted and appended after the body (headers, then footers).
        Returns the candidates found, or None when the data cannot be processed.
        """
        try:
            docx_lines: List[str] = []
            document = docx.Document(io.BytesIO(data_provider.data))
            docx_lines.extend(item.text for item in self._iter_block_items(document) if item.text)
            header_texts = set()
            footer_texts = set()
            for section in document.sections:
                headers = [section.first_page_header, section.even_page_header, section.header]
                for header in headers:
                    header_texts.update(item.text for item in self._iter_block_items(header) if item.text)
                footers = [section.first_page_footer, section.even_page_footer, section.footer]
                for footer in footers:
                    footer_texts.update(item.text for item in self._iter_block_items(footer) if item.text)
            docx_lines.extend(sorted(header_texts))
            docx_lines.extend(sorted(footer_texts))
            string_data_provider = StringContentProvider(lines=docx_lines,
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|DOCX")
            return self.scanner.scan(string_data_provider)
        except Exception as docx_exc:
            logger.warning("%s:%s", data_provider.file_path, docx_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/eml_scanner.py
================================================
import email
import logging
from abc import ABC
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class EmlScanner(AbstractScanner, ABC):
    """Deep scanner for e-mail messages (eml)"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According to https://datatracker.ietf.org/doc/html/rfc822 all the fields must be present:
        Date, From, To and Subject - each one at the very beginning of the data or right after a line feed"""
        required_fields = (b"Date:", b"From:", b"To:", b"Subject:")
        return all(b"\n" + field in data or data.startswith(field) for field in required_fields)

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Walks through the message parts and scans each one according to its content type.

        text/plain is scanned as bytes content, text/html is scanned by its html representation,
        application/* is passed to the recursive scan, anything else is only reported in the log.
        Returns the collected candidates, or None when the message cannot be processed.
        """
        try:
            candidates: List[Candidate] = []
            message = email.message_from_bytes(data_provider.data)
            for part in message.walk():
                content_type = part.get_content_type()
                body = part.get_payload(decode=True)
                if not isinstance(body, (bytes, str)):
                    # e.g. a multipart container has no payload of its own
                    continue
                payload = body if isinstance(body, bytes) else body.encode()
                if "text/plain" == content_type:
                    eml_text_data_provider = ByteContentProvider(content=payload,
                                                                 file_path=data_provider.file_path,
                                                                 file_type=data_provider.file_type,
                                                                 info=f"{data_provider.info}|EML-TEXT")
                    candidates.extend(self.scanner.scan(eml_text_data_provider))
                    continue
                x_data_provider = DataContentProvider(data=payload,
                                                      file_path=data_provider.file_path,
                                                      file_type=data_provider.file_type,
                                                      info=f"{data_provider.info}|EML-DATA")
                new_limit = recursive_limit_size - len(body)
                if "text/html" == content_type and x_data_provider.represent_as_html(
                        depth, new_limit, self.scanner.keywords_required_substrings_check):
                    string_data_provider = StringContentProvider(lines=x_data_provider.lines,
                                                                 line_numbers=x_data_provider.line_numbers,
                                                                 file_path=data_provider.file_path,
                                                                 file_type=data_provider.file_type,
                                                                 info=f"{data_provider.info}|EML-HTML")
                    candidates.extend(self.scanner.scan(string_data_provider))
                elif content_type.startswith("application"):
                    candidates.extend(self.recursive_scan(x_data_provider, depth, new_limit))
                else:
                    logger.warning("%s:%s:%s cannot be supported", data_provider.file_path, content_type,
                                   type(body))
            return candidates
        except Exception as eml_exc:
            logger.warning("%s:%s", data_provider.file_path, eml_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/encoder_scanner.py
================================================
import contextlib
import logging
import re
from abc import ABC
from typing import List, Optional
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
# 8 bytes are encoded to 12 symbols 12345678 -> MTIzNDU2Nzg=
MIN_ENCODED_DATA_LEN = 12
class EncoderScanner(AbstractScanner, ABC):
    """Implements recursive iteration when data might be encoded from base64"""

    # Optional UTF-16 BOM, then a run of base64 symbols mixed with whitespace, NUL and backslash.
    # The named groups record whether an upper-case letter, a lower-case letter and a digit/extra
    # symbol were seen at least once; the conditional tail demands all three, otherwise the
    # never-matching "(?!x)x" fails the alternative. The first alternative covers the standard
    # alphabet ("/" and "+"), the second one covers the url-safe alphabet ("_" and "-").
    BASE64_PATTERN = re.compile(
        rb"(\xFF\xFE|\xFE\xFF)?("
        rb"(?:(?P<a>[A-Z])|(?P<b>[a-z])|(?P<c>[0-9/+])|[\s\x00\\])+(?(a)(?(b)(?(c)(=+|$)|(?!x)x)|(?!x)x)|(?!x)x)|"
        rb"(?:(?P<e>[A-Z])|(?P<f>[a-z])|(?P<g>[0-9_-])|[\s\x00\\])+(?(e)(?(f)(?(g)(=+|$)|(?!x)x)|(?!x)x)|(?!x)x))")

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Check if data MAY be base64 encoded with whitespaces (escaping too)

        Only the first MAX_LINE_LENGTH bytes are inspected and data shorter than
        MIN_ENCODED_DATA_LEN never matches.
        """
        if len(data) >= MIN_ENCODED_DATA_LEN \
                and EncoderScanner.BASE64_PATTERN.match(data, pos=0, endpos=MAX_LINE_LENGTH):
            return True
        return False

    @staticmethod
    def decode(text: str) -> Optional[bytes]:
        """Decodes base64 text with cleaning whitespaces. Returns None when the decoding fails"""
        # any failure of cleaning or decoding is deliberately swallowed - the text is just not base64
        with contextlib.suppress(Exception):
            return Util.decode_base64(text=Util.PEM_CLEANING_PATTERN.sub(r'', text).replace('\\', ''),
                                      padding_safe=True,
                                      urlsafe_detect=True)
        return None

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to decode data from base64 encode to bytes and scan as bytes again

        Returns the result of the recursive scan of the decoded bytes,
        or None when nothing (or an empty value) was decoded.
        """
        if decoded := EncoderScanner.decode(data_provider.text):
            decoded_data_provider = DataContentProvider(data=decoded,
                                                        file_path=data_provider.file_path,
                                                        file_type=data_provider.file_type,
                                                        info=f"{data_provider.info}|BASE64")
            # the decoded size is charged against the remaining recursive budget
            new_limit = recursive_limit_size - len(decoded_data_provider.data)
            return self.recursive_scan(decoded_data_provider, depth, new_limit)
        return None
================================================
FILE: credsweeper/deep_scanner/gzip_scanner.py
================================================
import gzip
import io
import logging
from abc import ABC
from pathlib import Path
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class GzipScanner(AbstractScanner, ABC):
    """Deep scanner for gzip compressed data"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://www.rfc-editor.org/rfc/rfc1952 - two magic bytes and the deflate method id"""
        return data.startswith(b"\x1F\x8B\x08")

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Decompresses the gzip data and passes the content to the recursive scan.

        A trailing ".gz" suffix is dropped from the path reported for the inner content.
        Returns the candidates found, or None when decompression fails.
        """
        try:
            with gzip.open(io.BytesIO(data_provider.data)) as gzip_stream:
                outer_path = Path(data_provider.file_path)
                inner_path = outer_path.as_posix()
                if ".gz" == outer_path.suffix:
                    inner_path = inner_path[:-len(".gz")]
                gzip_content_provider = DataContentProvider(data=gzip_stream.read(),
                                                            file_path=inner_path,
                                                            file_type=Util.get_extension(inner_path),
                                                            info=f"{data_provider.info}|GZIP:{inner_path}")
                new_limit = recursive_limit_size - len(gzip_content_provider.data)
                return self.recursive_scan(gzip_content_provider, depth, new_limit)
        except Exception as gzip_exc:
            logger.warning("%s:%s", data_provider.file_path, gzip_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/html_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class HtmlScanner(AbstractScanner, ABC):
    """Implements html scanning if possible"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Used to detect html format. Suppose, invocation of is_xml() was True before.

        The data is treated as HTML when any well-known opening tag is found within the
        first MAX_LINE_LENGTH bytes and its closing tag is found somewhere after it.
        """
        # NOTE(review): the tag literals were empty/garbled in the reviewed text (markup was stripped);
        # the pairs below are a restoration of the common HTML tags - confirm against the repository
        for opening_tag, closing_tag in [(b"<html", b"</html>"), (b"<body", b"</body>"), (b"<table", b"</table>"),
                                         (b"<p>", b"</p>"), (b"<span>", b"</span>"), (b"<div>", b"</div>"),
                                         (b"<li>", b"</li>"), (b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"),
                                         (b"<th>", b"</th>"), (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
            opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
            if 0 <= opening_pos < data.find(closing_tag, opening_pos):
                # opening and closing tags were found - suppose it is an HTML
                return True
        return False

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to represent data as html text and scan as text lines

        Returns the scan result when the representation succeeded, None when
        represent_as_html() returned None and an empty list for any other falsy result.
        """
        if result := data_provider.represent_as_html(depth, recursive_limit_size,
                                                     self.scanner.keywords_required_substrings_check):
            string_data_provider = StringContentProvider(lines=data_provider.lines,
                                                         line_numbers=data_provider.line_numbers,
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|HTML")
            return self.scanner.scan(string_data_provider)
        return None if result is None else []
================================================
FILE: credsweeper/deep_scanner/jclass_scanner.py
================================================
import io
import logging
import struct
from abc import ABC
from typing import List, Optional
from credsweeper.common.constants import UTF_8
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.struct_content_provider import StructContentProvider
logger = logging.getLogger(__name__)
class JclassScanner(AbstractScanner, ABC):
    """Implements java .class scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures - java class"""
        if data.startswith(b"\xCA\xFE\xBA\xBE"):
            return True
        return False

    @staticmethod
    def u2(stream: io.BytesIO) -> int:
        """Extracts unsigned 16 bit big-endian

        Raises struct.error when fewer than two bytes are left in the stream.
        """
        return int(struct.unpack(">H", stream.read(2))[0])

    @staticmethod
    def get_utf8_constants(stream: io.BytesIO) -> List[str]:
        """Extracts only Utf8 constants from java ClassFile

        The stream must be positioned at constant_pool_count. Entries of other kinds are
        skipped by their fixed size (JVM specification, "The Constant Pool" section).
        Parsing stops with a warning at the first unknown tag, the constants
        collected so far are returned.
        """
        result = []
        # actual number of items is one less!
        items_counter = JclassScanner.u2(stream) - 1
        while 0 < items_counter:
            items_counter -= 1
            # uint8
            tag = int(stream.read(1)[0])
            if 1 == tag:
                # UTF-8 string in bytes may be bigger than in characters
                length = JclassScanner.u2(stream)
                data = stream.read(int(length))
                value = data.decode(encoding=UTF_8, errors="replace")
                result.append(value)
            elif tag in (3, 4, 9, 10, 11, 12, 17, 18):
                # Integer, Float, Fieldref, Methodref, InterfaceMethodref, NameAndType, Dynamic, InvokeDynamic
                _ = stream.read(4)
            elif tag in (7, 8, 16, 19, 20):
                # Class, String, MethodType, Module, Package
                _ = stream.read(2)
            elif tag in (5, 6):
                # Long, Double
                _ = stream.read(8)
                # long and double types use two indexes
                items_counter -= 1
            elif 15 == tag:
                # MethodHandle
                _ = stream.read(3)
            else:
                logger.warning("Unknown tag %s", tag)
                break
        return result

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Extracts data from binary

        The Utf8 constants of the class file are scanned as a structure.
        Returns the candidates found, or None when the data cannot be parsed.
        """
        try:
            stream = io.BytesIO(data_provider.data)
            stream.read(4)  # magic
            minor = JclassScanner.u2(stream)
            major = JclassScanner.u2(stream)
            constants = JclassScanner.get_utf8_constants(stream)
            struct_content_provider = StructContentProvider(struct=constants,
                                                            file_path=data_provider.file_path,
                                                            file_type=data_provider.file_type,
                                                            info=f"{data_provider.info}|Java.{major}.{minor}")
            # the budget is reduced by the total length of the extracted strings
            new_limit = recursive_limit_size - sum(len(x) for x in constants)
            candidates = self.structure_scan(struct_content_provider, depth, new_limit)
            return candidates
        except Exception as jclass_exc:
            logger.warning("%s:%s", data_provider.file_path, jclass_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/jks_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
import jks
from credsweeper.common.constants import Severity, Confidence
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
logger = logging.getLogger(__name__)
class JksScanner(AbstractScanner, ABC):
    """Deep scanner for java key storage (jks)"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures - jks"""
        return data.startswith(b"\xFE\xED\xFE\xED")

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Probes the key storage with every password of the configured bruteforce list.

        The first password that opens the storage produces a single dummy candidate whose
        line and value are the repr of that password. Severity and confidence depend on
        whether private or secret keys were loaded. Returns None when no password fits.
        """
        for pw_probe in self.config.bruteforce_list:
            value = repr(pw_probe)
            try:
                keystore = jks.KeyStore.loads(data_provider.data, pw_probe, try_decrypt_keys=True)
                # no exception means the probe is the right password - it becomes the value
                if keystore.private_keys or keystore.secret_keys:
                    severity, confidence = Severity.HIGH, Confidence.STRONG
                    info = f"{data_provider.info}|JKS:default password"
                    rule_name = f"JKS private key with password {value}"
                else:
                    severity, confidence = Severity.LOW, Confidence.WEAK
                    info = f"{data_provider.info}|JKS:sensitive data"
                    rule_name = f"JKS sensitive data with password {value}"
                candidate = Candidate.get_dummy_candidate(
                    self.config,  #
                    data_provider.file_path,  #
                    data_provider.file_type,  #
                    info,  #
                    rule_name)
                candidate.severity = severity
                candidate.confidence = confidence
                first_line_data = candidate.line_data_list[0]
                first_line_data.value = value
                first_line_data.line = value
                first_line_data.value_start = 0
                first_line_data.value_end = len(value)
                return [candidate]
            except Exception as jks_exc:
                logger.debug("%s:%s:%s", data_provider.file_path, pw_probe, jks_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/lang_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.struct_content_provider import StructContentProvider
logger = logging.getLogger(__name__)
class LangScanner(AbstractScanner, ABC):
    """Deep scanner for data which may be parsed into a structure (script or markup language)"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans the structure representation of the data when it is available.

        Returns None when represent_as_structure() returned None, an empty list for
        any other falsy result, otherwise the result of the structure scan.
        """
        result = data_provider.represent_as_structure()
        if result is None:
            return None
        if not result:
            return []
        struct_data_provider = StructContentProvider(struct=data_provider.structure,
                                                     file_path=data_provider.file_path,
                                                     file_type=data_provider.file_type,
                                                     info=f"{data_provider.info}|STRUCT")
        return self.structure_scan(struct_data_provider, depth, recursive_limit_size)
================================================
FILE: credsweeper/deep_scanner/lzma_scanner.py
================================================
import logging
import lzma
from abc import ABC
from pathlib import Path
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class LzmaScanner(AbstractScanner, ABC):
    """Deep scanner for lzma and xz compressed data"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures - lzma also xz"""
        return data.startswith((b"\xFD7zXZ\x00", b"\x5D\x00\x00"))

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Decompresses the data and passes the content to the recursive scan.

        A trailing ".xz" or ".lzma" suffix is dropped from the path reported for the inner content.
        Returns the candidates found, or None when decompression fails.
        """
        try:
            outer_path = Path(data_provider.file_path)
            inner_path = outer_path.as_posix()
            suffix = outer_path.suffix
            if suffix in (".xz", ".lzma"):
                inner_path = inner_path[:-len(suffix)]
            lzma_content_provider = DataContentProvider(data=lzma.decompress(data_provider.data),
                                                        file_path=inner_path,
                                                        file_type=Util.get_extension(inner_path),
                                                        info=f"{data_provider.info}|LZMA:{outer_path}")
            new_limit = recursive_limit_size - len(lzma_content_provider.data)
            return self.recursive_scan(lzma_content_provider, depth, new_limit)
        except Exception as lzma_exc:
            logger.warning("%s:%s", data_provider.file_path, lzma_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/mxfile_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
from bs4 import BeautifulSoup
from lxml import etree
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class MxfileScanner(AbstractScanner, ABC):
    """Scanner for drawio diagram"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Used to detect mxfile (drawio) format. Suppose, invocation of is_xml() was True before.

        The opening tag must appear within the first MAX_LINE_LENGTH bytes
        and the closing tag must follow it.
        """
        # NOTE(review): this statement pair was fused and its tag literals were lost in the reviewed text;
        # restored after the same opening/closing lookup used by the html detection - confirm against the repository
        mxfile_tag_pos = data.find(b"<mxfile", 0, MAX_LINE_LENGTH)
        if 0 <= mxfile_tag_pos < data.find(b"</mxfile>", mxfile_tag_pos):
            return True
        return False

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to get text data from the xml format

        Every non-empty "value" attribute of an mxCell element is parsed as html, the lines
        of its text representation are scanned with the source line of the element.
        Returns the candidates found, or None when the data cannot be processed.
        """
        try:
            lines = []
            line_numbers = []
            tree = etree.fromstring(data_provider.text)
            for element in tree.iter():
                if "mxCell" == getattr(element, "tag"):
                    line_number = element.sourceline
                    attr = getattr(element, "attrib")
                    if attr is None or not (value := attr.get("value")):
                        continue
                    if html := BeautifulSoup(value, features="html.parser"):
                        _, value_lines, __ = data_provider.simple_html_representation(html)
                        for line in value_lines:
                            # all lines of one cell share the line number of the cell
                            lines.append(line)
                            line_numbers.append(line_number)
            mxfile_data_provider = StringContentProvider(lines=lines,
                                                         line_numbers=line_numbers,
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|MXFILE")
            return self.scanner.scan(mxfile_data_provider)
        except Exception as exc:
            logger.warning(exc)
        return None
================================================
FILE: credsweeper/deep_scanner/patch_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
from credsweeper.common.constants import DiffRowType
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.patches_provider import PatchesProvider
logger = logging.getLogger(__name__)
class PatchScanner(AbstractScanner, ABC):
    """Deep scanner for data which looks like a patch (diff)"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans added and then deleted rows of the data interpreted as a patch.

        Path and info of every found line data are rewritten to point to the patch itself.
        Returns the candidates found, or None when the data cannot be processed.
        """
        try:
            candidates: List[Candidate] = []
            # the same reduced limit is used for both passes
            new_limit_size = recursive_limit_size - len(data_provider.data)
            for change_type in (DiffRowType.ADDED, DiffRowType.DELETED):
                # a fresh stream is necessary for each pass
                patch_source = [(data_provider.file_path, io.BytesIO(data_provider.data))]
                patch_content_provider = PatchesProvider(patch_source, change_type=change_type)
                for diff_file in patch_content_provider.get_scannable_files(self.config):
                    candidates.extend(self.scan(diff_file, depth, new_limit_size))
            # update the line data for deep scan only
            for candidate in candidates:
                for line_data in candidate.line_data_list:
                    line_data.path = data_provider.file_path
                    line_data.info = f"{data_provider.info}|PATCH:{line_data.info}"
            return candidates
        except Exception as patch_exc:
            logger.warning("%s:%s", data_provider.file_path, patch_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/pdf_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
from pdfminer.high_level import extract_pages
from pdfminer.layout import LAParams, LTText, LTItem
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider, MIN_DATA_LEN
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class PdfScanner(AbstractScanner, ABC):
    """Deep scanner for PDF documents"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures - pdf"""
        return data.startswith(b"%PDF-")

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Walks through the text elements of every page and scans their stripped text.

        With a positive depth a non-empty text longer than MIN_DATA_LEN goes to the recursive
        scan and a shorter one is skipped. Otherwise the text is scanned as a single line.
        Returns the candidates found, or None when the document cannot be processed.
        """
        # PyPDF2 - https://github.com/py-pdf/pypdf/issues/1328 text in table is merged without spaces
        # pdfminer.six - splits text in table to many lines. Allows to walk through elements
        try:
            candidates = []
            for page in extract_pages(io.BytesIO(data_provider.data), laparams=LAParams()):
                for element in page:
                    if not isinstance(element, LTText):
                        # layout items without text are ignored silently
                        if not isinstance(element, LTItem):
                            logger.warning("Unsupported %s", element)
                        continue
                    element_text = element.get_text().strip()
                    if depth <= 0 or not element_text:
                        string_data_provider = StringContentProvider(lines=[element_text],
                                                                     file_path=data_provider.file_path,
                                                                     file_type=data_provider.file_type,
                                                                     info=f"{data_provider.info}|PDF:{page.pageid}")
                        candidates.extend(self.scanner.scan(string_data_provider))
                    elif MIN_DATA_LEN < len(element_text):
                        pdf_content_provider = DataContentProvider(data=element_text.encode(),
                                                                   file_path=data_provider.file_path,
                                                                   file_type=data_provider.file_type,
                                                                   info=f"{data_provider.info}|PDF:{page.pageid}")
                        new_limit = recursive_limit_size - len(pdf_content_provider.data)
                        candidates.extend(self.recursive_scan(pdf_content_provider, depth, new_limit))
            return candidates
        except Exception as pdf_exc:
            logger.warning("%s:%s", data_provider.file_path, pdf_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/pkcs_scanner.py
================================================
import base64
import logging
from abc import ABC
from typing import List, Optional, Union
from credsweeper.common.constants import Severity, Confidence
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class PkcsScanner(AbstractScanner, ABC):
    """Implements pkcs12 scanning"""

    @staticmethod
    def match(data: Union[bytes, bytearray]) -> bool:
        """Matched ASN1 structure

        True when Util.get_asn1_size() gives a non-zero size for the data.
        """
        return bool(Util.get_asn1_size(data))

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to scan PKCS12 to open with standard password

        Every item of the configured bruteforce list is probed (an empty item means "no password").
        Returns a single dummy candidate for the first probe that loads a key, an empty list when
        the loaded key does not pass the check, or None when no probe loads a key.
        """
        for pw_probe in self.config.bruteforce_list:
            try:
                password = pw_probe.encode() if pw_probe else None
                if pkey := Util.load_pk(data_provider.data, password):
                    if not Util.check_pk(pkey):
                        logger.debug("False alarm %s", data_provider.info)
                        return []
                    candidate = Candidate.get_dummy_candidate(
                        self.config,  #
                        data_provider.file_path,  #
                        data_provider.file_type,  #
                        info=f"{data_provider.info}|PKCS_PASSWORD:{repr(password)}",  #
                        rule_name=f"PKCS with password {repr(pw_probe)}" if pw_probe else "PKCS without password")
                    # the whole container is kept as the line in base64 form, the value is the password
                    candidate.line_data_list[0].line = base64.b64encode(data_provider.data).decode()
                    candidate.line_data_list[0].value = repr(password)
                    # high severity is assigned to private key rules
                    candidate.severity = Severity.HIGH
                    candidate.confidence = Confidence.STRONG
                    return [candidate]
            except Exception as pkcs_exc:
                # a failed probe is expected - go on with the next one
                logger.debug("%s:%s:%s", data_provider.file_path, pw_probe, pkcs_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/png_scanner.py
================================================
import logging
import struct
from abc import ABC
from typing import List, Optional, Generator, Tuple
from credsweeper.common.constants import LATIN_1, UTF_8
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
logger = logging.getLogger(__name__)
class PngScanner(AbstractScanner, ABC):
    """Deep scanner for textual chunks of a PNG image"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Returns True if the data starts with the PNG signature"""
        return data.startswith(b"\x89PNG\r\n\x1a\n")

    @staticmethod
    def yield_png_chunks(data: bytes) -> Generator[Tuple[int, str, bytes], None, None]:
        """Iterates over PNG chunks and yields (data offset, description, payload) for tEXt, zTXt and iTXt.

        The walk stops at IEND or when less than a minimal chunk is left. CRC is not verified.
        The zTXt payload is yielded without the compression method byte, the iTXt payload is
        yielded as it follows the translated keyword.

        Raises:
            ValueError: a chunk exceeds the data, a compression marker is unsupported
                or an expected null separator is missing.
        """
        position = 8  # b"\x89PNG\r\n\x1a\n"
        last_chunk_position = len(data) - 12
        while position <= last_chunk_position:
            (chunk_size,) = struct.unpack(">I", data[position:position + 4])
            chunk_type = data[position + 4:position + 8]
            position += 8
            if len(data) < position + chunk_size:
                raise ValueError(f"PNG chunk size {chunk_size} exceeds data limit 0x{position:x}")
            if b"IEND" == chunk_type:
                # https://www.w3.org/TR/png/#11IEND
                break
            if b"tEXt" == chunk_type:
                # https://www.w3.org/TR/png/#11tEXt
                keyword, text_data = data[position:position + chunk_size].split(b'\0', 1)
                yield position, f"PNG_TEXT:{keyword.decode(encoding=LATIN_1, errors='strict')}", text_data
            elif b"zTXt" == chunk_type:
                # https://www.w3.org/TR/png/#11zTXt
                keyword, ztxt_data = data[position:position + chunk_size].split(b'\0', 1)
                if not ztxt_data.startswith(b'\0'):
                    raise ValueError(f"Unsupported compression method {ztxt_data[0]}")
                yield position, f"PNG_ZTXT:{keyword.decode(encoding=LATIN_1, errors='strict')}", ztxt_data[1:]
            elif b"iTXt" == chunk_type:
                # https://www.w3.org/TR/png/#11iTXt
                keyword, itxt_data = data[position:position + chunk_size].split(b'\0', 1)
                if itxt_data.startswith(b"\x00\x00"):
                    compression = False
                elif itxt_data.startswith(b"\x01\x00"):
                    compression = True
                else:
                    raise ValueError(f"Unsupported compression {repr(itxt_data[:2])}")
                lang_tag, itxt_data = itxt_data[2:].split(b'\0', 1)
                trans_key, itxt_data = itxt_data.split(b'\0', 1)
                description = (f"PNG_ITXT_{'1' if compression else '0'}"
                               f":{keyword.decode(encoding=UTF_8)}"
                               f":{lang_tag.decode(encoding=UTF_8)}"
                               f":{trans_key.decode(encoding=UTF_8)}")
                yield position, description, itxt_data
            # skip the payload and crc (without verification)
            position += chunk_size + 4

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Passes the payload of every textual chunk to the recursive scan.

        Returns the candidates found, or None when the chunk structure cannot be processed.
        """
        try:
            candidates: List[Candidate] = []
            for chunk_offset, chunk_info, chunk_data in PngScanner.yield_png_chunks(data_provider.data):
                png_content_provider = DataContentProvider(
                    data=chunk_data,
                    file_path=data_provider.file_path,
                    file_type=data_provider.file_type,
                    info=f"{data_provider.info}|{chunk_info}:0x{chunk_offset:x}")
                new_limit = recursive_limit_size - len(chunk_data)
                candidates.extend(self.recursive_scan(png_content_provider, depth, new_limit))
            return candidates
        except Exception as exc:
            logger.warning(exc)
        return None
================================================
FILE: credsweeper/deep_scanner/pptx_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
from pptx import Presentation
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class PptxScanner(AbstractScanner, ABC):
    """Implements pptx scanning"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to scan pptx text elements for all slides

        The paragraphs of the text frames are scanned slide by slide; the info of the
        found candidates carries the 1-based slide number.
        Returns the candidates found, or None when the presentation cannot be processed.
        """
        try:
            candidates = []
            presentation = Presentation(io.BytesIO(data_provider.data))
            for n, slide in enumerate(presentation.slides):
                # the lines are collected for the current slide only, otherwise the text of
                # previous slides would be scanned again and reported with a wrong slide number
                pptx_lines = []
                for shape in slide.shapes:
                    if shape.has_text_frame:
                        for paragraph in shape.text_frame.paragraphs:
                            pptx_lines.append(paragraph.text)
                string_data_provider = StringContentProvider(lines=pptx_lines,
                                                             file_path=data_provider.file_path,
                                                             file_type=data_provider.file_type,
                                                             info=f"{data_provider.info}|PPTX:{n+1}")
                pptx_candidates = self.scanner.scan(string_data_provider)
                candidates.extend(pptx_candidates)
            return candidates
        except Exception as pptx_exc:
            logger.warning("%s:%s", data_provider.file_path, pptx_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/rpm_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
import rpmfile
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class RpmScanner(AbstractScanner, ABC):
    """Implements rpm scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
        return data.startswith(b"\xED\xAB\xEE\xDB")

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Extracts files one by one from the rpm package and launches recursive scan.

        Directories, members excluded by the configuration and members larger
        than the remaining size budget are skipped.

        Args:
            data_provider: provider holding the raw rpm bytes
            depth: recursion depth passed through to ``recursive_scan``
            recursive_limit_size: remaining size budget for extracted data

        Return:
            list of candidates found in the members, or None when the package
            could not be processed
        """
        try:
            candidates = []
            with rpmfile.open(fileobj=io.BytesIO(data_provider.data)) as rpm_file:
                for member in rpm_file.getmembers():
                    # skip directory
                    if 0 != member.isdir:
                        continue
                    if FilePathExtractor.check_exclude_file(self.config, member.name):
                        continue
                    if 0 > recursive_limit_size - member.size:
                        # `member.name` is the attribute used everywhere else in this method;
                        # NOTE(review): the member object presumably has no `filename` attribute,
                        # in which case the old log call aborted the whole scan - confirm.
                        logger.warning("%s: size %s is over limit %s depth:%s", member.name, member.size,
                                       recursive_limit_size, depth)
                        continue
                    rpm_content_provider = DataContentProvider(data=rpm_file.extractfile(member).read(),
                                                               file_path=data_provider.file_path,
                                                               file_type=Util.get_extension(member.name),
                                                               info=f"{data_provider.info}|RPM:{member.name}")
                    # the budget is reduced by the size of the really extracted data
                    new_limit = recursive_limit_size - len(rpm_content_provider.data)
                    rpm_candidates = self.recursive_scan(rpm_content_provider, depth, new_limit)
                    candidates.extend(rpm_candidates)
            return candidates
        except Exception as rpm_exc:
            # best effort: a broken package is logged and reported as "not processed"
            logger.warning("%s:%s", data_provider.file_path, rpm_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/rtf_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
from striprtf import striprtf
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class RtfScanner(AbstractScanner, ABC):
    """Implements Rich Text Format (RTF) scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures - Rich Text Format

        The data must start with the RTF header and end with a closing brace.
        """
        has_header = data.startswith(b"{\\rtf1")
        return has_header and data.endswith(b"}")

    @staticmethod
    def get_lines(text: str) -> List[str]:
        """Converts RTF markup to plain text and splits the text into lines"""
        return Util.split_text(striprtf.rtf_to_text(text))

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans the plain text extracted from RTF data.

        Return:
            candidates found in the extracted text, or None when extraction
            or scanning raised an exception
        """
        try:
            plain_lines = RtfScanner.get_lines(data_provider.text)
            provider = StringContentProvider(lines=plain_lines,
                                             file_path=data_provider.file_path,
                                             file_type=data_provider.file_type,
                                             info=f"{data_provider.info}|RTF")
            return self.scanner.scan(provider)
        except Exception as rtf_exc:
            logger.warning("%s:%s", data_provider.file_path, rtf_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/sqlite3_scanner.py
================================================
import logging
import os.path
import sqlite3
import sys
import tempfile
from abc import ABC
from typing import List, Optional, Tuple, Any, Generator
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.struct_content_provider import StructContentProvider
logger = logging.getLogger(__name__)
class Sqlite3Scanner(AbstractScanner, ABC):
    """Implements SQLite3 database scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures - SQLite Database"""
        return data.startswith(b"SQLite format 3\0")

    @staticmethod
    def __walk(sqlite3db) -> Generator[Tuple[str, Any], None, None]:
        """Yields (table name, row as dict) for every row of every user table.

        A table which cannot be read is logged and skipped, the walk goes on
        with the next table.
        """
        sqlite3db.row_factory = sqlite3.Row
        cursor = sqlite3db.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
        for table in cursor.fetchall():
            table_name = table[0]
            # The name comes from the scanned (untrusted) database: quote it as an identifier
            # so names with spaces, keywords or quotes neither break nor alter the statement.
            quoted_name = '"' + str(table_name).replace('"', '""') + '"'
            try:
                cursor.execute(f"SELECT * FROM {quoted_name}")
                for row in cursor:
                    yield table_name, dict(row)
            except sqlite3.DatabaseError as exc:
                logger.warning("Error reading table %s: %s", table_name, exc)

    @staticmethod
    def walk_sqlite(data: bytes) -> Generator[Tuple[str, Any], None, None]:
        """Yields (table name, row as dict) pairs from a serialized sqlite3 database.

        Args:
            data: raw content of the database file

        The connection is always closed and the temporary file (when one is
        needed) is always removed, even when the walk fails or is abandoned.
        """
        if sys.version_info >= (3, 11):
            # Connection.deserialize was added in version 3.11
            sqlite3db = sqlite3.connect(":memory:")
            try:
                sqlite3db.deserialize(data)  # type: ignore
                yield from Sqlite3Scanner.__walk(sqlite3db)
            finally:
                # `with connection` only commits/rolls back - it does not close
                sqlite3db.close()
        elif "nt" != os.name:
            # a tmpfile has to be used. TODO: remove when 3.10 will deprecate
            with tempfile.NamedTemporaryFile(suffix=".sqlite") as t:
                t.write(data)
                t.flush()
                sqlite3db = sqlite3.connect(t.name)
                try:
                    yield from Sqlite3Scanner.__walk(sqlite3db)
                finally:
                    sqlite3db.close()
        else:
            # windows trick: the file must be closed before sqlite opens it.
            # TODO: remove when 3.10 will deprecate
            with tempfile.NamedTemporaryFile(delete=False, suffix=".sqlite") as t:
                t.write(data)
                t.flush()
            try:
                sqlite3db = sqlite3.connect(t.name)
                try:
                    yield from Sqlite3Scanner.__walk(sqlite3db)
                finally:
                    sqlite3db.close()
            finally:
                if os.path.exists(t.name):
                    os.remove(t.name)

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Walks over all rows of the SQLite3 database and scans each row as a structure.

        Args:
            data_provider: provider holding the raw database bytes
            depth: recursion depth passed through to ``structure_scan``
            recursive_limit_size: remaining size budget; reduced by the database size

        Return:
            list of candidates found in the rows, or None when the database
            could not be processed
        """
        try:
            candidates: List[Candidate] = []
            new_limit = recursive_limit_size - len(data_provider.data)
            for table, row in self.walk_sqlite(data_provider.data):
                struct_content_provider = StructContentProvider(struct=row,
                                                                file_path=data_provider.file_path,
                                                                file_type=data_provider.file_type,
                                                                info=f"{data_provider.info}|SQLite3.{table}")
                if new_candidates := self.structure_scan(struct_content_provider, depth, new_limit):
                    candidates.extend(new_candidates)
            return candidates
        except Exception as exc:
            logger.warning(exc)
        return None
================================================
FILE: credsweeper/deep_scanner/strings_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional, Tuple
from credsweeper.common.constants import MIN_DATA_LEN
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class StringsScanner(AbstractScanner, ABC):
    """Implements known binary file scanning with ASCII strings representations"""

    @staticmethod
    def get_enumerated_lines(data: bytes) -> List[Tuple[int, str]]:
        """Collects runs of printable ASCII from binary data.

        A run consists of TAB, SPACE and visible ASCII symbols. Runs shorter
        than MIN_DATA_LEN are dropped.

        Return:
            list of (offset of the run start, run text) - the offset is used
            instead of a line number
        """
        found: List[Tuple[int, str]] = []
        start = -1
        run = bytearray()
        for position, byte in enumerate(data):
            if 0x09 == byte or 0x20 <= byte <= 0x7E:
                if start < 0:
                    # remember where the current run begins
                    start = position
                run.append(byte)
            else:
                # any other byte terminates the run; keep valuable runs only
                if MIN_DATA_LEN <= len(run):
                    found.append((start, run.decode("ascii")))
                start = -1
                run.clear()
        # the data may end inside a run
        if MIN_DATA_LEN <= len(run):
            found.append((start, run.decode("ascii")))
        return found

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans ASCII strings found in binary data; offsets are used as line numbers"""
        strings = StringsScanner.get_enumerated_lines(data_provider.data)
        if not strings:
            return None if strings is None else []
        string_data_provider = StringContentProvider(lines=[text for _, text in strings],
                                                     line_numbers=[offset for offset, _ in strings],
                                                     file_path=data_provider.file_path,
                                                     file_type=data_provider.file_type,
                                                     info=f"{data_provider.info}|STRINGS")
        return self.scanner.scan(string_data_provider)
================================================
FILE: credsweeper/deep_scanner/tar_scanner.py
================================================
import contextlib
import io
import logging
import tarfile
from abc import ABC
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class TarScanner(AbstractScanner, ABC):
    """Implements tar scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures

        Checks the magic at offset 257, one of two accepted 3-byte sequences
        right after it, and finally the checksum of the first header block.
        """
        if len(data) < 512:
            return False
        if data[257:262] != b"\x75\x73\x74\x61\x72":
            return False
        if data[262:265] not in (b"\x00\x30\x30", b"\x20\x20\x00"):
            return False
        # any failure during checksum evaluation means "not a tar"
        with contextlib.suppress(Exception):
            expected = tarfile.nti(data[148:156])  # type: ignore
            unsigned_chksum, signed_chksum = tarfile.calc_chksums(data)  # type: ignore
            if expected in (unsigned_chksum, signed_chksum):
                return True
        return False

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Extracts regular files one by one from a tar archive and scans them recursively.

        Return:
            candidates of all scanned members, or None when the archive
            could not be processed
        """
        try:
            found = []
            with tarfile.TarFile(fileobj=io.BytesIO(data_provider.data)) as archive:
                for member in archive.getmembers():
                    # only regular files are of interest (directories, links, etc. are skipped)
                    if not member.isreg():
                        continue
                    if FilePathExtractor.check_exclude_file(self.config, member.name):
                        continue
                    if recursive_limit_size - member.size < 0:
                        logger.warning("%s: size %s is over limit %s depth:%s", member.name, member.size,
                                       recursive_limit_size, depth)
                        continue
                    with archive.extractfile(member) as member_file:
                        tar_content_provider = DataContentProvider(data=member_file.read(),
                                                                   file_path=data_provider.file_path,
                                                                   file_type=Util.get_extension(member.name),
                                                                   info=f"{data_provider.info}|TAR:{member.name}")
                        # the budget is reduced by the size of the really extracted data
                        remaining = recursive_limit_size - len(tar_content_provider.data)
                        found.extend(self.recursive_scan(tar_content_provider, depth, remaining))
            return found
        except Exception as tar_exc:
            # too many exception types might be produced with broken tar
            logger.warning("%s:%s", data_provider.file_path, tar_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/tmx_scanner.py
================================================
import logging
from abc import ABC
from typing import List, Optional
from lxml import etree
from credsweeper.common.constants import MIN_DATA_LEN, MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class TmxScanner(AbstractScanner, ABC):
    """Realises tmX files scanning for values only. Image tags are skipped."""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Used to detect tm7,tm6,etc. (ThreatModeling) format.

        Return:
            True when a known opening tag is found within the first
            MAX_LINE_LENGTH bytes and its closing tag is found after it
        """
        # NOTE(review): the tag literals were empty (`[(b""), (b"")]`), which made the tuple
        # unpacking raise ValueError on every call. The pairs below are a reconstruction of
        # the root elements of threat model files - confirm against the project history.
        for opening_tag, closing_tag in [(b"<ThreatModel", b"</ThreatModel>"),
                                         (b"<KnowledgeBase", b"</KnowledgeBase>")]:
            opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
            if 0 <= opening_pos < data.find(closing_tag, opening_pos):
                # opening and closing tags were found - suppose it is a tmX file
                return True
        return False

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Parses data as xml, collects element texts and scans them as text lines.

        Elements with "Image" in the tag and texts shorter than MIN_DATA_LEN
        are skipped.

        Return:
            candidates found in the collected texts, or None when the data
            could not be processed
        """
        try:
            lines = []
            # the format is always in single line xml, so line numbers are not actual
            tree = etree.fromstring(data_provider.data)
            for element in tree.iter():
                tag = Util.extract_element_data(element, "tag")
                if "Image" in tag:
                    continue
                text = Util.extract_element_data(element, "text")
                if MIN_DATA_LEN > len(text):
                    continue
                lines.append(text)
            tmx_data_provider = StringContentProvider(lines=lines,
                                                      file_path=data_provider.file_path,
                                                      file_type=data_provider.file_type,
                                                      info=f"{data_provider.info}|TMX")
            return self.scanner.scan(tmx_data_provider)
        except Exception as exc:
            logger.warning("Cannot processed tmX file %s %s", str(data_provider.file_path), str(exc))
        return None
================================================
FILE: credsweeper/deep_scanner/xlsx_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
import pandas as pd
from credsweeper.credentials.augment_candidates import augment_candidates
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class XlsxScanner(AbstractScanner, ABC):
    """Implements xlsx scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures

        The signature belongs to Compound File Binary Format: doc, xls, ppt, msi, msg
        """
        return data.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans every cell of every sheet and, additionally, each row as one line.

        Cell candidates are collected first; candidates found in the
        tab-joined row are merged in with ``augment_candidates``.

        Return:
            candidates of all sheets, or None when the workbook could not be
            processed
        """
        try:
            found = []
            workbook = pd.read_excel(io.BytesIO(data_provider.data), sheet_name=None, header=None)
            for sheet_name, sheet_data in workbook.items():
                sheet_info = f"{data_provider.info}|{sheet_name}"
                # replace open xml carriage returns _x000D_ before line feed only
                cleaned = sheet_data.replace(to_replace="_x000D_\n", value='\n', regex=True)
                df = cleaned.fillna('').astype(str)
                for row_number, row in enumerate(df.values, start=1):
                    for col_pos, cell in enumerate(row):
                        column_name = Util.get_excel_column_name(col_pos)
                        cell_provider = StringContentProvider(lines=cell.splitlines(),
                                                              file_path=data_provider.file_path,
                                                              file_type=data_provider.file_type,
                                                              info=f"{sheet_info}:{column_name}{row_number}")
                        found.extend(self.scanner.scan(cell_provider))
                    # the whole row is scanned as a single tab separated line too
                    row_provider = StringContentProvider(lines=['\t'.join(row)],
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{sheet_info}:R{row_number}")
                    augment_candidates(found, self.scanner.scan(row_provider))
            return found
        except Exception as xlsx_exc:
            logger.warning("%s:%s", data_provider.file_path, xlsx_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/xml_scanner.py
================================================
import logging
import re
from abc import ABC
from typing import List, Optional
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
logger = logging.getLogger(__name__)
class XmlScanner(AbstractScanner, ABC):
    """Realises xml scanning"""

    # A well-formed XML must start from < or a whitespace character
    XML_FIRST_BRACKET_PATTERN = re.compile(rb"^\s*<")
    # the first opening tag: < followed by 1..256 alphanumeric or underscore characters
    XML_OPENING_TAG_PATTERN = re.compile(rb"<([0-9A-Za-z_]{1,256})")

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Used to detect xml format from raw bytes

        Return:
            True when the data starts with a bracket (after optional
            whitespace) and the first opening tag found within the first
            MAX_LINE_LENGTH bytes has a matching closing tag after it
        """
        if not XmlScanner.XML_FIRST_BRACKET_PATTERN.search(data, 0, MAX_LINE_LENGTH):
            return False
        first_bracket_match = XmlScanner.XML_OPENING_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH)
        if first_bracket_match is None:
            return False
        start_pos = first_bracket_match.start()
        # A closing tag is "</name>". Without the "</" prefix the opening tag "<name>"
        # matched itself and any tag-like data was accepted as xml.
        closing_tag = b"</" + first_bracket_match.group(1) + b">"
        return start_pos < data.find(closing_tag, start_pos)

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Tries to represent data as xml text and scan as text lines

        Return:
            candidates found in the xml lines; an empty list when
            ``represent_as_xml`` returned a false value other than None;
            None when it returned None
        """
        if result := data_provider.represent_as_xml():
            string_data_provider = StringContentProvider(lines=data_provider.lines,
                                                         line_numbers=data_provider.line_numbers,
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|XML")
            return self.scanner.scan(string_data_provider)
        return None if result is None else []
================================================
FILE: credsweeper/deep_scanner/zip_scanner.py
================================================
import io
import logging
from abc import ABC
from typing import List, Optional
from zipfile import ZipFile
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class ZipScanner(AbstractScanner, ABC):
    """Implements zip scanning"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """According https://en.wikipedia.org/wiki/List_of_file_signatures

        Accepted: normal archive (PK 03 04) and empty archive (PK 05 06).
        A spanned archive (PK 07 08) is NOT SUPPORTED.
        """
        if len(data) < 4 or not data.startswith(b"PK"):
            return False
        marker = (data[2], data[3])
        # an empty archive makes no sense for other scanners, so let it be a zip
        return marker in ((0x03, 0x04), (0x05, 0x06))

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Extracts files one by one from a zip archive and scans them recursively.

        Return:
            candidates of all scanned members, or None when the archive
            could not be processed
        """
        try:
            found = []
            with ZipFile(io.BytesIO(data_provider.data)) as archive:
                for entry in archive.infolist():
                    # directories have no content to scan
                    if entry.is_dir():
                        continue
                    if FilePathExtractor.check_exclude_file(self.config, entry.filename):
                        continue
                    if recursive_limit_size - entry.file_size < 0:
                        logger.warning("%s: size %s is over limit %s depth:%s", entry.filename, entry.file_size,
                                       recursive_limit_size, depth)
                        continue
                    with archive.open(entry) as entry_file:
                        zip_content_provider = DataContentProvider(data=entry_file.read(),
                                                                   file_path=data_provider.file_path,
                                                                   file_type=Util.get_extension(entry.filename),
                                                                   info=f"{data_provider.info}|ZIP:{entry.filename}")
                        # the budget is reduced by the size of the really extracted data
                        remaining = recursive_limit_size - len(zip_content_provider.data)
                        found.extend(self.recursive_scan(zip_content_provider, depth, remaining))
            return found
        except Exception as zip_exc:
            # too many exception types might be produced with broken zip
            logger.warning("%s:%s", data_provider.file_path, zip_exc)
        return None
================================================
FILE: credsweeper/deep_scanner/zlib_scanner.py
================================================
import logging
import zlib
from abc import ABC
from typing import List, Optional
from credsweeper.credentials.candidate import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
logger = logging.getLogger(__name__)
class ZlibScanner(AbstractScanner, ABC):
    """Implements zlib data inflate and scan"""

    @staticmethod
    def match(data: bytes | bytearray) -> bool:
        """Returns True if data looks like deflated data with zlib

        The two header bytes are validated (low nibble of the first byte is 8,
        high nibble is not above 7, the 16-bit header is a multiple of 31 and
        bit 0x20 of the second byte is not set) and then the block type bits
        of the first byte of the compressed stream are checked.
        """
        if 6 < len(data):
            cmf = data[0]
            flg = data[1]
            if 8 == (0xF & cmf) and 7 >= (cmf >> 4) and 0 == ((cmf << 8) | flg) % 31 and 0 == (0x20 & flg):
                # The block type occupies two bits (bits 1-2) of the first stream byte and
                # the value 3 is reserved. Without the mask the upper bits of the byte
                # leaked into the comparison and almost nothing was rejected.
                if 0x3 != ((data[2] >> 1) & 0x3):
                    return True
        return False

    @staticmethod
    def decompress(limit: int, data: bytes) -> bytes:
        """Returns decompressed data with a size limit or raises an exception in unusual cases

        Args:
            limit: maximal allowed size of decompressed data, must be positive
            data: zlib compressed data

        Raises:
            ValueError: the limit is not positive, the decompressed data do not
                fit the limit, the stream is truncated or has trailing data
        """
        if 0 >= limit:
            # zlib treats max_length=0 as "no limit" - that would switch the protection off
            raise ValueError(f"Wrong limit {limit}")
        zlib_obj = zlib.decompressobj()
        result = zlib_obj.decompress(data, max_length=limit)
        if zlib_obj.unconsumed_tail:
            raise ValueError(f"Limit exceeds for {len(zlib_obj.unconsumed_tail)}")
        if not zlib_obj.eof:
            raise ValueError("Truncated zlib stream")
        if zlib_obj.unused_data:
            raise ValueError(f"Unused data {len(zlib_obj.unused_data)}")
        return result

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Inflates zlib compressed data and launches recursive scan of the result.

        Return:
            candidates found in the decompressed data, or None when the data
            could not be decompressed within the limit
        """
        try:
            decompressed = ZlibScanner.decompress(recursive_limit_size, data_provider.data)
            zlib_content_provider = DataContentProvider(data=decompressed,
                                                        file_path=data_provider.file_path,
                                                        file_type=data_provider.file_type,
                                                        info=f"{data_provider.info}|ZLIB")
            new_limit = recursive_limit_size - len(decompressed)
            zlib_candidates = self.recursive_scan(zlib_content_provider, depth, new_limit)
            return zlib_candidates
        except Exception as zlib_exc:
            logger.warning("%s:%s", data_provider.file_path, zlib_exc)
        return None
================================================
FILE: credsweeper/file_handler/__init__.py
================================================
================================================
FILE: credsweeper/file_handler/abstract_provider.py
================================================
import io
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Union, Tuple, Sequence
from credsweeper.config.config import Config
from credsweeper.file_handler.content_provider import ContentProvider
class AbstractProvider(ABC):
    """Common base of the providers which turn 'paths' into scannable content."""

    def __init__(self, paths: Sequence[Union[str, Path, io.BytesIO, Tuple[Union[str, Path], io.BytesIO]]]) -> None:
        """Keeps 'paths' for later use in get_scannable_files.

        Args:
            paths: file paths list to scan or io.BytesIO or tuple with both

        """
        # assignment goes through the property setter
        self.paths = paths

    @property
    def paths(self) -> Sequence[Union[str, Path, io.BytesIO, Tuple[Union[str, Path], io.BytesIO]]]:
        """Items to scan as they were given"""
        return self.__paths

    @paths.setter
    def paths(self, paths: Sequence[Union[str, Path, io.BytesIO, Tuple[Union[str, Path], io.BytesIO]]]) -> None:
        """Replaces the stored items to scan"""
        self.__paths = paths

    @abstractmethod
    def get_scannable_files(self, config: Config) -> Sequence[ContentProvider]:
        """Builds content providers for analysis from the attribute "paths".

        Args:
            config: credsweeper configuration

        Return:
            content providers to analyse

        """
        raise NotImplementedError()
================================================
FILE: credsweeper/file_handler/analysis_target.py
================================================
from functools import cached_property
from typing import List, Optional
from credsweeper.file_handler.descriptor import Descriptor
class AnalysisTarget:
    """One line (or one chunk of a long line) to analyse together with its context.

    All accessors are cached properties: the value is evaluated on the first
    access and then stored in the instance.
    """

    def __init__(
        self,
        line_pos: int,
        lines: List[str],
        line_nums: List[int],
        descriptor: Descriptor,
        line: Optional[str] = None,
        offset: Optional[int] = None,
    ):
        self.__descriptor = descriptor
        self.__lines = lines
        self.__line_nums = line_nums
        self.__line_pos = line_pos
        # both are given only for a chunk of a split line
        self.__line = line
        self.__offset = offset

    @cached_property
    def offset(self) -> Optional[int]:
        """Offset of the chunk; not None means the original line was split into chunks"""
        return self.__offset

    @cached_property
    def line(self) -> str:
        """The chunk text for a chunked target, otherwise the line at line_pos"""
        return self.__lines[self.__line_pos] if self.__line is None else self.__line

    @cached_property
    def line_len(self) -> int:
        """Length of line"""
        return len(self.line)

    @cached_property
    def line_strip(self) -> str:
        """line without leading and trailing whitespaces"""
        return self.line.strip()

    @cached_property
    def line_strip_len(self) -> int:
        """Length of line_strip"""
        return len(self.line_strip)

    @cached_property
    def line_lower(self) -> str:
        """line in lower case"""
        return self.line.lower()

    @cached_property
    def line_lower_strip(self) -> str:
        """line_lower without leading and trailing whitespaces"""
        return self.line_lower.strip()

    @cached_property
    def lines(self) -> List[str]:
        """All lines the target belongs to"""
        return self.__lines

    @cached_property
    def lines_len(self) -> int:
        """Number of lines"""
        return len(self.__lines)

    @cached_property
    def line_pos(self) -> int:
        """Index of the target line in lines"""
        return self.__line_pos

    @cached_property
    def line_num(self) -> int:
        """Number of the target line taken from line_nums"""
        return self.__line_nums[self.__line_pos]

    @cached_property
    def line_nums(self) -> List[int]:
        """Numbers of all lines"""
        return self.__line_nums

    @cached_property
    def file_path(self) -> Optional[str]:
        """path of the descriptor"""
        return self.__descriptor.path

    @cached_property
    def file_type(self) -> Optional[str]:
        """extension of the descriptor"""
        return self.__descriptor.extension

    @cached_property
    def info(self) -> Optional[str]:
        """info of the descriptor"""
        return self.__descriptor.info

    @cached_property
    def descriptor(self) -> Descriptor:
        """The descriptor itself"""
        return self.__descriptor
================================================
FILE: credsweeper/file_handler/byte_content_provider.py
================================================
import logging
from functools import cached_property
from typing import List, Optional, Generator
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class ByteContentProvider(ContentProvider):
    """Allow to scan byte sequence instead of extra reading a file"""

    def __init__(
            self,  #
            content: bytes,  #
            file_path: Optional[str] = None,  #
            file_type: Optional[str] = None,  #
            info: Optional[str] = None) -> None:
        """
        Parameters:
            content: The bytes are transformed to an array of lines with split by new line character.
            file_path, file_type, info: passed to the base class as is

        """
        super().__init__(file_path=file_path, file_type=file_type, info=info)
        # lines are produced lazily on the first access to the property
        self.__lines: Optional[List[str]] = None
        self.__data = content

    @cached_property
    def data(self) -> Optional[bytes]:
        """The stored bytes; None after free()"""
        return self.__data

    def free(self) -> None:
        """Drops the bytes, the lines and the cached property values to reduce memory usage"""
        self.__data = None
        self.__lines = None
        for cached_name in ("data", "lines"):
            # cached_property keeps the value in the instance dictionary
            self.__dict__.pop(cached_name, None)

    @cached_property
    def lines(self) -> List[str]:
        """Text lines decoded from the bytes; empty list when the bytes are not a text"""
        if self.__lines is None:
            text = Util.decode_text(self.__data)
            if text is not None:
                self.__lines = Util.split_text(text)
            else:
                # avoid extra warnings for the hypothesis
                if not self.info.endswith("|BASE64|RAW"):
                    sample = repr(self.__data[:32]) if isinstance(self.__data, bytes) else "NONE"
                    logger.warning("Binary data detected %s %s %s", self.file_path, self.info, sample)
                self.__lines = []
        return self.__lines if self.__lines is not None else []

    def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
        """Produces analysis targets for the lines of the content.

        Args:
            min_len: minimal line length to scan

        Return:
            analysis targets based on every row in a content

        """
        return self.lines_to_targets(min_len, self.lines)
================================================
FILE: credsweeper/file_handler/content_provider.py
================================================
import logging
from abc import ABC, abstractmethod
from functools import cached_property
from typing import List, Optional, Generator
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.descriptor import Descriptor
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class ContentProvider(ABC):
    """Base class to provide access to analysis targets for scanned object."""

    def __init__(
            self,  #
            file_path: Optional[str] = None,  #
            file_type: Optional[str] = None,  #
            info: Optional[str] = None) -> None:
        """
        Parameters:
            file_path: optional string. Might be specified if you know the file name where data were taken from.
            file_type: optional string. File extension e.g. ".java". It might be obtained from file_path if not given.
            info: optional string. Any information to help understand how a credential was found.

        """
        if file_type is None:
            # only a missing type is derived from the path - an empty string is kept
            file_type = Util.get_extension(file_path)
        self.__descriptor = Descriptor(file_path or "", file_type, info or "")

    @abstractmethod
    def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
        """Produces analysis targets of the content.

        Args:
            min_len: minimal line length to scan

        Return:
            row objects to analysing

        """
        raise NotImplementedError()

    @cached_property
    def descriptor(self) -> Descriptor:
        """Descriptor built from path, type and info"""
        return self.__descriptor

    @cached_property
    def file_path(self) -> str:
        """path of the descriptor"""
        return self.__descriptor.path

    @cached_property
    def file_type(self) -> str:
        """extension of the descriptor"""
        return self.__descriptor.extension

    @cached_property
    def info(self) -> str:
        """info of the descriptor"""
        return self.__descriptor.info

    @cached_property
    @abstractmethod
    def data(self) -> Optional[bytes]:
        """Raw bytes of the content - must be implemented in a subclass"""
        raise NotImplementedError(__name__)

    @abstractmethod
    def free(self) -> None:
        """Releases data after scan to reduce memory usage - must be implemented in a subclass"""
        raise NotImplementedError(__name__)

    def lines_to_targets(
            self,  #
            min_len: int,
            lines: List[str],  #
            line_nums: Optional[List[int]] = None) -> Generator[AnalysisTarget, None, None]:
        """Yields a target for each line; a line over MAX_LINE_LENGTH is yielded as chunks.

        Args:
            min_len: lines with shorter stripped text are skipped
            lines: text lines
            line_nums: numbers of the lines; plain numeration from 1 is used when
                not given or when the size does not match the lines

        """
        total = len(lines)
        if line_nums is not None and len(line_nums) != total:
            logger.warning("Line numerations %s does not match lines %s. Plain numeration applied", len(line_nums),
                           total)
            line_nums = None
        if line_nums is None:
            line_nums = list(range(1, total + 1))
        for line_pos, line in enumerate(lines):
            if len(line.strip()) < min_len:
                # Ignore target if stripped part is too short for all types
                continue
            if len(line) <= MAX_LINE_LENGTH:
                yield AnalysisTarget(line_pos, lines, line_nums, self.descriptor)
                continue
            # a long line is analysed by chunks; the chunk start is kept as offset
            for chunk_start, chunk_end in Util.get_chunks(len(line)):
                yield AnalysisTarget(
                    line_pos=line_pos,  #
                    lines=lines,  #
                    line_nums=line_nums,  #
                    descriptor=self.descriptor,  #
                    line=line[chunk_start:chunk_end],  #
                    offset=chunk_start)
================================================
FILE: credsweeper/file_handler/data_content_provider.py
================================================
import json
import logging
import warnings
from functools import cached_property
from typing import List, Optional, Any, Generator, Callable, Tuple
import yaml
from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning
from credsweeper.common.constants import MIN_DATA_LEN
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.utils.util import Util
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning, module='bs4')
logger = logging.getLogger(__name__)
# <t>12345678</t> - minimal xml with a credential
MIN_XML_LEN = 16
class DataContentProvider(ContentProvider):
"""Dummy raw provider to keep bytes"""
def __init__(
        self,  #
        data: bytes,  #
        file_path: Optional[str] = None,  #
        file_type: Optional[str] = None,  #
        info: Optional[str] = None) -> None:
    """
    Parameters:
        data: byte sequence to be stored for deep analysis
        file_path, file_type, info: passed to the base class as is

    """
    super().__init__(file_path=file_path, file_type=file_type, info=info)
    self.__data = data
    # the size is used to limit extra memory consumption during html combination
    self.__html_lines_size = len(data)
    # decoded text is produced lazily by the `text` property
    self.__text: Optional[str] = None
    # results of the different representations - nothing is recognized yet
    self.structure: Optional[List[Any]] = None
    self.decoded: Optional[bytes] = None
    self.lines: List[str] = []
    self.line_numbers: List[int] = []
@cached_property
def data(self) -> Optional[bytes]:
    """Read-only access to the stored bytes; the property is used in deep scan"""
    stored = self.__data
    return stored
def free(self) -> None:
    """Release the content and everything derived from it to reduce memory usage after scan"""
    self.__data = None
    self.__text = None
    # cached_property stores the computed value in the instance dict - drop those copies too
    for cached_name in ("data", "text"):
        if cached_name in self.__dict__:
            delattr(self, cached_name)
    self.structure = None
    self.decoded = None
    self.lines = []
    self.line_numbers = []
@cached_property
def text(self) -> str:
    """Text produced by Util.decode_text from the stored bytes. Empty str when nothing was decoded"""
    decoded = self.__text
    if decoded is None:
        produced = Util.decode_text(self.__data)
        decoded = produced if produced else ''
        self.__text = decoded
    return decoded
def __is_structure(self) -> bool:
    """Check whether a structure was recognized: only a non-empty dict or a non-empty list counts"""
    candidate = self.structure
    if isinstance(candidate, (dict, list)):
        return 0 < len(candidate)
    return False
def represent_as_structure(self) -> Optional[bool]:
    """Tries JSON, NDJSON, Python and YAML parsers in turn. Stores the result to internal structure

    Return:
        True if some structure found
        False if no data found
        None if the format is not acceptable

    """
    text = self.text
    if len(text) < MIN_DATA_LEN:
        return False

    # JSON & NDJSON - attempted only when all characteristic marks are present
    if all(mark in text for mark in ('{', '}', '"', ':')):
        try:
            self.structure = json.loads(text)
            logger.debug("CONVERTED from json")
        except Exception as exc:
            logger.debug("Cannot parse as json:%s %s", exc, self.data)
        else:
            if self.__is_structure():
                return True
        try:
            # each line must be in json format, otherwise - exception rises
            self.structure = [json.loads(line) for line in text.splitlines()]
            logger.debug("CONVERTED from ndjson")
        except Exception as exc:
            logger.debug("Cannot parse as ndjson:%s %s", exc, self.data)
            self.structure = None
        else:
            if self.__is_structure():
                return True
    else:
        logger.debug("Data do not contain { - weak JSON")

    # more than two line breaks of either kind are required by the Python and YAML probes
    multiline = 2 < text.count('\n') or 2 < text.count('\r')
    has_quotes = '"' in text or "'" in text

    # Python - search only in sources with strings
    try:
        if (';' in text or multiline) and has_quotes:
            self.structure = Util.parse_python(text)
            logger.debug("CONVERTED from Python")
        else:
            logger.debug("Data do not contain line feed - weak PYTHON")
    except Exception as exc:
        logger.debug("Cannot parse as Python:%s %s", exc, self.data)
    else:
        if self.__is_structure():
            return True

    # YAML - almost always recognized
    try:
        if ':' in text and multiline:
            self.structure = yaml.safe_load(text)
            logger.debug("CONVERTED from yaml")
        else:
            logger.debug("Data do not contain colon mark - weak YAML")
    except Exception as exc:
        logger.debug("Cannot parse as yaml:%s %s", exc, self.data)
    else:
        if self.__is_structure():
            return True

    # none of above
    return None
def represent_as_xml(self) -> Optional[bool]:
    """Tries to read data as xml and fills `lines` and `line_numbers` on success

    Return:
        True if reading was successful
        False if no data found
        None if the format is not acceptable

    """
    if MIN_XML_LEN > len(self.text):
        return False
    try:
        # A closing tag marker is required together with the brackets.
        # Probing for an empty string is always true and would not filter anything.
        if '<' in self.text and '>' in self.text and "</" in self.text:
            xml_text = self.text.splitlines()
            self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
            logger.debug("CONVERTED from xml")
            return bool(self.lines and self.line_numbers)
        logger.debug("Weak data to parse as XML")
    except Exception as exc:
        # best effort: any parser failure only means that the data are not XML
        logger.debug("Cannot parse as XML:%s %s", exc, self.data)
    return None
def _check_multiline_cell(self, cell: Tag) -> Optional[Tuple[int, str]]:
    """Classify the text of a table cell.

    A cell with exactly one non-blank line gives (line number, stripped line) back to the caller.
    A cell with several non-blank lines is appended to `lines` / `line_numbers` as multiline text
    and None is returned. None is returned for a cell without text too.

    """
    # use not stripped get_text, otherwise all format is cleaned
    raw_lines = cell.get_text().splitlines()
    numbered = [(cell.sourceline + shift, stripped)
                for shift, raw in enumerate(raw_lines)
                if (stripped := raw.strip())]
    if 1 == len(numbered):
        return numbered[0]
    if numbered:
        # the cell will be analyzed as multiline text
        self.line_numbers.extend(number for number, _ in numbered)
        self.lines.extend(stripped for _, stripped in numbered)
        self.__html_lines_size += sum(len(stripped) for _, stripped in numbered)
    return None
@staticmethod
def simple_html_representation(html: BeautifulSoup) -> Tuple[List[int], List[str], int]:
    """Simple parse as the page is displayed to user.

    Returns the 1-based numbers of non-blank lines of the extracted text, the stripped lines
    and the total length of those lines.

    """
    # use dedicated variable to deal with yapf and flake
    tags_to_split = [
        "p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "div", "th", "td"
    ]
    # a tab is appended to each listed tag before the text is extracted
    for splitter in html.find_all(tags_to_split):
        splitter.append('\t')
    kept = [(number, stripped)
            for number, raw in enumerate(html.get_text().splitlines(), start=1)
            if (stripped := raw.strip())]
    line_numbers: List[int] = [number for number, _ in kept]
    lines: List[str] = [stripped for _, stripped in kept]
    return line_numbers, lines, sum(len(stripped) for stripped in lines)
@staticmethod
def _table_depth_reached(table: Tag, depth: int) -> bool:
    """Walk from the tag up to the document root and spend `depth` on each enclosing table.

    Returns False when the BeautifulSoup root is reached, True when the depth became negative
    on the way or when a node without a parent is met.

    """
    node = table
    remaining = depth
    while True:
        parent = node.parent
        if not parent:
            return True
        if isinstance(parent, BeautifulSoup):
            return False
        if remaining < 0:
            return True
        if parent.name == "table":
            remaining -= 1
        node = parent
def _table_representation(
self, #
table: Tag, #
depth: int, #
recursive_limit_size: int, #
keywords_required_substrings_check: Callable[[str], bool]):
"""
transform table if table cell is assigned to header cell
make from cells a chain like next is assigned to previous

The first row of the table is used as a header: texts of its single-line cells which pass
keywords_required_substrings_check are remembered per column. Single-line cells of the following rows
are appended to `lines` as "header : cell" when the header of the column was remembered.
Cells of a row are also combined with the leading cell of the row as "leading : cell".
Nothing is returned - `lines`, `line_numbers` and the html size counter are updated in place.

Args:
table: the table tag to be explored
depth: allowed nesting depth, checked with _table_depth_reached
recursive_limit_size: processing stops when the html size counter exceeds the value
keywords_required_substrings_check: predicate which is called with lower-cased text of a cell
"""
if DataContentProvider._table_depth_reached(table, depth):
logger.warning("Recursive depth limit was reached during HTML table combinations")
return
# header texts per column: None for a column without keywords; the list is created on the first row
table_header: Optional[List[Optional[str]]] = None
# rowspan counters per column: filled from the first row, adjusted by cells of the next rows
rowspan_columns = []
for tr in table.find_all("tr"):
if recursive_limit_size < self.__html_lines_size:
# weird tables may lead to oversize memory
break
# combinations with the leading cell which are collected for the current row
record_numbers = []
record_lines = []
# None until a single-line cell is met; then the text of the cell if it has keywords, otherwise ""
record_leading = None
if table_header is None:
table_header = []
# first row in table may be a header with <th> cells and a style, but <td> cells are searched too
for cell in tr.find_all(["th", "td"]):
if recursive_limit_size < self.__html_lines_size:
# keep the duplicates for early breaks!
break
colspan_header = int(cell.get("colspan", 1))
if td_numbered_line := self._check_multiline_cell(cell):
td_text = td_numbered_line[1]
td_text_has_keywords = keywords_required_substrings_check(td_text.lower())
# a cell with colspan occupies the same number of header columns
for _ in range(colspan_header):
rowspan_header = int(cell.get("rowspan", 1))
rowspan_columns.append(rowspan_header)
if td_text_has_keywords:
table_header.append(td_text)
else:
table_header.append(None)
if record_leading is None:
if td_text_has_keywords:
record_leading = td_text
else:
record_leading = ""
else:
record_numbers.append(td_numbered_line[0])
record_lines.append(f"{record_leading} : {td_text}")
# add single text to lines for analysis
self.line_numbers.append(td_numbered_line[0])
self.lines.append(td_text)
self.__html_lines_size += len(td_text)
else:
# empty cell or multiline cell
for _ in range(colspan_header):
# number of columns is defined with header only
rowspan_header = int(cell.get("rowspan", 1))
rowspan_columns.append(rowspan_header)
table_header.append(None)
else:
# index of the header column which matches the current cell
header_pos = 0
# not a first line in table - may be combined with a header
for cell in tr.find_all("td"):
if recursive_limit_size < self.__html_lines_size:
# keep the duplicates for early breaks!
break
# step over the columns whose rowspan counter is above 1 and decrease each passed counter
while header_pos < len(rowspan_columns) and 1 < rowspan_columns[header_pos]:
rowspan_columns[header_pos] -= 1
header_pos += 1
colspan_cell = int(cell.get("colspan", 1))
rowspan_cell = int(cell.get("rowspan", 1))
# rowspan of the cell is added to the counters of the known columns which the cell covers
for i in range(header_pos, header_pos + colspan_cell):
if i < len(rowspan_columns):
rowspan_columns[i] += rowspan_cell - 1
if td_numbered_line := self._check_multiline_cell(cell):
td_text = td_numbered_line[1]
if record_leading is None:
td_text_has_keywords = keywords_required_substrings_check(td_text.lower())
if td_text_has_keywords:
record_leading = td_text
else:
record_leading = ""
elif record_leading:
record_numbers.append(td_numbered_line[0])
record_lines.append(f"{record_leading} : {td_text}")
if header_pos < len(table_header):
if header_text := table_header[header_pos]:
self.line_numbers.append(td_numbered_line[0])
self.lines.append(f"{header_text} : {td_text}")
self.__html_lines_size += len(td_text)
else:
# empty cell or multiline cell
table_header.append(None)
header_pos += colspan_cell
if record_lines:
# add combinations with left column
self.line_numbers.extend(record_numbers)
self.lines.extend(record_lines)
self.__html_lines_size += sum(len(x) for x in record_lines)
def _html_tables_representation(
        self,
        html: BeautifulSoup,
        depth: int,
        recursive_limit_size: int,
        keywords_required_substrings_check: Callable[[str], bool]):
    """Explore cells and their combinations for every table which is found in html.

    One level of depth is spent by the call. Nothing is done when no depth is left.
    The iteration stops as soon as the html size counter exceeds recursive_limit_size.

    """
    remaining_depth = depth - 1
    if remaining_depth < 0:
        return
    for table in html.find_all("table"):
        if self.__html_lines_size > recursive_limit_size:
            logger.warning("Recursive size limit was reached during HTML table combinations")
            return
        self._table_representation(table, remaining_depth, recursive_limit_size,
                                   keywords_required_substrings_check)
def represent_as_html(
        self,  #
        depth: int,  #
        recursive_limit_size: int,  #
        keywords_required_substrings_check: Callable[[str], bool]) -> Optional[bool]:
    """Tries to read data as html and fills `lines` and `line_numbers`

    Args:
        depth: allowed depth, passed to the exploration of tables
        recursive_limit_size: limit for the html size counter, a half of it is applied for tables
        keywords_required_substrings_check: predicate for a lower-cased text of a table cell

    Return:
        True if reading was successful
        False if no data found
        None if the format is not acceptable

    """
    try:
        # A closing tag marker and a bracket are the cheapest evidence of markup.
        # Probing for an empty string is always true and lets any text with '>' reach the parser.
        if "</" in self.text and ">" in self.text:
            if html := BeautifulSoup(self.text, features="html.parser"):
                line_numbers, lines, lines_size = self.simple_html_representation(html)
                self.line_numbers.extend(line_numbers)
                self.lines.extend(lines)
                self.__html_lines_size += lines_size
                # apply recursive_limit_size/2 to reduce extra calculation
                # of all accompanying losses per objects allocation
                self._html_tables_representation(html, depth, recursive_limit_size >> 1,
                                                 keywords_required_substrings_check)
                logger.debug("CONVERTED from html")
        else:
            logger.debug("Data do not contain specific tags - weak HTML")
    except Exception as exc:
        # best effort: any parser failure only means that the data are not HTML
        logger.debug("Cannot parse as HTML:%s %s", exc, self.data)
    else:
        # no exception: the result depends on whether any line was collected
        return bool(self.lines and self.line_numbers)
    return None
def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
    """Not supported: the class provides only data storage and produces no analysis targets.

    Args:
        min_len: minimal line length to scan

    Raise:
        NotImplementedError

    """
    raise NotImplementedError
================================================
FILE: credsweeper/file_handler/descriptor.py
================================================
from dataclasses import dataclass
@dataclass(frozen=True)
class Descriptor:
"""Descriptor for file - optimize memory consumption. The dataclass is frozen, so instances are read-only"""
# path of the described file - presumably as it was given to the content provider; verify against callers
path: str
# extension of the file - NOTE(review): normalization (leading dot, letter case) is not visible here
extension: str
# additional information about the source of the data - the exact format is defined by the creator
info: str
================================================
FILE: credsweeper/file_handler/diff_content_provider.py
================================================
import logging
from dataclasses import dataclass
from functools import cached_property
from typing import List, Tuple, Generator, TypedDict, Optional, Union, Any, Dict, cast
import whatthepatch
from credsweeper.common.constants import DiffRowType
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
logger = logging.getLogger(__name__)
# One row of a file diff: the dictionary which patch2files_diff obtains with `_asdict()` from a whatthepatch change
DiffDict = TypedDict(
"DiffDict",
{
"old": Optional[int], # line number before diff
"new": Optional[int], # line number after diff
"line": Union[str, bytes], # line text; bytes are possibly since whatthepatch v1.0.4
"hunk": Any # diff hunk number - not used
})
@dataclass(frozen=True)
class DiffRowData:
"""Class for keeping data of diff row. The dataclass is frozen, so instances are read-only"""
# kind of the change which the row belongs to (DiffRowType.ADDED or DiffRowType.DELETED in preprocess_diff_rows)
line_type: DiffRowType
# number of the line which is taken from the diff for the change kind
line_numb: int
# text of the line
line: str
class DiffContentProvider(ContentProvider):
"""Provide data from a single `.patch` file.
Parameters:
file_path: path to file
change_type: set added or deleted file data to scan
diff: list of file row changes, with base elements represented as::
{
"old": line number before diff,
"new": line number after diff,
"line": line text,
"hunk": diff hunk number
}
"""
def __init__(
        self,
        file_path: str,
        change_type: DiffRowType,
        diff: List[DiffDict]) -> None:
    """Keep the row changes of one file; info is composed from the path and the change type value"""
    description = f"{file_path}:{change_type.value}"
    super().__init__(file_path=file_path, info=description)
    self.__diff = diff
    self.__change_type = change_type
@cached_property
def data(self) -> bytes:
    """Raw bytes are not available for DiffContentProvider - NotImplementedError is always raised"""
    module_name = __name__
    raise NotImplementedError(module_name)
@cached_property
def diff(self) -> List[DiffDict]:
    """Read-only access to the stored list of file row changes"""
    stored = self.__diff
    return stored
def free(self) -> None:
    """Forget the diff rows after scan to reduce memory usage"""
    self.__diff = []
    # the value cached by the `diff` property lives in the instance dict - remove it as well
    cached_name = "diff"
    if cached_name in self.__dict__:
        delattr(self, cached_name)
@staticmethod
def parse_lines_data(change_type: DiffRowType, lines_data: List[DiffRowData]) -> Tuple[List[int], List[str]]:
    """Select the rows of the requested change type.

    Args:
        change_type: set added or deleted file data to scan
        lines_data: data of all rows mentioned in diff file

    Return:
        tuple of two lists of equal size and in the order of input:
        numbers of the selected rows and texts of the selected rows

    """
    selected = [(row.line_numb, row.line) for row in lines_data if row.line_type == change_type]
    change_numbs = [number for number, _ in selected]
    all_lines = [text for _, text in selected]
    return change_numbs, all_lines
@staticmethod
def patch2files_diff(raw_patch: List[str], change_type: DiffRowType) -> Dict[str, List[DiffDict]]:
    """Generate files changes from patch for added or deleted filepaths.

    Args:
        raw_patch: git patch file content
        change_type: change type to select, DiffRowType.ADDED or DiffRowType.DELETED

    Return:
        dict ``{file path: list of file row changes}`` keyed with new paths for DiffRowType.ADDED
        and with old paths for DiffRowType.DELETED. Each row change is represented as::

            {
                "old": line number before diff,
                "new": line number after diff,
                "line": line text,
                "hunk": diff hunk number
            }

        An empty dict is returned for an empty patch, an unknown change type or a parser failure.

    """
    if not raw_patch:
        return {}

    by_new_path: Dict[str, List[DiffDict]] = {}
    by_old_path: Dict[str, List[DiffDict]] = {}
    try:
        for patch in whatthepatch.parse_patch(raw_patch):
            if patch.changes is None:
                logger.warning("Patch '%s' cannot be scanned", str(patch.header))
                continue
            rows: List[DiffDict] = [cast(DiffDict, change._asdict()) for change in patch.changes]
            header = patch.header
            if header:
                # the same list of rows is registered for both names of the file
                by_new_path[header.new_path] = rows
                by_old_path[header.old_path] = rows
        for known_type, collected in ((DiffRowType.ADDED, by_new_path), (DiffRowType.DELETED, by_old_path)):
            if change_type == known_type:
                return collected
        logger.error("Change type should be one of: '%s', '%s'; but received %s", DiffRowType.ADDED,
                     DiffRowType.DELETED, change_type)
    except Exception as exc:
        logger.warning(exc)
    return {}
@staticmethod
def preprocess_diff_rows(
        added_line_number: Optional[int],
        deleted_line_number: Optional[int],
        line: str) -> List[DiffRowData]:
    """Auxiliary function to extend diff changes.

    Args:
        added_line_number: number of added line or None
        deleted_line_number: number of deleted line or None
        line: the text line

    Return:
        rows for the text: an ADDED row when the added number is int, then a DELETED row
        when the deleted number is int; the list may be empty

    """
    numbered_kinds = (
        (DiffRowType.ADDED, added_line_number),  # indicates line was inserted
        (DiffRowType.DELETED, deleted_line_number),  # indicates line was removed
    )
    return [DiffRowData(kind, number, line) for kind, number in numbered_kinds if isinstance(number, int)]
@staticmethod
def wrong_change(change: DiffDict) -> bool:
    """Returns True (with a warning) if any of "line", "new", "old" keys is missing in the change"""
    required_keys = ("line", "new", "old")
    if all(key in change for key in required_keys):
        return False
    logger.warning("Skipping wrong change %s", change)
    return True
@staticmethod
def preprocess_file_diff(changes: List[DiffDict]) -> List[DiffRowData]:
    """Generate changed file rows from diff data with changed lines (e.g. marked + or - in diff).

    Args:
        changes: git diff by file rows data

    Return:
        diff rows data as list of row change type, line number, row content.
        Changes with missing keys, empty text or text which is not str produce no rows.

    """
    if not changes:
        return []

    rows_data: List[DiffRowData] = []
    # process diff to restore lines and their positions
    for change in changes:
        if DiffContentProvider.wrong_change(change):
            continue
        text = change["line"]
        if not text:
            logger.debug("Change has no valuable text %s", change)
            continue
        if isinstance(text, str):
            rows_data.extend(DiffContentProvider.preprocess_diff_rows(change.get("new"), change.get("old"), text))
        elif isinstance(text, (bytes, bytearray)):
            logger.warning("The feature is available with the deep scan option")
        else:
            logger.warning("Unknown type of line %s", type(text))
    return rows_data
def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
    """Preprocess file diff data to scan.

    Args:
        min_len: minimal line length to scan

    Return:
        analysis targets which lines_to_targets produces for the rows of the stored change type

    """
    rows = DiffContentProvider.preprocess_file_diff(self.__diff)
    numbers, texts = self.parse_lines_data(self.__change_type, rows)
    return self.lines_to_targets(min_len, texts, numbers)
================================================
FILE: credsweeper/file_handler/file_path_extractor.py
================================================
import io
import logging
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
from credsweeper.common.constants import MIN_DATA_LEN
from credsweeper.config.config import Config
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class FilePathExtractor:
"""Util class to browse files in directories"""
FIND_BY_EXT_RULE = "Suspicious File Extension"
located_repos: Dict[Path, Repo] = {}
@staticmethod
def apply_gitignore(detected_files: List[str]) -> List[str]:
"""Apply gitignore rules for each file.
Args:
detected_files: list of files to be checked
Return:
List of files with all files ignored by git removed
"""
filtered_files = [file_path for file_path in detected_files if FilePathExtractor.is_valid_path(file_path)]
return filtered_files
@staticmethod
def get_file_paths(config: Config, path: Union[str, Path]) -> List[str]:
"""Get all files in the directory. Automatically exclude files non-code or data files (such as .jpg).
Args:
config: credsweeper configuration
path: path to the file or directory to be scanned
Return:
List all non-excluded files in the directory
"""
path = os.path.expanduser(path) # Replace ~ character with a full path to the home directory
if not os.path.exists(path):
logger.warning("Path '%s' does not exist", path)
file_paths = []
if os.path.isfile(path):
# suppose, the file is located outside and should be scanned
if not FilePathExtractor.check_exclude_file(config, path):
file_paths.append(path)
elif os.path.isdir(path):
for dirpath, _, filenames in os.walk(path):
for filename in filenames:
file_path = os.path.join(f"{dirpath}", f"{filename}")
if FilePathExtractor.check_exclude_file(config, file_path) or os.path.islink(file_path):
continue
if os.path.isfile(file_path) and not FilePathExtractor.check_file_size(config, file_path):
file_paths.append(file_path)
else:
pass # symbolic links and so on
return file_paths
@classmethod
def is_valid_path(cls, path: str) -> bool:
    """Locate nearest .git directory to the path and check if path is ignored.

    Args:
        path: path to the file or directory to check

    Return:
        False if file is ignored by git. True otherwise,
        including the case when no repository is found up to the root.

    """
    start_directory = Path(path).parent
    # Iterate over file path to find nearest ".git" directory
    for directory in (start_directory, *start_directory.parents):
        try:
            # Cache already located repositories, so we would not need to load it for each new file
            repo = cls.located_repos.get(directory)
            if repo is None:
                # The directory must have ".git" in it. If not it occurs error.
                repo = Repo(directory)
                cls.located_repos[directory] = repo
            # True if there is no ignored file in 'path' and False if any.
            return 0 == len(repo.ignored(path))
        except (InvalidGitRepositoryError, NoSuchPathError):
            # try the directory one level up
            continue
    # the root was passed: no .git directory located in the entire path
    return True
@staticmethod
def is_find_by_ext_file(config: Config, extension: str) -> bool:
"""
Checks whether file has suspicious extension
Args:
config: Config
extension: str - may be only file name with extension
Return:
True when the feature is configured and the file extension matches
"""
return config.find_by_ext and extension in config.find_by_ext_list
@staticmethod
def check_exclude_file(config: Config, path: str) -> bool:
"""
Checks whether file should be excluded
Args:
config: Config
path: str - full path preferred
Return:
True when the file full path should be excluded according config
"""
if config.pedantic:
return False
path = path.replace('\\', '/')
lower_path = path.lower()
if config.not_allowed_path_pattern.match(lower_path):
return True
for exclude_pattern in config.exclude_patterns:
if exclude_pattern.match(lower_path):
return True
for exclude_path in config.exclude_paths:
# must be case-sensitive
if exclude_path in path:
return True
file_extension = Util.get_extension(lower_path, lower=False)
if file_extension in config.exclude_extensions:
return True
if not config.depth and file_extension in config.exclude_containers:
return True
# --depth or --doc enables scan for all documents extensions
if not (config.depth or config.doc) and file_extension in config.exclude_documents:
return True
return False
@staticmethod
def check_file_size(config: Config, reference: Union[str, Path, io.BytesIO, Tuple[Union[str, Path],
io.BytesIO]]) -> bool:
"""
Checks whether the file is over the size limit from configuration or less MIN_DATA_LEN
Args:
config: Config
reference: various types of a file reference
Return:
True when the file is oversize or less than MIN_DATA_LEN, or unsupported
"""
path = reference[1] if isinstance(reference, tuple) else reference
if isinstance(path, (str, Path)):
file_size = os.path.getsize(path)
elif isinstance(path, io.BytesIO):
current_pos = path.tell()
path.seek(0, io.SEEK_END)
file_size = path.tell() - current_pos
path.seek(current_pos, io.SEEK_SET)
else:
logger.error("Unknown path type: %s", path)
return True
if MIN_DATA_LEN > file_size:
logger.debug("Size (%s) of the file '%s' is too small", file_size, path)
return True
if isinstance(config.size_limit, int) and config.size_limit < file_size:
logger.warning("Size (%s) of the file '%s' is over limit (%s)", file_size, path, config.size_limit)
return True
return False
================================================
FILE: credsweeper/file_handler/files_provider.py
================================================
import io
import logging
from pathlib import Path
from typing import List, Optional, Union, Tuple, Sequence
from credsweeper.config.config import Config
from credsweeper.file_handler.abstract_provider import AbstractProvider
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.file_handler.text_content_provider import TextContentProvider
logger = logging.getLogger(__name__)
class FilesProvider(AbstractProvider):
    """Provider of plain os files to be analysed."""

    def __init__(self,
                 paths: Sequence[Union[str, Path, io.BytesIO, Tuple[Union[str, Path], io.BytesIO]]],
                 skip_ignored: Optional[bool] = None) -> None:
        """Initialize Files Text Provider for files from 'paths'.

        Args:
            paths: list of parent paths of files to scan
                OR tuple of path (info purpose) and io.BytesIO (reads the data from current pos)
            skip_ignored: when set, the files found for a path are filtered with gitignore rules

        """
        super().__init__(paths)
        self.skip_ignored = skip_ignored

    def get_scannable_files(self, config: Config) -> Sequence[ContentProvider]:
        """Create text content providers for all the items of "paths".

        Args:
            config: dict of credsweeper configuration

        Return:
            preprocessed file objects for analysis

        """
        providers: List[ContentProvider] = []
        for item in self.paths:
            if isinstance(item, (str, Path)):
                found_files = FilePathExtractor.get_file_paths(config, item)
                if self.skip_ignored:
                    found_files = FilePathExtractor.apply_gitignore(found_files)
                providers.extend(TextContentProvider(found_file) for found_file in found_files)
            elif isinstance(item, io.BytesIO):
                # a stream without a name gets the placeholder path
                providers.append(TextContentProvider((":memory:", item)))
            elif isinstance(item, tuple) and isinstance(item[0], (str, Path)) and isinstance(item[1], io.BytesIO):
                # suppose, all the files must be scanned
                providers.append(TextContentProvider(item))
            else:
                logger.error("Unknown path type: %s", item)
        return providers
================================================
FILE: credsweeper/file_handler/patches_provider.py
================================================
import io
import logging
from pathlib import Path
from typing import List, Union, Tuple, Sequence
from credsweeper.common.constants import DiffRowType
from credsweeper.config.config import Config
from credsweeper.file_handler.abstract_provider import AbstractProvider
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class PatchesProvider(AbstractProvider):
    """Provide data from a list of `.patch` files."""

    def __init__(self, paths: Sequence[Union[str, Path, io.BytesIO, Tuple[Union[str, Path], io.BytesIO]]],
                 change_type: DiffRowType) -> None:
        """Initialize Files Patch Provider for patch files from 'paths'.

        Args:
            paths: file paths list to scan. All files should be in `.patch` format
            change_type: DiffRowType, type of analyses changes in patch (added or deleted)

        """
        super().__init__(paths)
        self.change_type = change_type

    def load_patch_data(self, config: Config) -> List[List[str]]:
        """Loads data from each patch; items rejected by check_file_size or of unknown type are skipped"""
        raw_patches = []
        for source in self.paths:
            if FilePathExtractor.check_file_size(config, source):
                continue
            if isinstance(source, (str, Path)):
                raw_patches.append(Util.read_file(source))
                continue
            # the rest of supported sources are streams: a bare one or the second item of a tuple
            if isinstance(source, io.BytesIO):
                stream = source
            elif isinstance(source, tuple) and 1 < len(source) and isinstance(source[1], io.BytesIO):
                stream = source[1]
            else:
                logger.error("Unknown path type: %s", source)
                continue
            raw_patches.append(Util.decode_bytes(stream.read()))
        return raw_patches

    def get_files_sequence(self, raw_patches: List[List[str]]) -> Sequence[ContentProvider]:
        """Returns sequence of diff content providers - one per file of each patch"""
        files: List[ContentProvider] = []
        for raw_patch in raw_patches:
            per_file = DiffContentProvider.patch2files_diff(raw_patch, self.change_type)
            files.extend(
                DiffContentProvider(file_path=name, change_type=self.change_type, diff=rows)
                for name, rows in per_file.items())
        return files

    def get_scannable_files(self, config: Config) -> Sequence[ContentProvider]:
        """Get files to scan. Output based on the `paths` field.

        Args:
            config: dict of credsweeper configuration

        Return:
            file objects for analysing

        """
        return self.get_files_sequence(self.load_patch_data(config))
================================================
FILE: credsweeper/file_handler/string_content_provider.py
================================================
from functools import cached_property
from typing import List, Optional, Generator
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
class StringContentProvider(ContentProvider):
    """Provider performs scan simple text lines"""

    def __init__(
            self,
            lines: List[str],
            line_numbers: Optional[List[int]] = None,
            file_path: Optional[str] = None,
            file_type: Optional[str] = None,
            info: Optional[str] = None) -> None:
        """
        Parameters:
            lines: text lines to be processed
            line_numbers: matched line numbers for lines if the order is not natural.
                Otherwise, it will be filled with natural order from 1.

        """
        super().__init__(file_path=file_path, file_type=file_type, info=info)
        self.__lines = lines
        # given numbers are kept only when amounts are equal
        numbers_match = line_numbers is not None and len(lines) == len(line_numbers)
        self.__line_numbers = line_numbers if numbers_match else None

    @cached_property
    def data(self) -> bytes:
        """Raw bytes are not available for StringContentProvider - NotImplementedError is always raised"""
        module_name = __name__
        raise NotImplementedError(module_name)

    def free(self) -> None:
        """Forget lines and their numbers after scan to reduce memory usage"""
        self.__lines = []
        self.__line_numbers = []
        # values of cached properties live in the instance dict - remove them as well
        for cached_name in ("lines", "line_numbers"):
            if cached_name in self.__dict__:
                delattr(self, cached_name)

    @cached_property
    def lines(self) -> List[str]:
        """lines RO getter for StringContentProvider"""
        stored = self.__lines
        return stored

    @cached_property
    def line_numbers(self) -> List[int]:
        """line_numbers RO getter: natural order from 1 is generated when stored numbers do not match lines"""
        numbers = self.__line_numbers
        if numbers is None or len(self.__lines) != len(numbers):
            numbers = list(range(1, 1 + len(self.__lines))) if self.__lines else []
            self.__line_numbers = numbers
        return numbers

    def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
        """Return lines to scan.

        Args:
            min_len: minimal line length to scan

        Return:
            analysis targets which lines_to_targets produces for the lines and their numbers

        """
        return self.lines_to_targets(min_len, self.lines, self.line_numbers)
================================================
FILE: credsweeper/file_handler/struct_content_provider.py
================================================
import logging
from functools import cached_property
from typing import Optional, Any, Generator
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
logger = logging.getLogger(__name__)
class StructContentProvider(ContentProvider):
    """Content provider to keep structured data"""

    def __init__(
            self,
            struct: Any,
            file_path: Optional[str] = None,
            file_type: Optional[str] = None,
            info: Optional[str] = None) -> None:
        """
        Parameters:
            struct: Various structure (string, dictionary, list)
            file_path: passed through to ContentProvider
            file_type: passed through to ContentProvider
            info: passed through to ContentProvider

        """
        super().__init__(file_path=file_path, file_type=file_type, info=info)
        self.__struct = struct

    @cached_property
    def data(self) -> bytes:
        """Raw bytes are not available for StructContentProvider - NotImplementedError is always raised"""
        module_name = __name__
        raise NotImplementedError(module_name)

    @cached_property
    def struct(self) -> Any:
        """Read-only access to the stored structure"""
        stored = self.__struct
        return stored

    def free(self) -> None:
        """Forget the structure after scan to reduce memory usage"""
        self.__struct = None
        # the value cached by the `struct` property lives in the instance dict - remove it as well
        cached_name = "struct"
        if cached_name in self.__dict__:
            delattr(self, cached_name)

    def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
        """Not supported: the class provides only data storage and produces no analysis targets.

        Args:
            min_len: minimal line length to scan

        Raise:
            NotImplementedError

        """
        raise NotImplementedError
================================================
FILE: credsweeper/file_handler/text_content_provider.py
================================================
import io
import logging
from functools import cached_property
from pathlib import Path
from typing import List, Optional, Union, Tuple, Generator
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class TextContentProvider(ContentProvider):
    """Provide access to analysis targets for full-text file scanning.

    Parameters:
        file_path: path to a file, or a tuple of the path and an io.BytesIO object with the content
        file_type: forwarded to the parent initializer
        info: forwarded to the parent initializer
    """

    def __init__(self,
                 file_path: Union[str, Path, Tuple[Union[str, Path], io.BytesIO]],
                 file_type: Optional[str] = None,
                 info: Optional[str] = None) -> None:
        if isinstance(file_path, tuple):
            # the tuple form carries the path first and the in-memory stream second
            _path, self.__io = str(file_path[0]), file_path[1]
        else:
            _path, self.__io = str(file_path), None
        self.__data: Optional[bytes] = None
        self.__lines: Optional[List[str]] = None
        super().__init__(file_path=_path, file_type=file_type, info=info)

    @cached_property
    def data(self) -> Optional[bytes]:
        """Read-only getter of the raw content; it is loaded on the first access"""
        if self.__data is not None:
            return self.__data
        if isinstance(self.__io, io.BytesIO) and self.__io:
            # the content was supplied as a stream
            self.__data = self.__io.read()
        else:
            self.__data = Util.read_data(self.file_path)
        return self.__data

    def free(self) -> None:
        """Release data, lines and the stream after scan to reduce memory usage"""
        self.__data = None
        self.__lines = None
        # values of cached properties live in the instance dictionary
        for cached_name in ("data", "lines"):
            self.__dict__.pop(cached_name, None)
        stream = self.__io
        if isinstance(stream, io.BytesIO) and stream and not stream.closed:
            stream.close()

    @cached_property
    def lines(self) -> Optional[List[str]]:
        """Getter of the text lines; an empty list is returned when no lines were obtained"""
        if self.__lines is not None:
            return self.__lines
        decoded = Util.decode_text(self.data)
        if isinstance(decoded, str):
            self.__lines = Util.split_text(decoded)
        elif isinstance(self.__data, bytes):
            # the data was read but Util.decode_text did not return a string
            head = repr(self.__data[:32])
            logger.warning("Binary file detected %s %s %s", self.file_path, self.info, head)
            self.__lines = []
        if self.__lines is None:
            return []
        return self.__lines

    def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
        """Load and preprocess file content to scan.

        For a file with ".xml" extension the lines are passed to Util.get_xml_from_lines first;
        plain lines are used when that step fails or is not applicable.

        Args:
            min_len: minimal line length to scan

        Return:
            analysis targets produced by lines_to_targets

        """
        prepared: Optional[List[str]] = None
        numbers: Optional[List[int]] = None
        if ".xml" == Util.get_extension(self.file_path):
            try:
                # append line ending for correct xml line numeration
                with_endings = [f"{row}\n" for row in self.lines]
                prepared, numbers = Util.get_xml_from_lines(with_endings)
            except Exception as exc:
                logger.warning("Cannot parse to xml %s", exc)
        if prepared is None:
            prepared = self.lines
        return self.lines_to_targets(min_len, prepared, numbers)
================================================
FILE: credsweeper/filters/__init__.py
================================================
from credsweeper.filters.line_git_binary_check import LineGitBinaryCheck
from credsweeper.filters.line_specific_key_check import LineSpecificKeyCheck
from credsweeper.filters.line_uue_part_check import LineUUEPartCheck
from credsweeper.filters.value_allowlist_check import ValueAllowlistCheck
from credsweeper.filters.value_array_dictionary_check import ValueArrayDictionaryCheck
from credsweeper.filters.value_atlassian_token_check import ValueAtlassianTokenCheck
from credsweeper.filters.value_azure_token_check import ValueAzureTokenCheck
from credsweeper.filters.value_base32_data_check import ValueBase32DataCheck
from credsweeper.filters.value_base64_data_check import ValueBase64DataCheck
from credsweeper.filters.value_base64_encoded_pem_check import ValueBase64EncodedPem
from credsweeper.filters.value_base64_key_check import ValueBase64KeyCheck
from credsweeper.filters.value_base64_part_check import ValueBase64PartCheck
from credsweeper.filters.value_basic_auth_check import ValueBasicAuthCheck
from credsweeper.filters.value_blocklist_check import ValueBlocklistCheck
from credsweeper.filters.value_camel_case_check import ValueCamelCaseCheck
from credsweeper.filters.value_dictionary_keyword_check import ValueDictionaryKeywordCheck
from credsweeper.filters.value_discord_bot_check import ValueDiscordBotCheck
from credsweeper.filters.value_entropy_base32_check import ValueEntropyBase32Check
from credsweeper.filters.value_entropy_base36_check import ValueEntropyBase36Check
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.filters.value_file_path_check import ValueFilePathCheck
from credsweeper.filters.value_github_check import ValueGitHubCheck
from credsweeper.filters.value_grafana_check import ValueGrafanaCheck
from credsweeper.filters.value_grafana_service_check import ValueGrafanaServiceCheck
from credsweeper.filters.value_hex_number_check import ValueHexNumberCheck
from credsweeper.filters.value_jfrog_token_check import ValueJfrogTokenCheck
from credsweeper.filters.value_json_web_key_check import ValueJsonWebKeyCheck
from credsweeper.filters.value_json_web_token_check import ValueJsonWebTokenCheck
from credsweeper.filters.value_last_word_check import ValueLastWordCheck
from credsweeper.filters.value_length_check import ValueLengthCheck
from credsweeper.filters.value_method_check import ValueMethodCheck
from credsweeper.filters.value_morphemes_check import ValueMorphemesCheck
from credsweeper.filters.value_not_allowed_pattern_check import ValueNotAllowedPatternCheck
from credsweeper.filters.value_not_part_encoded_check import ValueNotPartEncodedCheck
from credsweeper.filters.value_number_check import ValueNumberCheck
from credsweeper.filters.value_pattern_check import ValuePatternCheck
from credsweeper.filters.value_sealed_secret_check import ValueSealedSecretCheck
from credsweeper.filters.value_search_check import ValueSearchCheck
from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck
from credsweeper.filters.value_string_type_check import ValueStringTypeCheck
from credsweeper.filters.value_token_base32_check import ValueTokenBase32Check
from credsweeper.filters.value_token_base36_check import ValueTokenBase36Check
from credsweeper.filters.value_token_base64_check import ValueTokenBase64Check
from credsweeper.filters.value_token_check import ValueTokenCheck
================================================
FILE: credsweeper/filters/filter.py
================================================
from abc import abstractmethod, ABC
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
class Filter(ABC):
    """Abstract base of every filter which inspects a 'line_data' object."""

    @abstractmethod
    def __init__(self, config: Optional[Config], *args):
        """A filter may be created without config

        Args:
            config: optional configuration
            args: extra positional arguments of a concrete filter

        """
        raise NotImplementedError

    @abstractmethod
    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Check the received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        raise NotImplementedError
================================================
FILE: credsweeper/filters/group/__init__.py
================================================
from credsweeper.filters.group.general_keyword import GeneralKeyword
from credsweeper.filters.group.general_pattern import GeneralPattern
from credsweeper.filters.group.password_keyword import PasswordKeyword
from credsweeper.filters.group.token_pattern import TokenPattern
from credsweeper.filters.group.url_credentials_group import UrlCredentialsGroup
from credsweeper.filters.group.weird_base36_token import WeirdBase36Token
from credsweeper.filters.group.weird_base64_token import WeirdBase64Token
================================================
FILE: credsweeper/filters/group/general_keyword.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters import ValueDictionaryKeywordCheck, ValueSealedSecretCheck
from credsweeper.filters.group.group import Group
class GeneralKeyword(Group):
    """Keyword filter group with dictionary keyword and sealed secret checks added"""

    def __init__(self, config: Config) -> None:
        super().__init__(config, rule_type=GroupType.KEYWORD)
        # the common keyword filters are prepared by the base class
        additional_filters = [ValueDictionaryKeywordCheck(), ValueSealedSecretCheck()]
        self.filters.extend(additional_filters)
================================================
FILE: credsweeper/filters/group/general_pattern.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters.group.group import Group
class GeneralPattern(Group):
    """Filter group which uses the base pattern filters without changes"""

    def __init__(self, config: Config) -> None:
        super().__init__(config, rule_type=GroupType.PATTERN)
================================================
FILE: credsweeper/filters/group/group.py
================================================
from abc import ABC
from typing import List
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters.filter import Filter
from credsweeper.filters.line_specific_key_check import LineSpecificKeyCheck
from credsweeper.filters.value_allowlist_check import ValueAllowlistCheck
from credsweeper.filters.value_array_dictionary_check import ValueArrayDictionaryCheck
from credsweeper.filters.value_blocklist_check import ValueBlocklistCheck
from credsweeper.filters.value_camel_case_check import ValueCamelCaseCheck
from credsweeper.filters.value_file_path_check import ValueFilePathCheck
from credsweeper.filters.value_hex_number_check import ValueHexNumberCheck
from credsweeper.filters.value_last_word_check import ValueLastWordCheck
from credsweeper.filters.value_method_check import ValueMethodCheck
from credsweeper.filters.value_not_allowed_pattern_check import ValueNotAllowedPatternCheck
from credsweeper.filters.value_pattern_check import ValuePatternCheck
from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
from credsweeper.filters.value_string_type_check import ValueStringTypeCheck
from credsweeper.filters.value_token_check import ValueTokenCheck
class Group(ABC):
    """Abstract Group class - keeps the list of filters selected by the group type"""

    def __init__(self, config: Config, rule_type: GroupType = GroupType.DEFAULT) -> None:
        """Config is required for filter group

        Args:
            config: configuration; 'check_for_literals' and 'doc' are read for keyword group
            rule_type: selects the initial set of filters

        """
        if GroupType.KEYWORD == rule_type:
            keyword_filters = [
                ValueAllowlistCheck(),
                ValueArrayDictionaryCheck(),
                ValueBlocklistCheck(),
                ValueCamelCaseCheck(),
                ValueFilePathCheck(),
                ValueHexNumberCheck(),
                ValueLastWordCheck(),
                ValueMethodCheck(),
                ValueSimilarityCheck(),
                ValueStringTypeCheck(check_for_literals=config.check_for_literals),
                ValueTokenCheck(),
            ]
            if not config.doc:
                # pattern based checks are added only when 'doc' is not set in the config
                keyword_filters += [ValuePatternCheck(), ValueNotAllowedPatternCheck()]
            self.__filters = keyword_filters
        elif GroupType.PATTERN == rule_type:
            self.__filters = [LineSpecificKeyCheck(), ValuePatternCheck()]
        else:
            # GroupType.DEFAULT
            self.__filters = []

    @property
    def filters(self) -> List[Filter]:
        """Filters of the group"""
        return self.__filters

    @filters.setter
    def filters(self, filters: List[Filter]) -> None:
        """Replace filters of the group"""
        self.__filters = filters
================================================
FILE: credsweeper/filters/group/password_keyword.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters import ValueLengthCheck, LineGitBinaryCheck, ValueSealedSecretCheck
from credsweeper.filters import ValueSplitKeywordCheck
from credsweeper.filters.group.group import Group
from credsweeper.filters.line_uue_part_check import LineUUEPartCheck
class PasswordKeyword(Group):
    """Keyword filter group extended with checks for password values"""

    def __init__(self, config: Config) -> None:
        super().__init__(config, rule_type=GroupType.KEYWORD)
        # the length limit of a password value is taken from the config
        password_filters = [
            ValueLengthCheck(max_len=config.max_password_value_length),
            ValueSplitKeywordCheck(),
            ValueSealedSecretCheck(),
            LineGitBinaryCheck(),
            LineUUEPartCheck(),
        ]
        self.filters.extend(password_filters)
================================================
FILE: credsweeper/filters/group/token_pattern.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters import ValueMorphemesCheck, ValueCamelCaseCheck, ValueNumberCheck, ValuePatternCheck
from credsweeper.filters.group.group import Group
class TokenPattern(Group):
    """Filter group for token patterns"""

    def __init__(self, config: Config) -> None:
        super().__init__(config, rule_type=GroupType.DEFAULT)
        # the default group has no filters - the whole list is defined here
        token_filters = [
            ValueMorphemesCheck(),
            ValueNumberCheck(),
            ValueCamelCaseCheck(),
            ValuePatternCheck(),
        ]
        self.filters = token_filters
================================================
FILE: credsweeper/filters/group/url_credentials_group.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters import (ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck,
ValueCamelCaseCheck, ValueLengthCheck, ValueFilePathCheck, ValueLastWordCheck,
ValueMethodCheck, ValueNotAllowedPatternCheck, ValuePatternCheck, ValueStringTypeCheck,
ValueTokenCheck)
from credsweeper.filters.group.group import Group
class UrlCredentialsGroup(Group):
    """Filter group for credentials inside URL"""

    def __init__(self, config: Config) -> None:
        """URL credentials group class.

        Similar to PasswordKeyword, but exclude all checks dependent on the variable name, as URL credentials have no
        explicitly defined variable

        Args:
            config: configuration; 'check_for_literals' and 'max_url_cred_value_length' are read

        """
        super().__init__(config, rule_type=GroupType.DEFAULT)
        url_filters = [
            ValueAllowlistCheck(),
            ValueArrayDictionaryCheck(),
            ValueBlocklistCheck(),
            ValueCamelCaseCheck(),
            ValueFilePathCheck(),
            ValueLastWordCheck(),
            ValueMethodCheck(),
            ValueStringTypeCheck(check_for_literals=config.check_for_literals),
            ValueNotAllowedPatternCheck(),
            ValueTokenCheck(),
            ValueLengthCheck(max_len=config.max_url_cred_value_length),
            ValuePatternCheck(),
        ]
        self.filters = url_filters
================================================
FILE: credsweeper/filters/group/weird_base36_token.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters import ValueMorphemesCheck, ValuePatternCheck, ValueNumberCheck, ValueEntropyBase36Check, \
ValueTokenBase36Check
from credsweeper.filters.group.group import Group
class WeirdBase36Token(Group):
    """Filter group for base36 tokens"""

    def __init__(self, config: Config) -> None:
        super().__init__(config, rule_type=GroupType.DEFAULT)
        # the default group has no filters - the whole list is defined here
        base36_filters = [
            ValueMorphemesCheck(threshold=1),
            ValuePatternCheck(),
            ValueNumberCheck(),
            ValueTokenBase36Check(),
            ValueEntropyBase36Check(),
        ]
        self.filters = base36_filters
================================================
FILE: credsweeper/filters/group/weird_base64_token.py
================================================
from credsweeper.common.constants import GroupType
from credsweeper.config.config import Config
from credsweeper.filters import ValueMorphemesCheck, ValueNotPartEncodedCheck, \
ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check, \
ValueBase64PartCheck
from credsweeper.filters.group.group import Group
class WeirdBase64Token(Group):
    """Filter group for base64 tokens"""

    def __init__(self, config: Config) -> None:
        super().__init__(config, rule_type=GroupType.DEFAULT)
        # the default group has no filters - the whole list is defined here
        base64_filters = [
            ValueMorphemesCheck(threshold=1),
            ValueNumberCheck(),
            ValueBase64DataCheck(),
            ValueTokenBase64Check(),
            ValueEntropyBase64Check(),
            ValuePatternCheck(),
            ValueNotPartEncodedCheck(),
            ValueBase64PartCheck(),
        ]
        self.filters = base64_filters
================================================
FILE: credsweeper/filters/line_git_binary_check.py
================================================
import base64
import contextlib
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class LineGitBinaryCheck(Filter):
    """Checks that line is not a part of git binary patch"""

    # a letter followed by 6..65 symbols of the base85 alphabet
    base85string = re.compile(r"^[A-Za-z][0-9A-Za-z!#$%&()*+;<=>?@^_`{|}~-]{6,65}$")

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        The stripped line is treated as a part of git binary patch when its first letter,
        converted to a number, equals the length of the base85 decoded remainder.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if not line_data.line:
            return True
        if 66 < target.line_strip_len:
            return False
        stripped = target.line_strip
        stripped_len = len(stripped)
        # https://github.com/git/git/blob/master/base85.c
        # the remainder after the first symbol has to consist of whole groups of 5 symbols
        if stripped_len < 6 or (stripped_len - 1) % 5 or not LineGitBinaryCheck.base85string.match(stripped):
            return False
        prefix = stripped[0]
        if "A" <= prefix <= "Z":
            # A-Z gives 1..26
            size = ord(prefix) - 64
        elif "a" <= prefix <= "z":
            # a-z gives 27..52
            size = ord(prefix) - 70
        else:
            return False
        try:
            return len(base64.b85decode(stripped[1:])) == size
        except Exception:
            # the line is not filtered when the remainder cannot be decoded
            return False
================================================
FILE: credsweeper/filters/line_specific_key_check.py
================================================
import re
from typing import Optional
from credsweeper.common.constants import ML_HUNK
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class LineSpecificKeyCheck(Filter):
    """Check that values from list below is not in candidate line."""

    NOT_ALLOWED = [r"example", r"\benc[\(\[]", r"\btrue\b", r"\bfalse\b"]
    NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED), re.IGNORECASE)

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        The search is limited to the part of the line from ML_HUNK positions before the variable
        (or before the value when the variable position is negative) to ML_HUNK positions after the value end.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if line_data.line is None:
            return True
        if 0 <= line_data.variable_start:
            # variable may be defined too
            anchor = line_data.variable_start
        else:
            anchor = line_data.value_start
        # the start position must not be negative
        sub_line_start = max(0, anchor - ML_HUNK)
        sub_line_end = line_data.value_end + ML_HUNK
        found = self.NOT_ALLOWED_PATTERN.search(line_data.line, sub_line_start, sub_line_end)
        return bool(found)
================================================
FILE: credsweeper/filters/line_uue_part_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class LineUUEPartCheck(Filter):
    """Checks that line is not a part of UU encoding only for maximal line"""

    # 'M' followed by 60 symbols from '!' to '`'
    uue_string = re.compile(r"^M[!-`]{60}$")

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        The candidate is filtered when its line and at least one adjacent line match the UUE line pattern.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if not line_data.line:
            return True
        if 61 != target.line_len:
            return False
        is_uue_line = LineUUEPartCheck.uue_string.match
        if not is_uue_line(target.line):
            return False
        # to be sure - check two lines: before and/or after
        position = line_data.line_pos
        if 0 < position and is_uue_line(target.lines[position - 1]):
            return True
        if len(target.lines) > 1 + position and is_uue_line(target.lines[position + 1]):
            return True
        return False
================================================
FILE: credsweeper/filters/value_allowlist_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueAllowlistCheck(Filter):
    """Check that the patterns do not MATCH the candidate value."""

    # patterns applied to any value
    ALLOWED = [
        r"ENC\(.*\)",  #
        r"ENC\[.*\]",  #
        r"\$\{(\*|[0-9]+|[a-z_].*)\}",  #
        r"\$[0-9]+(\s|$)",  #
        r"\$\$[a-z_]+(\^%[0-9a-z_]+)?",  #
        r"#\{.+\}",  # Ruby: String Interpolation
        r"\{\{.+\}\}",  #
        r".*@@@hl@@@(암호|비번|PW|PASS)@@@endhl@@@",  #
    ]
    ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED), flags=re.IGNORECASE)

    # patterns applied additionally to a well quoted value
    ALLOWED_QUOTED = [
        r"\$[a-z_][0-9a-z_]+((::|->|\.)[a-z_]|\[|$)",  #
        r"\$\([^)]+\)",  #
        r".*\*\*\*",  #
    ]
    ALLOWED_QUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_QUOTED), flags=re.IGNORECASE)

    # patterns applied additionally to a value which is not well quoted
    ALLOWED_UNQUOTED = [
        r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$",  #
        r"\$[a-z_][0-9a-z_]+((::|->|\.)[a-z_]|\[|$)",  #
        r"\$\([.0-9a-z_-]+",  #
        r".*\*\*\*\*\*",  #
    ]
    ALLOWED_UNQUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_UNQUOTED), flags=re.IGNORECASE)

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        checked = line_data.value
        if line_data.is_well_quoted_value:
            specific_pattern = self.ALLOWED_QUOTED_PATTERN
        else:
            if line_data.wrap:
                # the wrap is placed before a value which is not well quoted
                checked = line_data.wrap + checked
            specific_pattern = self.ALLOWED_UNQUOTED_PATTERN
        return bool(self.ALLOWED_PATTERN.match(checked) or specific_pattern.match(checked))
================================================
FILE: credsweeper/filters/value_array_dictionary_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueArrayDictionaryCheck(Filter):
    """Match call to dictionary or array element.

    This filter checks only calls, not declarations:
    `token = values[i]` would be filtered
    `token = {'root'}` would be kept
    """

    PATTERN = re.compile(r"\[['\"]?[^,]+['\"]?]")

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if line_data.is_well_quoted_value:
            return False
        # not well quoted value
        wrap = line_data.wrap
        if wrap and "byte" in wrap.lower():
            return False
        if self.PATTERN.search(line_data.value):
            return True
        # the wrap which ends with an opening bracket is filtered too
        return bool(wrap and wrap.endswith(('[', '(')))
================================================
FILE: credsweeper/filters/value_atlassian_token_check.py
================================================
import binascii
import contextlib
from typing import Optional
from credsweeper.common.constants import LATIN_1, ASCII
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueAtlassianTokenCheck(Filter):
    """Check that candidate have a known structure"""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received token which might be structured.

        Any exception raised during the check leads to filtering of the candidate.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        token = line_data.value
        try:
            # atlassian integer:bytes from base64
            if token.startswith("BBDC-"):
                # Bitbucket HTTP Access Token
                return ValueAtlassianTokenCheck.check_atlassian_struct(token[5:])
            if token.startswith("AT"):
                # Bitbucket App password
                while any(escaped in token for escaped in ("\\=", "%3d", "%3D")):
                    # = sign may be escaped in URL https://www.rfc-editor.org/rfc/rfc3986
                    token = token.replace('\\', '').replace('%3d', '=').replace('%3D', '=')
                return ValueAtlassianTokenCheck.check_crc32_struct(token)
            # Jira / Confluence PAT token
            return ValueAtlassianTokenCheck.check_atlassian_struct(token)
        except Exception:
            return True

    @staticmethod
    def check_crc32_struct(value: str) -> bool:
        """Returns False if value is valid for bitbucket app password structure 'payload:crc32'

        Args:
            value: token; the last 8 symbols are parsed as a hexadecimal crc32 of the preceding part

        """
        expected_crc32 = int(value[-8:], 16)
        payload = value[:-8].encode(ASCII)
        return expected_crc32 != binascii.crc32(payload)

    @staticmethod
    def check_atlassian_struct(value: str) -> bool:
        """Returns False if value is valid for atlassian structure 'integer:bytes'

        Args:
            value: base64 encoded token

        """
        decoded = Util.decode_base64(value, padding_safe=True, urlsafe_detect=True)
        delimiter_pos = decoded.find(b':')
        # there is limit for big integer value: math.log10(1<<64) = 19.265919722494797
        if not 0 < delimiter_pos <= 20:
            return True
        number = int(decoded[:delimiter_pos].decode(LATIN_1))
        # at least 4 digits in the token
        if number < 1000:
            return True
        # test for ascii and Shannon entropy - there should be random data
        return Util.is_ascii_entropy_validate(decoded[delimiter_pos + 1:])
================================================
FILE: credsweeper/filters/value_azure_token_check.py
================================================
import contextlib
import json
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils.util import Util
class ValueAzureTokenCheck(Filter):
    """
    Azure tokens contains header, payload and signature
    https://learn.microsoft.com/en-us/azure/active-directory-b2c/access-tokens
    """

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received token which might be structured.

        Any exception raised during the check leads to filtering of the candidate.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, when need to filter candidate and False if left

        """
        try:
            parts = line_data.value.split('.')
            if 3 != len(parts):
                return True
            header = json.loads(Util.decode_base64(parts[0], padding_safe=True, urlsafe_detect=True))
            if not all(key in header for key in ("alg", "typ", "kid")):
                # must be all parts in header
                return True
            payload = json.loads(Util.decode_base64(parts[1], padding_safe=True, urlsafe_detect=True))
            if not all(key in payload for key in ("iss", "exp", "iat")):
                # must be all parts in payload
                return True
            signature = parts[2]
            min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(signature))
            entropy = Util.get_shannon_entropy(signature)
            # good signature has to be like random bytes
            return entropy < min_entropy
        except Exception:
            return True
================================================
FILE: credsweeper/filters/value_base32_data_check.py
================================================
import base64
import contextlib
import string
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueBase32DataCheck(Filter):
    """
    Check that candidate is NOT an ascii encoded string with entropy check
    """

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received weird base32 token which must be a random string

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, when need to filter candidate and False if left

        """
        value = line_data.value
        # check whether digits and upper cases present
        for required_chars in (string.digits, string.ascii_uppercase):
            if not any(char in value for char in required_chars):
                return True
        # check whether decoded bytes have enough entropy
        try:
            pad_remain = len(value) % 8
            if pad_remain:
                # complete the value with padding up to a multiple of 8 symbols
                value += '=' * (8 - pad_remain)
            return Util.is_ascii_entropy_validate(base64.b32decode(value))
        except Exception:
            # the candidate is filtered when the value cannot be decoded
            return True
================================================
FILE: credsweeper/filters/value_base64_data_check.py
================================================
import contextlib
import string
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueBase64DataCheck(Filter):
    """
    Check that candidate is NOT an ascii encoded string with entropy check
    """

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received weird base64 token which must be a random string

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, when need to filter candidate and False if left

        """
        value = line_data.value
        # check whether digits, lower and upper cases present
        for required_chars in (string.digits, string.ascii_lowercase, string.ascii_uppercase):
            if not any(char in value for char in required_chars):
                return True
        # check whether decoded bytes have enough entropy
        try:
            decoded = Util.decode_base64(value, padding_safe=True, urlsafe_detect=True)
            return Util.is_ascii_entropy_validate(decoded)
        except Exception:
            # the candidate is filtered when the value cannot be decoded
            return True
================================================
FILE: credsweeper/filters/value_base64_encoded_pem_check.py
================================================
import logging
from typing import Optional
from credsweeper.common.constants import ASCII, PEM_BEGIN_PATTERN, MAX_LINE_LENGTH, PEM_END_PATTERN
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.pem_key_detector import PemKeyDetector
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class ValueBase64EncodedPem(Filter):
    """Check that candidate contains base64 encoded pem private key"""

    def __init__(self, config: Optional[Config] = None) -> None:
        # the config is kept because run() passes it to LineData and PemKeyDetector
        self.config = config

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received token which might be structured.

        The value is decoded from base64 to ASCII text. The text is walked line by line to collect
        a fragment from a line with PEM begin to a line with PEM end, and each collected fragment
        is passed to PemKeyDetector. Any exception is logged as a warning and the candidate is filtered.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        try:
            text = Util.decode_base64(line_data.value, padding_safe=True, urlsafe_detect=True).decode(ASCII)
            # pem_text is empty until a PEM begin line is found, then it accumulates the fragment
            pem_text = ''
            pem_end_found = False
            for line in text.splitlines():
                if pem_text:
                    # the fragment was started - append the line and look for the end
                    pem_text += f"\n{line}"
                    if PEM_END_PATTERN in line:
                        pem_end_found = True
                else:
                    # cheap substring test first, the regex is applied only to promising lines
                    if PEM_BEGIN_PATTERN in line:
                        if PemKeyDetector.RE_PEM_BEGIN.search(line, 0, MAX_LINE_LENGTH):
                            pem_text = line
                            # begin and end may be in the same line
                            if PEM_END_PATTERN in line:
                                pem_end_found = True
                if pem_end_found:
                    # the whole fragment is given to the detector as a single line target
                    new_target = AnalysisTarget(0, [pem_text], [1], target.descriptor)
                    first_line = LineData(self.config, pem_text, 0, 1, target.file_path, target.file_type, target.info,
                                          PemKeyDetector.RE_PEM_BEGIN)
                    if PemKeyDetector(self.config).detect_pem_key(first_line, new_target):
                        # obtained candidates are not used because not match text
                        return False
                    # drop the candidate and continue search
                    pem_text = ''
                    pem_end_found = False
        except Exception as exc:
            logger.warning(exc)
        return True
================================================
FILE: credsweeper/filters/value_base64_key_check.py
================================================
import contextlib
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueBase64KeyCheck(Filter):
    """Keep only candidates whose cleaned value decodes from base64 into a valid private key."""

    # translation table which deletes double quotes, commas, single quotes and backslashes
    EXTRA_TRANS_TABLE = str.maketrans('', '', "\",'\\")

    def __init__(self, config: Optional[Config] = None) -> None:
        self.config = config

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Clean the value, decode it from standard base64 and try to load it as a private key.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the decoded key passes Util.check_pk, True otherwise
            (including any exception raised during cleaning, decoding or loading)

        """
        # ordered replacements: concatenation remnants first, then url-escaped base64 symbols
        substitutions = (
            ("'+'", ''),
            ('"+"', ''),
            ('%2B', '+'),
            ('%2F', '/'),
            ('%3D', '='),
        )
        with contextlib.suppress(Exception):
            # strip backslash escaping sequences and then all whitespaces
            cleaned = Util.PEM_CLEANING_PATTERN.sub(r'', line_data.value)
            cleaned = cleaned.translate(Util.WHITESPACE_TRANS_TABLE)
            for old, new in substitutions:
                cleaned = cleaned.replace(old, new)
            # remove the remaining characters which must not appear in the encoded key
            cleaned = cleaned.translate(ValueBase64KeyCheck.EXTRA_TRANS_TABLE)
            # only the standard alphabet is decoded here - url-safe detection is switched off
            raw_key = Util.decode_base64(cleaned, padding_safe=True, urlsafe_detect=False)
            if Util.check_pk(Util.load_pk(raw_key, password=None)):
                return False
        return True
================================================
FILE: credsweeper/filters/value_base64_part_check.py
================================================
import contextlib
import re
import statistics
from itertools import takewhile
from typing import Optional
from credsweeper.common.constants import Chars
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils.util import Util
class ValueBase64PartCheck(Filter):
    """Filter a candidate which looks like a fragment of a longer base64 line."""

    # whole hunk made of base64 standard symbols or short backslash escape sequences
    base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}$")
    # base64 standard alphabet with padding plus the backslash
    base64_char_set = set(Chars.BASE64STDPAD_CHARS.value + '\\')

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Compare the value with its neighbourhood in the line to detect a cut of base64 data.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True when the surroundings look like the same base64 stream (candidate is filtered),
            False otherwise, including the case when any exception was raised

        """

        def in_alphabet(char: str) -> bool:
            return char in ValueBase64PartCheck.base64_char_set

        delimiters = ('/', '+', '\\', '%')
        with contextlib.suppress(Exception):
            line = line_data.line
            len_line = len(line)
            value = line_data.value
            len_value = len(value)
            start = line_data.value_start

            suspicious_position = (
                0 == start and len_line >= 2 * len_value
                or 0 < start and line[start - 1] in delimiters
                or 0 < line_data.value_end < len_line and line[line_data.value_end] in delimiters)
            if not suspicious_position:
                return False

            if '-' in value or '_' in value:
                # url-safe symbols inside the value mean that '/' or '+' is just a delimiter
                return False

            end = line_data.value_end
            # the hunk spans up to one value length to each side, clamped to the line bounds
            left_start = max(0, start - len_value)
            right_end = min(len_line, end + len_value)
            hunk = line[left_start:right_end]
            hunk_size = right_end - left_start

            if hunk_size == 3 * len_value:
                # maximal hunk: the regex decides whether everything is base64 like
                if self.base64_pattern.match(hunk):
                    return True
            elif hunk_size >= 2 * len_value:
                # shorter hunk: a plain alphabet test is used
                if set(hunk).issubset(ValueBase64PartCheck.base64_char_set):
                    return True

            # base64 like symbols which directly adjoin the value from the left and from the right
            left_part = ''.join(takewhile(in_alphabet, reversed(line[left_start:start])))
            right_part = ''.join(takewhile(in_alphabet, line[end:right_end]))

            min_entropy_value = ValueEntropyBase64Check.get_min_data_entropy(len_value)
            left_entropy = Util.get_shannon_entropy(left_part)
            value_entropy = Util.get_shannon_entropy(value)
            right_entropy = Util.get_shannon_entropy(right_part)
            common = left_part + value + right_part
            common_entropy = Util.get_shannon_entropy(common)
            if ValueEntropyBase64Check.get_min_data_entropy(len(common)) < common_entropy:
                return True

            if left_entropy and right_entropy:
                data = [left_entropy, value_entropy, right_entropy, min_entropy_value, common_entropy]
            elif left_entropy:
                data = [left_entropy, value_entropy, min_entropy_value, min_entropy_value, common_entropy]
            elif right_entropy:
                data = [value_entropy, right_entropy, min_entropy_value, min_entropy_value, common_entropy]
            else:
                # nothing base64 like around the value
                return False

            avg = statistics.mean(data)
            avg_min = avg - 1.1 * statistics.stdev(data, avg)
            left_fits = (0. == left_entropy or avg_min < left_entropy
                         or left_entropy < value_entropy < right_entropy)
            right_fits = (0. == right_entropy or avg_min < right_entropy
                          or right_entropy < value_entropy < left_entropy)
            # high entropy of both bound parts looks like a part of a long base64 line
            return bool(left_fits and right_fits)
        return False
================================================
FILE: credsweeper/filters/value_basic_auth_check.py
================================================
import contextlib
from typing import Optional
from credsweeper.common.constants import DEFAULT_PATTERN_LEN, UTF_8
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueBasicAuthCheck(Filter):
    """Keep a candidate which decodes from base64 into a `login:password` like text."""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Decode the value and look for the ':' delimiter of Basic authentication.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the delimiter is found at an acceptable position and the decoded bytes
            are a non-empty UTF-8 text; True otherwise or when decoding fails

        """
        value = line_data.value
        with contextlib.suppress(Exception):
            credentials = Util.decode_base64(value, padding_safe=True, urlsafe_detect=True)
            colon_at = credentials.find(b':')
            # the delimiter must not be the first byte and must leave more than
            # DEFAULT_PATTERN_LEN bytes from its position up to the end
            upper_bound = len(credentials) - DEFAULT_PATTERN_LEN
            if 0 < colon_at < upper_bound and credentials.decode(UTF_8):
                return False
        return True
================================================
FILE: credsweeper/filters/value_blocklist_check.py
================================================
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueBlocklistCheck(Filter):
    """Check that a word from the block list is less than 70% of the candidate value length."""

    NOT_ALLOWED = [
        "true",
        "false",
        "null",
        "none",
        "bearer",
        "string",
        "value",
        "undefined",
        "uuid",
    ]

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Filter the candidate when a blocked word dominates its lower-cased value.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if a blocked word is contained in the value and takes at least 70% of its
            length, otherwise False

        """
        lowered = line_data.value.lower()
        return any(word in lowered and len(word) / len(lowered) >= 0.7 for word in self.NOT_ALLOWED)
================================================
FILE: credsweeper/filters/value_camel_case_check.py
================================================
import re
from typing import Optional
from credsweeper.common import static_keyword_checklist
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueCamelCaseCheck(Filter):
    """Check that candidate is not written in camel case."""

    # lowerCamelCase and UpperCamelCase templates
    CAMEL_CASE = ["[a-z]+([A-Z][a-z]+)+", "[A-Z][a-z]+([A-Z][a-z]+)+"]
    CAMEL_CASE_PATTERN = re.compile(Util.get_regex_combine_or(CAMEL_CASE))

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Apply the morpheme check to an unquoted value fully written in camel case.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            the result of the morphemes check (threshold 1) for an unquoted camel case value,
            False for a well quoted value or a value which is not camel case

        """
        if line_data.is_well_quoted_value or not self.CAMEL_CASE_PATTERN.fullmatch(line_data.value):
            return False
        return static_keyword_checklist.check_morphemes(line_data.value.lower(), 1)
================================================
FILE: credsweeper/filters/value_dictionary_keyword_check.py
================================================
from typing import Optional
from credsweeper.common import static_keyword_checklist
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueDictionaryKeywordCheck(Filter):
    """Check that dictionary words do not cover too much of the candidate value."""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Mask every found keyword in the lower-cased value and measure the masked share.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if more than 33% of the value is covered by keywords, otherwise False

        """
        masked = line_data.value.lower()
        for keyword in static_keyword_checklist.keyword_list:
            if keyword not in masked:
                continue
            # replace the keyword with the same amount of a marker symbol to keep the length
            masked = masked.replace(keyword, '\x7F' * len(keyword))
            if 0.33 < masked.count('\x7F') / len(masked):
                return True
        return False
================================================
FILE: credsweeper/filters/value_discord_bot_check.py
================================================
import contextlib
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils.util import Util
class ValueDiscordBotCheck(Filter):
    """Discord bot Token"""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Validate the numeric id part and the entropy of the rest of the token.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the part before the first '.' decodes from base64 into an integer of
            at least 1000 and the remainder (starting from the dot) has enough entropy;
            True otherwise, including any exception

        """
        with contextlib.suppress(Exception):
            id_part, dot, remainder = line_data.value.partition('.')
            if dot:
                discord_id = int(Util.decode_base64(id_part, padding_safe=True, urlsafe_detect=True))
                # the dot itself belongs to the part which is measured
                entropy_part = dot + remainder
                entropy = Util.get_shannon_entropy(entropy_part)
                min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(entropy_part))
                if 1000 <= discord_id and min_entropy <= entropy:
                    return False
        return True
================================================
FILE: credsweeper/filters/value_entropy_base32_check.py
================================================
import math
from functools import cache
from credsweeper.filters.value_entropy_base_check import ValueEntropyBaseCheck
class ValueEntropyBase32Check(ValueEntropyBaseCheck):
    """Base32 entropy check"""

    @staticmethod
    @cache
    def get_min_data_entropy(x: int) -> float:
        """Return the entropy threshold for base32 data of size x; 0 when x is less than 8."""
        if 33 <= x:
            # constant threshold for long data
            return 4.04
        if 17 <= x:
            return 0.66350481 * math.log2(x) + 0.71143862
        if 8 <= x:
            return 0.80569236 * math.log2(x) + 0.13439734
        return 0
================================================
FILE: credsweeper/filters/value_entropy_base36_check.py
================================================
import math
from functools import cache
from credsweeper.filters.value_entropy_base_check import ValueEntropyBaseCheck
class ValueEntropyBase36Check(ValueEntropyBaseCheck):
    """Base36 entropy check"""

    @staticmethod
    @cache
    def get_min_data_entropy(x: int) -> float:
        """Return the entropy threshold for base36 data of size x; 0 when x is less than 10."""
        if 15 == x:
            # workaround for Dropbox App secret
            return 3.374
        if 26 <= x:
            # constant threshold for long data
            return 3.9
        if 10 <= x:
            return 0.731566857 * math.log2(x) + 0.474132
        return 0
================================================
FILE: credsweeper/filters/value_entropy_base64_check.py
================================================
import math
from functools import cache
from credsweeper.filters.value_entropy_base_check import ValueEntropyBaseCheck
class ValueEntropyBase64Check(ValueEntropyBaseCheck):
    """Base64 entropy check"""

    @staticmethod
    @cache
    def get_min_data_entropy(x: int) -> float:
        """Return the entropy threshold for base64 data of size x; 0 when x is less than 12.

        A separate precalculated formula is applied for each range of sizes.
        """
        if 256 <= x:
            return 6 - 64 / x
        if 65 <= x:
            return 0.621 * math.log2(x) - 0.003 * x + 1.54
        if 35 <= x:
            return 0.944 * math.log2(x) - 0.009 * x - 0.04
        if 18 <= x:
            return 0.767 * math.log2(x) + 0.5677
        if 12 <= x:
            return 0.915 * math.log2(x) - 0.047
        return 0
================================================
FILE: credsweeper/filters/value_entropy_base_check.py
================================================
from abc import abstractmethod
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueEntropyBaseCheck(Filter):
    """Check that candidate value has minimal Shannon entropy for the appropriate base."""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    @staticmethod
    @abstractmethod
    def get_min_data_entropy(x: int) -> float:
        """Return the minimal entropy for data of size x; must be provided by a subclass."""
        raise NotImplementedError()

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Compare the Shannon entropy of the value with the threshold for its length.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, when the threshold is 0 or the entropy is below the threshold,
            otherwise False

        """
        actual = Util.get_shannon_entropy(line_data.value)
        required = self.get_min_data_entropy(len(line_data.value))
        return bool(0 == required or required > actual)
================================================
FILE: credsweeper/filters/value_file_path_check.py
================================================
from typing import Optional
from credsweeper.common import static_keyword_checklist
from credsweeper.common.constants import Chars
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils.util import Util
class ValueFilePathCheck(Filter):
    """Check whether a candidate value looks like a path.

    A value is treated as a path when it contains either a unix separator or a windows drive
    separator (but not both) and has no symbols which are unusual for a path.
    """

    base64stdpad_possible_set = set(Chars.BASE64STDPAD_CHARS.value)
    unusual_windows_symbols_in_path = "\t\n\r!$@`&*(){}<>+=;,~^"
    unusual_linux_symbols_in_path = "\t\n\r!@`&*<>+=;,~^:\\"

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Apply the morphemes check to a value which looks like an url, an alias or a path.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            the result of the morphemes check for a path-like value, False for other values

        """
        value = line_data.value
        size_bits = len(value).bit_length()
        # longer values require more morphemes
        morpheme_threshold = size_bits - 4 if 6 <= size_bits else 1

        looks_unix = '/' in value
        if looks_unix:
            if ("://" in value  #
                    or value.startswith(("~/", "./"))  #
                    or "../" in value  #
                    or "/.." in value  #
                    or value.startswith("//") and ':' == line_data.separator):
                # common case for url definition or aliases
                # or _keyword_://example.com where : is the separator
                return static_keyword_checklist.check_morphemes(value.lower(), morpheme_threshold)
            # base64 encoded data might look like a linux path;
            # a zero threshold means the value is not checked with the entropy
            min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
            if all(char in self.base64stdpad_possible_set for char in value):
                # all symbols are from base64 alphabet
                entropy = Util.get_shannon_entropy(value)
                # high entropy means base64 encoded data, otherwise more than one '/' is required
                looks_unix = (0 == min_entropy or min_entropy > entropy) and 1 < value.count('/')

        looks_windows = ':\\' in value
        if looks_unix or looks_windows:
            if looks_unix:
                unusual_symbols = self.unusual_linux_symbols_in_path
            else:
                unusual_symbols = self.unusual_windows_symbols_in_path
            # a path usually has no such symbols and only one kind of separator
            if not any(symbol in value for symbol in unusual_symbols) and looks_unix ^ looks_windows:
                return static_keyword_checklist.check_morphemes(value.lower(), morpheme_threshold)
        return False
================================================
FILE: credsweeper/filters/value_github_check.py
================================================
import binascii
import contextlib
from typing import Optional
import base62
from credsweeper.common.constants import ASCII
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueGitHubCheck(Filter):
    """NPM or GitHub Classic Token validation"""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Verify the CRC32 checksum stored in the last six symbols of the token.

        https://github.blog/2021-04-05-behind-githubs-new-authentication-token-formats/
        https://github.blog/security/announcing-npms-new-access-token-format/

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the base62 decoded suffix equals the CRC32 of the token body,
            True otherwise, including any exception

        """
        with contextlib.suppress(Exception):
            token = line_data.value
            if (token.startswith("gh") and '_' == token[3]) or token.startswith("npm_"):
                # body is between the 4 symbols prefix and the 6 symbols checksum
                body = token[4:-6].encode(ASCII, errors="strict")
                expected = binascii.crc32(body)
                stored = int.from_bytes(base62.decodebytes(token[-6:]), "big")
                if stored == expected:
                    return False
        return True
================================================
FILE: credsweeper/filters/value_grafana_check.py
================================================
import contextlib
import json
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueGrafanaCheck(Filter):
    """Grafana Provisioned API Key and Access Policy Token"""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Decode the token into JSON and verify that the expected keys are present.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the decoded payload is not empty and has all required keys,
            True otherwise, including any exception

        """
        with contextlib.suppress(Exception):
            if line_data.value.startswith("glc_"):
                # Grafana Access Policy Token - the prefix is not a part of encoded data
                encoded = line_data.value[4:]
                required_keys = ["o", "n", "k", "m"]
            else:
                # Grafana Provisioned API Key
                encoded = line_data.value
                required_keys = ["n", "k", "id"]
            decoded = Util.decode_base64(encoded, padding_safe=True, urlsafe_detect=True)
            payload = json.loads(decoded)
            if payload:
                return any(key not in payload for key in required_keys)
        return True
================================================
FILE: credsweeper/filters/value_grafana_service_check.py
================================================
import binascii
import contextlib
import struct
from typing import Optional
from credsweeper.common.constants import ASCII
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueGrafanaServiceCheck(Filter):
"""Check that candidate have a known structure"""
def __init__(self, config: Optional[Config] = None) -> None:
pass
def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""Run filter checks on received token which might be structured.
Args:
line_data: credential candidate data
target: multiline target from which line data was obtained
Return:
True, if need to filter candidate and False if left
"""
with contextlib.suppress(Exception):
checksum = struct.unpack(" None:
pass
def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""Run filter checks on received credential candidate data 'line_data'.
Args:
line_data: credential candidate data
target: multiline target from which line data was obtained
Return:
True, if need to filter candidate and False if left
"""
value = line_data.value.lower()
if ValueHexNumberCheck.HEX_08_64_VALUE_REGEX.match(value):
return True
return False
================================================
FILE: credsweeper/filters/value_jfrog_token_check.py
================================================
import contextlib
import re
from typing import Optional
import base58
from credsweeper.common.constants import ASCII
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueJfrogTokenCheck(Filter):
    """Check that candidate has a known structure of JFROG token"""

    def __init__(self, config: Optional[Config] = None) -> None:
        # reftkn:01:0123456789:abcdefGhijklmnoPqrstuVwxyz0
        self._pattern = re.compile(r"reftkn:\d+:\d+:[\w_/+-]+")

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Recognize an identity token or a (deprecated) API key by the prefix and decoded content.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the value is recognized as a JFROG token, True otherwise,
            including any exception during decoding

        """
        value = line_data.value
        with contextlib.suppress(Exception):
            if value.startswith("cmVmdGtuO"):
                # identity token: base64 encoded text which has to match the reference pattern
                plain = Util.decode_base64(value, padding_safe=True, urlsafe_detect=True).decode(ASCII)
                if self._pattern.match(plain):
                    return False
            elif value.startswith("AKCp"):
                # API key (deprecated) - a good integrity check solution was not found,
                # so only the size of base58 decoded data is verified
                if 54 == len(base58.b58decode(value)):
                    return False
        return True
================================================
FILE: credsweeper/filters/value_json_web_key_check.py
================================================
import contextlib
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueJsonWebKeyCheck(Filter):
    """
    Check that candidate is JWK which starts usually from 'e'
    and has private parts of the key
    https://datatracker.ietf.org/doc/html/rfc7517
    https://datatracker.ietf.org/doc/html/rfc7518
    """

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Search the base64 decoded value for the markers of a JWK with private material.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when the decoded data has "kty" together with either "oct" and "k"
            or "EC"/"RSA" and "d"; True otherwise, including any exception

        """
        with contextlib.suppress(Exception):
            data = Util.decode_base64(line_data.value, padding_safe=True, urlsafe_detect=True)
            if data and b'"kty":' in data:
                symmetric = b'"oct"' in data and b'"k":' in data
                asymmetric = (b'"EC"' in data or b'"RSA"' in data) and b'"d":' in data
                if symmetric or asymmetric:
                    return False
        return True
================================================
FILE: credsweeper/filters/value_json_web_token_check.py
================================================
import contextlib
import json
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueJsonWebTokenCheck(Filter):
    """
    Check that candidate is JWT which starts usually from 'eyJ'
    registered keys are checked to be in the JWT parts
    https://www.iana.org/assignments/jose/jose.xhtml
    """

    header_keys = {
        "kid", "x5u", "x5t", "x5t#S256", "typ", "cty", "crit", "alg", "enc", "zip", "jku", "jwk", "x5c", "epk", "apu",
        "apv", "iv", "tag", "p2s", "p2c", "iss", "sub", "aud", "b64", "ppt", "url", "nonce", "svt"
    }
    payload_keys = {
        "iss", "sub", "aud", "exp", "nbf", "iat", "jti", "kty", "use", "key_ops", "alg", "enc", "zip", "jku", "jwk",
        "kid", "x5u", "x5c", "x5t", "x5t#S256", "x", "y", "d", "n", "e", "p", "q", "dp", "dq", "qi", "oth", "k", "crv",
        "ext", "crit", "keys", "id", "role", "token", "secret", "password", "nonce"
    }

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Walk over the dot separated parts and require a header, a payload and a signature.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            False when a header, a payload and a signature were recognized,
            True otherwise, including any exception during decoding or parsing

        """
        has_header = False
        has_payload = False
        has_signature = False
        with contextlib.suppress(Exception):
            for part in line_data.value.split('.'):
                data = Util.decode_base64(part, padding_safe=True, urlsafe_detect=True)
                if part.startswith("eyJ"):
                    # open part - just base64 encoded JSON
                    json_keys = json.loads(data).keys()
                    if not has_header:
                        # header is checked first
                        has_header = bool(ValueJsonWebTokenCheck.header_keys.intersection(json_keys))
                    elif not has_payload:
                        # payload follows the header
                        has_payload = bool(ValueJsonWebTokenCheck.payload_keys.intersection(json_keys))
                    # any other payloads are allowed
                    continue
                if has_header and has_payload and not has_signature:
                    # signature check or skip encrypted part
                    has_signature = not Util.is_ascii_entropy_validate(data)
                    continue
                break
            if has_header and has_payload and has_signature:
                return False
        return True
================================================
FILE: credsweeper/filters/value_last_word_check.py
================================================
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueLastWordCheck(Filter):
    """Check that secret is not short value that ends with `:`."""

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Filter a short unquoted value with a trailing colon.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if the value is shorter than 16 symbols, is not well quoted and ends with ':',
            otherwise False

        """
        is_short = 16 > len(line_data.value)
        return is_short and not line_data.is_well_quoted_value and line_data.value.endswith(':')
================================================
FILE: credsweeper/filters/value_length_check.py
================================================
from typing import Optional
from credsweeper.common.constants import MIN_VALUE_LENGTH, MAX_LINE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueLengthCheck(Filter):
    """Check that candidate value length is between MIN and MAX."""

    def __init__(self,
                 config: Optional[Config] = None,
                 min_len: int = MIN_VALUE_LENGTH,
                 max_len: int = MAX_LINE_LENGTH) -> None:
        # both bounds are inclusive
        self.min_len = min_len
        self.max_len = max_len

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Filter a value whose length is outside of the configured inclusive range.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if the length is less than min_len or greater than max_len, otherwise False

        """
        length = len(line_data.value)
        return not (self.min_len <= length <= self.max_len)
================================================
FILE: credsweeper/filters/value_method_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueMethodCheck(Filter):
    """Check if potential candidate value is a function.

    An unquoted value is treated as a function when it contains the 'function' sub-string
    or starts like a call: an identifier-like prefix followed by '(' and a later ')'.
    """

    PATTERN = re.compile(r"^[~.\->:0-9A-Za-z_]+\(.*\)")

    def __init__(self, config: Optional[Config] = None) -> None:
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Filter an unquoted value which looks like a function or a call.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if the value is not well quoted and looks like a function, otherwise False

        """
        if line_data.is_well_quoted_value:
            # a quoted text is never treated as a call
            return False
        return "function" in line_data.value or self.PATTERN.search(line_data.value) is not None
================================================
FILE: credsweeper/filters/value_morphemes_check.py
================================================
from typing import Optional
from credsweeper.common import static_keyword_checklist
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueMorphemesCheck(Filter):
    """Check value for a threshold of morphemes count"""

    # number of threshold slots: one per possible bit length of a value size up to MAX_LINE_LENGTH
    THRESHOLDS_X3 = int(MAX_LINE_LENGTH).bit_length()
    # one morpheme is very likely to be random generated even for 3 symbols
    MAX_MORPHEMES_LIMIT = max(1, THRESHOLDS_X3 - 4)

    def __init__(self, config: Optional[Config] = None, threshold: Optional[int] = None) -> None:
        """Build the per-bit-length table of morpheme thresholds.

        Args:
            config: not used by this filter
            threshold: constant threshold for every value size, or None to use thresholds
                that grow with the bit length of the value size

        Raises:
            ValueError: when threshold is neither None nor a non-negative int

        """
        slots = ValueMorphemesCheck.THRESHOLDS_X3
        if threshold is None:
            # dynamic thresholds: at least 1, then bit length minus 4
            self.thresholds = [max(1, bits - 4) for bits in range(slots)]
        elif isinstance(threshold, int) and threshold >= 0:
            # the same threshold for any value size
            self.thresholds = [threshold for _ in range(slots)]
        else:
            raise ValueError(f"Wrong type of pattern length {type(threshold)} = {repr(threshold)}")

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        slot = len(line_data.value).bit_length()
        if slot < len(self.thresholds):
            limit = self.thresholds[slot]
        else:
            # a very huge value uses the last (max) threshold
            limit = self.thresholds[-1]
        # NOTE(review): check_morphemes semantics are defined elsewhere - its result is returned as is
        return static_keyword_checklist.check_morphemes(line_data.value.lower(), limit)
================================================
FILE: credsweeper/filters/value_not_allowed_pattern_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.util import Util
class ValueNotAllowedPatternCheck(Filter):
    """Check that secret doesn't end with an opened or closed bracket, an escape or a new line."""

    # alternatives that must not appear at the very end of a value
    NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"\\u00(26|3c)gt;?(\s|\\+[nrt])?", r"^\s*\\", r"^\s*\\n\s*"]
    # combined alternatives anchored to the end of the value, case-insensitive
    NOT_ALLOWED_PATTERN = re.compile(  #
        f"{Util.get_regex_combine_or(NOT_ALLOWED)}$",  #
        flags=re.IGNORECASE)

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        # a well quoted value is accepted without looking at the pattern
        if line_data.is_well_quoted_value:
            return False
        return self.NOT_ALLOWED_PATTERN.search(line_data.value) is not None
================================================
FILE: credsweeper/filters/value_not_part_encoded_check.py
================================================
import re
from typing import Optional
from credsweeper.common import static_keyword_checklist
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueNotPartEncodedCheck(Filter):
    """Check that token is not a part of encoded data.

    The lines directly before and after the candidate's line are inspected: when one of them
    looks like a long base64 chunk, the candidate is treated as a piece of encoded data.
    """

    # The named group "val" is required by check_val(); without the "<val>" name the
    # expression "(?P(" is not valid regex syntax and the module fails on import.
    # previous line: 16..64 quads in url-safe or standard base64 alphabet (no padding)
    BASE64_ENCODED_DATA_PATTERN_BEFORE = re.compile(
        r"(^|[^A-Za-z0-9]+)(?P<val>(([A-Za-z0-9_-]{4}){16,64})|(([A-Za-z0-9+/]{4}){16,64}))([^=A-Za-z0-9+/|_-]+|$)")
    # next line: 4..64 quads, the padding sign '=' is allowed inside the quads
    BASE64_ENCODED_DATA_PATTERN_AFTER = re.compile(
        r"(^|[^A-Za-z0-9]+)(?P<val>(([A-Za-z0-9=_-]{4}){4,64})|(([A-Za-z0-9=+/]{4}){4,64}))([^=A-Za-z0-9+/|_-]+|$)")

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    @staticmethod
    def check_line_target_fit(line_data: LineData, target: AnalysisTarget) -> bool:
        """Verifies whether line data fit to be a part of many lines

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True when the candidate's line number, length and text agree with the target and
            the same text is stored in target.lines at index (line_num - 1)

        """
        return line_data.line_num == target.line_num \
            and len(line_data.line) == target.line_len \
            and line_data.line == target.line \
            and 0 < target.line_num <= target.lines_len \
            and line_data.line == target.lines[target.line_num - 1]

    @staticmethod
    def check_val(line: str, pattern: re.Pattern) -> Optional[bool]:
        """Verifies whether the line looks like a base64 pattern

        Args:
            line: neighbour line to inspect
            pattern: compiled pattern which has a named group "val"

        Return:
            True when the line matches and the matched value does not look like a path,
            None otherwise

        """
        if match_obj := pattern.match(line):
            val = match_obj.group("val")
            # not a path-like
            if not val.startswith('/') \
                    or not static_keyword_checklist.check_morphemes(val.lower(), 2) \
                    or '=' == val[-1]:
                # padding char is a marker too
                return True
        return None

    @staticmethod
    def _check_neighbours(target: AnalysisTarget, index: int) -> bool:
        """Check the lines around target.lines[index] for base64-like content."""
        if 0 < index and ValueNotPartEncodedCheck.check_val(
                target.lines[index - 1], ValueNotPartEncodedCheck.BASE64_ENCODED_DATA_PATTERN_BEFORE):
            return True
        following = index + 1
        if following < target.lines_len and ValueNotPartEncodedCheck.check_val(
                target.lines[following], ValueNotPartEncodedCheck.BASE64_ENCODED_DATA_PATTERN_AFTER):
            return True
        return False

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if ValueNotPartEncodedCheck.check_line_target_fit(line_data, target):
            # suppose, there is plain lines order: the line is stored at index (line_num - 1)
            return ValueNotPartEncodedCheck._check_neighbours(target, target.line_num - 1)
        # otherwise - need to iterate for all lines; only the first equal line is examined
        for index in range(target.lines_len):
            if line_data.line == target.lines[index]:
                return ValueNotPartEncodedCheck._check_neighbours(target, index)
        return False
================================================
FILE: credsweeper/filters/value_number_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueNumberCheck(Filter):
    """Check value if it a value in hex or decimal representation"""

    # optional 0x prefix, hex digits, optional u/l suffix letters (matched against a lowercased value)
    HEX_VALUE_REGEX = re.compile("^(0x)?[0-9a-f]{1,128}[ul]{0,3}$")
    # optional minus sign, up to 20 decimal digits, optional u/l suffix letters
    DEC_VALUE_REGEX = re.compile("^-?[0-9]{1,20}[ul]{0,3}$")

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        lowered = line_data.value.lower()
        # hexadecimal form is rejected only for values shorter than 22 symbols
        if len(lowered) < 22 and ValueNumberCheck.HEX_VALUE_REGEX.match(lowered):
            return True
        return ValueNumberCheck.DEC_VALUE_REGEX.match(lowered) is not None
================================================
FILE: credsweeper/filters/value_pattern_check.py
================================================
import re
from typing import Optional
from credsweeper.common.constants import DEFAULT_PATTERN_LEN, MAX_LINE_LENGTH, MIN_DATA_LEN
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValuePatternCheck(Filter):
    """Check if candidate value contain specific pattern.

    Similar to linguistic sequences of characters, random strings shouldn't contain math sequences of
    characters. Based on "How Bad Can It Git? Characterizing Secret Leakage in Public GitHub Repositories", details:
    https://www.ndss-symposium.org/ndss-paper/how-bad-can-it-git-characterizing-secret-leakage-in-public-github-repositories/

    PatternCheck checks the occurrence in "line_data.value" of three types of sequence:
    - N or more identical characters in sequence, example: "AAAA", "1111" ...
    - N or more increasing characters sequentially, example: "abcd", "1234" ...
    - N or more decreasing characters sequentially, example: "dcba", "4321" ...

    Default pattern LEN is DEFAULT_PATTERN_LEN
    """

    # the biggest supported bit length of a value size
    MAX_PATTERN_LENGTH = int(MAX_LINE_LENGTH).bit_length()

    def __init__(self, config: Optional[Config] = None, pattern_len: Optional[int] = None):
        """Create ValuePatternCheck with a specific pattern_len to check.

        Args:
            config: not used by this filter
            pattern_len: size of constant pattern length for any value size or None for dynamic pattern size

        Raises:
            ValueError: when pattern_len is neither None nor an int not less than DEFAULT_PATTERN_LEN

        """
        # one table entry for each bit length from 0 up to MAX_PATTERN_LENGTH inclusive
        patterns_count = 1 + ValuePatternCheck.MAX_PATTERN_LENGTH
        if pattern_len is None:
            # -1 marks the dynamic mode: pattern length depends on value length
            self.pattern_len = -1
            self.pattern_lengths = [max(bits, DEFAULT_PATTERN_LEN) for bits in range(patterns_count)]
            self.patterns = [ValuePatternCheck.get_pattern(bits) for bits in range(patterns_count)]
        elif isinstance(pattern_len, int) and DEFAULT_PATTERN_LEN <= pattern_len:
            # constant pattern for any value length; one compiled regex is shared by all entries
            self.pattern_len = pattern_len
            self.pattern_lengths = [pattern_len] * patterns_count
            self.patterns = [ValuePatternCheck.get_pattern(pattern_len)] * patterns_count
        else:
            raise ValueError(f"Wrong type of pattern length {type(pattern_len)} = {repr(pattern_len)}")

    @staticmethod
    def get_pattern(pattern_len: int) -> re.Pattern:
        """Creates regex pattern to find N or more identical characters in sequence

        Args:
            pattern_len: requested run length; values below DEFAULT_PATTERN_LEN are raised to it

        Return:
            compiled regex which matches a symbol repeated at least that many times

        """
        pattern_length = max(DEFAULT_PATTERN_LEN, pattern_len)
        if pattern_length < MIN_DATA_LEN:
            # up to 256 symbols length
            pattern = fr"(\S)\1{{{str(pattern_length-1)},}}"
        else:
            # base64 long sequences may contain 0x00 or 0xFF inside
            pattern = fr"([^\sA/_])\1{{{str(pattern_length-1)},}}"
        return re.compile(pattern)

    def equal_pattern_check(self, value: str, bit_length: int) -> bool:
        """Check if candidate value contain a run of the same symbol.

        Args:
            value: string variable, credential candidate value
            bit_length: speedup for len(value).bit_length()

        Return:
            True if contain and False if not

        """
        return self.patterns[bit_length].search(value) is not None

    def ascending_pattern_check(self, value: str, bit_length: int) -> bool:
        """Check if candidate value contain a run of symbols with code points growing by one.

        Arg:
            value: credential candidate value
            bit_length: speedup for len(value).bit_length()

        Return:
            True if contain and False if not

        """
        streak = 1
        for previous, current in zip(value, value[1:]):
            if ord(current) - ord(previous) != 1:
                streak = 1
                continue
            streak += 1
            if streak == self.pattern_lengths[bit_length]:
                return True
        return False

    def descending_pattern_check(self, value: str, bit_length: int) -> bool:
        """Check if candidate value contain a run of symbols with code points falling by one.

        Arg:
            value: string variable, credential candidate value
            bit_length: speedup for len(value).bit_length()

        Return:
            boolean variable. True if contain and False if not

        """
        streak = 1
        for previous, current in zip(value, value[1:]):
            if ord(previous) - ord(current) != 1:
                streak = 1
                continue
            streak += 1
            if streak == self.pattern_lengths[bit_length]:
                return True
        return False

    def check_val(self, value: str, bit_length: int) -> bool:
        """Cumulative value check.

        Arg:
            value: string variable, credential candidate value
            bit_length: speedup for len(value).bit_length()

        Return:
            boolean variable. True if any of equal, ascending or descending pattern is found

        """
        return (self.equal_pattern_check(value, bit_length)  #
                or self.ascending_pattern_check(value, bit_length)  #
                or self.descending_pattern_check(value, bit_length))

    def duple_pattern_check(self, value: str, bit_length: int) -> bool:
        """Check if candidate value is a duplet value with possible patterns.

        Symbols at even and at odd positions are checked as two separate values.

        Arg:
            value: string variable, credential candidate value
            bit_length: speedup for len(value).bit_length()

        Return:
            boolean variable. True if both halves contain a pattern and False if not

        """
        return self.check_val(value[0::2], bit_length) and self.check_val(value[1::2], bit_length)

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Arg:
            line_data: LineData object, credential candidate data
            target: multiline target from which line data was obtained

        Return:
            boolean variable. True, if need to filter candidate and False if left

        """
        candidate = line_data.value
        value_length = len(candidate)
        bit_length = max(DEFAULT_PATTERN_LEN, value_length.bit_length())
        if bit_length > ValuePatternCheck.MAX_PATTERN_LENGTH:
            # huge values may contain anything
            return False
        # too short value; pattern_len is -1 in the dynamic mode, so this test never fires there
        if value_length < self.pattern_len:
            return True
        required = self.pattern_lengths[bit_length]
        if value_length < required:
            return True
        if self.check_val(candidate, bit_length):
            return True
        # the interleaved check is applied only when each half may hold a whole pattern
        return 2 * required <= value_length and self.duple_pattern_check(candidate, bit_length)
================================================
FILE: credsweeper/filters/value_sealed_secret_check.py
================================================
from typing import Optional
from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueSealedSecretCheck(Filter):
    """
    Check that candidate may be a sealed secret
    https://github.com/bitnami-labs/sealed-secrets/blob/main/docs/developer/crypto.md
    """

    # how many lines before and after the candidate position are searched for markers
    MAX_SEARCH_MARGIN = 100

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received value and check context for sealed secret markers.

        Can be applied effective for plain scan when the value is full and the target has lines around.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, when need to filter candidate and False if left

        """
        value = line_data.value
        if not value:
            return False
        # the value must start with 'Ag' (longer than 700) or 'AQ' (longer than 350)
        # and have the third symbol in the range 'A'..'D'
        long_enough = (value.startswith('Ag') and 700 < len(value)  #
                       or value.startswith('AQ') and 350 < len(value))
        if not (long_enough and 'A' <= value[2] <= 'D'):
            return False
        margin = ValueSealedSecretCheck.MAX_SEARCH_MARGIN
        first = max(0, line_data.line_pos - margin)
        last = min(len(target.lines), line_data.line_pos + margin)
        # every marker has to be met at least once in the window, each within MAX_LINE_LENGTH symbols of a line
        pending = ["SealedSecret", "encryptedData", "bitnami"]
        for line in target.lines[first:last]:
            pending = [marker for marker in pending if line.find(marker, 0, MAX_LINE_LENGTH) < 0]
            if not pending:
                return True
        return False
================================================
FILE: credsweeper/filters/value_search_check.py
================================================
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueSearchCheck(Filter):
    """Check whether a candidate value contains a pattern - useful for multi rules"""

    def __init__(self, config: Optional[Config] = None, pattern: Optional[str] = None) -> None:
        """Store the plain text pattern to look for.

        Args:
            config: not used by this filter
            pattern: text compared with the value by substring containment; None disables the check

        """
        self.pattern = pattern

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if not self.pattern or not line_data.value:
            return False
        # the shorter text is searched inside the longer one
        if len(self.pattern) < len(line_data.value):
            return self.pattern in line_data.value
        return line_data.value in self.pattern
================================================
FILE: credsweeper/filters/value_similarity_check.py
================================================
from difflib import SequenceMatcher
from typing import Optional
from credsweeper.common.constants import MIN_VALUE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueSimilarityCheck(Filter):
    """Check if candidate value is over 75% similarity as candidate variable. Like: `secret = "mysecret"` (0.8571)."""

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if not (line_data.variable and line_data.value):
            return False
        name = line_data.variable.lower()
        secret = line_data.value.lower()
        if len(secret) <= len(name):
            contained = secret in name
        elif MIN_VALUE_LENGTH <= len(name):
            # `api` and `key` may be in the value, so shorter variable names are not searched
            contained = name in secret
        else:
            contained = False
        if contained:
            return True
        # case-insensitive similarity ratio of the variable name and the value
        return 0.75 < SequenceMatcher(None, name, secret).ratio()
================================================
FILE: credsweeper/filters/value_split_keyword_check.py
================================================
from typing import Optional
from typing import Union
from credsweeper.common import static_keyword_checklist
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueSplitKeywordCheck(Filter):
    """Split the value by standard whitespace separators and filter it when any word is in the keyword checklist."""

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        lowered_words: Union[set, list] = line_data.value.lower().split()
        known_keywords = static_keyword_checklist.keyword_set
        return any(word in known_keywords for word in lowered_words)
================================================
FILE: credsweeper/filters/value_string_type_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueStringTypeCheck(Filter):
    r"""Check if line_data is in source code file that require quotes for string declaration.

    If it is, then checks if line_data really have string literal declaration.
    Comment rows in source files (start with //, /\*, etc) ignored.
    Multiple bytes scenario allowed [123,23,54,67,78,89] or {0xae, 0x54, 0x55, 0xff}

    The candidate is filtered (True) only when all of the following holds:
        - the check is enabled and the value is not a part of URL
        - the value is not a sequence of at least four comma separated numbers
        - line_data is in source code file (.cpp, .py, etc.) and is not comment
        - the value is not quoted and does not start with a decimal digit
        - the separator contains the equals sign
    False otherwise
    """

    # four numbers (decimal or hex, optional 0x prefix and U/L suffixes) separated by commas
    MULTIBYTE_PATTERN = re.compile(r"((0x)?[0-9a-f]{1,16}[UL]*)(\s*,\s*((0x)?[0-9a-f]{1,16}[UL]*)){3}",
                                   flags=re.IGNORECASE)

    def __init__(self, config: Optional[Config] = None, check_for_literals=True) -> None:
        """Remember whether the literal check is enabled.

        Args:
            config: not used by this filter
            check_for_literals: when False the filter never rejects a candidate

        """
        self.check_for_literals = check_for_literals

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if not self.check_for_literals or line_data.url_part:
            return False
        if ValueStringTypeCheck.MULTIBYTE_PATTERN.search(line_data.value):
            # multiple bytes declaration is allowed
            return False
        if not line_data.is_source_file_with_quotes():
            return False
        if line_data.is_comment():
            return False
        if line_data.is_well_quoted_value or line_data.is_quoted:
            return False
        if '0' <= line_data.value[0] <= '9':
            # the value starts with a decimal digit
            return False
        # heterogeneous code e.g. YAML in Python uses colon sign instead equals
        return bool(line_data.separator) and '=' in line_data.separator
================================================
FILE: credsweeper/filters/value_token_base32_check.py
================================================
from typing import Tuple
from credsweeper.filters.value_token_base_check import ValueTokenBaseCheck
class ValueTokenBase32Check(ValueTokenBaseCheck):
    """Check that candidate have good randomization"""

    # value size -> ((hop center, hop spread), (deviation center, deviation spread)), precalculated
    RANGE_DICT = {
        8: ((3.480934, 0.8482364556537906), (1.9280820731422028, 0.5833143826506801)),
        10: ((3.4801753333333334, 0.7508676237320747), (1.9558544090983234, 0.5119385414964345)),
        15: ((3.4803549285714284, 0.603220270918794), (1.9896690734372564, 0.40640877687972476)),
        16: ((3.4798649333333334, 0.5837818960141307), (1.9938368543943692, 0.392547066949958)),
        20: ((3.4809878947368422, 0.518785674729997), (2.0058661928593517, 0.34692788889724946)),
        24: ((3.480511086956522, 0.4726670109337228), (2.0131379532992537, 0.31476354168931936)),
        25: ((3.480877375, 0.4626150412368404), (2.0147828593929953, 0.3075894753390553)),
        32: ((3.4809023548387095, 0.4072672632996217), (2.0231609118646867, 0.2700344059876962)),
        40: ((3.4801929743589746, 0.36361457820793436), (2.027858606807074, 0.2401498396303172)),
        50: ((3.4798551224489795, 0.323708167297437), (2.0318808048208794, 0.2138098551294688)),
        64: ((3.4805990476190476, 0.28572156450556774), (2.035756800745673, 0.18815721535870078)),
    }

    @staticmethod
    def get_stat_range(size: int) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """Returns minimal, maximal for hop and deviation. Precalculated data is applied for speedup

        Args:
            size: length of the value; only sizes present in RANGE_DICT are supported

        Return:
            ((min hop, max hop), (min deviation, max deviation)) - each range is center -/+ ppf * spread

        Raises:
            ValueError: when there is no precalculated data for the size

        """
        stats = ValueTokenBase32Check.RANGE_DICT.get(size)
        if not stats:
            # not calculated
            raise ValueError(f"Not calculated for {size}")
        (hop_center, hop_spread), (dev_center, dev_spread) = stats
        ppf = ValueTokenBaseCheck.get_ppf(size)
        hop_margin = ppf * hop_spread
        dev_margin = ppf * dev_spread
        return ((hop_center - hop_margin, hop_center + hop_margin),  #
                (dev_center - dev_margin, dev_center + dev_margin))
================================================
FILE: credsweeper/filters/value_token_base36_check.py
================================================
from typing import Tuple
from credsweeper.filters.value_token_base_check import ValueTokenBaseCheck
class ValueTokenBase36Check(ValueTokenBaseCheck):
    """Check that candidate have good randomization"""

    # value size -> ((hop center, hop spread), (deviation center, deviation spread)), precalculated
    RANGE_DICT = {
        8: ((3.7190542428571427, 0.8995506118495411), (2.066095086865182, 0.609210293352161)),
        10: ((3.719109611111111, 0.7956463384852813), (2.0946299036665494, 0.5322004874842623)),
        15: ((3.719274257142857, 0.6401989313894239), (2.129437216268589, 0.42108786288993155)),
        16: ((3.7192072666666665, 0.6188627491757901), (2.1336109506109366, 0.4064699817331141)),
        20: ((3.719249815789474, 0.5506473627709657), (2.145293932511567, 0.3591543917048417)),
        24: ((3.7191934304347827, 0.50051922802262), (2.152858549996053, 0.3252064160191062)),
        25: ((3.7192351583333334, 0.4904181410613897), (2.1543202565038735, 0.31823801389315026)),
        32: ((3.7190408419354837, 0.4315967526660196), (2.1620321219700767, 0.2788634701820312)),
        40: ((3.7191682666666668, 0.3852248727988986), (2.16746680811131, 0.24802261318501675)),
        50: ((3.718913744897959, 0.3436564880405547), (2.1715676118603806, 0.22070510537297627)),
        64: ((3.7190009761904763, 0.30325954360127116), (2.1751172797904093, 0.1942582237461476)),
    }

    @staticmethod
    def get_stat_range(size: int) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """Returns minimal, maximal for hop and deviation. Precalculated data is applied for speedup

        Args:
            size: length of the value; only sizes present in RANGE_DICT are supported

        Return:
            ((min hop, max hop), (min deviation, max deviation)) - each range is center -/+ ppf * spread

        Raises:
            ValueError: when there is no precalculated data for the size

        """
        stats = ValueTokenBase36Check.RANGE_DICT.get(size)
        if not stats:
            # not calculated
            raise ValueError(f"Not calculated for {size}")
        (hop_center, hop_spread), (dev_center, dev_spread) = stats
        ppf = ValueTokenBaseCheck.get_ppf(size)
        hop_margin = ppf * hop_spread
        dev_margin = ppf * dev_spread
        return ((hop_center - hop_margin, hop_center + hop_margin),  #
                (dev_center - dev_margin, dev_center + dev_margin))
================================================
FILE: credsweeper/filters/value_token_base64_check.py
================================================
from typing import Tuple
from credsweeper.filters.value_token_base_check import ValueTokenBaseCheck
class ValueTokenBase64Check(ValueTokenBaseCheck):
    """Check that candidate have good randomization"""

    # value size -> ((hop center, hop spread), (deviation center, deviation spread)), precalculated
    RANGE_DICT = {
        8: ((3.7627115714285715, 0.9413431166706269), (2.1378378843992736, 0.6394596814295781)),
        10: ((3.7617393333333333, 0.8327986018456262), (2.168873183866972, 0.5605393324056347)),
        15: ((3.7619624285714286, 0.6698092646328063), (2.2080058406286702, 0.4447698491992352)),
        16: ((3.7618573333333334, 0.6471500119793832), (2.2116826642934453, 0.4288377928263507)),
        20: ((3.7618887368421055, 0.575813792926031), (2.224384985667721, 0.37985781543221253)),
        24: ((3.7621449565217393, 0.5243297908608613), (2.2326041329976607, 0.34397389723600613)),
        25: ((3.762616791666667, 0.5137934920050976), (2.234571917211925, 0.3366547036535176)),
        32: ((3.761885838709677, 0.4521158322065318), (2.2426375800006153, 0.29506039075960255)),
        40: ((3.7622649487179487, 0.4031261511824518), (2.2485911621253574, 0.2622954601051068)),
        50: ((3.762087693877551, 0.3597404118023357), (2.2533774423872956, 0.23384524947332655)),
        64: ((3.7625271746031745, 0.31733579704946846), (2.257532519514275, 0.20571908142867643)),
    }

    @staticmethod
    def get_stat_range(size: int) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """Returns minimal, maximal for hop and deviation. Precalculated data is applied for speedup

        Args:
            size: length of the value; only sizes present in RANGE_DICT are supported

        Return:
            ((min hop, max hop), (min deviation, max deviation)) - each range is center -/+ ppf * spread

        Raises:
            ValueError: when there is no precalculated data for the size

        """
        stats = ValueTokenBase64Check.RANGE_DICT.get(size)
        if not stats:
            # not calculated
            raise ValueError(f"Not calculated for {size}")
        (hop_center, hop_spread), (dev_center, dev_spread) = stats
        ppf = ValueTokenBaseCheck.get_ppf(size)
        hop_margin = ppf * hop_spread
        dev_margin = ppf * dev_spread
        return ((hop_center - hop_margin, hop_center + hop_margin),  #
                (dev_center - dev_margin, dev_center + dev_margin))
================================================
FILE: credsweeper/filters/value_token_base_check.py
================================================
import contextlib
from abc import abstractmethod
from typing import Optional
from typing import Tuple
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.utils.hop_stat import HopStat
class ValueTokenBaseCheck(Filter):
    """Check that candidate have good randomization"""

    # value size -> multiplier, produced with scipy t.ppf(0.9827, n-1) - see get_ppf
    MUL_DICT = {
        8: 2.61619746,
        10: 2.48685659,
        15: 2.34025271,
        16: 2.32370290,
        20: 2.27614996,
        24: 2.24609586,
        25: 2.24023515,
        32: 2.21025277,
        40: 2.18961571,
        50: 2.17355282,
        64: 2.15981241,
    }

    def __init__(self, config: Optional[Config] = None) -> None:
        """Create the statistic helper; `config` is not used."""
        self.__hop_stat = HopStat()

    @staticmethod
    @abstractmethod
    def get_stat_range(size: int) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """Returns (min, max) ranges for hop and deviation of a value with the given size.

        Must be implemented in a subclass.
        """
        raise NotImplementedError

    @staticmethod
    def get_ppf(n: int) -> float:
        """Return the precalculated multiplier for size n; KeyError for a size without data."""
        # Code used to produce the values:
        # from scipy.stats import t
        # print('\n'.join(f'{n}: {t.ppf(0.9827, n-1):.8f},' for n in [8,10,15,16,20,24,25,32,40,50,64]))
        return ValueTokenBaseCheck.MUL_DICT[n]

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        # any failure (e.g. no precalculated data for the size) leaves the candidate
        with contextlib.suppress(Exception):
            token = line_data.value
            hop, dev = self.__hop_stat.stat(token)
            hop_range, dev_range = self.get_stat_range(len(token))
            (min_hop, max_hop), (min_dev, max_dev) = hop_range, dev_range
            # filter the value when any statistic is out of its expected range
            return not (min_hop <= hop <= max_hop and min_dev <= dev <= max_dev)
        return False
================================================
FILE: credsweeper/filters/value_token_check.py
================================================
import re
from typing import Optional
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
class ValueTokenCheck(Filter):
    """Check if first substring of token is shorter than 5.

    Split candidate value into substrings using ` ;`{})(<>[]` separators. Check if first substring is shorter than 5

    Examples:
        "my password"
        "12);password"
    """

    # NOTE(review): the stored expression "(?[\]`]" is not valid regex syntax, so it was rebuilt
    # from the separator list documented above. If the original expression had a lookaround
    # in front of the character class it could not be recovered - confirm against history.
    SPLIT_PATTERN = re.compile(r"[ ;`{}()<>\[\]]")

    def __init__(self, config: Optional[Config] = None) -> None:
        """The filter keeps no state; `config` is not used."""
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        if line_data.is_well_quoted_value:
            return False
        # only the first separator matters: the text before it is the token to measure
        tokens = self.SPLIT_PATTERN.split(line_data.value, maxsplit=1)
        # If tokens have length of 1 - pattern is not present in the value and original value returned from `.split(`
        if len(tokens) < 2:
            return False
        return len(tokens[0]) < 5
================================================
FILE: credsweeper/logger/__init__.py
================================================
================================================
FILE: credsweeper/logger/logger.py
================================================
import logging
import logging.config
from pathlib import Path
from typing import Optional
from credsweeper.app import APP_PATH
from credsweeper.utils.util import Util
class Logger:
    """Class that used to configure logging in CredSweeper."""

    # numeric level above logging.CRITICAL (50)
    SILENCE = 60

    # accepted level names (upper case) and their numeric logging levels
    LEVELS = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARN": logging.WARNING,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "FATAL": logging.CRITICAL,
        "CRITICAL": logging.CRITICAL,
        "SILENCE": SILENCE
    }

    @staticmethod
    def init_logging(log_level: str, file_path: Optional[str] = None) -> None:
        """Init logger.

        Init logging with configuration from file 'credsweeper_path/secret/log.yaml'. For configure log level of
        console output used 'log_level' args

        Args:
            log_level: log level for console output
            file_path: path of custom log config

        Raises:
            ValueError: when log_level is not one of the LEVELS keys (case-insensitive)

        """
        try:
            level = Logger.LEVELS.get(log_level.upper())
            if level is None:
                raise ValueError(f"log level given: {log_level} -- must be one of: {' | '.join(Logger.LEVELS.keys())}")
            logging_config = Util.yaml_load(file_path) if file_path else None
            if not logging_config:
                # no custom config or it was loaded empty - use the bundled one
                logging_config = Util.yaml_load(APP_PATH / "secret" / "log.yaml")
            log_dir = Path(logging_config["handlers"]["logfile"]["filename"]).resolve().parent
            # parents=True: a nested log directory must not silently drop the whole configuration
            # into the basicConfig fallback below because of FileNotFoundError
            log_dir.mkdir(parents=True, exist_ok=True)
            logging_config["handlers"]["console"]["level"] = level
            logging.config.dictConfig(logging_config)
            # modules listed in "ignore" report errors only
            for module in logging_config["ignore"]:
                logging.getLogger(module).setLevel(logging.ERROR)
        except OSError:
            # file system problems: fall back to the minimal configuration
            logging.basicConfig(level=logging.WARNING)
================================================
FILE: credsweeper/main.py
================================================
import binascii
import contextlib
import logging
import os
import sys
import time
from argparse import ArgumentParser, ArgumentTypeError, Namespace, BooleanOptionalAction
from pathlib import Path
from typing import Any, Union, Dict, Tuple, Sequence
from git import Repo, Commit
from credsweeper import __version__
from credsweeper.app import APP_PATH, CredSweeper
from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
from credsweeper.file_handler.abstract_provider import AbstractProvider
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
from credsweeper.file_handler.files_provider import FilesProvider
from credsweeper.file_handler.patches_provider import PatchesProvider
from credsweeper.logger.logger import Logger
from credsweeper.utils.util import Util
EXIT_SUCCESS = 0
EXIT_FAILURE = 1
logger = logging.getLogger(__name__)
def positive_int(value: Any) -> int:
    """Convert the argument to int and accept it only when it is greater than zero.

    Args:
        value: raw argument value that must be convertible with int()

    Returns:
        the converted positive integer

    Raises:
        ArgumentTypeError: if the converted number is zero or negative

    """
    number = int(value)
    if 0 < number:
        return number
    logger.error("Number of parallel processes should be a positive number: %s", value)
    raise ArgumentTypeError(f"{value} should be greater than 0")
def threshold_or_float_or_zero(arg: str) -> Union[int, float, ThresholdPreset]:
    """Interpret the ML threshold argument.

    Args:
        arg: string that is either '0', a float or one of allowed values in ThresholdPreset

    Returns:
        int 0 for the exact string '0', float if arg is convertible to float,
        otherwise ThresholdPreset member if arg is one of the allowed values

    Raises:
        ArgumentTypeError: if arg cannot be interpreted as float or ThresholdPreset

    """
    allowed_presents = [e.value for e in ThresholdPreset]
    if arg == '0':
        return 0
    try:
        # any other number representation is returned as a float
        return float(arg)
    except ValueError:
        pass
    if arg not in allowed_presents:
        raise ArgumentTypeError(f"value must be a float or one of {allowed_presents}")
    return ThresholdPreset[arg]
def logger_levels(log_level: str) -> str:
    """Logger level correctness verification and transformation

    Args:
        log_level: string with level

    Returns:
        log_level in UPPERCASE when it is one of Logger.LEVELS keys

    Raises:
        ArgumentTypeError: if the level is unknown

    """
    upper_level = log_level.upper()
    if upper_level not in Logger.LEVELS:
        raise ArgumentTypeError(
            f"Log level provided: {log_level} -- must be one of: {' | '.join(Logger.LEVELS.keys())}")
    return upper_level
def severity_levels(severity_level: str) -> Severity:
    """Severity level correctness verification and transformation

    Args:
        severity_level: string with level

    Returns:
        the truthy result of Severity.get for the provided string

    Raises:
        ArgumentTypeError: if Severity.get returns nothing for the string

    """
    severity = Severity.get(severity_level)
    if not severity:
        raise ArgumentTypeError(
            f"Severity level provided: {severity_level} -- must be one of: {' | '.join([i.value for i in Severity])}")
    return severity
def check_integrity() -> int:
    """Calculates CRC32 of program files

    Walks APP_PATH and combines CRC32 of every non-empty file with a checked extension using XOR.

    Returns:
        combined CRC32 of files as integer

    """
    checked_extensions = [".py", ".json", ".txt", ".yaml", ".onnx"]
    checksum = 0
    for root, _dirs, files in os.walk(APP_PATH):
        for file_name in files:
            if Util.get_extension(file_name) not in checked_extensions:
                continue
            data = Util.read_data(Path(root) / file_name)
            if data:
                # XOR makes the sum independent of the walk order
                checksum ^= binascii.crc32(data)
    return checksum
def get_arguments() -> Namespace:
    """All CLI arguments are defined here

    Builds the argument parser and parses the process command line (sys.argv).

    Returns:
        Namespace with parsed arguments

    """
    parser = ArgumentParser(prog="python -m credsweeper")
    # the only case when a scan target is not required: the program is launched with "--banner" alone
    single_banner_argument = 2 == len(sys.argv) and "--banner" == sys.argv[1]
    # mutually exclusive modes of the application
    group = parser.add_mutually_exclusive_group(required=not single_banner_argument)
    group.add_argument("--path", nargs="+", help="file or directory to scan", dest="path", metavar="PATH")
    group.add_argument("--diff_path", nargs="+", help="git diff file to scan", dest="diff_path", metavar="PATH")
    group.add_argument("--export_config",
                       nargs="?",
                       help="exporting default config to file (default: config.json)",
                       const="config.json",
                       dest="export_config",
                       metavar="PATH")
    group.add_argument("--export_log_config",
                       nargs="?",
                       help="exporting default logger config to file (default: log.yaml)",
                       const="log.yaml",
                       dest="export_log_config",
                       metavar="PATH")
    group.add_argument("--git", help="git repo to scan", dest="git", metavar="PATH")
    # options of git mode, rules and configuration files
    parser.add_argument("--ref",
                        help="scan git repo from the ref, otherwise - all branches were scanned (slow)",
                        dest="ref",
                        type=str)
    parser.add_argument("--rules",
                        help="path of rule config file (default: credsweeper/rules/config.yaml). "
                        f"severity:{[i.value for i in Severity]} "
                        f"type:{[i.value for i in RuleType]}",
                        default=None,
                        dest="rule_path",
                        metavar="PATH")
    parser.add_argument("--severity",
                        help=f"set minimum level for rules to apply {[i.value for i in Severity]}"
                        f"(default: '{Severity.INFO}', case insensitive)",
                        default=Severity.INFO,
                        dest="severity",
                        type=severity_levels)
    parser.add_argument("--config",
                        help="use custom config (default: built-in)",
                        default=None,
                        dest="config_path",
                        metavar="PATH")
    parser.add_argument("--log_config",
                        help="use custom log config (default: built-in)",
                        default=None,
                        dest="log_config_path",
                        metavar="PATH")
    parser.add_argument("--denylist",
                        help="path to a plain text file with lines or secrets to ignore",
                        default=None,
                        dest="denylist_path",
                        metavar="PATH")
    # options of scanning behaviour
    parser.add_argument("--find-by-ext",
                        help="find files by predefined extension",
                        dest="find_by_ext",
                        action="store_true")
    parser.add_argument("--pedantic",
                        help="process files without extension",
                        action=BooleanOptionalAction,
                        default=False)
    parser.add_argument("--depth",
                        help="additional recursive search in data (experimental)",
                        type=positive_int,
                        dest="depth",
                        default=0,
                        required=False,
                        metavar="POSITIVE_INT")
    # "store_false": args.no_filters is True by default and becomes False when the option is given
    parser.add_argument("--no-filters", help="disable filters", dest="no_filters", action="store_false")
    parser.add_argument("--doc", help="document-specific scanning", dest="doc", action="store_true")
    # options of ML validation
    parser.add_argument("--ml_threshold",
                        help="setup threshold for the ml model. "
                        "The lower the threshold - the more credentials will be reported. "
                        f"Allowed values: float between 0 and 1, or any of {[e.value for e in ThresholdPreset]} "
                        "(default: medium)",
                        type=threshold_or_float_or_zero,
                        default=ThresholdPreset.medium,
                        dest="ml_threshold",
                        required=False,
                        metavar="THRESHOLD_OR_FLOAT_OR_ZERO")
    parser.add_argument("--ml_batch_size",
                        "-b",
                        help="batch size for model inference (default: 16)",
                        type=positive_int,
                        dest="ml_batch_size",
                        default=16,
                        required=False,
                        metavar="POSITIVE_INT")
    parser.add_argument("--ml_config",
                        help="use external config for ml model",
                        type=str,
                        default=None,
                        dest="ml_config",
                        required=False,
                        metavar="PATH")
    parser.add_argument("--ml_model",
                        help="use external ml model",
                        type=str,
                        default=None,
                        dest="ml_model",
                        required=False,
                        metavar="PATH")
    parser.add_argument("--ml_providers",
                        help="comma separated list of providers for onnx (CPUExecutionProvider is used by default)",
                        type=str,
                        default=None,
                        dest="ml_providers",
                        required=False,
                        metavar="STR")
    # options of resources usage
    parser.add_argument("--jobs",
                        "-j",
                        help="number of parallel processes to use (default: 1)",
                        type=positive_int,
                        dest="jobs",
                        default=1,
                        metavar="POSITIVE_INT")
    parser.add_argument("--thrifty",
                        help="clear objects after scan to reduce memory consumption",
                        action=BooleanOptionalAction,
                        default=True)
    parser.add_argument("--skip_ignored",
                        help="parse .gitignore files and skip credentials from ignored objects",
                        dest="skip_ignored",
                        action="store_true")
    # options of result reporting
    parser.add_argument("--error",
                        help="produce error code if credentials are found",
                        action=BooleanOptionalAction,
                        default=False)
    parser.add_argument("--save-json",
                        nargs="?",
                        help="save result to json file (default: output.json)",
                        const="output.json",
                        dest="json_filename",
                        metavar="PATH")
    parser.add_argument("--save-xlsx",
                        nargs="?",
                        help="save result to xlsx file (default: output.xlsx)",
                        const="output.xlsx",
                        dest="xlsx_filename",
                        metavar="PATH")
    parser.add_argument("--stdout", help="print results to stdout", action=BooleanOptionalAction, default=True)
    parser.add_argument("--color", help="print results with colorization", action=BooleanOptionalAction, default=False)
    parser.add_argument("--hashed",
                        help="line, variable, value will be hashed in output",
                        action=BooleanOptionalAction,
                        default=False)
    parser.add_argument("--subtext",
                        help=f"line text will be stripped in {2 * ML_HUNK} symbols but value and variable are kept",
                        action=BooleanOptionalAction,
                        default=False)
    parser.add_argument("--sort",
                        help="enable output sorting",
                        dest="sort_output",
                        action=BooleanOptionalAction,
                        default=False)
    # miscellaneous options
    parser.add_argument("--log",
                        "-l",
                        help=(f"provide logging level of {list(Logger.LEVELS.keys())}"
                              f" (default: 'warning', case insensitive)"),
                        default="warning",
                        dest="log",
                        metavar="LOG_LEVEL",
                        type=logger_levels)
    parser.add_argument("--size_limit",
                        help="set size limit of files that for scanning (eg. 1GB / 10MiB / 1000)",
                        dest="size_limit",
                        default=None)
    parser.add_argument("--banner",
                        help="show version and crc32 sum of CredSweeper files at start",
                        action="store_const",
                        const=True)
    parser.add_argument("--version",
                        "-V",
                        help="show program's version number and exit",
                        action="version",
                        version=f"CredSweeper {__version__}")
    return parser.parse_args()
def get_credsweeper(args: Namespace) -> CredSweeper:
    """Common function to create the instance

    Args:
        args: arguments of the application

    Returns:
        CredSweeper instance configured from the arguments

    """
    denylist = []
    if args.denylist_path is not None:
        # keep only non-empty items of the file
        denylist = list(filter(None, Util.read_file(args.denylist_path)))
    options = {
        "rule_path": args.rule_path,
        "config_path": args.config_path,
        "json_filename": args.json_filename,
        "xlsx_filename": args.xlsx_filename,
        "stdout": args.stdout,
        "color": args.color,
        "hashed": args.hashed,
        "subtext": args.subtext,
        "sort_output": args.sort_output,
        # the option "--no-filters" is stored inverted: True means the filters are used
        "use_filters": args.no_filters,
        "pool_count": args.jobs,
        "ml_batch_size": args.ml_batch_size,
        "ml_threshold": args.ml_threshold,
        "ml_config": args.ml_config,
        "ml_model": args.ml_model,
        "ml_providers": args.ml_providers,
        "find_by_ext": args.find_by_ext,
        "pedantic": args.pedantic,
        "depth": args.depth,
        "doc": args.doc,
        "severity": args.severity,
        "size_limit": args.size_limit,
        # the same list is used for lines and for values
        "exclude_lines": denylist,
        "exclude_values": denylist,
        "thrifty": args.thrifty,
        "log_level": args.log,
    }
    return CredSweeper(**options)
def scan(args: Namespace, content_provider: AbstractProvider) -> int:
    """Scan content_provider data with a new CredSweeper instance created from the arguments

    Args:
        args: arguments of the application
        content_provider: provider instance to scan data from

    Returns:
        The value returned by CredSweeper.run (number of detected credentials) or -1 if any exception happened

    Note:
        A previous revision of this docstring mentioned a DeprecationWarning for 'json_filename' and/or
        'xlsx_filename'; nothing in this function issues it - presumably it comes from CredSweeper, TODO confirm.

    """
    try:
        credsweeper = get_credsweeper(args)
        return credsweeper.run(content_provider=content_provider)
    except Exception as exc:
        # one CRITICAL record with the traceback is enough:
        # an additional logger.exception() call only duplicated the same traceback at ERROR level
        logger.critical(exc, exc_info=True)
    return -1
def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
    """Process a commit and create content providers for files that result from it

    Args:
        commit: the commit to process
        repo: repository of the commit

    Returns:
        providers with content of result blobs, one per unique path

    """
    providers_by_path = {}
    if commit.parents:
        ancestors = commit.parents
    else:
        # use the hardcoded sha1 until sha256 objects are not supported by GitPython
        ancestors = [repo.tree("4b825dc642cb6eb9a060e54bf8d69288fbee4904")]
    for parent in ancestors:
        for diff in parent.diff(commit):
            # only result files
            blob_b = diff.b_blob
            if not blob_b or blob_b.path in providers_by_path:
                continue
            try:
                provider = ByteContentProvider(content=blob_b.data_stream.read(),
                                               file_path=str(blob_b.path),
                                               info=DiffRowType.ADDED.value)
                providers_by_path[blob_b.path] = provider
            except Exception as exc:
                logger.warning("A submodule was not properly initialized or commit was removed: %s", exc)
    return list(providers_by_path.values())
def drill(args: Namespace) -> Tuple[int, int]:
    """Scan repository for branches and commits

    Start commits are taken from args.ref (a ref name, or the value itself as a commit reference when no ref
    has such a name) or from all refs whose names start with 'origin/' or 'refs/heads/'. Parents of each
    visited commit are queued, so the history is traversed iteratively.
    When a report file name is configured, each commit gets an individual report whose name includes
    the commit sha1 before the original suffix; a commit with an existing report is skipped.

    Args:
        args: arguments of the application

    Returns:
        total credentials found (-1 if any exception happened)
        total scanned commits

    """
    total_credentials = 0
    total_commits = 0
    try:
        # repo init first
        repo = Repo(args.git)
        if args.ref:
            commits_sha1 = set(x.commit.hexsha for x in repo.refs if x.name == args.ref)
            if not commits_sha1:
                commits_sha1 = {args.ref}  # single commit sha1 reference
        else:
            commits_sha1 = set(x.commit.hexsha for x in repo.refs
                               if x.name.startswith('origin/') or x.name.startswith('refs/heads/'))
        logger.info("Git repository %s with commits: %s", args.git, commits_sha1)
        # then - credsweeper
        credsweeper = get_credsweeper(args)
        # use flat iterations to avoid recursive limits
        to_scan = set(commits_sha1)
        # local speedup for already scanned commits - avoid file system interactive
        scanned = set()
        # to avoid double-check
        skipped = set()
        while to_scan:
            commit_sha1 = to_scan.pop()
            if commit_sha1 in scanned:
                # the commit was scanned in this launch
                continue
            commit = repo.commit(commit_sha1)
            if commit.parents:
                # add parents only when they were not skipped or scanned previously
                to_scan.update(x.hexsha for x in commit.parents if x.hexsha not in skipped and x.hexsha not in scanned)
            # check whether the commit has been checked and the report is present
            skip_already_scanned = False
            if args.json_filename:
                json_path = Path(args.json_filename)
                # e.g. "output.json" -> "output.<sha1>.json"
                json_path = json_path.with_suffix(f".{commit_sha1}{json_path.suffix}")
                if json_path.exists():
                    skip_already_scanned = True
                else:
                    credsweeper.json_filename = json_path
            if args.xlsx_filename:
                xlsx_path = Path(args.xlsx_filename)
                xlsx_path = xlsx_path.with_suffix(f".{commit_sha1}{xlsx_path.suffix}")
                if xlsx_path.exists():
                    skip_already_scanned = True
                else:
                    credsweeper.xlsx_filename = xlsx_path
            # presence of any of the configured reports is enough to skip the commit
            if skip_already_scanned:
                skipped.add(commit_sha1)
                logger.info("Skip already scanned commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
                continue
            logger.info("Scan commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
            # prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
            if providers := get_commit_providers(commit, repo):
                # the instance is reused, so candidates of the previous commit are dropped first
                credsweeper.credential_manager.candidates.clear()
                credsweeper.scan(providers)
                credsweeper.post_processing()
                credsweeper.export_results()
                total_credentials += credsweeper.credential_manager.len_credentials()
                # only commits with at least one provider are counted
                total_commits += 1
            scanned.add(commit_sha1)
    except Exception as exc:
        logger.critical(exc, exc_info=True)
        return -1, total_commits
    return total_credentials, total_commits
def main() -> int:
    """Main function

    Parses CLI arguments, initializes logging and runs one of the modes:
    scan of paths, scan of patches, scan of a git repository, export of the default config,
    export of the default logger config or the banner only.

    Returns:
        EXIT_SUCCESS when the selected mode was completed, and, if the option "--error" is used,
        no credentials were found; otherwise EXIT_FAILURE

    """
    start_time = time.perf_counter()
    # pessimistic default: each mode sets EXIT_SUCCESS explicitly
    result = EXIT_FAILURE
    credentials_number = 0
    args = get_arguments()
    if args.banner:
        print(f"CredSweeper {__version__} crc32:{check_integrity():08x}")
    Logger.init_logging(args.log, args.log_config_path)
    logger.info("Init CredSweeper object with arguments: %s CWD: %s", args, os.getcwd())
    # caption -> number, printed after a successful run
    summary: Dict[str, int] = {}
    if args.path:
        logger.info("Run analyzer on path: %s", args.path)
        content_provider: AbstractProvider = FilesProvider(args.path, skip_ignored=args.skip_ignored)
        credentials_number = scan(args, content_provider)
        summary["Detected Credentials"] = credentials_number
        # scan() returns -1 in case of failure
        if 0 <= credentials_number:
            result = EXIT_SUCCESS
    elif args.diff_path:
        # Analyze added data
        logger.info("Run analyzer on added rows from patch files: %s", args.diff_path)
        content_provider = PatchesProvider(args.diff_path, change_type=DiffRowType.ADDED)
        add_credentials_number = scan(args, content_provider)
        summary["Added File Credentials"] = add_credentials_number
        # Analyze deleted data
        logger.info("Run analyzer on deleted rows from patch files: %s", args.diff_path)
        content_provider = PatchesProvider(args.diff_path, change_type=DiffRowType.DELETED)
        del_credentials_number = scan(args, content_provider)
        summary["Deleted File Credentials"] = del_credentials_number
        if 0 <= add_credentials_number and 0 <= del_credentials_number:
            # it means the scan was successful done
            result = EXIT_SUCCESS
        # collect number of all found credential to produce error code when necessary
        credentials_number = add_credentials_number + del_credentials_number
    elif args.git:
        logger.info("Run analyzer on GIT: %s", args.git)
        credentials_number, commits_number = drill(args)
        summary[f"Detected Credentials in {args.git} for {commits_number} commits "] = credentials_number
        # drill() returns -1 as the first item in case of failure
        if 0 <= credentials_number:
            result = EXIT_SUCCESS
    elif args.export_config:
        logger.info("Exporting default config to file: %s", args.export_config)
        config_dict = Util.json_load(APP_PATH / "secret" / "config.json")
        Util.json_dump(config_dict, args.export_config)
        result = EXIT_SUCCESS
    elif args.export_log_config:
        logger.info("Exporting default logger config to file: %s", args.export_log_config)
        config_dict = Util.yaml_load(APP_PATH / "secret" / "log.yaml")
        Util.yaml_dump(config_dict, args.export_log_config)
        result = EXIT_SUCCESS
    elif args.banner and 2 == len(sys.argv):
        # only extend version invocation
        result = EXIT_SUCCESS
    else:
        logger.error("Not specified 'path' or 'diff_path'")
    # the summary is printed only for a successful scan
    if EXIT_SUCCESS == result and len(summary):
        for k, v in summary.items():
            print(f"{k}: {v}")
        print(f"Time Elapsed: {time.perf_counter() - start_time}")
    if args.error and EXIT_SUCCESS == result and 0 < credentials_number:
        # override result when credentials were found with the requirement
        result = EXIT_FAILURE
    return result
================================================
FILE: credsweeper/ml_model/__init__.py
================================================
================================================
FILE: credsweeper/ml_model/features/__init__.py
================================================
from credsweeper.ml_model.features.entropy_evaluation import EntropyEvaluation
from credsweeper.ml_model.features.file_extension import FileExtension
from credsweeper.ml_model.features.has_html_tag import HasHtmlTag
from credsweeper.ml_model.features.is_secret_numeric import IsSecretNumeric
from credsweeper.ml_model.features.length_of_attribute import LengthOfAttribute
from credsweeper.ml_model.features.morpheme_dense import MorphemeDense
from credsweeper.ml_model.features.rule_name import RuleName
from credsweeper.ml_model.features.rule_severity import RuleSeverity
from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
from credsweeper.ml_model.features.word_in_path import WordInPath
from credsweeper.ml_model.features.word_in_postamble import WordInPostamble
from credsweeper.ml_model.features.word_in_preamble import WordInPreamble
from credsweeper.ml_model.features.word_in_transition import WordInTransition
from credsweeper.ml_model.features.word_in_value import WordInValue
from credsweeper.ml_model.features.word_in_variable import WordInVariable
================================================
FILE: credsweeper/ml_model/features/entropy_evaluation.py
================================================
import math
from typing import Dict, List, Set
import numpy as np
from credsweeper.common.constants import Chars, ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.data_content_provider import MIN_DATA_LEN
from credsweeper.ml_model.features.feature import Feature
class EntropyEvaluation(Feature):
    """
    Renyi, Shannon entropy evaluation with Hartley entropy normalization.
    Augmentation with possible set of chars (hex, base64, etc.)
    Analyse only begin of the value

    See next link for details:
    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf

    """

    # Max size of ML analyzed value is ML_HUNK but value may be bigger
    HUNK_SIZE = 4 * ML_HUNK
    # precalculated log2 for every size from 4 up to the analyzed head size
    LOG2_CACHE: Dict[int, float] = {size: math.log2(size) for size in range(4, 4 * ML_HUNK + 1)}
    # one set of characters per Chars member
    CHAR_SET: List[Set[str]] = [set(chars.value) for chars in Chars]
    # three entropy values are followed by one flag per character set
    RESULT_SIZE = 3 + len(Chars)

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns real entropy and possible sets of characters"""
        features: np.ndarray = np.zeros(shape=EntropyEvaluation.RESULT_SIZE, dtype=np.float32)
        # only head of value will be analyzed
        head = candidate.line_data_list[0].value[:EntropyEvaluation.HUNK_SIZE]
        length = len(head)
        symbols, occurrences = np.unique(list(head), return_counts=True)
        if MIN_DATA_LEN <= length:
            # the entropy is evaluated only for a value of at least MIN_DATA_LEN symbols
            frequencies = occurrences / length
            # sizes outside the cache fall back to -1.0
            normalizer = EntropyEvaluation.LOG2_CACHE.get(length, -1.0)
            # renyi_entropy alpha=0.5
            renyi_half = 2 * np.log2(np.sum(frequencies**0.5))
            # shannon_entropy or renyi_entropy alpha=1
            shannon = -np.sum(frequencies * np.log2(frequencies))
            # renyi_entropy alpha=2
            renyi_two = -1 * np.log2(np.sum(frequencies**2))
            features[0] = renyi_half / normalizer
            features[1] = shannon / normalizer
            features[2] = renyi_two / normalizer
        if 0 < length:
            # mark every character set which covers all symbols of a non-empty value
            # use the new variable to deal with mypy
            present = set(symbols)
            for offset, charset in enumerate(EntropyEvaluation.CHAR_SET):
                if present.issubset(charset):
                    features[3 + offset] = 1.0
        return features
================================================
FILE: credsweeper/ml_model/features/feature.py
================================================
from abc import ABC, abstractmethod
from typing import List, Any
import numpy as np
from credsweeper.credentials.candidate import Candidate
class Feature(ABC):
    """Base class for features."""

    def __init__(self):
        """Nothing to initialize in the base class"""

    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
        """Apply extract() to each candidate and pack the results to an array.

        Args:
            candidates: list of candidates to extract features

        Returns:
            array with one item per candidate

        """
        extracted = list(map(self.extract, candidates))
        return np.array(extracted)

    @abstractmethod
    def extract(self, candidate: Candidate) -> Any:
        """Abstract method of base class: feature of single candidate"""
        raise NotImplementedError
================================================
FILE: credsweeper/ml_model/features/file_extension.py
================================================
from typing import List, Any
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class FileExtension(WordIn):
    """Categorical feature of file type.

    Parameters:
        extensions: extension labels

    """

    def __init__(self, extensions: List[str]) -> None:
        super().__init__(words=extensions)

    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
        """Returns array with marked labels that match lowercase file types of the candidates"""
        seen_types = {item.line_data_list[0].file_type.lower() for item in candidates}
        return self.word_in_(seen_types)

    def extract(self, candidate: Candidate) -> Any:
        """Not used: the feature is calculated for all candidates at once in __call__"""
        raise NotImplementedError
================================================
FILE: credsweeper/ml_model/features/has_html_tag.py
================================================
from credsweeper.common.constants import CHUNK_SIZE
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
from credsweeper.utils.util import Util
class HasHtmlTag(WordIn):
"""Feature is true if line has HTML tags (HTML file)."""
HTML_WORDS = [
'< img', ' None:
super().__init__(HasHtmlTag.HTML_WORDS)
def extract(self, candidate: Candidate) -> float:
    """Returns 1.0 if the text near the value looks like HTML markup, otherwise -1.0

    Args:
        candidate: the candidate, only first LineData is analyzed

    Returns:
        1.0 when a word from self.words or a tag closing marker is found, -1.0 otherwise

    """
    # presumably Util.subtext cuts a piece of the line near value_start limited with CHUNK_SIZE - verify in Util
    subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
    candidate_line_data_list_0_line_lower = subtext.lower()
    if '<' not in candidate_line_data_list_0_line_lower:
        # early check
        return -1.0
    for i in self.words:
        if i in candidate_line_data_list_0_line_lower:
            return 1.0
    # NOTE(review): the second operand is an empty string literal which is contained in any string,
    # so the condition is always true when reached - the literal looks truncated, confirm with upstream
    if "/>" in candidate_line_data_list_0_line_lower or "" in candidate_line_data_list_0_line_lower:
        # possible closed tag
        return 1.0
    return -1.0
================================================
FILE: credsweeper/ml_model/features/is_secret_numeric.py
================================================
import contextlib
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.feature import Feature
class IsSecretNumeric(Feature):
    """Feature is true if candidate value is a numerical value."""

    def extract(self, candidate: Candidate) -> float:
        """Returns 1.0 when the value of first LineData is convertible to float, otherwise -1.0"""
        try:
            float(candidate.line_data_list[0].value)
        except ValueError:
            return -1.0
        return 1.0
================================================
FILE: credsweeper/ml_model/features/length_of_attribute.py
================================================
import numpy as np
from credsweeper.common.constants import ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.feature import Feature
class LengthOfAttribute(Feature):
    """Abstract class for obtain a normalized value of length with max size of hunk"""

    def __init__(self, attribute: str):
        """Select the normalization size for the attribute.

        Args:
            attribute: name of LineData member: "line", "value" or "variable"

        Raises:
            ValueError: if the attribute is not supported

        """
        super().__init__()
        if attribute == "line":
            self.hunk_plus = 2 * ML_HUNK + 1
        elif attribute in ("value", "variable"):
            self.hunk_plus = ML_HUNK + 1
        else:
            raise ValueError(f"Not supported attribute '{attribute}'")
        self.attribute = attribute

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns single item array with normalized length of the attribute of first LineData member"""
        text = getattr(candidate.line_data_list[0], self.attribute, None)
        if not text:
            # the attribute is empty
            return np.array([0.0])
        length = len(text)
        if self.hunk_plus <= length:
            # 1.0 means the attribute is oversize
            return np.array([1.0])
        # should be in (0, 1)
        return np.array([length / self.hunk_plus])
================================================
FILE: credsweeper/ml_model/features/morpheme_dense.py
================================================
from credsweeper.common import static_keyword_checklist
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.feature import Feature
class MorphemeDense(Feature):
    """Feature calculates morphemes density for a value"""

    def extract(self, candidate: Candidate) -> float:
        """Returns the part of the lowercase value which is covered with known morphemes.

        Args:
            candidate: the candidate, only value of first LineData is analyzed

        Returns:
            0.0 for an empty value, otherwise summary length of all non-overlapping occurrences of each
            morpheme divided by the value length and limited with 1.0

        """
        value = candidate.line_data_list[0].value.lower()
        if not value:
            return 0.0
        # str.count finds non-overlapping occurrences exactly like a manual find loop with a step of
        # the morpheme length, but it cannot hang on an empty morpheme
        morphemes_length = sum(
            value.count(morpheme) * len(morpheme) for morpheme in static_keyword_checklist.morpheme_set)
        # different morphemes may overlap each other, so the ratio is limited
        return min(1.0, morphemes_length / len(value))
================================================
FILE: credsweeper/ml_model/features/rule_name.py
================================================
from typing import List, Any
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class RuleName(WordIn):
    """Categorical feature that corresponds to rule name.

    Parameters:
        rule_names: rule name labels

    """

    def __init__(self, rule_names: List[str]) -> None:
        super().__init__(words=rule_names)

    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
        """Returns array with marked labels that match rule names of the candidates"""
        seen_rules = {item.rule_name for item in candidates}
        return self.word_in_(seen_rules)

    def extract(self, candidate: Candidate) -> Any:
        """Not used: the feature is calculated for all candidates at once in __call__"""
        raise NotImplementedError
================================================
FILE: credsweeper/ml_model/features/rule_severity.py
================================================
from credsweeper.common.constants import Severity
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.feature import Feature
class RuleSeverity(Feature):
    """Categorical feature that corresponds to severity of the rule."""

    def extract(self, candidate: Candidate) -> float:
        """Returns a number from 0.0 (INFO) to 1.0 (CRITICAL) for severity of the candidate.

        Raises:
            ValueError: if the severity is not one of the known levels

        """
        scores = (
            (Severity.CRITICAL, 1.0),
            (Severity.HIGH, 0.75),
            (Severity.MEDIUM, 0.5),
            (Severity.LOW, 0.25),
            (Severity.INFO, 0.0),
        )
        for level, score in scores:
            if level == candidate.severity:
                return score
        raise ValueError(f"Unknown type of severity: {candidate.severity}")
================================================
FILE: credsweeper/ml_model/features/search_in_attribute.py
================================================
import re
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.feature import Feature
class SearchInAttribute(Feature):
    """Abstract feature returns boolean for matched pattern in member of first LineData"""

    def __init__(self, pattern: str, attribute: str):
        """Compile the pattern and remember the name of LineData member to search in"""
        super().__init__()
        self.pattern = re.compile(pattern)
        self.attribute = attribute

    def extract(self, candidate: Candidate) -> float:
        """Returns 1.0 if the pattern is found in non-empty member of first LineData, otherwise -1.0"""
        text = getattr(candidate.line_data_list[0], self.attribute, None)
        if text and self.pattern.search(text):
            return 1.0
        return -1.0
================================================
FILE: credsweeper/ml_model/features/word_in.py
================================================
from abc import abstractmethod
from typing import List, Any, Set, Union
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.feature import Feature
class WordIn(Feature):
    """Abstract feature returns array with all matched words in a string"""

    def __init__(self, words: List[str]):
        """Keep sorted unique words with their positions.

        Args:
            words: list of words without duplicates

        Raises:
            RuntimeError: if the list contains duplicates

        """
        super().__init__()
        self.dimension = len(words)
        self.words = sorted(set(words))
        self.enumerated_words = list(enumerate(self.words))
        # the size is reduced after deduplication when a word is repeated
        if self.dimension != len(self.enumerated_words):
            raise RuntimeError(f"Check duplicates:{words}")

    @abstractmethod
    def extract(self, candidate: Candidate) -> Any:
        raise NotImplementedError

    @property
    def zero(self) -> np.ndarray:
        """Returns zero filled array for case of empty input"""
        return np.zeros(shape=[self.dimension], dtype=np.int8)

    def word_in_(self, iterable_data: Union[str, List[str], Set[str]]) -> np.ndarray:
        """Returns array of single row where 1 marks each word which is found in the data"""
        flags: np.ndarray = self.zero
        for position, word in self.enumerated_words:
            if word not in iterable_data:
                continue
            flags[position] = 1
        return np.array([flags])
================================================
FILE: credsweeper/ml_model/features/word_in_path.py
================================================
import os.path
from pathlib import Path
from typing import List, Any
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class WordInPath(WordIn):
    """Categorical feature that corresponds to words in path (POSIX, lowercase)"""

    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
        """Returns array of words found in the path of the first candidate without the extension"""
        # actually there must be one path because the candidates are grouped before
        file_path = candidates[0].line_data_list[0].path
        if not file_path:
            return np.array([self.zero])
        path = Path(file_path)
        posix_lower_path = path.as_posix().lower()
        if not path.is_absolute():
            # apply ./ for normalised path to detect "/src" for relative path
            posix_lower_path = f"./{posix_lower_path}"
        # prevent extra confusion from the same word in extension
        path_without_extension = os.path.splitext(posix_lower_path)[0]
        return self.word_in_(path_without_extension)

    def extract(self, candidate: Candidate) -> Any:
        """Not used: the feature is calculated in __call__"""
        raise NotImplementedError
================================================
FILE: credsweeper/ml_model/features/word_in_postamble.py
================================================
import numpy as np
from credsweeper.common.constants import ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class WordInPostamble(WordIn):
    """Feature marks words from predefined list which are found in a part of line after the value."""

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns array of words matching in up to ML_HUNK symbols of line after value"""
        line_data = candidate.line_data_list[0]
        # the window is limited with the end of the line
        postamble_end = min(len(line_data.line), line_data.value_end + ML_HUNK)
        postamble = line_data.line[line_data.value_end:postamble_end].strip()
        if not postamble:
            return np.array([self.zero])
        return self.word_in_(postamble.lower())
================================================
FILE: credsweeper/ml_model/features/word_in_preamble.py
================================================
import numpy as np
from credsweeper.common.constants import ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class WordInPreamble(WordIn):
    """Feature marks words from predefined list which are found in a part of line before variable or value."""

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns array of words matching in up to ML_HUNK symbols of line before variable or value"""
        line_data = candidate.line_data_list[0]
        # the variable position is preferred when it is known, otherwise the value position is used
        if 0 <= line_data.variable_start:
            anchor = line_data.variable_start
        else:
            anchor = line_data.value_start
        # the window is limited with the begin of the line
        preamble_start = max(0, anchor - ML_HUNK)
        preamble = line_data.line[preamble_start:anchor].strip()
        if not preamble:
            return np.array([self.zero])
        return self.word_in_(preamble.lower())
================================================
FILE: credsweeper/ml_model/features/word_in_transition.py
================================================
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class WordInTransition(WordIn):
    """Feature marks words from predefined list which are found between variable and value."""

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns array of words matching in the part of line between variable and value"""
        line_data = candidate.line_data_list[0]
        transition = ''
        if 0 <= line_data.variable_end < line_data.value_start:
            transition = line_data.line[line_data.variable_end:line_data.value_start].strip()
        if not transition:
            return np.array([self.zero])
        return self.word_in_(transition.lower())
================================================
FILE: credsweeper/ml_model/features/word_in_value.py
================================================
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class WordInValue(WordIn):
    """Feature checks predefined words in the value of a candidate."""

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Checks the words in lowercased value of the first line data; zero result for an empty value"""
        value = candidate.line_data_list[0].value
        if not value:
            return np.array([self.zero])
        return self.word_in_(value.lower())
================================================
FILE: credsweeper/ml_model/features/word_in_variable.py
================================================
import numpy as np
from credsweeper.credentials.candidate import Candidate
from credsweeper.ml_model.features.word_in import WordIn
class WordInVariable(WordIn):
    """Feature checks predefined words in the variable of a candidate."""

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Checks the words in lowercased variable of the first line data; zero result for an empty variable"""
        variable = candidate.line_data_list[0].variable
        if not variable:
            return np.array([self.zero])
        return self.word_in_(variable.lower())
================================================
FILE: credsweeper/ml_model/ml_config.json
================================================
{
"char_set": "\u001b\t\n\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
"thresholds": {
"lowest": 0.22917,
"low": 0.35739,
"medium": 0.62204,
"high": 0.79791,
"highest": 0.92996
},
"features": [
{
"type": "RuleSeverity",
"comment": "INFO=0.0, LOW=0.25, MEDIUM=0.5, HIGH=0.75, CRITICAL=1.0",
"kwargs": {}
},
{
"type": "EntropyEvaluation",
"kwargs": {}
},
{
"type": "LengthOfAttribute",
"kwargs": {
"attribute": "line"
}
},
{
"type": "LengthOfAttribute",
"kwargs": {
"attribute": "variable"
}
},
{
"type": "LengthOfAttribute",
"kwargs": {
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "Bash variable",
"kwargs": {
"pattern": "^\\$([A-Za-z_][0-9A-Za-z_]*|\\{[A-Za-z_][0-9A-Za-z_]*\\})",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "PossibleComment replacing",
"kwargs": {
"pattern": "^\\s*(#|\\*|/\\*|//|--\\s)",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "Example pattern",
"kwargs": {
"pattern": "^<[\\w\\s.-]*>",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "Repeated symbol",
"kwargs": {
"pattern": "(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "SHA marker",
"kwargs": {
"pattern": "(?i:sha)[_-]?(224|256|384|512)",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "ASN1 prefix for PEM keys",
"kwargs": {
"pattern": "\\b(MII|LS0t)",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "camelStyle naming detection",
"kwargs": {
"pattern": "^[a-z][a-z]{1,16}[0-9]*([A-Z]([a-z]{1,16}[0-9]*|[0-9]{1,16})){1,8}$",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "PascalStyle naming detection",
"kwargs": {
"pattern": "^([A-Z]([a-z]{1,16}[0-9]*|[0-9]{1,16})){1,8}$",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "UPPERCASE naming detection",
"kwargs": {
"pattern": "^(_+[0-9]{1,16}|_*[A-Z]{1,16}[0-9]*)(_+([0-9]{1,16}|[A-Z]{1,16}[0-9]*)){1,8}_*$",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "lowercase naming detection",
"kwargs": {
"pattern": "^(_+[0-9]{1,16}|_*[a-z]{1,16}[0-9]*)(_+([0-9]{1,16}|[a-z]{1,16}[0-9]*)){1,8}_*$",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "VariableNotAllowedPatternCheck",
"kwargs": {
"pattern": "(^(@|<|\\{\\{))|([!><+*/^|)](\\s)?$)",
"attribute": "variable"
}
},
{
"type": "SearchInAttribute",
"comment": "VariableNotAllowedNameCheck - hash mentioned",
"kwargs": {
"pattern": "(?i:( h1$|md5|sha[_-]?(224|256|384|512)))",
"attribute": "variable"
}
},
{
"type": "SearchInAttribute",
"comment": "VariableNotAllowedNameCheck - ID detect",
"kwargs": {
"pattern": "(guid|[^a-z](?i:u?id)|([^A-Z](G?U)?I[dD])s?)$",
"attribute": "variable"
}
},
{
"type": "SearchInAttribute",
"comment": "AWS Key ID - true ID",
"kwargs": {
"pattern": "^A[0-9A-Z]{19,20}$",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "UUID pattern",
"kwargs": {
"pattern": "^(?i:[0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12})$",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "VariableNotAllowedNameCheck - key rule",
"kwargs": {
"pattern": "(?i:(?:uniq(?:ue)?|escap(?:e|ing)|resources?|projects?|filters?|pub(?:lic)?)_?keys?)",
"attribute": "variable"
}
},
{
"type": "SearchInAttribute",
"comment": "VariableNotAllowedNameCheck - word at end",
"kwargs": {
"pattern": "(?i:(icon|label|mode|field|format|number|sum|size|len(gth)?|name|type|manager|algorithm|pattern|view|error|date(time)?|time(stamp)?|tag|version|hash|rate|code|fingerprint)s?$)",
"attribute": "variable"
}
},
{
"type": "SearchInAttribute",
"comment": "PWD invocation",
"kwargs": {
"pattern": "(?i:(^\\$pwd$)|(^\\$\\{#?pwd[^}]*\\}$)|(^\\$\\(pwd\\)$)|(^`pwd`$))",
"attribute": "variable"
}
},
{
"type": "WordInVariable",
"kwargs": {
"words": [
" ",
".",
",",
"]",
"#",
"@",
"/",
"\\",
"!!!",
"_at",
"_len",
"256",
"512",
"access",
"assert",
"cache",
"client",
"control",
"crypt",
"crypted",
"decrypt",
"encrypt",
"dummy",
"disable",
"example",
"expect",
"expir",
"fake",
"file",
"filter",
"fingerprint",
"guid",
"hash",
"keyguid",
"keyid",
"key_id",
"label",
"length",
"md5",
"manager",
"mock",
"name",
"native",
"obj",
"opt",
"p/w",
"param",
"pass",
"path",
"project",
"public",
"pw",
"query",
"secret",
"size",
"sha",
"space",
"status",
"sword",
"temp",
"test",
"thumbprint",
"time",
"timestamp",
"title",
"token",
"type",
"uniq",
"valid",
"version",
"view"
]
}
},
{
"type": "WordInValue",
"kwargs": {
"words": [
"%",
" ",
":",
"=",
"$(",
"${",
"{$",
"(",
"->",
".",
"...",
"123",
"141592653",
"718281828",
"<",
">",
"[",
"_id",
"abc",
"aaaa",
"asdf",
"allow",
"arn:aws:",
"bar",
"disable",
"changeme",
"crypt",
"crypted",
"decrypt",
"edited",
"encrypt",
"example",
"expir",
"fake",
"file",
"foo",
"hash",
"hex",
"key",
"min",
"mock",
"my",
"nil",
"oprst",
"other",
"pass",
"public",
"pwd",
"redacted",
"rsa",
"salt",
"secret",
"sha",
"ssh",
"test",
"word",
"xxx",
"xyz"
]
}
},
{
"type": "WordInPreamble",
"kwargs": {
"words": [
"$",
"%2",
"%3",
"&",
"&",
"(",
"->",
".",
"://",
"?",
"@",
"[",
"approval",
"arn:aws:",
"assert",
"case",
"circle",
"color",
"e.g.",
"equal",
"example",
"expect",
"fake",
"false",
"height",
"image",
"line",
"media",
"nil",
"none",
"null",
"pass",
"path",
"pwd",
"sqa",
"test",
"true",
"undefined",
"unit",
"where",
"width",
"word"
]
}
},
{
"type": "WordInTransition",
"kwargs": {
"words": [
"%2",
"%3",
"&",
"(",
"->",
"=>",
"'",
"\"",
".",
",",
"?",
"@",
"[",
"{",
"basic",
"bearer",
"get",
"e.g.",
"equal",
"env",
"example",
"expect",
"line",
"media",
"pass",
"password",
"path",
"test",
"unit"
]
}
},
{
"type": "WordInPostamble",
"kwargs": {
"words": [
"$",
"%2",
"%3",
"&",
"&",
"(",
"->",
"'",
"\"",
".",
"://",
"?",
"@",
"[",
"]",
"}",
"\\",
"assert",
"case",
"circle",
"color",
"e.g.",
"equal",
"example",
"expect",
"fake",
"false",
"height",
"image",
"line",
"media",
"nil",
"none",
"null",
"pass",
"path",
"pwd",
"sqa",
"test",
"true",
"undefined",
"unit",
"width",
"word"
]
}
},
{
"type": "WordInPath",
"kwargs": {
"words": [
"test",
"mock",
"/src",
"code",
"/include",
"internal",
"tool",
"util",
"example",
"sample",
"conf",
"secret",
"setting",
"security",
"secure",
"resource",
"fixture",
"docker",
"/docs",
"/doc/",
"document",
"/lang",
"/local/",
"/locale",
"/lib",
"/spec",
"/pkg",
"/api",
"/rest",
"/opt",
"/sys",
"kube",
"kafka",
"cluster",
"template",
"other",
"public",
"init",
"client",
"server",
"/model",
"/modul",
"browser",
"/env/",
"/app",
"/assets/",
"vendor",
"readme",
"build",
"/dist-packages",
"/record",
"/script",
"/site-packages",
"python",
"/usr",
"/etc",
"/fuzz"
]
}
},
{
"type": "MorphemeDense"
},
{
"type": "HasHtmlTag"
},
{
"type": "IsSecretNumeric"
},
{
"type": "FileExtension",
"kwargs": {
"extensions": [
"",
".04",
".1",
".adoc",
".asciidoc",
".axaml",
".bash",
".bat",
".bats",
".bazel",
".bin",
".build",
".bundle",
".bzl",
".c",
".cast",
".cc",
".cf",
".cjs",
".cljc",
".cmd",
".cnf",
".coffee",
".conf",
".config",
".cpp",
".crt",
".cs",
".csp",
".csv",
".dart",
".dist",
".dockerfile",
".edited",
".eex",
".env",
".erb",
".erl",
".ex",
".example",
".exs",
".ext",
".fsproj",
".g4",
".gml",
".go",
".golden",
".gradle",
".graphql",
".groovy",
".gtpl",
".h",
".haml",
".har",
".hpp",
".hs",
".html",
".idl",
".iml",
".in",
".inc",
".ini",
".ipynb",
".j",
".j2",
".java",
".jenkinsfile",
".js",
".json",
".jsp",
".jsx",
".ks",
".kt",
".kts",
".las",
".ldif",
".ldml",
".less",
".libsonnet",
".lkml",
".lock",
".log",
".lua",
".m",
".manifest",
".markdown",
".markerb",
".md",
".mdx",
".mjs",
".mk",
".ml",
".mlir",
".mod",
".moo",
".ndjson",
".nolint",
".odd",
".onnx",
".oracle",
".original",
".pan",
".patch",
".php",
".pl",
".pm",
".po",
".pod",
".postinst",
".pp",
".ppk",
".proj",
".properties",
".proto",
".ps1",
".purs",
".pxd",
".py",
".pyi",
".pyx",
".r",
".rake",
".rb",
".re",
".response",
".resx",
".rexx",
".rrc",
".rs",
".rsa",
".rsp",
".rst",
".rules",
".sample",
".sbt",
".scala",
".secrets",
".sh",
".snap",
".sql",
".storyboard",
".strings",
".sty",
".swift",
".t",
".td",
".tdf",
".template",
".test",
".testsettings",
".tf",
".tfstate",
".tfvars",
".tl",
".tmpl",
".token",
".toml",
".travis",
".ts",
".tsx",
".txt",
".var",
".vsmdi",
".vue",
".xaml",
".xib",
".xml",
".yaml",
".yml",
".zsh"
]
}
},
{
"type": "RuleName",
"kwargs": {
"rule_names": [
"API",
"Auth",
"CMD ConvertTo-SecureString",
"CMD Password",
"CMD Secret",
"CMD Token",
"CURL User Password",
"Credential",
"Key",
"Nonce",
"Password",
"SQL Password",
"Salt",
"Secret",
"Token",
"URL Credentials"
]
}
}
]
}
================================================
FILE: credsweeper/ml_model/ml_model.onnx
================================================
[File too large to display: 11.0 MB]
================================================
FILE: credsweeper/ml_model/ml_validator.py
================================================
import hashlib
import json
import logging
from pathlib import Path
from typing import List, Tuple, Union, Optional, Dict
import numpy as np
from onnxruntime import InferenceSession
from credsweeper.common.constants import ThresholdPreset, ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.credentials.candidate_key import CandidateKey
from credsweeper.ml_model import features
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class MlValidator:
    """ML validation class

    Reads the ML config (char set, thresholds, feature definitions) and the model data,
    converts groups of candidates to the model inputs and compares predicted probability with the threshold.
    """

    MAX_LEN = 2 * ML_HUNK  # for whole line limit
    # used for initial fill
    ZERO_CHAR = '\x00'
    # applied for unknown characters
    FAKE_CHAR = '\x01'

    _dir_path = Path(__file__).parent

    def __init__(
            self,  #
            threshold: Union[float, ThresholdPreset],  #
            ml_config: Union[None, str, Path] = None,  #
            ml_model: Union[None, str, Path] = None,  #
            ml_providers: Optional[str] = None) -> None:
        """Init

        Args:
            threshold: decision threshold - a float value or a preset resolved with "thresholds" of ml config
            ml_config: path to ml config, "ml_config.json" near the module is used by default
            ml_model: path to ml model, "ml_model.onnx" near the module is used by default
            ml_providers: comma separated list of providers https://onnxruntime.ai/docs/execution-providers/

        Raises:
            ValueError: reserved symbols are found in "char_set" or a feature type is unknown
            TypeError: a feature cannot be created with given "kwargs"

        """
        self.__session: Optional[InferenceSession] = None

        ml_config_path = Path(ml_config) if ml_config else MlValidator._dir_path / "ml_config.json"
        ml_config_data = ml_config_path.read_bytes()
        model_config = json.loads(ml_config_data)

        ml_model_path = Path(ml_model) if ml_model else MlValidator._dir_path / "ml_model.onnx"
        # the raw model is kept to create the inference session lazily
        self.__ml_model_data = ml_model_path.read_bytes()

        if ml_providers:
            self.providers = ml_providers.split(',')
        else:
            self.providers = ["CPUExecutionProvider"]

        if isinstance(threshold, float):
            self.threshold = threshold
        elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
            self.threshold = model_config["thresholds"][threshold.value]
        else:
            self.threshold = 0.5
            logger.warning("Use fallback threshold value: %s", self.threshold)

        char_set = set(model_config["char_set"])
        if len(char_set) != len(model_config["char_set"]):
            logger.warning('Duplicated symbols in "char_set"?')
        if self.ZERO_CHAR in char_set or self.FAKE_CHAR in char_set:
            raise ValueError(f'Unacceptable symbols 0x00 or 0x01 in "char_set"={char_set}')
        # indexes 0 and 1 are reserved, the sorted symbols of config are enumerated after them
        self.char_dict = {self.ZERO_CHAR: 0, self.FAKE_CHAR: 1}
        self.char_dict.update({
            char: index
            for index, char in enumerate(sorted(char_set), start=len(self.char_dict))
        })
        self.num_classes = len(self.char_dict)

        self.common_feature_list = []
        self.unique_feature_list = []
        if logger.isEnabledFor(logging.INFO):
            config_md5 = hashlib.md5(ml_config_data).hexdigest()
            model_md5 = hashlib.md5(self.__ml_model_data).hexdigest()
            # each path and checksum goes to the placeholder with its own label
            logger.info("Init ML validator with providers: '%s' ; model:'%s' md5:%s ; config:'%s' md5:%s",
                        self.providers, ml_model_path, model_md5, ml_config_path, config_md5)
        logger.debug("%s", model_config)

        for feature_definition in model_config["features"]:
            feature_class = feature_definition["type"]
            kwargs = feature_definition.get("kwargs", {})
            feature_constructor = getattr(features, feature_class, None)
            if feature_constructor is None:
                raise ValueError(f"Error while parsing model details. Cannot create feature '{feature_class}'"
                                 f" from {feature_definition}")
            try:
                feature = feature_constructor(**kwargs)
            except TypeError:
                logger.error("Error while parsing model details. Cannot create feature '%s' from %s", feature_class,
                             feature_definition)
                raise
            # RuleName may differ between candidates of a group, all other features are common
            if feature_class == "RuleName":
                self.unique_feature_list.append(feature)
            else:
                self.common_feature_list.append(feature)

    def __reduce__(self):
        """Drops the session before default reducing - the session is created again on demand"""
        # TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
        self.__session = None
        return super().__reduce__()

    @property
    def session(self) -> InferenceSession:
        """session getter to prevent pickle error

        Raises:
            RuntimeError: the session was not created
        """
        if not self.__session:
            self.__session = InferenceSession(self.__ml_model_data, providers=self.providers)
        if not self.__session:
            raise RuntimeError("InferenceSession was not initialized!")
        return self.__session

    def encode(self, text: str, limit: int) -> np.ndarray:
        """Encodes prepared text to array

        Args:
            text: text to encode, symbols after `limit` are ignored
            limit: number of rows in the result

        Return:
            float32 array of shape (limit, num_classes) with 1.0 at the index of each symbol,
            the rows after the end of text stay zero; unknown symbols are encoded as FAKE_CHAR
        """
        result_array: np.ndarray = np.zeros(shape=(limit, self.num_classes), dtype=np.float32)
        if text is None:
            return result_array
        fake_index = self.char_dict[MlValidator.FAKE_CHAR]
        for i, c in enumerate(text[:limit]):
            result_array[i, self.char_dict.get(c, fake_index)] = 1.0
        return result_array

    def encode_line(self, text: str, position: int) -> np.ndarray:
        """Encodes line with balancing for position

        Args:
            text: whole line
            position: position in the line (the start of value is passed by get_group_features)

        Return:
            encoded stripped line limited with MAX_LEN symbols
        """
        # the position is shifted because leading whitespaces are removed
        offset = len(text) - len(text.lstrip())
        pos = position - offset
        stripped = text.strip()
        if MlValidator.MAX_LEN < len(stripped):
            stripped = Util.subtext(stripped, pos, ML_HUNK)
        return self.encode(stripped, MlValidator.MAX_LEN)

    def encode_value(self, text: str) -> np.ndarray:
        """Encodes first ML_HUNK symbols of stripped text (applied for variable and value)"""
        stripped = text.strip()
        return self.encode(stripped[:ML_HUNK], ML_HUNK)

    def _call_model(self, line_input: np.ndarray, variable_input: np.ndarray, value_input: np.ndarray,
                    feature_input: np.ndarray) -> np.ndarray:
        """Runs the model with inputs casted to float32 and returns the first output

        Raises:
            RuntimeError: the model returns nothing or the first output is not a numpy array
        """
        input_feed: Dict[str, np.ndarray] = {
            "line_input": line_input.astype(np.float32),
            "variable_input": variable_input.astype(np.float32),
            "value_input": value_input.astype(np.float32),
            "feature_input": feature_input.astype(np.float32),
        }
        result = self.session.run(output_names=None, input_feed=input_feed)
        if not result:
            # indexing of an empty result would raise IndexError instead of the expected RuntimeError
            raise RuntimeError(f"Unexpected empty result {result!r}")
        if isinstance(result[0], np.ndarray):
            return result[0]
        raise RuntimeError(f"Unexpected type {type(result[0])}")

    def extract_common_features(self, candidates: List[Candidate]) -> np.ndarray:
        """Extract features that are guaranteed to be the same for all candidates on the same line with same value."""
        feature_array: np.ndarray = np.array([], dtype=np.float32)
        # Extract features from credential candidate
        default_candidate = candidates[0]
        for feature in self.common_feature_list:
            new_feature = feature([default_candidate])[0]
            if not isinstance(new_feature, np.ndarray):
                new_feature = np.array([new_feature])
            feature_array = np.append(feature_array, new_feature)
        return feature_array

    def _extract_candidate_unique_features(self, candidate: Candidate) -> np.ndarray:
        """Concatenates all unique features of single candidate in order of unique_feature_list"""
        feature_array: np.ndarray = np.array([], dtype=np.int8)
        for feature in self.unique_feature_list:
            new_feature = feature([candidate])[0]
            if not isinstance(new_feature, np.ndarray):
                new_feature = np.array([new_feature])
            feature_array = np.append(feature_array, new_feature)
        return feature_array

    def extract_unique_features(self, candidates: List[Candidate]) -> np.ndarray:
        """Extract features that can be different between candidates. Join them with or operator."""
        feature_array = self._extract_candidate_unique_features(candidates[0])
        for candidate in candidates[1:]:
            # the arrays are built in the same way, so the shapes match for any number of unique features
            feature_array = feature_array | self._extract_candidate_unique_features(candidate)
        return feature_array

    def get_group_features(self, candidates: List[Candidate]) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Prepares model inputs for a group of candidates

        `np.newaxis` used to add new dimension in front, so input will be treated as a batch

        Return:
            encoded line, encoded variable, encoded value and features of the group
        """
        # all candidates are from the same line
        default_candidate = candidates[0]
        line_input = self.encode_line(default_candidate.line_data_list[0].line,
                                      default_candidate.line_data_list[0].value_start)[np.newaxis]
        # first not empty variable and first not empty value of the group are used
        variable = ''
        value = ''
        for candidate in candidates:
            if not variable and candidate.line_data_list[0].variable:
                variable = candidate.line_data_list[0].variable
            if not value and candidate.line_data_list[0].value:
                value = candidate.line_data_list[0].value
            if variable and value:
                break
        variable_input = self.encode_value(variable)[np.newaxis]
        value_input = self.encode_value(value)[np.newaxis]
        feature_array = self.extract_features(candidates)
        return line_input, variable_input, value_input, feature_array

    def extract_features(self, candidates: List[Candidate]) -> np.ndarray:
        """extracts common and unique features from list of candidates"""
        common_features = self.extract_common_features(candidates)
        unique_features = self.extract_unique_features(candidates)
        feature_hstack = np.hstack([common_features, unique_features])
        # wrapped to get a single row of a batch
        feature_array = np.array([feature_hstack])
        return feature_array

    def _batch_call_model(self, line_input_list, variable_input_list, value_input_list, features_list) -> np.ndarray:
        """auxiliary method to invoke twice

        Stacks accumulated inputs to batches, calls the model and returns the first column of the result
        """
        line_inputs_vstack = np.vstack(line_input_list)
        variable_inputs_vstack = np.vstack(variable_input_list)
        value_inputs_vstack = np.vstack(value_input_list)
        feature_array_vstack = np.vstack(features_list)
        result_call = self._call_model(line_inputs_vstack, variable_inputs_vstack, value_inputs_vstack,
                                       feature_array_vstack)
        result = result_call[:, 0]
        return result

    def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]],
                        batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
        """Use ml model on list of candidate groups.

        Args:
            group_list: List of tuples (key, group)
            batch_size: ML model batch, must not be zero because it is used as a divisor

        Return:
            Boolean numpy array with decision based on the threshold,
            and numpy array with probability predicted by the model

        """
        line_input_list = []
        variable_input_list = []
        value_input_list = []
        features_list = []
        probability: np.ndarray = np.zeros(len(group_list), dtype=np.float32)
        head = tail = 0
        for _group_key, candidates in group_list:
            line_input, variable_input, value_input, feature_array = self.get_group_features(candidates)
            line_input_list.append(line_input)
            variable_input_list.append(variable_input)
            value_input_list.append(value_input)
            features_list.append(feature_array)
            tail += 1
            if 0 == tail % batch_size:
                # use the approach to reduce memory consumption for huge candidates list
                probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
                                                                features_list)
                head = tail
                line_input_list.clear()
                variable_input_list.clear()
                value_input_list.clear()
                features_list.clear()
        if head != tail:
            # the rest of groups which do not fill a whole batch
            probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
                                                            features_list)
        is_cred = self.threshold <= probability
        if logger.isEnabledFor(logging.DEBUG):
            for i, decision in enumerate(is_cred):
                logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],
                             group_list[i][0])
        # apply cast to float to avoid json export issue
        return is_cred, probability.astype(float)
================================================
FILE: credsweeper/py.typed
================================================
================================================
FILE: credsweeper/rules/__init__.py
================================================
================================================
FILE: credsweeper/rules/config.yaml
================================================
- name: DOC_GET
severity: medium
confidence: moderate
type: pattern
values:
- (?P(\w*(?i:비밀번호|비번|패스워드|키|암호화?|토큰|(?(\\*([\"']|&(quot|apos|#3[49]);)){1,4})(?P(.(?!(?P=lq))){4,8000}.?)
filter_type:
- ValueAllowlistCheck
- ValueBlocklistCheck
- LineGitBinaryCheck
- LineUUEPartCheck
- ValueFilePathCheck
- ValuePatternCheck(5)
min_line_len: 8
required_substrings:
- pass
- pw
- token
- secret
- key
- cred
- 비밀번호
- 비번
- 패스워드
- 암호
- 키
- 토큰
target:
- doc
use_ml: true
- name: DOC_CREDENTIALS
severity: medium
confidence: moderate
type: pattern
values:
- (?P[\"'`(])?\s*(?P(\w*(?i:(?설정은|:=|:(?!:)|=(>|>|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=|%3[Dd])\s*)(?P[\"'`]{1,6})?(?P(?(quote)(?(wrap)[^\"'`)]{4,8000}|[^\"'`]{4,8000})|(?(wrap)[^\"'`)]{4,8000}|\S{4,8000})))
filter_type:
- ValueAllowlistCheck
- ValueBlocklistCheck
- LineGitBinaryCheck
- LineUUEPartCheck
- ValueFilePathCheck
- ValuePatternCheck(5)
- ValueSealedSecretCheck
min_line_len: 8
required_substrings:
- pass
- sword
- pw
- p/w
- paasw
- 비밀번호
- 비번
- 패스워드
- 암호
- token
- secret
- key
- credential
- 키
- 토큰
target:
- doc
use_ml: true
- name: SECRET_PAIR
severity: medium
confidence: moderate
type: pattern
values:
- (?P[\"'`]?(?i:token|secret|key|키|암호화?|토큰)[\"'`]?)((\s)*(?P설정은|:=|:(?!:)|=(>|>|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=|%3[Dd])(\s)*)(?P[\"'`(])?(?P(?-i:(?P[A-Z])|(?P[a-z])|(?P[0-9/_+=~!@#$%^&*;:?-])){8,80}(?(a)(?(b)(?(c)((?(quote)[^)\"'`]{1,8000}|([0-9A-Za-z/_+=~!@#$%^&*;:?-]{1,8000}|\b))|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)[)\"'`])
filter_type:
- ValueAllowlistCheck
- ValuePatternCheck(4)
- ValueEntropyBase64Check
- ValueMorphemesCheck
- ValueSealedSecretCheck
min_line_len: 16
required_substrings:
- token
- secret
- key
- 키
- 암호
- 토큰
target:
- doc
use_ml: true
- name: PASSWD_PAIR
severity: medium
confidence: moderate
type: pattern
values:
- (?P[\"'`]?(?i:(?설정은|:=|:(?!:)|=(>|>|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=|%3[Dd])(\s)*)(?P[\"'`(])?(?P(?-i:(?P[A-Z])|(?P[a-z])|(?P[0-9/_+=~!@#$%^&*;:?-])){8,64}(?(a)(?(b)(?(c)((?(quote)[^)\"'`]{1,8000}|([0-9A-Za-z/_+=~!@#$%^&*;:?-]{1,8000}|\b))|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)[)\"'`])
filter_type:
- ValueAllowlistCheck
- ValuePatternCheck(4)
- ValueDictionaryKeywordCheck
- LineGitBinaryCheck
- LineUUEPartCheck
- ValueFilePathCheck
- ValueHexNumberCheck
- ValueSealedSecretCheck
min_line_len: 10
required_substrings:
- pass
- sword
- pw
- p/w
- paasw
- 비밀번호
- 비번
- 패스워드
- 암호
target:
- doc
use_ml: true
- name: IP_ID_PASSWORD_TRIPLE
severity: medium
confidence: moderate
type: pattern
values:
- (^|\s|(?P(?i:\bip[\s/]{1,80}id[\s/]{1,80}pw[\s/:]{0,80}))|(?P://))(?P(?(?(url)(?-i:(?P[A-Z])|(?P[a-z])|(?P[0-9_+=~!@#$%^&*;?-])){7,64}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x)|(?-i:(?P[A-Z])|(?P[a-z])|(?P[0-9/_+=~!@#$%^&*;?-])){7,64}(?(e)(?(f)(?(g)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x)))(?:\s|[^/]|$)
filter_type:
- ValueAllowlistCheck
- ValuePatternCheck(4)
- ValueDictionaryKeywordCheck
min_line_len: 10
required_substrings:
- "."
target:
- doc
use_ml: true
- name: ID_PAIR_PASSWD_PAIR
severity: medium
confidence: moderate
type: pattern
values:
- (?P--)?(?P\w*(?i:pa[as]swords?|passwd?|pwd|\bp/w|\bpw|비밀번호|비번|패스워드|암호))\s*?(?(ddash)[ =]|[:=/>-]{1,2})\s*(?P[\"'`]{1,8})?(?P(?-i:(?P[A-Z])|(?P[a-z])|(?P[0-9/_+=~!@#$%^&*;:?-])){4,64}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)(?P=quote)|(\s|$))
- (?P--)?(?P(?i:user\s*)?(?i:id|login|account|root|admin|user|name|wifi|role|host|default|계정|아이디))\s*?(?(ddash)[ =]|[ :=])\s*?(?P\S+)
filter_type:
- ValueAllowlistCheck
- ValuePatternCheck(4)
min_line_len: 10
required_substrings:
- pass
- sword
- p/w
- pw
- 비밀번호
- 비번
- 패스워드
- 암호
target:
- doc
use_ml: true
- name: ID_PASSWD_PAIR
severity: medium
confidence: moderate
type: pattern
values:
- (?P[\w.-]{0,80}(?i:(?P\bid\b)|id\b|user|name|계정|아이디)[\w.-]{0,80}(?(id)[ :(/]{1,80}|[:(/]{1,80})(?i:pa[as]swo?r?ds?|pwd?|비밀번호|비번|패스워드|암호))\)?(\s*->\s*|[ =:)(/]{1,80}|\s+is\s+|\s+are\s+|\s*는\s*|\s*은\s*|\s*설정은\s*)\(?(?P[\w.-]{2,64})[ :\(/\"',]{1,80}(?P(?-i:(?P[A-Z])|(?P[a-z])|(?P[0-9/_+=~!@#$%^&*;:?-])){4,64}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x))
filter_type:
- ValueAllowlistCheck
- ValuePatternCheck(4)
- ValueDictionaryKeywordCheck
min_line_len: 10
required_substrings:
- pw
- pass
- sword
- 비밀번호
- 비번
- 패스워드
- 암호
target:
- doc
use_ml: true
- name: UUID
severity: info
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[0-9A-F]{8}(-[0-9A-F]{4}){3}-[0-9A-F]{12}|[0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12})(?![0-9A-Za-z_+-])
min_line_len: 36
required_substrings:
- "-"
filter_type:
- ValuePatternCheck(4)
use_ml: false
target:
- code
- doc
- name: Akamai Credentials
severity: high
confidence: strong
type: pattern
values:
- (?Pakab-[0-9a-z]{16}-[0-9a-z]{16})(?!\.[0-9a-z-]{1,80}\.akamaiapis\.net)
filter_type: GeneralPattern
required_substrings:
- akab-
min_line_len: 38
target:
- code
- doc
- name: Amazon Bedrock API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P(ABSK|bedrock-api-key-)[0-9A-Za-z/+]{28,800})(?![0-9A-Za-z/+])
filter_type: GeneralPattern
required_substrings:
- ABSK
- bedrock-api-key-
min_line_len: 44
target:
- code
- doc
- name: AWS Client ID
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P(A3T[0-9A-Z]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![0-9A-Za-z_+-])
filter_type: GeneralPattern
required_substrings:
- A3T
- ABIA
- ACCA
- AGPA
- AIDA
- AIPA
- AKIA
- ANPA
- ANVA
- AROA
- APKA
- ASCA
- ASIA
min_line_len: 20
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: AWS Multi
severity: high
confidence: moderate
type: multi
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?PA(KIA|SIA)[0-9A-Z]{16})(?![0-9A-Za-z_])
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P((?P[A-Z])|(?P[a-z])|(?P[0-9/+])){40,44}(?(a)(?(b)(?(c)\b|(?!x)x)|(?!x)x)|(?!x)x))(?![0-9A-Za-z/+])
filter_type:
- LineSpecificKeyCheck
- ValuePatternCheck
- ValueBase64PartCheck
- ValueMorphemesCheck
required_substrings:
- AKIA
- ASIA
min_line_len: 20
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: AWS MWS Key
severity: high
confidence: strong
type: pattern
values:
- (?Pamzn\.mws\.[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})(?![0-9A-Za-z_-])
filter_type: GeneralPattern
required_substrings:
- amzn.mws.
min_line_len: 30
target:
- code
- doc
- name: Dynatrace API Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pdt0[A-Za-z]{1}[0-9]{2}\.[0-9A-Z]{24}\.[0-9A-Z]{64})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- dt0
min_line_len: 90
target:
- code
- doc
- name: Facebook Access Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?PEAA[0-9A-Za-z]{80,800})
filter_type:
- ValuePatternCheck
- ValueBase64PartCheck
- ValueNotPartEncodedCheck
required_substrings:
- EAA
min_line_len: 80
target:
- code
- doc
- name: Facebook App Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[0-9]{12,18}\|[0-9A-Za-z_-]{24,28})(?![0-9A-Za-z_+-])
filter_type: TokenPattern
required_substrings:
- "|"
required_regex: "[0-9A-Za-z_/+-]{15}"
min_line_len: 33
target:
- code
- doc
- name: Google API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?PAIza[0-9A-Za-z_-]{35})
filter_type: TokenPattern
required_substrings:
- AIza
min_line_len: 39
target:
- code
- doc
- name: Google Multi
severity: high
confidence: moderate
type: multi
values:
- (?P[0-9]{3,80}-[0-9a-z_]{32}\.apps\.googleusercontent\.com)
- \b(?PGOCSPX-[0-9A-Za-z_-]{28}|((?P[A-Z])|(?P[a-z])|(?P[0-9_-])){24,80}(?(a)(?(b)(?(c)\b|(?!x)x)|(?!x)x)|(?!x)x))
filter_type: GeneralPattern
required_substrings:
- .apps.googleusercontent.com
min_line_len: 40
target:
- code
- doc
- name: Google OAuth Secret
severity: high
confidence: strong
type: pattern
values:
- (?PGOCSPX-[0-9A-Za-z_-]{28})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- GOCSPX-
min_line_len: 40
target:
- code
- doc
- name: Google OAuth Access Token
severity: high
confidence: moderate
type: pattern
values:
- (?Pya29\.[0-9A-Za-z_-]{22,8000})
filter_type: TokenPattern
required_substrings:
- ya29.
min_line_len: 27
target:
- code
- doc
- name: Google OAuth Refresh Token
severity: medium
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P1//0[0-9A-Za-z_-]{80,8000})
filter_type: TokenPattern
required_substrings:
- 1//0
min_line_len: 84
target:
- code
- doc
- name: Heroku Credentials
severity: high
confidence: strong
type: pattern
values:
- (?PHRKU-([0-9A-Za-z_-]{60}|[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12}))
filter_type: TokenPattern
required_substrings:
- HRKU-
min_line_len: 41
target:
- code
- doc
- name: Instagram Access Token
severity: high
confidence: strong
type: pattern
values:
- (?PIGQVJ[=0-9A-Za-z_-]{100,8000})(?![=0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- IGQVJ
min_line_len: 105
target:
- code
- doc
- name: JSON Web Token
severity: medium
confidence: strong
type: pattern
values:
- (?PeyJ[=0-9A-Za-z_+/-]{15,8000}(\.[=0-9A-Za-z_+/-]{0,8000}){2,16})(?![=0-9A-Za-z_-])
filter_type:
- ValueJsonWebTokenCheck
required_substrings:
- eyJ
min_line_len: 64
target:
- code
- doc
- name: JSON Web Key
severity: medium
confidence: strong
type: pattern
values:
- (?P\b(e(yJ|yAi|woi|wog|w0K)|W(yJ|3si|wp7|wog|w0K|3sK))[0-9A-Za-z_+/-]{60,8000})
filter_type:
- ValueJsonWebKeyCheck
required_substrings:
- eyJ
- eyAi
- ewoi
- ewog
- ew0K
- WyJ
- W3si
- Wwp7
- Wwog
- Ww0K
- W3sK
min_line_len: 64
target:
- code
- doc
- name: JWK
severity: medium
confidence: moderate
type: multi
values:
- (?P['"]?\b(?Pkty)[^0-9A-Za-z_-]{1,8}(RSA|EC|oct)\b['"]?)
- (?P\b[dk])[^0-9A-Za-z_-]{1,8}(?P[0-9A-Za-z_-]{22,8000})(?![=0-9A-Za-z_-])
filter_type:
- ValuePatternCheck
- ValueMorphemesCheck
required_substrings:
- kty
min_line_len: 8
target:
- code
- doc
- name: MailChimp API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[0-9A-Za-z_-]{32}-us[0-9]{1,2})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- -us
min_line_len: 35
target:
- code
- doc
- name: MailGun API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pkey-[0-9a-z]{32}|[0-9a-f]{32}-[0-9a-f]{8}-[0-9a-f]{8})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_regex: "[0-9A-Za-z_/+-]{15}"
min_line_len: 36
target:
- code
- doc
- name: PayPal Braintree Access Token
severity: high
confidence: strong
type: pattern
values:
- (?Paccess_token\$production\$[0-9a-z]{16}\$[0-9a-z]{32})(?![0-9A-Za-z_-])
filter_type: GeneralPattern
required_substrings:
- access_token$production$
min_line_len: 72
target:
- code
- doc
- name: PEM Private Key
severity: high
confidence: strong
type: pem_key
values:
- (?P-----BEGIN(?![^-]*ENCRYPTED)[^-]*PRIVATE[^-]*KEY[^-]*-----)
min_line_len: 27
target:
- code
- doc
- name: BASE64 encoded PEM Private Key
severity: high
confidence: strong
type: pattern
values:
- (?P[0-9A-Za-z_/+-]{0,8000}LS0t(LS1CRUdJTiB|LUJFR0lOI|QkVHSU4g)[0-9A-Za-z_/+-]{0,11}(UFJJVkFURSBLRVkt|QUklWQVRFIEtFWS0t|FBSSVZBVEUgS0VZ)[0-9A-Za-z_/+-]{1,8000}LS0t[0-9A-Za-z_/+-]{1,8000})
filter_type:
- ValueBase64EncodedPem
min_line_len: 300
required_substrings:
- UFJJVkFURSBLRVkt
- QUklWQVRFIEtFWS0t
- FBSSVZBVEUgS0VZ
target:
- code
- doc
- name: BASE64 Private Key
severity: high
confidence: strong
type: pattern
values:
- (?PMII[A-Za-f][0-9A-Za-z/+]{8}(?s:[^!#$&()*\-.:;<=>?@\[\]^_{|}~]{8,8000}))
filter_type:
- ValueBase64KeyCheck
min_line_len: 160
required_substrings:
- MII
target:
- code
- doc
- name: Picatic API Key
severity: high
confidence: strong
type: pattern
values:
- (?Psk_live_[0-9a-z]{32})(?![0-9A-Za-z_-])
filter_type: GeneralPattern
required_substrings:
- sk_live_
min_line_len: 40
target:
- code
- doc
- name: SendGrid API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?PSG\.[0-9A-Za-z_-]{16,32}\.[0-9A-Za-z_-]{16,64})
filter_type: TokenPattern
required_substrings:
- SG.
min_line_len: 34
target:
- code
- doc
- name: Shopify Token
severity: high
confidence: strong
type: pattern
values:
- (?Pshp(at|ca|pa|ss|tka)_[0-9A-Fa-f]{32})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- shp
min_line_len: 38
target:
- code
- doc
- name: Slack Token
severity: high
confidence: strong
type: pattern
values:
- (?P(xapp|xox[a-z])\-[0-9A-Za-z-]{10,250})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- xox
- xapp
min_line_len: 15
target:
- code
- doc
- name: Slack Webhook
severity: medium
confidence: strong
type: pattern
values:
- (?Phooks\.slack\.com/services)(?P/T[0-9A-Z]{8,16}/B[0-9A-Z]{8,16}/[0-9A-Za-z_]{24})
filter_type: GeneralPattern
required_substrings:
- hooks.slack.com/services/T
min_line_len: 61
target:
- code
- doc
- name: Stripe Credentials
severity: high
confidence: strong
type: pattern
values:
- (?P(whsec|[prs]k_(test|live))_[0-9A-Za-z]{24,160})
filter_type: GeneralPattern
required_substrings:
- k_live_
- k_test_
- whsec_
min_line_len: 32
target:
- code
- doc
- name: Square Access Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?PEAAA[0-9A-Za-z_-]{60})(?![0-9A-Za-z_-])
filter_type:
- ValuePatternCheck
- ValueBase64PartCheck
required_substrings:
- EAAA
min_line_len: 64
target:
- code
- doc
- name: Square Credentials
severity: medium
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Psq0[a-z]{3}-[0-9A-Za-z_-]{22}([0-9A-Za-z_-]{21})?)(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- sq0
min_line_len: 29
target:
- code
- doc
- name: Twilio Credentials
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P(AC|AD|AL|CA|CF|CL|CN|CR|FW|IP|KS|MM|NO|PK|PN|QU|RE|SC|SD|SK|SM|TR|UT|XE|XR)[0-9A-Fa-f]{32})(?![0-9A-Za-z_+-])
filter_type: TokenPattern
required_substrings:
- AC
- AD
- AL
- CA
- CF
- CL
- CN
- CR
- FW
- IP
- KS
- MM
- "NO"
- PK
- PN
- QU
- RE
- SC
- SD
- SK
- SM
- TR
- UT
- XE
- XR
min_line_len: 34
target:
- code
- doc
- name: Telegram Bot API Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[0-9]{8,10}:[0-9A-Za-z_-]{35})(?![0-9A-Za-z_-])
filter_type: TokenPattern
required_substrings:
- :AA
min_line_len: 45
target:
- code
- doc
- name: PyPi API Token
severity: high
confidence: strong
type: pattern
values:
- (?Ppypi-[0-9A-Za-z_-]{150,255})
filter_type: TokenPattern
required_substrings:
- pypi-
min_line_len: 155
target:
- code
- doc
- name: NPM Token
severity: high
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pnpm_[0-9A-Za-z_-]{36,255})
filter_type:
- ValueGitHubCheck
required_substrings:
- npm_
min_line_len: 40
target:
- code
- doc
- name: Github Classic Token
severity: high
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pgh[pousr]_[0-9A-Za-z_-]{36,255})
filter_type:
- ValueGitHubCheck
required_substrings:
- ghp_
- gho_
- ghu_
- ghs_
- ghr_
min_line_len: 40
target:
- code
- doc
- name: Github Fine-granted Token
severity: high
confidence: strong
type: pattern
values:
- (?Pgithub_pat_[0-9A-Za-z_]{80,255})
filter_type: GeneralPattern
required_substrings:
- github_pat_
min_line_len: 90
target:
- code
- doc
- name: Firebase Domain
severity: info
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[a-z0-9.-]{1,80}\.firebaseio\.com|[a-z0-9.-]{1,80}\.firebaseapp\.com)
filter_type: GeneralPattern
required_substrings:
- .firebase
min_line_len: 16
target:
- code
- doc
- name: AWS S3 Bucket
severity: info
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[a-z0-9.-]{3,63}\.s3\.amazonaws\.com|[a-z0-9.-]{3,63}\.s3-website[.-](eu|ap|us|ca|sa|cn))
filter_type: GeneralPattern
required_substrings:
- .s3-website
- .s3.amazonaws.com
min_line_len: 14
target:
- code
- doc
- name: Jfrog Token
severity: high
confidence: strong
type: pattern
values:
- (?P(cmVmdGtuO[0-9A-Za-z_-]{55}|AKCp[0-9A-Za-z_-]{69}))(?![0-9A-Za-z_-])
filter_type:
- ValueJfrogTokenCheck
required_substrings:
- cmVmdGtuO
- AKCp
min_line_len: 64
target:
- code
- doc
- name: Azure Access Token
severity: high
confidence: strong
type: pattern
values:
- (?PeyJ[=0-9A-Za-z_-]{50,500}\.eyJ[=0-9A-Za-z_-]{8,8000}\.[=0-9A-Za-z_-]{18,800})
filter_type:
- ValueAzureTokenCheck
required_substrings:
- eyJ
min_line_len: 148
target:
- code
- doc
- name: Azure Secret Value
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[0-9A-Za-z_~.-]{3}8Q~[0-9A-Za-z_~.-]{34})(?![0-9A-Za-z_-])
filter_type: TokenPattern
min_line_len: 40
required_substrings:
- 8Q~
target:
- code
- doc
- name: Azure Storage Account Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[0-9A-Za-z]{52}JQQJ9[9DH][0-9A-Za-z]{26}([0-9A-Za-z=]{4})?)(?![0-9A-Za-z_/+-])
min_line_len: 80
filter_type:
- ValuePatternCheck(17)
required_substrings:
- JQQJ99
- JQQJ9D
- JQQJ9H
target:
- code
- doc
- name: Bitbucket App Password
severity: high
confidence: strong
type: pattern
values:
- (?PATBB[0-9A-Za-z]{24}[A-F0-9]{8})(?![0-9A-Za-z_])
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 28
required_substrings:
- ATBB
target:
- code
- doc
- name: Bitbucket Repository Access Token
severity: high
confidence: strong
type: pattern
values:
- (?PATCTT3xFfGN0[0-9A-Za-z_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 160
required_substrings:
- ATCTT3xFfGN0
target:
- code
- doc
- name: Bitbucket HTTP Access Token
severity: high
confidence: strong
type: pattern
values:
- (?PBBDC-[MNO][ADQTgjwz][AEIMQUYcgk][012345wxyz][0-9A-Za-z_-]{40})
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 49
required_substrings:
- BBDC-
target:
- code
- doc
- name: Jira / Confluence PAT token
severity: high
confidence: strong
type: pattern
values:
- (?[MNO][ADQTgjwz][AEIMQUYcgk][012345wxyz][0-9A-Za-z_-]{40})(?![0-9A-Za-z_-])
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 44
required_substrings:
- M
- N
- O
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: Atlassian PAT token
severity: high
confidence: strong
type: pattern
values:
- (?PATATT3xFfGF0[0-9A-Za-z_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 160
required_substrings:
- ATATT3xFfGF0
target:
- code
- doc
- name: Digital Ocean Token
severity: high
confidence: strong
type: pattern
values:
- (?Pdo[opr]_v1_[a-f0-9]{64})(?![0-9A-Za-z_-])
filter_type: TokenPattern
min_line_len: 71
required_substrings:
- doo_v1_
- dop_v1_
- dor_v1_
target:
- code
- doc
- name: Dropbox OAuth2 API Access Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Psl\.(u\.)?[0-9A-Za-z_-]{77,177})(?![0-9A-Za-z_-])
filter_type: TokenPattern
min_line_len: 80
required_substrings:
- sl.
target:
- code
- doc
- name: NuGet API key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Poy2[a-z0-9]{43})(?![0-9A-Za-z_-])
filter_type: TokenPattern
min_line_len: 46
required_substrings:
- oy2
target:
- code
- doc
- name: Gitlab Prefix Token
severity: high
confidence: strong
type: pattern
values:
- (?P(_gitlab_session=|GR1348941|gl(agent|soat|ffct|p[at]t|oas|cbt|imt|rtr|[dfrw]t)-)[0-9A-Za-z_-]{20,64}(\.[0-9A-Za-z_-]{2,16}){0,2})(?![0-9A-Za-z_-])
filter_type:
- ValuePatternCheck
min_line_len: 25
required_substrings:
- _gitlab_session=
- GR1348941
- glagent-
- glsoat-
- glffct-
- glpat-
- gloas-
- glptt-
- glcbt-
- glimt-
- gldt-
- glft-
- glrt-
- glrtr-
- glwt-
target:
- code
- doc
- name: Grafana Provisioned API Key
severity: high
confidence: strong
type: pattern
values:
- (?PeyJ[=0-9A-Za-z_-]{64,360})(?![=0-9A-Za-z_-])
filter_type:
- ValueGrafanaCheck
min_line_len: 67
required_substrings:
- eyJ
target:
- code
- doc
- name: Grafana Access Policy Token
severity: high
confidence: strong
type: pattern
values:
- (?Pglc_eyJ[0-9A-Za-z_-]{80,360})(?![0-9A-Za-z_-])
filter_type:
- ValueGrafanaCheck
min_line_len: 87
required_substrings:
- glc_eyJ
target:
- code
- doc
- name: Grafana Service Account Token
severity: high
confidence: strong
type: pattern
values:
- (?Pglsa_[0-9A-Za-z_-]{32}_[0-9A-Fa-f]{8})
min_line_len: 46
filter_type:
- ValueGrafanaServiceCheck
required_substrings:
- glsa_
target:
- code
- doc
- name: Dropbox API secret (long term)
severity: high
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?=[0-9A-Za-z]{64})(?P[0-9A-Za-z]{10,12}[B-Za-z0-9]A{10,12}[B-Za-z0-9][0-9A-Za-z]{40,44})(?![=0-9A-Za-z_/+-])
filter_type: [ ]
min_line_len: 43
required_substrings:
- AAAAAAAAAA
target:
- code
- doc
- name: Dropbox App secret
severity: info
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P[a-z0-9]{15})(?![=0-9A-Za-z_/+-])
filter_type: WeirdBase36Token
min_line_len: 15
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: Hashicorp Vault Token
severity: high
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Phv[brs]\.[0-9A-Za-z_-]{80,160})
filter_type:
- ValuePatternCheck
- ValueEntropyBase64Check
min_line_len: 90
required_substrings:
- hvb.
- hvr.
- hvs.
target:
- code
- doc
- name: Hashicorp Terraform Token
severity: high
confidence: strong
type: pattern
values:
- (?P[0-9A-Za-z_-]{14}\.atlasv1\.[0-9A-Za-z_-]{67})(?![0-9A-Za-z_-])
filter_type:
- ValuePatternCheck
- ValueMorphemesCheck
min_line_len: 90
required_substrings:
- .atlasv1.
target:
- code
- doc
- name: NKEY Seed
severity: high
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?PS[ACNOPUX][A-Z2-7]{40,200})(?![=0-9A-Za-z_+-])
min_line_len: 42
filter_type:
- ValueMorphemesCheck
- ValuePatternCheck
- ValueEntropyBase32Check
- ValueBase32DataCheck
- ValueTokenBase32Check
required_substrings:
- SA
- SC
- SN
- SO
- SP
- SU
- SX
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: OTP / 2FA Secret
severity: info
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P([A-Z2-7]{16}){1,2})(?![=0-9A-Za-z_+-])
filter_type:
- ValueMorphemesCheck
- ValuePatternCheck
- ValueEntropyBase32Check
- ValueBase32DataCheck
- ValueTokenBase32Check
- ValueBase64PartCheck
min_line_len: 16
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: OpenAI Token
severity: high
confidence: strong
type: pattern
values:
- (?Psk-[0-9A-Za-z_-]{16,160}(T3BlbkFJ|9wZW5BS|PcGVuQU)[0-9A-Za-z_-]{16,160})
min_line_len: 51
filter_type:
- ValuePatternCheck
- ValueMorphemesCheck
required_substrings:
- T3BlbkFJ
- 9wZW5BS
- PcGVuQU
target:
- code
- doc
- name: Docker Access Token
severity: high
confidence: strong
type: pattern
values:
- (?Pdckr_[op]at_[0-9A-Za-z_-]{27,32})
min_line_len: 36
filter_type:
- ValuePatternCheck
- ValueMorphemesCheck
required_substrings:
- dckr_pat_
- dckr_oat_
target:
- code
- doc
- name: Docker Swarm Token
severity: high
confidence: strong
type: pattern
values:
- (?PSWMTKN-1-[0-9a-z]{50}-[0-9a-z]{25})
min_line_len: 85
filter_type:
- ValuePatternCheck
- ValueMorphemesCheck
required_substrings:
- SWMTKN-1-
target:
- code
- doc
- name: Docker Swarm Key
severity: high
confidence: strong
type: pattern
values:
- (?PSWMKEY-1-[0-9A-Za-z]{43})
min_line_len: 52
filter_type:
- ValuePatternCheck
- ValueMorphemesCheck
required_substrings:
- SWMKEY-1-
target:
- code
- doc
- name: Groq API Key
severity: high
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pgsk_[0-9A-Za-z_-]{8,40}(WGdyb3FY|hncm9xW|YZ3JvcV)[0-9A-Za-z_-]{8,40})(?![0-9A-Za-z_-])
min_line_len: 56
filter_type:
- ValuePatternCheck
required_substrings:
- WGdyb3FY
- hncm9xW
- YZ3JvcV
target:
- code
- doc
- name: X AI API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pxai-[0-9A-Za-z_-]{80})(?![0-9A-Za-z_-])
min_line_len: 84
filter_type:
- ValuePatternCheck
- ValueEntropyBase64Check
required_substrings:
- xai-
target:
- code
- doc
- name: Notion Integration Token
severity: high
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pntn_[0-9]{9}[0-9A-Za-z_-]{36,255})
filter_type:
- ValuePatternCheck
- ValueEntropyBase64Check
required_substrings:
- ntn_
min_line_len: 50
target:
- code
- doc
- name: Hugging Face User Access Token
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Phf_[0-9A-Za-z_-]{34})(?![0-9A-Za-z_-])
min_line_len: 37
filter_type:
- ValuePatternCheck
- ValueEntropyBase64Check
required_substrings:
- hf_
target:
- code
- doc
- name: Anthropic API Key
severity: high
confidence: strong
type: pattern
values:
- (?Psk-ant-api03-[0-9A-Za-z_-]{64,128})(?![0-9A-Za-z_-])
min_line_len: 77
filter_type:
- ValuePatternCheck
required_substrings:
- sk-ant-api03-
target:
- code
- doc
- name: Perplexity API Key
severity: high
confidence: strong
type: pattern
values:
- (?Ppplx-[0-9A-Za-z_-]{40,64})(?![0-9A-Za-z_-])
min_line_len: 45
filter_type:
- ValuePatternCheck
required_substrings:
- pplx-
target:
- code
- doc
- name: DeepSeek API Key
severity: high
confidence: moderate
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Psk-[0-9a-f]{32,64})(?![0-9A-Za-z_-])
min_line_len: 35
filter_type:
- ValuePatternCheck
required_substrings:
- sk-
target:
- code
- doc
- name: Tavily API Key
severity: high
confidence: strong
type: pattern
values:
- (?Ptvly-[0-9A-Za-z_-]{32,40})(?![0-9A-Za-z_-])
min_line_len: 37
filter_type:
- ValuePatternCheck
required_substrings:
- tvly-
target:
- code
- doc
- name: Figma Personal Access Token
severity: high
confidence: strong
type: pattern
values:
- (?Pfigd_[0-9A-Za-z_-]{40})(?![0-9A-Za-z_-])
min_line_len: 45
filter_type:
- ValuePatternCheck
required_substrings:
- figd_
target:
- code
- doc
- name: 1Password Account Token
severity: high
confidence: strong
type: pattern
values:
- (?Pops_eyJ[0-9A-Za-z_-]{168,8000})
min_line_len: 192
filter_type:
- ValuePatternCheck
required_substrings:
- InNlY3JldEtleSI6
- JzZWNyZXRLZXkiO
- ic2VjcmV0S2V5Ij
target:
- code
- doc
- name: Brevo API Key
severity: high
confidence: strong
type: pattern
values:
- (?Pxkeysib-[0-9a-f]{64}-[0-9A-Za-z_-]{16})
min_line_len: 89
filter_type:
- ValuePatternCheck
required_substrings:
- xkeysib-
target:
- code
- doc
- name: Together AI API Key
severity: high
confidence: strong
type: pattern
values:
- (?Ptgp_v1_[0-9A-Za-z_-]{43})
min_line_len: 50
filter_type:
- ValuePatternCheck
required_substrings:
- tgp_v1_
target:
- code
- doc
- name: LLAMA API Key
severity: high
confidence: strong
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pllx-[0-9A-Za-z_-]{48})
min_line_len: 52
filter_type:
- ValuePatternCheck
required_substrings:
- llx-
target:
- code
- doc
- name: SonarQube Credentials
severity: medium
confidence: moderate
type: pattern
values:
- (?Psq[apu]_[0-9a-f]{40})(?![0-9A-Za-z_-])
min_line_len: 44
filter_type:
- ValuePatternCheck
required_substrings:
- sqa_
- sqp_
- squ_
target:
- code
- doc
- name: Sentry Organization Auth Token
severity: high
confidence: strong
type: pattern
values:
- (?Psntrys_eyJ[0-9A-Za-z_-]{80,8000}=*([0-9A-Za-z_-]{32,256})?)(?![0-9A-Za-z_-])
min_line_len: 37
filter_type:
- ValuePatternCheck
required_substrings:
- sntrys_eyJ
target:
- code
- doc
- name: Sentry User Auth Token
severity: high
confidence: strong
type: pattern
values:
- (?Psntryu_[0-9a-f]{64})(?![0-9A-Za-z_-])
min_line_len: 37
filter_type:
- ValuePatternCheck
required_substrings:
- sntryu_
target:
- code
- doc
- name: Discord Bot Token
severity: high
confidence: strong
type: pattern
values:
- (?P[MNO][ADQTgjwz][AEIMQUYcgk][012345wxyz][0-9A-Za-z_-]{20,24}\.[0-9A-Za-z_-]{6}\.[0-9A-Za-z_-]{30,40})(?![0-9A-Za-z_-])
min_line_len: 62
filter_type:
- ValueDiscordBotCheck
required_substrings:
- M
- N
- O
required_regex: "[0-9A-Za-z_/+-]{15}"
target:
- code
- doc
- name: Discord Webhook
severity: medium
confidence: strong
type: pattern
values:
- (?Pdiscord(?:app)?\.com/api/webhooks)(?P/[0-9]{16,22}/[0-9A-Za-z_-]{40,100})
filter_type:
- ValueMorphemesCheck
required_substrings:
- discordapp.com/api/webhooks
- discord.com/api/webhooks
min_line_len: 61
target:
- code
- doc
- name: Vercel Token
severity: medium
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pvcp_[0-9A-Za-z]{56})(?![0-9A-Za-z_-])
min_line_len: 60
filter_type: TokenPattern
required_substrings:
- vcp_
target:
- code
- doc
- name: Netlify Token
severity: medium
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pnfp_[0-9A-Za-z]{36})(?![0-9A-Za-z_-])
min_line_len: 40
filter_type: TokenPattern
required_substrings:
- nfp_
target:
- code
- doc
- name: PostHog Credentials
severity: medium
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pph[acrsx]_[0-9A-Za-z]{40,60})(?![0-9A-Za-z_-])
min_line_len: 44
filter_type: TokenPattern
required_substrings:
- phx_
- phs_
- phr_
- pha_
- phc_
target:
- code
- doc
- name: RubyGems API Key
severity: medium
confidence: strong
type: pattern
values:
- (?Prubygems_[0-9a-f]{48})
min_line_len: 57
filter_type: TokenPattern
required_substrings:
- rubygems_
target:
- code
- doc
- name: Tencent WeChat API App ID
severity: medium
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?Pwx[0-9a-f]{16})(?![0-9A-Za-z_-])
min_line_len: 18
filter_type: TokenPattern
required_substrings:
- wx
target:
- code
- doc
- name: Salesforce Credentials
severity: medium
confidence: weak
type: pattern
values:
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P(3MVG[0-9A-Za-z_.]{24,200}|00D[0-9A-Za-z]{9,15}(![0-9A-Za-z_.]{24,200})?))(?![0-9A-Za-z_.])
min_line_len: 12
filter_type:
- ValuePatternCheck(9)
- ValueNumberCheck
- ValueBase64PartCheck
required_substrings:
- 00D
- 3MVG
target:
- code
- doc
- name: Postman Credentials
severity: medium
confidence: moderate
type: pattern
values:
- (?P(PMAK-[0-9a-f]{24}-[0-9a-f]{34}|PMAT-[0-9A-Z]{26}))
min_line_len: 29
filter_type:
- ValuePatternCheck
required_substrings:
- PMAK-
- PMAT-
target:
- code
- doc
- name: NTLM Token
severity: medium
confidence: strong
type: pattern
values:
- (?PTlRMTVNTUAADAAAA[=0-9A-Za-z_/+-]{8,8000})(?![0-9A-Za-z_/+-])
filter_type:
- ValueMorphemesCheck(2)
- ValuePatternCheck
min_line_len: 160
required_substrings:
- TlRMTVNTUAADAAAA
target:
- doc
- code
- name: Basic Authorization
severity: medium
confidence: strong
type: pattern
values:
- (?P(?i:basic))(?P\s+)(?P[=0-9A-Za-z_/+-]{8,8000})(?![0-9A-Za-z_/+-])
min_line_len: 18
filter_type:
- ValueBasicAuthCheck
required_substrings:
- basic
target:
- code
- doc
- name: Bearer Authorization
severity: medium
confidence: moderate
type: pattern
values:
- (?P(?i:bearer|ntlm))(?P\s+)(?P[.0-9A-Za-z_/+-]{32,8000}=*)(?![0-9A-Za-z_/+-])
min_line_len: 37
filter_type: GeneralKeyword
required_substrings:
- bearer
- ntlm
target:
- code
- doc
- name: SQL Password
severity: medium
confidence: weak
type: pattern
values:
- (\\[nrt]|\b)(?i:(?P(CREATE|ALTER|SET\s{1,8}PASSWORD|INSERT(\s{1,8}IGNORE)?|UPDATE\s{1,8}[^\s;]{1,80})\s{1,8}(LOGIN|USER|ROLE|FOR|INTO|SET)\s{1,8}((?!IDENTIFIED|PASSWORD)[^\s;]{1,80}\s{1,8}|VALUES\s{0,8}\(){1,8}(IDENTIFIED((\s{1,8}WITH\s{1,8}\S{1,80})?\s{1,8}(BY|AS))|(=|WITH)?\s{0,8}PASSWORD\b(\s{0,8}=)?)))\s{0,8}(?P[(]\s{0,8})?(?P((?P\\{1,8})?([\"'`]|&(quot|apos|#3[49]);)){1,4})?(?P(?(value_leftquote)((?!(?P=value_leftquote))(?(esq)((?!(?P=esq)([\"'`]|&(quot|apos|#3[49]);)).)|((?!(?P=value_leftquote)).)))|(?!&(quot|apos|#3[49]);)(\\{1,8}([ tnr]|[^\s\"'`])|[^\s\"'`,;\\])){3,80})(?(value_leftquote)(?P(?curl)\s.*(-[uU]|--(proxy-)?user)\s\s*(?P(\\*[\"']){1,3})?(?(value_leftquote)[^\"'\\:]|[^\s\"'\\:]){0,64}:(?P(?(value_leftquote)[^\"'\\]|[^\s\"'\\]){4,64})(?(value_leftquote)(?P(\\?[\"']){1,3}))
filter_type: GeneralKeyword
use_ml: true
required_substrings:
- curl
min_line_len: 16
target:
- doc
- code
- name: CMD ConvertTo-SecureString
severity: high
confidence: moderate
type: pattern
values:
- (?PConvertTo-SecureString(\s\s*-(String|AsPlainText|Force))*)\s\s*(?P(\\?[\"']){1,3})?(?P(?(value_leftquote)[^\"'\\]|[^\s\"'\\]){4,800})(?(value_leftquote)(?P(\\?[\"']){1,3}))
filter_type: GeneralKeyword
use_ml: true
required_substrings:
- convertto-securestring
min_line_len: 27
target:
- doc
- code
- name: CMD Password
severity: high
confidence: moderate
type: pattern
values:
- (^|\W|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P-[A-Za-z_-]*(?i:pass(in|out|word|phrase)))(\s|\\?[\"'],)\s*(?!-)(?P(\\?[\"']){1,3})?(pass:)?(?!file:|env:|fd:)(?P(?(value_leftquote)[^\"'\\]|[^\s\"'\\]){4,80})(?(value_leftquote)(?P(\\?[\"']){1,3}))
filter_type: GeneralKeyword
use_ml: true
required_substrings:
- pass
min_line_len: 12
target:
- doc
- code
- name: CMD Token
severity: high
confidence: moderate
type: pattern
values:
- (^|\W|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P-[A-Za-z_-]*(?i:token|oauth2-bearer))(\s|\\?[\"'],)\s*(?!-)(?P(\\?[\"']){1,3})?(?P(?(value_leftquote)[^\"'\\]|[^\s\"'\\]){4,4000})(?(value_leftquote)(?P(\\?[\"']){1,3}))
filter_type: GeneralKeyword
use_ml: true
required_substrings:
- token
- oauth2-bearer
min_line_len: 12
target:
- doc
- code
- name: CMD Secret
severity: high
confidence: moderate
type: pattern
values:
- (^|\W|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P-[A-Za-z_-]*(?i:secret)[A-Za-z_-]*)(\s|\\?[\"'],)\s*(?!-)(?P(\\?[\"']){1,3})?(pass:)?(?!file:|env:|fd:)(?P(?(value_leftquote)[^\"'\\]|[^\s\"'\\]){4,4000})(?(value_leftquote)(?P(\\?[\"']){1,3}))
filter_type: GeneralKeyword
use_ml: true
required_substrings:
- secret
min_line_len: 12
target:
- doc
- code
- name: URL Credentials
severity: high
confidence: moderate
type: pattern
values:
- (?P[\"'])?(?P[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}:/]{0,80}:){1,3}(?P[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P[\"'])?
filter_type: UrlCredentialsGroup
use_ml: true
required_substrings:
- ://
min_line_len: 10
target:
- doc
- code
- name: API
severity: low
confidence: moderate
type: keyword
values:
- api(?!tal)
filter_type: GeneralKeyword
use_ml: true
min_line_len: 11
required_substrings:
- api
target:
- code
- name: Auth
severity: medium
confidence: moderate
type: keyword
values:
- auth(?!ors?(?!i[tz]))
filter_type: GeneralKeyword
use_ml: true
min_line_len: 12
required_substrings:
- auth
target:
- code
- name: Credential
severity: medium
confidence: moderate
type: keyword
values:
- credential
filter_type: GeneralKeyword
use_ml: true
min_line_len: 18
required_substrings:
- credential
target:
- code
- name: Key
severity: high
confidence: moderate
type: keyword
values:
- key(?!word|board|pad|name)
filter_type: GeneralKeyword
use_ml: true
min_line_len: 11
required_substrings:
- key
target:
- code
- name: Nonce
severity: low
confidence: moderate
type: keyword
values:
- (? None:
self.config = config
self._verify_rule_config(rule_dict)
# mandatory fields
self.__rule_name = str(rule_dict[Rule.NAME])
if severity := Severity.get(rule_dict[Rule.SEVERITY]):
self.__severity = severity
else:
self._malformed_rule_error(rule_dict, Rule.SEVERITY)
if confidence := Confidence.get(rule_dict[Rule.CONFIDENCE]):
self.__confidence = confidence
else:
self._malformed_rule_error(rule_dict, Rule.CONFIDENCE)
if rule_type := getattr(RuleType, str(rule_dict[Rule.TYPE]).upper(), None):
self.__rule_type: RuleType = rule_type
else:
self._malformed_rule_error(rule_dict, Rule.TYPE)
self.__patterns = self._init_patterns(rule_dict[Rule.VALUES])
self.__target: List[str] = rule_dict.get(Rule.TARGET, [])
if not self.__target or set(self.__target).difference({"code", "doc"}):
self._malformed_rule_error(rule_dict, Rule.TARGET)
# auxiliary fields
self.__filters = self._init_filters(rule_dict.get(Rule.FILTER_TYPE, []))
self.__use_ml = bool(rule_dict.get(Rule.USE_ML))
self.__required_substrings = set(i.strip().lower() for i in rule_dict.get(Rule.REQUIRED_SUBSTRINGS, []))
self.__has_required_substrings = bool(self.__required_substrings)
required_regex = rule_dict.get(Rule.REQUIRED_REGEX)
if required_regex and not isinstance(required_regex, str):
self._malformed_rule_error(rule_dict, Rule.REQUIRED_REGEX)
self.__required_regex = re.compile(required_regex) if required_regex else None
self.__min_line_len = int(rule_dict.get(Rule.MIN_LINE_LEN, MAX_LINE_LENGTH))
def _malformed_rule_error(self, rule_dict: Dict, field: str):
raise ValueError(f"Malformed rule '{self.__rule_name}'."
f" field '{field}' has invalid value"
f" '{rule_dict.get(field)}'")
@cached_property
def rule_name(self) -> str:
    """Name of the rule.

    Returns:
        the string form of the rule's name field, as stored by the constructor
    """
    return self.__rule_name
@cached_property
def rule_type(self) -> RuleType:
    """Kind of the rule.

    Returns:
        the RuleType member the constructor resolved from the rule's type field
    """
    return self.__rule_type
@cached_property
def severity(self) -> Severity:
    """Severity assigned to the rule.

    Returns:
        the Severity value the constructor obtained from the rule's severity field
    """
    return self.__severity
@cached_property
def confidence(self) -> Confidence:
    """Confidence assigned to the rule.

    Returns:
        the Confidence value the constructor obtained from the rule's confidence field
    """
    return self.__confidence
@cached_property
def filters(self) -> List[Filter]:
    """Filters attached to the rule.

    Returns:
        the list of Filter objects the constructor built from the rule's filter_type field
    """
    return self.__filters
@staticmethod
def _get_arg(arg: str) -> Union[int, float, str]:
"""Transform given string value to int, then float. In worst case - returns str"""
with contextlib.suppress(Exception):
return int(arg)
with contextlib.suppress(Exception):
return float(arg)
return str(arg)
def _init_filters(self, filter_type: Union[None, str, List[str]]) -> List[Filter]:
    """Build the filters described by the rule's ``filter_type`` field.

    Args:
        filter_type: either a string or a list of strings.
            A string names a Group class looked up as an attribute of ``group``;
            the filters of a fresh instance of that group are returned.
            A list names Filter classes looked up as attributes of ``filters``;
            each entry may carry comma separated constructor arguments in
            parentheses, e.g. ``"ValuePatternCheck(9)"``. Arguments are
            converted with ``_get_arg``. Empty parentheses mean "no arguments".

    Returns:
        the filters to apply; an empty list when ``filter_type`` is an empty list

    Raises:
        ValueError: ``filter_type`` is neither a known group name nor a list
            made only of known filter names (this includes ``None`` and
            non-string list entries)
    """
    if isinstance(filter_type, str):
        # when string passed - (Group) of filters is applied
        filter_group = getattr(group, filter_type, None)
        if isinstance(filter_group, type) and issubclass(filter_group, Group):
            return filter_group(self.config).filters  # type: ignore
    elif isinstance(filter_type, list):
        # list type means - list of (Filter)s is applied
        _filters: List[Filter] = []
        for item in filter_type:
            if not isinstance(item, str):
                # report a malformed rule instead of failing with TypeError on `in`
                break
            filter_parameters: List[Union[int, float, str]] = []
            left_pos = item.find('(')
            right_pos = item.find(')')
            if 0 <= left_pos and 0 <= right_pos:
                filter_name = item[:left_pos].strip()
                inner = item[left_pos + 1:right_pos]
                # ''.split(',') is [''], so "Name()" must be special-cased
                # or the filter would receive a spurious '' argument
                if inner.strip():
                    filter_parameters = [self._get_arg(x.strip()) for x in inner.split(',')]
            else:
                filter_name = item.strip()
            _filter = getattr(filters, filter_name, None)
            if not (isinstance(_filter, type) and issubclass(_filter, Filter)):
                break
            _filters.append(_filter(self.config, *filter_parameters))
        else:
            # reached only when every entry resolved to a Filter class
            return _filters
    raise ValueError(f"Malformed rule '{self.__rule_name}'."
                     f" field '{Rule.FILTER_TYPE}' has invalid value"
                     f" '{filter_type}'")
def _init_patterns(self, _values: List[str]) -> List[re.Pattern]:
    """Compile the rule's ``values`` into regular expression patterns.

    For a KEYWORD rule every value is expanded through
    ``KeywordPattern.get_keyword_pattern``. For PATTERN, MULTI and PEM_KEY
    rules every value is compiled as-is with ``re.compile``.

    A MULTI rule must provide exactly two values; any other count is rejected.
    A PEM_KEY rule with more than one value is accepted, but a warning is
    logged because only a single pattern is supported.

    Args:
        _values: regular expressions (or keywords for a KEYWORD rule)

    Returns:
        the compiled patterns, in the order of ``_values``

    Raises:
        ValueError: ``_values`` is empty, a MULTI rule does not have exactly
            two values, or the rule type is none of the supported ones
    """
    rule_type = self.rule_type
    count = len(_values)
    if RuleType.KEYWORD == rule_type and 0 < count:
        return [KeywordPattern.get_keyword_pattern(value) for value in _values]
    if (RuleType.MULTI == rule_type and 2 == count) \
            or (rule_type in (RuleType.PATTERN, RuleType.PEM_KEY) and 0 < count):
        _patterns: List[re.Pattern] = [re.compile(value) for value in _values]
        # NOTE: there is no "extra patterns" warning for MULTI - this branch
        # is entered for MULTI only with exactly two values
        if RuleType.PEM_KEY == rule_type and 1 < count:
            logger.warning("Rule %s has extra patterns. Only single pattern supported.", self.rule_name)
        return _patterns
    raise ValueError(f"Malformed rule config file. Rule '{self.rule_name}' type '{rule_type}' is invalid.")
@cached_property
def patterns(self) -> List[re.Pattern]:
    """Patterns of the rule as stored in the private ``__patterns`` attribute"""
    return self.__patterns
@cached_property
def use_ml(self) -> bool:
    """The ``use_ml`` flag as stored in the private ``__use_ml`` attribute"""
    return self.__use_ml
@staticmethod
def _verify_rule_config(rule_config: Dict) -> None:
    """Validate the field names of a single rule configuration.

    Args:
        rule_config: dictionary loaded from the config file

    Raises:
        ValueError: a field from Rule.mandatory_fields is absent
            or a field which is not in Rule.all_fields is present

    """
    missing_fields = Rule.mandatory_fields.difference(rule_config.keys())
    if missing_fields:
        raise ValueError(f"Malformed rule config file. Contain rule with missing fields: {missing_fields}.")
    # unknown names are checked only when nothing mandatory is missing
    extra_fields = set(rule_config.keys()).difference(Rule.all_fields)
    if extra_fields:
        raise ValueError(f"Malformed rule config file. Extra fields: {extra_fields}.")
@cached_property
def required_substrings(self) -> Set[str]:
    """Required substrings of the rule as stored in the private ``__required_substrings`` attribute"""
    return self.__required_substrings
@cached_property
def has_required_substrings(self) -> bool:
    """Stored flag ``__has_required_substrings``; kept as a separate getter for speedup"""
    return self.__has_required_substrings
@cached_property
def required_regex(self) -> Optional[re.Pattern]:
    """Required regex of the rule as stored in the private ``__required_regex`` attribute, may be None"""
    return self.__required_regex
@cached_property
def min_line_len(self) -> int:
    """Minimal line length of the rule as stored in the private ``__min_line_len`` attribute"""
    return self.__min_line_len
@cached_property
def target(self) -> List[str]:
    """Targets of the rule as stored in the private ``__target`` attribute"""
    return self.__target
================================================
FILE: credsweeper/scanner/__init__.py
================================================
================================================
FILE: credsweeper/scanner/scan_type/__init__.py
================================================
================================================
FILE: credsweeper/scanner/scan_type/multi_pattern.py
================================================
import copy
import re
from typing import List
from credsweeper.common.constants import RuleType, MAX_LINE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import ValueSearchCheck
from credsweeper.filters.filter import Filter
from credsweeper.rules.rule import Rule
from credsweeper.scanner.scan_type.scan_type import ScanType
class MultiPattern(ScanType):
"""Check if line is a part of a multi-line credential and second part is present within MAX_SEARCH_MARGIN lines.
The first pattern of the rule is searched in the current line. The second pattern is searched
in the positions returned by `get_line_positions` until the first successful `_scan`.
Parameters:
MAX_SEARCH_MARGIN: Int constant. Number of lines around current to perform search for the second part
"""
MAX_SEARCH_MARGIN = 10
@classmethod
def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candidate]:
"""Check if multiline credential is present in the file within MAX_SEARCH_MARGIN range from current line.
Args:
config: user configs
rule: Rule object to check current line. Should be a multi-pattern rule
target: Analysis target
Return:
List of Candidates if pattern defined in a rule is present in a line
and second part of multi-pattern rule is present within MAX_SEARCH_MARGIN from the line.
Empty list - otherwise.
Raises:
ValueError: the rule type is not RuleType.MULTI
"""
if RuleType.MULTI != rule.rule_type:
raise ValueError(f"Rule `{rule}` provided to `{cls.__name__}`.run "
f"should have pattern_type equal to `{RuleType.MULTI.value}`")
# candidates for the first pattern of the rule
candidates = cls._get_candidates(config, rule, target)
for candidate in candidates:
# use additional filter to skip the value in first line_data and continues scan
# the rule filters are copied, so the extra filter is not added to rule.filters
filters = copy.deepcopy(rule.filters)
filters.append(ValueSearchCheck(config, candidate.line_data_list[0].value))
for line_pos in cls.get_line_positions(candidate.line_data_list[0].line_pos, target):
if cls._scan(config, candidate, line_pos, target, rule.patterns[1], filters):
break
# return candidates with multi line_data_list only
return [x for x in candidates if 1 < len(x.line_data_list)]
@classmethod
def get_line_positions(cls, line_pos: int, target: AnalysisTarget) -> List[int]:
"""Returns list of line positions to be scanned for second part of multi-pattern rule in a priority order.
Args:
line_pos: position of the line with the first part in the target lines
target: Analysis target
Return:
Empty list if `line_pos` is out of range of the target lines. Otherwise `line_pos` comes first (priority 0),
then existing positions up to MAX_SEARCH_MARGIN lines after and before it,
sorted in ascending order of (priority, position).
"""
if 0 <= line_pos < target.lines_len:
# the same line is first
priority_positions = [(0, line_pos)]
else:
return []
# margin order is constant at start
priority_forward = priority_backward = cls.MAX_SEARCH_MARGIN
# backward lines are second priority
priority_backward += cls.MAX_SEARCH_MARGIN
line_pos_margin = 1
# both priority counters only grow while the margin increases
while line_pos_margin <= cls.MAX_SEARCH_MARGIN:
# forward
line_pos_forward = line_pos + line_pos_margin
if 0 <= line_pos_forward < target.lines_len:
# closing '}' are counted against opening '{' within first MAX_LINE_LENGTH symbols of the line
# NOTE(review): the increment of the priority depends on the brackets difference - verify nesting of the branches
if forward_curled_diff := target.lines[line_pos_forward].count('}', 0, MAX_LINE_LENGTH):
forward_curled_diff -= target.lines[line_pos_forward].count('{', 0, MAX_LINE_LENGTH)
if 0 < forward_curled_diff:
priority_forward += cls.MAX_SEARCH_MARGIN * (1 + forward_curled_diff)
else:
priority_forward += cls.MAX_SEARCH_MARGIN
priority_positions.append((priority_forward, line_pos_forward))
# backward
line_pos_backward = line_pos - line_pos_margin
if 0 <= line_pos_backward < target.lines_len:
# mirrored check for backward direction: opening '{' are counted against closing '}'
if backward_curled_diff := target.lines[line_pos_backward].count('{', 0, MAX_LINE_LENGTH):
backward_curled_diff -= target.lines[line_pos_backward].count('}', 0, MAX_LINE_LENGTH)
if 0 < backward_curled_diff:
priority_backward += cls.MAX_SEARCH_MARGIN * (1 + backward_curled_diff)
else:
priority_backward += cls.MAX_SEARCH_MARGIN
priority_positions.append((priority_backward, line_pos_backward))
# increment the margin for next index
line_pos_margin += 1
# first item is priority, second - line_pos
priority_positions.sort()
return [x for _, x in priority_positions]
@classmethod
def _scan(cls, config: Config, candidate: Candidate, candi_line_pos: int, target: AnalysisTarget,
pattern: re.Pattern, filters: List[Filter]) -> bool:
"""Search for second pattern in multi-pattern rule.
Automatically update candidate with detected line if any.
Args:
config: dict, scanner configuration
candidate: Current credential candidate detected in the line
candi_line_pos: line position of lines around candidate to perform search
target: Analysis target
pattern: second pattern in a rule
filters: filters to be applied on candidate
Return:
Boolean. True if second part detected. False otherwise
"""
# the same lines, line numbers and descriptor are used, only the position is changed
new_target = AnalysisTarget(candi_line_pos, target.lines, target.line_nums, target.descriptor)
line_data_list = cls.get_line_data_list(config=config, target=new_target, pattern=pattern, filters=filters)
if not line_data_list:
return False
# all found line data are added to the candidate
candidate.line_data_list.extend(line_data_list)
return True
================================================
FILE: credsweeper/scanner/scan_type/pem_key_pattern.py
================================================
import logging
from typing import List
from credsweeper.common.constants import RuleType
from credsweeper.config.config import Config
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.rules.rule import Rule
from credsweeper.scanner.scan_type.scan_type import ScanType
from credsweeper.utils.pem_key_detector import PemKeyDetector
logger = logging.getLogger(__name__)
class PemKeyPattern(ScanType):
    """Scanner for PEM private keys: at most one key is reported for the current line of a target"""

    @classmethod
    def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candidate]:
        """Check if a PEM key starts in the target line.

        Args:
            config: user configs
            rule: Rule object to check current line. Should be a pem-pattern rule
            target: Analysis target

        Return:
            List with the first candidate for which PemKeyDetector returned lines of a PEM key.
            Empty list - otherwise

        Raises:
            ValueError: the rule type is not RuleType.PEM_KEY

        """
        if RuleType.PEM_KEY != rule.rule_type:
            raise ValueError(f"Rule `{rule}` provided to `{cls.__name__}`.run "
                             f"should have pattern_type equal to `{RuleType.PEM_KEY.value}`")

        for candidate in cls._get_candidates(config, rule, target):
            # a new detector is created for each candidate
            pem_lines = PemKeyDetector(config).detect_pem_key(candidate.line_data_list[0], target)
            if not pem_lines:
                continue
            # the detected lines replace the initial line data of the candidate
            candidate.line_data_list = pem_lines
            return [candidate]
        return []
================================================
FILE: credsweeper/scanner/scan_type/scan_type.py
================================================
import logging
import re
from abc import ABC, abstractmethod
from typing import List
from credsweeper.common.constants import RuleType, MIN_DATA_LEN
from credsweeper.config.config import Config
from credsweeper.credentials.candidate import Candidate, LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters.filter import Filter
from credsweeper.rules.rule import Rule
logger = logging.getLogger(__name__)
class ScanType(ABC):
"""Base class for all Scanners.
Scanner allow to check if regex pattern defined in a rule is present in a line.
Subclasses implement `run`. Helpers for matching, filtering and creation of candidates are defined here.
"""
@classmethod
@abstractmethod
def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candidate]:
"""Check if regex pattern defined in a rule is present in a line.
Args:
config: user configs
rule: Rule object to check current line
target: Analysis target
Return:
List of Candidate objects if pattern defined in a rule is present in a line
and filters defined in rule do not remove current line. Empty list - otherwise
"""
raise NotImplementedError()
@classmethod
def filtering(cls, target: AnalysisTarget, line_data: LineData, filters: List[Filter]) -> bool:
"""Check if line data should be removed based on filters.
Line data with an empty value is always removed.
The `use_filters` option is not checked here - `get_line_data_list` checks it before the call.
Args:
target: AnalysisTarget from which `line_data` was obtained
line_data: Line data to check with `filters`
filters: Filters to use
Return:
boolean: True if line_data should be removed. False otherwise.
"""
if not line_data.value:
logger.debug("Filtered line with empty value in file: %s:%d in line: %s value: '%s'", line_data.path,
line_data.line_num, line_data.line, line_data.value)
return True
# the first filter which reports the line data stops the check
for filter_ in filters:
if filter_.run(line_data, target):
logger.debug("Filtered line with filter: %s in file: %s:%d in line: %s value: %s",
filter_.__class__.__name__, line_data.path, line_data.line_num, line_data.line,
line_data.value)
return True
return False
@classmethod
def get_line_data_list(
cls, #
config: Config, #
target: AnalysisTarget, #
pattern: re.Pattern, #
filters: List[Filter]) -> List[LineData]:
"""Check if regex pattern is present in line, and line should not be removed by filters.
Args:
config: dict of credsweeper configuration
target: AnalysisTarget with all necessary data
pattern: Compiled regex object to be searched in line
filters: Filters to use
Return:
List of LineData objects if pattern matches the line and filters do not remove the match. Empty otherwise
"""
line_data_list: List[LineData] = []
# start - end positions for continuously searching for overlapping pattern
offsets = [(0, target.line_len)]
# each pending range of the line is searched separately; new ranges may be added during the search
while offsets:
offset_start, offset_end = offsets.pop()
# bypass_* describe a part of the line which should be searched again later
bypass_start = bypass_end = None
for _match in pattern.finditer(target.line, pos=offset_start, endpos=offset_end):
logger.debug("Valid line for pattern: %s in file: %s:%d in line: %s", pattern.pattern, target.file_path,
target.line_num, target.line)
line_data = LineData(config, target.line, target.line_pos, target.line_num, target.file_path,
target.file_type, target.info, pattern, _match)
if bypass_start and bypass_end:
# a bypass is pending from previous match - limit it with start of current variable or value
if 0 < line_data.variable_start:
bypass_end = line_data.variable_start
elif 0 < line_data.value_start:
bypass_end = line_data.value_start
if bypass_start < bypass_end and bypass_end - bypass_start > MIN_DATA_LEN:
offsets.append((bypass_start, bypass_end))
bypass_start = bypass_end = None
elif MIN_DATA_LEN < line_data.value_end < _match.end() \
and MIN_DATA_LEN < _match.end() - line_data.value_end:
# add bypass for valuable sanitized value
bypass_start = line_data.value_end
bypass_end = offset_end
if config.use_filters and cls.filtering(target, line_data, filters):
if line_data.variable and 0 <= line_data.variable_start < line_data.variable_end:
# may be next matched item will be not filtered - let search it after variable
bypass_start = line_data.variable_end
bypass_end = offset_end
elif line_data.value and 0 <= line_data.value_start < line_data.value_end:
# may be next matched item will be not filtered - let search it after value
bypass_start = line_data.value_end
bypass_end = offset_end
continue
if target.offset is not None:
# the target line is a chunk of long line - offsets have to be corrected
if 0 <= line_data.variable_start:
line_data.variable_start += target.offset
if 0 <= line_data.variable_end:
line_data.variable_end += target.offset
if 0 <= line_data.separator_start:
line_data.separator_start += target.offset
if 0 <= line_data.separator_end:
line_data.separator_end += target.offset
# value positions are mandatory
line_data.value_start += target.offset
line_data.value_end += target.offset
# get the original line
line_data.line = target.lines[target.line_pos]
line_data_list.append(line_data)
# a bypass which is still pending is searched as a new range
if bypass_start and bypass_end:
offsets.append((bypass_start, bypass_end))
return line_data_list
@classmethod
def _get_candidates(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candidate]:
"""Returns Candidate objects list.
Lines from `config.exclude_lines` and values from `config.exclude_values` produce no candidates.
Args:
config: user configs
rule: Rule object to check current line
target: Target for analysis
Return:
List of Candidate objects if pattern defined in a rule is present in a line
and filters defined in rule do not remove current line. Empty list - otherwise
"""
candidates: List[Candidate] = []
if config.exclude_lines and target.line_strip in config.exclude_lines:
return candidates
# only the first pattern of the rule is used to create candidates
if line_data_list := cls.get_line_data_list(config=config,
target=target,
pattern=rule.patterns[0],
filters=rule.filters):
for line_data in line_data_list:
if config.exclude_values and line_data.value.strip() in config.exclude_values:
continue
candidate = Candidate(line_data_list=[line_data],
patterns=rule.patterns,
rule_name=rule.rule_name,
severity=rule.severity,
config=config,
use_ml=rule.use_ml,
confidence=rule.confidence)
# single pattern with multiple values means all the patterns must matched in target
if 1 < len(rule.patterns) and rule.rule_type in (RuleType.PATTERN, RuleType.KEYWORD):
# additional check whether all patterns match
if not cls._aux_scan(config, rule, target, candidate):
# cannot find secondary values for the candidate
continue
candidates.append(candidate)
return candidates
@classmethod
def _aux_scan(cls, config: Config, rule: Rule, target: AnalysisTarget, candidate: Candidate) -> bool:
"""Check for all secondary patterns of the rule in the same target.
Found line data are appended to the candidate.
Args:
config: user configs
rule: Rule object with more than one pattern
target: Target for analysis
candidate: candidate created for the first pattern
Return:
True if each secondary pattern has a value which is not in `config.exclude_values`. False otherwise
"""
for pattern in rule.patterns[1:]:
line_data_list = cls.get_line_data_list(config=config, target=target, pattern=pattern, filters=rule.filters)
pattern_matched = False
for line_data in line_data_list:
# standard filtering of values from config
if config.exclude_values and line_data.value.strip() in config.exclude_values:
continue
candidate.line_data_list.append(line_data)
pattern_matched = True
if not pattern_matched:
return False
# all secondary patterns were matched and candidate is filled with the values
return True
================================================
FILE: credsweeper/scanner/scan_type/single_pattern.py
================================================
from typing import List
from credsweeper.common.constants import RuleType
from credsweeper.config.config import Config
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.rules.rule import Rule
from credsweeper.scanner.scan_type.scan_type import ScanType
class SinglePattern(ScanType):
    """Scanner for rules which are checked within a single line (pattern and keyword rules)."""

    @classmethod
    def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candidate]:
        """Search the pattern of the rule in the target line.

        Args:
            config: config object of user configs
            rule: Rule object to check current line
            target: Analysis target

        Return:
            List of Candidate objects if pattern defined in a rule is present in a line
            and filters defined in rule do not remove current line. Empty list - otherwise

        Raises:
            ValueError: the rule type is neither RuleType.PATTERN nor RuleType.KEYWORD

        """
        if rule.rule_type not in (RuleType.PATTERN, RuleType.KEYWORD):
            raise ValueError(f"Rule `{rule}` provided to `{cls.__name__}`.run "
                             f"should have pattern_type equal to `{RuleType.PATTERN.value}`")
        # candidates are created by the base class helper
        return cls._get_candidates(config, rule, target)
================================================
FILE: credsweeper/scanner/scanner.py
================================================
import logging
import re
from pathlib import Path
from typing import List, Type, Tuple, Union, Dict, Generator, Set
from credsweeper.app import APP_PATH
from credsweeper.common.constants import RuleType, MIN_VARIABLE_LENGTH, MIN_SEPARATOR_LENGTH, MIN_VALUE_LENGTH, \
MAX_LINE_LENGTH, PEM_BEGIN_PATTERN
from credsweeper.config.config import Config
from credsweeper.credentials.candidate import Candidate
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.rules.rule import Rule
from credsweeper.scanner.scan_type.multi_pattern import MultiPattern
from credsweeper.scanner.scan_type.pem_key_pattern import PemKeyPattern
from credsweeper.scanner.scan_type.scan_type import ScanType
from credsweeper.scanner.scan_type.single_pattern import SinglePattern
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
RULES_PATH = APP_PATH / "rules" / "config.yaml"
class Scanner:
    """Credential scanner which applies loaded rules to the lines of a content provider.

    Parameters:
        config: user configs
        rules_scanners: pairs of a rule and the scanner class which runs the rule
        min_pattern_len: minimal line length specified in all pattern rules
        min_keyword_len: minimal line length specified in all keyword rules
        min_pem_key_len: minimal line length specified in all PEM key rules
        min_multi_len: minimal line length specified in all multi-pattern rules
        min_len: the smallest of the lengths above
            and MIN_VARIABLE_LENGTH + MIN_SEPARATOR_LENGTH + MIN_VALUE_LENGTH
        TargetGroup: Type for List[Tuple[AnalysisTarget, str, int]]

    """

    TargetGroup = List[Tuple[AnalysisTarget, str, int]]

    def __init__(self, config: Config, rule_path: Union[None, str, Path]) -> None:
        """Load the rules and calculate the minimal lengths.

        Args:
            config: user configs
            rule_path: path to a file with rules, None means RULES_PATH

        """
        self.config = config
        # every minimum starts from MAX_LINE_LENGTH and may be decreased in _set_rules_scanners
        self.min_keyword_len = MAX_LINE_LENGTH
        self.min_pattern_len = MAX_LINE_LENGTH
        self.min_pem_key_len = MAX_LINE_LENGTH
        self.min_multi_len = MAX_LINE_LENGTH
        self.rules_scanners: List[Tuple[Rule, Type[ScanType]]] = []
        self._set_rules_scanners(rule_path)
        shortest_assignment = MIN_VARIABLE_LENGTH + MIN_SEPARATOR_LENGTH + MIN_VALUE_LENGTH
        self.min_len = min(self.min_pattern_len, self.min_keyword_len, self.min_pem_key_len, self.min_multi_len,
                           shortest_assignment)
        self.__keyword_rules_required_substrings = self._get_required_substrings(RuleType.KEYWORD)

    def keywords_required_substrings_check(self, text: str) -> bool:
        """Check whether `text` contains a required substring of any keyword rule"""
        return self._substring_check(self.__keyword_rules_required_substrings, text)

    def _get_required_substrings(self, rule_type: RuleType) -> Set[str]:
        """Collect required substrings of all loaded rules with the given rule type"""
        collected: Set[str] = set()
        for rule, _ in self.rules_scanners:
            if rule_type == rule.rule_type:
                collected |= set(rule.required_substrings)
        return collected

    @staticmethod
    def _substring_check(substrings: Set[str], text: str) -> bool:
        """Check whether `text` contains at least one of `substrings`"""
        return any(substring in text for substring in substrings)

    def _set_rules_scanners(self, rules_path: Union[None, str, Path]) -> None:
        """Auxiliary method to load rules, update minimal lengths and assign scanners.

        Args:
            rules_path: path to a file with rules, None means RULES_PATH

        Raises:
            RuntimeError: the loaded data is empty or not a list, or a rule name is duplicated

        """
        if rules_path is None:
            rules_path = RULES_PATH
        rule_templates = Util.yaml_load(rules_path)
        if not (rule_templates and isinstance(rule_templates, list)):
            raise RuntimeError(f"Wrong rules '{rule_templates}' were read from '{rules_path}'")

        known_names = set()
        for rule_template in rule_templates:
            try:
                rule = Rule(self.config, rule_template)
            except Exception as exc:
                logger.error("Rule creation error%s", str(rule_template))
                raise exc
            # a rule which is not available for current config is skipped before the name check
            if not self._is_available(rule):
                continue
            if rule.rule_name in known_names:
                raise RuntimeError(f"Duplicated rule name {rule.rule_name}")
            known_names.add(rule.rule_name)

            rule_min_len = rule.min_line_len
            if 0 < rule_min_len:
                rule_type = rule.rule_type
                if rule_type == RuleType.KEYWORD:
                    self.min_keyword_len = min(self.min_keyword_len, rule_min_len)
                elif rule_type == RuleType.PATTERN:
                    self.min_pattern_len = min(self.min_pattern_len, rule_min_len)
                elif rule_type == RuleType.PEM_KEY:
                    self.min_pem_key_len = min(self.min_pem_key_len, rule_min_len)
                elif rule_type == RuleType.MULTI:
                    self.min_multi_len = min(self.min_multi_len, rule_min_len)
                else:
                    logger.warning("Unknown rule type:%s", rule.rule_type)
            self.rules_scanners.append((rule, self.get_scanner(rule)))

    def _is_available(self, rule: Rule) -> bool:
        """Check the rule against configured severity and scan mode (doc or code)"""
        if rule.severity < self.config.severity:
            return False
        # the rule has to declare the target which corresponds to the current mode
        required_target = "doc" if self.config.doc else "code"
        return required_target in rule.target

    def yield_rule_scanner(
            self,  #
            line_len: int,  #
            matched_pattern: bool,  #
            matched_keyword: bool,  #
            matched_pem_key: bool,  #
            matched_multi: bool) -> Generator[Tuple[Rule, Type[ScanType]], None, None]:
        """Yield rules with according scanners which are applicable to a line.

        Args:
            line_len: length of the line
            matched_pattern: pattern rules are allowed
            matched_keyword: keyword rules are allowed
            matched_pem_key: PEM key rules are allowed
            matched_multi: multi-pattern rules are allowed

        """
        for rule, scanner in self.rules_scanners:
            if line_len < rule.min_line_len:
                continue
            rule_type = rule.rule_type
            if (RuleType.PATTERN == rule_type and matched_pattern
                    or RuleType.KEYWORD == rule_type and matched_keyword
                    or RuleType.PEM_KEY == rule_type and matched_pem_key
                    or RuleType.MULTI == rule_type and matched_multi):
                yield rule, scanner

    def scan(self, provider: ContentProvider) -> List[Candidate]:
        """Scan all analysis targets of the provider with the loaded rules.

        Args:
            provider: objects with data to analyze: line, line number,
              filepath and all lines in file

        Return:
            list of all detected credential candidates in analyzed targets

        """
        found: List[Candidate] = []
        for target in provider.yield_analysis_target(self.min_len):
            # the line without outer spaces makes `x in str` checks faster
            stripped = target.line_strip
            stripped_len = target.line_strip_len
            # lower case is used for required substrings
            stripped_lower = target.line_lower_strip

            # cheap checks whether a rule type makes sense for the line at all
            matched_keyword = stripped_len >= self.min_keyword_len and (
                '=' in stripped  #
                or ':' in stripped  #
                or ("define" in stripped  #
                    and ('(' in stripped and ',' in stripped  #
                         or "#define" in stripped  #
                         or "%define" in stripped))  #
                or "%global" in stripped  #
                or "set" in stripped_lower  #
                or "%3d" in stripped_lower)
            matched_pem_key = (stripped_len >= self.min_pem_key_len  #
                               and PEM_BEGIN_PATTERN in stripped  #
                               and "PRIVATE" in stripped)
            matched_pattern = stripped_len >= self.min_pattern_len
            matched_multi = stripped_len >= self.min_multi_len

            if not matched_keyword and not matched_pem_key and not matched_pattern and not matched_multi:
                # target may be skipped only with length because not all rules have required_substrings
                logger.debug("Skip too short (%d) line %s:%d", stripped_len, target.file_path, target.line_num)
                continue

            # results of required regexes for the target - the same regex may be used in many rules
            matched_regex: Dict[re.Pattern, bool] = {}
            for rule, scanner in self.yield_rule_scanner(stripped_len, matched_pattern, matched_keyword,
                                                         matched_pem_key, matched_multi):
                if rule.has_required_substrings \
                        and not self._substring_check(rule.required_substrings, stripped_lower):
                    continue
                required_regex = rule.required_regex
                if required_regex:
                    regex_result = matched_regex.get(required_regex)
                    if regex_result is None:
                        regex_result = bool(required_regex.search(stripped))
                        matched_regex[required_regex] = regex_result
                    if not regex_result:
                        continue
                new_credentials = scanner.run(self.config, rule, target)
                if new_credentials:
                    found.extend(new_credentials)
                    logger.debug("Credential for rule: %s in file: %s:%d in line: %s", rule.rule_name,
                                 target.file_path, target.line_num, target.line)
        return found

    @staticmethod
    def get_scanner(rule: Rule) -> Type[ScanType]:
        """Choose the scanner class for the rule.

        Args:
            rule: rule object used to scanning

        Return:
            SinglePattern for pattern and keyword rules, MultiPattern for multi-pattern rules,
            PemKeyPattern for PEM key rules

        Raises:
            ValueError: the rule type is unknown

        """
        rule_type = rule.rule_type
        if RuleType.MULTI == rule_type:
            return MultiPattern
        if RuleType.PEM_KEY == rule_type:
            return PemKeyPattern
        if rule_type in (RuleType.PATTERN, RuleType.KEYWORD):
            return SinglePattern
        raise ValueError(f"Unknown pattern_type in rule: {rule.rule_type}")
================================================
FILE: credsweeper/secret/config.json
================================================
{
"exclude": {
"pattern": [],
"containers": [
".aar",
".apk",
".bz2",
".class",
".gz",
".jar",
".lzma",
".rpm",
".tar",
".war",
".whl",
".xz",
".zip"
],
"documents": [
".doc",
".docx",
".odp",
".ods",
".odt",
".pdf",
".ppt",
".pptx",
".xls",
".xlsx"
],
"extension": [
".3gp",
".7z",
".a",
".aac",
".avi",
".bin",
".bmp",
".css",
".dmg",
".ear",
".eot",
".elf",
".exe",
".gif",
".gmo",
".ico",
".img",
".info",
".jpeg",
".jpg",
".lib",
".map",
".m4a",
".m4b",
".m4p",
".m4r",
".mat",
".mo",
".mov",
".mp3",
".mp4",
".mpg",
".mkv",
".npy",
".npz",
".obj",
".oga",
".ogg",
".ogv",
".ops",
".pak",
".png",
".psd",
".pyc",
".pyd",
".pyo",
".rar",
".rc",
".rc2",
".realm",
".res",
".s7z",
".scss",
".so",
".sum",
".svg",
".swf",
".tif",
".tiff",
".tlb",
".ttf",
".vcxproj",
".vdproj",
".wav",
".webm",
".webp",
".wma",
".woff",
".woff2",
".yuv"
],
"path": [
"/.git/",
"/.idea/",
"/.svn/",
"/__pycache__/",
"/node_modules/",
"/target/",
"/.venv/",
"/venv/"
],
"lines": [],
"values": []
},
"source_ext": [
".aspx",
".cs",
".cshtml",
".ejs",
".erb",
".go",
".html",
".ipynb",
".jsp",
".jsx",
".php",
".phtml",
".rb",
".sh",
".swift",
".ts",
".twig",
".vue",
".xhtml",
".java",
".js",
".py",
".cpp",
".c",
".h",
".hpp",
".mm",
".cu",
".y",
".vb",
".m",
".cu"
],
"source_quote_ext": [
".cs",
".cc",
".php",
".tf",
".kt",
".go",
".ipynb",
".ts",
".java",
".js",
".py",
".cpp",
".c",
".h",
".hpp"
],
"find_by_ext_list": [
".pem",
".cer",
".csr",
".der",
".pfx",
".p12",
".key",
".jks"
],
"bruteforce_list": [
"",
"changeit",
"changeme",
"tizen"
],
"check_for_literals": true,
"max_password_value_length": 64,
"max_url_cred_value_length": 80,
"line_data_output": [
"line",
"line_num",
"path",
"info",
"variable",
"variable_start",
"variable_end",
"value",
"value_start",
"value_end",
"entropy"
],
"candidate_output": [
"rule",
"severity",
"confidence",
"ml_probability",
"line_data_list"
]
}
================================================
FILE: credsweeper/secret/log.yaml
================================================
---
# Logging configuration: formatters, handlers and the root logger.
version: 1
disable_existing_loggers: False
# NOTE(review): presumably the names of third-party loggers to be silenced - confirm in the code which loads this file
ignore: [git, pdfminer]
formatters:
# short format for the console and the main log file
simple:
format: "%(asctime)s | %(levelname)s | %(module)s:%(lineno)d | %(message)s"
# detailed format with process and thread names for the error log
verbose:
format: "%(asctime)s | %(levelname)s | %(module)s | %(processName)s:%(threadName)s | %(filename)s:%(lineno)s | %(message)s"
handlers:
# messages of INFO level and above are written to stdout
console:
class: logging.StreamHandler
level: INFO
formatter: simple
stream: ext://sys.stdout
# rotating file with messages of DEBUG level and above; the handler is not in the root handlers list below
logfile:
class: logging.handlers.RotatingFileHandler
level: DEBUG
formatter: simple
filename: ./log/credsweeper.log
maxBytes: 50485760
backupCount: 100
delay: True
# rotating file with messages of ERROR level and above
error_log:
class: logging.handlers.RotatingFileHandler
level: ERROR
formatter: verbose
filename: ./log/error.log
maxBytes: 10485760
backupCount: 5
# the root logger passes everything from DEBUG level to the console and the error log handlers
root:
level: DEBUG
handlers: [console, error_log]
================================================
FILE: credsweeper/utils/__init__.py
================================================
================================================
FILE: credsweeper/utils/hop_stat.py
================================================
import statistics
from typing import Tuple, Dict
class HopStat:
    """Statistics of distances on a qwerty keyboard layout between neighbour symbols of a value"""

    # rows of the keyboard; '\0' is a placeholder which shifts a row and is never a key
    KEYBOARD = (  #
        "`1234567890-=",  #
        "\0qwertyuiop[]\\",  #
        "\0\0asdfghjkl;'",  #
        "\0\0zxcvbnm,./",  #
    )

    # shifted symbols and upper case letters are replaced with the symbol of the same key
    TRANSLATION = str.maketrans(
        dict(
            zip(
                "~!@#$%^&*()_+" "QWERTYUIOP{}|" "ASDFGHJKL:\"" "ZXCVBNM<>?",
                "`1234567890-=" "qwertyuiop[]\\" "asdfghjkl;'" "zxcvbnm,./",
            )))

    def __init__(self):
        """Precalculate distances for all ordered pairs of keys"""
        keys = [key for row in HopStat.KEYBOARD for key in row if key != '\0']
        coordinates = {key: self.__get_xyz(key) for key in keys}
        hop_dict: Dict[Tuple[str, str], int] = {}
        for a in keys:
            x_a, y_a, z_a = coordinates[a]
            for b in keys:
                x_b, y_b, z_b = coordinates[b]
                # the distance is symmetric and is 0 for the same key
                hop_dict[(a, b)] = (abs(x_a - x_b) + abs(y_a - y_b) + abs(z_a - z_b)) // 2
        self.__hop_dict = hop_dict

    @staticmethod
    def __get_xyz(c: str) -> Tuple[int, int, int]:
        """Returns axial coordinates of a char on keyboard qwerty"""
        for row_index, row in enumerate(HopStat.KEYBOARD):
            column = row.find(c)
            if column < 0:
                continue
            x = column - row_index // 2
            return x, -(row_index + x), row_index
        # the char is absent in all rows
        return -1, 0, 0

    def stat(self, value: str) -> Tuple[float, float]:
        """Calculates statistics of distances between neighbour symbols of the value

        Args:
            value: string based on initial alphabet

        Return:
            Average distance and standard deviation of the distances

        Raises:
            ValueError: a symbol of the value is not in initial alphabet
            statistics.StatisticsError: raised by statistics functions when there are too few distances

        """
        translated = value.translate(HopStat.TRANSLATION)
        hops = []
        for pair in zip(translated, translated[1:]):
            hop = self.__hop_dict.get(pair)
            if hop is None:
                raise ValueError(f"Unknown char '{pair[0]}' or '{pair[1]}'")
            hops.append(hop)
        avg = statistics.mean(hops)
        return avg, statistics.stdev(hops, avg)
================================================
FILE: credsweeper/utils/pem_key_detector.py
================================================
import contextlib
import logging
import re
import string
from typing import List
from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, Chars, MAX_LINE_LENGTH
from credsweeper.config.config import Config
from credsweeper.credentials.line_data import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.utils.util import Util
logger = logging.getLogger(__name__)
class PemKeyDetector:
"""Class to detect PEM PRIVATE keys only"""
BASE64_CHARS_SET = set(Chars.BASE64STDPAD_CHARS.value)
RE_BASE64_CHARS = re.compile(fr"[{re.escape(Chars.BASE64STDPAD_CHARS.value)}]+")
ENTROPY_LIMIT_BASE64 = 4.5
# the limit is huge with possible prefixes and escaping
MAX_PEM_LENGTH = 4 * MAX_LINE_LENGTH
IGNORE_STARTS = [PEM_BEGIN_PATTERN, "Proc-Type", "Version", "DEK-Info"]
WRAP_CHARACTERS = "\\'\"`;,[]#*!"
REMOVE_CHARACTERS = string.whitespace + WRAP_CHARACTERS
# last line contains 4 symbols, at least
RE_PEM_BEGIN = re.compile(r"(?P" + PEM_BEGIN_PATTERN +
r"(?![^-]{1,80}ENCRYPTED)[^-]{0,80}PRIVATE[^-]{1,80}KEY[^-]{0,80}-----"
r"(.{1,8000}" + PEM_END_PATTERN + r"[^-]{1,80}KEY[^-]{0,80}-----)?)")
RE_PEM_VALUE = re.compile(fr"(?P.{{0,{MAX_PEM_LENGTH}}})")
def __init__(self, config: Config):
    """Keep the config and start without a detected barrier.

    Args:
        config: user configs

    """
    self.__config = config
    # empty barrier symbol and -2 for both positions are the initial state
    self._barrier: str = ''
    self._barrier_cut: int = -2
    self._barrier_pos: int = -2
def cut_barrier(self, line: str) -> str:
"""Cut off barrier if detected
Args:
line: text to be checked for the barrier symbol at the stored position
Return:
The line from the stored cut position when the symbol at the barrier position is the barrier.
The line as is - otherwise
"""
# the barrier has to be set and both stored positions have to be inside the line
if self._barrier and 0 <= self._barrier_pos < self._barrier_cut < len(line):
if line[self._barrier_pos] == self._barrier:
return line[self._barrier_cut:]
# NOTE(review): the barrier is dropped when the line is not cut - verify nesting of the reset
self._barrier = ''
self._barrier_pos = self._barrier_cut = -1
return line
def set_barrier(self, line: str, start=0, end=MAX_LINE_LENGTH):
    """Detect the barrier symbol which stands right before the marker found in the line.

    The barrier is set only if the symbol is not a base64 symbol.
    NOTE(review): the previous summary was "Detects barrier with offset of RE_PEM_BEGIN",
    but the searched marker is PEM_END_PATTERN - confirm which marker is intended.

    Args:
        line: text where the marker is searched
        start: first position of the search
        end: position where the search stops

    """
    cut = line.find(PEM_END_PATTERN, start, end)
    pos = cut - 1
    self._barrier_cut = cut
    self._barrier_pos = pos
    # the barrier stays empty when the marker is absent or starts the line
    self._barrier = ''
    if 0 <= pos < cut < len(line):
        symbol = line[pos]
        if symbol not in PemKeyDetector.BASE64_CHARS_SET:
            self._barrier = symbol
def detect_pem_key(self, first_line: LineData, target: AnalysisTarget) -> List[LineData]:
    """Detects PEM key in single line and with iterative for next lines according
    https://www.rfc-editor.org/rfc/rfc7468

    The PEM text is taken either from the first line only (when the end marker is found there)
    or from the first line plus up to 200 following lines. Then the text is unescaped, split to
    lines, every line is sanitized and checked to be base64 until the end marker is reached.

    Args:
        first_line: detected -----BEGIN from rule pattern
        target: Analysis target

    Return:
        List of LineData with found PEM or an empty list when the key was not collected

    """
    line_data_list: List[LineData] = []
    key_data_list: List[str] = []
    # escaped key in one line with prefixes
    pem_end_limit = min(target.line_len, first_line.value_start + PemKeyDetector.MAX_PEM_LENGTH)
    first_line_end_pattern_start = target.line.find(PEM_END_PATTERN, first_line.value_start, pem_end_limit)
    # -2 is used when the end marker is absent, so the chained comparison below fails
    first_line_end_pattern_end = (  #
        target.line.find("-----", first_line_end_pattern_start + 5, first_line_end_pattern_start + 80)  #
        if 0 <= first_line_end_pattern_start else -2)
    if first_line.value_start < first_line_end_pattern_start < first_line_end_pattern_end:
        # the whole PEM in single line
        pem_text = target.line[first_line.value_start:first_line_end_pattern_end + 5]
        first_line.value = pem_text
        first_line.value_end = first_line.value_start + len(pem_text)
        line_data_list.append(first_line)
    else:
        line_data_list.append(first_line)
        pem_text = first_line.line[first_line.value_start:first_line.value_start + PemKeyDetector.MAX_PEM_LENGTH]
        # perhaps, in next lines
        start_pos = max(0, target.line_pos) + 1
        # no more than 200 following lines are inspected
        end_pos = min(start_pos + 200, target.lines_len)
        for line_pos in range(start_pos, end_pos):
            target_line = target.lines[line_pos]
            end_pattern_start = target_line.find(PEM_END_PATTERN, 0, PemKeyDetector.MAX_PEM_LENGTH)
            # NOTE(review): when the closing dashes are not found, find() returns -1 and the value
            # becomes 4, which still passes the check below for end_pattern_start in 0..3 - confirm
            end_pattern_end = (5 + target_line.find("-----", end_pattern_start + 5, end_pattern_start + 80)
                               if 0 <= end_pattern_start else -2)
            if 0 <= end_pattern_start < end_pattern_end:
                # cut everything after the end marker
                pem_line = target_line[:end_pattern_end]
            else:
                pem_line = target_line[:PemKeyDetector.MAX_PEM_LENGTH]
            next_line = LineData(self.__config, target_line, line_pos, target.line_nums[line_pos], target.file_path,
                                 target.file_type, target.info, PemKeyDetector.RE_PEM_VALUE)
            line_data_list.append(next_line)
            pem_text += f"\n{pem_line}"
            if PEM_END_PATTERN in pem_line:
                break
            if PemKeyDetector.MAX_PEM_LENGTH < len(pem_text):
                logger.warning("PEM text oversize")
                return []
        else:
            # the loop was not interrupted - the end marker is absent in the inspected lines
            logger.warning("PEM end not found %s", target.descriptor)
            return []

    while "\\\\" in pem_text:
        # reduce JSON escaping sequences of backslash
        pem_text = pem_text.replace("\\\\", '\\')
    # replace escaped line ends with real and process them - PEM does not contain '\' sign
    pem_text = pem_text.replace("\\r\\n", '\n').replace("\\r", '\n').replace("\\n", '\n').replace("\\t", '\t')
    pem_lines = pem_text.splitlines()
    # the prefix of the last line (with the end marker) is expected to be repeated in the other lines
    self.set_barrier(pem_lines[-1])
    for subline in pem_lines:
        if PemKeyDetector.is_leading_config_line(subline):
            continue
        _subline = self.cut_barrier(subline)
        if sanitized_line := PemKeyDetector.sanitize_line(_subline):
            if PEM_END_PATTERN in sanitized_line:
                # the end marker is reached - validate the collected data
                return PemKeyDetector.finalize(line_data_list, key_data_list, sanitized_line)
            # the end is not reached - sanitize the data
            # PEM key line should not contain spaces or . (and especially not ...)
            if not PemKeyDetector.RE_BASE64_CHARS.fullmatch(sanitized_line):
                return []
            key_data_list.append(sanitized_line)
    # the end marker did not survive the sanitizing
    return []
@staticmethod
def finalize(line_data_list: List[LineData], key_data_list: List[str], last_line: str) -> List[LineData]:
    """Validate the collected key data with a check which depends on the key type.

    Args:
        line_data_list: LineData of all lines which belong to the PEM; the first one holds the header
        key_data_list: sanitized base64 lines of the key body
        last_line: sanitized line with the PEM end marker

    Return:
        line_data_list when the key data passed the check, otherwise an empty list
    """
    if len(key_data_list) < len(line_data_list):
        PemKeyDetector.sanitize_line_data_list(line_data_list, key_data_list, last_line)
    key_data = ''.join(key_data_list)
    # the header is read after the sanitizing because values may be updated there
    pem_header = line_data_list[0].value

    if "PGP" in pem_header:
        # Check if entropy is high enough for base64 set with padding sign
        entropy = Util.get_shannon_entropy(key_data)
        if entropy >= PemKeyDetector.ENTROPY_LIMIT_BASE64:
            return line_data_list
        logger.debug("Filtered with entropy %f '%s'", entropy, key_data)

    if "OPENSSH" in pem_header:
        # Check whether the key is encrypted
        with contextlib.suppress(Exception):
            raw = Util.decode_base64(key_data, urlsafe_detect=True)
            # 256 bits is the minimal size of Ed25519 keys
            if len(raw) > 32 and b"bcrypt" not in raw:
                # all OK - the key is not encrypted in this top level
                return line_data_list
        logger.debug("Filtered with size or bcrypt '%s'", key_data)
    else:
        with contextlib.suppress(Exception):
            raw = Util.decode_base64(key_data, padding_safe=True, urlsafe_detect=True)
            if raw and len(raw) == Util.get_asn1_size(raw):
                # all OK - the key is not encrypted in this top level
                return line_data_list
        logger.debug("Filtered with non asn1 '%s'", key_data)
    return []
@staticmethod
def sanitize_line_data_list(line_data_list: List[LineData], key_data_list: List[str], last_line: str):
    """Sanitize line_data_list to keep only valuable values

    For every sanitized key line the first LineData which contains it is found and its value with
    value_start and value_end is narrowed to the key line. The last LineData is narrowed to the
    PEM end marker line. The items of line_data_list are updated in place.

    Args:
        line_data_list: LineData of the lines which belong to the PEM
        key_data_list: sanitized base64 lines of the key body
        last_line: sanitized line with the PEM end marker

    """
    for value in key_data_list:
        if 64 <= len(value):
            # normal value length should not have a collision
            for line_data in line_data_list:
                if value == line_data.value:
                    # plain case - no sanitize necessary
                    break
                value_start = line_data.value.find(value)
                if 0 <= value_start:
                    line_data.value = value
                    line_data.value_start = value_start
                    line_data.value_end = value_start + len(value)
                    break
        else:
            # end of pem may be short and have collisions in long lines
            # the optional neighbours of the value must not be base64 symbols
            value_pattern = re.compile(fr".*[^0-9A-Za-z+/=]?({re.escape(value)})[^0-9A-Za-z+/=]?.*")
            for line_data in line_data_list:
                if value == line_data.value:
                    # plain case - no sanitize necessary
                    break
                if value_match := value_pattern.fullmatch(line_data.value):
                    line_data.value = value_match.group(1)
                    line_data.value_start, line_data.value_end = value_match.span(1)
                    break
    if last_line.startswith(PEM_END_PATTERN) and last_line.endswith("-----"):
        # narrow the last LineData to the end marker only
        last_line_data = line_data_list[-1]
        last_value_start = last_line_data.value.find(last_line, 0, PemKeyDetector.MAX_PEM_LENGTH)
        if 0 <= last_line_data.value_start <= last_value_start:
            # left barrier was sanitized
            last_line_data.value = last_line
            last_line_data.value_start = last_value_start
            last_line_data.value_end = last_value_start + len(last_line)
@staticmethod
def sanitize_line(line: str, recurse_level: int = 5) -> str:
    """Strip the symbols which usually surround a PEM key line inside a source code.

    Examples::

        `# ZZAWarrA1`
        `* ZZAWarrA1`
        `  "ZZAWarrA1\\n" + `

    Args:
        line: Line to be cleaned
        recurse_level: the number of passes which are still allowed; it protects from an endless
            recursion when a wrapping symbol stays inside the text

    Return:
        line with special characters removed from both ends

    """
    if recurse_level < 1:
        # the quota of passes is over - return the text as is
        return line
    remaining_depth = recurse_level - 1

    # Note that this strip would remove `\n` but not `\\n`
    text = line.strip(string.whitespace)
    # leading comment markers
    while text.startswith(("// ", "//\t")):
        text = text[3:]
    while text.startswith(("/// ", "///\t")):
        text = text[4:]
    while text.startswith("/*"):
        text = text[2:]
    # trailing comment end and line carry, which is used in many languages
    while text.endswith("*/"):
        text = text[:-2]
    while text.endswith("\\"):
        text = text[:-1]

    # remove concatenation carefully only when it is not part of base64
    base64_chars = PemKeyDetector.BASE64_CHARS_SET
    if len(text) > 1 and text[0] == '+' and text[1] not in base64_chars:
        text = text[1:]
    if len(text) > 2 and text[-1] == '+' and text[-2] not in base64_chars:
        text = text[:-1]

    text = text.strip(PemKeyDetector.REMOVE_CHARACTERS)
    # one more pass is necessary while a wrapping symbol stays inside
    if any(symbol in text for symbol in PemKeyDetector.WRAP_CHARACTERS):
        return PemKeyDetector.sanitize_line(text, recurse_level=remaining_depth)
    return text
@staticmethod
def is_leading_config_line(line: str) -> bool:
"""Remove non-key lines from the beginning of a list.
Example lines with non-key leading lines:
.. code-block:: text
Proc-Type: 4,ENCRYPTED
DEK-Info: DEK-Info: AES-256-CBC,2AA219GG746F88F6DDA0D852A0FD3211
ZZAWarrA1...
Args:
line: Line to be checked
Return:
True if the line is not a part of encoded data but leading config
"""
if not line:
return True
for ignore_string in PemKeyDetector.IGNORE_STARTS:
if ignore_string in line:
return True
return False
================================================
FILE: credsweeper/utils/util.py
================================================
import ast
import base64
import contextlib
import json
import logging
import math
import os
import random
import re
import string
import warnings
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional, Union
import numpy as np
import yaml
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding
from cryptography.hazmat.primitives.asymmetric.dh import DHPrivateKey, DHPublicKey
from cryptography.hazmat.primitives.asymmetric.dsa import DSAPrivateKey, DSAPublicKey
from cryptography.hazmat.primitives.asymmetric.ec import EllipticCurvePrivateKey, EllipticCurvePublicKey
from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey, Ed25519PublicKey
from cryptography.hazmat.primitives.asymmetric.ed448 import Ed448PrivateKey, Ed448PublicKey
from cryptography.hazmat.primitives.asymmetric.rsa import RSAPrivateKey
from cryptography.hazmat.primitives.asymmetric.types import PrivateKeyTypes
from cryptography.hazmat.primitives.asymmetric.x25519 import X25519PublicKey, X25519PrivateKey
from cryptography.hazmat.primitives.asymmetric.x448 import X448PublicKey, X448PrivateKey
from cryptography.hazmat.primitives.serialization import load_der_private_key
from cryptography.hazmat.primitives.serialization.pkcs12 import load_key_and_certificates
from lxml import etree
from credsweeper.common.constants import AVAILABLE_ENCODINGS, \
DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, ASCII, UTF_16_LE, UTF_16_BE
logger = logging.getLogger(__name__)
class Util:
"""Class that contains different useful methods."""
@staticmethod
def get_extension(file_path: str, lower=True) -> str:
"""Return extension of file in lower case by default e.g.: '.txt', '.JPG'"""
_, extension = os.path.splitext(str(file_path))
return extension.lower() if lower else extension
@staticmethod
def get_regex_combine_or(re_strs: List[str]) -> str:
"""Routine combination for regex 'or'"""
result = "(?:"
for elem in re_strs:
result += elem + "|"
if result[-1] == "|":
result = result[:-1]
result += ")"
return result
@staticmethod
def get_shannon_entropy(data: Union[str, bytes]) -> float:
"""Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html."""
if not data:
return 0.0
size = len(data)
_uniq, counts = np.unique(list(data), return_counts=True)
probabilities = counts / size
entropy = -float(np.sum(probabilities * np.log2(probabilities)))
return entropy
# Precalculated data for speedup
# key - size of data, value - minimal entropy; the table is read by get_min_data_entropy()
MIN_DATA_ENTROPY: Dict[int, float] = {
    16: 1.66973671780348,
    20: 2.07723544540831,
    32: 3.25392803184602,
    40: 3.64853567064867,
    64: 4.57756933688035,
    384: 7.39,
    512: 7.55,
}
@staticmethod
def get_min_data_entropy(x: int) -> float:
    """Return the minimal entropy which is expected for random data of the given size.

    Args:
        x: size of the data

    Return:
        Precalculated value when the size is in MIN_DATA_ENTROPY, an approximation for the other
        sizes between 8 and 512, and 0 for all remaining sizes
    """
    precalculated = Util.MIN_DATA_ENTROPY.get(x)
    if precalculated is not None:
        return precalculated
    if 8 < x < 64:
        # approximated for range 12 - 64
        shifted = x - 8
        return ((0.000016617804 * shifted - 0.002695077) * shifted + 0.170393) * shifted + 0.4
    if 64 < x < 384:
        # logarithm base 2 - slow, but precise
        return 1.095884 * math.log2(x - 8) - 1.90156
    if 384 < x < 512:
        # solved for 384 - 512
        log_x = math.log2(x)
        return -0.11215851 * log_x**2 + 2.34303484 * log_x - 4.4466237
    # less or equal to 8 bytes might have 0 entropy
    return 0
@staticmethod
def is_ascii_entropy_validate(data: bytes) -> bool:
"""
Tests small data sequence (<256) for data randomness by testing for ascii and shannon entropy
Returns True when data is an ASCII symbols or have small entropy
"""
if not data:
return True
data_len = len(data)
if 9 > data_len:
# even random data may have 0 entropy for length of 8 bytes and less
return True
entropy = 0.
cells = [int(0)] * 256
ascii_test = True
# "basket" sorting approach
for x in data:
cells[x] += 1
if ascii_test and 0b10000000 & x:
ascii_test = False
if ascii_test:
# only ascii symbols found
return True
left = 0.
step = 256.0 / data_len
right = left + step
while left < 256:
cell_sum = 0
i = int(left)
r = int(right)
while i < r and i < 256:
cell_sum += cells[i]
i += 1
p_x = float(cell_sum) / data_len
if p_x > 0:
entropy += -p_x * math.log2(p_x)
left = right
right += step
min_entropy = Util.get_min_data_entropy(data_len)
return entropy < min_entropy
@staticmethod
def is_binary(data: Union[bytes, bytearray]) -> bool:
    """Tell whether two zero bytes in a row are found within the first MAX_LINE_LENGTH bytes.

    The sequence never exists in text format (UTF-8, UTF-16). UTF-32 is not supported.

    Args:
        data: raw data to be tested

    Return:
        True when the sequence is found
    """
    return data.find(b"\0\0", 0, MAX_LINE_LENGTH) >= 0
# byte values which are neither printable ASCII, nor ESC (0x1B), nor in the range 0xA0 - 0xFF
NOT_LATIN1_PRINTABLE_SET = (set(range(256))
                            - set(string.printable.encode(ASCII))
                            - {0x1B}
                            - set(range(0xA0, 0x100)))
@staticmethod
def is_latin1(data: Union[bytes, bytearray]) -> bool:
"""Returns True when data looks like LATIN-1 for first MAX_LINE_LENGTH bytes."""
result = False
if data:
non_latin1_cnt = sum(1 for x in data[:MAX_LINE_LENGTH] if x in Util.NOT_LATIN1_PRINTABLE_SET)
# experiment for 255217 binary files shown avg = 0.268264 ± 0.168767, so let choose minimal
chunk_len = min(MAX_LINE_LENGTH, len(data))
result = bool(0.1 > non_latin1_cnt / chunk_len)
return result
@staticmethod
def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) -> List[str]:
    """Read the file and decode its content to lines.

    The bytes of the file are decoded with the first suitable encoding from "encodings".

    Args:
        path: path to file
        encodings: supported encodings

    Return:
        list of file rows in a suitable encoding from "encodings",
        an empty list when the file is empty, was not read, or none of the encodings match

    """
    data = Util.read_data(path)
    if not data:
        return []
    return Util.decode_bytes(data, encodings)
@staticmethod
def decode_text(content: Optional[bytes], encodings: Optional[List[str]] = None) -> Optional[str]:
    """Decode content using different encodings.

    The encodings are tried one by one until the content is decoded without an exception and
    the decoded text is encoded back to the same bytes.
    When "encodings" is not given, UTF-16 is selected by BOM or by a zero byte at the first or
    the second position; otherwise AVAILABLE_ENCODINGS are used.

    Args:
        content: raw data that might be text
        encodings: supported encodings

    Return:
        Decoded text in str for any suitable encoding
        or None when binary data detected

    """
    if content is None:
        return None

    # becomes True after any failed attempt, then LATIN-1 is accepted only for text-like data
    binary_suggest = False

    if encodings:
        # use exactly defined encodings
        _encodings = encodings
    elif content.startswith(b"\xFF\xFE") or 1 < len(content) and 0 == content[1]:
        # little endian BOM or zero in the second byte
        _encodings = [UTF_16_LE]
    elif content.startswith(b"\xFE\xFF") or content.startswith(b'\x00'):
        # big endian BOM or zero in the first byte
        _encodings = [UTF_16_BE]
    else:
        _encodings = AVAILABLE_ENCODINGS
    for encoding in _encodings:
        try:
            if binary_suggest and LATIN_1 == encoding and (Util.is_binary(content) or not Util.is_latin1(content)):
                # LATIN_1 may convert data (bytes in range 0x80:0xFF are transformed)
                break
            text = content.decode(encoding=encoding, errors="strict")
            if content != text.encode(encoding=encoding, errors="strict"):
                # the refurbish test helps to detect a real encoding
                binary_suggest = True
                continue
            # the case decoding is good
            if UTF_16_LE == encoding or UTF_16_BE == encoding:
                # BOM is not a part of the text
                return text.lstrip('\uFEFF')
            return text
        except UnicodeError:
            binary_suggest = True
            logger.debug("UnicodeError: Can't decode content as %s.", encoding)
        except Exception as exc:
            logger.error("Unexpected Error: Can't read content as %s. Error message: %s", encoding, exc)
    return None
@staticmethod
def split_text(text: str) -> List[str]:
"""Splits a text into lines, handling all common line endings (e.g., LF, CRLF, CR)."""
return text.replace("\r\n", '\n').replace('\r', '\n').split('\n')
@staticmethod
def decode_bytes(content: Optional[bytes], encodings: Optional[List[str]] = None) -> List[str]:
    """Decode content with decode_text() and split the text to lines.

    Args:
        content: raw data that might be text
        encodings: supported encodings

    Return:
        list of rows in a suitable encoding from "encodings",
        an empty list when the content was not decoded or the decoded text is empty

    """
    text = Util.decode_text(content, encodings)
    return Util.split_text(text) if text else []
@staticmethod
def get_asn1_size(data: Union[bytes, bytearray]) -> int:
"""Only sequence type 0x30 and size correctness are checked
Returns size of ASN1 data over 128 bytes or 0 if no interested data
"""
if isinstance(data, (bytes, bytearray)) and 2 <= len(data) and 0x30 == data[0]:
# https://www.oss.com/asn1/resources/asn1-made-simple/asn1-quick-reference/basic-encoding-rules.html#Lengths
length = data[1]
if 0x80 == length:
if data.endswith(b"\x00\x00"):
# assume, all data are ASN1 of various size
return len(data)
# else - skip the case where the ASN1 size is smaller than the actual data
elif 0x80 < length:
byte_len = 0x7F & length
len_limit = 2 + byte_len
if 4 >= byte_len and len(data) >= len_limit:
length = 0
for i in range(2, len_limit):
length <<= 8
length |= data[i]
if len(data) >= length + len_limit:
return length + len_limit
# else - unsupported huge size
else:
# length is less than 0x80
if len(data) >= length + 2:
return length + 2
# fallback - unsupported
return 0
@staticmethod
def read_data(path: Union[str, Path]) -> Optional[bytes]:
"""Read the file bytes as is.
Try to read the data of the file.
Args:
path: path to file
Return:
list of file rows in a suitable encoding from "encodings",
if none of the encodings match, an empty list will be returned
"""
try:
with open(path, "rb") as file:
return file.read()
except Exception as exc:
logger.error("Unexpected Error: Can not read '%s'. Error message: '%s'", path, exc)
return None
@staticmethod
def get_xml_from_lines(xml_lines: List[str]) -> Tuple[Optional[List[str]], Optional[List[int]]]:
    """Parse xml data from a list of strings and describe every element with a line of text.

    Args:
        xml_lines: list of lines of xml data

    Return:
        Tuple of two lists of the same size: formatted strings (f"{tag} : {text}") of all elements
        and the source line numbers of the elements

    Raises:
        xml exception

    """
    root = etree.fromstringlist(xml_lines)
    lines = []
    line_nums = []
    for node in root.iter():
        node_tag = Util.extract_element_data(node, "tag")
        node_text = Util.extract_element_data(node, "text")
        line_nums.append(node.sourceline)
        lines.append(f"{node_tag} : {node_text}")
    return lines, line_nums
@staticmethod
def extract_element_data(element: Any, attr: str) -> str:
"""Extract xml element data to string.
Try to extract the xml data and strip() the string.
Args:
element: xml element
attr: attribute name
Return:
String xml data with strip()
"""
element_attr: Any = getattr(element, attr)
if element_attr is None or not isinstance(element_attr, str):
return ''
return str(element_attr).strip()
@staticmethod
def json_load(file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> Any:
    """Load data from a JSON file.

    Args:
        file_path: path to the JSON file
        encoding: encoding of the file

    Return:
        The parsed content, or None when the file was not read or parsed - the error is logged
    """
    try:
        with open(file_path, "r", encoding=encoding) as f:
            return json.load(f)
    except Exception as exc:
        # use the module logger instead of the root one, so the record follows the package settings
        logger.error("Failed to read: %s %s", file_path, exc)
    return None
@staticmethod
def json_dump(obj: Any, file_path: Union[str, Path], encoding=DEFAULT_ENCODING, indent=4) -> None:
    """Write an object to a JSON file.

    Args:
        obj: the object to be serialized
        file_path: path to the output file
        encoding: encoding of the file
        indent: indent of the JSON text

    The errors are logged and not raised.
    """
    try:
        with open(file_path, "w", encoding=encoding) as f:
            json.dump(obj, f, indent=indent)
    except Exception as exc:
        # use the module logger instead of the root one, so the record follows the package settings
        logger.error("Failed to write: %s %s", file_path, exc)
@staticmethod
def yaml_load(file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> Any:
    """Load data from a YAML file with the safe loader.

    Args:
        file_path: path to the YAML file
        encoding: encoding of the file

    Return:
        The parsed content, or None when the file was not read or parsed - the error is logged
    """
    loaded = None
    try:
        with open(file_path, "r", encoding=encoding) as stream:
            loaded = yaml.safe_load(stream)
    except Exception as exc:
        logger.error("Failed to read %s %s", file_path, exc)
    return loaded
@staticmethod
def yaml_dump(obj: Any, file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> None:
    """Write an object to a YAML file.

    Args:
        obj: the object to be serialized
        file_path: path to the output file
        encoding: encoding of the file

    The errors are logged and not raised.
    """
    try:
        with open(file_path, "w", encoding=encoding) as f:
            yaml.dump(obj, f)
    except Exception as exc:
        # use the module logger instead of the root one, so the record follows the package settings
        logger.error("Failed to write: %s %s", file_path, exc)
@staticmethod
def parse_python(source: str) -> List[Any]:
"""Parse Python source and back to remove strings merge and line wrap"""
with warnings.catch_warnings(record=True):
warnings.simplefilter("error", SyntaxWarning)
src = ast.parse(source)
result = ast.unparse(src).splitlines()
return result
# matches a backslash followed by one of the letters t, n, r, v, f - i.e. an escape sequence written as text
PEM_CLEANING_PATTERN = re.compile(r"\\[tnrvf]")
# translation table which deletes all symbols of string.whitespace; applied in decode_base64()
WHITESPACE_TRANS_TABLE = str.maketrans('', '', string.whitespace)
@staticmethod
def decode_base64(text: str, padding_safe: bool = False, urlsafe_detect=False) -> bytes:
    """Decode base64 text to bytes with optional padding correction and urlsafe alphabet detection.

    Args:
        text: base64 encoded text; whitespaces are removed before decoding
        padding_safe: replace the existing padding with a recalculated one
        urlsafe_detect: use the urlsafe alphabet when '-' or '_' is found in the text

    Return:
        Decoded bytes

    Raises:
        binascii.Error when the text is not a valid base64
    """
    compact = text.translate(Util.WHITESPACE_TRANS_TABLE)
    if padding_safe:
        # workaround for binascii.Error: Excess padding not allowed
        compact = compact.rstrip('=')
        remainder = len(compact) & 0x3
        if remainder:
            compact += '=' * (4 - remainder)
    urlsafe = urlsafe_detect and ('-' in compact or '_' in compact)
    if urlsafe:
        return base64.b64decode(compact, altchars=b"-_", validate=True)
    return base64.b64decode(compact, validate=True)
@staticmethod
def load_pk(data: bytes, password: Optional[bytes] = None) -> Optional[PrivateKeyTypes]:
    """Try to load a private key from PKCS1, PKCS8 and PKCS12 formats.

    Args:
        data: raw data of the key
        password: password for the key if necessary

    Return:
        The private key, or None when the data was not loaded with any of the formats
    """
    # PKCS1, PKCS8 probes
    with contextlib.suppress(Exception):
        return load_der_private_key(data, password)
    # PKCS12 probe
    with contextlib.suppress(Exception):
        key, _cert, _extra_certs = load_key_and_certificates(data, password)
        return key
    return None
# 20 random bytes, generated once; check_pk() encrypts and decrypts them to verify an RSA key
RANDOM_DATA = random.randbytes(20)
@staticmethod
def check_pk(pkey: PrivateKeyTypes) -> bool:
    """Check a private key; an RSA key is verified with encryption and decryption of random data.

    Args:
        pkey: the key to be checked

    Return:
        False for an empty value, a public key or an unknown type; True for the private keys
        which are not verified; the result of the encrypt-decrypt test for an RSA private key
    """
    public_key_types = (EllipticCurvePublicKey, DSAPublicKey, Ed448PublicKey, Ed25519PublicKey, DHPublicKey,
                        X448PublicKey, X25519PublicKey)
    if not pkey or isinstance(pkey, public_key_types):
        # These aren't the keys we're looking for
        return False

    unverified_private_types = (EllipticCurvePrivateKey, DSAPrivateKey, Ed448PrivateKey, Ed25519PrivateKey,
                                DHPrivateKey, X448PrivateKey, X25519PrivateKey)
    if isinstance(pkey, unverified_private_types):
        # One does not simply perform check the keys
        return True

    if not isinstance(pkey, RSAPrivateKey):
        logger.warning("Unknown private key type: %s", type(pkey))
        return False

    oaep = padding.OAEP(mgf=padding.MGF1(algorithm=hashes.SHA1()), algorithm=hashes.SHA1(), label=None)
    encrypted = pkey.public_key().encrypt(Util.RANDOM_DATA, padding=oaep)
    restored = pkey.decrypt(encrypted, padding=oaep)
    return bool(restored == Util.RANDOM_DATA)
@staticmethod
def get_chunks(line_len: int) -> List[Tuple[int, int]]:
    """Calculate positions of the chunks to scan an oversize line.

    Args:
        line_len: length of the line; it is over MAX_LINE_LENGTH already

    Return:
        List of (start, end) positions. The chunks start with the step CHUNK_STEP_SIZE and have
        the size CHUNK_SIZE, except for the last one which ends at line_len
    """
    offset = CHUNK_STEP_SIZE
    result = [(0, CHUNK_SIZE)]
    # the target is too long for single "finditer" - it will be scanned by chunks
    while offset < line_len:
        if line_len - offset <= MAX_LINE_LENGTH:
            # the tail of line is between CHUNK_SIZE and MAX_LINE_LENGTH
            result.append((offset, line_len))
            break
        # the chunk is not the before last
        result.append((offset, offset + CHUNK_SIZE))
        offset += CHUNK_STEP_SIZE
    return result
@staticmethod
def subtext(text: str, pos: int, hunk_size: int) -> str:
"""cut text symmetrically for given position or use remained quota to be fitted in 2x hunk_size"""
# cut trailed whitespaces to obtain more informative data
text = text.rstrip()
if hunk_size <= pos:
left_quota = 0
left_pos = pos - hunk_size
else:
left_quota = hunk_size - pos
left_pos = 0
# skip leading whitespaces in result string
for i in range(left_pos, pos):
if text[i] in string.whitespace:
left_quota += 1
left_pos += 1
else:
break
right_remain = len(text) - pos
if hunk_size <= right_remain:
right_quota = 0
right_pos = pos + hunk_size + left_quota
else:
right_quota = hunk_size - right_remain
right_pos = pos + hunk_size + left_quota
if len(text) < right_pos:
right_pos = len(text)
if 0 < left_pos:
left_pos -= right_quota
if 0 > left_pos:
left_pos = 0
return text[left_pos:right_pos].rstrip()
@staticmethod
def get_excel_column_name(column_index: int) -> str:
"""Converts index based column position into Excel style column name"""
name = ''
if isinstance(column_index, int):
while 0 <= column_index:
column_index, remain = divmod(column_index, 26)
name = f"{chr(ord('A') + remain)}{name}"
column_index -= 1
return name
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
# NOTE(review): "-v -W" presumably means verbose output with warnings treated as errors - confirm
# against the sphinx-build options.
SPHINXOPTS ?= -v -W
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# "help" and "Makefile" are not files to be built
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/README.md
================================================
# Documentation of CredSweeper
This directory contains the documentation of CredSweeper, which is built with [sphinx](https://www.sphinx-doc.org/en/master/).
## Workflow
The documentation is customized, so auto-generation might fail. Please use the auto-generated sources only as a reference.
New sources can be generated with the following command (run it in the /docs directory):
```bash
sphinx-apidoc --force --full --ext-autodoc ../credsweeper -o source/
```
Edit, then check with command:
```bash
make html
```
================================================
FILE: docs/howto/how-to-contribute.md
================================================
# Contributing
Thank you for your interest in contributing to the CredSweeper tool!
The document covers the process for contributing to the CredSweeper code and documentation. Contributions may be as simple as typo corrections or as complex as new features.
1. [Process for contributing](#process-for-contributing)
1. [Repository structure](#repository-structure)
2. [File Name](#file-name)
3. [Self Test & Verification](#self-test-and-verification)
2. [How to PR](#how-to-pr)
3. [DOs and DON'Ts](#dos-and-donts)
## Process for contributing
You need a basic understanding of [Git and GitHub.com](https://guides.github.com/activities/hello-world/).
**Step 1:** You can skip this step for small changes such as typo corrections. Open a [new issue](https://github.com/Samsung/CredSweeper/issues/new) describing what you want to do, such as changing existing code or functionality, or creating something new.
You can also look at our [issues](https://github.com/Samsung/CredSweeper/issues) list and volunteer to work on the ones you're interested in.
**Step 2:** Fork the [Samsung/CredSweeper](https://github.com/Samsung/CredSweeper/fork) repo and create a branch for your changes.
For small changes, you can use GitHub's web interface. Simply click the **Edit the file in your fork of this project** on the file you'd like to change.
GitHub creates the new branch for you when you submit the changes.
VCS(git) requirement: the branch MUST be forked after latest release.
**Step 3:** Make the changes on this new branch.
Be sure to follow the proper Python syntax. For more information, see the [style guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md).
Use pre-commit hook with [yapf config file](https://github.com/Samsung/CredSweeper/blob/main/.style.yapf).
### Repository structure
All new filters or other features should be located in the appropriate directories. Also, for all new functionality, you need to create new positive and negative tests in the appropriate file and directory in ./tests/
### File name
File names use the following rules:
- Contain only lowercase letters, numbers, and underlines.
- No spaces or punctuation characters. Use the underlines to separate words and numbers in the file name.
- Use action verbs that are specific, such as develop, buy, build, troubleshoot. No -ing words.
- No small words - don't include a, and, the, in, or, etc.
- Keep file names reasonably short.
### Self Test and Verification
After updating the CredSweeper code, please verify that your change doesn't break the library. We suggest running the unit tests with pytest. You can easily run them with:
```bash
python -m pytest
```
Please make sure that all tests are run and none of them fails.
Run all Actions in your fork before submitting the PR to the upstream and ensure all CI checks pass.
**Step 4:** Submit a Pull Request (PR) from your branch to `Samsung/CredSweeper/main`.
Each PR should usually address one issue at a time. The PR can modify one or multiple files. If you're addressing multiple fixes on different files, separate PRs are preferred.
If your PR is addressing an existing issue, add the `Fixes #Issue_Number` keyword to the commit message or PR description. That way, the issue is automatically closed when the PR is merged. For more information, see [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages/).
The CredSweeper team will review your PR and let you know if there are any other updates/changes necessary in order to approve it.
**Step 5:** Make any necessary updates to your branch as discussed with the team.
The maintainers will merge your PR into the main branch once feedback has been applied and your change is approved.
### How to PR
1. Fork from the original repository, https://github.com/Samsung/CredSweeper.
(Ref. https://help.github.com/articles/fork-a-repo/)
2. Type `git clone`, and then paste the URL you copied in 1. It will look like this, with your GitHub username instead of `YOUR-USERNAME`:
```bash
git clone https://github.com/YOUR-USERNAME/CredSweeper.git
```
3. Set to synchronize the original repository and the forked repository.
```bash
git remote -v
git remote add upstream https://github.com/Samsung/CredSweeper.git
git remote -v
```
4. Create a new branch on the forked repository or the local repository,
and switch to the new branch.
```bash
git checkout -b <new-branch-name>
```
5. Install Yapf as a pre-commit hook with
``` bash
pip install pre-commit
pre-commit install
```
6. Create a local commit.
```bash
git status
git add <files>
git commit -a
```
7. Push the branch
```bash
git push origin <new-branch-name>
```
8. Open a pull request on https://github.com/Samsung/CredSweeper.
All tests and checks MUST be passed
- Codestyle check
- Static analysis
- Unit tests
> - Development tests - use only linux and compatible version of packages. Code coverage is checked without test_app.py.
> - Release tests - use Linux, Mac, Windows platform without version limitation.
- Dynamic analysis (fuzzing)
> The Atheris framework is used to fuzz various inputs. Code coverage is checked. If the coverage is not satisfied, do new fuzzing or refactor the fuzzer.
- Benchmark
> If your PR changes benchmark scores - the scores MUST be updated (cicd/benchmark.txt)
9. Verify ActionTest after merge.
> The test verifies integration CredSweeper to github action and points to main branch of main repo.
## DOs and DON'Ts
The following list shows some guiding rules that you should keep in mind when you're contributing to the CredSweeper:
- **DON'T** surprise us with large pull requests. Instead, file an issue and start a discussion so we can agree on a direction before you invest a large amount of time.
- **DO** read the [style guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md).
- **DO** create a separate branch on your fork before working on the changes.
- **DO** follow the [GitHub Flow workflow](https://guides.github.com/introduction/flow/).
- **DO** blog and tweet (or whatever) about your contributions, frequently!
> **Note**
>
> You might notice that some of the topics do not currently follow all the guidelines specified here and in the [style guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md). We're working towards achieving consistency throughout the tool. Check the list of [open issues](https://github.com/Samsung/CredSweeper/issues?q=is%3Aissue+is%3Aopen) we're currently tracking for that specific goal.
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Windows entry point for building the Sphinx documentation.
REM The first argument is forwarded to "sphinx-build -M"; with no argument
REM the script jumps to :help and runs "sphinx-build -M help" instead.
REM Switch to the directory containing this script; undone by popd at :end.
pushd %~dp0
REM Command file for Sphinx documentation
REM Use "sphinx-build" unless the caller already set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
REM Probe run with all output discarded, used only to detect a missing command.
%SPHINXBUILD% >NUL 2>NUL
REM "if errorlevel N" is true for exit codes >= N; here it reports that the
REM sphinx-build command could not be found and aborts with exit code 1.
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
REM Run the requested target; SPHINXOPTS and O carry optional extra arguments.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
================================================
FILE: docs/requirements.txt
================================================
myst_parser[linkify]==4.0.1
sphinx==8.1.3
sphinx_rtd_theme==3.0.2
================================================
FILE: docs/source/api.rst
================================================
API
===
This part of the documentation covers all the interfaces of CredSweeper.
.. toctree::
:maxdepth: 2
credsweeper
credsweeper.common.rst
credsweeper.config.rst
credsweeper.credentials.rst
credsweeper.deep_scanner.rst
credsweeper.file_handler.rst
credsweeper.filters.group.rst
credsweeper.filters.rst
credsweeper.logger.rst
credsweeper.ml_model.rst
credsweeper.rules.rst
credsweeper.scanner.rst
credsweeper.scanner.scan_type.rst
credsweeper.utils.rst
================================================
FILE: docs/source/apps_config.rst
================================================
CredSweeper Credential Analyzer Configuration
=============================================
.. literalinclude:: ../../credsweeper/secret/config.json
:language: json
:linenos:
================================================
FILE: docs/source/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import os
import sys
# Resolve every location from this file's own path rather than from the
# current working directory, so imports work no matter where the build is
# started from.
source_dir = os.path.dirname(os.path.abspath(__file__))
docs_dir = os.path.dirname(source_dir)
repo_root = os.path.dirname(docs_dir)

# Make the top-level ``credsweeper`` package importable for autodoc.
sys.path.insert(0, repo_root)

# Also expose every directory inside the package.
# NOTE: do not derive the repository root with ``str.strip('docs')``: strip()
# removes any of the characters 'd', 'o', 'c', 's' from both ends of the
# string, not the literal "docs" suffix, and joining the result without a
# path separator only works when the path happens to end in "/docs".
for package_dir, _subdirs, _files in os.walk(os.path.join(repo_root, 'credsweeper')):
    sys.path.append(package_dir)
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "CredSweeper"
copyright = "2026, Samsung CredTeam"
author = "CredTeam"

# The version shown in the docs is read from the installed/importable package.
from credsweeper import __version__ as credsweeper_version

# Full version string, including any alpha/beta/rc tags.
release = credsweeper_version
# Short form: the first two dot-separated components of the version string.
version = ".".join(credsweeper_version.split(".")[:2])

# Document that contains the root toctree.
master_doc = "index"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
# Sphinx extensions to load, given as module names: the ones bundled with
# Sphinx ('sphinx.ext.*') followed by the MyST Markdown parser.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.viewcode",
    "sphinx.ext.todo",
    "sphinx.ext.napoleon",
    "myst_parser",
]

# External projects whose object inventories can be cross-referenced.
intersphinx_mapping = dict(
    python=("https://docs.python.org/3/", None),
    numpy=("https://numpy.org/doc/stable/", None),
)

# Optional MyST syntax extensions that are switched on.
myst_enable_extensions = [
    "amsmath",
    "attrs_inline",
    "colon_fence",
    "deflist",
    "dollarmath",
    "fieldlist",
    "html_admonition",
    "html_image",
    "linkify",
    "replacements",
    "smartquotes",
    "strikethrough",
    "substitution",
    "tasklist",
]

# File suffix -> parser; both '.txt' and '.md' are treated as Markdown.
source_suffix = {
    ".rst": "restructuredtext",
    **dict.fromkeys((".txt", ".md"), "markdown"),
}

myst_heading_anchors = 3

# Directories with templates, relative to this directory.
templates_path = ["_templates"]

# Patterns (relative to the source directory) of files and directories that
# are ignored when looking for source files. These patterns also affect
# html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# Language of the content autogenerated by Sphinx. It is also used for
# content translation via gettext catalogs, where it is usually set from
# the command line.
language = "en"

# Pygments (syntax highlighting) style.
pygments_style = "sphinx"

# When true, `todo` and `todoList` produce output; otherwise they produce nothing.
# https://www.sphinx-doc.org/en/master/usage/extensions/todo.html#configuration
todo_include_todos = True
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_rtd_theme"

# Directories with custom static files (such as style sheets), relative to
# this directory. They are copied after the builtin static files, so a file
# named "default.css" overwrites the builtin "default.css".
html_static_path = ["_static"]

html_theme_options = dict(
    logo_only=True,
    navigation_depth=3,
)

# Image placed at the top of the sidebar.
html_logo = "https://raw.githubusercontent.com/Samsung/CredSweeper/main/docs/images/Logo.png"
html_scaled_image_link = False

# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for the HTML help builder.
htmlhelp_basename = "ci_doc"

# -- Suppress highlighting-failure warnings (e.g. unusual parts in JSON).
suppress_warnings = ["misc.highlighting_failure"]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
# Single manual page entry:
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, "ci", "CI Documentation", [author], 1),
]

# Append the parent of this file's directory to the import path.
sys.path.append(f"{os.path.dirname(__file__)}/..")
================================================
FILE: docs/source/credsweeper.common.rst
================================================
credsweeper.common package
==========================
Submodules
----------
credsweeper.common.constants module
-----------------------------------
.. automodule:: credsweeper.common.constants
:members:
:undoc-members:
:show-inheritance:
credsweeper.common.keyword\_checklist module
--------------------------------------------
.. automodule:: credsweeper.common.keyword_checklist
:members:
:undoc-members:
:show-inheritance:
credsweeper.common.keyword\_pattern module
------------------------------------------
.. automodule:: credsweeper.common.keyword_pattern
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.common
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.config.rst
================================================
credsweeper.config package
==========================
Submodules
----------
credsweeper.config.config module
--------------------------------
.. automodule:: credsweeper.config.config
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.config
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.credentials.rst
================================================
credsweeper.credentials package
===============================
Submodules
----------
credsweeper.credentials.augment\_candidates module
--------------------------------------------------
.. automodule:: credsweeper.credentials.augment_candidates
:members:
:undoc-members:
:show-inheritance:
credsweeper.credentials.candidate module
----------------------------------------
.. automodule:: credsweeper.credentials.candidate
:members:
:undoc-members:
:show-inheritance:
credsweeper.credentials.candidate\_group\_generator module
----------------------------------------------------------
.. automodule:: credsweeper.credentials.candidate_group_generator
:members:
:undoc-members:
:show-inheritance:
credsweeper.credentials.candidate\_key module
---------------------------------------------
.. automodule:: credsweeper.credentials.candidate_key
:members:
:undoc-members:
:show-inheritance:
credsweeper.credentials.credential\_manager module
--------------------------------------------------
.. automodule:: credsweeper.credentials.credential_manager
:members:
:undoc-members:
:show-inheritance:
credsweeper.credentials.line\_data module
-----------------------------------------
.. automodule:: credsweeper.credentials.line_data
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.credentials
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.deep_scanner.rst
================================================
credsweeper.deep\_scanner package
=================================
Submodules
----------
credsweeper.deep\_scanner.abstract\_scanner module
--------------------------------------------------
.. automodule:: credsweeper.deep_scanner.abstract_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.byte\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.byte_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.bzip2\_scanner module
-----------------------------------------------
.. automodule:: credsweeper.deep_scanner.bzip2_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.crx\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.crx_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.csv\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.csv_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.deb\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.deb_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.deep\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.deep_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.docx\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.docx_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.eml\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.eml_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.encoder\_scanner module
-------------------------------------------------
.. automodule:: credsweeper.deep_scanner.encoder_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.gzip\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.gzip_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.html\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.html_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.jclass\_scanner module
------------------------------------------------
.. automodule:: credsweeper.deep_scanner.jclass_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.jks\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.jks_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.lang\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.lang_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.lzma\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.lzma_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.mxfile\_scanner module
------------------------------------------------
.. automodule:: credsweeper.deep_scanner.mxfile_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.patch\_scanner module
-----------------------------------------------
.. automodule:: credsweeper.deep_scanner.patch_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.pdf\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.pdf_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.pkcs\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.pkcs_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.png\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.png_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.pptx\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.pptx_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.rpm\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.rpm_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.rtf\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.rtf_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.sqlite3\_scanner module
-------------------------------------------------
.. automodule:: credsweeper.deep_scanner.sqlite3_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.strings\_scanner module
-------------------------------------------------
.. automodule:: credsweeper.deep_scanner.strings_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.tar\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.tar_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.tmx\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.tmx_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.xlsx\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.xlsx_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.xml\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.xml_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.zip\_scanner module
---------------------------------------------
.. automodule:: credsweeper.deep_scanner.zip_scanner
:members:
:undoc-members:
:show-inheritance:
credsweeper.deep\_scanner.zlib\_scanner module
----------------------------------------------
.. automodule:: credsweeper.deep_scanner.zlib_scanner
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.deep_scanner
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.file_handler.rst
================================================
credsweeper.file\_handler package
=================================
Submodules
----------
credsweeper.file\_handler.abstract\_provider module
---------------------------------------------------
.. automodule:: credsweeper.file_handler.abstract_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.analysis\_target module
-------------------------------------------------
.. automodule:: credsweeper.file_handler.analysis_target
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.byte\_content\_provider module
--------------------------------------------------------
.. automodule:: credsweeper.file_handler.byte_content_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.content\_provider module
--------------------------------------------------
.. automodule:: credsweeper.file_handler.content_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.data\_content\_provider module
--------------------------------------------------------
.. automodule:: credsweeper.file_handler.data_content_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.descriptor module
-------------------------------------------
.. automodule:: credsweeper.file_handler.descriptor
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.diff\_content\_provider module
--------------------------------------------------------
.. automodule:: credsweeper.file_handler.diff_content_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.file\_path\_extractor module
------------------------------------------------------
.. automodule:: credsweeper.file_handler.file_path_extractor
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.files\_provider module
------------------------------------------------
.. automodule:: credsweeper.file_handler.files_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.patches\_provider module
--------------------------------------------------
.. automodule:: credsweeper.file_handler.patches_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.string\_content\_provider module
----------------------------------------------------------
.. automodule:: credsweeper.file_handler.string_content_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.struct\_content\_provider module
----------------------------------------------------------
.. automodule:: credsweeper.file_handler.struct_content_provider
:members:
:undoc-members:
:show-inheritance:
credsweeper.file\_handler.text\_content\_provider module
--------------------------------------------------------
.. automodule:: credsweeper.file_handler.text_content_provider
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.file_handler
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.filters.group.rst
================================================
credsweeper.filters.group package
=================================
Submodules
----------
credsweeper.filters.group.general\_keyword module
-------------------------------------------------
.. automodule:: credsweeper.filters.group.general_keyword
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.general\_pattern module
-------------------------------------------------
.. automodule:: credsweeper.filters.group.general_pattern
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.group module
--------------------------------------
.. automodule:: credsweeper.filters.group.group
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.password\_keyword module
--------------------------------------------------
.. automodule:: credsweeper.filters.group.password_keyword
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.token\_pattern module
-----------------------------------------------
.. automodule:: credsweeper.filters.group.token_pattern
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.url\_credentials\_group module
--------------------------------------------------------
.. automodule:: credsweeper.filters.group.url_credentials_group
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.weird\_base36\_token module
-----------------------------------------------------
.. automodule:: credsweeper.filters.group.weird_base36_token
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.group.weird\_base64\_token module
-----------------------------------------------------
.. automodule:: credsweeper.filters.group.weird_base64_token
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.filters.group
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.filters.rst
================================================
credsweeper.filters package
===========================
Subpackages
-----------
.. toctree::
:maxdepth: 4
credsweeper.filters.group
Submodules
----------
credsweeper.filters.filter module
---------------------------------
.. automodule:: credsweeper.filters.filter
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.line\_git\_binary\_check module
---------------------------------------------------
.. automodule:: credsweeper.filters.line_git_binary_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.line\_specific\_key\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.line_specific_key_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.line\_uue\_part\_check module
-------------------------------------------------
.. automodule:: credsweeper.filters.line_uue_part_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_allowlist\_check module
--------------------------------------------------
.. automodule:: credsweeper.filters.value_allowlist_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_array\_dictionary\_check module
----------------------------------------------------------
.. automodule:: credsweeper.filters.value_array_dictionary_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_atlassian\_token\_check module
---------------------------------------------------------
.. automodule:: credsweeper.filters.value_atlassian_token_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_azure\_token\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_azure_token_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_base32\_data\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_base32_data_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_base64\_data\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_base64_data_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_base64\_encoded\_pem\_check module
-------------------------------------------------------------
.. automodule:: credsweeper.filters.value_base64_encoded_pem_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_base64\_key\_check module
----------------------------------------------------
.. automodule:: credsweeper.filters.value_base64_key_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_base64\_part\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_base64_part_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_basic\_auth\_check module
----------------------------------------------------
.. automodule:: credsweeper.filters.value_basic_auth_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_blocklist\_check module
--------------------------------------------------
.. automodule:: credsweeper.filters.value_blocklist_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_camel\_case\_check module
----------------------------------------------------
.. automodule:: credsweeper.filters.value_camel_case_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_dictionary\_keyword\_check module
------------------------------------------------------------
.. automodule:: credsweeper.filters.value_dictionary_keyword_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_discord\_bot\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_discord_bot_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_entropy\_base32\_check module
--------------------------------------------------------
.. automodule:: credsweeper.filters.value_entropy_base32_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_entropy\_base36\_check module
--------------------------------------------------------
.. automodule:: credsweeper.filters.value_entropy_base36_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_entropy\_base64\_check module
--------------------------------------------------------
.. automodule:: credsweeper.filters.value_entropy_base64_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_entropy\_base\_check module
------------------------------------------------------
.. automodule:: credsweeper.filters.value_entropy_base_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_file\_path\_check module
---------------------------------------------------
.. automodule:: credsweeper.filters.value_file_path_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_github\_check module
-----------------------------------------------
.. automodule:: credsweeper.filters.value_github_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_grafana\_check module
------------------------------------------------
.. automodule:: credsweeper.filters.value_grafana_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_grafana\_service\_check module
---------------------------------------------------------
.. automodule:: credsweeper.filters.value_grafana_service_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_hex\_number\_check module
----------------------------------------------------
.. automodule:: credsweeper.filters.value_hex_number_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_jfrog\_token\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_jfrog_token_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_json\_web\_key\_check module
-------------------------------------------------------
.. automodule:: credsweeper.filters.value_json_web_key_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_json\_web\_token\_check module
---------------------------------------------------------
.. automodule:: credsweeper.filters.value_json_web_token_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_last\_word\_check module
---------------------------------------------------
.. automodule:: credsweeper.filters.value_last_word_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_length\_check module
-----------------------------------------------
.. automodule:: credsweeper.filters.value_length_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_method\_check module
-----------------------------------------------
.. automodule:: credsweeper.filters.value_method_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_morphemes\_check module
--------------------------------------------------
.. automodule:: credsweeper.filters.value_morphemes_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_not\_allowed\_pattern\_check module
--------------------------------------------------------------
.. automodule:: credsweeper.filters.value_not_allowed_pattern_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_not\_part\_encoded\_check module
-----------------------------------------------------------
.. automodule:: credsweeper.filters.value_not_part_encoded_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_number\_check module
-----------------------------------------------
.. automodule:: credsweeper.filters.value_number_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_pattern\_check module
------------------------------------------------
.. automodule:: credsweeper.filters.value_pattern_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_sealed\_secret\_check module
-------------------------------------------------------
.. automodule:: credsweeper.filters.value_sealed_secret_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_search\_check module
-----------------------------------------------
.. automodule:: credsweeper.filters.value_search_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_similarity\_check module
---------------------------------------------------
.. automodule:: credsweeper.filters.value_similarity_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_split\_keyword\_check module
-------------------------------------------------------
.. automodule:: credsweeper.filters.value_split_keyword_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_string\_type\_check module
-----------------------------------------------------
.. automodule:: credsweeper.filters.value_string_type_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_token\_base32\_check module
------------------------------------------------------
.. automodule:: credsweeper.filters.value_token_base32_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_token\_base36\_check module
------------------------------------------------------
.. automodule:: credsweeper.filters.value_token_base36_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_token\_base64\_check module
------------------------------------------------------
.. automodule:: credsweeper.filters.value_token_base64_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_token\_base\_check module
----------------------------------------------------
.. automodule:: credsweeper.filters.value_token_base_check
:members:
:undoc-members:
:show-inheritance:
credsweeper.filters.value\_token\_check module
----------------------------------------------
.. automodule:: credsweeper.filters.value_token_check
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.filters
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.logger.rst
================================================
credsweeper.logger package
==========================
Submodules
----------
credsweeper.logger.logger module
--------------------------------
.. automodule:: credsweeper.logger.logger
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.logger
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.ml_model.features.rst
================================================
credsweeper.ml\_model.features package
======================================
Submodules
----------
credsweeper.ml\_model.features.entropy\_evaluation module
---------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.entropy_evaluation
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.feature module
---------------------------------------------
.. automodule:: credsweeper.ml_model.features.feature
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.file\_extension module
-----------------------------------------------------
.. automodule:: credsweeper.ml_model.features.file_extension
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.has\_html\_tag module
----------------------------------------------------
.. automodule:: credsweeper.ml_model.features.has_html_tag
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.is\_secret\_numeric module
---------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.is_secret_numeric
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.length\_of\_attribute module
-----------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.length_of_attribute
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.morpheme\_dense module
-----------------------------------------------------
.. automodule:: credsweeper.ml_model.features.morpheme_dense
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.rule\_name module
------------------------------------------------
.. automodule:: credsweeper.ml_model.features.rule_name
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.rule\_severity module
----------------------------------------------------
.. automodule:: credsweeper.ml_model.features.rule_severity
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.search\_in\_attribute module
-----------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.search_in_attribute
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in module
----------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in\_path module
----------------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in_path
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in\_postamble module
---------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in_postamble
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in\_preamble module
--------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in_preamble
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in\_transition module
----------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in_transition
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in\_value module
-----------------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in_value
:members:
:undoc-members:
:show-inheritance:
credsweeper.ml\_model.features.word\_in\_variable module
--------------------------------------------------------
.. automodule:: credsweeper.ml_model.features.word_in_variable
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.ml_model.features
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.ml_model.rst
================================================
credsweeper.ml\_model package
=============================
Subpackages
-----------
.. toctree::
:maxdepth: 4
credsweeper.ml_model.features
Submodules
----------
credsweeper.ml\_model.ml\_validator module
------------------------------------------
.. automodule:: credsweeper.ml_model.ml_validator
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.ml_model
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.rst
================================================
Credsweeper package
===================
CredSweeper
-----------
.. toctree::
:maxdepth: 4
.. automodule:: credsweeper.app
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.rules.rst
================================================
credsweeper.rules package
=========================
Submodules
----------
credsweeper.rules.rule module
-----------------------------
.. automodule:: credsweeper.rules.rule
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.rules
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.scanner.rst
================================================
credsweeper.scanner package
===========================
Subpackages
-----------
.. toctree::
:maxdepth: 4
credsweeper.scanner.scan_type
Submodules
----------
credsweeper.scanner.scanner module
----------------------------------
.. automodule:: credsweeper.scanner.scanner
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.scanner
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.scanner.scan_type.rst
================================================
credsweeper.scanner.scan\_type package
======================================
Submodules
----------
credsweeper.scanner.scan\_type.multi\_pattern module
----------------------------------------------------
.. automodule:: credsweeper.scanner.scan_type.multi_pattern
:members:
:undoc-members:
:show-inheritance:
credsweeper.scanner.scan\_type.pem\_key\_pattern module
-------------------------------------------------------
.. automodule:: credsweeper.scanner.scan_type.pem_key_pattern
:members:
:undoc-members:
:show-inheritance:
credsweeper.scanner.scan\_type.scan\_type module
------------------------------------------------
.. automodule:: credsweeper.scanner.scan_type.scan_type
:members:
:undoc-members:
:show-inheritance:
credsweeper.scanner.scan\_type.single\_pattern module
-----------------------------------------------------
.. automodule:: credsweeper.scanner.scan_type.single_pattern
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.scanner.scan_type
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/credsweeper.utils.rst
================================================
credsweeper.utils package
=========================
Submodules
----------
credsweeper.utils.hop\_stat module
----------------------------------
.. automodule:: credsweeper.utils.hop_stat
:members:
:undoc-members:
:show-inheritance:
credsweeper.utils.pem\_key\_detector module
-------------------------------------------
.. automodule:: credsweeper.utils.pem_key_detector
:members:
:undoc-members:
:show-inheritance:
credsweeper.utils.util module
-----------------------------
.. automodule:: credsweeper.utils.util
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: credsweeper.utils
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/develop.rst
================================================
Develop
=======
Tests
-----
To run all tests:
.. code-block:: bash
python -m pytest -s tests/
Benchmark
---------
We have a dataset for testing credential scanners called `CredData <https://github.com/Samsung/CredData>`_. If you want to test CredSweeper with this dataset, please check the instructions `here <https://github.com/Samsung/CredData>`_.
================================================
FILE: docs/source/guide.rst
================================================
How To Use
==========
Run
---
Get all argument list:
.. code-block:: bash
python -m credsweeper --help
.. code-block:: text
usage: python -m credsweeper [-h]
(--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH] | --git PATH)
[--ref REF] [--rules PATH] [--severity SEVERITY]
[--config PATH] [--log_config PATH]
[--denylist PATH] [--find-by-ext]
[--pedantic | --no-pedantic]
[--depth POSITIVE_INT] [--no-filters] [--doc]
[--ml_threshold THRESHOLD_OR_FLOAT_OR_ZERO]
[--ml_batch_size POSITIVE_INT] [--ml_config PATH]
[--ml_model PATH] [--ml_providers STR]
[--jobs POSITIVE_INT] [--thrifty | --no-thrifty]
[--skip_ignored] [--error | --no-error]
[--save-json [PATH]] [--save-xlsx [PATH]]
[--stdout | --no-stdout] [--color | --no-color]
[--hashed | --no-hashed]
[--subtext | --no-subtext] [--sort | --no-sort]
[--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
[--banner] [--version]
options:
-h, --help show this help message and exit
--path PATH [PATH ...]
file or directory to scan
--diff_path PATH [PATH ...]
git diff file to scan
--export_config [PATH]
exporting default config to file (default:
config.json)
--export_log_config [PATH]
exporting default logger config to file (default:
log.yaml)
--git PATH git repo to scan
--ref REF scan git repo from the ref, otherwise - all branches
were scanned (slow)
--rules PATH path of rule config file (default:
credsweeper/rules/config.yaml). severity:['critical',
'high', 'medium', 'low', 'info'] type:['keyword',
'pattern', 'pem_key', 'multi']
--severity SEVERITY set minimum level for rules to apply ['critical',
'high', 'medium', 'low', 'info'](default:
'Severity.INFO', case insensitive)
--config PATH use custom config (default: built-in)
--log_config PATH use custom log config (default: built-in)
--denylist PATH path to a plain text file with lines or secrets to
ignore
--find-by-ext find files by predefined extension
--pedantic, --no-pedantic
process files without extension (default: False)
--depth POSITIVE_INT additional recursive search in data (experimental)
--no-filters disable filters
--doc document-specific scanning
--ml_threshold THRESHOLD_OR_FLOAT_OR_ZERO
setup threshold for the ml model. The lower the
threshold - the more credentials will be reported.
Allowed values: float between 0 and 1, or any of
['lowest', 'low', 'medium', 'high', 'highest']
(default: medium)
--ml_batch_size POSITIVE_INT, -b POSITIVE_INT
batch size for model inference (default: 16)
--ml_config PATH use external config for ml model
--ml_model PATH use external ml model
--ml_providers STR comma separated list of providers for onnx
(CPUExecutionProvider is used by default)
--jobs POSITIVE_INT, -j POSITIVE_INT
number of parallel processes to use (default: 1)
--thrifty, --no-thrifty
clear objects after scan to reduce memory consumption
(default: True)
--skip_ignored parse .gitignore files and skip credentials from
ignored objects
--error, --no-error produce error code if credentials are found (default:
False)
--save-json [PATH] save result to json file (default: output.json)
--save-xlsx [PATH] save result to xlsx file (default: output.xlsx)
--stdout, --no-stdout
print results to stdout (default: True)
--color, --no-color print results with colorization (default: False)
--hashed, --no-hashed
line, variable, value will be hashed in output
(default: False)
--subtext, --no-subtext
line text will be stripped in 128 symbols but value
and variable are kept (default: False)
--sort, --no-sort enable output sorting (default: False)
--log LOG_LEVEL, -l LOG_LEVEL
provide logging level of ['DEBUG', 'INFO', 'WARN',
'WARNING', 'ERROR', 'FATAL', 'CRITICAL', 'SILENCE']
(default: 'warning', case insensitive)
--size_limit SIZE_LIMIT
set size limit of files that for scanning (eg. 1GB /
10MiB / 1000)
--banner show version and crc32 sum of CredSweeper files at
start
--version, -V show program's version number and exit
.. note::
Validation by `ML model classifier `_ is used to reduce False Positives (by far), but might increase False negatives and execution time.
You may change system sensitivity by modifying --ml_threshold argument. Increasing threshold will decrease the number of alerts.
Setting `--ml_threshold 0` will turn ML off and will maximize the number of alerts.
Typical False Positives: `password = "template_password"`
.. note::
CredSweeper includes an experimental `--depth` option that enables scanning with awareness of specific data formats, such as:
- Compressed files (zip, gzip, bzip2, lzma)
- Data containers (deb, tar, Docker images, pkcs12, jks)
- Document rendering (pdf, xls, ods, xlsx, docx, pptx, tm7, mxfile)
- Base64-encoded content
- Structured text formats (HTML, XML, JSON, NDJSON, YAML, etc.) - keys and values are combined before analysis
- Python sources - reformatting source code to plain code style to avoid cases which may hide values from patterns ("AKIA" "EXAMPLE..." -> "AKIAEXAMPLE...")
**Remark:** The reported line number for a found credential with the option may not correspond to the original file. The `info` field provides context to help you understand how the credential was detected.
Get output as JSON file with deep scan for docker image:
Prepare dockerfile
.. code-block:: docker
FROM scratch
ADD tests/samples /
Build, save and scan
.. code-block:: bash
docker build . --tag test_samples
docker save test_samples --output test_samples.docker
python -m credsweeper --path test_samples.docker --save-json output.json --depth 3
Review the report file (output.json):
.. code-block:: json
[
...
{
"rule": "Password",
"severity": "medium",
"confidence": "moderate",
"ml_probability": 0.7925280332565308,
"line_data_list": [
{
"line": "password = 'cackle!'",
"line_num": 1,
"path": "test_samples.docker",
"info": "FILE:test_samples.docker|TAR:blobs/sha256/82a4962c3cfebb62a42c2fd5c120ea0706a9ae66f52f71f957c052c873c60775|TAR:password.gradle|STRUCT|STRING:0|RAW",
"variable": "password",
"variable_start": 0,
"variable_end": 8,
"value": "cackle!",
"value_start": 12,
"value_end": 19,
"entropy": 2.52164
}
]
},
...
]
Get CLI output only:
.. code-block:: bash
python -m credsweeper --path tests/samples/password.gradle
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9149653911590576 | line_data_list: [path: tests/samples/password.gradle | line_num: 1 | value: 'cackle!' | line: 'password = "cackle!"']
Exclude outputs using CLI:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If you want to remove some values from report (e.g. known public secrets):
create text files with lines or values you want to remove and add it using `--denylist` argument.
Space-like characters at left and right will be ignored.
.. code-block:: bash
$ python -m credsweeper --path tests/samples/password.gradle --denylist list.txt
Detected Credentials: 0
Time Elapsed: 0.07523202896118164s
$ cat list.txt
cackle!
password = "cackle!"
Exclude outputs using config:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Edit ``exclude`` part of the config file.
Default config can be generated using ``python -m credsweeper --export_config place_to_save.json``
or can be found in ``credsweeper/secret/config.json``.
Space-like characters at left and right will be ignored.
.. code-block:: json
"exclude": {
"lines": [" password = \"cackle!\" "],
"values": ["cackle!"]
}
Then specify your config in CLI:
.. code-block:: bash
$ python -m credsweeper --path tests/samples/password.gradle --config my_cfg.json
Detected Credentials: 0
Time Elapsed: 0.07152628898620605s
Use as a python library
-----------------------
Minimal example for scanning line list:
.. code-block:: python
from credsweeper import CredSweeper, StringContentProvider
to_scan = ["line one", "password='in_line_2'"]
cred_sweeper = CredSweeper()
provider = StringContentProvider(to_scan)
results = cred_sweeper.file_scan(provider)
for r in results:
print(r)
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Minimal example for scanning bytes:
.. code-block:: python
from credsweeper import CredSweeper, ByteContentProvider
to_scan = b"line one\npassword='cackle!'"
cred_sweeper = CredSweeper()
provider = ByteContentProvider(to_scan)
results = cred_sweeper.file_scan(provider)
for r in results:
print(r)
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Minimal example for the ML validation:
.. code-block:: python
from credsweeper import CredSweeper, StringContentProvider, MlValidator, ThresholdPreset
to_scan = ["line one", "password='cackle!'", "secret='template'"]
cred_sweeper = CredSweeper()
provider = StringContentProvider(to_scan)
# You can select lower or higher threshold to get more or less reports respectively
threshold = ThresholdPreset.medium
validator = MlValidator(threshold=threshold)
results = cred_sweeper.file_scan(provider)
for candidate in results:
# For each result detected by CredSweeper, you can validate it using MlValidator
is_credential, with_probability = validator.validate(candidate)
if is_credential:
print(candidate)
Note that `"secret='template'"` is not reported due to failing check by the `MlValidator`.
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Configurations
--------------
.. toctree::
:maxdepth: 1
apps_config
.. toctree::
:maxdepth: 1
rules_config
================================================
FILE: docs/source/how_to_contribute.rst
================================================
How To Contribute
=================
.. include:: ../howto/how-to-contribute.md
:parser: myst_parser.sphinx_
================================================
FILE: docs/source/index.rst
================================================
.. |CredSweeper_logo| image:: ../images/Logo.png
:width: 100
:alt: Alternative text
|CredSweeper_logo|
Welcome to CredSweeper's documentation!
=======================================
CredSweeper is a tool to detect credentials in any directories or files. CredSweeper can help users detect
unwanted exposure of credentials (such as personal information, tokens, passwords, API keys, etc.) in advance.
By scanning lines, filtering, and optionally using an AI model, CredSweeper reports lines with possible credentials,
where each line is located, and the expected type of the credential.
How To Use
==========
.. toctree::
:maxdepth: 2
guide
Installation
============
.. toctree::
:maxdepth: 2
install
Develop
=======
.. toctree::
:maxdepth: 2
develop
How to Contribute
=================
.. toctree::
:maxdepth: 2
how_to_contribute
Overall architecture
====================
.. toctree::
:maxdepth: 2
overall_architecture
API Reference
=============
If you are looking for information on a specific function, class or method, this part of the documentation is for you.
.. toctree::
:maxdepth: 2
api
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/source/install.rst
================================================
Installation
============
Currently `CredSweeper` requires the following prerequisites:
* Python version 3.10, 3.11, 3.12
.. note::
We recommend using credsweeper in a separate virtual environment. Some heavy dependencies such as Tensorflow
might otherwise conflict with other dependencies.
Via pip
-------
.. code-block:: bash
pip install credsweeper
.. note::
If you haven't installed git, you may encounter the following error:
.. code-block:: bash
...
All git commands will error until this is rectified.
This initial warning can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
- quiet|q|silence|s|none|n|0: for no warning or exception
- warn|w|warning|1: for a printed warning
- error|e|raise|r|2: for a raised exception
Example:
export GIT_PYTHON_REFRESH=quiet
If so, please install git.
.. code-block:: bash
sudo apt install git
.. note::
Allows to use `ML model classifier `_
to validate credential candidates, but requires setup of additional packages: numpy, scikit-learn and tensorflow.
Via git clone (dev install)
---------------------------
.. code-block:: bash
git clone https://github.com/Samsung/CredSweeper.git
cd CredSweeper
# Comment out "onnxruntime" in requirements.txt if you don't want to use the ML validation feature.
pip install -qr requirements.txt
Pre-commit git hook
---------------------------
Install CredSweeper into the system and copy the ``pre-commit`` file into the ``.git/hooks`` directory of your repo.
.. note::
CredSweeper must be available in current python environment.
.. note::
pre-commit file content:
.. code-block:: python
#!/usr/bin/env python
import io
import subprocess
import sys
from credsweeper import CredSweeper
from credsweeper.common.constants import DiffRowType
from credsweeper.file_handler.patches_provider import PatchesProvider
def main() -> int:
command = ["git", "diff", "--cached"]
with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as pipe:
_stdout, _stderr = pipe.communicate()
if pipe.returncode:
print(str(_stdout), flush=True)
print(str(_stderr), flush=True)
print(f"{command} EXIT CODE:{pipe.returncode}", flush=True)
return 1
patch = io.BytesIO(_stdout)
added = PatchesProvider([patch], change_type=DiffRowType.ADDED)
deleted = PatchesProvider([patch], change_type=DiffRowType.DELETED)
credsweeper = CredSweeper()
if credsweeper.run(content_provider=deleted):
print(f"CREDENTIALS FOUND IN DELETED CONTENT", flush=True)
# return 1 # <<< UNCOMMENT THE LINE IF YOU WANT TO MANAGE DELETED CREDENTIALS
if credsweeper.run(content_provider=added):
print(f"CREDENTIALS FOUND IN ADDED CONTENT", flush=True)
return 1
return 0
if __name__ == "__main__":
sys.exit(main())
Or use pre-commit with ``.pre-commit-config.yaml`` in your repo:
.. code-block:: none
repos:
- repo: https://github.com/Samsung/CredSweeper
rev: v1.10.6
hooks:
- id: CredSweeper
Install with: ``pre-commit install --install-hooks``
================================================
FILE: docs/source/overall_architecture.rst
================================================
Overall Architecture
====================
CredSweeper is largely composed of 3 parts as follows. (Pre-processing_, Scan_, `ML validation`_)
.. image:: https://raw.githubusercontent.com/Samsung/CredSweeper/main/docs/images/Architecture.png
Pre-processing
--------------
When paths to scan are entered, the files in those paths are collected, and files are excluded based on the lists defined in `config.json <https://github.com/Samsung/CredSweeper/blob/main/credsweeper/secret/config.json>`_.
**config.json**
- exclude
- pattern: Regex patterns to exclude scan.
- containers: Lower-case extensions of container files which might be scanned with the --depth option
- documents: Lower-case extensions of document files which might be scanned with the --doc and/or --depth option
- extension: Extensions in lower case to exclude scan.
- path: Paths to exclude scan.
- source_ext: List of extensions for scanning categorized as source files.
- source_quote_ext: List of extensions for scanning categorized as source files that use quotes.
- find_by_ext_list: List of extensions to detect only extensions.
- check_for_literals: Bool value for whether to check line has string literal declaration or not.
- line_data_output: List of attributes of `line_data `_ for output.
- candidate_output: List of attributes of `candidate `_ for output.
.. code-block:: text
...
"exclude": {
"pattern": [
...
],
"containers": [
".gz",
".zip",
...
],
"documents": [
".docx",
".pdf",
...
],
"extension": [
".7z",
".jpg",
...
],
"path": [
"/.git/",
"/.idea/",
...
]
}
...
Scan
----
Basically, scanning is performed for each file path based on the Rule_. The scanning method depends on the scan type of the Rule_, which is assigned when the Rule_ is generated. There are 3 scan types: `SinglePattern `_, `MultiPattern `_, and `PEMKeyPattern `_. Below is a description of each scan type and its scanning method.
- `SinglePattern `_
- When : The Rule_ has only 1 pattern.
- How : Check if a single line Rule pattern present in the line.
- `MultiPattern `_
- When : The Rule_ has 2 patterns.
- How : Check if a line is a part of a multi-line credential and the remaining part exists within 10 lines below.
- `PEMKeyPattern `_
- When : The Rule_ type is `pem_key`.
- How : Check if a line’s entropy is high enough and the line has no substring with 5 identical consecutive characters. (like 'AAAAA')
Rule
----
Each Rule_ is dedicated to detect a specific type of credential, imported from `config.yaml `_ at the runtime.
**config.yaml**
.. code-block:: yaml
...
- name: API
severity: medium
confidence: moderate
type: keyword
values:
- api
filter_type: GeneralKeyword
use_ml: true
min_line_len: 11
required_substrings:
- api
target:
- code
...
**Rule Attributes**
- severity
- `Severity `_
.. code-block:: python
...
class Severity(Enum):
CRITICAL = "critical"
HIGH = "high"
MEDIUM = "medium"
LOW = "low"
...
- confidence
- `Confidence `_ - The manually configured value indicates the confidence that the found candidate could be the credential type.
.. code-block:: python
...
class Confidence(Enum):
STRONG = "strong"
MODERATE = "moderate"
WEAK = "weak"
...
- type
- `RuleType `_
.. code-block:: python
...
class RuleType(Enum):
KEYWORD = "keyword"
PATTERN = "pattern"
PEM_KEY = "pem_key"
MULTI = "multi"
...
- values
- keyword : The keywords you want to detect. If you want to detect multiple keywords, you can write them as follows : `password|passwd|pwd`.
- pattern : The patterns you want to detect. For more accurate detection, it is recommended to specify `?P<value>` in the patterns : `(?P<value>AIza[0-9A-Za-z\-_]{35})`.
- pem_key : Specific rule to find multiline PEM private keys.
- multi : Two patterns you want to detect. Candidate will be found only if second pattern matched nearby.
- filter_type
- The type of the Filter_ group you want to apply. Filter_ groups implemented are as follows: `GeneralKeyword `_, `GeneralPattern `_, `PasswordKeyword `_, and `UrlCredentials `_.
- use_ml
- The attribute to set whether to perform ML validation. If true, ML validation will be performed. If false - ml_probability will be set to None in report.
- min_line_len
- drop too short stripped lines before text search to increase performance
- required_substrings
- at least one of these substrings has to be found in a line before the regex search is run, to increase performance
- target
- code : The rule will be applied without --doc option
- doc : The rule will be applied with --doc option
Filter
------
Check the detected candidates from the previous step. If a candidate is caught by the Filter_, it is removed from the candidates set.
There are 21 filters and 4 filter groups. Filter_ group is a set of Filter_s, which is designed to use many Filter_s effectively at the same time.
ML validation
-------------
CredSweeper provides pre-trained ML models to filter false credential lines.
`ML validation` is on by default and its sensitivity can be adjusted using ``--ml_threshold``:
.. code-block:: text
--ml_threshold THRESHOLD_OR_FLOAT_OR_ZERO
setup threshold for the ml model.
The lower the threshold - the more credentials will be reported.
Allowed values: float between 0 and 1, or any of ['lowest', 'low', 'medium', 'high', 'highest']
(default: medium)
ML can be fully disabled by setting ``--ml_threshold 0``
.. code-block:: bash
python -m credsweeper --ml_threshold 0 ...
Our ML model architecture is a combination of Bidirectional LSTM with additional handcrafted features.
It uses first 80 characters from the potential credential value and variable (if available), 160 characters from line around the value and configurable handcrafted features to decide if it's a real credential or not.
Example (file leaked_cred.py):
.. code-block:: python
my_db_password = "NUU423cds"
Steps:
1. Regular expression extracts ```NUU423cds``` as a secret value, ```my_db_password``` as a variable, and ```my_db_password = "NUU423cds"``` as whole line
2. Handcrafted feature classes instantiated from classes in `features.py `_ using `model_config.json `_. Instantiation process can be checked at `ml_validator.py#L46 `_. Features include: ``` ``` character in line: yes/no, ```(``` character in line: yes/no, file extension is ```.c```: yes/no, etc.
3. Handcrafted features from step 2 used on line, value, variable, and filename to get feature vector of length 91
4. ```NUU423cds``` Configurable character set is applied + 1 padding character + 1 special character for all other symbols. The padded line is then `one-hot encoded `_. Link to corresponding code: `MlValidator.encode `_
5. Padded line from step 4 inputted to Bidirectional LSTM of value. The same encodings are performed for variable and line. LSTM produce 3 single vectors of lengths 80, 80, 160 as outputs
6. LSTM outputs and handcrafted features concatenated into a single vector
7. The vector from step 6 is fed into a stack of two sequential Dense layers, each with the number of output units equal to the number of input units.
8. Last layer outputs float value in range 0-1 with estimated probability of line being a real credential
9. Predicted probability compared to the threshold (see `--ml_threshold` CLI option) and credential reported if predicted probability is greater
.. image:: https://raw.githubusercontent.com/Samsung/CredSweeper/main/docs/images/Model_with_features.png
Additional:
- Handcrafted features are based on the rules described in `"Secrets in Source Code" publication `_.
.. code-block:: text
@INPROCEEDINGS{9027350,
author={Saha, Aakanksha and Denning, Tamara and Srikumar, Vivek and Kasera, Sneha Kumar},
booktitle={2020 International Conference on COMmunication Systems NETworkS (COMSNETS)},
title={Secrets in Source Code: Reducing False Positives using Machine Learning},
year={2020},
pages={168-175},
doi={10.1109/COMSNETS48256.2020.9027350}
}
- Mapping between text threshold values and float can be found at `model_config.json#L2 `_. Values are based on F-0.25, F-0.5, F-1, F-2 and F-4 scores on `CredData test `_
================================================
FILE: docs/source/rules_config.rst
================================================
Rules Configuration
===================
.. literalinclude:: ../../credsweeper/rules/config.yaml
:language: yaml
:linenos:
================================================
FILE: experiment/README.md
================================================
# Train credential detection model
This code will allow you to retrain model on the CredData dataset
## Preparation
- Make sure that you are using Python 3.10 or higher
- Download CredData dataset
```bash
git clone https://github.com/Samsung/CredData
cd CredData
python download_data.py --data_dir data
```
- Go back to `CredSweeper/experiment` directory
- Install the additional requirements
```bash
pip install -r requirements.txt
```
- Make sure that `credsweeper` is in the `PYTHONPATH`. You can add it with
```bash
export PYTHONPATH=<path to the CredSweeper repository>:$PYTHONPATH
```
Example:
```bash
export PYTHONPATH=/home/user/code/CredSweeper:$PYTHONPATH
```
## Run
- Launch the experiment with
```bash
python main.py --data <path to CredData> -j <number of parallel processes>
```
Example:
```bash
python main.py --data /home/user/datasets/CredData -j 16
```
- Resulting model will be saved to `results/ml_model_at-<timestamp>`.
You now can convert the model to onnx:
```bash
python -m tf2onnx.convert --saved-model results/ml_model_at-20240225_111951 --output ../credsweeper/ml_model/ml_model.onnx --verbose
```
================================================
FILE: experiment/__init__.py
================================================
================================================
FILE: experiment/data_loader.py
================================================
import contextlib
import json
import os
import pathlib
from copy import deepcopy
from functools import cache
from typing import Tuple, Dict, Set, Any
import numpy as np
import pandas as pd
from colorama import Fore, Style, Back
from credsweeper.common.constants import ML_HUNK
from credsweeper.utils.util import Util
# path, line, val_start, val_end
identifier = Tuple[str, int, int, int]
def transform_to_meta_path(file_path: pathlib.Path):
    """Cut any path down to its 'data/xxxxxxxx/[type]...../yyyyyyyy.ext' tail to find it in meta markup

    Args:
        file_path: path which contains exactly one "data" component

    Return:
        POSIX-style path which starts from the "data" component

    """
    posix_path = pathlib.Path(file_path).as_posix()
    components = posix_path.split('/')
    # only one "data" directory allowed
    assert components.count("data") == 1, posix_path
    data_position = components.index("data")
    meta_path = '/'.join(components[data_position:])
    # just extra check
    assert meta_path, f"data dir was not found in {posix_path}"
    return meta_path
def read_detected_data(file_path: pathlib.Path) -> Dict[identifier, Dict]:
    """Load a CredSweeper JSON report and group detections by (path, line, value start, value end)

    Only the first item of "line_data_list" of each detection is used.
    Detections of different rules at the same position are merged into one record
    with all rule names collected in "RuleName".

    Args:
        file_path: JSON report of CredSweeper

    Return:
        dictionary of line data records with the position tuple as a key

    """
    print(f"Reading detections from {file_path}", flush=True)
    with open(file_path) as report:
        detections = json.load(report)

    detected_lines = {}
    for cred in detections:
        rule_name = cred["rule"]
        # at least, one line_data_list must present
        assert 0 < len(cred["line_data_list"]), cred
        line_data = deepcopy(cred["line_data_list"][0])
        # these fields are not required for the train
        line_data.pop("entropy")
        line_data.pop("info")
        # will be read during join_label with data for ML input only
        line_data["line"] = None
        meta_path = transform_to_meta_path(line_data["path"])
        line_data["path"] = meta_path
        line_data["RuleName"] = [rule_name]
        index = meta_path, line_data["line_num"], line_data["value_start"], line_data["value_end"]
        known = detected_lines.get(index)
        if known is None:
            detected_lines[index] = line_data
        else:
            known["RuleName"].append(rule_name)

    print(f"Detected {len(detected_lines)} unique lines!", flush=True)
    print(f"{len(detections)} detections in total", flush=True)
    return detected_lines
def read_metadata(meta_dir: str) -> Dict[identifier, Dict]:
    """Load markup from all *.csv files of the directory, keyed by (path, line, value start, value end)

    Multiline markup and markup with only "AWS Multi"/"Google Multi" categories are skipped.
    Any GroundTruth different from 'T' is converted to 'F'.

    Args:
        meta_dir: directory with csv markup files

    Return:
        dictionary of markup rows (with additional "Used" flag) with the position tuple as a key

    """
    print(f"Reading meta from {meta_dir}", flush=True)
    position_columns = ("LineStart", "LineEnd", "ValueStart", "ValueEnd")
    column_types = {"RepoName": str, "GroundTruth": str, "Category": str}
    # nullable Int64 is important to read empty cells, they are changed to -1 afterwards
    for column in position_columns:
        column_types[column] = "Int64"
    not_ml_categories = ["AWS Multi", "Google Multi"]

    meta_lines = {}
    total_rows = 0
    for file_name in os.listdir(meta_dir):
        csv_file = os.path.join(meta_dir, file_name)
        if not file_name.endswith(".csv"):
            print(f"skip garbage: {csv_file}", flush=True)
            continue
        try:
            df = pd.read_csv(csv_file, dtype=column_types)
        except Exception as exc:
            print(csv_file, exc, flush=True)
            raise
        for column in position_columns:
            df[column] = df[column].fillna(-1).astype(int)
        # all templates are false
        df.loc[df["GroundTruth"] != 'T', "GroundTruth"] = 'F'
        for _, row in df.iterrows():
            total_rows += 1
            if row["LineStart"] != row["LineEnd"] \
                    or all(x in not_ml_categories for x in row["Category"].split(':')):
                # not ml category
                continue
            ground_truth = row["GroundTruth"]
            assert 'F' == ground_truth or ('T' == ground_truth and 0 <= row["ValueStart"]), row
            meta_path = transform_to_meta_path(row["FilePath"])
            index = meta_path, row['LineStart'], row['ValueStart'], row['ValueEnd']
            if index in meta_lines:
                print(
                    f"WARNING: {index} already in meta_lines {row['GroundTruth']} {row['Category']}"
                    f"\n{meta_lines[index]}",
                    flush=True)
                continue
            row_data = row.to_dict()
            row_data["Used"] = False
            row_data["FilePath"] = meta_path
            meta_lines[index] = row_data

    print(f"Loaded {len(meta_lines)} lines from meta of {total_rows} total", flush=True)
    return meta_lines
def get_colored_line(line_data: Dict[str, Any]) -> str:
    """Highlight the value (yellow) and, when positions are available, the variable (blue) in the line

    Args:
        line_data: record with 'line', 'value_start', 'value_end' and optional 'variable_start', 'variable_end'

    Return:
        hunk of the colored line around the value start, terminated with color reset

    """
    val_start = int(line_data['value_start'])
    val_end = int(line_data['value_end'])
    line = line_data['line']
    colored_line = ''.join((
        line[:val_start],
        Fore.LIGHTYELLOW_EX,
        line[val_start:val_end],
        Style.RESET_ALL,
        line[val_end:],
    ))
    # the variable highlight is optional - any problem with its positions is ignored
    with contextlib.suppress(Exception):
        var_start = int(line_data['variable_start'])
        var_end = int(line_data['variable_end'])
        if 0 <= var_start < var_end:
            colored_line = ''.join((
                colored_line[:var_start],
                Fore.LIGHTBLUE_EX,
                colored_line[var_start:var_end],
                Style.RESET_ALL,
                colored_line[var_end:],
            ))
    colored_sub_line = Util.subtext(colored_line, line_data['value_start'], ML_HUNK)
    return f"{colored_sub_line}{Style.RESET_ALL}"
def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier, Dict],
               cred_data_location: str) -> pd.DataFrame:
    """Assign ground truth labels from the markup to the detected lines

    Each detection gets its source line read from the dataset and is matched with the markup:
    first with the exact (path, line, value start, value end) key, then with unknown value end (-1),
    then with unknown value position (-1, -1). A detection without any markup is used as a negative case.
    Matched markup items are flagged with "Used"; not used TRUE markup of the detected rules is printed.

    Args:
        detected_data: detections from the scanner report. The records are updated in place
        meta_data: markup of the dataset. The "Used" flag of the records is updated in place
        cred_data_location: root directory of the dataset which is joined with the meta path of a file

    Return:
        DataFrame of labeled detections without duplicates of (line, variable, value, path)

    Raises:
        RuntimeError: markup without a value position is marked as TRUE
        AssertionError: the value from the report does not match the text of the line

    """

    @cache
    def read_text(path) -> list[str]:
        # the files are read once; all line endings variations are normalized before the split
        with open(path, "r", encoding="utf8") as f:
            return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

    # (path, line number) of all TRUE markup - to highlight suspicious negative cases
    positive_lines = set((x[0], x[1]) for x, y in meta_data.items() if 'T' == y["GroundTruth"])
    values = []
    detected_rules: Set[str] = set()
    for index, line_data in detected_data.items():
        for i in line_data["RuleName"]:
            detected_rules.add(i)
        text = read_text(f'{cred_data_location}/{line_data["path"]}')
        # line_num starts from 1
        line = text[line_data["line_num"] - 1]
        line_data["line"] = line
        if not line_data["value"]:
            print(f"WARNING: empty value\n{line_data}", flush=True)
            continue
        label = False
        if markup := meta_data.get(index):
            # it means index in meta_data with exactly match
            if 'T' == markup["GroundTruth"]:
                label = True
            markup["Used"] = True
            markup_rules = markup["Category"].split(':')
            if not set(markup_rules).intersection(set(line_data["RuleName"])):
                print(f"1.CHECK CATEGORIES\n{markup_rules}, {line_data['RuleName']}\n{str(markup)}" +
                      get_colored_line(line_data),
                      flush=True)
        elif markup := meta_data.get((index[0], index[1], index[2], -1)):
            # perhaps, the line has only start markup - so value end position is -1
            if 'T' == markup["GroundTruth"]:
                label = True
            markup["Used"] = True
            markup_rules = markup["Category"].split(':')
            if not set(markup["Category"].split(':')).intersection(set(line_data["RuleName"])):
                print(f"2.CHECK CATEGORIES\n{markup_rules}, {line_data['RuleName']}\n{str(markup)}" +
                      get_colored_line(line_data),
                      flush=True)
        elif markup := meta_data.get((index[0], index[1], -1, -1)):
            # perhaps, the line has false markup - so value start-end position is -1, -1
            if 'T' == markup["GroundTruth"]:
                raise RuntimeError(f"ERROR: markup {markup} cannot be TRUE\n{line_data}")
            markup["Used"] = True
            markup_rules = markup["Category"].split(':')
            if not set(markup["Category"].split(':')).intersection(set(line_data["RuleName"])):
                print(f"3.CHECK CATEGORIES\n{markup_rules}, {line_data['RuleName']}\n{str(markup)}" +
                      get_colored_line(line_data),
                      flush=True)
        elif (index[0], index[1]) in positive_lines:
            # the line has TRUE markup, but at another position - the detection stays negative
            print(f"WARNING: {index} is not in meta!!! {Fore.LIGHTRED_EX}CHECK THE NEGATIVE CASE{Style.RESET_ALL}\n" +
                  get_colored_line(line_data),
                  flush=True)
        else:
            print(f"WARNING: {index} is not in meta!!! IT WILL BE USED AS NEGATIVE CASE\n" +
                  get_colored_line(line_data),
                  flush=True)
        # check the value in detected data
        assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"], (
            line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
        # todo: variable input has to be markup in meta too, or/and new feature "VariableExists" created ???
        line_data["GroundTruth"] = label
        # auxiliary field for model_config_preprocess
        # no extra memory usage due the dataframe is deleted before train
        line_data["ext"] = Util.get_extension(line_data["path"])
        values.append(line_data)

    # report TRUE markup which was not matched with any detection although its rule was detected somewhere
    all_meta_found = True
    for markup in meta_data.values():
        if 'T' == markup["GroundTruth"] and not markup["Used"]:
            for markup_rule in markup["Category"].split(':'):
                if markup_rule in detected_rules:
                    if all_meta_found:
                        # print header of the markup once
                        print(f"{Back.MAGENTA}{Fore.BLACK}WARNING: Not all TRUE meta found!{Style.RESET_ALL}",
                              flush=True)
                        print(','.join(markup.keys()), flush=True)
                        all_meta_found = False
                    print(','.join(str(x) for x in markup.values()), flush=True)
                    text = read_text(f'{cred_data_location}/{markup["FilePath"]}')
                    line = text[markup["LineStart"] - 1]
                    if 0 <= markup["ValueStart"] and 0 <= markup["ValueEnd"]:
                        line = line[:markup["ValueStart"]] \
                               + Fore.LIGHTGREEN_EX \
                               + line[markup["ValueStart"]:markup["ValueEnd"]] \
                               + Style.RESET_ALL \
                               + line[markup["ValueEnd"]:]
                    elif 0 <= markup["ValueStart"]:
                        line = line[:markup["ValueStart"]] \
                               + Fore.LIGHTGREEN_EX \
                               + line[markup["ValueStart"]:] \
                               + Style.RESET_ALL
                    print(line, flush=True)
                    # the markup is printed once even if several of its categories were detected
                    break

    # release texts of all read files
    read_text.cache_clear()

    df = pd.DataFrame(values)
    print(f"Initial full dataset: {len(df)} items\n{df.memory_usage(deep=True)}", flush=True)
    df = df.drop_duplicates(subset=["line", "variable", "value", "path"])
    print(f"Full dataset: {len(df)} items after drop duplicates\n{df.memory_usage(deep=True)}", flush=True)
    return df
def get_y_labels(df: pd.DataFrame) -> np.ndarray:
    """Get "GroundTruth" column of the DataFrame as float32 array of labels"""
    return np.array(df["GroundTruth"], dtype=np.float32)
================================================
FILE: experiment/evaluate_model.py
================================================
from typing import List
import numpy as np
from keras import Model # type: ignore
from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
    """Print scores of the Keras model for each threshold

    The model predicts once, then the probabilities are converted to decisions with each threshold.
    All scores, including the loss, are calculated for the decisions.

    Args:
        thresholds: dict of credsweeper thresholds
        keras_model: fitted keras model
        x_data: List of np.arrays. Number and shape depends on model
        y_label: expected result

    """
    predictions_proba = keras_model.predict(x_data, verbose=2).ravel()
    for name, threshold in thresholds.items():
        predictions = predictions_proba > threshold
        accuracy = accuracy_score(y_label, predictions)
        precision = precision_score(y_label, predictions)
        recall = recall_score(y_label, predictions)
        loss = log_loss(y_label, predictions)
        f1 = f1_score(y_label, predictions)
        report = ", ".join((
            f"{name}: {threshold:0.6f}",
            f"accuracy: {accuracy:0.6f}",
            f"precision:{precision:0.6f}",
            f"recall: {recall:0.6f}",
            f"loss: {loss:0.6f}",
            f"F1:{f1:0.6f}",
        ))
        print(report, flush=True)
================================================
FILE: experiment/features.py
================================================
from typing import Tuple, Union
import numpy as np
import pandas as pd
from credsweeper.common.constants import Severity, ML_HUNK
from credsweeper.credentials.candidate import Candidate
from credsweeper.credentials.line_data import LineData
from credsweeper.ml_model.ml_validator import MlValidator
from credsweeper.utils.util import Util
class CustomLineData(LineData):
    """LineData which is filled directly from saved scanner results"""

    def __init__(
            self,  #
            line: str,  #
            value: str,  #
            line_num: int,  #
            path: str,  #
            variable: str,  #
            value_start: int,  #
            value_end: int,  #
            variable_start: int,  #
            variable_end: int,  #
    ) -> None:
        # the parent initializer is not called - all fields are assigned from the arguments
        # location of the line
        self.path: str = path
        self.file_type = Util.get_extension(path)
        self.line_num: int = line_num
        self.line: str = line
        # variable and its position
        self.variable = variable
        self.variable_start = variable_start
        self.variable_end = variable_end
        # value and its position
        self.value = value
        self.value_start = value_start
        if value_start < value_end:
            self.value_end = value_end
        else:
            # the end position is not valid, so it is calculated from the value length
            self.value_end = value_start + len(value)
def get_candidates(line_data: dict):
    """Get list of candidates. 1 candidate for each rule that detected this line

    All candidates share the same line data object.
    """
    shared_line_data = CustomLineData(
        line=line_data["line"],
        value=line_data["value"],
        line_num=line_data["line_num"],
        path=line_data["path"],
        variable=line_data["variable"],
        value_start=line_data["value_start"],
        value_end=line_data["value_end"],
        variable_start=line_data["variable_start"],
        variable_end=line_data["variable_end"],
    )
    return [
        Candidate(
            line_data_list=[shared_line_data],
            patterns=[],
            rule_name=rule_name,
            severity=Severity.MEDIUM,
            use_ml=True,
        ) for rule_name in line_data["RuleName"]
    ]
def get_features(line_data: Union[dict, pd.Series],
                 ml_validator: MlValidator) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Get features from a single detection using CredSweeper.MlValidator module

    Args:
        line_data: detection record with line, value, variable, their positions and rule names
        ml_validator: validator which encodes the text and extracts the features

    Return:
        encoded line, encoded variable, encoded value and extracted features

    Raises:
        RuntimeError: the value of the detection is empty

    """
    candidates = get_candidates(line_data)
    line_input = ml_validator.encode_line(line_data["line"], line_data["value_start"])

    # absent variable is encoded as empty string, long one is cut to the hunk size
    variable = line_data["variable"]
    if not variable:
        variable = ''
    elif ML_HUNK < len(variable):
        variable = variable[:ML_HUNK]
    variable_input = ml_validator.encode_value(variable)

    # the value is mandatory, long one is cut to the hunk size
    value = line_data["value"]
    if not value:
        raise RuntimeError(f"Empty value is not allowed {line_data}")
    if ML_HUNK < len(value):
        value = value[:ML_HUNK]
    value_input = ml_validator.encode_value(value)

    # the value has to be in the line at the declared position
    assert line_data["line"][line_data["value_start"]:].startswith(line_data["value"]), line_data
    extracted_features = ml_validator.extract_features(candidates)
    return line_input, variable_input, value_input, extracted_features
def prepare_data(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Get features from a DataFrame detection using CredSweeper.MlValidator module

    Args:
        df: detections with not empty "line" and "value" in each row

    Return:
        arrays of encoded lines, encoded variables, encoded values and extracted features for all rows

    """
    # MLValidator object loads config (MAY be updated!) with features
    ml_validator = MlValidator(0.5)
    x_size = len(df)
    num_classes = ml_validator.num_classes
    x_line_input = np.zeros(shape=[x_size, MlValidator.MAX_LEN, num_classes], dtype=np.float32)
    x_variable_input = np.zeros(shape=[x_size, ML_HUNK, num_classes], dtype=np.float32)
    x_value_input = np.zeros(shape=[x_size, ML_HUNK, num_classes], dtype=np.float32)

    # features size preprocess to calculate the dimension automatically
    sample = {
        "path": "",
        "line_num": 1,
        "line": "const API=123;",
        "value": "123",
        "value_start": 10,
        "value_end": 13,
        "variable": "API",
        "variable_start": 6,
        "variable_end": 9,
        "RuleName": ["API"],
    }
    sample_features = get_features(line_data=sample, ml_validator=ml_validator)
    features_size = sample_features[3].shape[1]
    print(f"Features size: {features_size}", flush=True)
    x_features = np.zeros(shape=[x_size, features_size], dtype=np.float32)

    # the arrays are filled in order of rows, the index of the DataFrame is not used
    for n, (_, row) in enumerate(df.iterrows()):
        assert bool(row["line"]) and bool(row["value"]), row
        x_line_input[n], x_variable_input[n], x_value_input[n], x_features[n] = get_features(row, ml_validator)

    return x_line_input, x_variable_input, x_value_input, x_features
================================================
FILE: experiment/hyperparameters.py
================================================
# Dropout hyperparameters of the model: each name maps to a pair of a 3-items tuple and a single float.
# NOTE(review): presumably the tuple is (min, max, step) range for keras-tuner
# and the float is the constant used for a train without the tuner - confirm with train.py
HP_DICT = {
    "line_lstm_dropout_rate": ((0.4, 0.5, 0.01), 0.47),
    "variable_lstm_dropout_rate": ((0.4, 0.5, 0.01), 0.42),
    "value_lstm_dropout_rate": ((0.4, 0.5, 0.01), 0.47),
    "dense_a_drop": ((0.0, 0.3, 0.01), 0.21),
    "dense_b_drop": ((0.0, 0.3, 0.01), 0.23),
}
================================================
FILE: experiment/log_callback.py
================================================
import datetime
import psutil
from keras.src.callbacks import Callback
class LogCallback(Callback):
    """Keras callback which prints time stamp, memory info of the process and logs after each epoch"""

    def __init__(self):
        super().__init__()

    @staticmethod
    def get_memory_info():
        """Get memory info of the current process as a string"""
        return str(psutil.Process().memory_info())

    def on_epoch_end(self, epoch, logs=None):
        """Print the current time, 1-based epoch number with memory info and the logs of the epoch"""
        now = datetime.datetime.now()
        print(str(now), flush=True)
        epoch_number = epoch + 1
        print(f"{epoch_number}:{self.get_memory_info()}", flush=True)
        print(logs, flush=True)
================================================
FILE: experiment/main.py
================================================
import os
import random
import sys
from argparse import ArgumentParser, BooleanOptionalAction
from train import train
def main(argv) -> int:
    """Parse command line arguments and launch the train

    Args:
        argv: command line arguments including the program name as the first item

    Return:
        0 when the path returned from the train exists, otherwise 1

    """
    parser = ArgumentParser()
    parser.add_argument("-d",
                        "--data",
                        nargs="?",
                        help="CredData location",
                        dest="cred_data_location",
                        metavar="PATH",
                        required=True)

    # options with a number: short flag, long flag, help, default value, destination
    number_options = (
        ("-j", "--jobs", "number of parallel processes to use (default: 4)", 4, "jobs"),
        ("-e", "--epochs", "maximal epochs to train (default: 100)", 100, "epochs"),
        ("-b", "--batch_size", "batch size (default: 256)", 256, "batch_size"),
        ("-p", "--patience", "early stopping patience (default: 5)", 5, "patience"),
    )
    for short_flag, long_flag, help_text, default_value, dest_name in number_options:
        parser.add_argument(short_flag,
                            long_flag,
                            help=help_text,
                            default=default_value,
                            dest=dest_name,
                            metavar="POSITIVE_INT")

    # boolean switches: flag, help, destination
    switch_options = (
        ("--doc", "use doc target", "doc_target"),
        ("--tuner", "use keras tuner", "use_tuner"),
        ("--eval-test", "evaluate model for test dataset", "eval_test"),
        ("--eval-train", "evaluate model for train dataset", "eval_train"),
        ("--eval-full", "evaluate model for full dataset after train", "eval_full"),
    )
    for flag, help_text, dest_name in switch_options:
        parser.add_argument(flag, help=help_text, dest=dest_name, action=BooleanOptionalAction, default=False)

    args = parser.parse_args(argv[1:])

    fixed_seed = 20251216
    print(f"Fixed seed:{fixed_seed}", flush=True)
    random.seed(fixed_seed)

    print(args, flush=True)  # dbg

    _model_file_name = train(
        cred_data_location=args.cred_data_location,
        jobs=int(args.jobs),
        epochs=int(args.epochs),
        batch_size=int(args.batch_size),
        patience=int(args.patience),
        doc_target=bool(args.doc_target),
        use_tuner=bool(args.use_tuner),
        eval_test=bool(args.eval_test),
        eval_train=bool(args.eval_train),
        eval_full=bool(args.eval_full),
    )
    if not os.path.exists(_model_file_name):
        print(f"Error: {_model_file_name}", flush=True)
        return 1
    # print in last line the name
    print(f"\nYou can find your model in:\n{_model_file_name}", flush=True)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
================================================
FILE: experiment/main.sh
================================================
#!/usr/bin/env bash

# Train the ML model and benchmark the result.
# Environment:
#   CREDSWEEPER_DIR - CredSweeper repo with .venv (mandatory)
#   CREDDATA_DIR    - CredData repo with the dataset (mandatory)
#   TESTDATA_DIR    - optional repo with data for the benchmark instead of CREDDATA_DIR
#   JOBS            - number of processes (default: all processors)
#   BATCH           - batch size of the train (default: 256)
#   TUNER, DOC      - optional extra arguments --tuner and --doc

set -ex

START_TIME=$(date +%s)
NOW=$(date +%Y%m%d_%H%M%S)
echo ">>> START ${BASH_SOURCE[0]} in $(pwd) at ${NOW}"

free --wide --human

# use the path environments without / at end

echo "CREDSWEEPER_DIR='${CREDSWEEPER_DIR}'"
if [ -z "${CREDSWEEPER_DIR}" ] || [ ! -d "${CREDSWEEPER_DIR}" ]; then
    echo "CREDSWEEPER_DIR environment is empty or does not exist"
    exit 1
fi

export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH

# check current version of CredSweeper
"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner
git log -1
git status

echo "CREDDATA_DIR='${CREDDATA_DIR}'"
if [ -z "${CREDDATA_DIR}" ] || [ ! -d "${CREDDATA_DIR}" ]; then
    echo "CREDDATA_DIR environment is empty or does not exist"
    exit 1
fi

# do some check in CredData repo
(cd "${CREDDATA_DIR}" && git log -1 && git status)

echo "JOBS=${JOBS} of $(nproc)"
if [ -z "${JOBS}" ]; then
    JOBS=$(nproc)
    echo "Used JOBS=${JOBS} for multiple process"
elif [ ! 0 -lt ${JOBS} ]; then
    echo "Unappropriated JOBS=${JOBS}"
    exit 1
fi

echo "BATCH=${BATCH}"
if [ -z "${BATCH}" ]; then
    BATCH=256
    echo "Used BATCH=${BATCH}"
elif [ ! 0 -lt ${BATCH} ]; then
    # the BATCH value itself must be validated here, not JOBS
    echo "Unappropriated BATCH=${BATCH}"
    exit 1
fi

WORK_DIR="${CREDSWEEPER_DIR}/experiment"
cd "${WORK_DIR}"
RESULT_DIR="${WORK_DIR}/results"
mkdir -vp "${RESULT_DIR}"

# set env TUNER to use keras-tuner
#TUNER=--tuner
# set env DOC to apply doc dataset
#DOC=--doc

"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} --batch_size ${BATCH} | tee "${RESULT_DIR}/${NOW}.train.log"
# the exit code of the train is required, not the code of tee
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd "${CREDSWEEPER_DIR}"
report_file=${RESULT_DIR}/${NOW}.json
if [ -z "${TESTDATA_DIR}" ]; then
    echo "Used CREDDATA_DIR=${CREDDATA_DIR} for BenchMark and train rules only"
    ${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file} --no-stdout
    cd "${CREDDATA_DIR}"
else
    echo "TESTDATA_DIR=${TESTDATA_DIR}"
    ${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${TESTDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file} --no-stdout
    cd "${TESTDATA_DIR}"
fi

# the benchmark is launched from the directory of the dataset with its own virtual environment
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt

SPENT_SECONDS=$(( $(date +%s) - ${START_TIME} ))
if [ 86400 -lt ${SPENT_SECONDS} ];then
    # more than a day - the number of days is added as a prefix
    SPENT_TIME=$(date -ud "@${SPENT_SECONDS}" +"$(( ${SPENT_SECONDS} / 86400 ))-%H:%M:%S")
else
    SPENT_TIME=$(date -ud "@${SPENT_SECONDS}" +"%H:%M:%S")
fi
echo "<<< DONE ${BASH_SOURCE[0]} in $(pwd) at $(date) elapsed ${SPENT_TIME}"
================================================
FILE: experiment/ml_model.py
================================================
from typing import Any, Optional
import keras_tuner as kt
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers import ReLU, Softmax, Multiply
from tensorflow.python.keras.metrics import BinaryAccuracy, Precision, Recall
from credsweeper.common.constants import ML_HUNK
from credsweeper.ml_model.ml_validator import MlValidator
class MlModel(kt.HyperModel):
    """Hyper model with line, variable and value text inputs, feature input and single binary output

    The hyperparameters are passed as keyword arguments to the constructor:
    a float is used as a constant, a tuple (min, max, step) is used as a range for the tuner.
    """

    # data type of all inputs and layers
    d_type = "float32"

    def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, **kwargs):
        """Keep shapes of input data and the hyperparameters

        Args:
            line_shape: shape of line input; items [1] and [2] are used as LSTM units and input width
            variable_shape: shape of variable input; items [1] and [2] are used as LSTM units and input width
            value_shape: shape of value input; items [1] and [2] are used as LSTM units and input width
            feature_shape: shape of feature input; item [1] is used as the number of features
            kwargs: hyperparameters by names - a float or a tuple (min, max, step)

        """
        self.line_shape = line_shape
        self.variable_shape = variable_shape
        self.value_shape = value_shape
        self.feature_shape = feature_shape
        self.__kwargs = kwargs

    def get_hyperparam(self, param_name: str, hp=None) -> Any:
        """Get the hyperparameter as a constant or as a float range of the tuner

        Args:
            param_name: name of the hyperparameter which was passed to the constructor
            hp: hyperparameters object of the tuner or None

        Return:
            the float constant or result of hp.Float for the range

        Raises:
            ValueError: the hyperparameter was not passed or it is neither a float nor a 3-items tuple with tuner

        """
        if param_name in self.__kwargs:
            param = self.__kwargs.get(param_name)
            if isinstance(param, float):
                # a constant is used regardless the tuner
                print(f"'{param_name}' constant = {param}", flush=True)
                return param
            elif hp and isinstance(param, tuple) and 3 == len(param):
                # the range is registered in the tuner
                print(f"'{param_name}' tuning = {param}", flush=True)
                return hp.Float(param_name, min_value=param[0], max_value=param[1], step=param[2])
            else:
                raise ValueError(f"'{param_name}' was not inited well {param} tuner is {hp}")
        else:
            raise ValueError(f"'{param_name}' was not defined during init and tuner is used")

    def build(self, hp: Optional[Any]) -> Model:
        """Get keras model with string and feature input and single binary out

        Args:
            hp: hyperparameters object of the tuner or None when constants are used

        Return:
            compiled model

        """
        line_lstm_dropout_rate = self.get_hyperparam("line_lstm_dropout_rate", hp)
        variable_lstm_dropout_rate = self.get_hyperparam("variable_lstm_dropout_rate", hp)
        value_lstm_dropout_rate = self.get_hyperparam("value_lstm_dropout_rate", hp)
        dense_a_drop = self.get_hyperparam("dense_a_drop", hp)
        dense_b_drop = self.get_hyperparam("dense_b_drop", hp)

        # line branch: bidirectional LSTM over a sequence of any length
        line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
        line_lstm = LSTM(units=self.line_shape[1],
                         dtype=self.d_type,
                         dropout=line_lstm_dropout_rate,
                         recurrent_dropout=0)
        line_bidirectional = Bidirectional(layer=line_lstm, name="line_bidirectional")
        line_lstm_branch = line_bidirectional(line_input)

        # variable branch
        variable_input = Input(shape=(None, self.variable_shape[2]), name="variable_input", dtype=self.d_type)
        variable_lstm = LSTM(units=self.variable_shape[1],
                             dtype=self.d_type,
                             dropout=variable_lstm_dropout_rate,
                             recurrent_dropout=0)
        variable_bidirectional = Bidirectional(layer=variable_lstm, name="variable_bidirectional")
        variable_lstm_branch = variable_bidirectional(variable_input)

        # value branch
        value_input = Input(shape=(None, self.value_shape[2]), name="value_input", dtype=self.d_type)
        value_lstm = LSTM(units=self.value_shape[1],
                          dtype=self.d_type,
                          dropout=value_lstm_dropout_rate,
                          recurrent_dropout=0)
        value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
        value_lstm_branch = value_bidirectional(value_input)

        # feature branch: the features are multiplied with softmax weights calculated from the same features
        feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)
        feature_attention = Dense(self.feature_shape[1], activation=Softmax(), use_bias=False,
                                  name="feature_attention")(feature_input)
        x_scaled = Multiply(name="feature_multiply")([feature_input, feature_attention])

        joined_features = Concatenate()([line_lstm_branch, variable_lstm_branch, value_lstm_branch, x_scaled])

        # 3 bidirectional + features
        dense_units = 2 * MlValidator.MAX_LEN + 2 * 2 * ML_HUNK + self.feature_shape[1]
        # check after model compilation. Should be matched the combined size.

        # first hidden layer
        dense_a = Dense(units=dense_units, activation=ReLU(), name="a_dense", dtype=self.d_type)(joined_features)
        drop_a = Dropout(name="a_drop", rate=dense_a_drop)(dense_a)

        # second hidden layer
        dense_b = Dense(units=dense_units, activation=ReLU(), name="b_dense", dtype=self.d_type)(drop_a)
        drop_b = Dropout(name="b_drop", rate=dense_b_drop)(dense_b)

        # single sigmoid output
        dense_final = Dense(units=1, activation='sigmoid', name="prediction", dtype=self.d_type)(drop_b)

        metrics = [BinaryAccuracy(name="binary_accuracy"), Precision(name="precision"), Recall(name="recall")]

        model: Model = Model(inputs=[line_input, variable_input, value_input, feature_input], outputs=dense_final)

        model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=metrics)

        model.summary(line_length=120, expand_nested=True, show_trainable=True)

        return model
================================================
FILE: experiment/model_config_preprocess.py
================================================
import mimetypes
from typing import Dict
import pandas as pd
from credsweeper.app import APP_PATH
from credsweeper.utils.util import Util
ML_CONFIG_PATH = APP_PATH / "ml_model" / "ml_config.json"
def model_config_preprocess(df_all: pd.DataFrame, doc_target: bool) -> Dict[str, float]:
    """Check that ML config matches the dataset and get the thresholds of the config

    File extensions and rule names of the config features are compared with the data.
    When they differ, the config file is rewritten with values from the data.

    Args:
        df_all: full dataset with "ext" and "RuleName" columns
        doc_target: extra characters of documents are appended to the char set when True

    Return:
        thresholds from the config

    Raises:
        RuntimeError: a required feature is missing in the config, or the config file was updated
            and the process must be restarted

    """
    model_config = Util.json_load(ML_CONFIG_PATH)
    ascii_char_set = ''.join(chr(x) for x in range(0x20, 0x7F))
    extra_char_set = "\x1B\t\n\r"  # ESC code, tab and line end variations
    doc_char_set = " ●가개공기께내는님당드등따로메면문밀방번보복본비사생서석성슈스시암에요용워으의이작정주지채체큰키토팅패필하할호화" if doc_target else ''
    # the char set is changed in memory and is saved only when the config is dumped below
    model_config["char_set"] = extra_char_set + ascii_char_set + doc_char_set

    # check whether all extensions from meta are in ml_config.json

    for x in model_config["features"]:
        if "FileExtension" == x["type"]:
            config_extensions = x["kwargs"]["extensions"]
            config_extensions_set = set(config_extensions)
            if len(config_extensions) != len(config_extensions_set):
                print("WARNING: duplicates in config extensions list", flush=True)
            if any(x != x.lower() for x in config_extensions_set):
                print("WARNING: file extensions in config must be in lowercase", flush=True)
            break
    else:
        # the loop was not broken - there is no such feature
        raise RuntimeError(f"FileExtension was not found in config ({ML_CONFIG_PATH}) features!")

    data_extension_set = set(df_all["ext"].unique())

    if config_extensions_set != data_extension_set:
        unknown_extensions = []
        for x in model_config["features"]:
            if "FileExtension" == x["type"]:
                known_extensions = set(x["kwargs"]["extensions"])
                # the list is created again from extensions of the data in sorted order
                x["kwargs"]["extensions"] = []
                for extension in sorted(list(data_extension_set)):
                    if extension in known_extensions or mimetypes.guess_type(f"a_file{extension}")[0]:
                        # use already present extensions and well-known additionally
                        x["kwargs"]["extensions"].append(extension)
                    else:
                        # collect all unknown extensions for error log
                        print(f"UNKNOWN EXTENSION: {extension}", flush=True)
                        unknown_extensions.append(extension)
                Util.json_dump(model_config, ML_CONFIG_PATH)
                if known_extensions != set(x["kwargs"]["extensions"]):
                    # the process must be restarted with updated config
                    raise RuntimeError("RESTART: differences in extensions:"
                                       f"\nconfig:{config_extensions_set.difference(data_extension_set)}"
                                       f"\ndata:{data_extension_set.difference(config_extensions_set)}"
                                       f"\nFile {ML_CONFIG_PATH} was updated."
                                       f"\nUnknown extensions:{unknown_extensions if unknown_extensions else None}")
                break

    # append all rule names for the feature

    for x in model_config["features"]:
        if "RuleName" == x["type"]:
            config_rules = x["kwargs"]["rule_names"]
            config_rules_set = set(config_rules)
            if len(config_rules) != len(config_rules_set):
                print("WARNING: duplicates in config rule_names list", flush=True)
            break
    else:
        # the loop was not broken - there is no such feature
        raise RuntimeError(f"rule_names was not found in config ({ML_CONFIG_PATH}) features!")

    # "RuleName" column keeps lists, so they are exploded to get the unique names
    data_rules_set = set(df_all["RuleName"].explode().unique())

    if config_rules_set != data_rules_set:
        sorted_rules = sorted(list(data_rules_set))
        print("Update config rule names with ", sorted_rules, flush=True)
        for x in model_config["features"]:
            if "RuleName" == x["type"]:
                x["kwargs"]["rule_names"] = sorted_rules
                Util.json_dump(model_config, ML_CONFIG_PATH)
                break
        # the process must be restarted with updated config
        raise RuntimeError(f"RESTART: differences in rules:"
                           f"\nconfig:{config_rules_set.difference(data_rules_set)}"
                           f"\ndata:{data_rules_set.difference(config_rules_set)}"
                           f"\nFile {ML_CONFIG_PATH} was updated.")
    else:
        print(config_rules_set, " matches ", data_rules_set, flush=True)

    thresholds = model_config["thresholds"]
    assert isinstance(thresholds, dict), thresholds
    print(f"Load thresholds: {thresholds}", flush=True)
    return thresholds
================================================
FILE: experiment/plot.py
================================================
import itertools
import math
import pathlib
import matplotlib.pyplot as plt
from keras.src.callbacks import History
from matplotlib import image as mpimg
METRICS = ["loss", "binary_accuracy", "precision", "recall"]
NCOLS = 2 # GRAPHS_PER_ROW
NROWS = math.ceil(len(METRICS) / NCOLS)
def save_plot(stamp: str, title: str, history: History, dir_path: pathlib.Path, best_epoch: int, info: str):
    """Save graphs of the train history for each metric to '<stamp>.png' in the directory

    Args:
        stamp: name of the file without extension, also used as the first part of the title
        title: second part of the title
        history: train history with values of METRICS and their "val_" counterparts
        dir_path: directory for the image
        best_epoch: epoch number which is marked with a vertical line in each graph
        info: text which is printed in left bottom corner of the image

    """
    plt.clf()

    # squeeze=False keeps axes as 2D array even when the grid has a single row or a single column
    fig, axes = plt.subplots(nrows=NROWS, ncols=NCOLS, figsize=(16, 9), tight_layout=True, squeeze=False)

    fig.suptitle(f"{stamp} {title}")

    # train displays "Epoch 1/7", so let the plot starts from 1
    x = [epoch + 1 for epoch in history.epoch]

    # all cells of the grid are walked: cells without a metric are switched off
    for idx, characteristic in itertools.zip_longest(range(NROWS * NCOLS), METRICS):
        # the grid is filled column by column: it is the same placement for 2x2 grid as before
        # and the indices stay in the bounds for any other grid size
        ax = axes[idx % NROWS, idx // NROWS]
        if characteristic:
            y_train = history.history[characteristic]
            y_test = history.history[f"val_{characteristic}"]
            ax.plot(x, y_train, label="train")
            ax.plot(x, y_test, label="test")
            ax.set_title(characteristic)
            ax.legend(loc="upper left")
            ax.grid(visible=True, which="both", color="grey", linewidth=0.75, linestyle="dotted")
            ax.set_xticks(range(min(x), max(x) + 1, 1), minor=True)
            ax.axvline(x=best_epoch, color='green', linestyle='--', linewidth=1)
        else:
            ax.axis('off')
    fig.text(0.001, 0.001, info, fontsize=10, color='green', backgroundcolor='white')
    plt.savefig(dir_path / f"{stamp}.png", dpi=96)
    plt.close('all')
def stamp_plot(stamp: str, dir_path: pathlib.Path, info: str):
    """Print red text over the saved image '<stamp>.png' and overwrite the file

    Args:
        stamp: name of the file without extension
        dir_path: directory of the image
        info: text to print over the image

    """
    png_path = dir_path / f"{stamp}.png"
    picture = mpimg.imread(png_path)
    plt.figure(figsize=(16, 9), tight_layout=True)
    plt.imshow(picture)
    # the position of the text is in coordinates of the image
    plt.text(222, 333, info, fontsize=10, color='red', backgroundcolor='white')
    plt.axis('off')
    plt.savefig(png_path, bbox_inches='tight', pad_inches=0, dpi=96)
    plt.close('all')
================================================
FILE: experiment/prepare_data.py
================================================
import binascii
import hashlib
import os
import pathlib
import subprocess
import sys
from pathlib import Path
from credsweeper.scanner.scanner import RULES_PATH
from credsweeper.utils.util import Util
RESULTS_DIR = pathlib.Path(__file__).parent / "results"
def execute_scanner(dataset_location: str, report_file_str: str, train_rules_str: str, jobs: int, doc_target: bool):
    """Execute CredSweeper as a separate process to make sure no global states is shared with training script

    The arguments are passed as a list without a shell, so paths with spaces or
    shell metacharacters are delivered to CredSweeper unchanged.

    Args:
        dataset_location: root of the dataset, its "data" subdirectory is scanned
        report_file_str: path of JSON report to save
        train_rules_str: path of rules config for the scan
        jobs: number of parallel processes
        doc_target: add --doc option when True

    Raises:
        subprocess.CalledProcessError: the scanner finished with non-zero exit code

    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    command = [
        sys.executable,
        "-m",
        "credsweeper",
        "--jobs",
        str(jobs),
        "--path",
        f"{dataset_location}/data",
    ]
    if doc_target:
        command.append("--doc")
    command.extend([
        "--save-json",
        report_file_str,
        "--rules",
        train_rules_str,
        "--pedantic",
        "--ml_threshold",
        "0",
        "--sort",
        "--subtext",
        "--log",
        "info",
        "--no-stdout",
    ])
    # check_call raises CalledProcessError for non-zero exit code, so the result needs no extra check
    subprocess.check_call(command, cwd=dir_path)
def data_checksum(dir_path: Path) -> str:
    """Calculate a checksum of content of all files in the directory tree

    MD5 digests of the files are combined with XOR, so the result does not depend on the walk order.
    Only the content of files is used - the names are not.

    Args:
        dir_path: root directory

    Return:
        hexadecimal string of the combined digest; MD5 of empty data when there are no files

    """
    combined = hashlib.md5(b'').digest()
    for folder, _, file_names in os.walk(dir_path):
        for file_name in file_names:
            with open(os.path.join(folder, file_name), "rb") as stream:
                file_digest = hashlib.md5(stream.read()).digest()
            combined = bytes(x ^ y for x, y in zip(combined, file_digest))
    return combined.hex()
def prepare_train_data(cred_data_location: str, jobs: int, doc_target: bool):
    """Prepare the scanner report used for training and return dataset checksums.

    The current rules are filtered by "use_ml" and by the target ("doc" or "code")
    and dumped to RESULTS_DIR. The scanner is launched only when a report for the
    same data checksum does not exist yet.

    Returns:
        tuple (meta checksum, data checksum)
    """
    print("Start train data preparation...", flush=True)
    # use current rules
    all_rules = Util.yaml_load(RULES_PATH)
    target = "doc" if doc_target else "code"
    ml_rules = []
    for rule in all_rules:
        if rule.get("use_ml") and target in rule["target"]:
            ml_rules.append(rule)
    train_rules_config_path = RESULTS_DIR / "train_config.yaml"
    Util.yaml_dump(ml_rules, train_rules_config_path)

    dataset_root = Path(cred_data_location)
    meta_dir_checksum = data_checksum(dataset_root / "meta")
    print(f"meta checksum {meta_dir_checksum}", flush=True)
    data_dir_checksum = data_checksum(dataset_root / "data")
    print(f"data checksum {data_dir_checksum}", flush=True)

    # the data checksum is a part of the report name, so it works as a cache key
    detected_data_filename = RESULTS_DIR / f"detected_data.{data_dir_checksum}.json"
    if os.path.exists(detected_data_filename):
        print(f"Get cached result {data_dir_checksum}", flush=True)
    else:
        print(f"Get CredSweeper results from {cred_data_location}. May take some time", flush=True)
        execute_scanner(cred_data_location, str(detected_data_filename), str(train_rules_config_path), jobs,
                        doc_target)
    print("Train data prepared!", flush=True)
    return meta_dir_checksum, data_dir_checksum
================================================
FILE: experiment/requirements.txt
================================================
# Python 3.10.20
# pip 26.1
# version sensitive
h5py==3.12.1
keras==2.15.0
keras-tuner==1.4.7
numpy==1.26.4
onnx==1.17.0
protobuf==3.20.3
scikit-learn==1.6.1
tensorflow-cpu==2.15.1
tf2onnx==1.16.1
wrapt==1.14.1
# version insensitive
types-tensorflow
matplotlib
colorama
psutil
================================================
FILE: experiment/tf2onnx/tf2onnx.sh
================================================
#!/usr/bin/env bash
set -e
# The tensorflow model may be obtained like this: git restore -s be06d6059f0def4f0fdb50444c08db4ce542173e -- ml_model.h5
# Use a virtual environment and the requirements.txt - it is a very specific, lucky combination of package versions
# python -m venv .venv
# . .venv/bin/activate
# python -m pip install --upgrade pip
# python -m pip install --requirement requirements.txt
# [optional] transform the model from h5 to a saved-model directory
python -c 'import tensorflow as tf;model=tf.keras.models.load_model("ml_model.h5");model.save("ml_model")'
# transform the model
python -m tf2onnx.convert --saved-model ml_model --output ml_model.onnx --verbose --rename-inputs feature_input,line_input
# md5sum for integrity
md5sum --binary ml_model.onnx
================================================
FILE: experiment/tools/base64_test.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The script is useful to test patterns of base64 encoded data with 0,1,2 offsets
"""
import base64
import random
import sys
def gen_token(pad: int, txt: bytes) -> bytes:
    """Return url-safe base64 of txt surrounded by random bytes.

    The text is preceded by 3 + pad random bytes and followed by pad + 3 random
    bytes, so pad selects the offset of txt inside a 3-byte base64 group.
    """
    pieces = [
        random.randbytes(3),
        random.randbytes(pad),
        txt,
        random.randbytes(pad),
        random.randbytes(3),
    ]
    return base64.b64encode(b"".join(pieces), altchars=b"-_")
def main(argv):
    """Check that base64 of the inner pattern is found for byte offsets 0, 1 and 2.

    Args:
        argv: command line; the optional argv[1] is the number of loops (1 by default)
    """
    loops = 1
    if 1 < len(argv):
        loops = int(argv[1])
    inner_pattern = b"XgroqX"
    # expected base64 fragment of the pattern for each offset inside a 3-byte group
    for _ in range(loops):
        token0 = gen_token(0, inner_pattern)
        assert b"WGdyb3FY" in token0, token0
        token1 = gen_token(1, inner_pattern)
        assert b"hncm9xW" in token1, token1
        token2 = gen_token(2, inner_pattern)
        assert b"YZ3JvcV" in token2, token2


if __name__ == "__main__":
    main(sys.argv)
================================================
FILE: experiment/tools/entropy_test.py
================================================
import random
import signal
import statistics
import threading
import time
from multiprocessing import Pool
from typing import Tuple, Dict
from credsweeper.common.constants import Chars
from credsweeper.utils.util import Util
# Only declared here: the value is assigned in the __main__ section and read by evaluate_avg
random_data: str
# Number of substrings evaluated for each size in evaluate_avg
ITERATIONS = 1000
def pool_initializer() -> None:
    """Make the calling process ignore SIGINT (used as the Pool initializer)."""
    ignore_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_handler)
def evaluate_avg(_args: Tuple[int, float, float]) -> Tuple[float, float]:
    """Update the lowest mean entropy and the highest deviation for one substring size.

    Args:
        _args: tuple (size, current minimal average, current maximal deviation)

    Returns:
        tuple (minimal average, maximal deviation) after the current random_data is processed
    """
    size, min_avg, max_dvt = _args[0], _args[1], _args[2]
    # consecutive, non-overlapping substrings of random_data
    entropies = [
        Util.get_shannon_entropy(random_data[index * size:index * size + size])  #
        for index in range(ITERATIONS)
    ]
    avg = statistics.mean(entropies)
    dvt = statistics.stdev(entropies, avg)
    return min(min_avg, avg), max(max_dvt, dvt)
if __name__ == "__main__":
    # For every substring size the script keeps the lowest mean entropy and the highest
    # standard deviation seen over all loops, then prints them as unit test parameters.
    random.seed()
    stats: Dict[int, Tuple[float, float]] = {}
    sizes = list(range(8, 36))
    try:
        for n in range(1000):
            start_time = time.time()
            # Module level assignment: evaluate_avg reads this name.
            # NOTE(review): worker processes see the value only when the start method
            # copies the parent globals (e.g. fork) - confirm for other platforms.
            random_data = ''.join([random.choice(Chars.BASE32_CHARS.value) for _ in range(ITERATIONS * max(sizes))])
            # previous results are passed to the workers; 9.9 and 0.0 are the initial bounds
            _args = [(i, *stats.get(i, (9.9, 0.0))) for i in sizes]
            with Pool(processes=min(15, len(_args)), initializer=pool_initializer) as pool:
                # The results are consumed only in this process, so no lock is required
                # (a lock created per iteration would not synchronize anything anyway).
                for _size, _res in zip(sizes, pool.map(evaluate_avg, _args)):
                    stats[_size] = _res
            for k, v in stats.items():
                print(f"{k}: {v}", flush=True)
            print(f"loop {n} in {time.time() - start_time}", flush=True)
    except KeyboardInterrupt as exc:
        print(exc, flush=True)
    finally:
        print("===========================================================", flush=True)
        for k, v in stats.items():
            # for parametrization of unit tests
            print(f"({k}, {v[0]}, {v[1]}),", flush=True)
================================================
FILE: experiment/tools/morpheme_test.py
================================================
import random
import signal
import threading
import time
from multiprocessing import Pool
from typing import Tuple, Dict
from credsweeper.common import KeywordChecklist
from credsweeper.common.constants import BASE64COMMON
# Only declared here: the value is assigned in the __main__ section and read by evaluate_avg
random_data: str
# Number of substrings evaluated for each size in evaluate_avg
ITERATIONS = 1000
class KeywordChecklistTest(KeywordChecklist):
    """KeywordChecklist extension which counts morphemes found in a line."""

    def calc(self, line_lower: str) -> int:
        """Return the number of items from morpheme_set which are substrings of line_lower."""
        return sum(1 for morpheme in self.morpheme_set if morpheme in line_lower)
# Module level instance, used by evaluate_avg
counter = KeywordChecklistTest()
def pool_initializer() -> None:
    """Make the calling process ignore SIGINT (used as the Pool initializer)."""
    interrupt_signal = signal.SIGINT
    signal.signal(interrupt_signal, signal.SIG_IGN)
def evaluate_avg(_args: Tuple[int, float, float]) -> Tuple[float, float]:
    """Update the lowest non-zero and the highest morpheme count for one substring size.

    Args:
        _args: tuple (size, current minimum, current maximum)

    Returns:
        tuple (minimum, maximum) after the current random_data is processed
    """
    size, lowest, highest = _args[0], _args[1], _args[2]
    for index in range(ITERATIONS):
        begin = index * size
        found = counter.calc(random_data[begin:begin + size])
        # zero matches never lower the minimum
        if 0 < found < lowest:
            lowest = found
        if found > highest:
            highest = found
    return lowest, highest
if __name__ == "__main__":
    # For every substring size the script keeps the lowest non-zero and the highest
    # morpheme count seen over all loops, then prints them as unit test parameters.
    random.seed()
    stats: Dict[int, Tuple[float, float]] = {}
    sizes = [4, 8, 16, 32, 40, 64, 70, 80, 90, 100, 128, 256, 512, 1024]
    try:
        for n in range(100):
            start_time = time.time()
            # Module level assignment: evaluate_avg reads this name.
            # NOTE(review): worker processes see the value only when the start method
            # copies the parent globals (e.g. fork) - confirm for other platforms.
            random_data = ''.join([random.choice(BASE64COMMON) for _ in range(ITERATIONS * max(sizes))])
            # previous results are passed to the workers; 9.9 and 0.0 are the initial bounds
            _args = [(i, *stats.get(i, (9.9, 0.0))) for i in sizes]
            with Pool(processes=min(15, len(_args)), initializer=pool_initializer) as pool:
                # The results are consumed only in this process, so no lock is required
                # (a lock created per iteration would not synchronize anything anyway).
                for _size, _res in zip(sizes, pool.map(evaluate_avg, _args)):
                    stats[_size] = _res
            for k, v in stats.items():
                print(f"{k}: {v}", flush=True)
            print(f"loop {n} in {time.time() - start_time}", flush=True)
    except KeyboardInterrupt as exc:
        print(exc, flush=True)
    finally:
        print("===========================================================", flush=True)
        for k, v in stats.items():
            # for parametrization of unit tests
            print(f"({k}, {v[0]}, {v[1]}),", flush=True)
================================================
FILE: experiment/tools/strength_test.py
================================================
import random
import signal
import statistics
import threading
import time
from multiprocessing import Pool
from typing import Tuple, Dict
from credsweeper.common.constants import Chars
from credsweeper.utils.hop_stat import HopStat
# Module level instance, used by evaluate_avg
hopper = HopStat()
# Number of random strings generated for each size in evaluate_avg
ITERATIONS = 10000000
# Alphabet the random strings are built from; it is also printed with the results
BASE = Chars.BASE36_CHARS.value
def pool_initializer() -> None:
    """Make the calling process ignore SIGINT (used as the Pool initializer)."""
    signal.signal(
        signal.SIGINT,
        signal.SIG_IGN,
    )
def evaluate_avg(size) -> Tuple[Tuple[float, float], Tuple[float, float]]:
    """Collect hopper.stat results for ITERATIONS random strings of the given size.

    Returns:
        ((mean, stdev) of the first stat value, (mean, stdev) of the second stat value)
    """
    hop_values = []
    dev_values = []
    for _ in range(ITERATIONS):
        sample = ''.join(random.choices(BASE, k=size))
        hop_value, dev_value = hopper.stat(sample)
        hop_values.append(hop_value)
        dev_values.append(dev_value)
    hop_mean = statistics.mean(hop_values)
    hop_spread = statistics.stdev(hop_values, hop_mean)
    dev_mean = statistics.mean(dev_values)
    dev_spread = statistics.stdev(dev_values, dev_mean)
    return (hop_mean, hop_spread), (dev_mean, dev_spread)
if __name__ == "__main__":
    # Evaluates every size in a separate worker and prints the collected statistics
    try:
        # size -> ((mean, stdev) of the first stat value, (mean, stdev) of the second stat value)
        stats: Dict[int, Tuple[Tuple[float, float], Tuple[float, float]]] = {}
        sizes = [8, 10, 15, 16, 20, 24, 25, 32, 40, 50, 64]
        start_time = time.time()
        with Pool(processes=min(16, len(sizes)), initializer=pool_initializer) as pool:
            # The results are consumed only in this process, so no lock is required
            # (a lock created per iteration would not synchronize anything anyway).
            for _size, _res in zip(sizes, pool.map(evaluate_avg, sizes)):
                stats[_size] = _res
        print(f"done in {time.time() - start_time} for {BASE}", flush=True)
        for k, v in stats.items():
            print(f"{k}: {v},", flush=True)
    except KeyboardInterrupt as exc:
        print(exc, flush=True)
# base32 results
# 8: ((3.480934, 0.8482364556537906), (1.9280820731422028, 0.5833143826506801)),
# 10: ((3.4801753333333334, 0.7508676237320747), (1.9558544090983234, 0.5119385414964345)),
# 15: ((3.4803549285714284, 0.603220270918794), (1.9896690734372564, 0.40640877687972476)),
# 16: ((3.4798649333333334, 0.5837818960141307), (1.9938368543943692, 0.392547066949958)),
# 20: ((3.4809878947368422, 0.518785674729997), (2.0058661928593517, 0.34692788889724946)),
# 24: ((3.480511086956522, 0.4726670109337228), (2.0131379532992537, 0.31476354168931936)),
# 25: ((3.480877375, 0.4626150412368404), (2.0147828593929953, 0.3075894753390553)),
# 32: ((3.4809023548387095, 0.4072672632996217), (2.0231609118646867, 0.2700344059876962)),
# 40: ((3.4801929743589746, 0.36361457820793436), (2.027858606807074, 0.2401498396303172)),
# 50: ((3.4798551224489795, 0.323708167297437), (2.0318808048208794, 0.2138098551294688)),
# 64: ((3.4805990476190476, 0.28572156450556774), (2.035756800745673, 0.18815721535870078)),
# base36 result
# 8: ((3.7190542428571427, 0.8995506118495411), (2.066095086865182, 0.609210293352161)),
# 10: ((3.719109611111111, 0.7956463384852813), (2.0946299036665494, 0.5322004874842623)),
# 15: ((3.719274257142857, 0.6401989313894239), (2.129437216268589, 0.42108786288993155)),
# 16: ((3.7192072666666665, 0.6188627491757901), (2.1336109506109366, 0.4064699817331141)),
# 20: ((3.719249815789474, 0.5506473627709657), (2.145293932511567, 0.3591543917048417)),
# 24: ((3.7191934304347827, 0.50051922802262), (2.152858549996053, 0.3252064160191062)),
# 25: ((3.7192351583333334, 0.4904181410613897), (2.1543202565038735, 0.31823801389315026)),
# 32: ((3.7190408419354837, 0.4315967526660196), (2.1620321219700767, 0.2788634701820312)),
# 40: ((3.7191682666666668, 0.3852248727988986), (2.16746680811131, 0.24802261318501675)),
# 50: ((3.718913744897959, 0.3436564880405547), (2.1715676118603806, 0.22070510537297627)),
# 64: ((3.7190009761904763, 0.30325954360127116), (2.1751172797904093, 0.1942582237461476)),
# base64 results
# done in 130.86447429656982 for 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_
# 8: ((3.7627115714285715, 0.9413431166706269), (2.1378378843992736, 0.6394596814295781)),
# 10: ((3.7617393333333333, 0.8327986018456262), (2.168873183866972, 0.5605393324056347)),
# 15: ((3.7619624285714286, 0.6698092646328063), (2.2080058406286702, 0.4447698491992352)),
# 16: ((3.7618573333333334, 0.6471500119793832), (2.2116826642934453, 0.4288377928263507)),
# 20: ((3.7618887368421055, 0.575813792926031), (2.224384985667721, 0.37985781543221253)),
# 24: ((3.7621449565217393, 0.5243297908608613), (2.2326041329976607, 0.34397389723600613)),
# 25: ((3.762616791666667, 0.5137934920050976), (2.234571917211925, 0.3366547036535176)),
# 32: ((3.761885838709677, 0.4521158322065318), (2.2426375800006153, 0.29506039075960255)),
# 40: ((3.7622649487179487, 0.4031261511824518), (2.2485911621253574, 0.2622954601051068)),
# 50: ((3.762087693877551, 0.3597404118023357), (2.2533774423872956, 0.23384524947332655)),
# 64: ((3.7625271746031745, 0.31733579704946846), (2.257532519514275, 0.20571908142867643)),
================================================
FILE: experiment/train.py
================================================
import hashlib
import os
import pathlib
import pickle
import random
import subprocess
import sys
from datetime import datetime
import keras_tuner as kt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model # type: ignore
from numpy import ndarray
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from data_loader import read_detected_data, read_metadata, join_label, get_y_labels
from experiment.evaluate_model import evaluate_model
from features import prepare_data
from hyperparameters import HP_DICT
from log_callback import LogCallback
from ml_model import MlModel
from model_config_preprocess import model_config_preprocess, ML_CONFIG_PATH
from plot import save_plot
from prepare_data import prepare_train_data, RESULTS_DIR
def train(
    cred_data_location: str,
    jobs: int,
    epochs: int,
    batch_size: int,
    patience: int,
    doc_target: bool,
    use_tuner: bool,
    eval_test: bool,
    eval_train: bool,
    eval_full: bool,
) -> str:
    """Train the ML model, store it in RESULTS_DIR and convert it to ONNX.

    Steps: prepare (or read cached) labeled data, preprocess the ML config, split the
    data into train and test subsets, optionally search hyperparameters with the
    tuner, fit the model, optionally evaluate it, convert the saved model to ONNX
    and save the history plot.

    Args:
        cred_data_location: dataset root with "data" and "meta" subdirectories
        jobs: number of jobs passed to the data preparation
        epochs: number of epochs for the tuner search and for the fit
        batch_size: batch size for the tuner search and for the fit
        patience: patience of the early stopping callbacks
        doc_target: passed to the data preparation and to the ML config preprocessing
        use_tuner: search hyperparameters with keras-tuner before the fit
        eval_test: evaluate the model on the test subset
        eval_train: evaluate the model on the train subset
        eval_full: evaluate the model on the full data set

    Returns:
        absolute path of the saved model

    Raises:
        RuntimeError: the ML config preprocessing did not succeed
        subprocess.CalledProcessError: a helper process (md5sum, tf2onnx) failed
    """
    # fixed seed for std.random in main()
    tf.random.set_seed(random.randint(1, 0xffffffff))
    np.random.seed(random.randint(1, 0xffffffff))

    print(f"Memory at start: {LogCallback.get_memory_info()}", flush=True)
    subprocess.check_call(f"md5sum {ML_CONFIG_PATH.absolute()}", shell=True)  # dbg

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

    os.makedirs(RESULTS_DIR, exist_ok=True)

    print(f"Train model on data from {cred_data_location}", flush=True)
    meta_checksum, data_checksum = prepare_train_data(cred_data_location, jobs, doc_target)

    # the joined data frame is cached with both checksums in the file name
    df_all_file = RESULTS_DIR / f"{meta_checksum}-{data_checksum}.pkl"
    if df_all_file.exists():
        df_all = pd.read_pickle(df_all_file)
        print(f"Read from {df_all_file}", flush=True)
    else:
        # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
        detected_data = read_detected_data(RESULTS_DIR / f"detected_data.{data_checksum}.json")
        print(f"CredSweeper detected {len(detected_data)} credentials without ML", flush=True)
        # all markup data
        meta_data = read_metadata(f"{cred_data_location}/meta")
        print(f"Metadata markup: {len(meta_data)} items", flush=True)
        df_all = join_label(detected_data, meta_data, cred_data_location)
        # np.save(df_all_file, df_all)
        df_all.to_pickle(df_all_file)
        print(f"Stored to {df_all_file}", flush=True)
        # to prevent extra memory consumption - delete unnecessary objects
        del detected_data
        del meta_data

    # workaround for CI step
    trial_cnt = 3
    while 0 < trial_cnt:
        trial_cnt -= 1
        # there are 2 times possible fails due ml config might be updated
        try:
            thresholds = model_config_preprocess(df_all, doc_target)
            break
        except RuntimeError as exc:
            # trial_cnt is never negative here, so only the "RESTART:" marker decides about a retry
            if "RESTART:" in str(exc) and 0 <= trial_cnt:
                print(str(exc), flush=True)
                continue
            else:
                raise exc
    else:
        # all trials were spent without a successful preprocessing
        raise RuntimeError("Something went wrong")

    # random split
    # the seed is limited with 0xffffffff (2**32 - 1): random.randint includes the upper bound
    # and 1 << 32 is out of the range which is accepted for random_state
    df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=random.randint(1, 0xffffffff))
    len_df_train = len(df_train)
    print(f"Train size: {len_df_train}", flush=True)
    len_df_test = len(df_test)
    print(f"Test size: {len_df_test}", flush=True)

    print(f"Prepare full data", flush=True)
    x_full_line, x_full_variable, x_full_value, x_full_features = prepare_data(df_all)
    y_full: ndarray = get_y_labels(df_all)
    del df_all

    print(f"Prepare train data", flush=True)
    x_train_line, x_train_variable, x_train_value, x_train_features = prepare_data(df_train)
    print("x_train_value dtype ", x_train_value.dtype, flush=True)  # dbg
    print("x_train_features dtype", x_train_features.dtype, flush=True)  # dbg
    y_train = get_y_labels(df_train)
    print("y_train dtype", y_train.dtype, flush=True)  # dbg
    del df_train

    print(f"Class-1 prop on train: {np.mean(y_train):.4f}", flush=True)

    # balanced class weights are normalized, so the biggest weight is 1
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    max_weight = max(class_weights)
    class_weights = [weight / max_weight for weight in class_weights]
    print(f"y_train size:{len(y_train)}, 0: {np.count_nonzero(y_train == 0)}, 1: {np.count_nonzero(y_train == 1)}",
          flush=True)
    class_weight = dict(zip(classes, class_weights))
    print(f"class_weight: {class_weight}", flush=True)  # information about class weights

    print(f"Prepare test data", flush=True)
    x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
    y_test = get_y_labels(df_test)
    print(f"Class-1 prop on test: {np.mean(y_test):.4f}", flush=True)
    del df_test

    print(f"Memory before search / compile: {LogCallback.get_memory_info()}", flush=True)

    log_callback = LogCallback()
    if use_tuner:
        print(f"Tuner initial dict:{HP_DICT}", flush=True)
        # the first item of every HP_DICT value is passed to the hypermodel of the tuner
        tuner_kwargs = {k: v[0] for k, v in HP_DICT.items()}
        print(f"Tuner kwargs:{tuner_kwargs}", flush=True)

        tuner = kt.BayesianOptimization(
            hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
                               **tuner_kwargs),
            objective='val_loss',
            directory=str(RESULTS_DIR / f"{current_time}.tuner"),
            project_name='ml_tuning',
            seed=random.randint(1, 0xffffffff),
            max_trials=30,
        )
        search_early_stopping = EarlyStopping(monitor="val_loss",
                                              patience=patience,
                                              mode="min",
                                              restore_best_weights=True,
                                              verbose=1)
        tuner.search(
            x=[x_train_line, x_train_variable, x_train_value, x_train_features],
            y=y_train,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[search_early_stopping, log_callback],
            validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
            verbose=2,
        )
        print("Best Hyperparameters:", flush=True)
        for k, v in tuner.get_best_hyperparameters()[0].values.items():
            print(f"{k}: {v}", flush=True)
        param_kwargs = {k: float(v) for k, v in tuner.get_best_hyperparameters()[0].values.items() if k in HP_DICT}
        del tuner
    else:
        print(f"Model is trained with params from dict:{HP_DICT}", flush=True)
        # the second item of every HP_DICT value is used when the tuner is skipped
        param_kwargs = {k: v[1] for k, v in HP_DICT.items()}

    print(f"Model hyper parameters: {param_kwargs}", flush=True)

    # repeat train step to obtain actual history chart
    _model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
                     **param_kwargs)
    keras_model = _model.build(hp=None)  # this train will be used hyperparam in param_kwargs

    if not eval_full:
        # the data are not necessary
        del x_full_line
        del x_full_variable
        del x_full_value
        del x_full_features
        del y_full

    early_stopping = EarlyStopping(monitor="val_loss",
                                   patience=patience,
                                   mode="min",
                                   restore_best_weights=True,
                                   verbose=1)
    model_checkpoint = ModelCheckpoint(filepath=str(RESULTS_DIR / f"{current_time}.best_model"),
                                       monitor="val_loss",
                                       save_best_only=True,
                                       mode="min",
                                       verbose=1)

    print(f"Memory before train: {LogCallback.get_memory_info()}", flush=True)

    fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
                                  y=y_train,
                                  batch_size=batch_size,
                                  epochs=epochs,
                                  verbose=2,
                                  validation_data=([x_test_line, x_test_variable, x_test_value,
                                                    x_test_features], y_test),
                                  class_weight=class_weight,
                                  callbacks=[early_stopping, model_checkpoint, log_callback],
                                  use_multiprocessing=True)

    # if best_val_loss is not None and best_val_loss + 0.00001 < early_stopping.best:
    #     print(f"CHECK BEST TUNER EARLY STOP : {best_val_loss} vs CURRENT: {early_stopping.best}",flush=True)

    print(f"Memory after train: {LogCallback.get_memory_info()}", flush=True)

    with open(RESULTS_DIR / f"{current_time}.history.pickle", "wb") as f:
        pickle.dump(fit_history, f)

    model_file_name = RESULTS_DIR / f"ml_model_at-{current_time}"
    keras_model.save(model_file_name, include_optimizer=False)

    if eval_test:
        print(f"Validate results on the test subset. Size: {len(y_test)} {np.mean(y_test):.4f}", flush=True)
        evaluate_model(thresholds, keras_model, [x_test_line, x_test_variable, x_test_value, x_test_features], y_test)
    # drop small test set first to free a bit more memory for next evaluation
    del x_test_line
    del x_test_variable
    del x_test_value
    del x_test_features
    del y_test

    if eval_train:
        print(f"Validate results on the train subset. Size: {len(y_train)} {np.mean(y_train):.4f}", flush=True)
        evaluate_model(thresholds, keras_model, [x_train_line, x_train_variable, x_train_value, x_train_features],
                       y_train)
    del x_train_line
    del x_train_variable
    del x_train_value
    del x_train_features
    del y_train

    if eval_full:
        # the full data exist only when eval_full is set - otherwise they were deleted before the fit
        print(f"Validate results on the full set. Size: {len(y_full)} {np.mean(y_full):.4f}", flush=True)
        evaluate_model(thresholds, keras_model, [x_full_line, x_full_variable, x_full_value, x_full_features], y_full)
        del x_full_line
        del x_full_variable
        del x_full_value
        del x_full_features
        del y_full

    onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
    # convert the model to onnx right now
    convert_args = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
                   f" --output {str(onnx_model_file)} --verbose"
    subprocess.check_call(convert_args, shell=True, cwd=pathlib.Path(__file__).parent)
    with open(onnx_model_file, "rb") as f:
        onnx_md5 = hashlib.md5(f.read()).hexdigest()
        print(f"ml_model.onnx:{onnx_md5}", flush=True)

    with open(ML_CONFIG_PATH, "rb") as f:
        config_md5 = hashlib.md5(f.read()).hexdigest()
        print(f"ml_config.json:{config_md5}", flush=True)

    # epochs are counted from 1 on the chart
    best_epoch = 1 + np.argmin(np.array(fit_history.history['val_loss']))

    # ml history analysis
    save_plot(
        stamp=current_time,
        title=f"batch:{batch_size} train:{len_df_train} test:{len_df_test} weights:{class_weights}",
        history=fit_history,
        dir_path=RESULTS_DIR,
        best_epoch=int(best_epoch),
        info=f"ml_config.json:{config_md5} ml_model.onnx:{onnx_md5} best_epoch:{best_epoch}",
    )

    return str(model_file_name.absolute())
================================================
FILE: fuzz/README.md
================================================
# Fuzzing of CredSweeper API
The directory is used for dynamic analysis of CredSweeper using [atheris](https://github.com/google/atheris),
based on [LibFuzzer](https://llvm.org/docs/LibFuzzer.html#options)
## Preparation
- The same interpreter packages as for CredSweeper + atheris + coverage (optional).
Working dir is project root - to be sure current source of credsweeper is used for coverage.
Preferred to use virtual environment.
```bash
python3.8 -m virtualenv --copies .venv
. .venv/bin/activate
pip install -U pip
pip install -r requirements.txt
```
## Fuzzing
Launch fuzzing script to collect seed files.
```bash
fuzzing.sh
```
-atheris_runs - must be greater than corpus files in 'corpus' directory.
Many interactions require more rss memory - the limit must be decided.
Then after productive fuzzing there will be new corpus files.
Some of them are reduced from others. Some - new for imported libs.
The launch does not require the coverage module but does require instrumentation.
## Coverage
Launch fuzzing script to calculate coverage with provided corpus files.
```bash
coveraging.sh
```
To generate HTML report use ```coverage html``` in project root (where .coverage file exists) after fuzzing.
Instrumentation is not required - so it can be skipped.
## Reducing
Launch reducing script to reduce corpus files only for 'NEW'.
```bash
reducing.sh
```
The script uses the -merge function of libFuzzer to reduce corpus files with multiple interactions.
Full instrumentation is preferred.
## Minimizing
Launch the script to remove corpus files that do not impact on credsweeper.
```bash
minimizing.sh
```
The script uses coverage package to determine which corpus files do not change overall coverage and removes them.
The process is slow because each corpus file has to be checked. Instrumentation is not necessary.
NOTE: some seeds may be dropped because a complicated expression is treated as one line/branch.
e.g.:```if 0x01 == a[0] and 0x02 == a[1]:``` then seed [0x01,0x02] is kept, but [0x01,0x00] will be removed.
## Useful commands for seed files modification
```bash
for f in $(ls); do h=$(sha1sum $f|cut -c-40); mv -v $f $h; done
```
Renames each seed file according to the sha1 digest of its content.
================================================
FILE: fuzz/__main__.py
================================================
#!/usr/bin/env python
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import io
import logging
import os
import sys
import warnings
from unittest.mock import patch
import atheris
# # # In simple case interested lib(s) may be imported during 'with'
# # # It runs quickly but not precisely
# with atheris.instrument_imports(enable_loader_override=False):
from bs4 import XMLParsedAsHTMLWarning
from credsweeper.app import CredSweeper
from credsweeper.common.constants import DiffRowType
from credsweeper.file_handler.files_provider import FilesProvider
from credsweeper.file_handler.patches_provider import PatchesProvider
from tests import ZERO_ML_THRESHOLD
# bs4 warning about XML parsed as HTML is not interesting for fuzzing
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
# set log level for fuzzing
logging.basicConfig(level=logging.CRITICAL)
logger = logging.getLogger(__name__)
# Use depth=3 to deep scan in .zip and .gz files + find by extension feature
# The single instance is reused for all inputs: doc and depth are changed in fuzz_credsweeper_scan
cred_sweeper = CredSweeper(find_by_ext=True, ml_threshold=ZERO_ML_THRESHOLD, color=True, hashed=True,
                           subtext=True, sort_output=True, thrifty=True)
# Maximal size of data which are scanned; it is also passed to libFuzzer as -max_len in main()
INPUT_DATA_SIZE = 0x1000
def fuzz_credsweeper_scan(data: bytes):
    """Fuzzing entry point: scan the same input in four configurations.

    The input is scanned as an added patch, as a deleted patch and twice as a file
    ("dummy" and "dummy.xml") with different doc and depth settings. Results export
    is mocked. All collected candidates are converted with to_dict_list at the end.
    """
    # seed file name is sha1 of the content
    file_name = hashlib.sha1(data).hexdigest()
    fdp = atheris.FuzzedDataProvider(data)
    # offset:0x0000
    to_scan = fdp.ConsumeBytes(INPUT_DATA_SIZE)
    logger.debug("%s >>>>>>>> %s", file_name, to_scan.decode(encoding='ascii', errors="ignore"))

    # (doc, depth, provider factory) of every scan pass in the order of execution;
    # a factory is used to create the provider only after the configuration is applied
    scan_passes = (
        (False, 3, lambda: PatchesProvider([io.BytesIO(to_scan)], change_type=DiffRowType.ADDED)),
        (False, 0, lambda: PatchesProvider([io.BytesIO(to_scan)], change_type=DiffRowType.DELETED)),
        (True, 3, lambda: FilesProvider(["dummy", io.BytesIO(to_scan)])),
        (False, 0, lambda: FilesProvider(["dummy.xml", io.BytesIO(to_scan)])),
    )

    candidates = []
    for doc_mode, scan_depth, make_provider in scan_passes:
        cred_sweeper.config.doc = doc_mode
        cred_sweeper.config.depth = scan_depth
        # candidates of the previous pass must not be mixed with the current results
        cred_sweeper.credential_manager.candidates.clear()
        content_provider = make_provider()
        with patch.object(CredSweeper, CredSweeper.export_results.__name__):
            cred_sweeper.run(content_provider)
        candidates.extend(cred_sweeper.credential_manager.get_credentials())

    for candidate in candidates:
        candidate.to_dict_list(False, False)
def main():
    """Set up atheris with fuzz_credsweeper_scan as the test function and start fuzzing."""
    # # # Instrument all works with ~30K functions. It is slow, but necessary for fuzzing for new seeds and reducing.
    # # # Instrumentation may be skipped when checking coverage with existing seeds or seeds minimization.
    if os.getenv('DO_ATHERIS_INSTRUMENT'):
        atheris.instrument_all()
    # the input size limit is appended to the command line arguments
    fuzzer_args = sys.argv + [f"-max_len={INPUT_DATA_SIZE}"]  # -rss_limit_mb=6912
    atheris.Setup(fuzzer_args, fuzz_credsweeper_scan, internal_libfuzzer=True, enable_python_coverage=True)
    atheris.Fuzz()


if __name__ == "__main__":
    main()
================================================
FILE: fuzz/auxilary.py
================================================
import binascii
import random
import string
import sys
OLD_SEED_SIZE = 2048
NEW_SEED_SIZE = 4096 - 256
# run in fuzz: for f in $(find corpus -type f); do python3 auxilary.py $f; done
def main(argv):
ascii_chars = string.digits + string.ascii_letters + string.punctuation + ' '
responses = [
b'{"status":"PASS"}',
b'{"status":"REQUEST_DENIED","error_message":"The provided API key is invalid."}',
b'{"status":"REQUEST_DENIED","error_message":"This API project is not authorized to use this API."}',
b'{"ok":1}',
b'{"error":1}',
b'{"error":"invalid_auth"}',
b'You are being |