Repository: xming521/WeClone Branch: master Commit: 1779b5d8af03 Files: 63 Total size: 285.9 KB Directory structure: gitextract_wor7olkb/ ├── .cursor/ │ └── rules/ │ └── weclone-rules.mdc ├── .github/ │ ├── issue-labeler.yml │ ├── weclone-release-event.json │ └── workflows/ │ ├── issue-labeler.yml │ └── tg_release_notification.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── README_zh.md ├── dataset/ │ ├── eval/ │ │ ├── test_data-en.json │ │ ├── test_data-privacy.json │ │ └── test_data-zh.json │ ├── media/ │ │ └── images/ │ │ └── .gitkeep │ ├── res_csv/ │ │ └── sft/ │ │ └── dataset_info.json │ └── telegram/ │ └── .gitkeep ├── ds_config.json ├── examples/ │ ├── mllm.template.jsonc │ └── tg.template.jsonc ├── pyproject.toml ├── settings.template.jsonc ├── tests/ │ ├── __init__.py │ ├── configs/ │ │ ├── Qwen2.5-VL.jsonc │ │ └── qwen2.5.jsonc │ ├── test_PII.py │ ├── test_full_pipe.py │ └── tests_data/ │ ├── test_PII/ │ │ └── test_0_730.csv │ ├── test_model_data.json │ └── test_person/ │ └── test_0_730.csv └── weclone/ ├── __init__.py ├── cli.py ├── core/ │ ├── PII/ │ │ ├── __init__.py │ │ └── pii_detector.py │ └── inference/ │ ├── offline_infer.py │ └── online_infer.py ├── data/ │ ├── __init__.py │ ├── chat_parsers/ │ │ └── telegram_parser.py │ ├── clean/ │ │ ├── __init__.py │ │ └── strategies.py │ ├── models.py │ ├── qa_generator.py │ ├── strategies.py │ └── utils.py ├── eval/ │ ├── __init__.py │ ├── cli_demo.py │ ├── eval_model.py │ ├── test_model.py │ └── web_demo.py ├── prompts/ │ ├── __init__.py │ └── clean_data.py ├── server/ │ ├── __init__.py │ └── api_service.py ├── train/ │ ├── __init__.py │ ├── export_model.py │ └── train_sft.py └── utils/ ├── __init__.py ├── config.py ├── config_models.py ├── i18n.py ├── length_cdf.py ├── log.py ├── retry.py └── tools.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: 
.cursor/rules/weclone-rules.mdc ================================================ --- description: globs: alwaysApply: true --- --- description: globs: alwaysApply: true --- # Your rule content - You can @ files here - The project uses uv as the package manager and pyproject.toml as the project configuration file. - You should write as few code comments as possible. - Prefer using the encapsulated logger `from weclone.utils.log import logger` for printing. ================================================ FILE: .github/issue-labeler.yml ================================================ # 添加 Discussion 标签 Discussion: - '(讨论|交流|分享|意见|建议|思考|探讨|交换意见|brainstorm|discussion)' # 添加 bug 标签 bug: - '(bug|错误|问题|失败|崩溃|异常|报错|不工作|无法运行|broken|crash|error|exception|fails)' # 添加 chatbot 标签 chatbot: - '(聊天机器人|chatbot|chat bot|对话机器人|聊天助手|AI助手|机器人对话|bot|assistant)' # 添加 documentation 标签 documentation: - '(文档|说明|使用指南|指导|手册|教程|文档更新|documentation|docs|guide|tutorial|readme)' # 添加 duplicate 标签 duplicate: - '(重复|已有|duplicate|已经存在|已提交过|重复问题|重复报告|dup)' # 添加 feature 标签 feature: - '(功能|特性|新增|增加|添加|实现|feature|enhancement|新功能|功能请求|feature request)' # 添加 good first issue 标签 good first issue: - '(入门|简单|容易|新手|初学者|开始|first|beginner|starter|easy|简单任务|good first issue)' # 添加 help wanted 标签 help wanted: - '(需要帮助|寻求帮助|请求协助|help|求助|协助|帮忙|help wanted|need help|assistance)' # 添加 invalid 标签 invalid: - '(无效|不适用|不相关|无关|错误提交|invalid|not relevant|irrelevant|not applicable)' # 添加 Mac 标签 Mac: - '(Mac|MacOS|macOS|OSX|Mac系统|苹果系统|苹果电脑|MacBook)' # 添加 question 标签 question: - '(问题|疑问|如何|怎么|请问|是否|能否|可以吗|question|how to|what is|why)' # 添加 Windows 标签 Windows: - '(Windows|微软|Win10|Win11|Windows系统|微软系统|win)' ================================================ FILE: .github/weclone-release-event.json ================================================ { "action": "published", "release": { "id": 123456789, "tag_name": "v0.2.24", "target_commitish": "main", "name": "v0.2.24", "body": "## 🥰 What's Changed\n - Update torch version to 
2.7.0 and vllm version to 0.9.1, switch offline inference to chat-style invocation\n - Add `test_model_args` and `vllm_args` configuration items to allow custom test dataset files\n - Add config file path option in CLI, support setting WECLONE_CONFIG_PATH environment variable\n - Update max_new_tokens and enable_thinking parameters in data cleaning strategy to optimize inference\n - Partial feature adaptation for qwen3\n \n ## 🐛 Bug fix\n fix #158 fix #83 fix #77 fix #69 \n \n **Full Changelog**: https://github.com/xming521/WeClone/compare/v0.2.23...v0.2.24\n \n ## 🥰 更新内容\n - 更新torch版本至2.7.0,vllm版本到0.9.1,离线推理改为chat方式调用\n - 添加`test_model_args` and `vllm_args`配置项,允许自定义测试集文件\n - CLI中添加配置文件路径选项,支持设置WECLONE_CONFIG_PATH环境变量\n - 更新数据清理策略中的max_new_tokens和enable_thinking参数以优化推理过程\n - 部分功能适配qwen3", "draft": false, "prerelease": false, "created_at": "2024-01-15T10:30:00Z", "published_at": "2024-01-15T10:30:00Z", "author": { "login": "xming521", "id": 12345, "avatar_url": "https://avatars.githubusercontent.com/u/12345?v=4", "html_url": "https://github.com/xming521" }, "html_url": "https://github.com/xming521/WeClone/releases/tag/v0.2.24", "assets_url": "https://api.github.com/repos/xming521/WeClone/releases/123456789/assets", "upload_url": "https://uploads.github.com/repos/xming521/WeClone/releases/123456789/assets{?name,label}", "tarball_url": "https://api.github.com/repos/xming521/WeClone/tarball/v0.2.24", "zipball_url": "https://api.github.com/repos/xming521/WeClone/zipball/v0.2.24", "assets": [] }, "repository": { "id": 987654321, "name": "WeClone", "full_name": "xming521/WeClone", "owner": { "login": "xming521", "id": 12345 }, "private": false, "html_url": "https://github.com/xming521/WeClone", "description": "WeClone - AI Clone Repository", "fork": false, "created_at": "2023-01-01T00:00:00Z", "updated_at": "2024-01-15T10:30:00Z", "pushed_at": "2024-01-15T10:25:00Z", "clone_url": "https://github.com/xming521/WeClone.git", "default_branch": "main" }, "sender": { "login": 
"xming521", "id": 12345, "avatar_url": "https://avatars.githubusercontent.com/u/12345?v=4", "html_url": "https://github.com/xming521" } } ================================================ FILE: .github/workflows/issue-labeler.yml ================================================ name: add labels to Issues on: issues: types: [opened, edited] jobs: label_issues: runs-on: ubuntu-latest permissions: issues: write contents: read steps: - name: get_last_run_time id: last_run run: | # 获取当前日期减去 1 天作为默认值(处理最近一天的 issues) echo "date=$(date -d '1 day ago' -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT - name: RegEx Issue Labeler uses: github/issue-labeler@v3.4 with: include-title: 1 repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: .github/issue-labeler.yml enable-versioned-regex: 0 not-before: ${{ steps.last_run.outputs.date }} ================================================ FILE: .github/workflows/tg_release_notification.yml ================================================ name: Telegram Release Notification on: release: types: [published] jobs: notify: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: '22' - name: Install telegramify-markdown run: | npm init -y npm install telegramify-markdown - name: Convert release body to Telegram format id: convert-markdown run: | # 先将release body保存到文件 cat > release_body.txt << 'RELEASE_BODY_EOF' ${{ github.event.release.body }} RELEASE_BODY_EOF # 然后创建转换脚本 cat > convert-release.js << 'EOF' const telegramifyMarkdown = require('telegramify-markdown'); const fs = require('fs'); // 从文件读取release body内容(避免shell解析问题) let releaseBody = ''; try { releaseBody = fs.readFileSync('release_body.txt', 'utf8'); } catch (error) { console.error('读取release body失败:', error); releaseBody = process.env.RELEASE_BODY || ''; } console.log('=== 原始release body ==='); console.log(releaseBody); // 转换为Telegram格式 const telegramBody = 
telegramifyMarkdown(releaseBody); // 构建完整的消息 const tagName = process.env.TAG_NAME || ''; const releaseUrl = process.env.RELEASE_URL || ''; const fullMessage = `🚀 *WeClone New Version Released* 🏷️ *Version*: \`${tagName}\` 🔗 *Link*: [Github Release](${releaseUrl}) 📋 *Release Notes*: ${telegramBody}`; // 输出到GitHub Actions console.log('转换后的消息:'); console.log(fullMessage); // 将消息保存到环境变量 fs.writeFileSync('telegram_message.txt', fullMessage); EOF # 设置环境变量 export RELEASE_BODY="${{ github.event.release.body }}" export TAG_NAME="${{ github.event.release.tag_name }}" export RELEASE_URL="${{ github.event.release.html_url }}" export REPO_NAME="${{ github.repository }}" # 运行转换脚本 node convert-release.js # 读取转换后的消息并设置为输出 echo "TELEGRAM_MESSAGE<<EOF" >> $GITHUB_OUTPUT cat telegram_message.txt >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT # - name: Display converted message # run: | # echo "=== 转换后的Telegram消息 ===" # echo "${{ steps.convert-markdown.outputs.TELEGRAM_MESSAGE }}" - name: Send Telegram Message uses: appleboy/telegram-action@master with: to: ${{ secrets.TELEGRAM_CHAT_ID }} token: ${{ secrets.TELEGRAM_BOT_TOKEN }} message: ${{ steps.convert-markdown.outputs.TELEGRAM_MESSAGE }} format: markdown disable_web_page_preview: false ================================================ FILE: .gitignore ================================================ wandb/ weclone_archive-my/ **/pycache/ events.out.tfevents.* 归档/ *.pt *.npz *nohup.out *log.txt *cookie.bin *.gradio/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ *.zip LLaMA-Factory chatglm3-6b cache archive model_output* data/test .vscode *-my*.* test-scripts-my/ *.csv !tests/tests_data/test_person/test_0_730.csv !tests/tests_data/test_PII/test_0_730.csv *test.* *-exp.* experiment/ *users.json Spark-TTS-0.5B/ uv.lock output* *.out Qwen*/ settings.jsonc settings.json dataset/blocked_words.json dataset/wechat/* models/* .secrets* .env* # Image files dataset/**/*.jpg dataset/**/*.jpeg dataset/**/*.png dataset/**/*.gif dataset/**/*.bmp dataset/**/*.webp dataset/**/*.svg dataset/**/*.ico dataset/*telegram*/* !*.gitkeep WC-exp/* modeloutputs/* /tmp/* cache.pkl hfd.sh rpa_cache.pkl settings-bot8006.jsonc models_final/* /data/* /llamaboard_cache 
eval_Result/* ================================================ FILE: .pre-commit-config.yaml ================================================ # .pre-commit-config.yaml default_install_hook_types: [pre-commit, prepare-commit-msg] ci: autofix_commit_msg: ":balloon: auto fixes by pre-commit hooks" autofix_prs: true autoupdate_branch: master autoupdate_schedule: monthly autoupdate_commit_msg: ":balloon: pre-commit autoupdate hooks" repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-ast # Python 语法检查 - id: check-added-large-files # 防止大文件 args: ["--maxkb=25000"] - id: check-merge-conflict # 检查合并冲突 - id: check-yaml # YAML 语法检查 - id: check-toml # TOML 语法检查 - id: debug-statements # 防止调试语句 - id: end-of-file-fixer # 文件结尾修复 # - id: trailing-whitespace # 移除行尾空白 # args: [--markdown-linebreak-ext=md] - id: no-commit-to-branch # 保护主分支 args: ["--branch", "main", "--branch", "master"] - id: mixed-line-ending # 检查混合行结束符 args: ["--fix=lf"] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.8 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pycqa/isort rev: 6.0.1 hooks: - id: isort args: ["--profile", "black", "--line-length", "120"] # - repo: https://github.com/PyCQA/bandit # rev: 1.8.3 # hooks: # - id: bandit # name: Python 安全检查 # args: ["-c", "pyproject.toml", "-x", "tests"] # additional_dependencies: ["bandit[toml]"] ================================================ FILE: LICENSE ================================================ GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. 
The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. 
An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 
An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>. ================================================ FILE: README.md ================================================ ![download](https://github.com/user-attachments/assets/cd4a87c6-1649-4ce5-bce8-bd5b08b278de)

🚀 One-stop solution for creating your digital avatar from chat history 💡

[![GitHub stars](https://img.shields.io/github/stars/xming521/WeClone?style=for-the-badge&logo=github&label=Stars&logoColor=white&color=ffda65)](https://github.com/xming521/WeClone/stargazers) [![GitHub release](https://img.shields.io/github/v/release/xming521/WeClone?style=for-the-badge&logo=github&label=Release&logoColor=white&color=06d094)](https://github.com/xming521/WeClone/releases) [![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/+JEdak4m0XEQ3NGNl) [![Twitter](https://img.shields.io/badge/Twitter-@weclone567-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/weclone567) [![小红书](https://img.shields.io/badge/WeClone-FE2C55?style=for-the-badge&logo=xiaohongshu&logoColor=white)](https://www.xiaohongshu.com/user/profile/628109730000000021029de4) WeClone① Featured|HelloGitHub xming521%2FWeClone | Trendshift Ask DeepWiki

简体中文| English| Project Homepage Documentation

> [!IMPORTANT] > ### Telegram is now supported as a data source ! ## ✨Core Features - 💫 Complete end-to-end solution for creating digital avatars, including chat data export, preprocessing, model training, and deployment - 💬 Fine-tune LLM using chat history with support for image modal data, infusing it with that authentic "flavor" - 🔗 Integrate with Telegram, WhatsApp (coming soon) to create your own digital avatar - 🛡️ Privacy information filtering with localized fine-tuning and deployment for secure and controllable data ## 📋Features & Notes ### Data Source Platform Support | Platform | Text | Images | Voice | Video | Animated Emojis/Stickers | Links (Sharing) | Quote | Forward | Location | Files | |----------|------|--------|-------|-------|-----------------|-----------------|-------|---------|----------|-------| | Telegram | ✅ | ✅ | ❌ | ❌ | ⚠️Convert to Emoji | ❌ | ❌ | ✅ | ✅ | ❌ | | WhatsApp | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | | Discord | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | | Slack | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | ### Deployment Platform Support | Platform | Deployment Support | |----------|--------------------| | Telegram | ✅ | | WhatsApp | 🚧 | | Discord | ✅ | | Slack | ✅ | > [!IMPORTANT] > - WeClone is still in rapid iteration phase, current performance does not represent final results. > - LLM fine-tuning effectiveness largely depends on model size, quantity and quality of chat data. Theoretically, larger models with more data yield better results. > - The performance of the 7B model is average, while models with 14B or more parameters tend to deliver better results. > - Windows environment has not been rigorously tested. You can use WSL as the runtime environment. 
### Recent Updates [25/07/10] Data source added Telegram [25/06/05] Support for image modal data fine-tuning ### Online Fine-Tuning - Big Model Lab (Lab4AI) (with 50 CNY voucher): https://www.lab4ai.cn/project/detail?utm_source=weclone1&id=ab83d14684fa45d197f67eddb3d8316c&type=project ### Hardware Requirements The project uses Qwen2.5-VL-7B-Instruct model by default with LoRA method for SFT stage fine-tuning. You can also use other models and methods supported by [LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory/tree/main#supported-models). Estimated VRAM requirements: | Method | Precision | 7B | 14B | 30B | 70B | `x`B | | ------------------------------- | --------- | ----- | ----- | ----- | ------ | ------- | | Full (`bf16` or `fp16`) | 32 | 120GB | 240GB | 600GB | 1200GB | `18x`GB | | Full (`pure_bf16`) | 16 | 60GB | 120GB | 300GB | 600GB | `8x`GB | | Freeze/LoRA/GaLore/APOLLO/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | `2x`GB | | QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | `x`GB | | QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | `x/2`GB | | QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | `x/4`GB | ## Environment Setup 1. CUDA installation (skip if already installed, **requires version 12.6 or above**) 2. It is recommended to use [uv](https://docs.astral.sh/uv/) to install dependencies, which is a very fast Python environment manager. After installing uv, you can use the following commands to create a new Python environment and install dependencies. ```bash git clone https://github.com/xming521/WeClone.git && cd WeClone uv venv .venv --python=3.12 source .venv/bin/activate # windows .venv\Scripts\activate uv pip install --group main -e . ``` 3. Copy the configuration file template and rename it to `settings.jsonc`, and make subsequent configuration changes in this file: ```bash cp examples/tg.template.jsonc settings.jsonc ``` > [!NOTE] > Training and inference related configurations are unified in the file `settings.jsonc` 4. 
Use the following command to test whether the CUDA environment is correctly configured and can be recognized by PyTorch (not needed for Mac): ```bash python -c "import torch; print('CUDA Available:', torch.cuda.is_available());" ``` 5. (Optional) Install FlashAttention to accelerate training and inference: `uv pip install flash-attn --no-build-isolation`. ## Model Download It is recommended to use [Hugging Face](https://huggingface.co/docs/hub/models-downloading) to download models, or use the following command: ```bash git lfs install git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct models/Qwen2.5-VL-7B-Instruct ``` ## Data Preparation Please use [Telegram Desktop](https://desktop.telegram.org/) to export chat records. Click the top right corner in the chat interface, then click "Export chat history". Select Photos for message types and JSON for format. You can export multiple contacts (group chat records are not recommended), then place the exported `ChatExport_*` in the `./dataset/telegram` directory, meaning put different people's chat record folders together in `./dataset/telegram`. ## Data Preprocessing - First, modify the `language`, `platform`, and `include_type` in the configuration file according to your needs. - If you use telegram, you need to modify the `telegram_args.my_id` in the configuration file to your own telegram user ID. - By default, the project uses Microsoft Presidio to remove `phone numbers, email addresses, credit card numbers, IP addresses, geographic location names, international bank account numbers, cryptocurrency wallet addresses, age information, and generic ID numbers` from the data, but it cannot guarantee 100% identification. - Therefore, a blocklist `blocked_words` is provided in `settings.jsonc`, allowing users to manually add words or phrases they want to filter (the entire sentence containing blocked words will be removed by default). 
> [!IMPORTANT] > 🚨 Please be sure to protect personal privacy and do not leak personal information! - Execute the following command to process the data. You can modify the `make_dataset_args` in settings.jsonc according to your own chat style. ```bash weclone-cli make-dataset ``` More Parameter Details: [Data Preprocessing](https://docs.weclone.love/docs/deploy/data_preprocessing.html#related-parameters) ## Configure Parameters and Fine-tune Model - (Optional) Modify `model_name_or_path`, `template`, `lora_target` in `settings.jsonc` to select other locally downloaded models. - Modify `per_device_train_batch_size` and `gradient_accumulation_steps` to adjust VRAM usage. - You can modify parameters like `num_train_epochs`, `lora_rank`, `lora_dropout` in `train_sft_args` based on your dataset's quantity and quality. ### Single GPU Training ```bash weclone-cli train-sft ``` ### Multi-GPU Training Uncomment the `deepspeed` line in `settings.jsonc` and use the following command for multi-GPU training: ```bash uv pip install "deepspeed<=0.16.9" deepspeed --num_gpus=number_of_gpus weclone/train/train_sft.py ``` ### Simple Inference with Browser Demo Test suitable temperature and top_p values, then modify `infer_args` in settings.jsonc for subsequent inference use. ```bash weclone-cli webchat-demo ``` ### Inference Using API ```bash weclone-cli server ``` ### Test with Common Chat Questions Does not include questions asking for personal information, only daily conversation. Test results are in test_result-my.txt. ```bash weclone-cli server weclone-cli test-model ``` ## 🖼️ Results Showcase > [!TIP] > **We're looking for interesting examples of native English speakers chatting with WeClone! Feel free to share them with us on Twitter.** ## 🤖 Deploy to Chat Bots ### AstrBot [AstrBot](https://github.com/AstrBotDevs/AstrBot) is an easy-to-use multi-platform LLM chatbot and development framework ✨ Supports Discord, Telegram, Slack, Feishu and other platforms. Usage steps: 1. 
Deploy AstrBot 2. Deploy messaging platforms like Discord, Telegram, Slack in AstrBot 3. Execute `weclone-cli server` to start the API service 4. Add a new service provider in AstrBot, select OpenAI type, fill in the API Base URL according to AstrBot's deployment method (e.g., for docker deployment it might be http://172.17.0.1:8005/v1), fill in the model as gpt-3.5-turbo, and enter any API Key 5. Tool calling is not supported after fine-tuning. Please turn off the default tools first by sending the command `/tool off_all` on the messaging platform; otherwise the fine-tuned effect won't be visible. 6. Set the system prompt in AstrBot according to the default_system used during fine-tuning. ![5](https://github.com/user-attachments/assets/19de7072-076a-4cdf-8ae6-46b9b89f536a) > [!IMPORTANT] > Check the api_service logs to ensure that the large model service request parameters are consistent with those used during fine-tuning as much as possible, and turn off all tool plugin capabilities. ### LangBot [LangBot](https://github.com/langbot-app/LangBot) is an easy-to-use open-source LLM chatbot platform suitable for various scenarios. It connects to various global instant messaging platforms. You can set up your IM bot in just 5 minutes. image 1. [Deploy LangBot](https://github.com/RockChinQ/LangBot/blob/master/README_EN.md#-getting-started) 2. Add a bot (e.g., Discord, Telegram, Slack, Lark) in LangBot 3. Execute `weclone-cli server` to start the WeClone API service 4. Add a new model in the model page, name it `gpt-3.5-turbo`, select OpenAI as the provider, fill in the request URL as WeClone's address. For detailed connection methods, refer to the [documentation](https://docs.langbot.app/en/workshop/network-details.html), and enter any API Key. image 5.
Select the model you just added in the pipeline configuration, or modify the prompt configuration image ## 📌 Roadmap - [ ] Support more data sources - [ ] Richer context: including contextual conversations, chat participant information, time, etc. - [ ] Memory support - [ ] Multimodal support: image support already implemented - [ ] Data augmentation - [ ] GUI support - [ ] COT (Chain of Thought) thinking support ## Troubleshooting #### [Official Documentation FAQ](https://docs.weclone.love/docs/introduce/FAQ.html) It is also recommended to use [DeepWiki](https://deepwiki.com/xming521/WeClone) for problem solving. ## ❤️ Contributing Any Issues/Pull Requests are welcome! You can contribute by checking Issues or helping review PRs (Pull Requests). For new feature additions, please discuss through Issues first. Development environment: ```bash uv pip install --group dev -e . pre-commit install ``` The project uses `pytest` for testing, `pyright` for type checking, and `ruff` for code formatting. Before submitting your code, you should run `pytest tests` to ensure all tests pass. ## 🙏 Acknowledgments Thanks to the following code contributors and other community members for their contributions This project also benefits from excellent open source projects such as [PyWxDump](https://github.com/xaoyaoo/PyWxDump), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), [AstrBot](https://github.com/AstrBotDevs/AstrBot), [LangBot](https://github.com/RockChinQ/LangBot), and others. ## ⚠️ Disclaimer > [!CAUTION] > **This project is for learning, research and experimental purposes only. There are significant risks in using it for production environments, please assess carefully. Do not use for illegal purposes, consequences are at your own risk.** > [!IMPORTANT] > #### WeClone is currently not partnered with any platform and has not issued any cryptocurrency. The only official website is: [weclone.love](https://www.weclone.love). Beware of imitations.
Click to view disclaimer terms ### 1. Use at Your Own Risk - Users should fully understand and bear all related risks when using this project - **The project authors are not responsible for any direct or indirect losses arising from the use of this project** - Including but not limited to: data loss, financial loss, legal disputes, personal reputation damage, social relationship impact, psychological trauma, career development obstacles, business reputation damage, etc. ### 2. Production Environment Risk Warning - **Use for commercial purposes or providing external services requires bearing all risks yourself** - All consequences that may result from production environment use (including but not limited to service interruption, data security issues, user complaints, legal liability, etc.) are entirely borne by the user - **It is recommended to conduct thorough testing, verification and risk assessment before using in production environments** ### 3. Model Output Unreliability - Fine-tuned models may produce inaccurate, harmful or misleading content - Model outputs do not represent the views or intentions of real persons - Users should conduct manual review and verification of model outputs ### 4. Data Security and Privacy - Users should ensure that uploaded chat records and other data comply with relevant laws and regulations - Users should obtain **appropriate authorization from data-related persons** - This project is not responsible for **data leakage or privacy infringement** ### 5. Legal Compliance - **Users should ensure that using this project complies with local laws and regulations** - Involving artificial intelligence, data protection, intellectual property and other related laws - **Users bear the consequences of illegal use** ### 6. 
Technical Support Limitations - This project is provided "as is" without any express or implied warranties - Authors do not promise to provide continuous technical support or maintenance - No guarantee of project stability, reliability or applicability ## Usage Recommendations ### Mandatory Bot Identity Identification **When using digital avatars generated by this project, it is strongly recommended to:** - Clearly identify as "AI Bot" or "Digital Avatar" at the beginning of each conversation - Prominently mark "AI-generated content" in the user interface - Avoid letting users mistake it for real human conversation, which could cause risks ### Risk Assessment Recommendations If you must use this project in production environments, it is recommended to: 1. Conduct comprehensive security testing 2. Establish complete content review mechanisms 3. Develop emergency response plans 4. Purchase appropriate insurance coverage 5. Consult legal professionals for advice This disclaimer may be revised with project updates; users should regularly check the latest version. Continuing to use this project indicates agreement with the latest disclaimer terms. **Once you download, clone, modify, distribute or use the code or models of this project in any way, it indicates that you have fully read, understood and agreed to unconditionally accept all terms of this disclaimer.**
**Please carefully read and understand all contents of this disclaimer, ensuring strict compliance with relevant regulations when using this project.**
## ⭐ Star History > [!TIP] > If this project is helpful to you, or if you are interested in the future development of this project, please give the project a Star. Thank you!
[![Star History Chart](https://api.star-history.com/svg?repos=xming521/WeClone&type=Date)](https://www.star-history.com/#xming521/WeClone&Date)
================================================ FILE: README_zh.md ================================================ ![download](https://github.com/user-attachments/assets/cd4a87c6-1649-4ce5-bce8-bd5b08b278de)

🚀 One-stop solution for creating your digital avatar from chat history 💡

🚀从聊天记录创造数字分身的一站式解决方案💡

[![GitHub stars](https://img.shields.io/github/stars/xming521/WeClone?style=for-the-badge&logo=github&label=Stars&logoColor=white&color=ffda65)](https://github.com/xming521/WeClone/stargazers) [![GitHub release](https://img.shields.io/github/v/release/xming521/WeClone?style=for-the-badge&logo=github&label=Release&logoColor=white&color=06d094)](https://github.com/xming521/WeClone/releases) WeClone① [![小红书](https://img.shields.io/badge/WeClone-FE2C55?style=for-the-badge&logo=xiaohongshu&logoColor=white)](https://www.xiaohongshu.com/user/profile/628109730000000021029de4) [![Twitter](https://img.shields.io/badge/Twitter-@weclone567-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/weclone567) [![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/+JEdak4m0XEQ3NGNl) Featured|HelloGitHub xming521%2FWeClone | Trendshift Ask DeepWiki

简体中文| English 项目主页 项目文档

## ✨核心功能 - 💫 涵盖打造数字分身的全链路方案,包括聊天数据导出、预处理、模型训练、部署 - 💬 使用聊天记录微调LLM,支持图片模态数据,让大模型有"那味儿" - 🔗 绑定到Discord, Telegram, Slack, Feishu等,实现自己的数字分身 - 🛡️ 隐私信息过滤,本地化微调部署,数据安全可控 ## 📋特性与说明 ### 数据源平台适配 | 平台 | 文字 | 图片 | 语音 | 视频 | 动画表情 | 链接(分享) | 引用 | 转发 | 位置 | 文件 | |------|------|------|------|------|----------|-----------|------|------|------|------| | Telegram | ✅ | ✅ | ❌ | ❌ | ⚠️转为Emoji | ❌ | ❌ | ✅ | ✅ | ❌ | | WhatsApp | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | | Discord | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | | Slack | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | ### 部署平台支持 | 平台 | 部署支持 | |------|------| | Telegram | ✅ | | WhatsApp | 🚧 | | Discord | ✅ | | Slack | ✅ | > [!IMPORTANT] > - WeClone仍在快速迭代期,当前效果不代表最终效果。 > - 微调LLM效果很大程度取决于模型大小、聊天数据的数量和质量,理论上模型越大,数据越多,效果越好。 > - 7B模型效果一般,14B及以上的模型效果会更好。 > - Windows环境未进行严格测试,可以使用WSL作为运行环境。 ### 近期更新 [25/06/05]支持图片模态数据微调 [25/07/10]数据源增加Telegram ### 在线微调 - 大模型实验室 (Lab4AI) (送50元代金券): https://www.lab4ai.cn/project/detail?utm_source=weclone1&id=ab83d14684fa45d197f67eddb3d8316c&type=project ### 硬件要求 项目默认使用Qwen2.5-7B-Instruct模型,LoRA方法对sft阶段微调,大约需要16GB显存。也可以使用[LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory/blob/main/README_zh.md#%E6%A8%A1%E5%9E%8B)支持的其他模型和方法。 需要显存的估算值: | 方法 | 精度 | 7B | 14B | 30B | 70B | `x`B | | ------------------------------- | ---- | ----- | ----- | ----- | ------ | ------- | | Full (`bf16` or `fp16`) | 32 | 120GB | 240GB | 600GB | 1200GB | `18x`GB | | Full (`pure_bf16`) | 16 | 60GB | 120GB | 300GB | 600GB | `8x`GB | | Freeze/LoRA/GaLore/APOLLO/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | `2x`GB | | QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | `x`GB | | QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | `x/2`GB | | QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | `x/4`GB | ## 环境搭建 1.cuda安装(已安装可跳过,**要求版本12.6及以上**):[LLaMA Factory](https://llamafactory.readthedocs.io/zh-cn/latest/getting_started/installation.html#cuda) 2.建议使用 [uv](https://docs.astral.sh/uv/)安装依赖,这是一个非常快速的 Python 环境管理器。安装uv后,您可以使用以下命令创建一个新的Python环境并安装依赖项,速度较慢可以开启代理: ```bash git clone
https://github.com/xming521/WeClone.git && cd WeClone uv venv .venv --python=3.12 source .venv/bin/activate # Windows下执行 .venv\Scripts\activate uv pip install --group main -e . # 国内用户使用镜像:-i https://pypi.tuna.tsinghua.edu.cn/simple/ uv pip install https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl ``` 3.将配置文件模板复制一份并重命名为`settings.jsonc`,后续配置修改在此文件进行: ```bash cp settings.template.jsonc settings.jsonc ``` - 微调**多模态模型**时,请使用[examples/mllm.template.jsonc](https://github.com/xming521/WeClone/blob/master/examples/mllm.template.jsonc)作为配置文件。 > [!NOTE] > 训练以及推理相关配置统一在文件`settings.jsonc`中。 4.使用以下命令测试CUDA环境是否正确配置并可被PyTorch识别,Mac不需要: ```bash python -c "import torch; print('CUDA是否可用:', torch.cuda.is_available());" ``` 5.(可选)安装FlashAttention,加速训练和推理:`uv pip install flash-attn --no-build-isolation` 版本问题可以使用[prebuild-wheels](https://github.com/mjun0812/flash-attention-prebuild-wheels/releases)的预编译包安装。 ## 模型下载 中国境内推荐使用[ModelScope](https://www.modelscope.cn/docs/models/download)下载模型。例如下载WeClone默认模型: ```bash modelscope download --model Qwen/Qwen2.5-7B-Instruct --local_dir ./models/Qwen2.5-7B-Instruct ``` ## 数据准备 ### Telegram 请使用[Telegram Desktop](https://desktop.telegram.org/)导出聊天记录,点击右上角菜单,选择导出聊天记录,选择照片类型,格式选择JSON。可以导出多个联系人(不建议使用群聊记录),然后将导出的`ChatExport_*`文件夹放在`./dataset/telegram`目录即可,也就是不同人聊天记录的文件夹一起放在 `./dataset/telegram`。 ## 数据预处理 - 首先根据需要修改配置文件中的`language`、`platform`、`include_type`。 - 项目默认通过Microsoft Presidio去除了数据中的`电话号码、电子邮件地址、信用卡号码(12-19位数字)、IP地址、地理位置名称、国际银行账户号码、加密货币钱包地址、年龄信息、通用身份证号码`,但是不能保证100%过滤识别。 - 所以在`settings.jsonc`中提供了一个禁用词词库`blocked_words`,可以自行添加需要过滤的词句(会默认去掉包含禁用词的整句)。 > [!IMPORTANT] > 🚨 请一定注意保护个人隐私,不要泄露个人信息! 
- 执行以下命令对数据进行处理,可以先根据自己的聊天风格修改settings.jsonc的`make_dataset_args`。 ```bash weclone-cli make-dataset ``` 数据处理更多参数说明:[数据预处理](https://docs.weclone.love/zh/docs/deploy/data_preprocessing.html#%E7%9B%B8%E5%85%B3%E5%8F%82%E6%95%B0) ## 配置参数并微调模型 - (可选)修改 `settings.jsonc` 的 `model_name_or_path` 、`template`、 `lora_target`选择本地下载好的其他模型。 - 修改`per_device_train_batch_size`以及`gradient_accumulation_steps`来调整显存占用。 - 可以根据自己数据集的数量和质量修改`train_sft_args`的`num_train_epochs`、`lora_rank`、`lora_dropout`等参数。 ### 单卡训练 ```bash weclone-cli train-sft ``` ### 多卡训练 取消`settings.jsonc`中`deepspeed`行代码注释,使用以下命令多卡训练: ```bash uv pip install "deepspeed<=0.16.9" deepspeed --num_gpus=使用显卡数量 weclone/train/train_sft.py ``` ### 使用浏览器demo简单推理 测试出合适的temperature、top_p值,修改settings.jsonc的`infer_args`后,供后续推理时使用。 ```bash weclone-cli webchat-demo ``` ### 使用接口进行推理 ```bash weclone-cli server ``` ### 使用常见聊天问题测试 不包含询问个人信息的问题,仅有日常聊天。测试结果在test_result-my.txt。 ```bash weclone-cli server weclone-cli test-model ``` ## 🖼️ 微调效果 > [!TIP] > **社群内有部署好的Qwen2.5VL 32B Bot,可以体验效果。** ## 🤖 部署到聊天机器人 ### AstrBot [AstrBot](https://github.com/AstrBotDevs/AstrBot) 是易上手的多平台 LLM 聊天机器人及开发框架 ✨ 平台支持Telegram、飞书等。 使用步骤: 1. 部署 AstrBot 2. 在 AstrBot 中部署消息平台 3. 执行 `weclone-cli server` 启动api服务 4. 在 AstrBot 中新增服务提供商,类型选择OpenAI,API Base URL 根据AstrBot部署方式填写(例如docker部署可能为http://172.17.0.1:8005/v1) ,模型填写gpt-3.5-turbo,API Key随意填写一个 5. 微调后不支持工具调用,请先关掉默认的工具,消息平台发送指令: `/tool off_all`,否则会没有微调后的效果。 6. 根据微调时使用的default_system,在 AstrBot 中设置系统提示词。 ![5](https://github.com/user-attachments/assets/19de7072-076a-4cdf-8ae6-46b9b89f536a) > [!IMPORTANT] > 检查api_service的日志,尽量保证大模型服务请求的参数和微调时一致,tool插件能力都关掉。 ### LangBot [LangBot](https://github.com/RockChinQ/LangBot) 是一个开源的接入全球多种即时通信平台的 LLM 机器人平台,适合各种场景使用。 image 1. [部署 LangBot](https://github.com/RockChinQ/LangBot#-%E5%BC%80%E5%A7%8B%E4%BD%BF%E7%94%A8) 2. 执行 `weclone-cli server` 启动 WeClone API 服务 3. 在 LangBot 中添加一个机器人 4. 
在模型页添加新模型,名称`gpt-3.5-turbo`,供应商选择 OpenAI,填写 请求 URL 为 WeClone 的地址,详细连接方式可以参考[文档](https://docs.langbot.app/zh/workshop/network-details.html),API Key 任意填写。 image 5. 在流水线配置中选择刚才添加的模型,或修改提示词配置 image ## 📌 路线图 - [ ] 支持更多数据源 - [ ] 更丰富的上下文:包括上下文对话、聊天对象信息、时间等 - [ ] Memory 支持 - [ ] 支持多模态:已支持图片 - [ ] 数据增强 - [ ] 支持GUI - [ ] 支持COT思考 ## 问题解决 #### [官方文档FAQ](https://docs.weclone.love/docs/introduce/FAQ.html) 同时建议使用[DeepWiki](https://deepwiki.com/xming521/WeClone)解决问题。 ## ❤️ 贡献代码 欢迎任何 Issues/Pull Requests! 你可以通过查看Issues或帮助审核 PR(拉取请求)来贡献。对于新功能的添加,请先通过 Issue 讨论。 开发环境: ```bash uv pip install --group dev -e . pre-commit install ``` 项目使用`pytest`测试,`pyright`检查类型,`ruff`检查代码格式。 提交代码前你应该先运行`pytest tests`确保所有测试通过。 ## 🙏 致谢 BUPT VCIS Lab的支持 感谢以下代码贡献者和社区里其他成员的贡献 同时本项目受益于[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)、[AstrBot](https://github.com/AstrBotDevs/AstrBot)、[LangBot](https://github.com/RockChinQ/LangBot)等优秀开源项目。 ## ⚠️ 免责声明 > [!CAUTION] > **本项目仅供学习、研究和实验用途,用于生产环境存在较大风险,请谨慎评估。请勿用于非法用途,后果自负。** > [针对违规获取及利用微信终端用户数据行为的打击公告](https://mp.weixin.qq.com/s/A6h4ZLTE2EPrY7kJ5fHE2g) > [!IMPORTANT] > #### WeClone 目前未与任何平台合作,未发行任何数字货币。唯一官方网站:[weclone.love](https://www.weclone.love),谨防仿冒。
点击查看免责条款 ### 1. 使用风险自担 - 用户在使用本项目时,应充分理解并承担所有相关风险 - **本项目作者不对因使用本项目而产生的任何直接或间接损失承担责任** - 包括但不限于:数据丢失、经济损失、法律纠纷、个人名誉损害、社会关系影响、心理创伤、职业发展受阻、商业信誉受损等 ### 2. 生产环境风险警告 - **用于商业用途或对外提供服务需自行承担全部风险** - 生产环境使用可能导致的所有后果(包括但不限于服务中断、数据安全问题、用户投诉、法律责任等)完全由用户承担 - **建议在生产环境使用前进行充分的测试、验证和风险评估** ### 3. 模型输出不可靠性 - 微调后的模型可能产生不准确、有害或误导性的内容 - 模型输出不代表真实人物的观点或意图 - 用户应对模型输出进行人工审核和验证 ### 4. 数据安全与隐私 - 用户应确保上传的聊天记录等数据符合相关法律法规 - 用户应获得**数据相关人员的适当授权** - 本项目不对**数据泄露或隐私侵犯**承担责任 ### 5. 法律合规 - **用户应确保使用本项目符合当地法律法规** - 涉及人工智能、数据保护、知识产权等相关法律 - **违法使用造成的后果由用户承担** ### 6. 技术支持限制 - 本项目按"现状"提供,不提供任何明示或暗示的保证 - 作者不承诺提供持续的技术支持或维护 - 不保证项目的稳定性、可靠性或适用性 ## 使用建议 ### 强制性Bot身份标识 **使用本项目生成的数字分身时,强烈建议:** - 在每次对话开始时明确标识为"AI Bot"或"数字分身" - 在用户界面显著位置标注"此为AI生成内容" - 避免让用户误认为是真实人类在对话,从而造成风险 ### 风险评估建议 如确需在生产环境使用,建议: 1. 进行全面的安全性测试 2. 建立完善的内容审核机制 3. 制定应急响应预案 4. 购买相应的保险保障 5. 咨询法律专业人士意见 本免责声明可能随项目更新而修订,用户应定期查看最新版本。继续使用本项目即表示同意最新的免责声明条款。 **一旦您下载、克隆、修改、分发或以任何方式使用本项目的代码或模型,即表示您已完整阅读、理解并同意无条件接受本免责声明的全部条款。**
**请用户慎重阅读并理解本免责声明的所有内容,确保在使用本项目时严格遵守相关规定。**
## ⭐ Star History > [!TIP] > 如果本项目对您有帮助,或者您关注本项目的未来发展,请给项目 Star,谢谢
[![Star History Chart](https://api.star-history.com/svg?repos=xming521/WeClone&type=Date)](https://www.star-history.com/#xming521/WeClone&Date)
克隆我们,保留灵魂的芬芳
================================================ FILE: dataset/eval/test_data-en.json ================================================ { "questions": [ [ "Have you eaten?", "What did you eat?", "Was it delicious?", "How much did it cost?", "Can you treat me to a meal?" ], [ "What are you doing?", "What are you planning to do later?" ], [ "What are you busy with?", "Do you have any special plans for today?", "How are you feeling?" ], [ "Anything new happening recently?", "Do you have any interesting stories to share?" ], [ "How was your weekend?", "What fun things did you do?" ], [ "Have you watched any good movies or TV shows recently?", "Any recommendations?", "What was it about?" ], [ "How's the weather today?", "How about on your end?" ], [ "Is work/study going well recently?", "Have you encountered any challenges?" ], [ "Hey, what are you busy with right now?", "Do you have any special plans for today?", "Everything going smoothly, I hope?" ], [ "How's the weather on your side?", "Is it sunny or a bit gloomy?", "Is it cold or hot?" ], [ "Is it mealtime yet?", "Planning to treat yourself to something delicious today?", "Anything special you want to eat, or any restaurant you want to try?" ], [ "Any fun news or memes online recently?", "Come across any interesting videos or jokes? Share them with me!" ], [ "What are your plans for later?", "How do you plan to spend the rest of the day?" ], [ "Did anything catch your eye today?", "Let's just chat casually, any light topics?" ], [ "Any new discoveries or insights today?", "Did today feel fast or slow? How was the pace?" ], [ "How's your surroundings right now, noisy or quiet?", "Did you go out for a walk today? Was it crowded outside?", "Look out the window, anything special to see?" ], [ "Have you eaten?", "What did you eat? Did you like it?" ], [ "How was your day? Are you tired?", "What's up?" ], [ "How's your health recently?", "Nothing bothering you, right?" 
], [ "Are you busy today?", "What have you been up to?" ], [ "Everyone at home doing well?", "Need any help with anything?" ], [ "Did you go out today?", "Is it cold/hot outside? Dress warmly/stay cool." ], [ "Anything happy happening recently? Tell me about it!", "Or any troubles you want to talk about?" ], [ "Go to bed early tonight, don't stay up too late.", "How's your sleep been lately?" ], [ "Need anything? Just let me know.", "Do you have enough money?" ], [ "See anything interesting today?", "Or anything you want to share with me?" ], [ "Any weekend plans?", "Want to grab a meal together or go out?" ], [ "How are those friends you keep in touch with?", "You should get together more often." ], [ "Is work/study going smoothly?", "Don't put too much pressure on yourself." ], [ "What delicious food did you make today?", "Let me try some next time!" ], [ "Any news recently?", "Tell me about it." ], [ "How's so-and-so doing lately?", "Haven't heard from him/her in a while." ], [ "Are you in a good mood today?", "You look good/a bit tired." ], [ "Anything you want to eat? I'll make it/bring it for you next time.", "Or anywhere you want to go? I'll come with you." ], [ "Have you been watching any TV shows/movies recently?", "Any good recommendations for me?" ], [ "If there's nothing urgent, go home/rest early.", "Stay safe." ] ] } ================================================ FILE: dataset/eval/test_data-privacy.json ================================================ { "questions": [ [ "你多大了?" ], [ "你有什么爱好吗?" ], [ "你的理想是什么?", "你觉得你离你的理想还有多远?" ], [ "你最近在忙什么?", "工作/学习顺利吗?", "有什么有趣的事情发生吗?" ], [ "你喜欢看什么类型的电影?", "最近看过什么好看的电影吗?", "你最喜欢的电影是什么?" ], [ "你平时喜欢听什么音乐?", "有推荐的歌手或乐队吗?", "最近有喜欢的歌曲吗?" ], [ "你喜欢旅游吗?", "去过哪些地方?", "最喜欢的旅游地是哪里?" ], [ "你喜欢读书吗?", "最近在读什么书?", "最喜欢的书是哪本?" ], [ "你平时喜欢运动吗?", "喜欢做哪些运动?", "有固定去锻炼吗?" ], [ "周末一般都做些什么?", "有没有什么特别的计划?", "周末喜欢宅在家还是出去玩?" ], [ "你喜欢宠物吗?", "有养宠物吗?", "最喜欢什么动物?" ], [ "你喜欢吃什么类型的食物?", "有推荐的餐厅吗?", "最喜欢的菜是什么?" 
], [ "你喜欢什么样的天气?", "最喜欢的季节是哪一个?", "你觉得今天的天气怎么样?" ], [ "你有看电视剧的习惯吗?", "最近在追哪部剧?", "最喜欢的电视剧是哪部?" ], [ "你喜欢玩游戏吗?", "最近在玩什么游戏?", "有推荐的好玩的游戏吗?" ], [ "你会做饭吗?", "平时喜欢做哪些菜?", "有没有特别拿手的菜?" ], [ "你喜欢购物吗?", "最近买了什么新东西?", "有推荐的购物网站或店铺吗?" ], [ "你平时怎么放松自己?", "有特别的解压方式吗?", "最喜欢的放松活动是什么?" ], [ "你喜欢和朋友出去玩吗?", "平时会和朋友去哪玩?", "最近有没有和朋友聚会的计划?" ], [ "你喜欢喝咖啡还是茶?", "有没有特别喜欢的咖啡馆或茶馆?", "最喜欢的饮品是什么?" ], [ "你有兄弟姐妹吗?", "和他们关系怎么样?", "经常联系吗?" ], [ "你喜欢读什么类型的杂志?", "最近有看什么有趣的文章吗?", "有订阅的杂志吗?" ], [ "你喜欢看体育比赛吗?", "最喜欢的运动项目是什么?", "有没有特别支持的球队或运动员?" ], [ "你会说其他语言吗?", "最想学的语言是什么?", "学习语言有什么技巧吗?" ], [ "你对科技产品感兴趣吗?", "最近有没有关注什么新科技?", "最喜欢的电子产品是什么?" ], [ "你喜欢喝什么样的饮料?", "有没有自己调饮料的习惯?", "最喜欢的饮品品牌是什么?" ], [ "你平时用社交媒体吗?", "常用哪些平台?", "在社交媒体上做什么?" ], [ "你对艺术感兴趣吗?", "最喜欢的艺术家是谁?", "有去过哪些艺术展览?" ], [ "你喜欢DIY吗?", "平时做些什么手工?", "有没有完成的作品可以分享?" ], [ "你喜欢种植植物吗?", "有养什么植物?", "最喜欢的植物是什么?" ], [ "你喜欢拍照吗?", "喜欢拍什么样的照片?", "有没有用什么特别的摄影设备?" ], [ "你喜欢听播客吗?", "常听哪些主题的播客?", "有没有推荐的播客?" ], [ "你对历史感兴趣吗?", "最喜欢哪个历史时期?", "有没有特别喜欢的历史人物?" ], [ "你喜欢画画吗?", "平时画什么类型的画?", "有参加过画展吗?" ], [ "你喜欢写作吗?", "平时写什么类型的文章?", "有没有发表过作品?" ], [ "你喜欢钓鱼吗?", "平时去哪里钓鱼?", "有没有钓到过什么大鱼?" ], [ "你喜欢露营吗?", "平时会去哪里露营?", "有没有什么难忘的露营经历?" ], [ "你喜欢摄影吗?", "最喜欢拍什么题材?", "有没有特别喜欢的摄影师?" ], [ "你喜欢喝酒吗?", "喜欢什么类型的酒?", "有没有推荐的酒吧或品牌?" ], [ "你喜欢滑雪吗?", "平时去哪里滑雪?", "有没有什么滑雪技巧分享?" ], [ "你喜欢海边还是山里?", "最喜欢去哪个地方度假?", "有没有什么特别推荐的景点?" ], [ "你喜欢参加音乐节吗?", "参加过哪些音乐节?", "最喜欢的音乐节是哪一个?" ], [ "你喜欢跑步吗?", "平时跑多长距离?", "有没有参加过马拉松?" ], [ "你喜欢参加聚会吗?", "平时和朋友聚会做什么?", "有没有什么有趣的聚会游戏?" ], [ "你喜欢收集东西吗?", "收集什么类型的物品?", "有没有什么特别的收藏?" ] ] } ================================================ FILE: dataset/eval/test_data-zh.json ================================================ { "questions": [ [ "吃了吗?", "吃的什么啊", "好吃吗", "多少钱啊", "可以请我吃吗" ], [ "干嘛呢?", "等会准备干什么去" ], [ "在忙什么呢?", "今天有什么特别的安排吗?", "感觉怎么样?" ], [ "最近有什么新鲜事发生吗?", "有没有什么有趣的故事可以分享?" ], [ "周末过得怎么样?", "做了什么好玩的?" ], [ "最近看了什么好看的电影或电视剧吗?", "有什么推荐的吗?", "大概讲了什么内容呀?" ], [ "今天天气怎么样?", "你那里呢?" ], [ "最近工作/学习顺利吗?", "有没有遇到什么挑战?" 
], [ "嗨,这会儿在忙啥呢?", "今天有什么特别的安排不?", "一切都还顺利吧?" ], [ "你那边现在天气咋样啊?", "是大晴天还是有点阴沉沉的?", "冷不冷,或者热不热呀?" ], [ "到饭点儿了没呀?", "今天打算犒劳一下自己,吃点啥好吃的?", "有没有啥特别想吃的,或者想去哪家馆子尝尝鲜?" ], [ "最近网上有啥好玩儿的新闻或者梗吗?", "刷到啥有意思的视频或者段子没?分享一下呗!" ], [ "待会儿有啥打算呀?", "今天剩下的时间准备怎么过呢?" ], [ "今天有没有碰到啥让你眼前一亮的小事儿?", "随便聊聊呗,有啥轻松点的话题不?" ], [ "今天有啥新发现或者小感悟没?", "感觉今天过得快不快?节奏怎么样?" ], [ "你现在周围环境咋样,吵不吵?", "今天出门溜达了没,外面人多不多呀?", "瞅瞅窗外,有啥特别的景儿不?" ], [ "吃饭了没啊?", "吃的啥呀?合胃口不?" ], [ "今天怎么样啊?累不累?", "有啥事儿不?" ], [ "最近身体还好吧?", "没什么不舒服的地方吧?" ], [ "今天忙不忙啊?", "都干啥了呀?" ], [ "家里都挺好的吧?", "有啥需要帮忙的不?" ], [ "今天出门了没?", "外面冷不冷/热不热啊?多穿点/注意防暑。" ], [ "最近有啥开心的事儿不?说来听听!", "或者有啥烦心事儿,跟我说说?" ], [ "晚上早点休息啊,别熬太晚。", "睡得好不好啊最近?" ], [ "缺啥东西不?跟我说。", "钱够不够花呀?" ], [ "今天看到啥有意思的了没?", "或者有啥想跟我分享的?" ], [ "周末有啥安排啊?", "要不要一起吃个饭/出去转转?" ], [ "最近常联系的那些朋友都还好不?", "有空多聚聚。" ], [ "工作/学习上还顺利吧?", "别太给自己压力啊。" ], [ "今天做了啥好吃的呀?", "下次也给我尝尝呗!" ], [ "有啥新闻没有啊最近?", "跟我讲讲。" ], [ "那谁谁谁最近怎么样了?", "好久没听到他/她消息了。" ], [ "今天心情好不好呀?", "看你气色不错/有点疲惫。" ], [ "有啥想吃的没?下次给你做/带。", "或者想去哪儿玩,我陪你。" ], [ "最近有没有看啥电视剧/电影啊?", "有啥好看的推荐给我呗。" ], [ "没事儿就早点回家/休息。", "注意安全啊。" ] ] } ================================================ FILE: dataset/media/images/.gitkeep ================================================ # Images processed from other data sources will also be placed in this directory. 
================================================ FILE: dataset/res_csv/sft/dataset_info.json ================================================ { "chat-sft": { "file_name": "./sft-my.json", "formatting": "sharegpt", "columns": { "messages": "messages", "system": "system" }, "tags": { "role_tag": "role", "content_tag": "content", "user_tag": "user", "assistant_tag": "assistant" } }, "chat-sft-cleaned": { "file_name": "./sft-my-cleaned.json", "formatting": "sharegpt", "columns": { "messages": "messages", "system": "system" }, "tags": { "role_tag": "role", "content_tag": "content", "user_tag": "user", "assistant_tag": "assistant" } }, "chat-sft-vl": { "file_name": "./sft-my.json", "formatting": "sharegpt", "columns": { "messages": "messages", "system": "system", "images": "images" }, "tags": { "role_tag": "role", "content_tag": "content", "user_tag": "user", "assistant_tag": "assistant" } }, "chat-sft-vl-cleaned": { "file_name": "./sft-my-cleaned.json", "formatting": "sharegpt", "columns": { "messages": "messages", "system": "system", "images": "images" }, "tags": { "role_tag": "role", "content_tag": "content", "user_tag": "user", "assistant_tag": "assistant" } } } ================================================ FILE: dataset/telegram/.gitkeep ================================================ # Storing Telegram client's ChatExport ================================================ FILE: ds_config.json ================================================ { "fp16": { "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, "bf16": { "enabled": "auto" }, "zero_optimization": { "stage": 2, "allgather_partitions": true, "allgather_bucket_size": 5e8, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 5e8, "contiguous_gradients": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", "steps_per_print": 2000, "train_batch_size": "auto", "train_micro_batch_size_per_gpu": 
"auto", "wall_clock_breakdown": false } ================================================ FILE: examples/mllm.template.jsonc ================================================ { "version": "0.2.24", "common_args": { "model_name_or_path": "./models/Qwen2.5-VL-7B-Instruct", "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir "template": "qwen2_vl", "default_system": "请你扮演一名人类,不要说自己是人工智能", "finetuning_type": "lora", "media_dir": "dataset/media", "image_max_pixels": 409920, //720P "enable_thinking": false, "trust_remote_code": true }, "cli_args": { "full_log": false }, "make_dataset_args": { //数据处理配置 "platform": "chat", //chat,telegram "include_type": [ "text", "image" ], "max_image_num": 2, // 单条数据最大图片数量 "blocked_words": [ // 禁用词 "例如 姓名", "例如 密码", "//....." ], "single_combine_strategy": "time_window", // 单人组成单句策略 "qa_match_strategy": "time_window", // 组成qa策略 "single_combine_time_window": 2, // 单人组成单句时间窗口(分钟), "qa_match_time_window": 5, // 组成qa时间窗口(分钟), "combine_msg_max_length": 2048, // 组合后消息最大长度 "messages_max_length": 2048, // messages最长字符数量 配合cutoff_len 使用 "clean_dataset": { "enable_clean": false, "clean_strategy": "llm", "llm": { "accept_score": 2, //可以接受的llm打分阈值,1分最差,5分最好,低于此分数的数据不会用于训练 } }, "online_llm_clear": false, "base_url": "https://xxx/v1", "llm_api_key": "xxxxx", "model_name": "xxx", //建议使用参数较大的模型,例如DeepSeek-V3 "clean_batch_size": 10, "vision_api": { "enable": false, // 设置为 true 来开启此功能 "api_key": "xxx", "api_url": "https://xxx/v1", // 例如阿里云,或替换为其他兼容OpenAI的API地址 "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max "max_workers": 5 // 并行调用API的线程数,最多不要超过8 } }, "train_sft_args": { //微调配置 "stage": "sft", "dataset": "chat-sft", "dataset_dir": "./dataset/res_csv/sft", "freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。 "use_fast_tokenizer": true, "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2", "lora_rank": 8, "lora_dropout": 0.25, "weight_decay": 0.1, "overwrite_cache": true, "per_device_train_batch_size": 2, 
"gradient_accumulation_steps": 16, "lr_scheduler_type": "cosine", "cutoff_len": 4096, "logging_steps": 10, "save_steps": 100, "learning_rate": 1e-4, "warmup_ratio": 0.1, "num_train_epochs": 2, "plot_loss": true, "fp16": true, "flash_attn": "fa2", "preprocessing_num_workers": 16, "dataloader_num_workers": 4 // "deepspeed": "ds_config.json" //多卡训练 }, "infer_args": { "repetition_penalty": 1.2, "temperature": 0.65, "max_length": 512, "top_p": 0.75 }, "vllm_args": { "gpu_memory_utilization": 0.9 }, "test_model_args": { "test_data_path": "dataset/eval/test_data-en.json" } } ================================================ FILE: examples/tg.template.jsonc ================================================ { "version": "0.3.0", "common_args": { "model_name_or_path": "./models/Qwen2.5-VL-7B-Instruct", "adapter_name_or_path": "./model_output", // Also serves as the output_dir for train_sft_args "template": "qwen2_vl", "default_system": "Please act like a human and don't say you are an artificial intelligence", "finetuning_type": "lora", "media_dir": "dataset/media", "image_max_pixels": 409920, //720P "enable_thinking": false, "trust_remote_code": true }, "cli_args": { "full_log": false }, "make_dataset_args": { // Data processing configuration "platform": "telegram", //chat,telegram "language": "en", // Common chat language: zh(中文), en(English) "telegram_args": { "my_id": "user1234567890" }, "include_type": [ "text", "image", // "sticker" //Converting stickers to emojis can lead to the model outputting too many emojis. ], "max_image_num": 2, // Maximum number of images per data entry "blocked_words": [ // Blocked words "e.g. Name", "e.g. Password", "//....." 
], "single_combine_strategy": "time_window", // Single person message combination strategy "qa_match_strategy": "time_window", // QA combination strategy "single_combine_time_window": 2, // Time window for single person message combination (minutes) "qa_match_time_window": 5, // Time window for QA combination (minutes) "combine_msg_max_length": 2048, // Maximum length of combined messages "messages_max_length": 2048, // Maximum character count for messages, used with cutoff_len "clean_dataset": { "enable_clean": false, "clean_strategy": "llm", "llm": { "accept_score": 2, // Acceptable LLM score threshold, 1 is worst, 5 is best, data below this score will not be used for training } }, "online_llm_clear": false, "base_url": "https://xxx/v1", "llm_api_key": "xxxxx", "model_name": "xxx", // Recommend using models with larger parameters, e.g. DeepSeek-V3 "clean_batch_size": 10, "vision_api": { "enable": false, // Set to true to enable this feature "api_key": "xxx", "api_url": "https://xxx/v1", // e.g. Alibaba Cloud, or replace with other OpenAI-compatible API addresses "model_name": "xxx", // Multimodal model name to use, e.g. 
qwen-vl-max "max_workers": 5 // Number of parallel API call threads, maximum should not exceed 8 } }, "train_sft_args": { // Fine-tuning configuration "stage": "sft", "dataset": "chat-sft", "dataset_dir": "./dataset/res_csv/sft", "freeze_multi_modal_projector": false, // Whether to freeze the multimodal projector during MLLM training "use_fast_tokenizer": true, "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2", "lora_rank": 8, "lora_dropout": 0.25, "weight_decay": 0.1, "overwrite_cache": true, "per_device_train_batch_size": 2, "gradient_accumulation_steps": 16, "lr_scheduler_type": "cosine", "cutoff_len": 4096, "logging_steps": 10, "save_steps": 100, "learning_rate": 1e-4, "warmup_ratio": 0.1, "num_train_epochs": 2, "plot_loss": true, "fp16": true, "flash_attn": "fa2", "preprocessing_num_workers": 16, "dataloader_num_workers": 4 // "deepspeed": "ds_config.json" // Multi-GPU training }, "infer_args": { "repetition_penalty": 1.2, "temperature": 0.7, "max_length": 512, "top_p": 0.8 }, "vllm_args": { "gpu_memory_utilization": 0.9 }, "test_model_args": { "test_data_path": "dataset/eval/test_data-en.json" } } ================================================ FILE: pyproject.toml ================================================ [project] name = "WeClone" version = "0.3.03" description = "One-stop solution for creating your digital avatar from chat history" authors = [{ name = "xming521" }] readme = "README.md" requires-python = ">=3.12,<3.13" dependencies = [ "pandas", "pyjson5", "omegaconf", "click", "tqdm", "pydantic==2.10.6", "setuptools>=78.1.0", "loguru>=0.7.3", "langchain", "openai==1.87.0", "pip" ] [tool.weclone] # Configuration file version number. This number should be incremented when the configuration file structure or important default values change. config_version = "0.3.03" config_changelog = """ [0.3.00] - 2025-06-30 - Support TG chat logs, add language parameter, add log level parameter. 
[0.3.02] - 2025-08-15 - Allow using enable_thinking to control offline cleaning. [0.3.03] - 2025-11-01 - Add chat member relationship switch. """ [dependency-groups] main = [ "llamafactory==0.9.4", "vllm==0.10.0; platform_system == 'Linux'", "torch==2.7.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'", "torchvision==0.22.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'", "torchaudio==2.7.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'", "torchdata>=0.10.0; platform_system == 'Linux' or platform_system == 'Windows'", "transformers==4.53.2", "accelerate==1.7.0", "triton==3.3.1; platform_system == 'Linux'", "presidio_analyzer[transformers]", "presidio_anonymizer", ] sparktts = [ "einops>=0.8.1", "einx>=0.3.0", "numpy==1.26.4", "omegaconf>=2.3.0", "packaging>=24.2", "safetensors>=0.5.2", "soundfile>=0.12.1", "soxr>=0.5.0.post1", "torchaudio>=2.6.0", "tqdm>=4.66.5", ] dev = ["pytest", "pytest-order", "pyright", "ruff", "pre-commit"] [project.scripts] weclone-cli = "weclone.cli:cli" [tool.uv] [tool.uv.pip] torch-backend = "auto" [tool.uv.sources] torch = [ { index = "pytorch-cu126", marker = "platform_system == 'Windows'" }, { index = "pytorch-cu126", marker = "platform_system == 'Linux'" }, ] torchaudio = [ { index = "pytorch-cu126", marker = "platform_system == 'Windows'" }, { index = "pytorch-cu126", marker = "platform_system == 'Linux'" }, ] torchvision = [ { index = "pytorch-cu126", marker = "platform_system == 'Windows'" }, { index = "pytorch-cu126", marker = "platform_system == 'Linux'" }, ] triton = [ { index = "pytorch-cu126", marker = "platform_system == 'Windows'" }, { index = "pytorch-cu126", marker = "platform_system == 'Linux'" }, ] torchdata = [ { index = "pytorch-cu126", marker = "platform_system == 'Windows'" }, { index = "pytorch-cu126", marker = "platform_system == 'Linux'" }, ] [[tool.uv.index]] name = "pytorch-cu126" url = "https://download.pytorch.org/whl/cu126" explicit = true 
[tool.setuptools.packages.find] where = ["."] include = ["weclone*"] exclude = ["*tests*", "*archive*"] [tool.pyright] typeCheckingMode = "basic" include = ["weclone/data"] exclude = ["**/archive", "**/tests"] ignore = ["**/archive"] reportMissingImports = "error" reportMissingTypeStubs = false pythonVersion = "3.12" pythonPlatform = "Linux" [tool.ruff] exclude = [ "**/archive", "**/tests", "weclone-audio/src/server未完工", "weclone-audio/src/Spark-TTS", ] line-length = 110 lint.ignore = ["F403", "F405", "E501", "E402"] lint.select = [ "F", # Pyflakes "W", # pycodestyle warnings "E", # pycodestyle errors "ASYNC", # flake8-async "C4", # flake8-comprehensions "Q", # flake8-quotes ] target-version = "py312" [tool.pytest.ini_options] addopts = "-x -v -s --tb=short" ================================================ FILE: settings.template.jsonc ================================================ { "version": "0.3.01", "common_args": { "model_name_or_path": "./models/Qwen2.5-7B-Instruct", "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir "template": "qwen", "default_system": "请你扮演一名人类,不要说自己是人工智能", "media_dir": "dataset/media", "finetuning_type": "lora", "enable_thinking": false, "trust_remote_code": true }, "cli_args": { "full_log": false, "log_level": "INFO" }, "make_dataset_args": { //数据处理配置 "platform": "chat", //chat,telegram "language": "zh", // 聊天常用语言: zh(中文) 或 en(英文) "telegram_args": { "my_id": "user1234567890" }, "include_type": [ "text" ], "blocked_words": [ // 禁用词 "例如 姓名", "例如 密码", "//....." 
], "add_time": false, "add_relation": false, "single_combine_strategy": "time_window", // 单人组成单句策略 "qa_match_strategy": "time_window", // 组成qa策略 "single_combine_time_window": 2, // 单人组成单句时间窗口(分钟), "qa_match_time_window": 5, // 组成qa时间窗口(分钟), "combine_msg_max_length": 2048, // 组合后消息最大长度 配合cutoff_len 使用 "messages_max_length": 2048, // messages最长字符数量 配合cutoff_len 使用 "clean_dataset": { "enable_clean": false, "clean_strategy": "llm", "llm": { "accept_score": 2, //可以接受的llm打分阈值,1分最差,5分最好,低于此分数的数据不会用于训练 "enable_thinking": true } }, "online_llm_clear": false, "base_url": "https://xxx/v1", "llm_api_key": "xxxxx", "model_name": "xxx", //建议使用参数较大的模型,例如DeepSeek-V3 "clean_batch_size": 50, "vision_api": { "enable": false, // 设置为 true 来开启此功能 "api_key": "xxx", "api_url": "https://xxx/v1", // 兼容OpenAI的API地址 "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max "max_workers": 5 // 并行调用API的线程数,最多不要超过8 } }, "train_sft_args": { //微调配置 "stage": "sft", "dataset": "chat-sft", "dataset_dir": "./dataset/res_csv/sft", "use_fast_tokenizer": true, "lora_target": "q_proj,v_proj", "lora_rank": 8, "lora_dropout": 0.25, "weight_decay": 0.1, "overwrite_cache": true, "per_device_train_batch_size": 2, "gradient_accumulation_steps": 16, "lr_scheduler_type": "cosine", "cutoff_len": 2048, "logging_steps": 10, "save_steps": 100, "learning_rate": 1e-4, "warmup_ratio": 0.1, "num_train_epochs": 2, "plot_loss": true, "fp16": true, "flash_attn": "fa2", // "deepspeed": "ds_config.json" //多卡训练 }, "infer_args": { "repetition_penalty": 1.2, "temperature": 0.5, "max_length": 256, "top_p": 0.65 }, "vllm_args": { "gpu_memory_utilization": 0.9, // "data_parallel_size": 2, // "quantization": "bitsandbytes", // "load_format": "bitsandbytes" }, "test_model_args": { "test_data_path": "dataset/eval/test_data-zh.json" } } ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/configs/Qwen2.5-VL.jsonc 
================================================ { "version": "0.2.22", "common_args": { "model_name_or_path": "./models/Qwen2.5-VL-3B-Instruct", "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir "template": "qwen2_vl", "default_system": "请你扮演一名人类,不要说自己是人工智能", "finetuning_type": "lora", "media_dir": "dataset/media", "image_max_pixels": 209920, //720P "enable_thinking": false, "trust_remote_code": true }, "cli_args": { "full_log": false }, "make_dataset_args": { //数据处理配置 "platform": "chat", "include_type": [ "text", "image" ], "blocked_words": [ "1234567890", "hh" ], "language": "en", "add_relation": true, "add_time": true, "max_image_num": 2, // 单条数据最大图片数量 "single_combine_strategy": "time_window", // 单人组成单句策略 "qa_match_strategy": "time_window", // 组成qa策略 "single_combine_time_window": 2, // 单人组成单句时间窗口(分钟), "qa_match_time_window": 5, // 组成qa时间窗口(分钟), "combine_msg_max_length": 256, // 组合后消息最大长度 配合cutoff_len 使用 "clean_dataset": { "enable_clean": true, "clean_strategy": "llm", "llm": { "accept_score": 2, //可以接受的llm打分阈值,1分最差,5分最好,低于此分数的数据不会用于训练 } }, "vision_api": { "enable": false, // 设置为 true 来开启此功能 "api_key": "xxx", "api_url": "https://xxx/v1", // 例如阿里云,或替换为其他兼容OpenAI的API地址 "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max "max_workers": 5 // 并行调用API的线程数,最多不要超过8 } }, "test_model_args": { "test_data_path": "tests/tests_data/test_model_data.json" }, "train_sft_args": { //微调配置 "stage": "sft", "dataset": "chat-sft", "dataset_dir": "./dataset/res_csv/sft", "freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。 "use_fast_tokenizer": true, "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2", "lora_rank": 2, "lora_dropout": 0.3, "weight_decay": 0.1, "overwrite_cache": true, "per_device_train_batch_size": 4, "gradient_accumulation_steps": 8, "lr_scheduler_type": "cosine", "cutoff_len": 1024, "logging_steps": 5, "save_steps": 10, "learning_rate": 1e-4, "warmup_ratio": 0.1, "num_train_epochs": 1, "plot_loss": true, "fp16": true, 
"flash_attn": "fa2", // "deepspeed": "ds_config.json" //多卡训练 }, "infer_args": { "repetition_penalty": 1.2, "temperature": 0.5, "max_length": 50, "top_p": 0.65 } } ================================================ FILE: tests/configs/qwen2.5.jsonc ================================================ { "version": "0.2.22", "common_args": { "model_name_or_path": "./models/Qwen2.5-0.5B", "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir "template": "qwen", "default_system": "请你扮演一名人类,不要说自己是人工智能", "finetuning_type": "lora", "media_dir": "dataset/media", "image_max_pixels": 209920, //720P "enable_thinking": false, "trust_remote_code": true }, "cli_args": { "full_log": false }, "make_dataset_args": { //数据处理配置 "platform": "chat", "include_type": [ "text", // "image" ], "blocked_words": [ "1234567890", "hh" ], "language": "zh", "add_relation": true, "add_time": true, "max_image_num": 2, // 单条数据最大图片数量 "single_combine_strategy": "time_window", // 单人组成单句策略 "qa_match_strategy": "time_window", // 组成qa策略 "single_combine_time_window": 2, // 单人组成单句时间窗口(分钟), "qa_match_time_window": 5, // 组成qa时间窗口(分钟), "combine_msg_max_length": 256, // 组合后消息最大长度 配合cutoff_len 使用 "clean_dataset": { "enable_clean": true, "clean_strategy": "llm", "llm": { "accept_score": 2, //可以接受的llm打分阈值,1分最差,5分最好,低于此分数的数据不会用于训练 "enable_thinking": true } }, "vision_api": { "enable": false, // 设置为 true 来开启此功能 "api_key": "xxx", "api_url": "https://xxx/v1", // 例如阿里云,或替换为其他兼容OpenAI的API地址 "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max "max_workers": 5 // 并行调用API的线程数,最多不要超过8 } }, "test_model_args": { "test_data_path": "tests/tests_data/test_model_data.json" }, "train_sft_args": { //微调配置 "stage": "sft", "dataset": "chat-sft", "dataset_dir": "./dataset/res_csv/sft", "freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。 "use_fast_tokenizer": true, "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2", "lora_rank": 2, "lora_dropout": 0.3, "weight_decay": 0.1, "overwrite_cache": true, 
@pytest.mark.parametrize("config_file", get_config_files())
def test_PII_make_dataset(config_file):
    """Run ``make-dataset`` on the PII fixture data and log the resulting user turns.

    For the given config file this installs the ``test_PII`` fixture folder,
    loads the ``make_dataset`` config section, runs the ``make-dataset`` CLI
    command and asserts that it exits with return code 0.  When the generated
    ``sft-my.json`` exists, every message whose role is ``user`` is logged at
    warning level so any PII that survived processing can be inspected by eye;
    nothing is asserted about that content.

    Args:
        config_file: Path of the ``.jsonc`` config to run the command with.
    """
    print_test_header("PII make-dataset", config_file)
    setup_data_environment("test_PII")

    # The loaded section is not used below; the call itself is kept unchanged.
    config: WCMakeDatasetConfig = cast(
        WCMakeDatasetConfig,
        load_config_with_path(config_file, "make_dataset"),
    )

    completed = run_cli_command(["make-dataset"], config_file)
    assert completed.returncode == 0, f"make-dataset command execution failed for config {config_file}"

    import json

    dataset_path = os.path.join(PROJECT_ROOT_DIR, "dataset", "res_csv", "sft", "sft-my.json")
    if os.path.exists(dataset_path):
        separator = "=" * 80
        logger.warning("⚠️ WARNING: The following content contains unfiltered PII (Personally Identifiable Information):")
        logger.warning(separator)

        with open(dataset_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)

        for record in records:
            if 'messages' not in record:
                continue
            for turn in record['messages']:
                if turn.get('role') != 'user':
                    continue
                logger.warning(f"User content: {turn.get('content', '')}")

        logger.warning(separator)
        logger.warning("⚠️ END OF UNFILTERED PII CONTENT")

    test_logger.info(f"✅ PII make-dataset test passed for config {config_file}")
def setup_data_environment(data_folder_name: str = "test_person"):
    """Install one fixture folder as the active ``dataset/csv`` content.

    On the first call the existing ``model_output`` directory (relative to the
    current working directory) and ``DATASET_CSV_DIR`` are moved into
    ``BACKUP_DIR`` so they can be restored after the tests.

    On later calls ``BACKUP_DIR`` already holds that pre-test state, so it is
    left untouched and whatever currently sits in ``model_output`` /
    ``DATASET_CSV_DIR`` is treated as residue of an earlier test and removed.
    Previously every call wiped ``BACKUP_DIR`` first, which destroyed the
    stash of the real data as soon as the function ran a second time (it is
    called once per config by the PII test and once by the session fixture).

    In both cases the function finishes with ``model_output`` absent and
    ``DATASET_CSV_DIR`` containing only ``<data_folder_name>/`` with the
    regular files copied from ``tests/tests_data/<data_folder_name>``.

    Args:
        data_folder_name: Name of the folder under ``tests/tests_data`` to
            install.

    Raises:
        FileNotFoundError: If the fixture folder does not exist.
    """
    test_logger.info(f"🔧 设置 {data_folder_name} 测试数据...")

    if os.path.exists(BACKUP_DIR):
        # The original state is already stashed; never overwrite it with
        # test-generated data. Just clear what earlier tests left behind.
        test_logger.info("Existing backup found; keeping it and discarding leftover test data")
        for leftover in ("model_output", DATASET_CSV_DIR):
            if os.path.exists(leftover):
                shutil.rmtree(leftover)
    else:
        os.makedirs(BACKUP_DIR)

        # Backup model_output if it exists
        if os.path.exists("model_output"):
            shutil.move("model_output", MODEL_OUTPUT_BACKUP)
            test_logger.info("已备份 model_output 目录")

        # Backup DATASET_CSV_DIR if it exists
        if os.path.exists(DATASET_CSV_DIR):
            shutil.move(DATASET_CSV_DIR, DATASET_CSV_BACKUP)
            test_logger.info("已备份 dataset/csv 目录")

    os.makedirs(DATASET_CSV_DIR)

    # Copy only regular files; sub-directories of the fixture folder are skipped.
    test_data_source_dir = os.path.join(TESTS_DIR, "tests_data", data_folder_name)
    test_data_csv_dir = os.path.join(DATASET_CSV_DIR, data_folder_name)
    os.makedirs(test_data_csv_dir)

    for item_name in os.listdir(test_data_source_dir):
        source_item_path = os.path.join(test_data_source_dir, item_name)
        if os.path.isfile(source_item_path):
            destination_item_path = os.path.join(test_data_csv_dir, item_name)
            shutil.copy2(source_item_path, destination_item_path)

    test_logger.info(f"✅ {data_folder_name} 测试数据设置完成")
def run_make_dataset_test(config_file: str):
    """Run the ``make-dataset`` CLI command for one config and check its output.

    Steps:
      1. If the config includes the image modality, copy the fixture images
         from ``tests/tests_data/images`` into ``<media_dir>/images``.
      2. Run ``make-dataset`` and assert a zero return code.
      3. Assert that the blocked word ``hh`` does not occur in the generated
         ``sft-my.json``.
      4. For the ``Qwen2.5-VL.jsonc`` config, assert that the output contains
         exactly three ``<image>`` placeholders.

    Args:
        config_file: Path of the ``.jsonc`` config to run the command with.
    """
    print_test_header("make-dataset", config_file)

    config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config_with_path(config_file, "make_dataset"))
    if DataModality.IMAGE in config.include_type:
        # Copy the fixture images to media_dir/images. The CLI subprocess runs
        # with cwd=PROJECT_ROOT_DIR, so a relative media_dir is resolved
        # against the project root here as well (an absolute media_dir is
        # returned unchanged by os.path.join).
        images_source_dir = os.path.join(PROJECT_ROOT_DIR, "tests", "tests_data", "images")
        images_target_dir = os.path.join(PROJECT_ROOT_DIR, config.media_dir, "images")
        os.makedirs(images_target_dir, exist_ok=True)
        for file in os.listdir(images_source_dir):
            shutil.copy(os.path.join(images_source_dir, file), os.path.join(images_target_dir, file))

    result = run_cli_command(["make-dataset"], config_file)
    assert result.returncode == 0, f"make-dataset command execution failed for config {config_file}"

    # Check if blocked_words filtering is working correctly
    sft_file_path = os.path.join(PROJECT_ROOT_DIR, "dataset", "res_csv", "sft", "sft-my.json")
    with open(sft_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    assert "hh" not in content, f"blocked_words filtering failed for config {config_file}: found 'hh' in {sft_file_path}"
    test_logger.info(f"✅ blocked_words filtering check passed for config {config_file}")

    # Check if the <image> tag count is correct for the Qwen2.5-VL.jsonc config.
    # Counting the empty string would return len(content) + 1, so the literal
    # placeholder must be spelled out.
    if "Qwen2.5-VL.jsonc" in config_file:
        image_count = content.count("<image>")
        assert image_count == 3, f"Expected 3 <image> tags in {sft_file_path} for config {config_file}, but found {image_count}"
        test_logger.info(f"✅ <image> tags count check passed for config {config_file}: found {image_count} <image> tags")
def _shutdown_server(server_process: subprocess.Popen, grace_seconds: float = 5) -> None:
    """Terminate *server_process*, killing it if it outlives the grace period."""
    server_process.terminate()
    try:
        server_process.wait(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        # wait(timeout=...) raises instead of returning when the process is
        # still alive, so the forced kill has to live in this handler.
        server_process.kill()
        server_process.wait()  # reap the killed process


def run_test_model_test(config_file: str, server_process: subprocess.Popen):
    """Run the ``test-model`` CLI command, then shut the background server down.

    The server is stopped in a ``finally`` clause so that it does not outlive
    a failed ``test-model`` run. Shutdown first asks the process to terminate
    and falls back to a forced kill if it has not exited after five seconds.

    Args:
        config_file: Path of the ``.jsonc`` config to run the command with.
        server_process: The background server started for this config; may be
            ``None`` or already exited, in which case nothing is stopped.
    """
    print_test_header("test-model", config_file)
    try:
        result = run_cli_command(["test-model"], config_file)
        assert result.returncode == 0, f"test-model command execution failed for config {config_file}"
    finally:
        if server_process is not None and server_process.poll() is None:
            test_logger.info(f"测试完成,正在关闭使用配置 {config_file} 的服务器...")
            _shutdown_server(server_process)
            test_logger.info("服务器已关闭")
id,MsgSvrID,type_name,is_sender,talker,room_name,msg,src,CreateTime 7,4073926741244663531,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,13812345678,,2024/10/4 11:43 8,4073926741244663532,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 9,706358374822797422,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,zhang.wei@163.com,,2024/10/4 11:43 10,706358374822797423,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 11,2122553892045962801,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,4532123456789012,,2024/10/4 11:43 12,2122553892045962802,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 13,5704142615879617852,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,192.168.1.100,,2024/10/4 11:43 14,5704142615879617853,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 15,1337798072543283708,文本,0,LOCATION,wxid_6789z5qlxzfj22,北京市朝阳区三里屯,,2024/10/4 11:43 16,1337798072543283709,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 17,8192964515963336399,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,GB33BUKB20201555555555,,2024/10/4 11:43 18,8192964515963336400,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 19,7913656383976388488,文本,0,CRYPTO,wxid_6789z5qlxzfj22,1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa,,2024/10/4 11:43 20,7913656383976388489,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 21,1964923183359419454,文本,0,AGE,wxid_6789z5qlxzfj22,25岁,,2024/10/4 11:43 22,1964923183359419455,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 23,2403233409323875303,文本,0,ID,wxid_6789z5qlxzfj22,110101199001011234,,2024/10/4 11:43 24,2403233409323875304,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 25,4630229215952295971,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,021-62345678,,2024/10/4 11:43 26,4630229215952295972,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 27,6547675850931813364,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,li.ming@qq.com,,2024/10/4 11:43 28,6547675850931813365,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 31,8151408074985365130,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,2001:0db8:85a3:0000:0000:8a2e:0370:7334,,2024/10/4 11:43 
32,8151408074985365131,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43 33,9876543210123456789,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,+1-125-123-4567,,2024/10/4 11:44 34,9876543210123456790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 35,1234567890987654321,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,john.doe@gmail.com,,2024/10/4 11:44 36,1234567890987654322,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 37,5555444433332222111,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,4111111111111111,,2024/10/4 11:44 38,5555444433332222112,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 39,7777888899990000123,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,203.0.113.1,,2024/10/4 11:44 40,7777888899990000124,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 41,3333222211110000456,文本,0,LOCATION,wxid_6789z5qlxzfj22,1600 Pennsylvania Avenue NW Washington DC,,2024/10/4 11:44 42,3333222211110000457,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 43,9999000011112222789,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,DE89370400440532013000,,2024/10/4 11:44 44,9999000011112222790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 45,1111333355557777012,文本,0,CRYPTO,wxid_6789z5qlxzfj22,bc1qxy2kgdygjrsqtzq2n0yrf2493p83kkfjhx0wlh,,2024/10/4 11:44 46,1111333355557777013,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 47,4444666688880000345,文本,0,AGE,wxid_6789z5qlxzfj22,32 years old,,2024/10/4 11:44 48,4444666688880000346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 49,2222555577779999678,文本,0,US_SSN,wxid_6789z5qlxzfj22,078-05-1120,,2024/10/4 11:44 50,2222555577779999679,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 51,6666111133335555901,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,+44-20-7946-0958,,2024/10/4 11:44 52,6666111133335555902,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 53,8888222244446666234,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,sarah.johnson@outlook.com,,2024/10/4 11:44 54,8888222244446666235,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 
55,0000444466668888567,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,5555555555554444,,2024/10/4 11:44 56,0000444466668888568,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 57,3333777799991111890,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,172.16.254.1,,2024/10/4 11:44 58,3333777799991111891,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 59,5555999911113333123,文本,0,LOCATION,wxid_6789z5qlxzfj22,10 Downing Street London SW1A 2AA UK,,2024/10/4 11:44 60,5555999911113333124,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 61,7777000022224444456,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,FR1420041010050500013M02606,,2024/10/4 11:44 62,7777000022224444457,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 63,9999222244446666789,文本,0,CRYPTO,wxid_6789z5qlxzfj22,3QJmV3qfvL9SuYo34YihAf3sRCW3qSinyC,,2024/10/4 11:44 64,9999222244446666790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 65,1111555577779999012,文本,0,AGE,wxid_6789z5qlxzfj22,28岁,,2024/10/4 11:44 66,1111555577779999013,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 67,4444888800002222345,文本,0,ID,wxid_6789z5qlxzfj22,AB123456C,,2024/10/4 11:44 68,4444888800002222346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 69,4444888800002222345,文本,0,666,wxid_6789z5qlxzfj22,404,,2024/10/4 11:44 70,4444888800002222346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44 ================================================ FILE: tests/tests_data/test_model_data.json ================================================ { "questions": [ [ "吃了吗?", "吃的什么啊", "好吃吗", "多少钱啊", "可以请我吃吗" ], [ "干嘛呢?", "等会准备干什么去" ], [ "最近有什么新鲜事发生吗?", "有没有什么有趣的故事可以分享?" ], [ "周末过得怎么样?", "做了什么好玩的?" ], [ "今天天气怎么样?", "你那里呢?" ], [ "最近工作/学习顺利吗?", "有没有遇到什么挑战?" 
] ] } ================================================ FILE: tests/tests_data/test_person/test_0_730.csv ================================================ id,MsgSvrID,type_name,is_sender,talker,room_name,msg,src,CreateTime 1,7437267147299592543,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-10\01c177d8ad90af8969ba048455b54eef.dat,2024/10/4 11:42 2,637529293739295664,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-10\d8a8936ca622823452e45b5e180a53a6.dat,2024/10/4 11:42 7,4073926741244663531,文本,1,12345iru2zsmo22,test_person,小马尔代夫,,2024/10/4 11:43 8,706358374822797422,文本,1,12345iru2zsmo22,test_person,名不虚传,,2024/10/4 11:43 9,2122553892045962801,文本,0,test_person,test_person,我去 好可爱啊,,2024/10/4 11:43 10,5704142615879617852,文本,0,test_person,test_person,2.0156416,,2024/10/4 11:43 11,1337798072543283708,文本,0,test_person,test_person,,,2024/10/4 11:43 12,8192964515963336399,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 11:43 13,7913656383976388488,文本,1,12345iru2zsmo22,test_person,不是,,2024/10/4 11:43 14,1964923183359419454,文本,1,12345iru2zsmo22,test_person,你过来就得老久,,2024/10/4 11:43 15,2403233409323875303,文本,0,test_person,test_person,我在南站,,2024/10/4 11:43 16,4630229215952295971,文本,0,test_person,test_person,我知道我学校离养马岛12km,,2024/10/4 11:43 17,6547675850931813364,文本,0,test_person,test_person,[旺柴],,2024/10/4 11:43 18,1900866115792249247,文本,1,12345iru2zsmo22,test_person,牟平站,,2024/10/4 11:43 19,8151408074985365130,文本,1,12345iru2zsmo22,test_person,近,,2024/10/4 11:43 20,2421069219348160202,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 11:43 21,8751079973209533492,文本,0,test_person,test_person,我去接朋友 他12点到,,2024/10/4 11:44 
22,1133854364527684495,动画表情,1,12345iru2zsmo22,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=b3ffed81b71099d903628c5877bd0792&filekey=30440201010430302e02016e04025348042062336666656438316237313039396439303336323863353837376264303739320203030f78040d00000004627466730000000132&hy=SH&storeid=264ffed170006ff2f4bd405e70000006e01004fb153480ff458e0b6a8ce9e7&ef=1&bizid=1022,2024/10/4 11:45 23,3003440481974462293,文本,1,12345iru2zsmo22,test_person,一下午应该也行,,2024/10/4 11:45 24,3403121757406614004,文本,0,test_person,test_person,先吃饭然后忙完估计三点 过去就四点,,2024/10/4 11:46 25,6917846734389470451,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=f6ae48635cfc57931cbdcd4453230c36&filekey=3043020101042f302d02016e0402535a0420663661653438363335636663353739333163626463643434353332333063333602027039040d00000004627466730000000132&hy=SZ&storeid=266e7c5b40007eea8169bd3e30000006e01004fb1535a2836fbc1e6d7f6305&ef=1&bizid=1022,2024/10/4 11:46 26,3853926419342399869,文本,0,test_person,test_person,蒜啦,,2024/10/4 11:46 27,689214144441695718,文本,1,12345iru2zsmo22,test_person,啊哈哈,,2024/10/4 11:46 28,501703563680542858,文本,1,12345iru2zsmo22,test_person,那算了,,2024/10/4 11:46 29,5175776596048859341,文本,1,12345iru2zsmo22,test_person,可以明天来,,2024/10/4 11:46 30,1468168499470203020,文本,0,test_person,test_person,希望明天天气好,,2024/10/4 11:46 31,8704117912978418734,文本,0,test_person,test_person,你租车了没,,2024/10/4 11:46 32,3720367917725174786,文本,1,12345iru2zsmo22,test_person,hh,,2024/10/4 11:46 33,5706726594668713894,文本,1,12345iru2zsmo22,test_person,租了,,2024/10/4 11:46 34,6749208560602575120,文本,1,12345iru2zsmo22,test_person,150一天,,2024/10/4 11:47 
35,6547279599090331225,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=f6ae48635cfc57931cbdcd4453230c36&filekey=3043020101042f302d02016e0402535a0420663661653438363335636663353739333163626463643434353332333063333602027039040d00000004627466730000000132&hy=SZ&storeid=266e7c5b40007eea8169bd3e30000006e01004fb1535a2836fbc1e6d7f6305&ef=1&bizid=1022,2024/10/4 11:47 36,783513142739644929,文本,1,12345iru2zsmo22,test_person,我本来只用半天是的,,2024/10/4 11:47 37,1522589433173967165,文本,0,test_person,test_person,我觉得还不如6小时,,2024/10/4 11:47 38,4189192320331088356,文本,1,12345iru2zsmo22,test_person,他说半天得提前预约,,2024/10/4 11:47 39,3276909886419115321,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=33935c33478d351ca575ed3b15c9c7d6&filekey=30350201010421301f020201060402535a041033935c33478d351ca575ed3b15c9c7d60203046683040d00000004627466730000000132&hy=SZ&storeid=266a76de2000bb3f307f6952b0000010600004f50535a2d4f20115699447b2&bizid=1023,2024/10/4 11:47 40,3794700043110742367,文本,0,test_person,test_person,我靠,,2024/10/4 11:47 41,4370237514919765211,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=e734d92f35462ae39096a6453a906c64&filekey=30440201010430302e02016e04025348042065373334643932663335343632616533393039366136343533613930366336340203011cf9040d00000004627466730000000132&hy=SH&storeid=266936043000dd1f97ec388740000006e01004fb153482828bbc1e6cab618e&ef=1&bizid=1022,2024/10/4 11:47 42,268652740652374624,文本,1,12345iru2zsmo22,test_person,我昨天没预约,,2024/10/4 11:47 43,8708021080758144662,文本,0,test_person,test_person,其实也玩不了多久 就环岛骑一圈,,2024/10/4 11:47 44,4042214828835453981,文本,1,12345iru2zsmo22,test_person,是滴,,2024/10/4 11:47 45,8134305588773593834,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-10\13d6d8a81fa7554d09238c81fe314e85.dat,2024/10/4 15:49 46,8231897199371315830,文本,1,12345iru2zsmo22,test_person,back了,,2024/10/4 15:49 
47,2523360219807779607,文本,1,12345iru2zsmo22,test_person,烟台下次还来,,2024/10/4 15:49 48,5990956613985588267,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=d8f4bf1e82e7b54a140cadd0dd1788a9&filekey=30350201010421301f020201060402535a0410d8f4bf1e82e7b54a140cadd0dd1788a90203054dd3040d00000004627466730000000132&hy=SZ&storeid=266a7694900068f8207f6952b0000010600004f50535a043fb011502ab6738&bizid=1023,2024/10/4 15:49 49,8020701317904864408,文本,0,test_person,test_person,坏了忘记回你,,2024/10/4 21:24 50,678530733212459598,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=ecd83bbd5669b1ad8ab82368a884b4f4&filekey=30440201010430302e02016e040253480420656364383362626435363639623161643861623832333638613838346234663402030be21c040d00000004627466730000000132&hy=SH&storeid=2666a63330000e6ca9bd747110000006e01004fb1534829283bc1e72de8bdc&ef=1&bizid=1022,2024/10/4 21:24 51,3817737317258834248,文本,0,test_person,test_person,你租的小车车好可爱哈哈哈,,2024/10/4 21:24 52,1122488129382806721,文本,0,test_person,test_person,我之前租的时候 人家说这个没劲儿 租了个大的,,2024/10/4 21:25 53,1244017411047763227,引用回复,0,test_person,test_person,"(我们两个人骑) [引用](2024-10-04 21:25:04)小虫:我之前租的时候 人家说这个没劲儿 租了个大的",,2024/10/4 21:25 54,7804635386065632983,文本,1,12345iru2zsmo22,test_person,hh,,2024/10/4 21:26 55,6582317494846210955,文本,1,12345iru2zsmo22,test_person,劲,,2024/10/4 21:26 56,6947874557250248646,文本,1,12345iru2zsmo22,test_person,相当大,,2024/10/4 21:26 57,7646558619446387721,系统通知,1,12345iru2zsmo22,test_person,你撤回了一条消息,,2024/10/4 21:26 58,4607675874750661759,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2c6bbf882e053b9639569c47194da071&filekey=30440201010430302e02016e0402534804203263366262663838326530353362393633393536396334373139346461303731020326eec4040d00000004627466730000000131&hy=SH&storeid=323032313130303432303436333430303061373166346264613936393936353336376234306230303030303036653031303034666231&ef=1&bizid=1022,2024/10/4 21:27 
59,4251046275168351582,文本,1,12345iru2zsmo22,test_person,我都超??,,2024/10/4 21:27 60,3958569781970448507,文本,0,test_person,test_person,哈哈哈哈哈哈哈哈哈,,2024/10/4 21:27 61,1304768232206478205,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 23:11 62,5423864699615912300,文本,1,12345iru2zsmo22,test_person,生日快乐,,2024/10/4 23:11 63,3090401677076458687,系统通知,1,12345iru2zsmo22,test_person,你撤回了一条消息,,2024/10/4 23:11 64,2634320168319877355,文本,1,12345iru2zsmo22,test_person,(先知后觉,,2024/10/4 23:11 65,6872254500032132923,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=6f263986f9402e0259c58032d4b27403&filekey=30440201010430302e02016e04025348042036663236333938366639343032653032353963353830333264346232373430330203088dea040d00000004627466730000000132&hy=SH&storeid=265390f8400011cc0008b27880000006e01004fb1534806488bc1e0dae90d0&ef=1&bizid=1022,2024/10/4 23:12 66,1476937912918221285,文本,0,test_person,test_person,嘿嘿谢谢泥,,2024/10/4 23:12 67,5104969914181545205,文本,0,test_person,test_person,是明天嘟,,2024/10/4 23:12 68,4671697784411486925,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=4af51669b3933aeacf4c1823fe1a1654&filekey=3043020101042f302d02016e0402535a0420346166353136363962333933336165616366346331383233666531613136353402023d49040d00000004627466730000000132&hy=SZ&storeid=26673b134000b229540c40cba0000006e01004fb1535a1e1a70b1568486971&ef=1&bizid=1022,2024/10/4 23:12 69,942777755221289888,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=233b78d9d244a70087993eb38becca42&filekey=30350201010421301f02020106040253480410233b78d9d244a70087993eb38becca42020310919b040d00000004627466730000000132&hy=SH&storeid=266a9daa7000bc718169bd3e30000010600004f5053480627c0d1500e10846&bizid=1023,2024/10/4 23:13 
70,4970918958858872918,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=06f0315fce5aeb37e36d90a449e55224&filekey=3043020101042f302d02016e0402535a04203036663033313566636535616562333765333664393061343439653535323234020269b0040d00000004627466730000000132&hy=SZ&storeid=26631ab4d0005d61f47603ed30000006e01004fb1535a0416fbc1e68d59226&ef=1&bizid=1022,2024/10/4 23:13 71,5088103607264479657,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2fd26fe001f15baa35d2c7c1f1f77a11&filekey=30440201010430302e02016e0402535a04203266643236666530303166313562616133356432633763316631663737613131020301efba040d00000004627466730000000132&hy=SZ&storeid=266a9e4680007ad63169bd3e30000006e01004fb1535a05ff801150a57b6eb&ef=1&bizid=1022,2024/10/4 23:13 72,1231447585119365782,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=f0648e1f78507fb5e0527c1847bb7eab&filekey=30350201010421301f020201060402535a0410f0648e1f78507fb5e0527c1847bb7eab0203046443040d00000004627466730000000132&hy=SZ&storeid=266a75fa5000a957907f6952b0000010600004f50535a0026bae1e00f753c3&bizid=1023,2024/10/4 23:13 73,7243064063443092107,文本,0,test_person,test_person,哈哈哈哈哈哈哈,,2024/12/15 21:01 74,4402111010190356867,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2bb2100bc3ed89cb9bb5cb2ddf096ba5&filekey=30440201010430302e02016e0402535a04203262623231303062633365643839636239626235636232646466303936626135020301364a040d00000004627466730000000132&hy=SZ&storeid=2655320ef000f23e7135418150000006e01004fb1535a2071d321e07d71e51&ef=1&bizid=1022,2024/12/15 21:01 
75,7957007613667310251,动画表情,1,12345iru2zsmo22,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=a3edb04dc624702bf53ceb8b8533030e&filekey=30440201010430302e02016e04025348042061336564623034646336323437303262663533636562386238353333303330650203031c6a040d00000004627466730000000132&hy=SH&storeid=267384c7a0008ff136f6e1f2a0000006e01004fb1534807906bd1e77d51516&ef=1&bizid=1022,2024/12/15 21:01 76,6555773752081434395,文本,1,12345iru2zsmo22,test_person,一年级社畜,,2024/12/15 21:02 77,6113080126607357441,文本,0,test_person,test_person,妈耶好诡异,,2024/12/15 21:07 78,8861993796968204324,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=e868279290cc3be14207d9946d5f7479&filekey=30350201010421301f020201060402535a0410e868279290cc3be14207d9946d5f747902030204ac040d00000004627466730000000132&hy=SZ&storeid=264c8b3cb0002e54407f6952b0000010600004f50535a0df0c950b74cd5f3e&bizid=1023,2024/12/15 21:07 79,8225573753184622169,文本,1,12345iru2zsmo22,test_person,上班,,2024/12/15 21:07 80,2185462146394548348,文本,1,12345iru2zsmo22,test_person,是会这样的,,2024/12/15 21:07 81,2492434220482862582,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=8a3ec2eb01ffade8c1d2cef9ce7b9cd0&filekey=30440201010430302e02016e0402535a042038613365633265623031666661646538633164326365663963653762396364300203023e74040d00000004627466730000000132&hy=SZ&storeid=26561e4d90003ae9fa356ce630000006e01004fb1535a04b6bae1e6e3ef683&ef=1&bizid=1022,2024/12/15 21:08 82,3323843778125596201,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=e36da9879caba2492757a93c9bd1e8a3&filekey=30350201010421301f020201060402535a0410e36da9879caba2492757a93c9bd1e8a302030229bb040d00000004627466730000000132&hy=SZ&storeid=266f634870009dfbd6f6e1f2a0000010600004f50535a1b369bc1e691b22e9&bizid=1023,2024/12/15 21:08 83,4807494899095110953,文本,1,12345iru2zsmo22,test_person,??你和拼多多签约了???,,2024/12/30 21:06 
84,7732495155588506274,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-12\01c177d8ad90af8969ba048455b54eef.dat,2024/12/30 21:06 85,3524820582691543233,文本,0,test_person,test_person,给你99 给我花9.9买一个,,2024/12/30 21:08 86,4559086380629629977,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=60fc498474f154e67a9406e6052774e3&filekey=30350201010421301f0202010604025348041060fc498474f154e67a9406e6052774e302030a1000040d00000004627466730000000132&hy=SH&storeid=2638fd60e000bbbf33df216b40000010600004f5053482e31b8e0b68ce4bb7&bizid=1023,2024/12/30 21:08 87,2856784266939461622,文本,0,test_person,test_person,我和偷玩签约了(bushi),,2024/12/30 21:09 88,4103450553959091238,文本,1,12345iru2zsmo22,test_person,啊哈哈哈哈,,2024/12/30 21:10 89,4320467837159769744,图片,0,test_person,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-12\e7e73ba89149fc57ea6fd395b00c9daf.dat,2024/12/30 21:21 90,1245688256602333044,文本,0,test_person,test_person,不知道他有没有看上我,,2024/12/30 21:21 91,3496799115798928577,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=6d737e0cb5ed70dbd2e543192395e627&filekey=30440201010430302e02016e040253480420366437333765306362356564373064626432653534333139323339356536323702030108ea040d00000004627466730000000132&hy=SH&storeid=26743cbcd000e81a54eaf9c070000006e01004fb153481223f03156d7563c2&ef=1&bizid=1022,2024/12/30 21:21 92,8086695983133128935,文本,0,test_person,test_person,幸运的话 会送我小熊虫,,2024/12/30 21:21 93,1413825802731496171,文本,0,test_person,test_person,不幸运就没有后续了,,2024/12/30 21:21 94,4732788513210348588,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/12/30 21:25 95,2430661934638082113,文本,1,12345iru2zsmo22,test_person,聘你为,,2024/12/30 21:25 96,8416545366058458010,文本,1,12345iru2zsmo22,test_person,代言人,,2024/12/30 21:26 
def clear_argv(func):
    """Decorator that hides command-line arguments from the wrapped callable.

    While ``func`` runs, ``sys.argv`` holds only its first element (the script
    name); once ``func`` returns or raises, ``sys.argv`` is set back to a copy
    of the argument list taken before the call. This keeps the arguments from
    being parsed by Hugging Face ``HfArgumentParser``, which would otherwise
    raise ``ValueError``.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        saved_argv = list(sys.argv)
        script_name = saved_argv[0]
        sys.argv = [script_name]
        try:
            outcome = func(*args, **kwargs)
        finally:
            # Runs on both the normal and the exceptional path.
            sys.argv = saved_argv
        return outcome

    return wrapper
DataProcessor() processor.main() @cli.command("train-sft", help="Fine-tune the model using prepared datasets.") @apply_common_decorators() def train_sft(): """Fine-tune the model using prepared datasets.""" from weclone.train.train_sft import main as train_sft_main train_sft_main() @cli.command("webchat-demo", help="Launch Web UI for interactive testing with fine-tuned model.") @apply_common_decorators() def web_demo(): """Launch Web UI for interactive testing with fine-tuned model.""" from weclone.eval.web_demo import main as web_demo_main web_demo_main() # TODO Add evaluation functionality @cli.command("eval-model", help="Evaluate using validation set split from training data.") @apply_common_decorators() def eval_model(): """Evaluate using validation set split from training data.""" from weclone.eval.eval_model import main as evaluate_main evaluate_main() @cli.command("test-model", help="Test model with common chat questions.") @apply_common_decorators() def test_model(): """Test model with common chat questions.""" from weclone.eval.test_model import main as test_main test_main() @cli.command("server", help="Start API service providing model inference interface.") @apply_common_decorators() def server(): """Start API service providing model inference interface.""" from weclone.server.api_service import main as server_main server_main() @cli.command("version", help="Show WeClone version information.") @with_community_info def version(): """Show WeClone version information.""" pass def show_community_info(): console = Console() content = Text() content.append("📱 Official group\n", style="bold green") content.append(" • Telegram: ", style="bold cyan") content.append("https://t.me/+JEdak4m0XEQ3NGNl\n", style="bright_blue") content.append(" • QQ群: ", style="bold cyan") content.append("708067078\n\n", style="bright_green") content.append("🌐 Social media\n", style="bold magenta") content.append(" • Twitter: ", style="bold cyan") 
def _check_versions():
    """Compare the local settings.jsonc version with the config version in pyproject.toml.

    Exits the process with status 1 when settings.jsonc is missing, cannot be
    read or parsed, or has no ``version`` field. Problems reading pyproject.toml
    and version mismatches are only reported as warnings.
    """
    ROOT_DIR = Path(__file__).parent.parent
    SETTINGS_PATH = ROOT_DIR / "settings.jsonc"
    PYPROJECT_PATH = ROOT_DIR / "pyproject.toml"

    settings_version = None
    config_guide_version = None
    config_changelog = None
    project_version = None

    if SETTINGS_PATH.exists():
        try:
            with open(SETTINGS_PATH, "r", encoding="utf-8") as f:
                content = f.read()
                settings_data = pyjson5.loads(content)
                settings_version = settings_data.get("version")
        except Exception as e:
            logger.error(f"Error: Unable to read or parse {SETTINGS_PATH}: {e}")
            logger.error("Please ensure settings.jsonc file exists and is properly formatted.")
            sys.exit(1)
    else:
        logger.error(f"Error: Config file {SETTINGS_PATH} not found.")
        logger.error("Please ensure settings.jsonc file is located in the project root directory.")
        sys.exit(1)

    if PYPROJECT_PATH.exists():
        try:
            with open(PYPROJECT_PATH, "rb") as f:  # tomllib requires binary mode
                pyproject_data = tomllib.load(f)
                weclone_tool_data = pyproject_data.get("tool", {}).get("weclone", {})
                config_guide_version = weclone_tool_data.get("config_version")
                config_changelog = weclone_tool_data.get("config_changelog", "N/A")
                project_version = pyproject_data.get("project", {}).get("version")
        except Exception as e:
            logger.warning(
                f"Warning: Unable to read or parse {PYPROJECT_PATH}: {e}. Cannot check if config file is up to date."
            )
    else:
        logger.warning(
            f"Warning: File {PYPROJECT_PATH} not found. Cannot check if config file is up to date."
        )

    if not settings_version:
        logger.error(f"Error: 'version' field not found in {SETTINGS_PATH}.")
        # The template shipped in the repository root is settings.template.jsonc.
        logger.error("Please copy from settings.template.jsonc or update your settings.jsonc file.")
        sys.exit(1)

    if config_guide_version:
        if settings_version != config_guide_version:
            logger.warning(
                f"Warning: Your settings.jsonc file version ({settings_version}) does not match the project's recommended config version ({config_guide_version})."
            )
            logger.warning(
                "This may cause unexpected behavior or errors. Please copy from settings.template.jsonc or update your settings.jsonc file."
            )
            # TODO Print update log based on version number
            logger.warning(f"Config file changelog:\n{config_changelog}")
        logger.info(f"📦 Project Version: {project_version}")
        logger.info(f"⚙️ Config Version: {settings_version}")
    elif PYPROJECT_PATH.exists():  # If file exists but version not found
        logger.warning(
            f"Warning: 'config_version' field not found under [tool.weclone] in {PYPROJECT_PATH}. "
            "Cannot confirm if your settings.jsonc is the latest config version."
        )
def has_pii(self, text: str, entities: Optional[List[str]] = None) -> bool:
    """Return True when ``text`` contains at least one detected PII entity.

    Args:
        text: Text to check.
        entities: Optional entity type names to restrict the check to. When
            omitted or empty, every entity reported by ``detect_pii`` counts.

    Returns:
        True if a matching PII entity was found, False otherwise.
    """
    pii_results = self.detect_pii(text)
    if entities:
        # Previously this argument was accepted but ignored; honour it by
        # filtering on the entity type of each detection.
        wanted = set(entities)
        return any(result.entity_type in wanted for result in pii_results)
    return len(pii_results) > 0
def detect_pii(self, text: str, entities: Optional[List[str]] = None) -> List[PIIResult]:
    """
    Detect PII information in text

    Args:
        text: Text to be detected
        entities: Entity types to detect; when omitted or empty,
            ``self.filtered_entities`` is used

    Returns:
        List of detected PII information (empty for empty or non-string input)
    """
    if not text or not isinstance(text, str):
        return []

    results = self.analyzer.analyze(
        text=text,
        language=self.language,
        entities=entities if entities else self.filtered_entities,
        score_threshold=self.threshold,
    )

    pii_results = [
        PIIResult(
            entity_type=result.entity_type,
            start=result.start,
            end=result.end,
            score=result.score,
            # Keep the matched substring alongside its offsets.
            text=text[result.start : result.end],
        )
        for result in results
    ]

    if pii_results:
        logger.debug(f"Detected {len(pii_results)} PII entities")

    return pii_results
class ChinesePIIDetector(PIIDetector):
    """Chinese PII detector, extended to recognize Chinese-specific PII"""

    def __init__(self, threshold: float = 0.5):
        """Build a detector for language ``zh`` and narrow the filtered entity list.

        Args:
            threshold: Minimum score passed on to the parent detector.
        """
        super().__init__(language="zh", threshold=threshold)

        # Filter out country-specific entities that are not relevant for Chinese context
        country_prefixes = ("US_", "UK_", "SG_", "AU_", "IN_")

        # Get entities that are actually supported by the analyzer
        all_entities = self.get_all_entities()
        supported_entities = self.get_supported_entities()

        self.filtered_entities = [
            entity
            for entity in all_entities
            if entity not in self.not_filtered_entities
            and not entity.startswith(country_prefixes)
            and (entity in supported_entities or entity in ["NUMERIC_ID", "CHINESE_PII"])
        ]

        logger.info(f"Chinese PII filtered entity types: {self.filtered_entities}")

    def _add_custom_recognizers(self, language: str):
        """Register the parent's recognizers plus a ``CHINESE_PII`` pattern recognizer.

        Args:
            language: Accepted for signature compatibility with the parent class;
                the recognizers are always registered for ``zh``.
        """
        # Add parent class recognizers first
        super()._add_custom_recognizers(language="zh")

        # Add Chinese-specific recognizers that are not covered by NUMERIC_ID
        chinese_patterns = [
            Pattern(name="chinese_id_with_x", regex=r"\b\d{17}[Xx]\b", score=0.9),
            # The TLD class is [A-Za-z]: inside a character class "|" is a literal
            # pipe, not alternation, so the former [A-Z|a-z] also matched "|".
            Pattern(
                name="chinese_email", regex=r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", score=0.9
            ),
            Pattern(
                name="chinese_email_with_plus",
                regex=r"\b[A-Za-z0-9._%+-]+\+[A-Za-z0-9._%+-]*@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                score=0.95,
            ),
        ]

        chinese_recognizer = PatternRecognizer(
            supported_entity="CHINESE_PII",
            supported_language="zh",
            patterns=chinese_patterns,
            name="chinese_pii_recognizer",
            context=["中文PII"],
        )

        self.analyzer.registry.add_recognizer(chinese_recognizer)
        logger.info("Chinese PII recognizer added")
def extract_json_from_text(text: str) -> str:
    """Extract JSON content from text, supporting JSON blocks in markdown format.

    A fenced block tagged ``json`` (any letter case) is preferred; if there is
    none, the first untagged/other fenced block is used; if the text has no
    fence at all, the whole text is returned. The result is always stripped.
    """
    match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE)
    if match is None:
        # Fall back to a fence without the "json" tag.
        match = re.search(r"```\s*(.*?)\s*```", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return text.strip()


def _result_text(result) -> Optional[str]:
    """Return the generated text carried by a supported result object (may be None)."""
    if isinstance(result, RequestOutput):
        return result.outputs[0].text
    if isinstance(result, ChatCompletion):
        return result.choices[0].message.content
    raise ValueError(f"Unsupported result type: {type(result)}")


def parse_guided_decoding_results(
    results: List[RequestOutput] | List[ChatCompletion] | List, guided_decoding_class: type[BaseModel]
) -> tuple[List[Optional[BaseModel]], List[int]]:
    """Parse guided decoding results and return parsed results with failed indices.

    Args:
        results: Raw generation results (``RequestOutput`` or ``ChatCompletion`` objects)
        guided_decoding_class: Pydantic model class for validation

    Returns:
        tuple: (parsed_results, failed_indices). ``parsed_results`` has one entry per
        input, ``None`` where parsing failed; ``failed_indices`` lists those positions.
    """
    parsed_results: List[Optional[BaseModel]] = []
    failed_indices: List[int] = []

    for idx, result in enumerate(results):
        raw_text: Optional[str] = None
        try:
            raw_text = _result_text(result)
            if raw_text is None:
                raise ValueError("Message content is None")
            json_text = extract_json_from_text(raw_text)
            parsed_results.append(guided_decoding_class.model_validate_json(json_text))
        except Exception as e:
            # Build the preview from what was already extracted so that a None
            # message content cannot raise a second time inside this handler.
            preview_source = raw_text if raw_text is not None else str(result)
            log_text = preview_source[:100] + "..."
            logger.warning(
                f"Failed to parse JSON from result at sequence index {idx}: {log_text}, error: {e}"
            )
            failed_indices.append(idx)
            parsed_results.append(None)

    return parsed_results, failed_indices
Returns: tuple: (results, failed_indices) where failed_indices contains indices of failed JSON parsing """ if pipeline_parallel_size > get_device_count(): raise ValueError("Pipeline parallel size should be smaller than the number of gpus.") wc_vllm_args = cast(VllmArgs, load_config("vllm")) model_args, data_args, _, generating_args = get_infer_args( { "model_name_or_path": model_name_or_path, "adapter_name_or_path": adapter_name_or_path, "dataset": dataset, "dataset_dir": dataset_dir, "template": template, "cutoff_len": cutoff_len, "max_samples": max_samples, "preprocessing_num_workers": 16, "vllm_config": vllm_config, "temperature": temperature, "top_p": top_p, "top_k": top_k, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "enable_thinking": enable_thinking, } ) tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] template_obj = get_template_and_fix_tokenizer(tokenizer, data_args) template_obj.mm_plugin.expand_mm_tokens = False # for vllm generate if guided_decoding_class: json_schema = guided_decoding_class.model_json_schema() guided_decoding_params = GuidedDecodingParams(json=json_schema, disable_any_whitespace=True) sampling_params = SamplingParams( repetition_penalty=generating_args.repetition_penalty or 1.0, temperature=generating_args.temperature, top_p=generating_args.top_p or 1.0, top_k=generating_args.top_k or -1, stop_token_ids=template_obj.get_stop_token_ids(tokenizer), max_tokens=generating_args.max_new_tokens, skip_special_tokens=skip_special_tokens, seed=seed, bad_words=bad_words, guided_decoding=guided_decoding_params if guided_decoding_class else None, ) if model_args.adapter_name_or_path is not None: lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0]) else: lora_request = None engine_args = { "model": model_args.model_name_or_path, "trust_remote_code": True, "dtype": model_args.infer_dtype, "max_model_len": cutoff_len + max_new_tokens, "disable_log_stats": 
True, "enable_lora": model_args.adapter_name_or_path is not None, "enable_prefix_caching": True, "guided_decoding_backend": "guidance", "guided_decoding_disable_any_whitespace": True, } if template_obj.mm_plugin.__class__.__name__ != "BasePlugin": engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2} wc_vllm_dict = {k: v for k, v in wc_vllm_args.model_dump().items() if v is not None} engine_args.update(wc_vllm_dict) if isinstance(model_args.vllm_config, dict): engine_args.update(model_args.vllm_config) messages_list = [[{"role": "user", "content": text}] for text in inputs] llm = LLM(**engine_args) results = llm.chat( messages_list, sampling_params, lora_request=lora_request, chat_template_kwargs={"enable_thinking": enable_thinking}, ) # type: ignore del llm torch.cuda.empty_cache() if guided_decoding_class: # TODO better json decode https://github.com/vllm-project/vllm/commit/1d0ae26c8544fd5a62e171e30c2dcc2973a23bc8#diff-3b27790a2ce97bc50cdd5476f7b0057da682ed0d1ec8426a7b76c5e21454e57d parsed_results, failed_indexs = parse_guided_decoding_results(results, guided_decoding_class) return parsed_results, failed_indexs else: return results, [] ================================================ FILE: weclone/core/inference/online_infer.py ================================================ import logging from concurrent.futures import Future, ThreadPoolExecutor from typing import Any, Callable, List, Optional, Union from openai import OpenAI from openai.types.chat import ChatCompletion, ChatCompletionMessageParam from pydantic import BaseModel from weclone.core.inference.offline_infer import extract_json_from_text from weclone.utils.log import logger from weclone.utils.retry import retry_openai_api logging.getLogger("openai._base_client").setLevel(logging.WARNING) logging.getLogger("httpx").setLevel(logging.WARNING) class OnlineLLM: def __init__( self, api_key: str, base_url: str, model_name: str, default_system: Optional[str] = None, max_workers: int = 
def chat_batch(
    self,
    prompts: List[str],
    temperature: float = 0.7,
    max_tokens: int = 1024,
    top_p: float = 0.95,
    stream: bool = False,
    callback: Optional[Callable[[int, Any], None]] = None,
    guided_decoding_class: Optional[type[BaseModel]] = None,
) -> Union[List[Union[ChatCompletion, Exception]], tuple[List[Optional[BaseModel]], List[int]]]:
    """Process multiple chat requests concurrently using thread pool

    Args:
        prompts: List of prompt strings
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate
        top_p: Top-p sampling parameter
        stream: Whether to stream the response
        callback: Optional function called once per prompt as ``callback(index, outcome)``,
            where ``outcome`` is the response or the exception the request raised
        guided_decoding_class: Pydantic model class for JSON validation

    Returns:
        If guided_decoding_class is not given: list with one entry per prompt, each the
        response or the exception raised by that request.
        If guided_decoding_class is given: tuple of (parsed_results, failed_indices).
    """
    futures = [
        (i, self.chat_async(prompt, temperature, max_tokens, top_p, stream))
        for i, prompt in enumerate(prompts)
    ]

    results: List[Union[Any, Exception]] = [None] * len(prompts)
    for i, future in futures:
        try:
            outcome = future.result()
        except Exception as e:
            outcome = e
        results[i] = outcome

        if callback:
            # Keep callback failures separate from request failures: a raising
            # callback must not replace a successful response with its own
            # exception, nor be invoked a second time for the same index.
            try:
                callback(i, outcome)
            except Exception as callback_error:
                logger.warning(f"Callback failed for result at index {i}: {callback_error}")

    if not guided_decoding_class:
        return results

    parsed_results: List[Optional[BaseModel]] = [None] * len(prompts)
    failed_indexs: List[int] = []

    for i, result in enumerate(results):
        if isinstance(result, Exception):
            failed_indexs.append(i)
            logger.warning(f"Request at index {i} failed with exception: {result}")
            continue

        if not isinstance(result, ChatCompletion):
            logger.warning(f"Unexpected result type at index {i}: {type(result)}")
            failed_indexs.append(i)
            continue

        content = None
        try:
            content = result.choices[0].message.content
            if content is None:
                raise ValueError("Message content is None")
            json_text = extract_json_from_text(content)
            parsed_results[i] = guided_decoding_class.model_validate_json(json_text)
        except Exception as e:
            log_text = (content[:100] + "...") if content else "None"
            logger.warning(f"Failed to parse JSON from result at index {i}: {log_text}, error: {e}")
            failed_indexs.append(i)

    return parsed_results, failed_indexs
class TelegramChatParser:
    """Telegram chat parser that converts JSON format to data conforming to ChatMessage structure"""

    def __init__(self, config: WCMakeDatasetConfig):
        """Store the dataset config and set up the message-type mapping.

        Args:
            config: Dataset configuration; ``config.telegram_args.my_id`` (when
                ``telegram_args`` is set) identifies messages sent by the owner.
        """
        self.config = config
        self.my_user_id = config.telegram_args.my_id if config.telegram_args else None
        self.message_counter = 0

        # Telegram message/media type -> type_name stored on ChatMessage.
        self.type_mapping = {
            "text": "text",
            "photo": "image",
            "video_file": "video",
            "animation": "video",
            "voice_message": "voice",
            "audio_file": "file",
            "sticker": "sticker",
            "file": "file",
            "location": "location",
            "poll": "(share) card link",
            "contact_information": "(share) card link",
        }

    def get_message_type_and_content(self, message: Dict) -> tuple[str, str, str, bool]:
        """
        Determine type_name, msg content, src and whether it's a forwarded message based on Telegram message content

        Returns
        -------
        tuple[str, str, str, bool]
            (type_name, msg_content, src_path, is_forward)
        """
        msg_content = ""
        src_path = ""
        msg_type = "text"
        is_forward = "forwarded_from" in message

        if "text" in message:
            msg_content = self.extract_text_content(message["text"])

        if "media_type" in message:
            media_type = message["media_type"]
            msg_type = media_type
            if media_type == "photo":
                src_path = message.get("photo", "")
            else:
                # Every non-photo media type keeps its path under "file".
                src_path = message.get("file", "")
                # Only set sticker emoji as msg_content if STICKER is in include_type
                if (
                    media_type == "sticker"
                    and DataModality.STICKER in self.config.include_type
                    and not msg_content.strip()
                ):
                    msg_content = message.get("sticker_emoji", "")
        elif "photo" in message:
            msg_type = "photo"
            src_path = message["photo"]
        elif "file" in message:
            msg_type = "file"
            src_path = message["file"]
            if not msg_content.strip():
                msg_content = message.get("file_name", "")
        elif "location_information" in message:
            msg_type = "location"
            loc = message["location_information"]
            src_path = f"lat:{loc.get('latitude', 0)},lng:{loc.get('longitude', 0)}"
            if not msg_content.strip():
                msg_content = message.get("place_name", "") + message.get("address", "")

        # A media_type missing from the mapping is read from "file" above, so
        # classify it as a file instead of raising KeyError.
        type_name = self.type_mapping.get(msg_type, "file")

        return type_name, msg_content.strip(), src_path, is_forward

    def extract_text_content(self, text_field) -> str:
        """Flatten a Telegram ``text`` field (a string or a list of strings/dicts) into one string.

        List items that are dicts contribute their ``"text"`` value; anything else
        is ignored. Backslash-escaped double quotes are removed from the result.
        """
        content = ""
        if isinstance(text_field, str):
            content = text_field
        elif isinstance(text_field, list):
            parts = []
            for item in text_field:
                if isinstance(item, str):
                    parts.append(item)
                elif isinstance(item, dict) and "text" in item:
                    parts.append(item["text"])
            content = "".join(parts)
        return content.replace('\\"', "")

    def determine_sender_type(self, from_id: str) -> int:
        """Return 1 when ``from_id`` equals the configured user id, otherwise 0."""
        return 1 if from_id == self.my_user_id else 0

    def process_message(self, message: Dict) -> List[ChatMessage]:
        """
        Process a single message, may return multiple messages (original message + extracted text message)

        Entries whose ``type`` is not ``"message"`` and messages whose ``date``
        cannot be parsed yield an empty list.
        """
        if message.get("type") != "message":
            return []

        msg_id = message.get("id", 0)
        sender_name = message.get("from", "")
        from_id = message.get("from_id", "")
        date = message.get("date", "")

        type_name, msg_content, src_path, is_forward = self.get_message_type_and_content(message)

        try:
            dt = datetime.fromisoformat(date.replace("T", " ").replace("Z", ""))
            create_time = Timestamp(dt)
        except Exception as e:
            logger.warning(f"Time format conversion failed: {date}, error: {e}")
            # Without a timestamp there is nothing valid to put in CreateTime;
            # skip the message instead of failing on an unbound create_time.
            return []

        is_sender = self.determine_sender_type(from_id)

        self.message_counter += 1

        result_messages = []

        # Save messages with content or media files
        if msg_content.strip() or src_path.strip():
            original_msg = ChatMessage(
                id=self.message_counter,  # Use global counter as sequential ID
                MsgSvrID=msg_id,  # Telegram message ID
                type_name=type_name,
                is_sender=is_sender,  # 0: other party 1: myself
                talker=sender_name,
                msg=msg_content.replace("\n", " ").strip() if msg_content.strip() else f"{type_name}",
                src=src_path,
                CreateTime=create_time,
                is_forward=is_forward,
            )
            result_messages.append(original_msg)

        # If it's a non-pure text message but contains text field, create additional text message
        if type_name not in ["text"] and "text" in message:
            text_content = self.extract_text_content(message["text"])
            if text_content.strip():
                self.message_counter += 1
                text_msg = ChatMessage(
                    id=self.message_counter,
                    MsgSvrID=msg_id,
                    type_name="text",
                    is_sender=is_sender,
                    talker=sender_name,
                    msg=text_content.replace("\n", " ").strip(),
                    src="",
                    CreateTime=create_time,
                    is_forward=is_forward,
                )
                result_messages.append(text_msg)

        return result_messages

    def process_chat(self, jdata: Dict) -> List[ChatMessage]:
        """
        Process chat data

        Parameters
        ----------
        jdata : Dict
            Telegram chat JSON object

        Returns
        -------
        List[ChatMessage]
            List of ChatMessage objects
        """
        chat_name = jdata.get("name", "Unknown Chat")

        chat_messages = []
        for message in jdata.get("messages", []):
            chat_messages.extend(self.process_message(message))

        for msg in chat_messages:
            msg.room_name = chat_name

        logger.info(f"Chat '{chat_name}' parsing completed, {len(chat_messages)} messages in total")
        return chat_messages

    def to_csv(self, chat_messages: List[ChatMessage], output_file: str):
        """
        Save ChatMessage list to CSV file

        Parameters
        ----------
        chat_messages : List[ChatMessage]
            List of ChatMessage objects
        output_file : str
            Output CSV file path
        """
        if not chat_messages:
            logger.warning("No messages to save")
            return

        fieldnames = [
            "id",
            "MsgSvrID",
            "type_name",
            "is_sender",
            "talker",
            "room_name",
            "msg",
            "src",
            "CreateTime",
            "is_forward",
        ]

        # os.makedirs("") raises, so only create a directory when the path has one.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file, "w", encoding="utf-8", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for msg in chat_messages:
                # Column names match the ChatMessage attribute names one-to-one.
                writer.writerow({name: getattr(msg, name) for name in fieldnames})

        logger.info(f"CSV file saved: {output_file}")

    def copy_received_images(
        self, chat_messages: List[ChatMessage], base_path: str = "", target_dir: str = "dataset/media/images"
    ):
        """
        Copy all images with is_sender=0 to specified directory

        Missing source files are logged and counted as skipped. Files are copied
        under their base name, so equal base names overwrite each other.
        """
        os.makedirs(target_dir, exist_ok=True)

        copied_count = 0
        skipped_count = 0

        for msg in chat_messages:
            if not (msg.is_sender == 0 and msg.type_name == "image" and msg.src):
                continue

            full_src_path = os.path.join(base_path, msg.src) if base_path else msg.src
            normalized_src = full_src_path.replace("\\", "/")

            if not os.path.exists(normalized_src):
                logger.warning(f"Source file does not exist: {normalized_src}")
                skipped_count += 1
                continue

            filename = os.path.basename(normalized_src)
            target_path = os.path.join(target_dir, filename)

            shutil.copy2(normalized_src, target_path)
            copied_count += 1

        logger.info(f"Image copying completed: successful {copied_count}, skipped {skipped_count}")
{telegram_dir}") return if not config.telegram_args or not config.telegram_args.my_id: logger.error("Telegram configuration missing, cannot process Telegram dataset") sys.exit(1) if os.path.exists(csv_output_dir): for item in os.listdir(csv_output_dir): item_path = os.path.join(csv_output_dir, item) if os.path.isdir(item_path): shutil.rmtree(item_path) else: os.remove(item_path) for folder_name in os.listdir(telegram_dir): folder_path = os.path.join(telegram_dir, folder_name) if not os.path.isdir(folder_path): continue json_path = os.path.join(folder_path, "result.json") with open(json_path, "r", encoding="utf-8") as file: jdata = json.load(file) chat_name = jdata.get("name", "unknown") chat_type = jdata.get("type", "unknown") chat_id = jdata.get("id", "unknown") safe_name = "".join(c for c in str(chat_name) if c.isalnum() or c in "._-") safe_type = "".join(c for c in str(chat_type) if c.isalnum() or c in "._-") safe_id = "".join(c for c in str(chat_id) if c.isalnum() or c in "._-") csv_folder_name = f"{safe_name}-{safe_type}-{safe_id}" csv_folder_path = os.path.join(csv_output_dir, csv_folder_name) parser = TelegramChatParser(config=config) messages = parser.process_chat(jdata) if messages: csv_file_path = os.path.join(csv_folder_path, f"{csv_folder_name}.csv") parser.to_csv(messages, csv_file_path) parser.copy_received_images(messages, folder_path) else: logger.warning(f"Folder '{folder_name}' has no valid messages") ================================================ FILE: weclone/data/clean/__init__.py ================================================ ================================================ FILE: weclone/data/clean/strategies.py ================================================ import json import os from abc import ABC, abstractmethod from dataclasses import dataclass from typing import List, cast import pandas as pd from langchain_core.prompts import PromptTemplate from tqdm import tqdm from weclone.core.inference.online_infer import OnlineLLM from 
@dataclass
class CleaningStrategy(ABC):
    """Abstract base class for data cleaning strategies, but provides common cleaning methods"""

    make_dataset_config: WCMakeDatasetConfig

    @abstractmethod
    def judge(self, data: List[QaPair]) -> None:
        """
        Scoring method, needs to be implemented by subclasses.
        """

    def clean(self) -> str:
        """
        Filter SFT data based on score and return the final dataset name to use.

        Items of the original dataset whose ``score`` is at least
        ``clean_dataset.llm.accept_score`` are written to the file registered for
        ``<dataset>-cleaned`` in ``dataset_info.json``.

        Any failure (unreadable or incomplete ``dataset_info.json``, unreadable
        data file, nothing left after filtering) is logged and the original
        dataset name is returned, so this method never raises for those cases.
        """
        config = self.make_dataset_config
        original_dataset_name = config.dataset
        cleaned_dataset_name = original_dataset_name + "-cleaned"
        dataset_dir = config.dataset_dir
        dataset_info_path = os.path.join(dataset_dir, "dataset_info.json")

        try:
            with open(dataset_info_path, "r", encoding="utf-8") as f:
                info = json.load(f)

            file_names = {
                name: (info.get(name) or {}).get("file_name")
                for name in (original_dataset_name, cleaned_dataset_name)
            }
            # A missing entry used to reach os.path.join(dir, None) and raise TypeError
            missing = [name for name, file_name in file_names.items() if not file_name]
            if missing:
                logger.error(
                    f"No file_name registered in {dataset_info_path} for {missing}, will use original dataset."
                )
                return original_dataset_name

            original_data_path = os.path.join(dataset_dir, file_names[original_dataset_name])
            cleaned_data_path = os.path.join(dataset_dir, file_names[cleaned_dataset_name])

            with open(original_data_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            accept_score = config.clean_dataset.llm.accept_score
            filtered_data = [item for item in data if item.get("score", 0) >= accept_score]

            if not filtered_data:
                logger.warning("No data retained after cleaning, will use original dataset.")
                return original_dataset_name

            with open(cleaned_data_path, "w", encoding="utf-8") as f:
                json.dump(filtered_data, f, ensure_ascii=False, indent=2)

            logger.success(
                f"Filtered data below {accept_score} score, retained {len(filtered_data)} items, saved to {cleaned_data_path}"
            )
            return cleaned_dataset_name
        except Exception as e:
            logger.error(f"Error occurred during data cleaning, will use original dataset: {e}")
            return original_dataset_name
make_dataset_config: WCMakeDatasetConfig def judge(self, data: List[QaPair]) -> None: """ Call LLM for scoring and directly assign scores to the input QaPair. """ from weclone.core.inference.offline_infer import vllm_infer logger.info("Starting LLM scoring of data") inputs = [] prompt_template = PromptTemplate.from_template(CLEAN_PROMPT) for qa in data: if qa.images: qa.score = 6 else: messages_str = "" for msg in qa.messages: if msg.role == "user": messages_str += f"Q: {msg.content}\n" elif msg.role == "assistant": messages_str += f"A: {msg.content}\n" prompt_value = prompt_template.invoke({"id": qa.id, "messages": messages_str.strip()}) inputs.append(prompt_value.to_string()) parsed_scores, failed_indexs = vllm_infer( inputs, self.make_dataset_config.model_name_or_path, template=self.make_dataset_config.template, temperature=0, guided_decoding_class=QaPairScore, repetition_penalty=1.1, enable_thinking=self.make_dataset_config.clean_dataset.llm.enable_thinking, cutoff_len=self.make_dataset_config.messages_max_length + 1024, # add prompt length max_new_tokens=1024 if self.make_dataset_config.clean_dataset.llm.enable_thinking else 200, ) # We align scores by iterating only non-image examples and popping from the head of parsed_scores. # Build an iterator over parsed results for simplicity and safety. 
@dataclass
class OlineLLMCleaningStrategy(CleaningStrategy):
    """Strategy for data cleaning using an online large language model API"""

    # TODO: images clean support
    def judge(self, data: List[QaPair]) -> None:
        """
        Score every QaPair in place through the online model.

        Pairs that contain images are not sent to the model; they receive the
        fixed score 6, which is above the 1-5 range of ``QaPairScore``. All other
        pairs are scored in batches of ``clean_batch_size`` and matched back by
        the ``id`` echoed in the model answer; a pair without a returned score
        gets 0.
        """
        config = self.make_dataset_config
        logger.info("Starting online model scoring of data")
        logger.info(f"Using model {config.model_name}")

        client = OnlineLLM(
            api_key=config.llm_api_key,
            base_url=config.base_url,
            model_name=config.model_name,
            max_workers=config.clean_batch_size + 5,
        )
        prompt_template = PromptTemplate.from_template(CLEAN_PROMPT)

        inputs = []
        for qa in data:
            if qa.images:
                qa.score = 6
                continue
            messages_str = ""
            for msg in qa.messages:
                if msg.role == "user":
                    messages_str += f"Q: {msg.content}\n"
                elif msg.role == "assistant":
                    messages_str += f"A: {msg.content}\n"
            prompt_value = prompt_template.invoke({"id": qa.id, "messages": messages_str.strip()})
            inputs.append(prompt_value.to_string())

        clean_batch_size = config.clean_batch_size
        all_parsed_scores = []
        for i in tqdm(range(0, len(inputs), clean_batch_size), desc="Online model scoring progress"):
            batch = inputs[i : i + clean_batch_size]
            try:
                parsed_results, _failed_indexs = client.chat_batch(
                    batch, temperature=0, guided_decoding_class=QaPairScoreWithId
                )
                for j, parsed_result in enumerate(parsed_results):
                    if parsed_result is not None:
                        all_parsed_scores.append(parsed_result)
                    else:
                        logger.warning(f"Failed to parse result for batch item at index {i + j}")
            except Exception as e:
                # Best effort: a failed batch leaves its pairs unscored (0 below)
                logger.error(
                    f"Failed to call online model or parse result for batch starting at index {i}, error: {str(e)}"
                )

        score_map = {score.id: score.score for score in all_parsed_scores}
        for qa in data:
            if qa.images:
                # Image pairs were never submitted, so they have no entry in score_map;
                # keep the fixed score assigned above instead of resetting it to 0.
                continue
            if qa.id in score_map:
                qa.score = score_map[qa.id]
            else:
                logger.warning(f"No score obtained for QA ID {qa.id}, default assigned 0")
                qa.score = 0

        scores = [qa.score for qa in data if qa.score is not None]
        score_series = pd.Series(scores)
        score_counts = score_series.value_counts().sort_index()
        score_percentages = score_series.value_counts(normalize=True).sort_index() * 100
        pd.set_option("display.unicode.east_asian_width", True)
        distribution_df = pd.DataFrame(
            {
                "Count": score_counts,
                "Percentage(%)": score_percentages.round(2),
            }
        )
        distribution_df.index.name = "Score"
        printable_df_str = distribution_df.reset_index().to_string(index=False)
        logger.success(f"Online model scoring distribution:\n{printable_df_str}")
@dataclass
class ChatMessage:
    """A single chat message in the platform-independent form used to build the dataset."""

    id: int  # sequential id
    # original message id from platform
    # NOTE(review): TelegramChatParser passes the raw Telegram id here, which may not be a str
    MsgSvrID: str
    type_name: str  # message type, refer to cut_type_data and skip_type_data
    is_sender: int  # 0: other party, 1: self
    talker: str  # message sender
    msg: str  # message content
    # media file path, additional info field
    # NOTE(review): group_consecutive_messages stores a list of paths here for merged messages
    src: str
    CreateTime: Timestamp  # message send time
    room_name: Optional[str] = None  # chat room name
    is_forward: bool = False  # whether it's a forwarded message
    modality: Optional[DataModality] = None  # message modality, set in qa_generator.py
group", ], } skip_type_list = MultiLangList(skip_type_data, default_lang="en") unprocessed_type_list = [] ================================================ FILE: weclone/data/qa_generator.py ================================================ import json import os import re import subprocess # nosec import sys from typing import List, Union, cast os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") import pandas as pd from pandas import Timestamp from weclone.core.PII.pii_detector import ChinesePIIDetector, PIIDetector from weclone.data.chat_parsers.telegram_parser import process_telegram_dataset from weclone.data.clean.strategies import LLMCleaningStrategy, OlineLLMCleaningStrategy from weclone.data.models import ( ChatMessage, CutMessage, Message, QaPair, cut_type_list, skip_type_list, ) from weclone.data.strategies import TimeWindowStrategy from weclone.data.utils import ImageToTextProcessor, check_image_file_exists from weclone.utils.config import load_config from weclone.utils.config_models import DataModality, LanguageType, PlatformType, WCMakeDatasetConfig from weclone.utils.log import logger class DataProcessor: def __init__(self): self.config = cast(WCMakeDatasetConfig, load_config(arg_type="make_dataset")) self.csv_folder = "./dataset/csv" self.system_prompt = self.config.default_system self.enable_clean = self.config.clean_dataset.enable_clean # message type self.QaPair = QaPair self.include_type = self.config.include_type if self.config.platform == PlatformType.CHAT: self.cut_type_list = cut_type_list.get_items(lang="zh_CN") self.skip_type_list = skip_type_list.get_items(lang="zh_CN") self.include_type = cut_type_list.translate_batch( texts=[t for t in self.include_type if t.lower() != "text"] ) self.cut_type_list = [t for t in self.cut_type_list if t not in self.include_type] elif self.config.platform == PlatformType.TELEGRAM: self.cut_type_list = cut_type_list.get_items(lang="en") self.skip_type_list = skip_type_list.get_items(lang="en") 
self.include_type = [t for t in self.include_type if t.lower() != "text"] self.cut_type_list = [t for t in self.cut_type_list if t not in self.include_type] if DataModality.STICKER in self.include_type: self.skip_type_list.remove("sticker") # blocked words config_blocked_words = self.config.blocked_words file_blocked_words = [] try: with open("./dataset/blocked_words.json", encoding="utf-8") as f: file_blocked_words = json.load(f).get("blocked_words", []) except (FileNotFoundError, json.JSONDecodeError): pass self.blocked_words = list(set(config_blocked_words + file_blocked_words)) # logger.info(f"Chat record blocked words: {self.blocked_words}") # combine strategy if self.config.single_combine_strategy == "time_window": self.single_combine_strategy = TimeWindowStrategy( time_window=self.config.single_combine_time_window * 60, is_single_chat=True, ) if self.config.qa_match_strategy == "time_window": self.qa_match_strategy = TimeWindowStrategy( time_window=self.config.qa_match_time_window * 60, is_single_chat=False, ) # PII detection if self.config.language == LanguageType.ZH: self.pii_detector = ChinesePIIDetector() else: self.pii_detector = PIIDetector(language=self.config.language) # dataset cleaning clean_dataset_config = self.config.clean_dataset if self.enable_clean: if clean_dataset_config.clean_strategy == "llm": if self.config.online_llm_clear: self.clean_strategy = OlineLLMCleaningStrategy(make_dataset_config=self.config) else: from llamafactory.extras.packages import is_vllm_available if not is_vllm_available(): logger.error("vLLM is not available, dataset cleaning is not supported.") sys.exit(1) else: self.clean_strategy = LLMCleaningStrategy(make_dataset_config=self.config) vision_config = self.config.vision_api if vision_config.enable and vision_config.api_key: self.image_processor = ImageToTextProcessor( api_url=vision_config.api_url, # type: ignore api_key=vision_config.api_key, # type: ignore model_name=vision_config.model_name, # type: ignore 
config=self.config, ) logger.info(f"ImageToText functionality enabled, model: {self.image_processor.model_name}") else: self.image_processor = None self.c = self.config self.relations = {} def main(self): self.pre_parse_chat_dataset() if not os.path.exists(self.csv_folder) or not os.listdir(self.csv_folder): logger.error( f"Error: Directory '{self.csv_folder}' does not exist or is empty. Please check the path and ensure it contains CSV chat data files." ) sys.exit(1) csv_files = self.get_csv_files() logger.info(f"Found {len(csv_files)} CSV files in total, starting processing, please be patient...") message_list: List[ChatMessage] = [] for csv_file in csv_files: logger.debug(f"Starting to process CSV file: {csv_file}") chat_messages = self.load_file(csv_file) message_list.extend(self.group_consecutive_messages(messages=chat_messages)) # self.process_by_msgtype(chat_message) logger.debug(f"Processing completed: {csv_file}, loaded {len(chat_messages)} messages in total") qa_res = self.match_qa(messages=message_list) qa_res = [item for item in qa_res if isinstance(item, QaPair)] if self.image_processor: logger.info("Starting image recognition process...") qa_res = self.image_processor._process_images_in_parallel(qa_res) logger.info("Image recognition process completed.") if self.enable_clean: self.clean_strategy.judge(qa_res) # type: ignore self.save_result(qa_res) self._execute_length_cdf_script() logger.success( f"Chat record processing successful, obtained {len(qa_res)} data entries in total, saved to ./dataset/res_csv/sft/sft-my.json" ) def pre_parse_chat_dataset(self): if self.c.platform == PlatformType.TELEGRAM: process_telegram_dataset(self.config) def _execute_length_cdf_script(self): """Execute the length_cdf.py script to calculate cutoff_len.""" try: python_executable = sys.executable script_path = os.path.join("weclone", "utils", "length_cdf.py") command_parts = [ python_executable, script_path, f'--model_name_or_path="{self.c.model_name_or_path}"', 
f'--dataset="{self.c.dataset}"', f'--dataset_dir="{self.c.dataset_dir}"', f'--template="{self.c.template}"', "--interval=512", ] if hasattr(self.c, "media_dir") and self.c.media_dir: command_parts.append(f'--media_dir="{self.c.media_dir}"') if hasattr(self.c, "image_max_pixels") and self.c.image_max_pixels: command_parts.append(f'--image_max_pixels="{self.c.image_max_pixels}"') child_env = os.environ.copy() child_env["CUDA_VISIBLE_DEVICES"] = "0" child_env["LLAMAFACTORY_VERBOSITY"] = "ERROR" process = subprocess.Popen( command_parts, env=child_env, stdout=None, # Use None to indicate using parent process's stdout (i.e., terminal) stderr=None, text=True, bufsize=1, ) # nosec return_code = process.wait() if return_code != 0: logger.error( f"Command '{' '.join(command_parts)}' execution failed with return code {return_code}" ) except FileNotFoundError: logger.error( f"Command execution failed: executable '{command_parts[0]}' or script '{command_parts[1]}' not found" ) except KeyError as e: logger.error(f"Failed to execute length_cdf.py script: missing configuration item {str(e)}") except Exception as e: logger.error(f"Unknown error occurred while executing length_cdf.py script: {str(e)}") def get_csv_files(self): """Traverse the folder to get all CSV file paths and sort by starting sequence number in filename""" csv_files = [] for chat_obj_folder in os.listdir(self.csv_folder): chat_obj_folder_path = os.path.join(self.csv_folder, chat_obj_folder) for csvfile in os.listdir(chat_obj_folder_path): if not csvfile.endswith(".csv"): continue csvfile_path = os.path.join(chat_obj_folder_path, csvfile) csv_files.append(csvfile_path) pattern = re.compile(r"_(\d+)_\d+\.csv$") def extract_start(fp: str) -> int: name = os.path.basename(fp) m = pattern.search(name) return int(m.group(1)) if m else 0 csv_files.sort(key=extract_start) return csv_files def match_qa(self, messages: List[ChatMessage]) -> List[Union[QaPair, CutMessage]]: """ Match question-answer pairs Args: messages: 
Message list Returns: List[Union[QaPair, CutMessage]]: List of Q&A pairs containing instructions and outputs """ WAITING_INSTRUCTION = "waiting_instruction" WAITING_RESPONSE = "waiting_response" current_state = WAITING_INSTRUCTION qa_res: List[Union[QaPair, CutMessage]] = [] last_message = None current_instruction = None qa_id_counter = 0 conversation_messages: List[Message] = [] conversation_images: List[str] = [] conversation_talker = "" def _calculate_qa_length( messages: List[Message], new_user_content: str, new_assistant_content: str ) -> int: """Calculate total character length of messages plus new messages""" total_length = 0 for msg in messages: total_length += len(msg.content) total_length += len(new_user_content) + len(new_assistant_content) return total_length def _save_current_qa_pair( qa_id: int, time_stamp: Timestamp, current_conversation_messages: List[Message], current_conversation_images: List[str], talker: str = "", ) -> int: """Helper function to save the current QA pair.""" nonlocal qa_res # Allow modification of qa_res from the outer scope total_length = _calculate_qa_length(current_conversation_messages, "", "") if total_length <= self.config.messages_max_length: if len(current_conversation_images) > self.config.max_image_num: logger.warning( f"QA pair (potential id {qa_id}) with timestamp {time_stamp} " f"has too many images ({len(current_conversation_images)} > {self.config.max_image_num}) " "and will be skipped." 
) return qa_id if ( len(current_conversation_messages) == 2 and current_conversation_messages[0].role == "user" and current_conversation_messages[0].content == "" ): return qa_id system_content = self.system_prompt if self.c.add_time: system_content += f"\n 现在时间是{time_stamp.strftime('%m-%d %H:%M')}" if self.c.add_relation and talker: relation = self.relations.get(talker, "") if relation: system_content += f"\n 对方是你的{relation},你们正在聊天" processed_messages = current_conversation_messages.copy() for i in range(len(processed_messages) - 1): if ( processed_messages[i].role == "user" and "" in processed_messages[i].content and i + 1 < len(processed_messages) and processed_messages[i + 1].role == "assistant" ): assistant_content = processed_messages[i + 1].content processed_messages[i] = Message( role="user", content=processed_messages[i].content.replace( "", f"你应该说:{assistant_content}" ), ) qa_pair = self.QaPair( id=qa_id, time=time_stamp, score=0, messages=processed_messages, images=current_conversation_images.copy(), system=system_content, ) qa_res.append(qa_pair) return qa_id + 1 else: logger.warning( f"QA pair (potential id {qa_id}) with timestamp {time_stamp} " f"exceeds max length ({total_length} > {self.config.messages_max_length}) " "and will be skipped." 
) return qa_id for msg in messages: if isinstance(msg, CutMessage): # When encountering CutMessage, save current conversation and reset state if conversation_messages: qa_id_counter = _save_current_qa_pair( qa_id_counter, last_message.CreateTime if last_message else msg.CreateTime, conversation_messages, conversation_images, conversation_talker, ) # Reset state current_state = WAITING_INSTRUCTION current_instruction = None last_message = None conversation_messages = [] conversation_images = [] conversation_talker = "" continue if current_state == WAITING_INSTRUCTION: if msg.is_sender == 0: # Received message from other party if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg): # If not the same conversation and there is a previous message, save the previous conversation if conversation_messages: qa_id_counter = _save_current_qa_pair( qa_id_counter, last_message.CreateTime, conversation_messages, conversation_images, conversation_talker, ) conversation_messages = [] conversation_images = [] # Regardless of whether a new conversation has just been started, this 'msg' now becomes the current instruction. 
current_instruction = msg last_message = msg conversation_talker = msg.talker current_state = WAITING_RESPONSE elif msg.is_sender == 1: # Own message as first message if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg): if conversation_messages: qa_id_counter = _save_current_qa_pair( qa_id_counter, last_message.CreateTime, conversation_messages, conversation_images, conversation_talker, ) conversation_messages = [] conversation_images = [] conversation_messages.append(Message(role="user", content="")) conversation_messages.append(Message(role="assistant", content=msg.msg)) last_message = msg elif current_state == WAITING_RESPONSE: if msg.is_sender == 0: # Received message from other party if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg): if conversation_messages: qa_id_counter = _save_current_qa_pair( qa_id_counter, last_message.CreateTime, conversation_messages, conversation_images, conversation_talker, ) conversation_messages = [] conversation_images = [] current_instruction = msg last_message = msg conversation_talker = msg.talker # State remains unchanged else: # Own message - use strategy to determine if it belongs to the same conversation if last_message and self.qa_match_strategy.is_same_conversation([last_message], msg): if current_instruction is None: raise ValueError("current_instruction should not be None when creating a QA pair") conversation_messages.append(Message(role="user", content=current_instruction.msg)) conversation_messages.append(Message(role="assistant", content=msg.msg)) if hasattr(current_instruction, "src") and current_instruction.src: if isinstance(current_instruction.src, list): valid_images = [img_src for img_src in current_instruction.src if img_src] if valid_images: conversation_images.extend(valid_images) elif current_instruction.src: conversation_images.append(current_instruction.src) last_message = msg # Regardless of whether it matches, reset state 
current_state = WAITING_INSTRUCTION current_instruction = None # Process the last conversation if conversation_messages and last_message: qa_id_counter = _save_current_qa_pair( qa_id_counter, last_message.CreateTime, conversation_messages, conversation_images, conversation_talker, ) return qa_res def group_consecutive_messages(self, messages: List[ChatMessage]) -> List[ChatMessage]: """ Combine multiple consecutive messages from the same person into one message, add cut when encountering cut_type Args: messages: Message list Returns: List[ChatMessage]: Combined message list """ if not messages: return [] def _combine_text(messages: List[ChatMessage]) -> ChatMessage: """ Merge multiple messages into one Args: messages: List of messages to merge Returns: ChatMessage: Merged message """ base_msg = messages[0] combined_content = messages[0].msg combined_src_list = [messages[0].src] if messages[0].modality == DataModality.IMAGE else [] for i in messages[1:]: content = i.msg if not content: continue if combined_content and combined_content[-1] not in [ "。", ".", "!", "!", "?", "?", "…", ",", ",", ]: combined_content += "\n" if i.modality == DataModality.IMAGE: combined_src_list.append(i.src) combined_content += content if len(combined_content) > self.c.combine_msg_max_length: logger.warning( f"Combined message length exceeds {self.c.combine_msg_max_length}, will truncate: {combined_content[:50]}" ) combined_content = combined_content[: self.c.combine_msg_max_length] remaining_image_count = combined_content.count("") if len(combined_src_list) > remaining_image_count: combined_src_list = combined_src_list[:remaining_image_count] combined_message = ChatMessage( id=base_msg.id, MsgSvrID=base_msg.MsgSvrID, type_name=base_msg.type_name, is_sender=base_msg.is_sender, talker=base_msg.talker, room_name=base_msg.room_name, msg=combined_content, src=combined_src_list, # type: ignore CreateTime=messages[-1].CreateTime, # Use the time of the last message modality=base_msg.modality, 
def process_by_msgtype(self, chat_message: ChatMessage):
    """
    Route a message to the handler for its type.

    Only text messages (``type_name`` equal to "文本" or "text", case-insensitively)
    are handled; they are passed to ``self.process_text``. Every other type is ignored.

    Args:
        chat_message: The message to dispatch.
    """
    message_kind = chat_message.type_name.lower()
    if message_kind not in ("文本", "text"):
        # Image dispatch is currently disabled:
        # elif chat_message.modality == DataModality.IMAGE:
        #     self.process_image(chat_message)
        return
    self.process_text(chat_message)
def save_result(self, qa_res: List[QaPair]):
    """
    Write the QA pairs to the SFT dataset JSON file.

    Each pair is turned into a dict holding its position as a string ``id``, its
    ``time`` in ISO format (``None`` when the pair has no time), its ``score``, its
    messages reduced to ``role``/``content``, and its ``images`` and ``system`` values.
    The resulting list is dumped to ``./dataset/res_csv/sft/sft-my.json``; the parent
    directory is created when missing.

    Args:
        qa_res: A list of QaPair objects.
    """

    def _to_record(position, pair):
        # Flatten one QaPair into plain JSON-serialisable values.
        return {
            "id": str(position),
            "time": pair.time.isoformat() if pair.time else None,
            "score": pair.score,
            "messages": [{"role": m.role, "content": m.content} for m in pair.messages],
            "images": pair.images,
            "system": pair.system,
        }

    processed_qa_res = [_to_record(position, pair) for position, pair in enumerate(qa_res)]

    output_path = "./dataset/res_csv/sft/sft-my.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(processed_qa_res, f, ensure_ascii=False, indent=4)

    logger.success(
        f"Chat record processing successful, {len(qa_res)} entries in total, saved to {output_path}"
    )
combines multiple strategies""" strategies: List[ConversationStrategy] require_all: bool = True def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool: # TODO: Implement composite strategy logic return False ================================================ FILE: weclone/data/utils.py ================================================ import base64 import concurrent.futures import os from pathlib import Path import requests from weclone.utils.config_models import WCMakeDatasetConfig from weclone.utils.log import logger from weclone.utils.retry import retry_on_http_error def check_image_file_exists(file_path: str) -> str | bool: try: normalized_path = os.path.normpath(file_path).replace("\\", "/") filename_with_ext = os.path.basename(normalized_path) filename_without_ext = Path(filename_with_ext).stem # 使用 glob 查找精确匹配该文件名的文件(不论扩展名) images_dir = Path("dataset") / "media" / "images" matching_files = list(images_dir.glob(f"{filename_without_ext}.*")) if len(matching_files) > 0: # 获取相对于dataset/media的路径,只保留images/文件名 full_path = matching_files[0] relative_path = full_path.relative_to(Path("dataset") / "media") return str(relative_path) else: return False except Exception as e: logger.error(f"检查图片文件时出错: {file_path}, 错误: {e}") return False class ImageToTextProcessor: """通过兼容OpenAI API的多模态LLM将图片转换为文本。""" def __init__(self, api_url: str, api_key: str, model_name: str, config: WCMakeDatasetConfig): self.api_url = api_url.rstrip("/") self.api_key = api_key self.model_name = model_name self.config = config self.prompt = """ 请描述这张图片的内容,重点关注: 1. 如果是截图,描述界面内容和操作 2. 如果是表格,描述表格结构和数据 3. 如果是文档,提取关键文字信息 4. 
如果是生活照片,简要描述场景和内容。 请用简洁明了的语言描述,不超过100字。""" def _process_images_in_parallel(self, qa_list): """并行处理所有对话中的图片,并将描述替换回对话文本。""" all_image_paths = [] media_dir = self.config.media_dir # 遍历所有对话,收集并构造完整的图片路径 for qa_pair in qa_list: if qa_pair.images: image_list = qa_pair.images if isinstance(qa_pair.images, list) else [qa_pair.images] for relative_path in image_list: full_path = os.path.join(media_dir, relative_path) all_image_paths.append(full_path) if not all_image_paths: logger.info("未在对话中找到任何图片,跳过识别。") return qa_list logger.info(f"共找到 {len(all_image_paths)} 张有效图片需要识别。") max_workers = self.config.vision_api.max_workers # 使用线程池并行调用API,executor.map 会保持结果顺序与输入一致 with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: # 现在传递给 image_processor 的是完整的路径 image_descriptions = list(executor.map(self.describe_image, all_image_paths)) desc_iterator = iter(image_descriptions) for qa_pair in qa_list: if not qa_pair.images: continue for message in qa_pair.messages: # 替换消息内容中的 占位符 num_images_in_message = message.content.count("") for _ in range(num_images_in_message): try: description = next(desc_iterator) # 使用 count=1 确保每次只替换一个占位符,并添加换行符以增强可读性 message.content = message.content.replace( "", f"\n[图片描述: {description}]\n", 1 ) except StopIteration: logger.error("图片数量与描述数量不匹配,可能存在逻辑错误。") message.content = message.content.replace("", "\n[图片描述缺失]\n", 1) # 清空图片列表,因为它们已被转换为文本 qa_pair.images.clear() return qa_list def _encode_image_to_base64(self, image_path: str) -> str: """将图片编码为base64""" try: with open(image_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode("utf-8") except Exception as e: logger.error(f"编码图片失败 {image_path}: {e}") return "" def _get_image_format(self, image_path: str) -> str: """获取图片格式""" suffix = Path(image_path).suffix.lower().replace(".", "") if suffix == "jpg": return "jpeg" return suffix @retry_on_http_error( max_retries=5, base_delay=15.0, max_delay=300.0, backoff_factor=2.0, retry_on_status=[429, 500, 502, 503, 504], 
if __name__ == "__main__":
    # Manual smoke test for check_image_file_exists with a Windows-style export path.
    # The literal must be raw: in a normal string "\202" and "\6" are octal escapes, so
    # "\2021-08\6ce3..." silently turned into control characters instead of backslashes.
    path = r"Storage\Image\2021-08\6ce3f785b4230246639c3dd0d4a8848c.dat"
    print(check_image_file_exists(path))
def main():
    """Entry point of the evaluation script: create an ``Evaluator`` and call its ``eval()``."""
    Evaluator().eval()
def main():
    """
    Replay the scripted test conversations against the model and save the transcripts.

    Reads the ``questions`` list from ``test_config.test_data_path``; each entry is one
    conversation (a list of user turns) that is sent turn by turn through
    ``handler_text`` with a fresh history. The histories are then written to
    ``test_result-my.txt``, one message content per line and a blank line after each
    conversation.
    """
    # Context managers close both files; the original left them open.
    with open(test_config.test_data_path, "r", encoding="utf-8") as f:
        test_list = json.load(f)["questions"]

    res = []
    for questions in tqdm(test_list, desc=" Testing..."):
        history = []
        for q in questions:
            handler_text(q, history=history, config=completion_config)
        res.append(history)

    # Explicit UTF-8: the transcripts contain non-ASCII chat text, and the platform
    # default encoding (e.g. on Windows) may not be able to encode it.
    with open("test_result-my.txt", "w", encoding="utf-8") as res_file:
        for r in res:
            for i in r:
                res_file.write(i["content"] + "\n")
            res_file.write("\n")
你的任务是评估下面提供的聊天记录的**逻辑性**、**相关性**以及**风格代表性**。目标是识别并过滤掉那些回答与问题**明显不匹配**、**逻辑严重混乱**的样本,筛选出具有人类聊天风格独特性与辨识度的样本。请根据以下核心评估点给出一个1到5的整数分数,并将该分数与原始 `id` 一起输出。 **重要考量:** 1. **简短回答的有效性:** 请注意,诸如“好的”、“是的”、“收到”、“嗯”、“知道了”等简短的肯定、确认或应答,在合适的语境下是完全**有逻辑且相关的**。**不要仅仅因为回答简短就将其评为低分。** 只有当这类简短回答与【问题/上下文 Q】**明显不符**时,才应考虑低分。 2. **处理错别字和自我纠正:** 聊天记录中可能包含常见的打字错误(错别字)或用户先打错字随后又自行纠正的情况(例如,发送“我想去1楼”紧接着又发送“*2楼”进行更正)。在评估时,请**聚焦于用户想要表达的最终意图和信息的核心内容**,而**不应仅仅因为存在错别字或纠正过程就判定为低质量**。。 # 核心评估点 (请在心中衡量) 1. **相关性 (Relevance):** 【回答 A】是否直接回应或恰当地衔接了【问题/上下文 Q】?它是在回答问题,还是完全跑题了?只有当【回答 A】与【问题/上下文 Q】**明显矛盾**、**完全不着边际**(即使考虑上下文也无法合理化),或简短回答**明显不适用于**该【问题/上下文 Q】时,才给予低分。 2. **逻辑性 (Coherence):** 【回答 A】本身是否符合基本的逻辑?结合【问题/上下文 Q】来看,这个问答对是否构成了一个符合逻辑的交流片段?是否存在明显的矛盾、混乱的内容?只有当【回答 A】**自身逻辑混乱**、**与Q存在无法解释的矛盾**时,才给予低分。 3. **风格代表性** (Style Representativeness): 评估【回答 A】是否展现了自然、独特的人类对话风格特征。回答A是否带有个性化的色彩?关注点包括但不限于:是否体现了特定的语气(如友好、幽默、不耐烦、正式、脏话),是否包含口头禅、俚语、网络用语(如“yyds”、“绝绝子”)、表情符号 Emoji、颜文字、标点符号的特殊使用如“!!!”、“???”、“~”等表达、特定的缩写或短语、非标准的但一致的表达方式(如方言词汇、个人口癖)? 4. **以相关性和逻辑性为主要评判标准,风格代表性仅仅作为获得5分的必要条件。** # 评分标准 (1-5分) * **1分 (极差):** 聊天记录中的问答内容完全不相关;逻辑严重混乱/矛盾。 * **2分 (差):** 大部分问答相关性很低;存在明显的逻辑问题或不连贯。 * **3分 (中等):** 问答相关性一般(可能部分问答跑题或回应不充分);逻辑上勉强说得通但不够流畅或有瑕疵。 * **4分 (良好):** 大部分问答相关性好,回答了问题或恰当衔接,逻辑清晰。 * **5分 (优秀):** 问答相关性强,逻辑流畅,包含了显著的、具有辨识度的人类聊天的常用特征(例如情感情绪表达、口头禅、表情符号组合、特有的句子结构、鲜明的语气) # 输出要求 请严格按照以下 JSON 格式输出,包含输入数据的 id 和你给出的1到5的整数评分 score,不要包含任何其他文字、解释或标签。 {{"id": "{id}","score": <这里填入1到5的整数评分>}} # 输入数据 ```json {{"id": "{id}","messages": "{messages}"}} ``` """ # ONLINE_LLM_CLEAN_PROMPT = """ # # 角色 # 你是一个数据质量评估员。 # # 任务 # 你的任务是评估下面提供的聊天记录的**逻辑性**、**相关性**以及**风格代表性**。目标是识别并过滤掉那些回答与问题**明显不匹配**、**逻辑严重混乱**的样本,筛选出具有人类聊天风格独特性与辨识度的样本。请根据以下核心评估点给出一个1到5的整数分数,并将该分数与原始 `id` 一起输出。 # **重要考量:** # 1. **简短回答的有效性:** 请注意,诸如“好的”、“是的”、“收到”、“嗯”、“知道了”等简短的肯定、确认或应答,在合适的语境下是完全**有逻辑且相关的**。**不要仅仅因为回答简短就将其评为低分。** 只有当这类简短回答与【问题/上下文 Q】**明显不符**时,才应考虑低分。 # 2. 
**处理错别字和自我纠正:** 聊天记录中可能包含常见的打字错误(错别字)或用户先打错字随后又自行纠正的情况(例如,发送“我想去1楼”紧接着又发送“*2楼”进行更正)。在评估时,请**聚焦于用户想要表达的最终意图和信息的核心内容**,而**不应仅仅因为存在错别字或纠正过程就判定为低质量**。。 # # 核心评估点 (请在心中衡量) # 1. **相关性 (Relevance):** 【回答 A】是否直接回应或恰当地衔接了【问题/上下文 Q】?它是在回答问题,还是完全跑题了?只有当【回答 A】与【问题/上下文 Q】**明显矛盾**、**完全不着边际**(即使考虑上下文也无法合理化),或简短回答**明显不适用于**该【问题/上下文 Q】时,才给予低分。 # 2. **逻辑性 (Coherence):** 【回答 A】本身是否符合基本的逻辑?结合【问题/上下文 Q】来看,这个问答对是否构成了一个符合逻辑的交流片段?是否存在明显的矛盾、混乱的内容?只有当【回答 A】**自身逻辑混乱**、**与Q存在无法解释的矛盾**时,才给予低分。 # 3. **风格代表性** (Style Representativeness): 评估【回答 A】是否展现了自然、独特的人类对话风格特征。回答A是否带有个性化的色彩?关注点包括但不限于:是否体现了特定的语气(如友好、幽默、不耐烦、正式、脏话),是否包含口头禅、俚语、网络用语(如“yyds”、“绝绝子”)、表情符号 Emoji、颜文字、标点符号的特殊使用如“!!!”、“???”、“~”等表达、特定的缩写或短语、非标准的但一致的表达方式(如方言词汇、个人口癖)? # 4. **以相关性和逻辑性为主要评判标准,风格代表性仅仅作为获得5分的必要条件。** # # 评分标准 (1-5分) # * **1分 (极差):** 聊天记录中的问答内容完全不相关;逻辑严重混乱/矛盾。 # * **2分 (差):** 大部分问答相关性很低;存在明显的逻辑问题或不连贯。 # * **3分 (中等):** 问答相关性一般(可能部分问答跑题或回应不充分);逻辑上勉强说得通但不够流畅或有瑕疵。 # * **4分 (良好):** 大部分问答相关性好,回答了问题或恰当衔接,逻辑清晰。 # * **5分 (优秀):** 问答相关性强,逻辑流畅,包含了显著的、具有辨识度的人类聊天的常用特征(例如情感情绪表达、口头禅、表情符号组合、特有的句子结构、鲜明的语气) # # 输入数据 # ```json # {qa_list} # # 输出要求 # 请严格按照以下 JSON 格式输出,包含原始的 id 和你给出的1到5的整数评分 score,不要包含任何其他文字、解释或标签! 
def main():
    """
    Run SFT fine-tuning with the project configuration.

    Loads the ``train_sft`` and ``make_dataset`` configurations, checks that the dataset
    file registered in ``dataset_info.json`` exists, optionally replaces the dataset
    name with the one returned by the LLM cleaning strategy, and passes the dumped
    training configuration to ``run_exp``.

    Raises:
        ValueError: If ``dataset_info.json`` has no ``file_name`` for the configured dataset.
        FileNotFoundError: If the registered dataset file does not exist.
    """
    train_config: WCTrainSftConfig = cast(WCTrainSftConfig, load_config(arg_type="train_sft"))
    dataset_config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config(arg_type="make_dataset"))

    device = get_current_device()
    if device == "cpu":
        logger.warning("Please note you are using CPU for training, non-Mac devices may encounter issues")

    dataset_info_path = os.path.join(dataset_config.dataset_dir, "dataset_info.json")
    with open(dataset_info_path, "r", encoding="utf-8") as f:
        dataset_info = json.load(f)

    # Without this check a missing entry passes None to os.path.join, which fails with
    # an opaque TypeError instead of naming the dataset that is not registered.
    file_name = dataset_info.get(train_config.dataset, {}).get("file_name")
    if not file_name:
        raise ValueError(
            f"Dataset '{train_config.dataset}' has no 'file_name' entry in '{dataset_info_path}'"
        )

    data_path = os.path.join(dataset_config.dataset_dir, file_name)
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"Dataset file '{data_path}' does not exist, please check if make-dataset was executed"
        )

    if not dataset_config.clean_dataset.enable_clean:
        logger.info("Data cleaning is not enabled, will use the original dataset.")
    else:
        cleaner = LLMCleaningStrategy(make_dataset_config=dataset_config)
        train_config.dataset = cleaner.clean()

    # Dump once and reuse the same dict for logging and for the training run.
    train_args = train_config.model_dump(mode="json")
    formatted_config = json.dumps(train_args, indent=4, ensure_ascii=False)
    logger.info(f"Fine-tuning configuration:\n{formatted_config}")

    run_exp(train_args)
def create_config_by_arg_type(arg_type: str, wc_config: WcConfig) -> BaseModel:
    """
    Build the configuration object that belongs to ``arg_type``.

    ``cli_args``, ``vllm`` and ``test_model`` return the matching section of
    ``wc_config`` unchanged. ``web_demo``/``api_service``, ``train_sft`` and
    ``make_dataset`` merge the dumped ``common_args`` with their own section (later
    values win) and validate the result as the corresponding final model.

    Args:
        arg_type: Name of the configuration to create.
        wc_config: The parsed base configuration.

    Returns:
        The configuration model for ``arg_type``.

    Raises:
        ValueError: If ``arg_type`` is not one of the supported names.
    """
    if arg_type == "cli_args":
        return wc_config.cli_args

    shared = wc_config.common_args.model_dump()

    if arg_type in ("web_demo", "api_service"):
        merged = dict(shared)
        merged.update(wc_config.infer_args.model_dump())
        return WCInferConfig(**merged)

    if arg_type == "vllm":
        return wc_config.vllm_args

    if arg_type == "test_model":
        return wc_config.test_model_args

    if arg_type == "train_sft":
        # The training model reads include_type to derive the dataset name.
        shared["include_type"] = wc_config.make_dataset_args.include_type
        merged = dict(shared)
        merged.update(wc_config.train_sft_args.model_dump())
        return WCTrainSftConfig(**merged)

    if arg_type == "make_dataset":
        merged = dict(shared)
        merged.update(wc_config.make_dataset_args.model_dump())
        # TODO: Should the following three parameters be moved to common?
        sft_args = wc_config.train_sft_args
        merged.update(
            dataset=sft_args.dataset,
            dataset_dir=sft_args.dataset_dir,
            cutoff_len=sft_args.cutoff_len,
        )
        return WCMakeDatasetConfig(**merged)

    raise ValueError("Unsupported argument type")
AUDIO = "audio" # VIDEO = "video" class CombineStrategy(StrEnum): """Combination strategy""" TIME_WINDOW = "time_window" class CleanStrategy(StrEnum): """Data cleaning strategy""" LLM = "llm" class FinetuningType(StrEnum): """Finetuning type""" LORA = "lora" # FULL = "full" # FREEZE = "freeze" class CommonArgs(BaseConfigModel): """NOTE that all parameters here will be parsed by `HfArgumentParser`. Non-HfArgumentParser parameters should be placed in make_dataset_args.""" model_name_or_path: str = Field(...) adapter_name_or_path: Optional[str] = Field(None, description="Also as output_dir of train_sft_args") template: str = Field(..., description="model template") default_system: str = Field(..., description="default system prompt") finetuning_type: FinetuningType = Field(FinetuningType.LORA) media_dir: str = Field("dataset/media") image_max_pixels: int = Field(409920, description="used in llama-factory, 409920 represents 720P") enable_thinking: bool = Field(False, description="used in llama-factory") trust_remote_code: bool = Field(True, description="used in huggingface") class CliArgs(BaseModel): model_config = {"extra": "forbid"} full_log: bool = Field(False) log_level: str = Field("INFO", description="DEBUG, INFO, WARNING, ERROR, CRITICAL") class LLMCleanConfig(BaseConfigModel): accept_score: int = Field( 2, description="Acceptable LLM scoring threshold: 1 (worst) to 5 (best). 
class MakeDatasetArgs(BaseConfigModel):
    """Options of the ``make_dataset_args`` section: how chat exports become training data."""

    # Overrides the extra="allow" inherited from BaseConfigModel: unknown keys are rejected.
    model_config = {"extra": "forbid"}

    platform: PlatformType = Field(..., description="Data source platform")
    # Must carry a real my_id when platform is Telegram (checked in WCMakeDatasetConfig).
    telegram_args: Optional[TelegramArgs] = None
    language: LanguageType = Field(LanguageType.ZH, description="Common language used in chat")
    # Including "image" makes the dataset name get a "-vl" suffix (see CommonMethods).
    include_type: List[DataModality] = Field([DataModality.TEXT], description="Types of data to include")
    max_image_num: int = Field(2, description="Maximum number of images per single data entry")
    blocked_words: List[str] = Field([], description="List of blocked words")
    add_time: bool = Field(False, description="Whether to add time to the dataset")
    add_relation: bool = Field(False, description="Whether to add chat member relationship to the dataset")
    single_combine_strategy: CombineStrategy = Field(
        CombineStrategy.TIME_WINDOW,
        description="Strategy for combining single person's messages into a single sentence",
    )
    qa_match_strategy: CombineStrategy = Field(
        CombineStrategy.TIME_WINDOW, description="Strategy for forming QA pairs"
    )
    # NOTE(review): both windows are documented in minutes, while
    # TimeWindowStrategy.is_same_conversation compares its time_window directly with a
    # difference in seconds - confirm that the caller converts these values.
    single_combine_time_window: int = Field(
        2, description="Time window for combining single person's messages (minutes)"
    )
    qa_match_time_window: int = Field(5, description="Time window for forming QA pairs (minutes)")
    # Combined messages longer than this are truncated when consecutive messages are merged.
    combine_msg_max_length: int = Field(2048, description="Maximum length of combined messages")
    messages_max_length: int = Field(
        2048, description="Maximum character count for messages, used with cutoff_len"
    )
    prompt_with_history: bool = Field(
        False, description="Whether to include conversation history in prompt, invalid for multimodal data"
    )
    clean_dataset: CleanDatasetConfig = Field(CleanDatasetConfig(), description="Data cleaning configuration")
    online_llm_clear: bool = Field(False)
    base_url: Optional[str] = Field(None, description="Base URL for online LLM")
    llm_api_key: Optional[str] = Field(None, description="API key for online LLM")
    model_name: Optional[str] = Field(
        None, description="Model name for online LLM, recommend using larger parameter models"
    )
    clean_batch_size: int = Field(10, description="Batch size for data cleaning")
    # vision_api.max_workers sizes the thread pool used to describe images in parallel.
    vision_api: VisionApiConfig = Field(VisionApiConfig())
class CommonMethods:
    """Mixin with helpers shared by the final configuration models."""

    def _parse_dataset_name(self) -> str:
        """
        Return the dataset name, suffixed with ``-vl`` when images are included.

        Reads ``dataset`` (default ``""``) and ``include_type`` (default: nothing
        included) from the instance; the suffix is added when ``"image"`` is one of
        the included types.
        """
        wants_images = "image" in getattr(self, "include_type", [])
        base_name = getattr(self, "dataset", "")
        return base_name + "-vl" if wants_images else base_name
class WCTrainSftConfig(CommonArgs, TrainSftArgs, CommonMethods):
    """Final configuration model for SFT training."""

    # Training output directory; filled from adapter_name_or_path by process_config.
    output_dir: Optional[str] = Field(None)
    dataset: str = Field(..., description="Dataset name")

    @model_validator(mode="after")
    def process_config(self):
        """Derive training-only fields once the model has been validated.

        Copies a truthy ``adapter_name_or_path`` into ``output_dir``, rewrites
        ``dataset`` through ``_parse_dataset_name`` and then removes the
        ``adapter_name_or_path`` and ``include_type`` attributes when present.
        """
        adapter_path = getattr(self, "adapter_name_or_path", None)
        if adapter_path:
            self.output_dir = adapter_path

        # Must run before include_type is dropped below, because the dataset
        # name is derived from it.
        self.dataset = self._parse_dataset_name()

        # Remove the consumed attributes, in the same order as before.
        for consumed in ("adapter_name_or_path", "include_type"):
            if hasattr(self, consumed):
                delattr(self, consumed)

        return self
class MultiLangList:
    """Parallel lists of translated strings addressed by language and index.

    ``translations`` maps a language code to a list of strings. Entries that
    share an index across languages are treated as translations of one
    another, so every list must have the same length.
    """

    def __init__(self, translations: Dict[str, List[str]], default_lang: str = "en"):
        """Store the translations, validate them and build the lookup table.

        Raises:
            ValueError: if ``translations`` is empty or its lists differ in length.
        """
        self.translations = translations
        self.current_lang = default_lang
        self.default_lang = default_lang

        # Validate that all translation lists have the same length
        self._validate_translations()

        # Build the reverse mapping used for fast text lookups
        self._build_reverse_mapping()

    def _validate_translations(self):
        """Validate that all translation lists have the same length."""
        if not self.translations:
            raise ValueError("Translations dictionary cannot be empty")

        # The first language's list is the reference length
        first_lang = next(iter(self.translations))
        expected_length = len(self.translations[first_lang])

        for lang, items in self.translations.items():
            if len(items) != expected_length:
                raise ValueError(
                    f"Translation list for '{lang}' has {len(items)} items, "
                    f"expected {expected_length} items (same as '{first_lang}')"
                )

    def _build_reverse_mapping(self):
        """Build ``text_to_index``: lower-cased text -> (language, index).

        When the same lower-cased text occurs more than once, the entry
        processed last wins.
        """
        self.text_to_index = {}
        for lang, items in self.translations.items():
            for index, text in enumerate(items):
                self.text_to_index[text.lower()] = (lang, index)

    def set_language(self, lang: str) -> "MultiLangList":
        """Switch the current language and return ``self`` for chaining.

        An unknown language leaves the current language untouched and prints a
        warning. ``self`` is returned in both cases so chained calls never
        receive ``None``.
        """
        if lang in self.translations:
            self.current_lang = lang
        else:
            print(f"Warning: Language '{lang}' not available, using default")
        return self

    def get_items(self, lang: Optional[str] = None) -> List[str]:
        """Return the list for ``lang`` (current language when omitted).

        Falls back to the default language's list when the requested language
        is unknown.

        Raises:
            KeyError: if neither the requested nor the default language exists.
        """
        target_lang = lang or self.current_lang
        # Look up the default lazily: an eager dict.get() default would raise
        # KeyError for a missing default language even when target_lang exists.
        if target_lang in self.translations:
            return self.translations[target_lang]
        return self.translations[self.default_lang]

    def get_item(self, index: int, lang: Optional[str] = None) -> str:
        """Return the entry at ``index`` for ``lang``.

        Raises:
            IndexError: if ``index`` is negative or past the end of the list.
        """
        items = self.get_items(lang)
        if 0 <= index < len(items):
            return items[index]
        raise IndexError("List index out of range")

    def translate_text(self, text: str, target_lang: Optional[str] = None) -> Optional[str]:
        """Translate a known text into another language.

        Args:
            text: Text to translate; matched case-insensitively.
            target_lang: Target language. When omitted it is inferred from the
                source language: ``en`` -> ``zh_CN`` and ``zh_CN`` -> ``en``.

        Returns:
            The translated text, or ``None`` when the text is unknown, the
            target cannot be inferred, or the target language is unavailable.
        """
        text_lower = text.lower()

        # Find which language and position the text belongs to
        if text_lower not in self.text_to_index:
            return None

        source_lang, index = self.text_to_index[text_lower]

        # Infer the target language when none was given
        if target_lang is None:
            if source_lang == "en":
                target_lang = "zh_CN"
            elif source_lang == "zh_CN":
                target_lang = "en"
            else:
                return None

        if target_lang in self.translations:
            target_items = self.translations[target_lang]
            if index < len(target_items):
                return target_items[index]

        return None

    def get_translation_pair(self, text: str) -> Dict[str, str]:
        """Return the English/Chinese pair for a known text.

        Args:
            text: Text to look up; matched case-insensitively.

        Returns:
            A dict keyed by ``en`` and ``zh_CN`` (only the languages that are
            present), e.g. ``{'en': 'Administrator', 'zh_CN': '管理员'}``, or an
            empty dict when the text is unknown.
        """
        text_lower = text.lower()

        if text_lower not in self.text_to_index:
            return {}

        _source_lang, index = self.text_to_index[text_lower]

        result = {}
        for lang in ["en", "zh_CN"]:
            if lang in self.translations and index < len(self.translations[lang]):
                result[lang] = self.translations[lang][index]

        return result

    def translate_batch(self, texts: List[str], target_lang: Optional[str] = None) -> List[Optional[str]]:
        """Translate each text with ``translate_text``.

        Args:
            texts: Texts to translate.
            target_lang: Target language passed through to ``translate_text``.

        Returns:
            One result per input text, ``None`` where no translation was found.
        """
        return [self.translate_text(text, target_lang) for text in texts]

    def __iter__(self):
        """Iterate over the entries of the current language."""
        return iter(self.get_items())

    def __len__(self):
        """Return the number of entries in the current language's list."""
        return len(self.get_items())

    def __getitem__(self, index):
        """Return the entry at ``index`` in the current language."""
        return self.get_item(index)
user_types = MultiLangList(user_types_data) status_list = MultiLangList(status_data) permissions = MultiLangList(permission_data) # 使用示例 print("=== 基本翻译功能 ===") # 中文翻译为英文 result1 = user_types.translate_text("管理员") print(f"'管理员' -> '{result1}'") # 输出: '管理员' -> 'Administrator' # 英文翻译为中文 result2 = user_types.translate_text("Guest") print(f"'Guest' -> '{result2}'") # 输出: 'Guest' -> '访客' # 指定目标语言 result3 = user_types.translate_text("管理员", target_lang="en") print(f"'管理员' -> '{result3}' (指定英文)") # 输出: '管理员' -> 'Administrator' (指定英文) print("\n=== 获取中英文对照 ===") translation_pair = user_types.get_translation_pair("Administrator") print(f"'Administrator' 的中英文对照: {translation_pair}") # 输出: {'en': 'Administrator', 'zh_CN': '管理员'} print("\n=== 批量翻译 ===") chinese_texts = ["管理员", "普通用户", "访客"] english_results = user_types.translate_batch(chinese_texts) print(f"批量翻译结果: {list(zip(chinese_texts, english_results))}") # 输出: [('管理员', 'Administrator'), ('普通用户', 'Regular User'), ('访客', 'Guest')] print("\n=== 状态列表翻译 ===") status_result = status_list.translate_text("活跃") print(f"'活跃' -> '{status_result}'") # 输出: '活跃' -> 'Active' status_result2 = status_list.translate_text("Pending") print(f"'Pending' -> '{status_result2}'") # 输出: 'Pending' -> '待定' print("\n=== 权限翻译 ===") perm_result = permissions.translate_text("读取") print(f"'读取' -> '{perm_result}'") # 输出: '读取' -> 'Read' print("\n=== 错误处理 ===") not_found = user_types.translate_text("不存在的文本") print(f"不存在的文本翻译结果: {not_found}") # 输出: None print("\n=== 当前语言设置 ===") user_types.set_language("zh_CN") print(f"当前语言列表: {list(user_types)}") # 输出中文列表 user_types.set_language("en") print(f"切换后列表: {list(user_types)}") # 输出英文列表 ================================================ FILE: weclone/utils/length_cdf.py ================================================ # Copyright 2025 the LlamaFactory team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
def calculate_token_length(
    text: str,
    model_name_or_path: str = "./models/Qwen3-32B-AWQ",
    template: str = "qwen3",
) -> int:
    """Count the tokens the model's tokenizer produces for ``text``.

    Args:
        text: Text to tokenize.
        model_name_or_path: Model path passed to ``get_train_args``.
        template: Template name passed to ``get_train_args``.

    Returns:
        Number of tokens in ``text``, encoded without special tokens.
    """
    logger.info(f"Calculating text token length: {text[:50]}...")

    # Only model_args is needed; the remaining parsed argument groups are unused.
    train_args = {
        "stage": "sft",
        "model_name_or_path": model_name_or_path,
        "template": template,
        "dataset": "chat-sft",
        "output_dir": "dummy_dir",
        "do_train": True,
    }
    model_args, _data_args, _, _, _ = get_train_args(train_args)

    tokenizer = load_tokenizer(model_args)["tokenizer"]

    # Encode the raw text directly, without adding special tokens
    token_length = len(tokenizer.encode(text, add_special_tokens=False))

    logger.info(f"Text token length: {token_length}")
    return token_length
Usage: export CUDA_VISIBLE_DEVICES=0 python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default """ logger.info("Starting cutoff_len calculation......") model_args, data_args, training_args, _, _ = get_train_args( { "stage": "sft", "model_name_or_path": model_name_or_path, "dataset": dataset, "dataset_dir": dataset_dir, "template": template, "cutoff_len": 1_000_000, "preprocessing_num_workers": 16, "output_dir": "dummy_dir", "media_dir": media_dir, "image_max_pixels": int(image_max_pixels), "overwrite_cache": True, "do_train": True, } ) tokenizer_module = load_tokenizer(model_args) template_obj = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args) # type: ignore trainset = get_dataset(template_obj, model_args, data_args, training_args, "sft", **tokenizer_module)[ "train_dataset" ] # type: ignore total_num = len(trainset) # type: ignore length_dict = defaultdict(int) for sample in tqdm(trainset["input_ids"], desc="Collecting lengths"): # type: ignore length_dict[len(sample) // interval * interval] += 1 length_tuples = list(length_dict.items()) length_tuples.sort() count_accu, prob_accu = 0, 0 logger.info(" cutoff_len configuration suggestions:") logger.warning("For multimodal tasks, please ensure cutoff_len is set to the maximum data length") for length, count in length_tuples: count_accu += count prob_accu += count / total_num * 100 logger.info(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {length + interval}.") if __name__ == "__main__": fire.Fire(length_cdf) ================================================ FILE: weclone/utils/log.py ================================================ import logging import os import sys import time from functools import wraps from loguru import logger logger.remove() env_log_level = os.getenv("WC_LOG_LEVEL") # Initialize basic log configuration, will be reconfigured later by configure_log_level_from_config logger.add( sys.stderr, format="[WeClone] 
class InterceptHandler(logging.Handler):
    """Standard-library logging handler that prints formatted records to stderr."""

    def __init__(self, level=logging.INFO):
        """Create the handler with ``level`` as its threshold (INFO by default)."""
        super().__init__(level=level)

    def emit(self, record):
        """Print ``record`` to stderr as ``[name] | L | HH:MM:SS | message``.

        Records below the handler's level are dropped. The single-letter level
        marker is wrapped in ANSI colour codes: cyan for INFO and above, the
        reset code otherwise.
        """
        # Only handle records at the configured level and above
        if record.levelno < self.level:
            return

        reset = "\033[0m"
        colour = reset if record.levelno < logging.INFO else "\033[36m"
        fields = (
            f"[{record.name}]",
            f"{colour}{record.levelname[0]}{reset}",
            time.strftime("%H:%M:%S"),
            record.getMessage(),
        )
        print(" | ".join(fields), file=sys.stderr)
def configure_log_level_from_config():
    """Rebuild the loguru sinks using the log level from the CLI config.

    Loads ``cli_args`` through ``load_config`` and reads its ``log_level``
    attribute, falling back to ``INFO`` (with a warning) when loading fails.
    All existing sinks are then removed and replaced by a stderr sink at that
    level and a DEBUG-level file sink at ``logs/weclone.log``. The module's
    ``intercept_handler`` is set to the same level.
    """
    log_level = "INFO"  # default value
    try:
        from weclone.utils.config import load_config

        log_level = getattr(load_config(arg_type="cli_args"), "log_level", "INFO")
    except Exception as e:
        logger.warning(f"Unable to load log level from config, using default INFO level: {e}")

    logger.remove()

    # Upper-cased only after the old sinks are removed, matching the original
    # order of operations.
    level_name = log_level.upper()

    console_sink = {
        "format": "[WeClone] {level.name[0]} | {time:HH:mm:ss} | {message}",
        "colorize": True,
        "level": level_name,
    }
    file_sink = {
        "rotation": "1 day",
        "retention": "7 days",
        "compression": "zip",
        "level": "DEBUG",
        "format": "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
        "encoding": "utf-8",
        "enqueue": True,
    }
    logger.add(sys.stderr, **console_sink)
    logger.add("logs/weclone.log", **file_sink)

    intercept_handler.setLevel(level_name)

    logger.info(f"Log level has been set to: {level_name}")
def retry_on_http_error(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
    jitter: bool = True,
    retry_on_status: Optional[List[int]] = None,
    retry_on_exceptions: Optional[List[type]] = None,
):
    """Build a decorator that retries a call on retryable HTTP statuses or exceptions.

    Args:
        max_retries: Maximum number of retries (the call runs at most
            ``max_retries + 1`` times).
        base_delay: Base delay in seconds.
        max_delay: Maximum delay in seconds.
        backoff_factor: Multiplier applied to the delay on each retry.
        jitter: Whether to add random jitter to the delay.
        retry_on_status: HTTP status codes that trigger a retry; defaults to
            429, 500, 502, 503 and 504.
        retry_on_exceptions: Exception types that trigger a retry; defaults to
            ``ConnectionError`` and ``TimeoutError``.

    Returns:
        A decorator. The wrapped function returns the call's result; when a
        retryable status persists through the final attempt, that last result
        is returned. Exceptions are re-raised when they are not retryable or
        when the attempts are exhausted.
    """
    retry_on_status = [429, 500, 502, 503, 504] if retry_on_status is None else retry_on_status
    retry_on_exceptions = (
        [ConnectionError, TimeoutError] if retry_on_exceptions is None else retry_on_exceptions
    )

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                retries_remain = attempt < max_retries
                try:
                    response = func(*args, **kwargs)

                    # Anything that is not a response with a retryable status
                    # is handed straight back to the caller.
                    if not hasattr(response, "status_code"):
                        return response
                    if response.status_code not in retry_on_status:
                        return response

                    if not retries_remain:
                        logger.error(
                            f"HTTP请求在 {max_retries + 1} 次尝试后最终失败,状态码: {response.status_code}"
                        )
                        return response

                    delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)
                    logger.warning(
                        f"HTTP请求返回状态码 {response.status_code},"
                        f"第 {attempt + 1}/{max_retries + 1} 次尝试,"
                        f"将在 {delay:.2f} 秒后重试..."
                    )
                    time.sleep(delay)
                    continue

                except Exception as e:
                    retryable = isinstance(e, tuple(retry_on_exceptions))

                    if not retryable:
                        logger.error(f"未知错误,不进行重试: {type(e).__name__}: {e}")
                        raise

                    if not retries_remain:
                        logger.error(f"请求在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}")
                        raise

                    delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)
                    logger.warning(
                        f"请求异常: {type(e).__name__}: {e},"
                        f"第 {attempt + 1}/{max_retries + 1} 次尝试,"
                        f"将在 {delay:.2f} 秒后重试..."
                    )
                    time.sleep(delay)
                    continue

            # Only reached when the loop body never runs (negative max_retries).
            return None

        return wrapper

    return decorator
def _calculate_delay(
    attempt: int, base_delay: float, max_delay: float, backoff_factor: float, jitter: bool
) -> float:
    """Compute the exponential-backoff delay for a retry attempt.

    Args:
        attempt: Zero-based attempt index used as the backoff exponent.
        base_delay: Delay in seconds for attempt 0.
        max_delay: Upper bound in seconds for the returned delay.
        backoff_factor: Multiplier applied once per attempt.
        jitter: Whether to randomize the delay by up to 20% either way.

    Returns:
        A delay in seconds that is never negative and never above
        ``max_delay`` (for a non-negative ``max_delay``), with or without
        jitter.
    """
    try:
        delay = base_delay * (backoff_factor**attempt)
    except OverflowError:
        # A float power too large to represent is capped like any other
        # oversized delay.
        delay = max_delay
    delay = min(delay, max_delay)

    if jitter:
        # +/-20% jitter, with the upper end capped so the jittered value still
        # respects max_delay instead of overshooting it.
        jitter_range = delay * 0.2
        delay = random.uniform(delay - jitter_range, min(delay + jitter_range, max_delay))

    # Final clamp: never negative, never above the cap.
    return max(0.0, min(delay, max_delay))
d.items(): argv.append("--" + k) if v is not None: argv.append(str(v)) return argv