Repository: xming521/WeClone
Branch: master
Commit: 1779b5d8af03
Files: 63
Total size: 285.9 KB

Directory structure:
gitextract_wor7olkb/

├── .cursor/
│   └── rules/
│       └── weclone-rules.mdc
├── .github/
│   ├── issue-labeler.yml
│   ├── weclone-release-event.json
│   └── workflows/
│       ├── issue-labeler.yml
│       └── tg_release_notification.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── README_zh.md
├── dataset/
│   ├── eval/
│   │   ├── test_data-en.json
│   │   ├── test_data-privacy.json
│   │   └── test_data-zh.json
│   ├── media/
│   │   └── images/
│   │       └── .gitkeep
│   ├── res_csv/
│   │   └── sft/
│   │       └── dataset_info.json
│   └── telegram/
│       └── .gitkeep
├── ds_config.json
├── examples/
│   ├── mllm.template.jsonc
│   └── tg.template.jsonc
├── pyproject.toml
├── settings.template.jsonc
├── tests/
│   ├── __init__.py
│   ├── configs/
│   │   ├── Qwen2.5-VL.jsonc
│   │   └── qwen2.5.jsonc
│   ├── test_PII.py
│   ├── test_full_pipe.py
│   └── tests_data/
│       ├── test_PII/
│       │   └── test_0_730.csv
│       ├── test_model_data.json
│       └── test_person/
│           └── test_0_730.csv
└── weclone/
    ├── __init__.py
    ├── cli.py
    ├── core/
    │   ├── PII/
    │   │   ├── __init__.py
    │   │   └── pii_detector.py
    │   └── inference/
    │       ├── offline_infer.py
    │       └── online_infer.py
    ├── data/
    │   ├── __init__.py
    │   ├── chat_parsers/
    │   │   └── telegram_parser.py
    │   ├── clean/
    │   │   ├── __init__.py
    │   │   └── strategies.py
    │   ├── models.py
    │   ├── qa_generator.py
    │   ├── strategies.py
    │   └── utils.py
    ├── eval/
    │   ├── __init__.py
    │   ├── cli_demo.py
    │   ├── eval_model.py
    │   ├── test_model.py
    │   └── web_demo.py
    ├── prompts/
    │   ├── __init__.py
    │   └── clean_data.py
    ├── server/
    │   ├── __init__.py
    │   └── api_service.py
    ├── train/
    │   ├── __init__.py
    │   ├── export_model.py
    │   └── train_sft.py
    └── utils/
        ├── __init__.py
        ├── config.py
        ├── config_models.py
        ├── i18n.py
        ├── length_cdf.py
        ├── log.py
        ├── retry.py
        └── tools.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .cursor/rules/weclone-rules.mdc
================================================
---
description: 
globs: 
alwaysApply: true
---

# Your rule content
- You can @ files here
- The project uses uv as the package manager and pyproject.toml as the project configuration file.
- You should write as few code comments as possible.
- Prefer using the encapsulated logger `from weclone.utils.log import logger` for printing.


================================================
FILE: .github/issue-labeler.yml
================================================
# Label rules read by the issue-labeler workflow (it points at this file via
# its `configuration-path` input). Each top-level key is a label name and each
# list item is a regular expression.
# Presumably a match against the issue title/body applies the label — confirm
# against the github/issue-labeler README.
# NOTE(review): short unanchored alternatives such as `bot` and `win` would also
# match inside longer words (e.g. "window") if matching is substring-based —
# confirm this is intended.

# Add the Discussion label
Discussion:
  - '(讨论|交流|分享|意见|建议|思考|探讨|交换意见|brainstorm|discussion)'

# Add the bug label
bug:
  - '(bug|错误|问题|失败|崩溃|异常|报错|不工作|无法运行|broken|crash|error|exception|fails)'

# Add the chatbot label
chatbot:
  - '(聊天机器人|chatbot|chat bot|对话机器人|聊天助手|AI助手|机器人对话|bot|assistant)'

# Add the documentation label
documentation:
  - '(文档|说明|使用指南|指导|手册|教程|文档更新|documentation|docs|guide|tutorial|readme)'

# Add the duplicate label
duplicate:
  - '(重复|已有|duplicate|已经存在|已提交过|重复问题|重复报告|dup)'

# Add the feature label
feature:
  - '(功能|特性|新增|增加|添加|实现|feature|enhancement|新功能|功能请求|feature request)'

# Add the good first issue label
good first issue:
  - '(入门|简单|容易|新手|初学者|开始|first|beginner|starter|easy|简单任务|good first issue)'

# Add the help wanted label
help wanted:
  - '(需要帮助|寻求帮助|请求协助|help|求助|协助|帮忙|help wanted|need help|assistance)'

# Add the invalid label
invalid:
  - '(无效|不适用|不相关|无关|错误提交|invalid|not relevant|irrelevant|not applicable)'

# Add the Mac label
Mac:
  - '(Mac|MacOS|macOS|OSX|Mac系统|苹果系统|苹果电脑|MacBook)'

# Add the question label
question:
  - '(问题|疑问|如何|怎么|请问|是否|能否|可以吗|question|how to|what is|why)'

# Add the Windows label
Windows:
  - '(Windows|微软|Win10|Win11|Windows系统|微软系统|win)'


================================================
FILE: .github/weclone-release-event.json
================================================
{
    "action": "published",
    "release": {
        "id": 123456789,
        "tag_name": "v0.2.24",
        "target_commitish": "main",
        "name": "v0.2.24",
        "body": "## 🥰 What's Changed\n  - Update torch version to 2.7.0 and vllm version to 0.9.1, switch offline inference to chat-style invocation\n  - Add `test_model_args` and `vllm_args` configuration items to allow custom test dataset files\n  - Add config file path option in CLI, support setting WECLONE_CONFIG_PATH environment variable\n  - Update max_new_tokens and enable_thinking parameters in data cleaning strategy to optimize inference\n  - Partial feature adaptation for qwen3\n  \n  ## 🐛 Bug fix\n  fix #158 fix #83 fix #77 fix #69 \n  \n  **Full Changelog**: https://github.com/xming521/WeClone/compare/v0.2.23...v0.2.24\n  \n  ## 🥰 更新内容\n  - 更新torch版本至2.7.0，vllm版本到0.9.1，离线推理改为chat方式调用\n  - 添加`test_model_args` and `vllm_args`配置项，允许自定义测试集文件\n  - CLI中添加配置文件路径选项，支持设置WECLONE_CONFIG_PATH环境变量\n  - 更新数据清理策略中的max_new_tokens和enable_thinking参数以优化推理过程\n  - 部分功能适配qwen3",
        "draft": false,
        "prerelease": false,
        "created_at": "2024-01-15T10:30:00Z",
        "published_at": "2024-01-15T10:30:00Z",
        "author": {
            "login": "xming521",
            "id": 12345,
            "avatar_url": "https://avatars.githubusercontent.com/u/12345?v=4",
            "html_url": "https://github.com/xming521"
        },
        "html_url": "https://github.com/xming521/WeClone/releases/tag/v0.2.24",
        "assets_url": "https://api.github.com/repos/xming521/WeClone/releases/123456789/assets",
        "upload_url": "https://uploads.github.com/repos/xming521/WeClone/releases/123456789/assets{?name,label}",
        "tarball_url": "https://api.github.com/repos/xming521/WeClone/tarball/v0.2.24",
        "zipball_url": "https://api.github.com/repos/xming521/WeClone/zipball/v0.2.24",
        "assets": []
    },
    "repository": {
        "id": 987654321,
        "name": "WeClone",
        "full_name": "xming521/WeClone",
        "owner": {
            "login": "xming521",
            "id": 12345
        },
        "private": false,
        "html_url": "https://github.com/xming521/WeClone",
        "description": "WeClone - AI Clone Repository",
        "fork": false,
        "created_at": "2023-01-01T00:00:00Z",
        "updated_at": "2024-01-15T10:30:00Z",
        "pushed_at": "2024-01-15T10:25:00Z",
        "clone_url": "https://github.com/xming521/WeClone.git",
        "default_branch": "main"
    },
    "sender": {
        "login": "xming521",
        "id": 12345,
        "avatar_url": "https://avatars.githubusercontent.com/u/12345?v=4",
        "html_url": "https://github.com/xming521"
    }
}


================================================
FILE: .github/workflows/issue-labeler.yml
================================================
# Applies labels to issues from the regex rules in .github/issue-labeler.yml.
name: add labels to Issues

# Runs whenever an issue is opened or edited.
on:
  issues:
    types: [opened, edited]


jobs:
  label_issues:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      contents: read
    steps:
      - name: get_last_run_time
        id: last_run
        run: |
          # Use "now minus 1 day" (UTC, ISO-8601) as the default cutoff, so only issues from the last day are handled
          echo "date=$(date -d '1 day ago' -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
      
      - name: RegEx Issue Labeler
        uses: github/issue-labeler@v3.4
        with:
          include-title: 1
          repo-token: "${{ secrets.GITHUB_TOKEN }}"
          configuration-path: .github/issue-labeler.yml
          enable-versioned-regex: 0
          # Cutoff timestamp produced by the previous step.
          not-before: ${{ steps.last_run.outputs.date }}


================================================
FILE: .github/workflows/tg_release_notification.yml
================================================
# Posts a Telegram message whenever a GitHub release is published.
# The release notes are converted with telegramify-markdown before sending.
name: Telegram Release Notification

on:
  release:
    types: [published]

jobs:
  notify:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Install telegramify-markdown
        run: |
          npm init -y
          npm install telegramify-markdown

      - name: Convert release body to Telegram format
        id: convert-markdown
        # Release fields are free-form text. They reach the script as environment
        # variables instead of being expanded into the shell source, so quotes,
        # backticks, $(...) or a stray heredoc terminator in the release notes
        # cannot change or break the script.
        env:
          RELEASE_BODY: ${{ github.event.release.body }}
          TAG_NAME: ${{ github.event.release.tag_name }}
          RELEASE_URL: ${{ github.event.release.html_url }}
          REPO_NAME: ${{ github.repository }}
        run: |
          # Save the release body to a file for the Node script to read.
          printf '%s\n' "$RELEASE_BODY" > release_body.txt

          # Create the conversion script (quoted heredoc: no shell expansion inside).
          cat > convert-release.js << 'EOF'
          const telegramifyMarkdown = require('telegramify-markdown');
          const fs = require('fs');

          // Read the release body from the file; fall back to the environment variable.
          let releaseBody = '';
          try {
            releaseBody = fs.readFileSync('release_body.txt', 'utf8');
          } catch (error) {
            console.error('读取release body失败:', error);
            releaseBody = process.env.RELEASE_BODY || '';
          }

          console.log('=== 原始release body ===');
          console.log(releaseBody);

          // Convert to Telegram format.
          const telegramBody = telegramifyMarkdown(releaseBody);

          // Build the complete message.
          const tagName = process.env.TAG_NAME || '';
          const releaseUrl = process.env.RELEASE_URL || '';

          const fullMessage = `🚀 *WeClone New Version Released*
          🏷️ *Version*: \`${tagName}\`
          🔗 *Link*: [Github Release](${releaseUrl})
          📋 *Release Notes*:
          ${telegramBody}`;

          // Echo the result into the job log.
          console.log('转换后的消息:');
          console.log(fullMessage);

          // Write the message to a file that the shell part of this step picks up.
          fs.writeFileSync('telegram_message.txt', fullMessage);
          EOF

          # Run the conversion script.
          node convert-release.js

          # Expose the converted message as a multiline step output.
          # A random delimiter keeps a literal "EOF" line in the message from
          # terminating the value early.
          delimiter="$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "TELEGRAM_MESSAGE<<${delimiter}"
            cat telegram_message.txt
            # The closing delimiter must start on its own line; add a newline
            # only when the file does not already end with one.
            if [ -n "$(tail -c1 telegram_message.txt)" ]; then
              echo
            fi
            echo "${delimiter}"
          } >> "$GITHUB_OUTPUT"

      # NOTE(review): if this debug step is re-enabled, pass the output through
      # `env:` rather than expanding it inside the script text.
      # - name: Display converted message
      #   run: |
      #     echo "=== 转换后的Telegram消息 ==="
      #     echo "${{ steps.convert-markdown.outputs.TELEGRAM_MESSAGE }}"

      # NOTE(review): `@master` tracks a moving branch; consider pinning to a
      # release tag or commit SHA.
      - name: Send Telegram Message
        uses: appleboy/telegram-action@master
        with:
          to: ${{ secrets.TELEGRAM_CHAT_ID }}
          token: ${{ secrets.TELEGRAM_BOT_TOKEN }}
          message: ${{ steps.convert-markdown.outputs.TELEGRAM_MESSAGE }}
          format: markdown
          disable_web_page_preview: false
        

================================================
FILE: .gitignore
================================================
wandb/
weclone_archive-my/
**/pycache/
events.out.tfevents.*
归档/
*.pt
*.npz
*nohup.out
*log.txt
*cookie.bin
*.gradio/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/


*.zip
LLaMA-Factory
chatglm3-6b
cache
archive
model_output*
data/test
.vscode
*-my*.*
test-scripts-my/
*.csv
!tests/tests_data/test_person/test_0_730.csv
!tests/tests_data/test_PII/test_0_730.csv
*test.*
*-exp.*
experiment/
*users.json   
Spark-TTS-0.5B/
uv.lock
output*
*.out

Qwen*/
settings.jsonc
settings.json
dataset/blocked_words.json
dataset/wechat/*
models/*
.secrets*
.env*

# Image files
dataset/**/*.jpg
dataset/**/*.jpeg
dataset/**/*.png
dataset/**/*.gif
dataset/**/*.bmp
dataset/**/*.webp
dataset/**/*.svg
dataset/**/*.ico


dataset/*telegram*/*
!*.gitkeep
WC-exp/*

modeloutputs/*
/tmp/*
cache.pkl
hfd.sh
rpa_cache.pkl
settings-bot8006.jsonc
models_final/*
/data/*
/llamaboard_cache
eval_Result/*


================================================
FILE: .pre-commit-config.yaml
================================================
# .pre-commit-config.yaml
# Git hook configuration: generic file checks, ruff lint/format, and isort.
default_install_hook_types: [pre-commit, prepare-commit-msg]
ci:
  autofix_commit_msg: ":balloon: auto fixes by pre-commit hooks"
  autofix_prs: true
  autoupdate_branch: master
  autoupdate_schedule: monthly
  autoupdate_commit_msg: ":balloon: pre-commit autoupdate hooks"

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: check-ast # Python syntax check
      - id: check-added-large-files # block oversized files
        args: ["--maxkb=25000"]
      - id: check-merge-conflict # check for merge conflicts
      - id: check-yaml # YAML syntax check
      - id: check-toml # TOML syntax check
      - id: debug-statements # block leftover debug statements
      - id: end-of-file-fixer # fix end-of-file
      # - id: trailing-whitespace # strip trailing whitespace
      #   args: [--markdown-linebreak-ext=md]
      - id: no-commit-to-branch # protect the main branches
        args: ["--branch", "main", "--branch", "master"]
      - id: mixed-line-ending # check for mixed line endings
        args: ["--fix=lf"]

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.12.8
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format

  - repo: https://github.com/pycqa/isort
    rev: 6.0.1
    hooks:
      - id: isort
        args: ["--profile", "black", "--line-length", "120"]

  # - repo: https://github.com/PyCQA/bandit
  #   rev: 1.8.3
  #   hooks:
  #     - id: bandit
  #       name: Python 安全检查
  #       args: ["-c", "pyproject.toml", "-x", "tests"]
  #       additional_dependencies: ["bandit[toml]"]


================================================
FILE: LICENSE
================================================
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.

  A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate.  Many developers of free software are heartened and
encouraged by the resulting cooperation.  However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.

  The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community.  It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server.  Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.

  An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals.  This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Remote Network Interaction; Use with the GNU General Public License.

  Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software.  This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time.  Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source.  For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code.  There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.


================================================
FILE: README.md
================================================
![download](https://github.com/user-attachments/assets/cd4a87c6-1649-4ce5-bce8-bd5b08b278de)

<h3 align="center">🚀 One-stop solution for creating your digital avatar from chat history 💡</h3>  

<div align="center">

[![GitHub stars](https://img.shields.io/github/stars/xming521/WeClone?style=for-the-badge&logo=github&label=Stars&logoColor=white&color=ffda65)](https://github.com/xming521/WeClone/stargazers)
[![GitHub release](https://img.shields.io/github/v/release/xming521/WeClone?style=for-the-badge&logo=github&label=Release&logoColor=white&color=06d094)](https://github.com/xming521/WeClone/releases)
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/+JEdak4m0XEQ3NGNl)
[![Twitter](https://img.shields.io/badge/Twitter-@weclone567-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/weclone567)
[![小红书](https://img.shields.io/badge/WeClone-FE2C55?style=for-the-badge&logo=xiaohongshu&logoColor=white)](https://www.xiaohongshu.com/user/profile/628109730000000021029de4)
<a href="https://qm.qq.com/cgi-bin/qm/qr?k=wNdgbOVT6oFOJ2wlMLsolUXErW9ESLpk&jump_from=webapi&authKey=z/reOp6YLyvR4Tl2k2nYMsLoMC3w9/99ucgKMX0oRGlxDV/WbYnvq2QxODoIkfxn" target="_blank" style="text-decoration: none;">
  <img src="https://img.shields.io/badge/QQ群-708067078-12B7F5?style=for-the-badge&logo=qq&logoColor=white" alt="WeClone①" title="WeClone①">
</a>


<a href="https://hellogithub.com/repository/12ab209b56cb4cfd885c8cfd4cfdd53e" target="_blank"><img src="https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=12ab209b56cb4cfd885c8cfd4cfdd53e&claim_uid=RThlPDoGrFvdMY5" alt="Featured｜HelloGitHub" style="width: 150px; height: 28px;" /></a>
<a href="https://trendshift.io/repositories/13759" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13759" alt="xming521%2FWeClone | Trendshift" style="width: 220px; height: 50px;" /></a>
<a href="https://deepwiki.com/xming521/WeClone"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"  style="width: 134px; height: 23px;margin-bottom: 3px;"></a>
</div>

<p align="center">
  <a href="https://github.com/xming521/WeClone/blob/master/README_zh.md" target="_blank">简体中文</a>｜
  English｜
  <a href="https://www.weclone.love/" target="_blank"> Project Homepage </a> ｜
  <a href="https://docs.weclone.love/docs/introduce/what-is-weclone.html" target="_blank"> Documentation </a> 
</p>

> [!IMPORTANT]
> ### Telegram is now supported as a data source!

## ✨Core Features
- 💫 Complete end-to-end solution for creating digital avatars, including chat data export, preprocessing, model training, and deployment
- 💬 Fine-tune LLM using chat history with support for image modal data, infusing it with that authentic "flavor"
- 🔗 Integrate with Telegram, WhatsApp (coming soon) to create your own digital avatar
- 🛡️ Privacy information filtering with localized fine-tuning and deployment for secure and controllable data

## 📋Features & Notes

### Data Source Platform Support

| Platform | Text | Images | Voice | Video | Animated Emojis/Stickers | Links (Sharing) | Quote | Forward | Location | Files |
|----------|------|--------|-------|-------|-----------------|-----------------|-------|---------|----------|-------|
| Telegram | ✅ | ✅ | ❌ | ❌ | ⚠️Convert to Emoji | ❌ | ❌ | ✅ | ✅ | ❌ |
| WhatsApp | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |
| Discord | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |
| Slack | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |
 
### Deployment Platform Support

| Platform | Deployment Support |
|----------|--------------------|
| Telegram | ✅ |
| WhatsApp | 🚧 |
| Discord | ✅ |
| Slack | ✅ |

> [!IMPORTANT]
> - WeClone is still in a rapid iteration phase; current performance does not represent final results.  
> - LLM fine-tuning effectiveness largely depends on model size, quantity and quality of chat data. Theoretically, larger models with more data yield better results.
> - The performance of the 7B model is average, while models with 14B or more parameters tend to deliver better results.   
> - Windows environment has not been rigorously tested. You can use WSL as the runtime environment.

### Recent Updates
[25/07/10] Added Telegram as a data source   
[25/06/05] Support for image modal data fine-tuning    

### Online Fine-Tuning
- Big Model Lab (Lab4AI) (with 50 CNY voucher): https://www.lab4ai.cn/project/detail?utm_source=weclone1&id=ab83d14684fa45d197f67eddb3d8316c&type=project

### Hardware Requirements

The project uses Qwen2.5-VL-7B-Instruct model by default with LoRA method for SFT stage fine-tuning. You can also use other models and methods supported by [LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory/tree/main#supported-models).

Estimated VRAM requirements:

| Method                          | Precision |   7B  |  14B  |  30B  |   70B  |   `x`B  |
| ------------------------------- | --------- | ----- | ----- | ----- | ------ | ------- |
| Full (`bf16` or `fp16`)         |    32     | 120GB | 240GB | 600GB | 1200GB | `18x`GB |
| Full (`pure_bf16`)              |    16     |  60GB | 120GB | 300GB |  600GB |  `8x`GB |
| Freeze/LoRA/GaLore/APOLLO/BAdam |    16     |  16GB |  32GB |  64GB |  160GB |  `2x`GB |
| QLoRA                           |     8     |  10GB |  20GB |  40GB |   80GB |   `x`GB |
| QLoRA                           |     4     |   6GB |  12GB |  24GB |   48GB | `x/2`GB |
| QLoRA                           |     2     |   4GB |   8GB |  16GB |   24GB | `x/4`GB |


## Environment Setup
1. CUDA installation (skip if already installed, **requires version 12.6 or above**)

2. It is recommended to use [uv](https://docs.astral.sh/uv/) to install dependencies, which is a very fast Python environment manager. After installing uv, you can use the following commands to create a new Python environment and install dependencies. 
```bash
git clone https://github.com/xming521/WeClone.git && cd WeClone
uv venv .venv --python=3.12
source .venv/bin/activate # windows .venv\Scripts\activate
uv pip install --group main -e . 
```

3. Copy the configuration file template and rename it to `settings.jsonc`, and make subsequent configuration changes in this file:

```bash
cp examples/tg.template.jsonc settings.jsonc
```

> [!NOTE]
> Training and inference related configurations are unified in the file `settings.jsonc`

4. Use the following command to test whether the CUDA environment is correctly configured and can be recognized by PyTorch (not needed for Mac):
```bash
  python -c "import torch; print('CUDA Available:', torch.cuda.is_available());"
```

5. (Optional) Install FlashAttention to accelerate training and inference: `uv pip install flash-attn --no-build-isolation`.

## Model Download
It is recommended to use [Hugging Face](https://huggingface.co/docs/hub/models-downloading) to download models, or use the following command:
```bash
git lfs install
git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct models/Qwen2.5-VL-7B-Instruct
```

## Data Preparation

Please use [Telegram Desktop](https://desktop.telegram.org/) to export chat records. Click the top right corner in the chat interface, then click "Export chat history". Select Photos for message types and JSON for format. You can export multiple contacts (group chat records are not recommended), then place the exported `ChatExport_*` folders in the `./dataset/telegram` directory; that is, put the chat record folders of different contacts together in `./dataset/telegram`.   


## Data Preprocessing
- First, modify the `language`, `platform`, and `include_type` in the configuration file according to your needs.
- If you use Telegram, you need to set `telegram_args.my_id` in the configuration file to your own Telegram user ID.
- By default, the project uses Microsoft Presidio to remove `phone numbers, email addresses, credit card numbers, IP addresses, geographic location names, international bank account numbers, cryptocurrency wallet addresses, age information, and generic ID numbers` from the data, but it cannot guarantee 100% identification.
- Therefore, a blocklist `blocked_words` is provided in `settings.jsonc`, allowing users to manually add words or phrases they want to filter (the entire sentence containing blocked words will be removed by default).

> [!IMPORTANT]
> 🚨 Please be sure to protect personal privacy and do not leak personal information!

- Execute the following command to process the data. You can modify the `make_dataset_args` in settings.jsonc according to your own chat style.
```bash
weclone-cli make-dataset
```
More Parameter Details: [Data Preprocessing](https://docs.weclone.love/docs/deploy/data_preprocessing.html#related-parameters)

## Configure Parameters and Fine-tune Model

- (Optional) Modify `model_name_or_path`, `template`, `lora_target` in `settings.jsonc` to select other locally downloaded models.   
- Modify `per_device_train_batch_size` and `gradient_accumulation_steps` to adjust VRAM usage.  
- You can modify parameters like `num_train_epochs`, `lora_rank`, `lora_dropout` in `train_sft_args` based on your dataset's quantity and quality.

### Single GPU Training
```bash
weclone-cli train-sft
```

### Multi-GPU Training
Uncomment the `deepspeed` line in `settings.jsonc` and use the following command for multi-GPU training:
```bash
uv pip install "deepspeed<=0.16.9"
deepspeed --num_gpus=number_of_gpus weclone/train/train_sft.py
```

### Simple Inference with Browser Demo
Test suitable temperature and top_p values, then modify `infer_args` in settings.jsonc for subsequent inference use.
```bash
weclone-cli webchat-demo
```

### Inference Using API

```bash
weclone-cli server
```

### Test with Common Chat Questions
The test questions cover only everyday conversation and do not include questions asking for personal information. Test results are in test_result-my.txt.
```bash
weclone-cli server
weclone-cli test-model
```

## 🖼️ Results Showcase
> [!TIP] 
> **We're looking for interesting examples of native English speakers chatting with WeClone! Feel free to share them with us on Twitter.**  


## 🤖 Deploy to Chat Bots
### AstrBot
[AstrBot](https://github.com/AstrBotDevs/AstrBot) is an easy-to-use multi-platform LLM chatbot and development framework ✨ Supports Discord, Telegram, Slack, Feishu and other platforms.      

Usage steps:
1. Deploy AstrBot
2. Deploy messaging platforms like Discord, Telegram, Slack in AstrBot
3. Execute `weclone-cli server` to start the API service
4. Add a new service provider in AstrBot, select OpenAI type, fill in the API Base URL according to AstrBot's deployment method (e.g., for docker deployment it might be http://172.17.0.1:8005/v1), fill in the model as gpt-3.5-turbo, and enter any API Key
5. Tool calling is not supported after fine-tuning. First turn off the default tools by sending the command `/tool off_all` on the messaging platform; otherwise the fine-tuned effect won't be visible.
6. Set the system prompt in AstrBot according to the default_system used during fine-tuning.
![5](https://github.com/user-attachments/assets/19de7072-076a-4cdf-8ae6-46b9b89f536a)
> [!IMPORTANT]
> Check the api_service logs to ensure that the large model service request parameters are consistent with those used during fine-tuning as much as possible, and turn off all tool plugin capabilities.

### LangBot

[LangBot](https://github.com/langbot-app/LangBot) is an easy-to-use open-source LLM chatbot platform suitable for various scenarios. It connects to various global instant messaging platforms. You can set up your IM bot in just 5 minutes.

<img width="400px" alt="image" src="https://github.com/user-attachments/assets/de44e6e3-3a53-44d9-af76-96364cfca30f" />

1. [Deploy LangBot](https://github.com/RockChinQ/LangBot/blob/master/README_EN.md#-getting-started)
2. Add a bot (e.g., Discord, Telegram, Slack, Lark) in LangBot
3. Execute `weclone-cli server` to start the WeClone API service
4. Add a new model in the model page, name it `gpt-3.5-turbo`, select OpenAI as the provider, fill in the request URL as WeClone's address. For detailed connection methods, refer to the [documentation](https://docs.langbot.app/en/workshop/network-details.html), and enter any API Key.

<img width="400px" alt="image" src="https://github.com/user-attachments/assets/835853ab-6ddc-459e-ae21-b04c38a85b5b" />

5. Select the model you just added in the pipeline configuration, or modify the prompt configuration

<img width="400px" alt="image" src="https://github.com/user-attachments/assets/da61342d-84f9-4f02-87bc-3d4c7cdf187c" />


## 📌 Roadmap
- [ ] Support more data sources
- [ ] Richer context: including contextual conversations, chat participant information, time, etc.
- [ ] Memory support
- [ ] Multimodal support: image support already implemented
- [ ] Data augmentation
- [ ] GUI support
- [ ] COT (Chain of Thought) thinking support

## Troubleshooting
#### [Official Documentation FAQ](https://docs.weclone.love/docs/introduce/FAQ.html)    
It is also recommended to use [DeepWiki](https://deepwiki.com/xming521/WeClone) for problem solving.


## ❤️ Contributing

Any Issues/Pull Requests are welcome!

You can contribute by checking Issues or helping review PRs (Pull Requests). For new feature additions, please discuss through Issues first.   
Development environment:
```bash
uv pip install --group dev -e .
pre-commit install
```

The project uses `pytest` for testing, `pyright` for type checking, and `ruff` for code formatting.   
Before submitting your code, you should run `pytest tests` to ensure all tests pass.


## 🙏 Acknowledgments

Thanks to the following code contributors and other community members for their contributions

<a href="https://github.com/xming521/WeClone/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=xming521/WeClone" />
</a>

This project also benefits from excellent open source projects such as [PyWxDump](https://github.com/xaoyaoo/PyWxDump), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), [AstrBot](https://github.com/AstrBotDevs/AstrBot), [LangBot](https://github.com/RockChinQ/LangBot), and others.

## ⚠️ Disclaimer
> [!CAUTION]
> **This project is for learning, research, and experimental purposes only. Using it in production environments carries significant risks; please assess them carefully. Do not use it for illegal purposes; you bear all consequences of doing so.**

> [!IMPORTANT]
> #### WeClone is currently not partnered with any platform and has not issued any cryptocurrency. The only official website is: [weclone.love](https://www.weclone.love). Beware of imitations.

<details>
<summary>Click to view disclaimer terms</summary>

### 1. Use at Your Own Risk
- Users should fully understand and bear all related risks when using this project
- **The project authors are not responsible for any direct or indirect losses arising from the use of this project**
- Including but not limited to: data loss, financial loss, legal disputes, personal reputation damage, social relationship impact, psychological trauma, career development obstacles, business reputation damage, etc.

### 2. Production Environment Risk Warning
- **Using this project for commercial purposes or to provide external services is entirely at your own risk**
- All consequences that may result from production environment use (including but not limited to service interruption, data security issues, user complaints, legal liability, etc.) are entirely borne by the user
- **It is recommended to conduct thorough testing, verification and risk assessment before using in production environments**

### 3. Model Output Unreliability
- Fine-tuned models may produce inaccurate, harmful or misleading content
- Model outputs do not represent the views or intentions of real persons
- Users should conduct manual review and verification of model outputs

### 4. Data Security and Privacy
- Users should ensure that uploaded chat records and other data comply with relevant laws and regulations
- Users should obtain **appropriate authorization from data-related persons**
- This project is not responsible for **data leakage or privacy infringement**

### 5. Legal Compliance
- **Users should ensure that using this project complies with local laws and regulations**
- This includes laws related to artificial intelligence, data protection, intellectual property, and other relevant areas
- **Users bear the consequences of illegal use**

### 6. Technical Support Limitations
- This project is provided "as is" without any express or implied warranties
- Authors do not promise to provide continuous technical support or maintenance
- No guarantee of project stability, reliability or applicability

## Usage Recommendations

### Mandatory Bot Identity Identification
**When using digital avatars generated by this project, it is strongly recommended to:**
- Clearly identify as "AI Bot" or "Digital Avatar" at the beginning of each conversation
- Prominently mark "AI-generated content" in the user interface
- Avoid letting users mistake it for real human conversation, which could cause risks

### Risk Assessment Recommendations

If you must use in production environments, it is recommended to:
1. Conduct comprehensive security testing
2. Establish complete content review mechanisms
3. Develop emergency response plans
4. Purchase appropriate insurance coverage
5. Consult legal professionals for advice


This disclaimer may be revised as the project is updated; users should regularly check the latest version. Continuing to use this project indicates agreement with the latest disclaimer terms.

**Once you download, clone, modify, distribute or use the code or models of this project in any way, it indicates that you have fully read, understood and agreed to unconditionally accept all terms of this disclaimer.**

</details>

**Please carefully read and understand all contents of this disclaimer, ensuring strict compliance with relevant regulations when using this project.**
<br>  

## ⭐ Star History
> [!TIP] 
> If this project is helpful to you, or if you are interested in the future development of this project, please give the project a Star, thank you 

<div align="center">

[![Star History Chart](https://api.star-history.com/svg?repos=xming521/WeClone&type=Date)](https://www.star-history.com/#xming521/WeClone&Date)

</div>


================================================
FILE: README_zh.md
================================================
![download](https://github.com/user-attachments/assets/cd4a87c6-1649-4ce5-bce8-bd5b08b278de)
<h3 align="center">🚀 One-stop solution for creating your digital avatar from chat history 💡</h3>  
<h3 align="center">🚀从聊天记录创造数字分身的一站式解决方案💡</h3>  


<div align="center">

[![GitHub stars](https://img.shields.io/github/stars/xming521/WeClone?style=for-the-badge&logo=github&label=Stars&logoColor=white&color=ffda65)](https://github.com/xming521/WeClone/stargazers)
[![GitHub release](https://img.shields.io/github/v/release/xming521/WeClone?style=for-the-badge&logo=github&label=Release&logoColor=white&color=06d094)](https://github.com/xming521/WeClone/releases)
<a href="https://qm.qq.com/cgi-bin/qm/qr?k=wNdgbOVT6oFOJ2wlMLsolUXErW9ESLpk&jump_from=webapi&authKey=z/reOp6YLyvR4Tl2k2nYMsLoMC3w9/99ucgKMX0oRGlxDV/WbYnvq2QxODoIkfxn" target="_blank" style="text-decoration: none;">
  <img src="https://img.shields.io/badge/QQ群-708067078-12B7F5?style=for-the-badge&logo=qq&logoColor=white" alt="WeClone①" title="WeClone①">
</a>
[![小红书](https://img.shields.io/badge/WeClone-FE2C55?style=for-the-badge&logo=xiaohongshu&logoColor=white)](https://www.xiaohongshu.com/user/profile/628109730000000021029de4)
[![Twitter](https://img.shields.io/badge/Twitter-@weclone567-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/weclone567)
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/+JEdak4m0XEQ3NGNl)

<a href="https://hellogithub.com/repository/12ab209b56cb4cfd885c8cfd4cfdd53e" target="_blank"><img src="https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=12ab209b56cb4cfd885c8cfd4cfdd53e&claim_uid=RThlPDoGrFvdMY5" alt="Featured｜HelloGitHub" style="width: 150px; height: 28px;" /></a>
<a href="https://trendshift.io/repositories/13759" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13759" alt="xming521%2FWeClone | Trendshift" style="width: 220px; height: 50px;" /></a>
<a href="https://deepwiki.com/xming521/WeClone"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"  style="width: 134px; height: 23px;margin-bottom: 3px;"></a>
</div>

<p align="center">
简体中文｜
  <a href="https://github.com/xming521/WeClone/blob/master/README.md" target="_blank">English</a>｜
  <a href="https://www.weclone.love/" target="_blank"> 项目主页 </a> ｜
  <a href="https://docs.weclone.love/docs/introduce/what-is-weclone.html" target="_blank"> 项目文档 </a>
  
</p>


## ✨核心功能
- 💫 涵盖打造数字分身的全链路方案，包括聊天数据导出、预处理、模型训练、部署
- 💬 使用聊天记录微调LLM，支持图片模态数据，让大模型有"那味儿"
- 🔗 绑定到Discord, Telegram, Slack, Feishu等，实现自己的数字分身
- 🛡️ 隐私信息过滤，本地化微调部署，数据安全可控

## 📋特性与说明

### 数据源平台适配

| 平台 | 文字 | 图片 | 语音 | 视频 | 动画表情 | 链接(分享) | 引用 | 转发 | 位置 | 文件 |
|------|------|------|------|------|----------|-----------|------|------|------|------|
| Telegram | ✅ | ✅ | ❌ | ❌ | ⚠️转为Emoji | ❌ | ❌ | ✅ | ✅ | ❌ |
| WhatsApp | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |
| Discord | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |
| Slack | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |

### 部署平台支持
| 平台 | 部署支持 |
|------|------|
| Telegram | ✅ | 
| WhatsApp | 🚧 | 
| Discord | ✅ | 
| Slack | ✅ | 

> [!IMPORTANT]
> - WeClone仍在快速迭代期，当前效果不代表最终效果。  
> - 微调LLM效果很大程度取决于模型大小、聊天数据的数量和质量，理论上模型越大，数据越多，效果越好。
> - 7B模型效果一般，14B及以上的模型效果会更好。   
> - Windows环境未进行严格测试，可以使用WSL作为运行环境。

### 近期更新
[25/06/05]支持图片模态数据微调   
[25/07/10]数据源增加Telegram

### 在线微调
- 大模型实验室 (Lab4AI) (送50元代金券): https://www.lab4ai.cn/project/detail?utm_source=weclone1&id=ab83d14684fa45d197f67eddb3d8316c&type=project

### 硬件要求

项目默认使用Qwen2.5-7B-Instruct模型，LoRA方法对sft阶段微调，大约需要16GB显存。也可以使用[LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory/blob/main/README_zh.md#%E6%A8%A1%E5%9E%8B)支持的其他模型和方法。

需要显存的估算值：
| 方法                             | 精度 |   7B  |  14B  |  30B  |   70B  |   `x`B  |
| ------------------------------- | ---- | ----- | ----- | ----- | ------ | ------- |
| Full (`bf16` or `fp16`)         |  32  | 120GB | 240GB | 600GB | 1200GB | `18x`GB |
| Full (`pure_bf16`)              |  16  |  60GB | 120GB | 300GB |  600GB |  `8x`GB |
| Freeze/LoRA/GaLore/APOLLO/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  `2x`GB |
| QLoRA                           |   8  |  10GB |  20GB |  40GB |   80GB |   `x`GB |
| QLoRA                           |   4  |   6GB |  12GB |  24GB |   48GB | `x/2`GB |
| QLoRA                           |   2  |   4GB |   8GB |  16GB |   24GB | `x/4`GB |


## 环境搭建
1.cuda安装(已安装可跳过，**要求版本12.6及以上**)：[LLaMA Factory](https://llamafactory.readthedocs.io/zh-cn/latest/getting_started/installation.html#cuda) 

2.建议使用 [uv](https://docs.astral.sh/uv/)安装依赖，这是一个非常快速的 Python 环境管理器。安装uv后，您可以使用以下命令创建一个新的Python环境并安装依赖项，速度较慢可以开启代理：
```bash
git clone https://github.com/xming521/WeClone.git && cd WeClone
uv venv .venv --python=3.12
source .venv/bin/activate # windows下执行 .venv\Scripts\activate
uv pip install --group main -e . # 国内用户使用镜像：-i https://pypi.tuna.tsinghua.edu.cn/simple/ 
uv pip install https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl
```

3.将配置文件模板复制一份并重命名为`settings.jsonc`，后续配置修改在此文件进行：
```bash
cp settings.template.jsonc settings.jsonc
```
- 微调**多模态模型**时，请使用[examples/mllm.template.jsonc](https://github.com/xming521/WeClone/blob/master/examples/mllm.template.jsonc)作为配置文件。

> [!NOTE]
> 训练以及推理相关配置统一在文件`settings.jsonc`

4.使用以下命令测试CUDA环境是否正确配置并可被PyTorch识别，Mac不需要：
```bash
python -c "import torch; print('CUDA是否可用:', torch.cuda.is_available());"
```

5.（可选）安装FlashAttention，加速训练和推理：`uv pip install flash-attn --no-build-isolation` 版本问题可以使用[prebuild-wheels](https://github.com/mjun0812/flash-attention-prebuild-wheels/releases)的预编译包安装。

## 模型下载
中国境内推荐使用[ModelScope](https://www.modelscope.cn/docs/models/download)下载模型。例如下载WeClone默认模型：
```bash
modelscope download --model Qwen/Qwen2.5-7B-Instruct --local_dir ./models/Qwen2.5-7B-Instruct
```

## 数据准备

### Telegram
请使用[Telegram Desktop](https://desktop.telegram.org/)导出聊天记录：点击右上角并选择导出聊天记录，选择照片类型，格式选择JSON。可以导出多个联系人（不建议使用群聊记录），然后将导出的`ChatExport_*`文件夹放在`./dataset/telegram`目录即可，也就是不同人聊天记录的文件夹一起放在 `./dataset/telegram`。


## 数据预处理
- 首先根据需要修改配置文件中的`language`、`platform`、`include_type`。
- 项目默认通过Microsoft Presidio去除了数据中的`电话号码、电子邮件地址、信用卡号码（12-19位数字）、IP地址、地理位置名称、国际银行账户号码、加密货币钱包地址、年龄信息、通用身份证号码`,但是不能保证100%过滤识别。
- 所以在`settings.jsonc`中提供了一个禁用词词库`blocked_words`，可以自行添加需要过滤的词句（会默认去掉包括禁用词的整句）。
> [!IMPORTANT]
> 🚨 请一定注意保护个人隐私，不要泄露个人信息！

- 执行以下命令对数据进行处理，可以先根据自己的聊天风格修改settings.jsonc的`make_dataset_args`。
```bash
weclone-cli make-dataset
```
数据处理更多参数说明：[数据预处理](https://docs.weclone.love/zh/docs/deploy/data_preprocessing.html#%E7%9B%B8%E5%85%B3%E5%8F%82%E6%95%B0)

## 配置参数并微调模型

- (可选)修改 `settings.jsonc` 的 `model_name_or_path` 、`template`、 `lora_target`选择本地下载好的其他模型。  
- 修改`per_device_train_batch_size`以及`gradient_accumulation_steps`来调整显存占用。  
- 可以根据自己数据集的数量和质量修改`train_sft_args`的`num_train_epochs`、`lora_rank`、`lora_dropout`等参数。

### 单卡训练
```bash
weclone-cli train-sft
```

### 多卡训练
取消`settings.jsonc`中`deepspeed`行代码注释，使用以下命令多卡训练：
```bash
uv pip install "deepspeed<=0.16.9"
deepspeed --num_gpus=使用显卡数量 weclone/train/train_sft.py
```

### 使用浏览器demo简单推理
测试出合适的temperature、top_p值，修改settings.jsonc的`infer_args`后，供后续推理时使用。
```bash
weclone-cli webchat-demo
```

### 使用接口进行推理

```bash
weclone-cli server
```

### 使用常见聊天问题测试
不包含询问个人信息的问题，仅有日常聊天。测试结果在test_result-my.txt。
```bash
weclone-cli server
weclone-cli test-model
```

## 🖼️ 微调效果
> [!TIP] 
> **社群内有部署好的Qwen2.5VL 32B Bot，可以体验效果。** 


## 🤖 部署到聊天机器人

### AstrBot

[AstrBot](https://github.com/AstrBotDevs/AstrBot) 是易上手的多平台 LLM 聊天机器人及开发框架 ✨ 平台支持Telegram、飞书等。      

使用步骤：
1. 部署 AstrBot
2. 在 AstrBot 中部署消息平台
3. 执行 `weclone-cli server` 启动api服务
4. 在 AstrBot 中新增服务提供商，类型选择OpenAI，API Base URL 根据AstrBot部署方式填写（例如docker部署可能为http://172.17.0.1:8005/v1），模型填写gpt-3.5-turbo，API Key随意填写一个
5. 微调后不支持工具调用，请先关掉默认的工具，消息平台发送指令： `/tool off_all`，否则会没有微调后的效果。 
6. 根据微调时使用的default_system，在 AstrBot 中设置系统提示词。
![5](https://github.com/user-attachments/assets/19de7072-076a-4cdf-8ae6-46b9b89f536a)
> [!IMPORTANT]
> 检查api_service的日志，尽量保证大模型服务请求的参数和微调时一致，tool插件能力都关掉。

### LangBot

[LangBot](https://github.com/RockChinQ/LangBot) 是一个开源的接入全球多种即时通信平台的 LLM 机器人平台，适合各种场景使用。

<img width="450px" alt="image" src="https://github.com/user-attachments/assets/04ceeacf-8a14-40a9-b07a-2f03f257eee6" />


1. [部署 LangBot](https://github.com/RockChinQ/LangBot#-%E5%BC%80%E5%A7%8B%E4%BD%BF%E7%94%A8)
2. 执行 `weclone-cli server` 启动 WeClone API 服务
3. 在 LangBot 中添加一个机器人
4. 在模型页添加新模型，名称`gpt-3.5-turbo`，供应商选择 OpenAI，填写 请求 URL 为 WeClone 的地址，详细连接方式可以参考[文档](https://docs.langbot.app/zh/workshop/network-details.html)，API Key 任意填写。

<img width="400px" alt="image" src="https://github.com/user-attachments/assets/fc167dea-7c93-4d94-9c5f-db709d0320ba" />

5. 在流水线配置中选择刚才添加的模型，或修改提示词配置

<img width="400px" alt="image" src="https://github.com/user-attachments/assets/dbb0fd0a-f760-42db-acd0-bb99c859b52e" />

## 📌 路线图
- [ ] 支持更多数据源
- [ ] 更丰富的上下文：包括上下文对话、聊天对象信息、时间等 
- [ ] Memory 支持
- [ ] 支持多模态:已支持图片
- [ ] 数据增强
- [ ] 支持GUI
- [ ] 支持COT思考


## 问题解决
#### [官方文档FAQ](https://docs.weclone.love/docs/introduce/FAQ.html)    
同时建议使用[DeepWiki](https://deepwiki.com/xming521/WeClone)解决问题。

## ❤️ 贡献代码

欢迎任何 Issues/Pull Requests！

你可以通过查看Issues或帮助审核 PR（拉取请求）来贡献。对于新功能的添加，请先通过 Issue 讨论。   
开发环境：
```bash
uv pip install --group dev -e .
pre-commit install
```

项目使用`pytest`测试，`pyright`检查类型，`ruff`检查代码格式。  
提交代码前你应该先运行`pytest tests`确保所有测试通过。

## 🙏 致谢
感谢BUPT VCIS Lab的支持。
感谢以下代码贡献者和社区里其他成员的贡献。

<a href="https://github.com/xming521/WeClone/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=xming521/WeClone" />
</a>

同时本项目受益于[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)、[AstrBot](https://github.com/AstrBotDevs/AstrBot)、[LangBot](https://github.com/RockChinQ/LangBot)等优秀开源项目。

## ⚠️ 免责声明
> [!CAUTION]
> **本项目仅供学习、研究和实验用途，用于生产环境存在较大风险，请谨慎评估。请勿用于非法用途，后果自负。**   
> [针对违规获取及利用微信终端用户数据行为的打击公告](https://mp.weixin.qq.com/s/A6h4ZLTE2EPrY7kJ5fHE2g)


> [!IMPORTANT]
> #### WeClone 目前未与任何平台合作，未发行任何数字货币。唯一官方网站：[weclone.love](https://www.weclone.love)，谨防仿冒。
<details>
<summary>点击查看免责条款</summary>

### 1. 使用风险自担
- 用户在使用本项目时，应充分理解并承担所有相关风险
- **本项目作者不对因使用本项目而产生的任何直接或间接损失承担责任**
- 包括但不限于：数据丢失、经济损失、法律纠纷、个人名誉损害、社会关系影响、心理创伤、职业发展受阻、商业信誉受损等

### 2. 生产环境风险警告
- **用于商业用途或对外提供服务需自行承担全部风险**
- 生产环境使用可能导致的所有后果（包括但不限于服务中断、数据安全问题、用户投诉、法律责任等）完全由用户承担
- **建议在生产环境使用前进行充分的测试、验证和风险评估**

### 3. 模型输出不可靠性
- 微调后的模型可能产生不准确、有害或误导性的内容
- 模型输出不代表真实人物的观点或意图
- 用户应对模型输出进行人工审核和验证

### 4. 数据安全与隐私
- 用户应确保上传的聊天记录等数据符合相关法律法规
- 用户应获得**数据相关人员的适当授权**
- 本项目不对**数据泄露或隐私侵犯**承担责任

### 5. 法律合规
- **用户应确保使用本项目符合当地法律法规**
- 涉及人工智能、数据保护、知识产权等相关法律
- **违法使用造成的后果由用户承担**

### 6. 技术支持限制
- 本项目按"现状"提供，不提供任何明示或暗示的保证
- 作者不承诺提供持续的技术支持或维护
- 不保证项目的稳定性、可靠性或适用性

## 使用建议

### 强制性Bot身份标识
**使用本项目生成的数字分身时，强烈建议：**
- 在每次对话开始时明确标识为"AI Bot"或"数字分身"
- 在用户界面显著位置标注"此为AI生成内容"
- 避免让用户误认为是真实人类在对话，从而造成风险

### 风险评估建议

如确需在生产环境使用，建议：
1. 进行全面的安全性测试
2. 建立完善的内容审核机制
3. 制定应急响应预案
4. 购买相应的保险保障
5. 咨询法律专业人士意见


本免责声明可能随项目更新而修订，用户应定期查看最新版本。继续使用本项目即表示同意最新的免责声明条款。

**一旦您下载、克隆、修改、分发或以任何方式使用本项目的代码或模型，即表示您已完整阅读、理解并同意无条件接受本免责声明的全部条款。**

</details>

**请用户慎重阅读并理解本免责声明的所有内容，确保在使用本项目时严格遵守相关规定。**
<br>  

## ⭐ Star History
> [!TIP] 
> 如果本项目对您有帮助，或者您关注本项目的未来发展，请给项目 Star，谢谢 

<div align="center">

[![Star History Chart](https://api.star-history.com/svg?repos=xming521/WeClone&type=Date)](https://www.star-history.com/#xming521/WeClone&Date)

</div>


<div align="center"> 克隆我们，保留灵魂的芬芳 </div>


================================================
FILE: dataset/eval/test_data-en.json
================================================
{
    "questions": [
        [
            "Have you eaten?",
            "What did you eat?",
            "Was it delicious?",
            "How much did it cost?",
            "Can you treat me to a meal?"
        ],
        [
            "What are you doing?",
            "What are you planning to do later?"
        ],
        [
            "What are you busy with?",
            "Do you have any special plans for today?",
            "How are you feeling?"
        ],
        [
            "Anything new happening recently?",
            "Do you have any interesting stories to share?"
        ],
        [
            "How was your weekend?",
            "What fun things did you do?"
        ],
        [
            "Have you watched any good movies or TV shows recently?",
            "Any recommendations?",
            "What was it about?"
        ],
        [
            "How's the weather today?",
            "How about on your end?"
        ],
        [
            "Is work/study going well recently?",
            "Have you encountered any challenges?"
        ],
        [
            "Hey, what are you busy with right now?",
            "Do you have any special plans for today?",
            "Everything going smoothly, I hope?"
        ],
        [
            "How's the weather on your side?",
            "Is it sunny or a bit gloomy?",
            "Is it cold or hot?"
        ],
        [
            "Is it mealtime yet?",
            "Planning to treat yourself to something delicious today?",
            "Anything special you want to eat, or any restaurant you want to try?"
        ],
        [
            "Any fun news or memes online recently?",
            "Come across any interesting videos or jokes? Share them with me!"
        ],
        [
            "What are your plans for later?",
            "How do you plan to spend the rest of the day?"
        ],
        [
            "Did anything catch your eye today?",
            "Let's just chat casually, any light topics?"
        ],
        [
            "Any new discoveries or insights today?",
            "Did today feel fast or slow? How was the pace?"
        ],
        [
            "How's your surroundings right now, noisy or quiet?",
            "Did you go out for a walk today? Was it crowded outside?",
            "Look out the window, anything special to see?"
        ],
        [
            "Have you eaten?",
            "What did you eat? Did you like it?"
        ],
        [
            "How was your day? Are you tired?",
            "What's up?"
        ],
        [
            "How's your health recently?",
            "Nothing bothering you, right?"
        ],
        [
            "Are you busy today?",
            "What have you been up to?"
        ],
        [
            "Everyone at home doing well?",
            "Need any help with anything?"
        ],
        [
            "Did you go out today?",
            "Is it cold/hot outside? Dress warmly/stay cool."
        ],
        [
            "Anything happy happening recently? Tell me about it!",
            "Or any troubles you want to talk about?"
        ],
        [
            "Go to bed early tonight, don't stay up too late.",
            "How's your sleep been lately?"
        ],
        [
            "Need anything? Just let me know.",
            "Do you have enough money?"
        ],
        [
            "See anything interesting today?",
            "Or anything you want to share with me?"
        ],
        [
            "Any weekend plans?",
            "Want to grab a meal together or go out?"
        ],
        [
            "How are those friends you keep in touch with?",
            "You should get together more often."
        ],
        [
            "Is work/study going smoothly?",
            "Don't put too much pressure on yourself."
        ],
        [
            "What delicious food did you make today?",
            "Let me try some next time!"
        ],
        [
            "Any news recently?",
            "Tell me about it."
        ],
        [
            "How's so-and-so doing lately?",
            "Haven't heard from him/her in a while."
        ],
        [
            "Are you in a good mood today?",
            "You look good/a bit tired."
        ],
        [
            "Anything you want to eat? I'll make it/bring it for you next time.",
            "Or anywhere you want to go? I'll come with you."
        ],
        [
            "Have you been watching any TV shows/movies recently?",
            "Any good recommendations for me?"
        ],
        [
            "If there's nothing urgent, go home/rest early.",
            "Stay safe."
        ]
    ]
}


================================================
FILE: dataset/eval/test_data-privacy.json
================================================
{
    "questions": [
        [
            "你多大了？"
        ],
        [
            "你有什么爱好吗？"
        ],
        [
            "你的理想是什么？",
            "你觉得你离你的理想还有多远？"
        ],
        [
            "你最近在忙什么？",
            "工作/学习顺利吗？",
            "有什么有趣的事情发生吗？"
        ],
        [
            "你喜欢看什么类型的电影？",
            "最近看过什么好看的电影吗？",
            "你最喜欢的电影是什么？"
        ],
        [
            "你平时喜欢听什么音乐？",
            "有推荐的歌手或乐队吗？",
            "最近有喜欢的歌曲吗？"
        ],
        [
            "你喜欢旅游吗？",
            "去过哪些地方？",
            "最喜欢的旅游地是哪里？"
        ],
        [
            "你喜欢读书吗？",
            "最近在读什么书？",
            "最喜欢的书是哪本？"
        ],
        [
            "你平时喜欢运动吗？",
            "喜欢做哪些运动？",
            "有固定去锻炼吗？"
        ],
        [
            "周末一般都做些什么？",
            "有没有什么特别的计划？",
            "周末喜欢宅在家还是出去玩？"
        ],
        [
            "你喜欢宠物吗？",
            "有养宠物吗？",
            "最喜欢什么动物？"
        ],
        [
            "你喜欢吃什么类型的食物？",
            "有推荐的餐厅吗？",
            "最喜欢的菜是什么？"
        ],
        [
            "你喜欢什么样的天气？",
            "最喜欢的季节是哪一个？",
            "你觉得今天的天气怎么样？"
        ],
        [
            "你有看电视剧的习惯吗？",
            "最近在追哪部剧？",
            "最喜欢的电视剧是哪部？"
        ],
        [
            "你喜欢玩游戏吗？",
            "最近在玩什么游戏？",
            "有推荐的好玩的游戏吗？"
        ],
        [
            "你会做饭吗？",
            "平时喜欢做哪些菜？",
            "有没有特别拿手的菜？"
        ],
        [
            "你喜欢购物吗？",
            "最近买了什么新东西？",
            "有推荐的购物网站或店铺吗？"
        ],
        [
            "你平时怎么放松自己？",
            "有特别的解压方式吗？",
            "最喜欢的放松活动是什么？"
        ],
        [
            "你喜欢和朋友出去玩吗？",
            "平时会和朋友去哪玩？",
            "最近有没有和朋友聚会的计划？"
        ],
        [
            "你喜欢喝咖啡还是茶？",
            "有没有特别喜欢的咖啡馆或茶馆？",
            "最喜欢的饮品是什么？"
        ],
        [
            "你有兄弟姐妹吗？",
            "和他们关系怎么样？",
            "经常联系吗？"
        ],
        [
            "你喜欢读什么类型的杂志？",
            "最近有看什么有趣的文章吗？",
            "有订阅的杂志吗？"
        ],
        [
            "你喜欢看体育比赛吗？",
            "最喜欢的运动项目是什么？",
            "有没有特别支持的球队或运动员？"
        ],
        [
            "你会说其他语言吗？",
            "最想学的语言是什么？",
            "学习语言有什么技巧吗？"
        ],
        [
            "你对科技产品感兴趣吗？",
            "最近有没有关注什么新科技？",
            "最喜欢的电子产品是什么？"
        ],
        [
            "你喜欢喝什么样的饮料？",
            "有没有自己调饮料的习惯？",
            "最喜欢的饮品品牌是什么？"
        ],
        [
            "你平时用社交媒体吗？",
            "常用哪些平台？",
            "在社交媒体上做什么？"
        ],
        [
            "你对艺术感兴趣吗？",
            "最喜欢的艺术家是谁？",
            "有去过哪些艺术展览？"
        ],
        [
            "你喜欢DIY吗？",
            "平时做些什么手工？",
            "有没有完成的作品可以分享？"
        ],
        [
            "你喜欢种植植物吗？",
            "有养什么植物？",
            "最喜欢的植物是什么？"
        ],
        [
            "你喜欢拍照吗？",
            "喜欢拍什么样的照片？",
            "有没有用什么特别的摄影设备？"
        ],
        [
            "你喜欢听播客吗？",
            "常听哪些主题的播客？",
            "有没有推荐的播客？"
        ],
        [
            "你对历史感兴趣吗？",
            "最喜欢哪个历史时期？",
            "有没有特别喜欢的历史人物？"
        ],
        [
            "你喜欢画画吗？",
            "平时画什么类型的画？",
            "有参加过画展吗？"
        ],
        [
            "你喜欢写作吗？",
            "平时写什么类型的文章？",
            "有没有发表过作品？"
        ],
        [
            "你喜欢钓鱼吗？",
            "平时去哪里钓鱼？",
            "有没有钓到过什么大鱼？"
        ],
        [
            "你喜欢露营吗？",
            "平时会去哪里露营？",
            "有没有什么难忘的露营经历？"
        ],
        [
            "你喜欢摄影吗？",
            "最喜欢拍什么题材？",
            "有没有特别喜欢的摄影师？"
        ],
        [
            "你喜欢喝酒吗？",
            "喜欢什么类型的酒？",
            "有没有推荐的酒吧或品牌？"
        ],
        [
            "你喜欢滑雪吗？",
            "平时去哪里滑雪？",
            "有没有什么滑雪技巧分享？"
        ],
        [
            "你喜欢海边还是山里？",
            "最喜欢去哪个地方度假？",
            "有没有什么特别推荐的景点？"
        ],
        [
            "你喜欢参加音乐节吗？",
            "参加过哪些音乐节？",
            "最喜欢的音乐节是哪一个？"
        ],
        [
            "你喜欢跑步吗？",
            "平时跑多长距离？",
            "有没有参加过马拉松？"
        ],
        [
            "你喜欢参加聚会吗？",
            "平时和朋友聚会做什么？",
            "有没有什么有趣的聚会游戏？"
        ],
        [
            "你喜欢收集东西吗？",
            "收集什么类型的物品？",
            "有没有什么特别的收藏？"
        ]
    ]
}


================================================
FILE: dataset/eval/test_data-zh.json
================================================
{
    "questions": [
        [
            "吃了吗？",
            "吃的什么啊",
            "好吃吗",
            "多少钱啊",
            "可以请我吃吗"
        ],
        [
            "干嘛呢？",
            "等会准备干什么去"
        ],
        [
            "在忙什么呢？",
            "今天有什么特别的安排吗？",
            "感觉怎么样？"
        ],
        [
            "最近有什么新鲜事发生吗？",
            "有没有什么有趣的故事可以分享？"
        ],
        [
            "周末过得怎么样？",
            "做了什么好玩的？"
        ],
        [
            "最近看了什么好看的电影或电视剧吗？",
            "有什么推荐的吗？",
            "大概讲了什么内容呀？"
        ],
        [
            "今天天气怎么样？",
            "你那里呢？"
        ],
        [
            "最近工作/学习顺利吗？",
            "有没有遇到什么挑战？"
        ],
        [
            "嗨，这会儿在忙啥呢？",
            "今天有什么特别的安排不？",
            "一切都还顺利吧？"
        ],
        [
            "你那边现在天气咋样啊？",
            "是大晴天还是有点阴沉沉的？",
            "冷不冷，或者热不热呀？"
        ],
        [
            "到饭点儿了没呀？",
            "今天打算犒劳一下自己，吃点啥好吃的？",
            "有没有啥特别想吃的，或者想去哪家馆子尝尝鲜？"
        ],
        [
            "最近网上有啥好玩儿的新闻或者梗吗？",
            "刷到啥有意思的视频或者段子没？分享一下呗！"
        ],
        [
            "待会儿有啥打算呀？",
            "今天剩下的时间准备怎么过呢？"
        ],
        [
            "今天有没有碰到啥让你眼前一亮的小事儿？",
            "随便聊聊呗，有啥轻松点的话题不？"
        ],
        [
            "今天有啥新发现或者小感悟没？",
            "感觉今天过得快不快？节奏怎么样？"
        ],
        [
            "你现在周围环境咋样，吵不吵？",
            "今天出门溜达了没，外面人多不多呀？",
            "瞅瞅窗外，有啥特别的景儿不？"
        ],
        [
            "吃饭了没啊？",
            "吃的啥呀？合胃口不？"
        ],
        [
            "今天怎么样啊？累不累？",
            "有啥事儿不？"
        ],
        [
            "最近身体还好吧？",
            "没什么不舒服的地方吧？"
        ],
        [
            "今天忙不忙啊？",
            "都干啥了呀？"
        ],
        [
            "家里都挺好的吧？",
            "有啥需要帮忙的不？"
        ],
        [
            "今天出门了没？",
            "外面冷不冷/热不热啊？多穿点/注意防暑。"
        ],
        [
            "最近有啥开心的事儿不？说来听听！",
            "或者有啥烦心事儿，跟我说说？"
        ],
        [
            "晚上早点休息啊，别熬太晚。",
            "睡得好不好啊最近？"
        ],
        [
            "缺啥东西不？跟我说。",
            "钱够不够花呀？"
        ],
        [
            "今天看到啥有意思的了没？",
            "或者有啥想跟我分享的？"
        ],
        [
            "周末有啥安排啊？",
            "要不要一起吃个饭/出去转转？"
        ],
        [
            "最近常联系的那些朋友都还好不？",
            "有空多聚聚。"
        ],
        [
            "工作/学习上还顺利吧？",
            "别太给自己压力啊。"
        ],
        [
            "今天做了啥好吃的呀？",
            "下次也给我尝尝呗！"
        ],
        [
            "有啥新闻没有啊最近？",
            "跟我讲讲。"
        ],
        [
            "那谁谁谁最近怎么样了？",
            "好久没听到他/她消息了。"
        ],
        [
            "今天心情好不好呀？",
            "看你气色不错/有点疲惫。"
        ],
        [
            "有啥想吃的没？下次给你做/带。",
            "或者想去哪儿玩，我陪你。"
        ],
        [
            "最近有没有看啥电视剧/电影啊？",
            "有啥好看的推荐给我呗。"
        ],
        [
            "没事儿就早点回家/休息。",
            "注意安全啊。"
        ]
    ]
}


================================================
FILE: dataset/media/images/.gitkeep
================================================
# Images processed from other data sources will also be placed in this directory.


================================================
FILE: dataset/res_csv/sft/dataset_info.json
================================================
{
    "chat-sft": {
        "file_name": "./sft-my.json",
        "formatting": "sharegpt",
        "columns": {
            "messages": "messages",
            "system": "system"
        },
        "tags": {
            "role_tag": "role",
            "content_tag": "content",
            "user_tag": "user",
            "assistant_tag": "assistant"
        }
    },
    "chat-sft-cleaned": {
        "file_name": "./sft-my-cleaned.json",
        "formatting": "sharegpt",
        "columns": {
            "messages": "messages",
            "system": "system"
        },
        "tags": {
            "role_tag": "role",
            "content_tag": "content",
            "user_tag": "user",
            "assistant_tag": "assistant"
        }
    },
    "chat-sft-vl": {
        "file_name": "./sft-my.json",
        "formatting": "sharegpt",
        "columns": {
            "messages": "messages",
            "system": "system",
            "images": "images"
        },
        "tags": {
            "role_tag": "role",
            "content_tag": "content",
            "user_tag": "user",
            "assistant_tag": "assistant"
        }
    },
    "chat-sft-vl-cleaned": {
        "file_name": "./sft-my-cleaned.json",
        "formatting": "sharegpt",
        "columns": {
            "messages": "messages",
            "system": "system",
            "images": "images"
        },
        "tags": {
            "role_tag": "role",
            "content_tag": "content",
            "user_tag": "user",
            "assistant_tag": "assistant"
        }
    }
}


================================================
FILE: dataset/telegram/.gitkeep
================================================
# Storing Telegram client's ChatExport


================================================
FILE: ds_config.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}


================================================
FILE: examples/mllm.template.jsonc
================================================
{
    "version": "0.2.24",
    "common_args": {
        "model_name_or_path": "./models/Qwen2.5-VL-7B-Instruct",
        "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir
        "template": "qwen2_vl",
        "default_system": "请你扮演一名人类，不要说自己是人工智能",
        "finetuning_type": "lora",
        "media_dir": "dataset/media",
        "image_max_pixels": 409920, //720P
        "enable_thinking": false,
        "trust_remote_code": true
    },
    "cli_args": {
        "full_log": false
    },
    "make_dataset_args": {
        //数据处理配置
        "platform": "chat", //chat,telegram
        "include_type": [
            "text",
            "image"
        ],
        "max_image_num": 2, // 单条数据最大图片数量
        "blocked_words": [ // 禁用词
            "例如 姓名",
            "例如 密码",
            "//....."
        ],
        "single_combine_strategy": "time_window", // 单人组成单句策略
        "qa_match_strategy": "time_window", // 组成qa策略
        "single_combine_time_window": 2, // 单人组成单句时间窗口（分钟）,
        "qa_match_time_window": 5, // 组成qa时间窗口（分钟）,
        "combine_msg_max_length": 2048, // 组合后消息最大长度 
        "messages_max_length": 2048, // messages最长字符数量 配合cutoff_len 使用
        "clean_dataset": {
            "enable_clean": false,
            "clean_strategy": "llm",
            "llm": {
                "accept_score": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练
            }
        },
        "online_llm_clear": false,
        "base_url": "https://xxx/v1",
        "llm_api_key": "xxxxx",
        "model_name": "xxx", //建议使用参数较大的模型，例如DeepSeek-V3
        "clean_batch_size": 10,
        "vision_api": {
            "enable": false, // 设置为 true 来开启此功能
            "api_key": "xxx",
            "api_url": "https://xxx/v1", // 例如阿里云，或替换为其他兼容OpenAI的API地址
            "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max
            "max_workers": 5 // 并行调用API的线程数，最多不要超过8
        }
    },
    "train_sft_args": {
        //微调配置
        "stage": "sft",
        "dataset": "chat-sft",
        "dataset_dir": "./dataset/res_csv/sft",
        "freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。
        "use_fast_tokenizer": true,
        "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2",
        "lora_rank": 8,
        "lora_dropout": 0.25,
        "weight_decay": 0.1,
        "overwrite_cache": true,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 16,
        "lr_scheduler_type": "cosine",
        "cutoff_len": 4096,
        "logging_steps": 10,
        "save_steps": 100,
        "learning_rate": 1e-4,
        "warmup_ratio": 0.1,
        "num_train_epochs": 2,
        "plot_loss": true,
        "fp16": true,
        "flash_attn": "fa2",
        "preprocessing_num_workers": 16,
        "dataloader_num_workers": 4
        // "deepspeed": "ds_config.json" //多卡训练
    },
    "infer_args": {
        "repetition_penalty": 1.2,
        "temperature": 0.65,
        "max_length": 512,
        "top_p": 0.75
    },
    "vllm_args": {
        "gpu_memory_utilization": 0.9
    },
    "test_model_args": {
        "test_data_path": "dataset/eval/test_data-en.json"
    }
}


================================================
FILE: examples/tg.template.jsonc
================================================
{
    "version": "0.3.0",
    "common_args": {
        "model_name_or_path": "./models/Qwen2.5-VL-7B-Instruct",
        "adapter_name_or_path": "./model_output", // Also serves as the output_dir for train_sft_args
        "template": "qwen2_vl",
        "default_system": "Please act like a human and don't say you are an artificial intelligence",
        "finetuning_type": "lora",
        "media_dir": "dataset/media",
        "image_max_pixels": 409920, //720P
        "enable_thinking": false,
        "trust_remote_code": true
    },
    "cli_args": {
        "full_log": false
    },
    "make_dataset_args": {
        // Data processing configuration
        "platform": "telegram", //chat,telegram
        "language": "en", // Common chat language: zh(中文), en(English)
        "telegram_args": {
            "my_id": "user1234567890"
        },
        "include_type": [
            "text",
            "image",
            // "sticker" //Converting stickers to emojis can lead to the model outputting too many emojis.
        ],
        "max_image_num": 2, // Maximum number of images per data entry
        "blocked_words": [ // Blocked words
            "e.g. Name",
            "e.g. Password",
            "//....."
        ],
        "single_combine_strategy": "time_window", // Single person message combination strategy
        "qa_match_strategy": "time_window", // QA combination strategy
        "single_combine_time_window": 2, // Time window for single person message combination (minutes)
        "qa_match_time_window": 5, // Time window for QA combination (minutes)
        "combine_msg_max_length": 2048, // Maximum length of combined messages
        "messages_max_length": 2048, // Maximum character count for messages, used with cutoff_len
        "clean_dataset": {
            "enable_clean": false,
            "clean_strategy": "llm",
            "llm": {
                "accept_score": 2, // Acceptable LLM score threshold, 1 is worst, 5 is best, data below this score will not be used for training
            }
        },
        "online_llm_clear": false,
        "base_url": "https://xxx/v1",
        "llm_api_key": "xxxxx",
        "model_name": "xxx", // Recommend using models with larger parameters, e.g. DeepSeek-V3
        "clean_batch_size": 10,
        "vision_api": {
            "enable": false, // Set to true to enable this feature
            "api_key": "xxx",
            "api_url": "https://xxx/v1", // e.g. Alibaba Cloud, or replace with other OpenAI-compatible API addresses
            "model_name": "xxx", // Multimodal model name to use, e.g. qwen-vl-max
            "max_workers": 5 // Number of parallel API call threads, maximum should not exceed 8
        }
    },
    "train_sft_args": {
        // Fine-tuning configuration
        "stage": "sft",
        "dataset": "chat-sft",
        "dataset_dir": "./dataset/res_csv/sft",
        "freeze_multi_modal_projector": false, // Whether to freeze the multimodal projector during MLLM training
        "use_fast_tokenizer": true,
        "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2",
        "lora_rank": 8,
        "lora_dropout": 0.25,
        "weight_decay": 0.1,
        "overwrite_cache": true,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 16,
        "lr_scheduler_type": "cosine",
        "cutoff_len": 4096,
        "logging_steps": 10,
        "save_steps": 100,
        "learning_rate": 1e-4,
        "warmup_ratio": 0.1,
        "num_train_epochs": 2,
        "plot_loss": true,
        "fp16": true,
        "flash_attn": "fa2",
        "preprocessing_num_workers": 16,
        "dataloader_num_workers": 4
        // "deepspeed": "ds_config.json" // Multi-GPU training
    },
    "infer_args": {
        "repetition_penalty": 1.2,
        "temperature": 0.7,
        "max_length": 512,
        "top_p": 0.8
    },
    "vllm_args": {
        "gpu_memory_utilization": 0.9
    },
    "test_model_args": {
        "test_data_path": "dataset/eval/test_data-en.json"
    }
}


================================================
FILE: pyproject.toml
================================================
[project]
name = "WeClone"
version = "0.3.03"
description = "One-stop solution for creating your digital avatar from chat history"
authors = [{ name = "xming521" }]
readme = "README.md"
requires-python = ">=3.12,<3.13"

dependencies = [
  "pandas",
  "pyjson5",
  "omegaconf",
  "click",
  "tqdm",
  "pydantic==2.10.6",
  "setuptools>=78.1.0",
  "loguru>=0.7.3",
  "langchain",
  "openai==1.87.0",
  "pip"
]

[tool.weclone]
# Configuration file version number. This number should be incremented when the configuration file structure or important default values change.
config_version = "0.3.03"

config_changelog = """
[0.3.00] - 2025-06-30 - Support TG chat logs, add language parameter, add log level parameter.
[0.3.02] - 2025-08-15 - Allow the use of enable_thinking to control offline cleaning.
[0.3.03] - 2025-11-01 - Add chat member relationship switch.
"""

[dependency-groups]
main = [
  "llamafactory==0.9.4",
  "vllm==0.10.0; platform_system == 'Linux'",
  "torch==2.7.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'",
  "torchvision==0.22.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'",
  "torchaudio==2.7.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'",
  "torchdata>=0.10.0; platform_system == 'Linux' or platform_system == 'Windows'",
  "transformers==4.53.2",
  "accelerate==1.7.0",
  "triton==3.3.1; platform_system == 'Linux'",
  "presidio_analyzer[transformers]",
  "presidio_anonymizer",
]
sparktts = [
  "einops>=0.8.1",
  "einx>=0.3.0",
  "numpy==1.26.4",
  "omegaconf>=2.3.0",
  "packaging>=24.2",
  "safetensors>=0.5.2",
  "soundfile>=0.12.1",
  "soxr>=0.5.0.post1",
  "torchaudio>=2.6.0",
  "tqdm>=4.66.5",
]

dev = ["pytest", "pytest-order", "pyright", "ruff", "pre-commit"]

[project.scripts]
weclone-cli = "weclone.cli:cli"

[tool.uv]

[tool.uv.pip]
torch-backend = "auto"

[tool.uv.sources]
torch = [
  { index = "pytorch-cu126", marker = "platform_system == 'Windows'" },
  { index = "pytorch-cu126", marker = "platform_system == 'Linux'" },
]
torchaudio = [
  { index = "pytorch-cu126", marker = "platform_system == 'Windows'" },
  { index = "pytorch-cu126", marker = "platform_system == 'Linux'" },
]
torchvision = [
  { index = "pytorch-cu126", marker = "platform_system == 'Windows'" },
  { index = "pytorch-cu126", marker = "platform_system == 'Linux'" },
]
triton = [
  { index = "pytorch-cu126", marker = "platform_system == 'Windows'" },
  { index = "pytorch-cu126", marker = "platform_system == 'Linux'" },
]
torchdata = [
  { index = "pytorch-cu126", marker = "platform_system == 'Windows'" },
  { index = "pytorch-cu126", marker = "platform_system == 'Linux'" },
]

[[tool.uv.index]]
name = "pytorch-cu126"
url = "https://download.pytorch.org/whl/cu126"
explicit = true

[tool.setuptools.packages.find]
where = ["."]                      
include = ["weclone*"]             
exclude = ["*tests*", "*archive*"]


[tool.pyright]
typeCheckingMode = "basic"
include = ["weclone/data"]
exclude = ["**/archive", "**/tests"]
ignore = ["**/archive"]

reportMissingImports = "error"
reportMissingTypeStubs = false

pythonVersion = "3.12"
pythonPlatform = "Linux"

[tool.ruff]
exclude = [
  "**/archive",
  "**/tests",
  "weclone-audio/src/server未完工",
  "weclone-audio/src/Spark-TTS",
]
line-length = 110

lint.ignore = ["F403", "F405", "E501", "E402"]
lint.select = [
  "F",     # Pyflakes
  "W",     # pycodestyle warnings
  "E",     # pycodestyle errors
  "ASYNC", # flake8-async
  "C4",    # flake8-comprehensions
  "Q",     # flake8-quotes
]
target-version = "py312"

[tool.pytest.ini_options]
addopts = "-x -v -s --tb=short"


================================================
FILE: settings.template.jsonc
================================================
{
    "version": "0.3.01",
    "common_args": {
        "model_name_or_path": "./models/Qwen2.5-7B-Instruct",
        "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir
        "template": "qwen",
        "default_system": "请你扮演一名人类，不要说自己是人工智能",
        "media_dir": "dataset/media",
        "finetuning_type": "lora",
        "enable_thinking": false,
        "trust_remote_code": true
    },
    "cli_args": {
        "full_log": false,
        "log_level": "INFO"
    },
    "make_dataset_args": {
        //数据处理配置
        "platform": "chat", //chat,telegram
        "language": "zh", // 聊天常用语言: zh(中文) 或 en(英文)
        "telegram_args": {
            "my_id": "user1234567890"
        },
        "include_type": [
            "text"
        ],
        "blocked_words": [ // 禁用词
            "例如 姓名",
            "例如 密码",
            "//....."
        ],
        "add_time": false,
        "add_relation": false,
        "single_combine_strategy": "time_window", // 单人组成单句策略
        "qa_match_strategy": "time_window", // 组成qa策略
        "single_combine_time_window": 2, // 单人组成单句时间窗口（分钟）,
        "qa_match_time_window": 5, // 组成qa时间窗口（分钟）,
        "combine_msg_max_length": 2048, // 组合后消息最大长度 配合cutoff_len 使用
        "messages_max_length": 2048, // messages最长字符数量 配合cutoff_len 使用
        "clean_dataset": {
            "enable_clean": false,
            "clean_strategy": "llm",
            "llm": {
                "accept_score": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练
                "enable_thinking": true
            }
        },
        "online_llm_clear": false,
        "base_url": "https://xxx/v1",
        "llm_api_key": "xxxxx",
        "model_name": "xxx", //建议使用参数较大的模型，例如DeepSeek-V3
        "clean_batch_size": 50,
        "vision_api": {
            "enable": false, // 设置为 true 来开启此功能
            "api_key": "xxx",
            "api_url": "https://xxx/v1", // 兼容OpenAI的API地址
            "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max
            "max_workers": 5 // 并行调用API的线程数，最多不要超过8
        }
    },
    "train_sft_args": {
        //微调配置
        "stage": "sft",
        "dataset": "chat-sft",
        "dataset_dir": "./dataset/res_csv/sft",
        "use_fast_tokenizer": true,
        "lora_target": "q_proj,v_proj",
        "lora_rank": 8,
        "lora_dropout": 0.25,
        "weight_decay": 0.1,
        "overwrite_cache": true,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 16,
        "lr_scheduler_type": "cosine",
        "cutoff_len": 2048,
        "logging_steps": 10,
        "save_steps": 100,
        "learning_rate": 1e-4,
        "warmup_ratio": 0.1,
        "num_train_epochs": 2,
        "plot_loss": true,
        "fp16": true,
        "flash_attn": "fa2",
        // "deepspeed": "ds_config.json" //多卡训练
    },
    "infer_args": {
        "repetition_penalty": 1.2,
        "temperature": 0.5,
        "max_length": 256,
        "top_p": 0.65
    },
    "vllm_args": {
        "gpu_memory_utilization": 0.9,
        // "data_parallel_size": 2,
        // "quantization": "bitsandbytes", 
        // "load_format": "bitsandbytes"
    },
    "test_model_args": {
        "test_data_path": "dataset/eval/test_data-zh.json"
    }
}


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/configs/Qwen2.5-VL.jsonc
================================================
{
    "version": "0.2.22",
    "common_args": {
        "model_name_or_path": "./models/Qwen2.5-VL-3B-Instruct",
        "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir
        "template": "qwen2_vl",
        "default_system": "请你扮演一名人类，不要说自己是人工智能",
        "finetuning_type": "lora",
        "media_dir": "dataset/media",
        "image_max_pixels": 209920, //720P
        "enable_thinking": false,
        "trust_remote_code": true
    },
    "cli_args": {
        "full_log": false
    },
    "make_dataset_args": {
        //数据处理配置
        "platform": "chat",
        "include_type": [
            "text",
            "image"
        ],
        "blocked_words": [
            "1234567890",
            "hh"
        ],
        "language": "en",
        "add_relation": true,
        "add_time": true,
        "max_image_num": 2, // 单条数据最大图片数量
        "single_combine_strategy": "time_window", // 单人组成单句策略
        "qa_match_strategy": "time_window", // 组成qa策略
        "single_combine_time_window": 2, // 单人组成单句时间窗口（分钟）,
        "qa_match_time_window": 5, // 组成qa时间窗口（分钟）,
        "combine_msg_max_length": 256, // 组合后消息最大长度 配合cutoff_len 使用
        "clean_dataset": {
            "enable_clean": true,
            "clean_strategy": "llm",
            "llm": {
                "accept_score": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练
            }
        },
        "vision_api": {
            "enable": false, // 设置为 true 来开启此功能
            "api_key": "xxx",
            "api_url": "https://xxx/v1", // 例如阿里云，或替换为其他兼容OpenAI的API地址
            "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max
            "max_workers": 5 // 并行调用API的线程数，最多不要超过8
        }
    },
    "test_model_args": {
        "test_data_path": "tests/tests_data/test_model_data.json"
    },
    "train_sft_args": {
        //微调配置
        "stage": "sft",
        "dataset": "chat-sft",
        "dataset_dir": "./dataset/res_csv/sft",
        "freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。
        "use_fast_tokenizer": true,
        "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2",
        "lora_rank": 2,
        "lora_dropout": 0.3,
        "weight_decay": 0.1,
        "overwrite_cache": true,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 8,
        "lr_scheduler_type": "cosine",
        "cutoff_len": 1024,
        "logging_steps": 5,
        "save_steps": 10,
        "learning_rate": 1e-4,
        "warmup_ratio": 0.1,
        "num_train_epochs": 1,
        "plot_loss": true,
        "fp16": true,
        "flash_attn": "fa2",
        // "deepspeed": "ds_config.json" //多卡训练
    },
    "infer_args": {
        "repetition_penalty": 1.2,
        "temperature": 0.5,
        "max_length": 50,
        "top_p": 0.65
    }
}


================================================
FILE: tests/configs/qwen2.5.jsonc
================================================
{
    "version": "0.2.22",
    "common_args": {
        "model_name_or_path": "./models/Qwen2.5-0.5B",
        "adapter_name_or_path": "./model_output", //同时做为train_sft_args的output_dir
        "template": "qwen",
        "default_system": "请你扮演一名人类，不要说自己是人工智能",
        "finetuning_type": "lora",
        "media_dir": "dataset/media",
        "image_max_pixels": 209920, //720P
        "enable_thinking": false,
        "trust_remote_code": true
    },
    "cli_args": {
        "full_log": false
    },
    "make_dataset_args": {
        //数据处理配置
        "platform": "chat",
        "include_type": [
            "text",
            // "image"
        ],
        "blocked_words": [
            "1234567890",
            "hh"
        ],
        "language": "zh",
        "add_relation": true,
        "add_time": true,
        "max_image_num": 2, // 单条数据最大图片数量
        "single_combine_strategy": "time_window", // 单人组成单句策略
        "qa_match_strategy": "time_window", // 组成qa策略
        "single_combine_time_window": 2, // 单人组成单句时间窗口（分钟）,
        "qa_match_time_window": 5, // 组成qa时间窗口（分钟）,
        "combine_msg_max_length": 256, // 组合后消息最大长度 配合cutoff_len 使用
        "clean_dataset": {
            "enable_clean": true,
            "clean_strategy": "llm",
            "llm": {
                "accept_score": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练
                "enable_thinking": true
            }
        },
        "vision_api": {
            "enable": false, // 设置为 true 来开启此功能
            "api_key": "xxx",
            "api_url": "https://xxx/v1", // 例如阿里云，或替换为其他兼容OpenAI的API地址
            "model_name": "xxx", // 要使用的多模态模型名称,例如qwen-vl-max
            "max_workers": 5 // 并行调用API的线程数，最多不要超过8
        }
    },
    "test_model_args": {
        "test_data_path": "tests/tests_data/test_model_data.json"
    },
    "train_sft_args": {
        //微调配置
        "stage": "sft",
        "dataset": "chat-sft",
        "dataset_dir": "./dataset/res_csv/sft",
        "freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。
        "use_fast_tokenizer": true,
        "lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2",
        "lora_rank": 2,
        "lora_dropout": 0.3,
        "weight_decay": 0.1,
        "overwrite_cache": true,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 8,
        "lr_scheduler_type": "cosine",
        "cutoff_len": 1024,
        "logging_steps": 5,
        "save_steps": 10,
        "learning_rate": 1e-4,
        "warmup_ratio": 0.1,
        "num_train_epochs": 1,
        "plot_loss": true,
        "fp16": true,
        "flash_attn": "fa2",
        // "deepspeed": "ds_config.json" //多卡训练
    },
    "infer_args": {
        "repetition_penalty": 1.2,
        "temperature": 0.5,
        "max_length": 50,
        "top_p": 0.65
    }
}


================================================
FILE: tests/test_PII.py
================================================
import os
import shutil
import subprocess
import sys
from typing import cast

import pytest

# Import common functions from test_full_pipe
from tests.test_full_pipe import (
    DATASET_CSV_DIR,
    PROJECT_ROOT_DIR,
    get_config_files,
    load_config_with_path,
    print_test_header,
    run_cli_command,
    setup_data_environment,
    test_logger,
)
from weclone.utils.config import load_config
from weclone.utils.config_models import DataModality, WCMakeDatasetConfig
from weclone.utils.log import logger

# Put the repository root (the parent of tests/) at the front of sys.path.
# NOTE(review): this statement runs after the `tests.*` / `weclone.*` imports above,
# so it cannot help those imports resolve — confirm whether it should move above them.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Setup paths
# Directory containing this test module.
TESTS_DIR = os.path.dirname(__file__)
# Location of the PII fixture data: tests/tests_data/test_PII.
TEST_DATA_PII_DIR = os.path.join(TESTS_DIR, "tests_data", "test_PII")

@pytest.mark.parametrize("config_file", get_config_files())
def test_PII_make_dataset(config_file):
    """Run ``make-dataset`` on the ``test_PII`` fixtures and dump the resulting user messages.

    The test stages the ``test_PII`` data folder, runs the ``make-dataset`` CLI
    command with ``config_file`` and asserts that it exits with status 0. If the
    generated ``sft-my.json`` exists, the content of every ``user`` message is
    logged at WARNING level so it can be inspected by hand.

    Args:
        config_file: Path of the configuration file under test (one per parametrized case).
    """
    print_test_header("PII make-dataset", config_file)

    setup_data_environment("test_PII")

    # The loaded section is not used further down; the call is kept so that the
    # configuration is still loaded exactly as before.
    config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config_with_path(config_file, "make_dataset"))

    # Run the CLI step under test
    cli_result = run_cli_command(["make-dataset"], config_file)
    assert cli_result.returncode == 0, f"make-dataset command execution failed for config {config_file}"

    # Dump every user message of the generated dataset, framed by a PII warning
    import json
    sft_file_path = os.path.join(PROJECT_ROOT_DIR, "dataset", "res_csv", "sft", "sft-my.json")
    if os.path.exists(sft_file_path):
        separator = "=" * 80
        logger.warning("⚠️  WARNING: The following content contains unfiltered PII (Personally Identifiable Information):")
        logger.warning(separator)

        with open(sft_file_path, 'r', encoding='utf-8') as sft_file:
            dataset = json.load(sft_file)

        for entry in dataset:
            if 'messages' not in entry:
                continue
            for message in entry['messages']:
                if message.get('role') != 'user':
                    continue
                logger.warning(f"User content: {message.get('content', '')}")

        logger.warning(separator)
        logger.warning("⚠️  END OF UNFILTERED PII CONTENT")

    test_logger.info(f"✅ PII make-dataset test passed for config {config_file}")

if __name__ == "__main__":
    # Direct execution (without pytest): run the PII test once for every config file.
    for cfg_path in get_config_files():
        test_PII_make_dataset(cfg_path)


================================================
FILE: tests/test_full_pipe.py
================================================
import functools
import os
import shutil
import subprocess
import sys
import time
from typing import Callable, Optional, Union, cast
from unittest import mock

import pytest

from weclone.utils.config import load_config
from weclone.utils.config_models import DataModality, WCMakeDatasetConfig
from weclone.utils.log import logger

# Put the repository root (the parent of tests/) at the front of sys.path.
# NOTE(review): this runs after the `weclone.*` imports above, so it cannot help
# those imports resolve — confirm whether it is still needed at this position.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
# Absolute path of the repository root. PROJECT_ROOT below is computed with the
# same expression, so both names hold the same value.
PROJECT_ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# <repo>/dataset/csv — the directory setup_data_environment() fills with fixture data.
DATASET_CSV_DIR = os.path.join(PROJECT_ROOT, "dataset", "csv")
# Directory containing this test module.
TESTS_DIR = os.path.dirname(__file__)
# Default fixture folder: tests/tests_data/test_person.
TEST_DATA_PERSON_DIR = os.path.join(TESTS_DIR, "tests_data", "test_person")


# Backup directories
# Pre-existing model_output and dataset/csv directories are moved under
# <repo>/test_backup during setup and moved back during teardown.
BACKUP_DIR = os.path.join(PROJECT_ROOT, "test_backup")
MODEL_OUTPUT_BACKUP = os.path.join(BACKUP_DIR, "model_output")
DATASET_CSV_BACKUP = os.path.join(BACKUP_DIR, "dataset_csv")

# Logger used for test progress output: message text only, bold yellow, INFO and
# above, written to stderr.
# NOTE(review): remove() is called without a handler id, which drops every
# registered handler; presumably bind() shares handlers with the imported
# `logger`, in which case that logger's previous sinks are removed too — confirm
# this is intended.
test_logger = logger.bind()
test_logger.remove()
test_logger.add(
    sys.stderr,
    format="<yellow><b>{message}</b></yellow>",
    colorize=True,
    level="INFO",
)

def get_config_files() -> list[str]:
    """Return the test configuration files found in the ``configs`` directory.

    The ``configs`` directory next to this module is scanned for entries whose
    name ends with ``.jsonc``.

    Returns:
        Paths of the form ``tests/configs/<file name>``, sorted by file name so
        that parametrized tests are generated in the same order on every run.

    Raises:
        FileNotFoundError: If the ``configs`` directory does not exist.
    """
    configs_dir = os.path.join(os.path.dirname(__file__), "configs")
    # os.listdir() yields entries in arbitrary order; sort them to keep the
    # order of the parametrized test cases reproducible across machines.
    return [
        f"tests/configs/{file_name}"
        for file_name in sorted(os.listdir(configs_dir))
        if file_name.endswith('.jsonc')
    ]

def print_test_header(test_name: str, config_file: str = ""):
    """Log a 100-column banner announcing a test phase.

    The banner is a rule line, the title padded with spaces on both sides to
    the banner width, and a closing rule line.

    Args:
        test_name: Name of the phase shown in the title.
        config_file: Optional config path; when non-empty its base name is
            appended to the title.
    """
    width = 100
    rule = "─" * width
    test_logger.info("\n" + rule)

    title = f"  Testing Phase: {test_name}  "
    if config_file:
        title = f"  Testing Phase: {test_name} | Config: {os.path.basename(config_file)}  "

    # Split the free space around the title; an odd remainder goes to the right.
    slack = width - len(title)
    left_pad = slack // 2
    right_pad = slack - left_pad
    test_logger.info(f"{' ' * left_pad}{title}{' ' * right_pad}")
    test_logger.info(rule)

def print_config_header(config_file: str):
    """Log a 120-column banner marking the start of the tests for one config file.

    Args:
        config_file: Path of the config file; only its base name is shown.
    """
    width = 120
    rule = "═" * width
    test_logger.info("\n" + rule)

    title = f"  开始测试配置文件: {os.path.basename(config_file)}  "
    # Split the free space around the title; an odd remainder goes to the right.
    slack = width - len(title)
    left_pad = slack // 2
    right_pad = slack - left_pad
    test_logger.info(f"{' ' * left_pad}{title}{' ' * right_pad}")
    test_logger.info(rule)

def setup_data_environment(data_folder_name: str = "test_person"):
    """Stage the fixtures from ``tests/tests_data/<data_folder_name>`` into ``dataset/csv``.

    Any pre-existing ``model_output`` and ``dataset/csv`` directories are moved
    into ``BACKUP_DIR`` so that they can be put back once the tests are done.

    The function may be called several times before the environment is
    restored (for example once per config file). A backup that already exists
    is never overwritten: whatever sits in the live location on a later call is
    treated as leftover test output and deleted. Previously every call wiped
    ``BACKUP_DIR`` first, so a second call destroyed the backup of the original
    data taken by the first one.

    Args:
        data_folder_name: Sub-folder of ``tests/tests_data`` whose regular files
            are copied into ``dataset/csv/<data_folder_name>``. Sub-directories
            of the fixture folder are not copied.
    """
    test_logger.info(f"🔧 设置 {data_folder_name} 测试数据...")

    # Keep an existing backup directory: it holds the originals stashed by an
    # earlier call (or by an interrupted run) and must survive until restore.
    os.makedirs(BACKUP_DIR, exist_ok=True)

    # (live location, backup location, message logged after a successful backup).
    # "model_output" is resolved against the current working directory, exactly
    # like in the restore code.
    backup_plan = (
        ("model_output", MODEL_OUTPUT_BACKUP, "已备份 model_output 目录"),
        (DATASET_CSV_DIR, DATASET_CSV_BACKUP, "已备份 dataset/csv 目录"),
    )
    for live_path, backup_path, backed_up_message in backup_plan:
        if not os.path.exists(live_path):
            continue
        if os.path.exists(backup_path):
            # The original is already backed up, so the live copy is output of a
            # previous test setup; drop it to start from a clean state.
            shutil.rmtree(live_path)
        else:
            shutil.move(live_path, backup_path)
            test_logger.info(backed_up_message)

    os.makedirs(DATASET_CSV_DIR)

    # Copy the fixture files into dataset/csv/<data_folder_name>
    test_data_source_dir = os.path.join(TESTS_DIR, "tests_data", data_folder_name)
    test_data_csv_dir = os.path.join(DATASET_CSV_DIR, data_folder_name)
    os.makedirs(test_data_csv_dir)

    for item_name in os.listdir(test_data_source_dir):
        source_item_path = os.path.join(test_data_source_dir, item_name)
        if os.path.isfile(source_item_path):
            shutil.copy2(source_item_path, os.path.join(test_data_csv_dir, item_name))

    test_logger.info(f"✅ {data_folder_name} 测试数据设置完成")

@pytest.fixture(scope="session", autouse=True)
def setup_test_environment():
    """Session-wide fixture: stage the ``test_person`` data, then restore the workspace.

    Everything before ``yield`` runs once before the tests; everything after it
    removes the directories produced by the tests, moves the backed-up
    ``model_output`` and ``dataset/csv`` directories back and deletes the backup
    directory.
    """
    test_logger.info("🔧 开始设置测试环境...")
    setup_data_environment("test_person")
    test_logger.info("✅ 测试环境设置完成")

    yield  # the tests of the session run here

    test_logger.info("🧹 开始恢复测试环境...")

    # Remove what the tests produced
    for generated_path in ("model_output", DATASET_CSV_DIR):
        if os.path.exists(generated_path):
            shutil.rmtree(generated_path)

    # Move the backed-up originals back to where they were
    restore_plan = (
        (MODEL_OUTPUT_BACKUP, "model_output"),
        (DATASET_CSV_BACKUP, DATASET_CSV_DIR),
    )
    for backup_path, original_path in restore_plan:
        if os.path.exists(backup_path):
            shutil.move(backup_path, original_path)

    if os.path.exists(BACKUP_DIR):
        shutil.rmtree(BACKUP_DIR)

    test_logger.info("✅ 测试环境恢复完成")


def restore_test_env():
    """Manual environment cleanup for direct execution (deprecated for pytest).

    Deletes the directories produced by the tests, moves the backed-up
    ``model_output`` and ``dataset/csv`` directories back into place (logging
    each restore) and finally removes the backup directory.
    """
    test_logger.info("🧹 手动恢复测试环境...")

    # Remove what the tests produced
    for generated_path in ("model_output", DATASET_CSV_DIR):
        if os.path.exists(generated_path):
            shutil.rmtree(generated_path)

    # Move the originals back if they were backed up
    restore_plan = (
        (MODEL_OUTPUT_BACKUP, "model_output", "已恢复 model_output 目录"),
        (DATASET_CSV_BACKUP, DATASET_CSV_DIR, "已恢复 dataset/csv 目录"),
    )
    for backup_path, original_path, restored_message in restore_plan:
        if os.path.exists(backup_path):
            shutil.move(backup_path, original_path)
            test_logger.info(restored_message)

    # The backup directory is no longer needed
    if os.path.exists(BACKUP_DIR):
        shutil.rmtree(BACKUP_DIR)
        test_logger.info("已清理备份目录")

    test_logger.info("✅ 测试环境恢复完成")

def run_cli_command(command: list[str], config_path: str, timeout: int | None = None, background: bool = False) -> Union[subprocess.CompletedProcess, subprocess.Popen]:
    """Run ``python -m weclone.cli <command...>`` from the project root.

    The child process receives a copy of the current environment with
    ``WECLONE_CONFIG_PATH`` set to ``config_path``. Its stdout/stderr are not
    captured.

    Args:
        command: CLI arguments appended after ``-m weclone.cli``.
        config_path: Value exported as ``WECLONE_CONFIG_PATH`` for the child.
        timeout: Timeout in seconds; only applied to foreground runs.
        background: When True the process is started without waiting for it to
            finish.

    Returns:
        A ``Popen`` object when ``background`` is True (returned after a two
        second pause), otherwise the ``CompletedProcess`` of the finished run.
    """
    child_env = os.environ.copy()
    child_env["WECLONE_CONFIG_PATH"] = config_path

    argv = [sys.executable, "-m", "weclone.cli", *command]
    # Options shared by the foreground and the background invocation
    shared_options = {
        "stderr": None,
        "stdout": None,
        "text": True,
        "cwd": PROJECT_ROOT_DIR,  # execute in the project root directory
        "env": child_env,
    }

    if not background:
        return subprocess.run(argv, timeout=timeout, **shared_options)

    process = subprocess.Popen(argv, **shared_options)
    time.sleep(2)
    return process

def load_config_with_path(config_file: str, config_section: str):
    """Load a config section while ``WECLONE_CONFIG_PATH`` temporarily points at ``config_file``.

    Args:
        config_file: Path stored in ``WECLONE_CONFIG_PATH`` for the duration of the call.
        config_section: Section name passed to ``load_config``.

    Returns:
        Whatever ``load_config(config_section)`` returns.
    """
    env_key = "WECLONE_CONFIG_PATH"
    previous_value = os.environ.get(env_key)
    os.environ[env_key] = config_file

    try:
        return load_config(config_section)
    finally:
        # Put the environment variable back into its original state
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value

def run_make_dataset_test(config_file: str):
    """Run the ``make-dataset`` command and validate the generated SFT file.

    Steps:
      1. When the config includes the image modality, copy the fixture images
         into ``<media_dir>/images``.
      2. Run ``make-dataset`` and require a zero exit code.
      3. Require that the blocked word ``hh`` is absent from the output file.
      4. For the ``Qwen2.5-VL.jsonc`` config, require exactly three ``<image>`` tags.

    Args:
        config_file: Path of the configuration file under test.
    """
    print_test_header("make-dataset", config_file)

    config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config_with_path(config_file, "make_dataset"))
    if DataModality.IMAGE in config.include_type:
        # Copy the fixture images into media_dir/images.
        fixture_dir = os.path.join(PROJECT_ROOT_DIR, "tests", "tests_data", "images")
        target_dir = os.path.join(config.media_dir, "images")
        os.makedirs(config.media_dir, exist_ok=True)
        os.makedirs(target_dir, exist_ok=True)
        for image_name in os.listdir(fixture_dir):
            shutil.copy(os.path.join(fixture_dir, image_name), os.path.join(target_dir, image_name))

    result = run_cli_command(["make-dataset"], config_file)
    assert result.returncode == 0, f"make-dataset command execution failed for config {config_file}"

    # Check if blocked_words filtering is working correctly
    sft_file_path = os.path.join(PROJECT_ROOT_DIR, "dataset", "res_csv", "sft", "sft-my.json")
    with open(sft_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    assert "hh" not in content, f"blocked_words filtering failed for config {config_file}: found 'hh' in {sft_file_path}"
    test_logger.info(f"✅ blocked_words filtering check passed for config {config_file}")

    # The <image> tag count is only checked for the Qwen2.5-VL.jsonc config.
    if "Qwen2.5-VL.jsonc" not in config_file:
        return

    image_count = content.count("<image>")
    assert image_count == 3, f"Expected 3 <image> tags in {sft_file_path} for config {config_file}, but found {image_count}"
    test_logger.info(f"✅ <image> tags count check passed for config {config_file}: found {image_count} <image> tags")

    
def run_train_sft_test(config_file: str):
    """Run the ``train-sft`` command and require a zero exit code.

    A ``subprocess.TimeoutExpired`` is tolerated (it is only logged); note that
    this call passes no timeout to ``run_cli_command``. Any other exception,
    including a failed exit-code assertion, is reported through ``pytest.fail``.

    Args:
        config_file: Path of the configuration file under test.
    """
    print_test_header("train-sft", config_file)

    try:
        outcome = run_cli_command(["train-sft"], config_file)
        exited_cleanly = outcome.returncode == 0
        assert exited_cleanly, f"train-sft command failed or did not fail fast as expected for config {config_file}"
    except subprocess.TimeoutExpired:
        # Timing out is treated as proof that training started; not a failure.
        test_logger.info(f"train-sft command terminated due to timeout for config {config_file}, which is acceptable in testing, indicating the command has started execution.")
    except Exception as exc:
        pytest.fail(f"An unexpected error occurred during train-sft command execution for config {config_file}: {exc}")

def run_webchat_demo_test(config_file: str):
    """Run the ``webchat-demo`` command with a 20-second timeout.

    Hitting the timeout is accepted silently; if the command finishes within
    the timeout, its exit code must be zero.

    Args:
        config_file: Path of the configuration file under test.
    """
    print_test_header("webchat-demo", config_file)

    try:
        outcome = run_cli_command(["webchat-demo"], config_file, timeout=20)
    except subprocess.TimeoutExpired:
        # The command was still running when the timeout expired; accepted.
        return

    assert outcome.returncode == 0, f"webchat-demo command execution failed for config {config_file}"

def run_server_test(config_file: str) -> subprocess.Popen:
    """Start the ``server`` command in the background and return its process.

    After launching, waits 20 seconds and asserts that the process has not
    exited in the meantime.

    Args:
        config_file: Path of the configuration file under test.

    Returns:
        The ``Popen`` object of the still-running server.
    """
    print_test_header("server (background)", config_file)

    launched = run_cli_command(["server"], config_file, background=True)
    server_process = cast(subprocess.Popen, launched)

    test_logger.info("等待服务器启动，20秒后检查状态...")
    time.sleep(20)

    # poll() returns None only while the process is still alive.
    still_running = server_process.poll() is None
    assert still_running, f"Server startup failed for config {config_file}"

    test_logger.info(f"使用配置 {config_file} 的服务器已在后台启动")
    return server_process

def run_test_model_test(config_file: str, server_process: subprocess.Popen):
    """Run the ``test-model`` command, then shut down the background server.

    The server is stopped in a ``finally`` block so it is cleaned up whether
    or not the command succeeds.

    Args:
        config_file: Path of the configuration file under test.
        server_process: Background server process to stop afterwards; may be
            None or already exited, in which case nothing is done.
    """
    print_test_header("test-model", config_file)
    try:
        result = run_cli_command(["test-model"], config_file)
        assert result.returncode == 0, f"test-model command execution failed for config {config_file}"
    finally:
        if server_process is not None and server_process.poll() is None:
            test_logger.info(f"测试完成，正在关闭使用配置 {config_file} 的服务器...")
            server_process.terminate()
            try:
                server_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # wait(timeout=...) raises instead of returning when the process
                # is still alive, so the forced kill has to live in this handler.
                server_process.kill()
                server_process.wait()  # Reap the killed process
            test_logger.info("服务器已关闭")

def clean_model_output():
    """Delete the ``model_output`` directory (relative to the current working directory) if it exists."""
    output_dir = "model_output"
    if not os.path.exists(output_dir):
        return
    shutil.rmtree(output_dir)

@pytest.mark.parametrize("config_file", get_config_files())
def test_full_pipeline_for_config(config_file):
    """Run every pipeline step, in order, for a single configuration file.

    Steps: make-dataset, train-sft, webchat-demo, server (background) and
    test-model. Any background server that is still running when the test
    ends is shut down, whether the test passed or failed.

    Args:
        config_file: Path of the configuration file under test.
    """
    print_config_header(config_file)

    clean_model_output()

    server_process = None
    try:
        # Run all test steps in order
        run_make_dataset_test(config_file)
        run_train_sft_test(config_file)
        run_webchat_demo_test(config_file)
        server_process = run_server_test(config_file)
        run_test_model_test(config_file, server_process)

        test_logger.info(f"✅ 配置文件 {os.path.basename(config_file)} 的所有测试已完成")

    except Exception as e:
        test_logger.error(f"❌ 配置文件 {os.path.basename(config_file)} 测试失败: {e}")
        raise
    finally:
        # Safety net: also runs for failures that are not Exception subclasses.
        # It is a no-op when the server was never started or has already exited.
        if server_process is not None and server_process.poll() is None:
            server_process.terminate()
            try:
                server_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # wait(timeout=...) raises when the process is still alive, so
                # the forced kill must happen here rather than after the wait.
                server_process.kill()
                server_process.wait()  # Reap the killed process

if __name__ == "__main__":
    # Running this file directly executes no tests of its own; it only restores
    # the test environment. Add any direct-run code above this call.
    restore_test_env()


================================================
FILE: tests/tests_data/test_PII/test_0_730.csv
================================================
id,MsgSvrID,type_name,is_sender,talker,room_name,msg,src,CreateTime
7,4073926741244663531,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,13812345678,,2024/10/4 11:43
8,4073926741244663532,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
9,706358374822797422,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,zhang.wei@163.com,,2024/10/4 11:43
10,706358374822797423,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
11,2122553892045962801,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,4532123456789012,,2024/10/4 11:43
12,2122553892045962802,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
13,5704142615879617852,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,192.168.1.100,,2024/10/4 11:43
14,5704142615879617853,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
15,1337798072543283708,文本,0,LOCATION,wxid_6789z5qlxzfj22,北京市朝阳区三里屯,,2024/10/4 11:43
16,1337798072543283709,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
17,8192964515963336399,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,GB33BUKB20201555555555,,2024/10/4 11:43
18,8192964515963336400,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
19,7913656383976388488,文本,0,CRYPTO,wxid_6789z5qlxzfj22,1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa,,2024/10/4 11:43
20,7913656383976388489,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
21,1964923183359419454,文本,0,AGE,wxid_6789z5qlxzfj22,25岁,,2024/10/4 11:43
22,1964923183359419455,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
23,2403233409323875303,文本,0,ID,wxid_6789z5qlxzfj22,110101199001011234,,2024/10/4 11:43
24,2403233409323875304,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
25,4630229215952295971,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,021-62345678,,2024/10/4 11:43
26,4630229215952295972,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
27,6547675850931813364,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,li.ming@qq.com,,2024/10/4 11:43
28,6547675850931813365,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
31,8151408074985365130,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,2001:0db8:85a3:0000:0000:8a2e:0370:7334,,2024/10/4 11:43
32,8151408074985365131,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43
33,9876543210123456789,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,+1-125-123-4567,,2024/10/4 11:44
34,9876543210123456790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
35,1234567890987654321,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,john.doe@gmail.com,,2024/10/4 11:44
36,1234567890987654322,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
37,5555444433332222111,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,4111111111111111,,2024/10/4 11:44
38,5555444433332222112,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
39,7777888899990000123,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,203.0.113.1,,2024/10/4 11:44
40,7777888899990000124,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
41,3333222211110000456,文本,0,LOCATION,wxid_6789z5qlxzfj22,1600 Pennsylvania Avenue NW Washington DC,,2024/10/4 11:44
42,3333222211110000457,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
43,9999000011112222789,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,DE89370400440532013000,,2024/10/4 11:44
44,9999000011112222790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
45,1111333355557777012,文本,0,CRYPTO,wxid_6789z5qlxzfj22,bc1qxy2kgdygjrsqtzq2n0yrf2493p83kkfjhx0wlh,,2024/10/4 11:44
46,1111333355557777013,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
47,4444666688880000345,文本,0,AGE,wxid_6789z5qlxzfj22,32 years old,,2024/10/4 11:44
48,4444666688880000346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
49,2222555577779999678,文本,0,US_SSN,wxid_6789z5qlxzfj22,078-05-1120,,2024/10/4 11:44
50,2222555577779999679,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
51,6666111133335555901,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,+44-20-7946-0958,,2024/10/4 11:44
52,6666111133335555902,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
53,8888222244446666234,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,sarah.johnson@outlook.com,,2024/10/4 11:44
54,8888222244446666235,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
55,0000444466668888567,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,5555555555554444,,2024/10/4 11:44
56,0000444466668888568,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
57,3333777799991111890,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,172.16.254.1,,2024/10/4 11:44
58,3333777799991111891,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
59,5555999911113333123,文本,0,LOCATION,wxid_6789z5qlxzfj22,10 Downing Street London SW1A 2AA UK,,2024/10/4 11:44
60,5555999911113333124,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
61,7777000022224444456,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,FR1420041010050500013M02606,,2024/10/4 11:44
62,7777000022224444457,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
63,9999222244446666789,文本,0,CRYPTO,wxid_6789z5qlxzfj22,3QJmV3qfvL9SuYo34YihAf3sRCW3qSinyC,,2024/10/4 11:44
64,9999222244446666790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
65,1111555577779999012,文本,0,AGE,wxid_6789z5qlxzfj22,28岁,,2024/10/4 11:44
66,1111555577779999013,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
67,4444888800002222345,文本,0,ID,wxid_6789z5qlxzfj22,AB123456C,,2024/10/4 11:44
68,4444888800002222346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44
69,4444888800002222345,文本,0,666,wxid_6789z5qlxzfj22,404,,2024/10/4 11:44
70,4444888800002222346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44


================================================
FILE: tests/tests_data/test_model_data.json
================================================
{
    "questions": [
        [
            "吃了吗？",
            "吃的什么啊",
            "好吃吗",
            "多少钱啊",
            "可以请我吃吗"
        ],
        [
            "干嘛呢？",
            "等会准备干什么去"
        ],
        [
            "最近有什么新鲜事发生吗？",
            "有没有什么有趣的故事可以分享？"
        ],
        [
            "周末过得怎么样？",
            "做了什么好玩的？"
        ],
        [
            "今天天气怎么样？",
            "你那里呢？"
        ],
        [
            "最近工作/学习顺利吗？",
            "有没有遇到什么挑战？"
        ]
    ]
}


================================================
FILE: tests/tests_data/test_person/test_0_730.csv
================================================
id,MsgSvrID,type_name,is_sender,talker,room_name,msg,src,CreateTime
1,7437267147299592543,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-10\01c177d8ad90af8969ba048455b54eef.dat,2024/10/4 11:42
2,637529293739295664,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-10\d8a8936ca622823452e45b5e180a53a6.dat,2024/10/4 11:42
7,4073926741244663531,文本,1,12345iru2zsmo22,test_person,小马尔代夫,,2024/10/4 11:43
8,706358374822797422,文本,1,12345iru2zsmo22,test_person,名不虚传,,2024/10/4 11:43
9,2122553892045962801,文本,0,test_person,test_person,我去 好可爱啊,,2024/10/4 11:43
10,5704142615879617852,文本,0,test_person,test_person,2.0156416,,2024/10/4 11:43
11,1337798072543283708,文本,0,test_person,test_person,,,2024/10/4 11:43
12,8192964515963336399,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 11:43
13,7913656383976388488,文本,1,12345iru2zsmo22,test_person,不是,,2024/10/4 11:43
14,1964923183359419454,文本,1,12345iru2zsmo22,test_person,你过来就得老久,,2024/10/4 11:43
15,2403233409323875303,文本,0,test_person,test_person,我在南站,,2024/10/4 11:43
16,4630229215952295971,文本,0,test_person,test_person,我知道我学校离养马岛12km,,2024/10/4 11:43
17,6547675850931813364,文本,0,test_person,test_person,[旺柴],,2024/10/4 11:43
18,1900866115792249247,文本,1,12345iru2zsmo22,test_person,牟平站,,2024/10/4 11:43
19,8151408074985365130,文本,1,12345iru2zsmo22,test_person,近,,2024/10/4 11:43
20,2421069219348160202,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 11:43
21,8751079973209533492,文本,0,test_person,test_person,我去接朋友 他12点到,,2024/10/4 11:44
22,1133854364527684495,动画表情,1,12345iru2zsmo22,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=b3ffed81b71099d903628c5877bd0792&filekey=30440201010430302e02016e04025348042062336666656438316237313039396439303336323863353837376264303739320203030f78040d00000004627466730000000132&hy=SH&storeid=264ffed170006ff2f4bd405e70000006e01004fb153480ff458e0b6a8ce9e7&ef=1&bizid=1022,2024/10/4 11:45
23,3003440481974462293,文本,1,12345iru2zsmo22,test_person,一下午应该也行,,2024/10/4 11:45
24,3403121757406614004,文本,0,test_person,test_person,先吃饭然后忙完估计三点 过去就四点,,2024/10/4 11:46
25,6917846734389470451,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=f6ae48635cfc57931cbdcd4453230c36&filekey=3043020101042f302d02016e0402535a0420663661653438363335636663353739333163626463643434353332333063333602027039040d00000004627466730000000132&hy=SZ&storeid=266e7c5b40007eea8169bd3e30000006e01004fb1535a2836fbc1e6d7f6305&ef=1&bizid=1022,2024/10/4 11:46
26,3853926419342399869,文本,0,test_person,test_person,蒜啦,,2024/10/4 11:46
27,689214144441695718,文本,1,12345iru2zsmo22,test_person,啊哈哈,,2024/10/4 11:46
28,501703563680542858,文本,1,12345iru2zsmo22,test_person,那算了,,2024/10/4 11:46
29,5175776596048859341,文本,1,12345iru2zsmo22,test_person,可以明天来,,2024/10/4 11:46
30,1468168499470203020,文本,0,test_person,test_person,希望明天天气好,,2024/10/4 11:46
31,8704117912978418734,文本,0,test_person,test_person,你租车了没,,2024/10/4 11:46
32,3720367917725174786,文本,1,12345iru2zsmo22,test_person,hh,,2024/10/4 11:46
33,5706726594668713894,文本,1,12345iru2zsmo22,test_person,租了,,2024/10/4 11:46
34,6749208560602575120,文本,1,12345iru2zsmo22,test_person,150一天,,2024/10/4 11:47
35,6547279599090331225,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=f6ae48635cfc57931cbdcd4453230c36&filekey=3043020101042f302d02016e0402535a0420663661653438363335636663353739333163626463643434353332333063333602027039040d00000004627466730000000132&hy=SZ&storeid=266e7c5b40007eea8169bd3e30000006e01004fb1535a2836fbc1e6d7f6305&ef=1&bizid=1022,2024/10/4 11:47
36,783513142739644929,文本,1,12345iru2zsmo22,test_person,我本来只用半天是的,,2024/10/4 11:47
37,1522589433173967165,文本,0,test_person,test_person,我觉得还不如6小时,,2024/10/4 11:47
38,4189192320331088356,文本,1,12345iru2zsmo22,test_person,他说半天得提前预约,,2024/10/4 11:47
39,3276909886419115321,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=33935c33478d351ca575ed3b15c9c7d6&filekey=30350201010421301f020201060402535a041033935c33478d351ca575ed3b15c9c7d60203046683040d00000004627466730000000132&hy=SZ&storeid=266a76de2000bb3f307f6952b0000010600004f50535a2d4f20115699447b2&bizid=1023,2024/10/4 11:47
40,3794700043110742367,文本,0,test_person,test_person,我靠,,2024/10/4 11:47
41,4370237514919765211,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=e734d92f35462ae39096a6453a906c64&filekey=30440201010430302e02016e04025348042065373334643932663335343632616533393039366136343533613930366336340203011cf9040d00000004627466730000000132&hy=SH&storeid=266936043000dd1f97ec388740000006e01004fb153482828bbc1e6cab618e&ef=1&bizid=1022,2024/10/4 11:47
42,268652740652374624,文本,1,12345iru2zsmo22,test_person,我昨天没预约,,2024/10/4 11:47
43,8708021080758144662,文本,0,test_person,test_person,其实也玩不了多久 就环岛骑一圈,,2024/10/4 11:47
44,4042214828835453981,文本,1,12345iru2zsmo22,test_person,是滴,,2024/10/4 11:47
45,8134305588773593834,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-10\13d6d8a81fa7554d09238c81fe314e85.dat,2024/10/4 15:49
46,8231897199371315830,文本,1,12345iru2zsmo22,test_person,back了,,2024/10/4 15:49
47,2523360219807779607,文本,1,12345iru2zsmo22,test_person,烟台下次还来,,2024/10/4 15:49
48,5990956613985588267,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=d8f4bf1e82e7b54a140cadd0dd1788a9&filekey=30350201010421301f020201060402535a0410d8f4bf1e82e7b54a140cadd0dd1788a90203054dd3040d00000004627466730000000132&hy=SZ&storeid=266a7694900068f8207f6952b0000010600004f50535a043fb011502ab6738&bizid=1023,2024/10/4 15:49
49,8020701317904864408,文本,0,test_person,test_person,坏了忘记回你,,2024/10/4 21:24
50,678530733212459598,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=ecd83bbd5669b1ad8ab82368a884b4f4&filekey=30440201010430302e02016e040253480420656364383362626435363639623161643861623832333638613838346234663402030be21c040d00000004627466730000000132&hy=SH&storeid=2666a63330000e6ca9bd747110000006e01004fb1534829283bc1e72de8bdc&ef=1&bizid=1022,2024/10/4 21:24
51,3817737317258834248,文本,0,test_person,test_person,你租的小车车好可爱哈哈哈,,2024/10/4 21:24
52,1122488129382806721,文本,0,test_person,test_person,我之前租的时候 人家说这个没劲儿 租了个大的,,2024/10/4 21:25
53,1244017411047763227,引用回复,0,test_person,test_person,"（我们两个人骑）

[引用](2024-10-04 21:25:04)小虫:我之前租的时候 人家说这个没劲儿 租了个大的",,2024/10/4 21:25
54,7804635386065632983,文本,1,12345iru2zsmo22,test_person,hh,,2024/10/4 21:26
55,6582317494846210955,文本,1,12345iru2zsmo22,test_person,劲,,2024/10/4 21:26
56,6947874557250248646,文本,1,12345iru2zsmo22,test_person,相当大,,2024/10/4 21:26
57,7646558619446387721,系统通知,1,12345iru2zsmo22,test_person,<revokemsg>你撤回了一条消息</revokemsg>,,2024/10/4 21:26
58,4607675874750661759,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2c6bbf882e053b9639569c47194da071&filekey=30440201010430302e02016e0402534804203263366262663838326530353362393633393536396334373139346461303731020326eec4040d00000004627466730000000131&hy=SH&storeid=323032313130303432303436333430303061373166346264613936393936353336376234306230303030303036653031303034666231&ef=1&bizid=1022,2024/10/4 21:27
59,4251046275168351582,文本,1,12345iru2zsmo22,test_person,我都超??,,2024/10/4 21:27
60,3958569781970448507,文本,0,test_person,test_person,哈哈哈哈哈哈哈哈哈,,2024/10/4 21:27
61,1304768232206478205,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 23:11
62,5423864699615912300,文本,1,12345iru2zsmo22,test_person,生日快乐,,2024/10/4 23:11
63,3090401677076458687,系统通知,1,12345iru2zsmo22,test_person,<revokemsg>你撤回了一条消息</revokemsg>,,2024/10/4 23:11
64,2634320168319877355,文本,1,12345iru2zsmo22,test_person,（先知后觉,,2024/10/4 23:11
65,6872254500032132923,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=6f263986f9402e0259c58032d4b27403&filekey=30440201010430302e02016e04025348042036663236333938366639343032653032353963353830333264346232373430330203088dea040d00000004627466730000000132&hy=SH&storeid=265390f8400011cc0008b27880000006e01004fb1534806488bc1e0dae90d0&ef=1&bizid=1022,2024/10/4 23:12
66,1476937912918221285,文本,0,test_person,test_person,嘿嘿谢谢泥,,2024/10/4 23:12
67,5104969914181545205,文本,0,test_person,test_person,是明天嘟,,2024/10/4 23:12
68,4671697784411486925,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=4af51669b3933aeacf4c1823fe1a1654&filekey=3043020101042f302d02016e0402535a0420346166353136363962333933336165616366346331383233666531613136353402023d49040d00000004627466730000000132&hy=SZ&storeid=26673b134000b229540c40cba0000006e01004fb1535a1e1a70b1568486971&ef=1&bizid=1022,2024/10/4 23:12
69,942777755221289888,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=233b78d9d244a70087993eb38becca42&filekey=30350201010421301f02020106040253480410233b78d9d244a70087993eb38becca42020310919b040d00000004627466730000000132&hy=SH&storeid=266a9daa7000bc718169bd3e30000010600004f5053480627c0d1500e10846&bizid=1023,2024/10/4 23:13
70,4970918958858872918,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=06f0315fce5aeb37e36d90a449e55224&filekey=3043020101042f302d02016e0402535a04203036663033313566636535616562333765333664393061343439653535323234020269b0040d00000004627466730000000132&hy=SZ&storeid=26631ab4d0005d61f47603ed30000006e01004fb1535a0416fbc1e68d59226&ef=1&bizid=1022,2024/10/4 23:13
71,5088103607264479657,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2fd26fe001f15baa35d2c7c1f1f77a11&filekey=30440201010430302e02016e0402535a04203266643236666530303166313562616133356432633763316631663737613131020301efba040d00000004627466730000000132&hy=SZ&storeid=266a9e4680007ad63169bd3e30000006e01004fb1535a05ff801150a57b6eb&ef=1&bizid=1022,2024/10/4 23:13
72,1231447585119365782,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=f0648e1f78507fb5e0527c1847bb7eab&filekey=30350201010421301f020201060402535a0410f0648e1f78507fb5e0527c1847bb7eab0203046443040d00000004627466730000000132&hy=SZ&storeid=266a75fa5000a957907f6952b0000010600004f50535a0026bae1e00f753c3&bizid=1023,2024/10/4 23:13
73,7243064063443092107,文本,0,test_person,test_person,哈哈哈哈哈哈哈,,2024/12/15 21:01
74,4402111010190356867,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2bb2100bc3ed89cb9bb5cb2ddf096ba5&filekey=30440201010430302e02016e0402535a04203262623231303062633365643839636239626235636232646466303936626135020301364a040d00000004627466730000000132&hy=SZ&storeid=2655320ef000f23e7135418150000006e01004fb1535a2071d321e07d71e51&ef=1&bizid=1022,2024/12/15 21:01
75,7957007613667310251,动画表情,1,12345iru2zsmo22,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=a3edb04dc624702bf53ceb8b8533030e&filekey=30440201010430302e02016e04025348042061336564623034646336323437303262663533636562386238353333303330650203031c6a040d00000004627466730000000132&hy=SH&storeid=267384c7a0008ff136f6e1f2a0000006e01004fb1534807906bd1e77d51516&ef=1&bizid=1022,2024/12/15 21:01
76,6555773752081434395,文本,1,12345iru2zsmo22,test_person,一年级社畜,,2024/12/15 21:02
77,6113080126607357441,文本,0,test_person,test_person,妈耶好诡异,,2024/12/15 21:07
78,8861993796968204324,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=e868279290cc3be14207d9946d5f7479&filekey=30350201010421301f020201060402535a0410e868279290cc3be14207d9946d5f747902030204ac040d00000004627466730000000132&hy=SZ&storeid=264c8b3cb0002e54407f6952b0000010600004f50535a0df0c950b74cd5f3e&bizid=1023,2024/12/15 21:07
79,8225573753184622169,文本,1,12345iru2zsmo22,test_person,上班,,2024/12/15 21:07
80,2185462146394548348,文本,1,12345iru2zsmo22,test_person,是会这样的,,2024/12/15 21:07
81,2492434220482862582,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=8a3ec2eb01ffade8c1d2cef9ce7b9cd0&filekey=30440201010430302e02016e0402535a042038613365633265623031666661646538633164326365663963653762396364300203023e74040d00000004627466730000000132&hy=SZ&storeid=26561e4d90003ae9fa356ce630000006e01004fb1535a04b6bae1e6e3ef683&ef=1&bizid=1022,2024/12/15 21:08
82,3323843778125596201,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=e36da9879caba2492757a93c9bd1e8a3&filekey=30350201010421301f020201060402535a0410e36da9879caba2492757a93c9bd1e8a302030229bb040d00000004627466730000000132&hy=SZ&storeid=266f634870009dfbd6f6e1f2a0000010600004f50535a1b369bc1e691b22e9&bizid=1023,2024/12/15 21:08
83,4807494899095110953,文本,1,12345iru2zsmo22,test_person,？？你和拼多多签约了？？？,,2024/12/30 21:06
84,7732495155588506274,图片,0,12345iru2zsmo22,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-12\01c177d8ad90af8969ba048455b54eef.dat,2024/12/30 21:06
85,3524820582691543233,文本,0,test_person,test_person,给你99 给我花9.9买一个,,2024/12/30 21:08
86,4559086380629629977,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=60fc498474f154e67a9406e6052774e3&filekey=30350201010421301f0202010604025348041060fc498474f154e67a9406e6052774e302030a1000040d00000004627466730000000132&hy=SH&storeid=2638fd60e000bbbf33df216b40000010600004f5053482e31b8e0b68ce4bb7&bizid=1023,2024/12/30 21:08
87,2856784266939461622,文本,0,test_person,test_person,我和偷玩签约了（bushi）,,2024/12/30 21:09
88,4103450553959091238,文本,1,12345iru2zsmo22,test_person,啊哈哈哈哈,,2024/12/30 21:10
89,4320467837159769744,图片,0,test_person,test_person,图片,File\dd0e62b6eb67d195bc33ab9470301d6c\Image\2024-12\e7e73ba89149fc57ea6fd395b00c9daf.dat,2024/12/30 21:21
90,1245688256602333044,文本,0,test_person,test_person,不知道他有没有看上我,,2024/12/30 21:21
91,3496799115798928577,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=6d737e0cb5ed70dbd2e543192395e627&filekey=30440201010430302e02016e040253480420366437333765306362356564373064626432653534333139323339356536323702030108ea040d00000004627466730000000132&hy=SH&storeid=26743cbcd000e81a54eaf9c070000006e01004fb153481223f03156d7563c2&ef=1&bizid=1022,2024/12/30 21:21
92,8086695983133128935,文本,0,test_person,test_person,幸运的话 会送我小熊虫,,2024/12/30 21:21
93,1413825802731496171,文本,0,test_person,test_person,不幸运就没有后续了,,2024/12/30 21:21
94,4732788513210348588,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/12/30 21:25
95,2430661934638082113,文本,1,12345iru2zsmo22,test_person,聘你为,,2024/12/30 21:25
96,8416545366058458010,文本,1,12345iru2zsmo22,test_person,代言人,,2024/12/30 21:26
97,8818193356512955281,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=87a30250a72f68eb6dbcd3c833f34af9&filekey=30350201010421301f020201060402535a041087a30250a72f68eb6dbcd3c833f34af902030a08d8040d00000004627466730000000131&hy=SZ&storeid=32303231303632363030303735333030303965316137356663316362626162343537353830393030303030313036&bizid=1023,2024/12/30 21:27
98,2420601785318357838,文本,0,test_person,test_person,比不上别人,,2024/12/30 21:27
99,209384579714630809,文本,0,test_person,test_person,名额多 我才可能有机会,,2024/12/30 21:27
100,3867626588038981853,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=314db052c1847c0b51794ce3eff22482&filekey=30340201010420301e02020106040253480410314db052c1847c0b51794ce3eff224820202142f040d00000004627466730000000132&hy=SH&storeid=26303a1b5000ee6ad950c9c370000010600004f50534828034b00b6d05aeac&bizid=1023,2024/12/30 21:28


================================================
FILE: weclone/__init__.py
================================================


================================================
FILE: weclone/cli.py
================================================
import functools
import os
import sys
from pathlib import Path
from typing import cast

import click
import pyjson5
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

from weclone.utils.config import load_config
from weclone.utils.config_models import CliArgs
from weclone.utils.log import capture_output, configure_log_level_from_config, logger

# Parsed CLI arguments. Stays None until the `cli` group callback loads them with
# load_config(arg_type="cli_args"); read by apply_common_decorators at call time.
cli_config: CliArgs | None = None

# Use the standard-library TOML parser when it can be imported and fall back to
# the `tomli` package (bound to the same name) otherwise.
try:
    import tomllib  # type: ignore Python 3.11+
except ImportError:
    import tomli as tomllib


def clear_argv(func):
    """
    Decorator: run the wrapped function with sys.argv reduced to the script name only.

    The original sys.argv contents are restored once the call finishes, even when
    the function raises. This keeps the CLI's own arguments away from
    Hugging Face HfArgumentParser, which would otherwise raise ValueError.
    """

    @functools.wraps(func)
    def argv_isolated(*args, **kwargs):
        saved_argv = list(sys.argv)
        script_name = saved_argv[0]
        sys.argv = [script_name]  # Hide every argument except the script name
        try:
            result = func(*args, **kwargs)
        finally:
            sys.argv = saved_argv  # Always put the arguments back
        return result

    return argv_isolated


def with_community_info(func):
    """
    Decorator: print the community info panel, then run the wrapped command.

    The wrapped function's return value is passed through unchanged.
    """

    @functools.wraps(func)
    def announce_then_run(*args, **kwargs):
        show_community_info()
        outcome = func(*args, **kwargs)
        return outcome

    return announce_then_run


def apply_common_decorators(capture_output_enabled=False):
    """
    Build the decorator shared by the CLI commands.

    The resulting wrapper:
      * runs the command through ``clear_argv`` so sys.argv is reduced to the
        script name during the call, and
      * routes the command through ``capture_output`` when the loaded CLI
        config has ``full_log`` enabled (checked at call time, not at
        decoration time).

    Args:
        capture_output_enabled: Accepted for interface compatibility; the
            current implementation does not read it.

    Returns:
        A decorator that wraps a command function as described above.
    """

    def decorator(original_cmd_func):
        @functools.wraps(original_cmd_func)
        def run_command(*args, **kwargs):
            target = original_cmd_func
            if cli_config and cli_config.full_log:
                target = capture_output(original_cmd_func)
            return target(*args, **kwargs)

        argv_safe = clear_argv(run_command)
        # Re-apply the original command's metadata to the outermost wrapper.
        return functools.wraps(original_cmd_func)(argv_safe)

    return decorator


@click.group(invoke_without_command=True)
@click.option(
    "--config-path",
    default=None,
    help="Specify config file path, or set WECLONE_CONFIG_PATH environment variable",
)
@click.pass_context
def cli(ctx, config_path):
    """WeClone: One-stop solution for creating digital avatars from chat history"""
    # NOTE: no help= is passed to click.group, so click takes the help text from
    # the docstring above; keep it user-facing.
    global cli_config

    # Without a subcommand there is nothing to run: show the community panel
    # and the help text, then stop before any project checks.
    no_subcommand = ctx.invoked_subcommand is None
    if no_subcommand:
        show_community_info()
        click.echo(ctx.get_help())
        return

    # An explicit --config-path overrides the environment variable.
    if config_path:
        os.environ["WECLONE_CONFIG_PATH"] = config_path
        logger.info(f"Config file path set to: {config_path}")

    _check_project_root()
    _check_versions()

    # Load the CLI arguments once so the command wrappers can consult them.
    loaded_args = load_config(arg_type="cli_args")
    cli_config = cast(CliArgs, loaded_args)

    configure_log_level_from_config()


@cli.command("make-dataset", help="Process chat history CSV files to generate Q&A pair datasets.")
@with_community_info
@apply_common_decorators()
def qa_generator():
    """Entry point for ``make-dataset``: build a DataProcessor and run its ``main`` method."""
    # Imported lazily, inside the command, rather than at module import time.
    from weclone.data.qa_generator import DataProcessor

    DataProcessor().main()


@cli.command("train-sft", help="Fine-tune the model using prepared datasets.")
@apply_common_decorators()
def train_sft():
    """Entry point for ``train-sft``: delegate to ``weclone.train.train_sft.main``."""
    # Imported lazily, inside the command, rather than at module import time.
    from weclone.train.train_sft import main as run_training

    run_training()


@cli.command("webchat-demo", help="Launch Web UI for interactive testing with fine-tuned model.")
@apply_common_decorators()
def web_demo():
    """Entry point for ``webchat-demo``: delegate to ``weclone.eval.web_demo.main``."""
    # Imported lazily, inside the command, rather than at module import time.
    from weclone.eval.web_demo import main as launch_web_demo

    launch_web_demo()


# TODO: Add evaluation functionality. Until then this function is NOT registered
# as a CLI command; the intended registration is:
# @cli.command("eval-model", help="Evaluate using validation set split from training data.")
@apply_common_decorators()
def eval_model():
    """Delegate to ``weclone.eval.eval_model.main`` (evaluation on the validation split)."""
    # Imported lazily, inside the function, rather than at module import time.
    from weclone.eval.eval_model import main as run_evaluation

    run_evaluation()


@cli.command("test-model", help="Test model with common chat questions.")
@apply_common_decorators()
def test_model():
    """Entry point for ``test-model``: delegate to ``weclone.eval.test_model.main``."""
    # Imported lazily, inside the command, rather than at module import time.
    from weclone.eval.test_model import main as run_model_test

    run_model_test()


@cli.command("server", help="Start API service providing model inference interface.")
@apply_common_decorators()
def server():
    """Entry point for ``server``: delegate to ``weclone.server.api_service.main``."""
    # Imported lazily, inside the command, rather than at module import time.
    from weclone.server.api_service import main as start_api_service

    start_api_service()


@cli.command("version", help="Show WeClone version information.")
@with_community_info
def version():
    """Entry point for ``version``.

    The body is intentionally empty: the only work done here comes from the
    ``with_community_info`` decorator. The group callback ``cli`` calls
    ``_check_versions()`` before any subcommand runs, which is presumably where
    the version details are reported.
    """


def show_community_info():
    """Print a bordered panel with the project's community, social media and resource links."""
    console = Console()

    # (text, style) fragments; they are appended to the panel body in exactly this order.
    segments = (
        ("📱 Official group\n", "bold green"),
        ("   • Telegram: ", "bold cyan"),
        ("https://t.me/+JEdak4m0XEQ3NGNl\n", "bright_blue"),
        ("   • QQ群: ", "bold cyan"),
        ("708067078\n\n", "bright_green"),
        ("🌐 Social media\n", "bold magenta"),
        ("   • Twitter: ", "bold cyan"),
        ("https://x.com/weclone567\n", "bright_blue"),
        ("   • 小红书: ", "bold cyan"),
        ("🔍 搜索WeClone\n\n", "bright_blue"),
        ("📚 Official resources\n", "bold red"),
        ("   • Repository: ", "bold cyan"),
        ("https://github.com/xming521/WeClone\n", "bright_blue"),
        ("   • Homepage: ", "bold cyan"),
        ("https://www.weclone.love/\n", "bright_blue"),
        ("   • Document: ", "bold cyan"),
        ("https://docs.weclone.love/\n\n", "bright_blue"),
        ("💡 感谢您的关注和支持！Thank you for your support!", "bold bright_green"),
    )

    body = Text()
    for fragment, fragment_style in segments:
        body.append(fragment, style=fragment_style)

    console.print(
        Panel(
            body,
            title="🌟 Community & Social Media",
            title_align="center",
            border_style="bright_cyan",
            padding=(1, 2),
        )
    )


def _check_project_root():
    """Abort unless the current working directory is the WeClone project root.

    The directory is accepted when it contains a ``pyproject.toml`` file whose
    ``[project]`` table has ``name`` equal to ``"WeClone"``. Every failure
    (missing file, wrong name, unparsable TOML, any other error while reading)
    is logged and terminates the process with exit status 1.
    """
    project_root_marker = "pyproject.toml"
    pyproject_path = Path.cwd() / project_root_marker

    if not pyproject_path.is_file():
        logger.error(f"{project_root_marker} file not found in current directory.")
        logger.error("Please ensure you are running this command in the WeClone project root directory.")
        sys.exit(1)

    try:
        with pyproject_path.open("rb") as toml_file:  # tomllib only accepts binary streams
            parsed = tomllib.load(toml_file)
        project_table = parsed.get("project", {})
        if project_table.get("name") != "WeClone":
            logger.error("Please ensure you are running in the correct WeClone project root directory.")
            # SystemExit is not an Exception subclass, so the handlers below let it through.
            sys.exit(1)
    except tomllib.TOMLDecodeError as e:
        logger.error(f"Error: Unable to parse {pyproject_path} file: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error occurred while reading or processing {pyproject_path}: {e}")
        sys.exit(1)


def _check_versions():
    """Compare the local settings.jsonc version with the config version declared in pyproject.toml.

    Both files are looked up in the parent directory of this file's package.

    * ``settings.jsonc`` must exist, be parsable and carry a truthy ``version``
      field; otherwise an error is logged and the process exits with status 1.
    * ``pyproject.toml`` is optional: when it is missing or unreadable only a
      warning is logged and the comparison is skipped.
    * When ``[tool.weclone].config_version`` is present and differs from the
      settings version, a warning together with ``config_changelog`` is
      logged. The project and config versions are logged afterwards.

    The whole check is skipped when ``tomllib`` is ``None``.
    """
    if tomllib is None:  # Skip check if toml parser failed to import
        return

    root_dir = Path(__file__).parent.parent
    settings_path = root_dir / "settings.jsonc"
    pyproject_path = root_dir / "pyproject.toml"

    settings_version = None
    config_guide_version = None
    config_changelog = None
    project_version = None

    if settings_path.exists():
        try:
            with open(settings_path, "r", encoding="utf-8") as f:
                settings_data = pyjson5.loads(f.read())
                settings_version = settings_data.get("version")
        except Exception as e:
            logger.error(f"Error: Unable to read or parse {settings_path}: {e}")
            logger.error("Please ensure settings.jsonc file exists and is properly formatted.")
            sys.exit(1)
    else:
        logger.error(f"Error: Config file {settings_path} not found.")
        logger.error("Please ensure settings.jsonc file is located in the project root directory.")
        sys.exit(1)

    if pyproject_path.exists():
        try:
            with open(pyproject_path, "rb") as f:  # tomllib requires binary mode
                pyproject_data = tomllib.load(f)
                weclone_tool_data = pyproject_data.get("tool", {}).get("weclone", {})
                config_guide_version = weclone_tool_data.get("config_version")
                config_changelog = weclone_tool_data.get("config_changelog", "N/A")
                project_version = pyproject_data.get("project", {}).get("version")
        except Exception as e:
            # A broken pyproject.toml only disables the comparison; it is not fatal.
            logger.warning(
                f"Warning: Unable to read or parse {pyproject_path}: {e}. Cannot check if config file is up to date."
            )
    else:
        logger.warning(
            f"Warning: File {pyproject_path} not found. Cannot check if config file is up to date."
        )

    if not settings_version:
        logger.error(f"Error: 'version' field not found in {settings_path}.")
        # The template shipped with the repository is settings.template.jsonc.
        logger.error("Please copy from settings.template.jsonc or update your settings.jsonc file.")
        sys.exit(1)

    if config_guide_version:
        if settings_version != config_guide_version:
            logger.warning(
                f"Warning: Your settings.jsonc file version ({settings_version}) does not match the project's recommended config version ({config_guide_version})."
            )
            logger.warning(
                "This may cause unexpected behavior or errors. Please copy from settings.template.jsonc or update your settings.jsonc file."
            )
            # TODO Print update log based on version number
            logger.warning(f"Config file changelog:\n{config_changelog}")

        logger.info(f"📦 Project Version: {project_version}")
        logger.info(f"⚙️  Config Version: {settings_version}")
    elif pyproject_path.exists():  # If file exists but version not found
        logger.warning(
            f"Warning: 'config_version' field not found under [tool.weclone] in {pyproject_path}. "
            "Cannot confirm if your settings.jsonc is the latest config version."
        )


# Invoke the `cli` command group when this module is executed as a script.
if __name__ == "__main__":
    cli()


================================================
FILE: weclone/core/PII/__init__.py
================================================
from .pii_detector import ChinesePIIDetector, PIIDetector, PIIResult

# Names re-exported from `.pii_detector` as the public API of this package.
__all__ = ["PIIResult", "PIIDetector", "ChinesePIIDetector"]


================================================
FILE: weclone/core/PII/pii_detector.py
================================================
from dataclasses import dataclass
from typing import List, Optional, cast

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, Pattern, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine.recognizer_result import (
    RecognizerResult as AnonymizerRecognizerResult,  # type: ignore
)

# from presidio_analyzer.analyzer_engine import logger as presidio_logger
from weclone.utils.log import logger


@dataclass
class PIIResult:
    """A single PII finding produced by the detectors in this module."""

    # Entity type reported by the analyzer for this finding.
    entity_type: str
    # Start offset of the finding in the analyzed text.
    start: int
    # End offset of the finding; the detectors slice the text as text[start:end].
    end: int
    # Score reported by the analyzer for this finding.
    score: float
    # The matched substring, i.e. text[start:end] of the analyzed text.
    text: str


class PIIDetector:
    """PII detector based on presidio library.

    Wraps a presidio ``AnalyzerEngine`` (spaCy NLP engine), a
    ``BatchAnalyzerEngine`` built on top of it and an ``AnonymizerEngine``.
    By default detection is restricted to ``filtered_entities``: every entity
    type known to the analyzer registry except those listed in
    ``not_filtered_entities``.
    """

    def __init__(self, language: str = "en", threshold: float = 0.5):
        """
        Args:
            language: Language code used for the NLP model and for every analysis call
            threshold: Score threshold passed to the analyzer for every analysis call
        """
        self.language = language
        self.threshold = threshold

        self._init_engines()
        self.anonymizer = AnonymizerEngine()
        # Entity types that are deliberately NOT treated as PII by the detection methods.
        self.not_filtered_entities = ["DATE_TIME", "PERSON", "URL", "NRP"]
        self.supported_entities = self.get_all_entities()
        self.filtered_entities = [
            entity for entity in self.supported_entities if entity not in self.not_filtered_entities
        ]
        if self.language == "en":
            logger.info(f"Privacy filtered entity types: {self.filtered_entities}")

    def _init_engines(self):
        """Create the spaCy-backed analyzer, register custom recognizers and build the batch analyzer."""
        model_mapping = {
            "zh": "zh_core_web_sm",
            "en": "en_core_web_sm",
            "es": "es_core_news_sm",
            "fr": "fr_core_news_sm",
            "de": "de_core_news_sm",
        }

        # Languages without an entry fall back to the English model, but the model is
        # still registered under the requested language code.
        model_name = model_mapping.get(self.language, "en_core_web_sm")

        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": self.language, "model_name": model_name}],
        }

        provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
        nlp_engine = provider.create_engine()

        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

        # Recognizers are added before the batch analyzer is created around the analyzer.
        self._add_custom_recognizers(language=self.language)

        self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)

        logger.info(
            f"Presidio engine initialized successfully, using language: {self.language}, model: {model_name}"
        )

    def _add_custom_recognizers(self, language: str):
        """Register the ``NUMERIC_ID`` pattern recognizer for ``language`` in the analyzer registry."""
        # Create numeric ID recognizer - matches 5+ digit numbers or numbers with - separators
        numeric_id_patterns = [
            Pattern(name="numeric_id", regex=r"\b(?:[A-Za-z]*\d{5,}[A-Za-z]*|\d+-\d+(?:-\d+)*)\b", score=0.8),
            # Literal "\uXXXX" escape sequences that appear as text.
            Pattern(name="unicode_escape_id", regex=r"\\u[0-9a-fA-F]{4}", score=0.8),
            # The literal four-character text "\xa0".
            Pattern(name="hex_escape_id", regex=r"\\xa0", score=0.8),
        ]

        numeric_id_recognizer = PatternRecognizer(
            supported_entity="NUMERIC_ID",
            patterns=numeric_id_patterns,
            supported_language=language,
            name="numeric_id_recognizer",
            context=["id", "编号", "号码", "代码", "code", "number", "序号", "sequence", "identifier"],
        )

        self.analyzer.registry.add_recognizer(numeric_id_recognizer)

        logger.info("Custom numeric ID recognizer added")

    @staticmethod
    def _to_pii_results(text: str, results) -> List["PIIResult"]:
        """Convert analyzer results for ``text`` into ``PIIResult`` objects, keeping their order."""
        return [
            PIIResult(
                entity_type=result.entity_type,
                start=result.start,
                end=result.end,
                score=result.score,
                text=text[result.start : result.end],
            )
            for result in results
        ]

    def has_pii(self, text: str, entities: Optional[List[str]] = None) -> bool:
        """
        Check if a text contains PII information

        Args:
            text: Text to be checked
            entities: Specified entity types to detect, defaults to ``self.filtered_entities``

        Returns:
            True if at least one PII entity was detected
        """
        # Forward `entities` so a caller-supplied restriction is actually honoured.
        return len(self.detect_pii(text, entities=entities)) > 0

    def batch_has_pii(self, texts: List[str]) -> List[bool]:
        """
        Check if multiple texts contain PII information using batch processing

        Args:
            texts: List of texts to be checked

        Returns:
            List of boolean values indicating whether each text contains PII
        """
        if not texts or not isinstance(texts, list):
            return []

        batch_results = self.batch_detect_pii(texts)
        return [len(results) > 0 for results in batch_results]

    def detect_pii(self, text: str, entities: Optional[List[str]] = None) -> List["PIIResult"]:
        """
        Detect PII information in text

        Args:
            text: Text to be detected
            entities: Specified entity types to detect, defaults to ``self.filtered_entities``

        Returns:
            List of detected PII information (empty for empty or non-string input)
        """
        if not text or not isinstance(text, str):
            return []

        results = self.analyzer.analyze(
            text=text,
            language=self.language,
            entities=self.filtered_entities if entities is None else entities,
            score_threshold=self.threshold,
        )

        pii_results = self._to_pii_results(text, results)

        if pii_results:
            logger.debug(f"Detected {len(pii_results)} PII entities")

        return pii_results

    def batch_detect_pii(
        self, texts: List[str], n_process: int = 24, batch_size: int = 32
    ) -> List[List["PIIResult"]]:
        """
        Detect PII information in multiple texts using batch processing

        Args:
            texts: List of texts to be detected
            n_process: Value forwarded as ``n_process`` to the batch analyzer
            batch_size: Value forwarded as ``batch_size`` to the batch analyzer

        Returns:
            List of lists containing detected PII information for each text, aligned
            with ``texts``; empty or non-string entries get an empty list
        """
        if not texts or not isinstance(texts, list):
            return []

        # Filter out empty or non-string texts, remembering where each valid text came from
        valid_texts = []
        text_indices = []
        for i, text in enumerate(texts):
            if text and isinstance(text, str):
                valid_texts.append(text)
                text_indices.append(i)

        if not valid_texts:
            return [[] for _ in texts]

        # Use batch analyzer for multiple texts
        results_iterator = self.batch_analyzer.analyze_iterator(
            texts=valid_texts,
            language=self.language,
            entities=self.filtered_entities,
            score_threshold=self.threshold,
            n_process=n_process,
            batch_size=batch_size,
        )

        # Results come back in the order of `valid_texts`; map them to the original positions
        all_pii_results = [[] for _ in texts]
        for batch_idx, results in enumerate(results_iterator):
            all_pii_results[text_indices[batch_idx]] = self._to_pii_results(valid_texts[batch_idx], results)

        total_entities = sum(len(results) for results in all_pii_results)
        if total_entities > 0:
            logger.debug(f"Batch detected {total_entities} PII entities across {len(valid_texts)} texts")

        return all_pii_results

    def anonymize_text(self, text: str, entities: Optional[List[str]] = None) -> str:
        """
        Anonymize PII information in text

        Args:
            text: Text to be anonymized
            entities: Specified entity types to anonymize; ``None`` is passed to the
                analyzer unchanged (it is NOT replaced by ``self.filtered_entities``)

        Returns:
            Anonymized text; the input is returned unchanged when it is empty, not a
            string, or when analysis/anonymization raises
        """
        if not text or not isinstance(text, str):
            return text

        try:
            analyzer_results = self.analyzer.analyze(
                text=text, language=self.language, entities=entities, score_threshold=self.threshold
            )

            anonymized_result = self.anonymizer.anonymize(
                text=text, analyzer_results=cast(List[AnonymizerRecognizerResult], analyzer_results)
            )

            logger.info(f"Successfully anonymized {len(analyzer_results)} PII entities")
            return anonymized_result.text

        except Exception as e:
            # Best effort: log the failure and hand back the original text.
            logger.error(f"Text anonymization failed: {e}")
            return text

    def get_supported_entities(self) -> List[str]:
        """Return the entity types the analyzer reports as supported for ``self.language``."""
        return self.analyzer.get_supported_entities(language=self.language)

    def get_all_entities(self) -> List[str]:
        """Get all entities including custom ones from the registry"""
        predefined_entities = self.get_supported_entities()
        custom_entities = []
        seen = set(predefined_entities)

        # Collect, in registry order and without duplicates, entities not already predefined
        for recognizer in self.analyzer.registry.recognizers:
            for entity in recognizer.supported_entities:
                if entity not in seen:
                    seen.add(entity)
                    custom_entities.append(entity)

        return predefined_entities + custom_entities


class ChinesePIIDetector(PIIDetector):
    """Chinese PII detector, extended to recognize Chinese-specific PII.

    Runs the parent detector with language ``"zh"``, registers an additional
    ``CHINESE_PII`` pattern recognizer and narrows ``filtered_entities`` by
    dropping country-specific entity types.
    """

    def __init__(self, threshold: float = 0.5):
        """
        Args:
            threshold: Score threshold passed to the analyzer for every analysis call
        """
        super().__init__(language="zh", threshold=threshold)

        # Filter out country-specific entities that are not relevant for Chinese context
        country_prefixes = ("US_", "UK_", "SG_", "AU_", "IN_")
        # Get entities that are actually supported by the analyzer
        all_entities = self.get_all_entities()
        supported_entities = self.get_supported_entities()

        self.filtered_entities = [
            entity
            for entity in all_entities
            if entity not in self.not_filtered_entities
            and not entity.startswith(country_prefixes)
            and (entity in supported_entities or entity in ["NUMERIC_ID", "CHINESE_PII"])
        ]
        logger.info(f"Chinese PII filtered entity types: {self.filtered_entities}")

    def _add_custom_recognizers(self, language: str):
        """Register the parent recognizers and the ``CHINESE_PII`` recognizer.

        The ``language`` argument is accepted for signature compatibility with the
        parent class; both recognizers are always registered for ``"zh"``.
        """
        # Add parent class recognizers first
        super()._add_custom_recognizers(language="zh")

        # Add Chinese-specific recognizers that are not covered by NUMERIC_ID
        chinese_patterns = [
            # 17 digits followed by the check character X/x
            Pattern(name="chinese_id_with_x", regex=r"\b\d{17}[Xx]\b", score=0.9),
            # The TLD class is [A-Za-z]: a "|" inside a character class is a literal
            # pipe, not alternation, so it must not be part of the class.
            Pattern(
                name="chinese_email", regex=r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", score=0.9
            ),
            Pattern(
                name="chinese_email_with_plus",
                regex=r"\b[A-Za-z0-9._%+-]+\+[A-Za-z0-9._%+-]*@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                score=0.95,
            ),
        ]

        chinese_recognizer = PatternRecognizer(
            supported_entity="CHINESE_PII",
            supported_language="zh",
            patterns=chinese_patterns,
            name="chinese_pii_recognizer",
            context=["中文PII"],
        )
        self.analyzer.registry.add_recognizer(chinese_recognizer)

        logger.info("Chinese PII recognizer added")


================================================
FILE: weclone/core/inference/offline_infer.py
================================================
import re
from typing import List, Optional, cast

import torch
from llamafactory.data import get_template_and_fix_tokenizer
from llamafactory.extras.misc import get_device_count
from llamafactory.hparams import get_infer_args
from llamafactory.model import load_tokenizer
from openai.types.chat import ChatCompletion
from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams

from weclone.utils.config import load_config
from weclone.utils.config_models import VllmArgs
from weclone.utils.log import logger

# from vllm.entrypoints.openai.tool_parsers import xLAMToolParser

# NOTE: the V1 LLM engine writing style was used.


def extract_json_from_text(text: str) -> str:
    """Return the JSON payload of ``text`` with surrounding whitespace removed.

    When ``text`` contains a markdown block fenced with ```` ```json ... ``` ````,
    the content of the first such block is returned; otherwise the whole text is
    returned.
    """
    fenced = re.search(r"```json\s*(.*?)\s*```", text, flags=re.DOTALL)
    payload = text if fenced is None else fenced.group(1)
    return payload.strip()


def parse_guided_decoding_results(
    results: List[RequestOutput] | List[ChatCompletion] | List, guided_decoding_class: type[BaseModel]
) -> tuple[List[Optional[BaseModel]], List[int]]:
    """Parse guided decoding results and return parsed results with failed indices.

    Each result is handled independently: a result that cannot be parsed (unsupported
    type, missing output/choice, ``None`` content, invalid JSON, schema mismatch) is
    logged, recorded in the failed indices and represented by ``None`` in the parsed
    list. A single bad result never aborts the whole batch.

    Args:
        results: Raw generation results (vLLM ``RequestOutput`` or OpenAI ``ChatCompletion``)
        guided_decoding_class: Pydantic model class for validation

    Returns:
        tuple: (parsed_results, failed_indices) where parsed_results is aligned with
               ``results`` and failed_indices contains indices of failed JSON parsing
    """
    parsed_results: List[Optional[BaseModel]] = []
    failed_indexs: List[int] = []

    for idx, result in enumerate(results):
        # Captured outside the handler so the log message never has to re-index the
        # result (which could raise again, e.g. for empty outputs or None content).
        raw_text: Optional[str] = None
        try:
            if isinstance(result, RequestOutput):
                raw_text = result.outputs[0].text
            elif isinstance(result, ChatCompletion):
                raw_text = result.choices[0].message.content
            else:
                raise ValueError(f"Unsupported result type: {type(result)}")
            if raw_text is None:
                raise ValueError("Message content is None")
            json_text = extract_json_from_text(raw_text)
            parsed_results.append(guided_decoding_class.model_validate_json(json_text))
        except Exception as e:
            preview_source = str(result) if raw_text is None else raw_text
            log_text = preview_source[:100] + "..."
            logger.warning(
                f"Failed to parse JSON from result at sequence index {idx}: {log_text}, error: {e}"
            )
            failed_indexs.append(idx)
            parsed_results.append(None)

    return parsed_results, failed_indexs


def vllm_infer(
    inputs: List[str],
    model_name_or_path: str,
    adapter_name_or_path: Optional[str] = None,
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 2048,
    max_samples: Optional[int] = None,
    vllm_config: str = "{}",
    save_name: str = "generated_predictions.jsonl",
    default_system: Optional[str] = None,
    enable_thinking: bool = False,
    temperature: float = 0.95,
    top_p: float = 0.7,
    top_k: int = 50,
    guided_decoding_class: Optional[type[BaseModel]] = None,
    bad_words: Optional[List[str]] = None,
    logprobs: Optional[int] = None,
    max_new_tokens: int = 1024,
    repetition_penalty: float = 1.0,
    skip_special_tokens: bool = True,
    seed: Optional[int] = None,
    pipeline_parallel_size: int = 1,
    image_max_pixels: int = 768 * 768,
    image_min_pixels: int = 32 * 32,
) -> tuple[List[RequestOutput] | List[Optional[BaseModel]], List[int]]:
    r"""Generate one chat completion per input string with an offline vLLM engine.

    Every entry of ``inputs`` is sent as a single-turn user message. The engine is
    created for this call and released afterwards. Engine options are layered in
    this order (later wins): built-in defaults, the non-``None`` fields of the
    project's ``vllm`` config section, then ``vllm_config`` when the parsed
    inference arguments expose it as a dict.

    ``save_name``, ``default_system``, ``logprobs``, ``image_max_pixels`` and
    ``image_min_pixels`` are accepted but not referenced in this function body.

    Returns:
        tuple: (results, failed_indices). Without ``guided_decoding_class`` the results
        are the raw vLLM outputs and failed_indices is empty; with it, the results are
        parsed model instances (``None`` on failure) and failed_indices contains the
        indices of failed JSON parsing.

    Raises:
        ValueError: If ``pipeline_parallel_size`` exceeds the detected device count.
    """
    if pipeline_parallel_size > get_device_count():
        raise ValueError("Pipeline parallel size should be smaller than the number of gpus.")

    wc_vllm_args = cast(VllmArgs, load_config("vllm"))

    infer_overrides = {
        "model_name_or_path": model_name_or_path,
        "adapter_name_or_path": adapter_name_or_path,
        "dataset": dataset,
        "dataset_dir": dataset_dir,
        "template": template,
        "cutoff_len": cutoff_len,
        "max_samples": max_samples,
        "preprocessing_num_workers": 16,
        "vllm_config": vllm_config,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "enable_thinking": enable_thinking,
    }
    model_args, data_args, _, generating_args = get_infer_args(infer_overrides)

    tokenizer = load_tokenizer(model_args)["tokenizer"]
    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
    template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate

    # Constrain generation to the model's JSON schema only when a class is supplied.
    guided_decoding_params = None
    if guided_decoding_class:
        guided_decoding_params = GuidedDecodingParams(
            json=guided_decoding_class.model_json_schema(), disable_any_whitespace=True
        )

    sampling_params = SamplingParams(
        repetition_penalty=generating_args.repetition_penalty or 1.0,
        temperature=generating_args.temperature,
        top_p=generating_args.top_p or 1.0,
        top_k=generating_args.top_k or -1,
        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
        max_tokens=generating_args.max_new_tokens,
        skip_special_tokens=skip_special_tokens,
        seed=seed,
        bad_words=bad_words,
        guided_decoding=guided_decoding_params,
    )

    # Only the first adapter path is used for the LoRA request.
    has_adapter = model_args.adapter_name_or_path is not None
    lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0]) if has_adapter else None

    engine_args = {
        "model": model_args.model_name_or_path,
        "trust_remote_code": True,
        "dtype": model_args.infer_dtype,
        "max_model_len": cutoff_len + max_new_tokens,
        "disable_log_stats": True,
        "enable_lora": has_adapter,
        "enable_prefix_caching": True,
        "guided_decoding_backend": "guidance",
        "guided_decoding_disable_any_whitespace": True,
    }

    # Any multimodal plugin other than the base one gets per-prompt media limits.
    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}

    # Project-level vLLM settings override the defaults above; unset (None) fields are skipped.
    engine_args.update({key: value for key, value in wc_vllm_args.model_dump().items() if value is not None})

    if isinstance(model_args.vllm_config, dict):
        engine_args.update(model_args.vllm_config)

    messages_list = [[{"role": "user", "content": prompt}] for prompt in inputs]

    llm = LLM(**engine_args)

    results = llm.chat(
        messages_list,
        sampling_params,
        lora_request=lora_request,
        chat_template_kwargs={"enable_thinking": enable_thinking},
    )  # type: ignore

    # Drop the engine reference and release cached CUDA memory before post-processing.
    del llm
    torch.cuda.empty_cache()

    if not guided_decoding_class:
        return results, []

    # TODO better json decode  https://github.com/vllm-project/vllm/commit/1d0ae26c8544fd5a62e171e30c2dcc2973a23bc8#diff-3b27790a2ce97bc50cdd5476f7b0057da682ed0d1ec8426a7b76c5e21454e57d
    parsed_results, failed_indexs = parse_guided_decoding_results(results, guided_decoding_class)
    return parsed_results, failed_indexs


================================================
FILE: weclone/core/inference/online_infer.py
================================================
import logging
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable, List, Optional, Union

from openai import OpenAI
from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
from pydantic import BaseModel

from weclone.core.inference.offline_infer import extract_json_from_text
from weclone.utils.log import logger
from weclone.utils.retry import retry_openai_api

# Raise the "openai._base_client" and "httpx" loggers to WARNING so that their
# lower-severity records (INFO/DEBUG) are not emitted.
logging.getLogger("openai._base_client").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)


class OnlineLLM:
    """Client for an OpenAI-compatible chat completion API with thread-pool batching.

    Can be used as a context manager; leaving the ``with`` block shuts down the
    internal thread pool.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str,
        model_name: str,
        default_system: Optional[str] = None,
        max_workers: int = 10,
        prompt_with_system: bool = False,
        response_format: str = "json_object",
    ):
        """
        Args:
            api_key: API key passed to the OpenAI client
            base_url: Base URL passed to the OpenAI client
            model_name: Model name sent with every request
            default_system: Stored on the instance; not sent by ``chat``
            max_workers: Size of the thread pool used by ``chat_async`` / ``chat_batch``
            prompt_with_system: When True, ``chat`` treats its prompt as a ready-made
                message list instead of a single user prompt
            response_format: ``type`` of the ``response_format`` request field; a falsy
                value omits the field
        """
        self.api_key = api_key
        self.base_url = base_url
        self.model_name = model_name
        self.default_system = default_system
        self.max_workers = max_workers
        # The client's own retries are disabled; `chat` carries the retry decorator instead.
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self.prompt_with_system = prompt_with_system
        self.response_format = response_format

    @retry_openai_api(max_retries=200, base_delay=30.0, max_delay=180.0)
    def chat(
        self,
        prompt_text,
        temperature: float = 0.7,
        max_tokens: int = 1024,
        top_p: float = 0.95,
        stream: bool = False,
    ):
        """Send one chat completion request and return the client's response.

        Args:
            prompt_text: A message list when ``prompt_with_system`` is set, otherwise
                the content of a single user message
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            top_p: Top-p sampling parameter
            stream: Whether to stream the response
        """
        messages: List[ChatCompletionMessageParam] = []
        if self.prompt_with_system:
            messages = prompt_text
        else:
            messages = [
                {"role": "user", "content": prompt_text},
            ]

        params = {
            "model": self.model_name,
            "messages": messages,
            "stream": stream,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": top_p,
        }

        if self.response_format:
            params["response_format"] = {"type": self.response_format}

        response = self.client.chat.completions.create(**params)

        return response

    def chat_async(
        self,
        prompt_text: str,
        temperature: float = 0.7,
        max_tokens: int = 1024,
        top_p: float = 0.95,
        stream: bool = False,
    ) -> Future:
        """Submit a chat request to the thread pool for async processing"""
        return self.executor.submit(self.chat, prompt_text, temperature, max_tokens, top_p, stream)

    def chat_batch(
        self,
        prompts: List[str],
        temperature: float = 0.7,
        max_tokens: int = 1024,
        top_p: float = 0.95,
        stream: bool = False,
        callback: Optional[Callable[[int, Any], None]] = None,
        guided_decoding_class: Optional[type[BaseModel]] = None,
    ) -> Union[List[Union[ChatCompletion, Exception]], tuple[List[Optional[BaseModel]], List[int]]]:
        """Process multiple chat requests concurrently using thread pool

        Args:
            prompts: List of prompt strings
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            top_p: Top-p sampling parameter
            stream: Whether to stream the response
            callback: Optional callback function called for each result
            guided_decoding_class: Pydantic model class for JSON validation

        Returns:
            If guided_decoding_class is not given: List of ChatCompletion or Exception
            objects, aligned with ``prompts``
            If guided_decoding_class is given: Tuple of (parsed_results, failed_indices)
            where parsed_results is aligned with ``prompts`` (None for failures)
        """
        futures = []

        for i, prompt in enumerate(prompts):
            future = self.chat_async(prompt, temperature, max_tokens, top_p, stream)
            futures.append((i, future))

        results: List[Union[Any, Exception]] = [None] * len(prompts)

        # Results are collected in submission order; a failed request is stored as its exception.
        for i, future in futures:
            try:
                result = future.result()
                results[i] = result
                if callback:
                    callback(i, result)
            except Exception as e:
                results[i] = e
                if callback:
                    callback(i, e)

        if guided_decoding_class:
            parsed_results: List[Optional[BaseModel]] = [None] * len(prompts)
            failed_indexs: List[int] = []

            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    failed_indexs.append(i)
                    logger.warning(f"Request at index {i} failed with exception: {result}")
                elif isinstance(result, ChatCompletion):
                    # Captured before parsing so the failure log never has to re-index the
                    # response (an empty `choices` list would raise again inside the handler).
                    content: Optional[str] = None
                    try:
                        content = result.choices[0].message.content
                        if content is None:
                            raise ValueError("Message content is None")
                        json_text = extract_json_from_text(content)
                        parsed_results[i] = guided_decoding_class.model_validate_json(json_text)
                    except Exception as e:
                        log_text = (content[:100] + "...") if content else "None"
                        logger.warning(
                            f"Failed to parse JSON from result at index {i}: {log_text}, error: {e}"
                        )
                        failed_indexs.append(i)
                else:
                    logger.warning(f"Unexpected result type at index {i}: {type(result)}")
                    failed_indexs.append(i)

            return parsed_results, failed_indexs

        return results

    def close(self):
        """Clean up thread pool resources"""
        # `executor` may be missing if __init__ failed before creating it.
        if hasattr(self, "executor"):
            self.executor.shutdown(wait=True)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()


================================================
FILE: weclone/data/__init__.py
================================================


================================================
FILE: weclone/data/chat_parsers/telegram_parser.py
================================================
import csv
import json
import os
import shutil
import sys
from datetime import datetime
from typing import Dict, List

from pandas import Timestamp

from weclone.data.models import ChatMessage
from weclone.utils.config_models import DataModality, WCMakeDatasetConfig
from weclone.utils.log import logger


class TelegramChatParser:
    """Telegram chat parser that converts the JSON export format into ChatMessage records.

    One parser instance keeps a running ``message_counter`` that is used as the
    sequential ``id`` of every ChatMessage it produces.
    """

    def __init__(self, config: WCMakeDatasetConfig):
        """
        Parameters
        ----------
        config : WCMakeDatasetConfig
            Dataset configuration. ``telegram_args.my_id`` identifies the user's own
            messages and ``include_type`` controls sticker handling.
        """
        self.config = config
        self.my_user_id = config.telegram_args.my_id if config.telegram_args else None
        self.message_counter = 0

        # Telegram message/media type -> type_name stored on ChatMessage.
        self.type_mapping = {
            "text": "text",
            "photo": "image",
            "video_file": "video",
            "video_message": "video",  # round video notes exported by Telegram Desktop
            "animation": "video",
            "voice_message": "voice",
            "audio_file": "file",
            "sticker": "sticker",
            "file": "file",
            "location": "location",
            "poll": "(share) card link",
            "contact_information": "(share) card link",
        }

    def get_message_type_and_content(self, message: Dict) -> tuple[str, str, str, bool]:
        """
        Determine type_name, msg content, src and whether it's a forwarded message based on Telegram message content

        Parameters
        ----------
        message : Dict
            One entry of the ``messages`` array of a Telegram export.

        Returns
        -------
        tuple[str, str, str, bool]
            (type_name, msg_content, src_path, is_forward). ``msg_content`` is stripped.
            A ``media_type`` that is not in ``type_mapping`` yields type_name "unknown"
            instead of raising KeyError.
        """
        msg_content = self.extract_text_content(message["text"]) if "text" in message else ""
        src_path = ""
        msg_type = "text"
        is_forward = "forwarded_from" in message

        if "media_type" in message:
            msg_type = message["media_type"]
            # Photos keep their path under "photo"; every other media type uses "file".
            src_path = message.get("photo", "") if msg_type == "photo" else message.get("file", "")
            # Only set sticker emoji as msg_content if STICKER is in include_type
            if (
                msg_type == "sticker"
                and DataModality.STICKER in self.config.include_type
                and not msg_content.strip()
            ):
                msg_content = message.get("sticker_emoji", "")

        elif "photo" in message:
            msg_type = "photo"
            src_path = message["photo"]

        elif "file" in message:
            msg_type = "file"
            src_path = message["file"]
            if not msg_content.strip():
                msg_content = message.get("file_name", "")

        elif "location_information" in message:
            msg_type = "location"
            loc = message["location_information"]
            src_path = f"lat:{loc.get('latitude', 0)},lng:{loc.get('longitude', 0)}"
            if not msg_content.strip():
                msg_content = message.get("place_name", "") + message.get("address", "")

        type_name = self.type_mapping.get(msg_type)
        if type_name is None:
            # A new or unexpected media type must not abort the whole export.
            logger.warning(f"Unknown Telegram media type '{msg_type}', treated as 'unknown'")
            type_name = "unknown"

        return type_name, msg_content.strip(), src_path, is_forward

    def extract_text_content(self, text_field) -> str:
        """
        Flatten a Telegram ``text`` field into a plain string.

        The field is either a string or a list mixing plain strings and entity dicts
        carrying a ``text`` key; list items of any other shape are ignored, and any
        other field type yields an empty string. Backslash-quote sequences are removed
        from the result.
        """
        if isinstance(text_field, str):
            content = text_field
        elif isinstance(text_field, list):
            parts = []
            for item in text_field:
                if isinstance(item, str):
                    parts.append(item)
                elif isinstance(item, dict) and "text" in item:
                    parts.append(item["text"])
            content = "".join(parts)
        else:
            content = ""

        return content.replace('\\"', "")

    def determine_sender_type(self, from_id: str) -> int:
        """Return 1 when ``from_id`` equals the configured own user id, otherwise 0."""
        return 1 if from_id == self.my_user_id else 0

    def process_message(self, message: Dict) -> List[ChatMessage]:
        """
        Process a single message, may return multiple messages (original message + extracted text message)

        Entries whose ``type`` is not "message" are ignored. A message whose ``date``
        cannot be parsed is skipped with a warning, because a ChatMessage cannot be
        built without a CreateTime.
        """
        if message.get("type") != "message":
            return []

        msg_id = message.get("id", 0)
        sender_name = message.get("from", "")
        from_id = message.get("from_id", "")
        date = message.get("date", "")

        type_name, msg_content, src_path, is_forward = self.get_message_type_and_content(message)

        try:
            dt = datetime.fromisoformat(date.replace("T", " ").replace("Z", ""))
            create_time = Timestamp(dt)
        except (AttributeError, TypeError, ValueError) as e:
            logger.warning(f"Time format conversion failed: {date}, error: {e}, message {msg_id} skipped")
            return []

        is_sender = self.determine_sender_type(from_id)
        self.message_counter += 1

        result_messages = []
        # Save messages with content or media files
        if msg_content.strip() or src_path.strip():
            original_msg = ChatMessage(
                id=self.message_counter,  # Use global counter as sequential ID
                MsgSvrID=msg_id,  # Telegram message ID
                type_name=type_name,
                is_sender=is_sender,  # 0: other party 1: myself
                talker=sender_name,
                msg=msg_content.replace("\n", " ").strip() if msg_content.strip() else f"{type_name}",
                src=src_path,
                CreateTime=create_time,
                is_forward=is_forward,
            )
            result_messages.append(original_msg)

        # If it's a non-pure text message but contains text field, create additional text message
        if type_name != "text" and "text" in message:
            text_content = self.extract_text_content(message["text"])
            if text_content.strip():
                self.message_counter += 1
                text_msg = ChatMessage(
                    id=self.message_counter,
                    MsgSvrID=msg_id,
                    type_name="text",
                    is_sender=is_sender,
                    talker=sender_name,
                    msg=text_content.replace("\n", " ").strip(),
                    src="",
                    CreateTime=create_time,
                    is_forward=is_forward,
                )
                result_messages.append(text_msg)

        return result_messages

    def process_chat(self, jdata: Dict) -> List[ChatMessage]:
        """
        Process chat data

        Parameters
        ----------
        jdata : Dict
            Telegram chat JSON object

        Returns
        -------
        List[ChatMessage]
            List of ChatMessage objects, each tagged with the chat name as room_name
        """
        chat_name = jdata.get("name", "Unknown Chat")

        chat_messages = []
        for message in jdata.get("messages", []):
            chat_messages.extend(self.process_message(message))

        for msg in chat_messages:
            msg.room_name = chat_name

        logger.info(f"Chat '{chat_name}' parsing completed, {len(chat_messages)} messages in total")
        return chat_messages

    def to_csv(self, chat_messages: List[ChatMessage], output_file: str):
        """
        Save ChatMessage list to CSV file

        Parameters
        ----------
        chat_messages : List[ChatMessage]
            List of ChatMessage objects; nothing is written when it is empty
        output_file : str
            Output CSV file path; missing parent directories are created
        """
        if not chat_messages:
            logger.warning("No messages to save")
            return

        fieldnames = [
            "id",
            "MsgSvrID",
            "type_name",
            "is_sender",
            "talker",
            "room_name",
            "msg",
            "src",
            "CreateTime",
            "is_forward",
        ]

        # A bare file name has an empty dirname, and os.makedirs("") raises.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file, "w", encoding="utf-8", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows({name: getattr(msg, name) for name in fieldnames} for msg in chat_messages)

        logger.info(f"CSV file saved: {output_file}")

    def copy_received_images(
        self, chat_messages: List[ChatMessage], base_path: str = "", target_dir: str = "dataset/media/images"
    ):
        """
        Copy all images with is_sender=0 to specified directory

        Parameters
        ----------
        chat_messages : List[ChatMessage]
            Messages to scan; only type_name "image" with a non-empty src is considered
        base_path : str
            Directory that relative ``src`` paths are resolved against, if given
        target_dir : str
            Destination directory; files keep their base name, so an existing file
            with the same name is overwritten
        """
        os.makedirs(target_dir, exist_ok=True)

        copied_count = 0
        skipped_count = 0

        for msg in chat_messages:
            if msg.is_sender != 0 or msg.type_name != "image" or not msg.src:
                continue

            full_src_path = os.path.join(base_path, msg.src) if base_path else msg.src
            normalized_src = full_src_path.replace("\\", "/")
            if not os.path.exists(normalized_src):
                logger.warning(f"Source file does not exist: {normalized_src}")
                skipped_count += 1
                continue

            target_path = os.path.join(target_dir, os.path.basename(normalized_src))
            shutil.copy2(normalized_src, target_path)
            copied_count += 1

        logger.info(f"Image copying completed: successful {copied_count}, skipped {skipped_count}")


def process_telegram_dataset(config: WCMakeDatasetConfig) -> None:
    """
    Process Telegram dataset, traverse all folders under dataset/telegram
    Create corresponding folders for each telegram folder under dataset/csv

    Any existing content of dataset/csv is removed first. Sub-folders that do not
    contain a result.json are skipped with a warning. The process exits with status 1
    when the Telegram configuration (telegram_args.my_id) is missing.

    Parameters
    ----------
    config : WCMakeDatasetConfig
        Dataset configuration, contains telegram_args.my_id for determining sender
    """
    telegram_dir = "dataset/telegram"
    csv_output_dir = "dataset/csv"

    if not os.path.exists(telegram_dir):
        logger.error(f"Telegram data directory does not exist: {telegram_dir}")
        return

    if not config.telegram_args or not config.telegram_args.my_id:
        logger.error("Telegram configuration missing, cannot process Telegram dataset")
        sys.exit(1)

    # Start from an empty output directory so stale CSVs from earlier runs are not mixed in.
    if os.path.exists(csv_output_dir):
        for item in os.listdir(csv_output_dir):
            item_path = os.path.join(csv_output_dir, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            else:
                os.remove(item_path)

    for folder_name in os.listdir(telegram_dir):
        folder_path = os.path.join(telegram_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        json_path = os.path.join(folder_path, "result.json")
        if not os.path.isfile(json_path):
            # Not an export folder (or an incomplete one); do not abort the other chats.
            logger.warning(f"Folder '{folder_name}' has no result.json, skipped")
            continue

        with open(json_path, "r", encoding="utf-8") as file:
            jdata = json.load(file)

        # Build a filesystem-safe folder name from the chat name, type and id.
        safe_parts = [
            "".join(c for c in str(jdata.get(key, "unknown")) if c.isalnum() or c in "._-")
            for key in ("name", "type", "id")
        ]
        csv_folder_name = "-".join(safe_parts)
        csv_folder_path = os.path.join(csv_output_dir, csv_folder_name)

        parser = TelegramChatParser(config=config)
        messages = parser.process_chat(jdata)

        if messages:
            csv_file_path = os.path.join(csv_folder_path, f"{csv_folder_name}.csv")
            parser.to_csv(messages, csv_file_path)
            parser.copy_received_images(messages, folder_path)
        else:
            logger.warning(f"Folder '{folder_name}' has no valid messages")


================================================
FILE: weclone/data/clean/__init__.py
================================================


================================================
FILE: weclone/data/clean/strategies.py
================================================
import json
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, cast

import pandas as pd
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm

from weclone.core.inference.online_infer import OnlineLLM
from weclone.data.models import QaPair, QaPairScore, QaPairScoreWithId
from weclone.prompts.clean_data import CLEAN_PROMPT
from weclone.utils.config_models import WCMakeDatasetConfig
from weclone.utils.log import logger


@dataclass
class CleaningStrategy(ABC):
    """Abstract base class for data cleaning strategies, but provides common cleaning methods"""

    # Dataset configuration; clean() reads dataset, dataset_dir and
    # clean_dataset.llm.accept_score from it.
    make_dataset_config: WCMakeDatasetConfig

    @abstractmethod
    def judge(self, data: List[QaPair]) -> None:
        """
        Scoring method, needs to be implemented by subclasses.
        """
        pass

    def clean(self) -> str:
        """
        Filter SFT data based on score and return the final dataset name to use.

        Items of the original dataset whose "score" (0 when absent) is at least the
        configured accept_score are written to the "<dataset>-cleaned" file.

        Returns
        -------
        str
            The cleaned dataset name on success. The original dataset name when
            dataset_info.json lacks a file_name for either dataset, when nothing
            passes the filter, or when reading/writing the data files fails.
        """
        config = self.make_dataset_config
        original_dataset_name = config.dataset
        cleaned_dataset_name = original_dataset_name + "-cleaned"

        dataset_dir = config.dataset_dir
        dataset_info_path = os.path.join(dataset_dir, "dataset_info.json")

        with open(dataset_info_path, "r", encoding="utf-8") as f:
            info = json.load(f)

        file_names = {
            name: (info.get(name) or {}).get("file_name")
            for name in (original_dataset_name, cleaned_dataset_name)
        }
        # os.path.join(dataset_dir, None) would raise TypeError; report the real cause instead.
        missing = [name for name, file_name in file_names.items() if not file_name]
        if missing:
            logger.error(
                f"Error occurred during data cleaning, will use original dataset: "
                f"no file_name for {missing} in {dataset_info_path}"
            )
            return original_dataset_name

        original_data_path = os.path.join(dataset_dir, file_names[original_dataset_name])
        cleaned_data_path = os.path.join(dataset_dir, file_names[cleaned_dataset_name])

        try:
            with open(original_data_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            accept_score = config.clean_dataset.llm.accept_score
            filtered_data = [item for item in data if item.get("score", 0) >= accept_score]

            if not filtered_data:
                logger.warning("No data retained after cleaning, will use original dataset.")
                return original_dataset_name

            with open(cleaned_data_path, "w", encoding="utf-8") as f:
                json.dump(filtered_data, f, ensure_ascii=False, indent=2)
            logger.success(
                f"Filtered data below {accept_score} score, retained {len(filtered_data)} items, saved to {cleaned_data_path}"
            )
            return cleaned_dataset_name

        except Exception as e:
            # Best effort by design: any failure falls back to the unfiltered dataset.
            logger.error(f"Error occurred during data cleaning, will use original dataset: {e}")
            return original_dataset_name


@dataclass
class LLMCleaningStrategy(CleaningStrategy):
    """Strategy for data cleaning using large language models"""

    make_dataset_config: WCMakeDatasetConfig

    def judge(self, data: List[QaPair]) -> None:
        """
        Call LLM for scoring and directly assign scores to the input QaPair.

        Pairs that carry images are not sent to the model and receive the fixed score 6.
        Text-only pairs receive the parsed model score, or 0 when the model output
        could not be parsed. The score distribution is logged at the end.

        Raises
        ------
        AssertionError
            If the inference results do not line up with the text-only pairs. The check
            runs before any score is assigned and is not disabled by ``python -O``.
        """
        from weclone.core.inference.offline_infer import vllm_infer

        logger.info("Starting LLM scoring of data")
        config = self.make_dataset_config
        prompt_template = PromptTemplate.from_template(CLEAN_PROMPT)
        role_prefix = {"user": "Q", "assistant": "A"}

        text_only_pairs = []
        inputs = []
        for qa in data:
            if qa.images:
                qa.score = 6
                continue
            messages_str = "".join(
                f"{role_prefix[msg.role]}: {msg.content}\n" for msg in qa.messages if msg.role in role_prefix
            )
            prompt_value = prompt_template.invoke({"id": qa.id, "messages": messages_str.strip()})
            inputs.append(prompt_value.to_string())
            text_only_pairs.append(qa)

        if inputs:
            parsed_scores, failed_indexs = vllm_infer(
                inputs,
                config.model_name_or_path,
                template=config.template,
                temperature=0,
                guided_decoding_class=QaPairScore,
                repetition_penalty=1.1,
                enable_thinking=config.clean_dataset.llm.enable_thinking,
                cutoff_len=config.messages_max_length + 1024,  # add prompt length
                max_new_tokens=1024 if config.clean_dataset.llm.enable_thinking else 200,
            )
        else:
            # Nothing to score: do not run inference for an empty batch.
            parsed_scores, failed_indexs = [], []

        parsed_list = cast(List[QaPairScore | None], parsed_scores)

        # Sanity checks, done up front so a misaligned result never assigns scores to
        # the wrong pairs. Raised explicitly because assert statements vanish under -O.
        if len(parsed_list) != len(text_only_pairs):
            raise AssertionError(
                f"Mismatch: len(parsed_scores)({len(parsed_list)}) != non_image_count({len(text_only_pairs)})"
            )
        failed_count = sum(1 for parsed_item in parsed_list if parsed_item is None)
        if failed_count != len(failed_indexs):
            raise AssertionError(f"Mismatch: failed_count({failed_count}) != failed_indexs({len(failed_indexs)})")

        # Results are positional: the i-th result belongs to the i-th text-only pair.
        for qa, parsed_item in zip(text_only_pairs, parsed_list):
            qa.score = 0 if parsed_item is None else parsed_item.score

        scores = [qa.score for qa in data if qa.score is not None]
        score_series = pd.Series(scores)
        score_counts = score_series.value_counts().sort_index()
        score_percentages = score_series.value_counts(normalize=True).sort_index() * 100
        pd.set_option("display.unicode.east_asian_width", True)  # Try to fix alignment issues
        distribution_df = pd.DataFrame(  # Merge count and percentage into one DataFrame for printing
            {
                "Count": score_counts,
                "Percentage(%)": score_percentages.round(2),
            }
        )
        distribution_df.index.name = "Score"  # Add column name for the first column: Score
        printable_df_str = distribution_df.reset_index().to_string(index=False)
        logger.success(f"LLM scoring distribution:\n{printable_df_str}")


@dataclass
class OlineLLMCleaningStrategy(CleaningStrategy):
    """Strategy for data cleaning using an online (API-served) large language model"""

    # TODO: images clean support
    def judge(self, data: List[QaPair]) -> None:
        """
        Score the QaPairs with the online model and assign the scores in place.

        Pairs that carry images are not sent to the model and keep the fixed score 6.
        Text-only pairs are sent in batches of ``clean_batch_size``; results are matched
        back by the ``id`` the model returns, and a pair without a result gets score 0.
        A failing batch is logged and skipped. The score distribution is logged at the end.
        """
        config = self.make_dataset_config
        logger.info("Starting online model scoring of data")
        logger.info(f"Using model {config.model_name}")

        prompt_template = PromptTemplate.from_template(CLEAN_PROMPT)
        role_prefix = {"user": "Q", "assistant": "A"}

        text_only_pairs = []
        inputs = []
        for qa in data:
            if qa.images:
                qa.score = 6
                continue
            messages_str = "".join(
                f"{role_prefix[msg.role]}: {msg.content}\n" for msg in qa.messages if msg.role in role_prefix
            )
            prompt_value = prompt_template.invoke({"id": qa.id, "messages": messages_str.strip()})
            inputs.append(prompt_value.to_string())
            text_only_pairs.append(qa)

        clean_batch_size = config.clean_batch_size
        all_parsed_scores = []

        # The context manager closes the client (and its worker pool) even on errors.
        with OnlineLLM(
            api_key=config.llm_api_key,
            base_url=config.base_url,
            model_name=config.model_name,
            max_workers=config.clean_batch_size + 5,
        ) as client:
            for i in tqdm(range(0, len(inputs), clean_batch_size), desc="Online model scoring progress"):
                batch = inputs[i : i + clean_batch_size]

                try:
                    parsed_results, _ = client.chat_batch(
                        batch, temperature=0, guided_decoding_class=QaPairScoreWithId
                    )

                    for j, parsed_result in enumerate(parsed_results):
                        if parsed_result is not None:
                            all_parsed_scores.append(parsed_result)
                        else:
                            logger.warning(f"Failed to parse result for batch item at index {i + j}")

                except Exception as e:
                    logger.error(
                        f"Failed to call online model or parse result for batch starting at index {i}, error: {str(e)}"
                    )

        score_map = {score.id: score.score for score in all_parsed_scores}
        # Only text-only pairs were sent to the model; image pairs must keep their
        # fixed score instead of being reset to 0 for "missing" results.
        for qa in text_only_pairs:
            if qa.id in score_map:
                qa.score = score_map[qa.id]
            else:
                logger.warning(f"No score obtained for QA ID {qa.id}, default assigned 0")
                qa.score = 0

        scores = [qa.score for qa in data if qa.score is not None]
        score_series = pd.Series(scores)
        score_counts = score_series.value_counts().sort_index()
        score_percentages = score_series.value_counts(normalize=True).sort_index() * 100
        pd.set_option("display.unicode.east_asian_width", True)
        distribution_df = pd.DataFrame(
            {
                "Count": score_counts,
                "Percentage(%)": score_percentages.round(2),
            }
        )
        distribution_df.index.name = "Score"
        printable_df_str = distribution_df.reset_index().to_string(index=False)
        logger.success(f"Online model scoring distribution:\n{printable_df_str}")


================================================
FILE: weclone/data/models.py
================================================
from dataclasses import dataclass
from typing import Optional

from pandas import Timestamp
from pydantic import BaseModel, Field

from weclone.utils.config_models import DataModality
from weclone.utils.i18n import MultiLangList


@dataclass
class ChatMessage:
    """One chat message in the normalized form produced by the chat parsers (e.g. TelegramChatParser)."""

    id: int  # sequential id
    # NOTE(review): TelegramChatParser passes the raw Telegram `id` value here without
    # converting it to str — confirm which type consumers expect.
    MsgSvrID: str  # original message id from platform
    type_name: str  # message type, refer to cut_type_data and skip_type_data
    is_sender: int  # 0: other party, 1: self
    talker: str  # message sender
    msg: str  # message content
    src: str  # media file path, additional info field
    CreateTime: Timestamp  # message send time
    room_name: Optional[str] = None  # chat room name
    is_forward: bool = False  # whether it's a forwarded message
    modality: Optional[DataModality] = None  # message modality, set in qa_generator.py


@dataclass
class CutMessage:
    """Record holding only sender flag, cut type and time.

    NOTE(review): presumably stands in for a message whose type is in cut_type_data —
    confirm against its use in qa_generator.py.
    """

    is_sender: int  # same 0/1 convention as ChatMessage.is_sender — TODO confirm
    cut_type: str  # presumably the type_name that caused the cut — TODO confirm
    CreateTime: Timestamp  # timestamp associated with this record


@dataclass
class Message:
    """One turn of a QaPair conversation."""

    role: str  # speaker role; the cleaning strategies look for "user" and "assistant"
    content: str  # text of the turn


@dataclass
class QaPair:
    """A conversation sample (list of Message turns) together with its quality score."""

    id: int  # identifier; passed into the cleaning prompt and used to match online scores
    time: Timestamp  # timestamp of the sample — exact meaning not visible here, TODO confirm
    score: int  # quality score, assigned in place by CleaningStrategy.judge implementations
    messages: list[Message]  # conversation turns
    images: list[str]  # image references; a non-empty list makes the LLM judges skip scoring
    system: str  # presumably the system prompt for this sample — TODO confirm


class QaPairScore(BaseModel):
    """Schema of the score the judging model must return; passed as guided_decoding_class."""

    # Validated to lie in the inclusive range 1..5.
    score: int = Field(ge=1, le=5)


class QaPairScoreWithId(QaPairScore):
    """QaPairScore plus the id of the scored pair; the online strategy matches results by this id."""

    # Compared against QaPair.id when scores are mapped back onto the data.
    id: int


# Message type names of the "cut" category, per language. The values are compared with
# ChatMessage.type_name (see the comment on that field).
# NOTE(review): the two lists look index-aligned (entry i of "zh_CN" corresponds to entry i
# of "en"); keep them in the same order when editing — confirm MultiLangList relies on it.
cut_type_data = {
    "zh_CN": [
        "cut",
        "Cut",
        "图片",
        "视频",
        "合并转发的聊天记录",
        "语音",
        "(分享)音乐",
        "(分享)卡片式链接",
        "(分享)笔记",
        "(分享)小程序",
        "(分享)收藏夹",
        "(分享)视频号名片",
        "(分享)视频号视频",
        "粘贴的文本",  # Unparseable share link
        "未知",
    ],
    "en": [
        "cut",
        "Cut",
        "image",
        "video",
        "merged forward chat records",
        "voice",
        "(share) music",
        "(share) card link",
        "(share) note",
        "(share) mini program",
        "(share) favorites",
        "(share) video account card",
        "(share) video account video",
        "pasted text",  # Unparseable share link
        "unknown",
    ],
}

# Language-aware view over cut_type_data; English is used when no language is requested.
cut_type_list = MultiLangList(cut_type_data, default_lang="en")


# Message type names of the "skip" category, per language. The values are compared with
# ChatMessage.type_name (see the comment on that field).
# NOTE(review): the two lists look index-aligned (entry i of "zh_CN" corresponds to entry i
# of "en"); keep them in the same order when editing — confirm MultiLangList relies on it.
skip_type_data = {
    "zh_CN": [
        "添加好友",
        "推荐公众号",
        "动画表情",
        "用户上传的GIF表情",
        "位置",
        "文件",
        "位置共享",
        "引用回复",
        "群公告",
        "转账",
        "语音通话",
        "系统通知",
        "消息撤回",
        "拍一拍",
        "邀请加群",
    ],
    "en": [
        "add friend",
        "recommend official account",
        "sticker",
        "sticker2",
        "location",
        "file",
        "location sharing",
        "reply with quote",
        "group announcement",
        "transfer",
        "voice call",
        "system notification",
        "message recall",
        "pat pat",
        "invite to group",
    ],
}

# Language-aware view over skip_type_data; English is used when no language is requested.
skip_type_list = MultiLangList(skip_type_data, default_lang="en")

# Starts empty; how it is filled or consumed is not visible in this module.
unprocessed_type_list = []


================================================
FILE: weclone/data/qa_generator.py
================================================
import json
import os
import re
import subprocess  # nosec
import sys
from typing import List, Union, cast

os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

import pandas as pd
from pandas import Timestamp

from weclone.core.PII.pii_detector import ChinesePIIDetector, PIIDetector
from weclone.data.chat_parsers.telegram_parser import process_telegram_dataset
from weclone.data.clean.strategies import LLMCleaningStrategy, OlineLLMCleaningStrategy
from weclone.data.models import (
    ChatMessage,
    CutMessage,
    Message,
    QaPair,
    cut_type_list,
    skip_type_list,
)
from weclone.data.strategies import TimeWindowStrategy
from weclone.data.utils import ImageToTextProcessor, check_image_file_exists
from weclone.utils.config import load_config
from weclone.utils.config_models import DataModality, LanguageType, PlatformType, WCMakeDatasetConfig
from weclone.utils.log import logger


class DataProcessor:
    def __init__(self):
        """Load the ``make_dataset`` configuration and build every processing component.

        In order, this prepares: the platform-specific cut/skip message type
        lists, the blocked word list (configuration plus the optional
        ``./dataset/blocked_words.json``), the message-combining and QA-matching
        strategies, the PII detector, the optional dataset cleaning strategy and
        the optional image-to-text processor.

        The process exits with status 1 when local LLM cleaning is requested but
        vLLM is not available.
        """
        self.config = cast(WCMakeDatasetConfig, load_config(arg_type="make_dataset"))
        self.csv_folder = "./dataset/csv"
        self.system_prompt = self.config.default_system
        self.enable_clean = self.config.clean_dataset.enable_clean

        # message type
        self.QaPair = QaPair

        self.include_type = self.config.include_type
        if self.config.platform == PlatformType.CHAT:
            self.cut_type_list = cut_type_list.get_items(lang="zh_CN")
            self.skip_type_list = skip_type_list.get_items(lang="zh_CN")
            # "text" is always processed, so it never takes part in the cut list.
            self.include_type = cut_type_list.translate_batch(
                texts=[t for t in self.include_type if t.lower() != "text"]
            )
            # Types the user asked to include must not trigger a cut.
            self.cut_type_list = [t for t in self.cut_type_list if t not in self.include_type]
        elif self.config.platform == PlatformType.TELEGRAM:
            self.cut_type_list = cut_type_list.get_items(lang="en")
            self.skip_type_list = skip_type_list.get_items(lang="en")
            self.include_type = [t for t in self.include_type if t.lower() != "text"]
            self.cut_type_list = [t for t in self.cut_type_list if t not in self.include_type]
            if DataModality.STICKER in self.include_type:
                # Build a new list instead of calling list.remove(): this leaves the
                # list object returned by get_items() untouched and cannot raise
                # ValueError when "sticker" is not present.
                self.skip_type_list = [t for t in self.skip_type_list if t != "sticker"]

        # blocked words
        config_blocked_words = self.config.blocked_words
        file_blocked_words = []
        try:
            with open("./dataset/blocked_words.json", encoding="utf-8") as f:
                file_blocked_words = json.load(f).get("blocked_words", [])
        except FileNotFoundError:
            # The file is optional; the configured words are used on their own.
            pass
        except json.JSONDecodeError as e:
            # Keep going (best effort), but say so: a malformed file means its
            # blocked words are NOT being filtered out of the dataset.
            logger.warning(f"Ignoring ./dataset/blocked_words.json because it is not valid JSON: {e}")

        self.blocked_words = list(set(config_blocked_words + file_blocked_words))
        # logger.info(f"Chat record blocked words: {self.blocked_words}")

        # combine strategy (the configured windows are multiplied by 60 before use)
        if self.config.single_combine_strategy == "time_window":
            self.single_combine_strategy = TimeWindowStrategy(
                time_window=self.config.single_combine_time_window * 60,
                is_single_chat=True,
            )

        if self.config.qa_match_strategy == "time_window":
            self.qa_match_strategy = TimeWindowStrategy(
                time_window=self.config.qa_match_time_window * 60,
                is_single_chat=False,
            )

        # PII detection
        if self.config.language == LanguageType.ZH:
            self.pii_detector = ChinesePIIDetector()
        else:
            self.pii_detector = PIIDetector(language=self.config.language)

        # dataset cleaning
        clean_dataset_config = self.config.clean_dataset

        if self.enable_clean:
            if clean_dataset_config.clean_strategy == "llm":
                if self.config.online_llm_clear:
                    self.clean_strategy = OlineLLMCleaningStrategy(make_dataset_config=self.config)
                else:
                    # Imported lazily: only the local (vLLM) cleaning path needs it.
                    from llamafactory.extras.packages import is_vllm_available

                    if not is_vllm_available():
                        logger.error("vLLM is not available, dataset cleaning is not supported.")
                        sys.exit(1)
                    else:
                        self.clean_strategy = LLMCleaningStrategy(make_dataset_config=self.config)

        # Image-to-text is only enabled when it is switched on AND an API key is set.
        vision_config = self.config.vision_api
        if vision_config.enable and vision_config.api_key:
            self.image_processor = ImageToTextProcessor(
                api_url=vision_config.api_url,  # type: ignore
                api_key=vision_config.api_key,  # type: ignore
                model_name=vision_config.model_name,  # type: ignore
                config=self.config,
            )
            logger.info(f"ImageToText functionality enabled, model: {self.image_processor.model_name}")
        else:
            self.image_processor = None

        # Short alias for self.config used by the other methods.
        self.c = self.config

        # Chat folder name -> relation string, filled lazily by load_file().
        self.relations = {}

    def main(self):
        """Run the full pipeline: parse, load, group, match, enrich, clean and save.

        Exits with status 1 when the CSV folder is missing or empty.
        """
        self.pre_parse_chat_dataset()

        csv_folder_has_content = os.path.exists(self.csv_folder) and os.listdir(self.csv_folder)
        if not csv_folder_has_content:
            logger.error(
                f"Error: Directory '{self.csv_folder}' does not exist or is empty. Please check the path and ensure it contains CSV chat data files."
            )
            sys.exit(1)

        csv_files = self.get_csv_files()
        logger.info(f"Found {len(csv_files)} CSV files in total, starting processing, please be patient...")

        # Every file is loaded and grouped on its own; QA matching then runs once
        # over the concatenation of all grouped messages.
        message_list: List[ChatMessage] = []
        for csv_file in csv_files:
            logger.debug(f"Starting to process CSV file: {csv_file}")
            chat_messages = self.load_file(csv_file)
            message_list += self.group_consecutive_messages(messages=chat_messages)
            logger.debug(f"Processing completed: {csv_file}, loaded {len(chat_messages)} messages in total")

        qa_res = [pair for pair in self.match_qa(messages=message_list) if isinstance(pair, QaPair)]

        if self.image_processor:
            logger.info("Starting image recognition process...")
            qa_res = self.image_processor._process_images_in_parallel(qa_res)
            logger.info("Image recognition process completed.")

        if self.enable_clean:
            self.clean_strategy.judge(qa_res)  # type: ignore

        self.save_result(qa_res)
        self._execute_length_cdf_script()

        logger.success(
            f"Chat record processing successful, obtained {len(qa_res)} data entries in total, saved to ./dataset/res_csv/sft/sft-my.json"
        )

    def pre_parse_chat_dataset(self):
        """Run the Telegram pre-processing step; other platforms need none here."""
        if self.c.platform != PlatformType.TELEGRAM:
            return
        process_telegram_dataset(self.config)

    def _execute_length_cdf_script(self):
        """Run ``weclone/utils/length_cdf.py`` as a child process to calculate cutoff_len.

        The child inherits this process's stdout/stderr. Failures are logged and
        never raised to the caller.
        """
        try:
            cmd = [
                sys.executable,
                os.path.join("weclone", "utils", "length_cdf.py"),
                f'--model_name_or_path="{self.c.model_name_or_path}"',
                f'--dataset="{self.c.dataset}"',
                f'--dataset_dir="{self.c.dataset_dir}"',
                f'--template="{self.c.template}"',
                "--interval=512",
            ]

            # Optional settings are forwarded only when present and non-empty.
            media_dir = getattr(self.c, "media_dir", None)
            if media_dir:
                cmd.append(f'--media_dir="{media_dir}"')
            image_max_pixels = getattr(self.c, "image_max_pixels", None)
            if image_max_pixels:
                cmd.append(f'--image_max_pixels="{image_max_pixels}"')

            # Copy of the current environment with two overrides for the child.
            child_env = {
                **os.environ,
                "CUDA_VISIBLE_DEVICES": "0",
                "LLAMAFACTORY_VERBOSITY": "ERROR",
            }

            return_code = subprocess.Popen(
                cmd,
                env=child_env,
                stdout=None,  # None: the child writes straight to the parent's stdout
                stderr=None,
                text=True,
                bufsize=1,
            ).wait()  # nosec
            if return_code != 0:
                logger.error(f"Command '{' '.join(cmd)}' execution failed with return code {return_code}")
        except FileNotFoundError:
            logger.error(f"Command execution failed: executable '{cmd[0]}' or script '{cmd[1]}' not found")
        except KeyError as e:
            logger.error(f"Failed to execute length_cdf.py script: missing configuration item {str(e)}")
        except Exception as e:
            logger.error(f"Unknown error occurred while executing length_cdf.py script: {str(e)}")

    def get_csv_files(self):
        """Traverse the folder to get all CSV file paths and sort by starting sequence number in filename"""

        csv_files = []
        for chat_obj_folder in os.listdir(self.csv_folder):
            chat_obj_folder_path = os.path.join(self.csv_folder, chat_obj_folder)
            for csvfile in os.listdir(chat_obj_folder_path):
                if not csvfile.endswith(".csv"):
                    continue
                csvfile_path = os.path.join(chat_obj_folder_path, csvfile)
                csv_files.append(csvfile_path)
        pattern = re.compile(r"_(\d+)_\d+\.csv$")

        def extract_start(fp: str) -> int:
            name = os.path.basename(fp)
            m = pattern.search(name)
            return int(m.group(1)) if m else 0

        csv_files.sort(key=extract_start)
        return csv_files

    def match_qa(self, messages: List[ChatMessage]) -> List[Union[QaPair, CutMessage]]:
        """
        Match question-answer pairs

        Walks the message list with a two-state machine (waiting for an
        instruction from the other party / waiting for the own response) and
        accumulates user/assistant turns into multi-turn conversations. A
        conversation is flushed into a QaPair when a CutMessage is met, when
        ``self.qa_match_strategy`` says a message no longer belongs to the same
        conversation, or when the input ends.

        Args:
            messages: Message list

        Returns:
            List[Union[QaPair, CutMessage]]: List of Q&A pairs containing instructions and outputs
            (only QaPair objects are appended by this method)

        Raises:
            ValueError: If a response is about to be paired while no instruction is pending.
        """
        WAITING_INSTRUCTION = "waiting_instruction"
        WAITING_RESPONSE = "waiting_response"

        current_state = WAITING_INSTRUCTION
        qa_res: List[Union[QaPair, CutMessage]] = []
        last_message = None  # Last message accepted into the flow; reference point for the time window
        current_instruction = None  # Pending message from the other party that still awaits a reply
        qa_id_counter = 0

        # Accumulators for the conversation currently being built
        conversation_messages: List[Message] = []
        conversation_images: List[str] = []
        conversation_talker = ""

        def _calculate_qa_length(
            messages: List[Message], new_user_content: str, new_assistant_content: str
        ) -> int:
            """Calculate total character length of messages plus new messages"""
            total_length = 0
            for msg in messages:
                total_length += len(msg.content)
            total_length += len(new_user_content) + len(new_assistant_content)
            return total_length

        def _save_current_qa_pair(
            qa_id: int,
            time_stamp: Timestamp,
            current_conversation_messages: List[Message],
            current_conversation_images: List[str],
            talker: str = "",
        ) -> int:
            """Helper function to save the current QA pair.

            Returns the id to use for the next pair: ``qa_id + 1`` when a pair
            was appended to ``qa_res``, the unchanged ``qa_id`` when the
            conversation was skipped.
            """
            nonlocal qa_res  # Allow modification of qa_res from the outer scope

            total_length = _calculate_qa_length(current_conversation_messages, "", "")

            if total_length <= self.config.messages_max_length:
                # Skip conversations that carry more images than allowed
                if len(current_conversation_images) > self.config.max_image_num:
                    logger.warning(
                        f"QA pair (potential id {qa_id}) with timestamp {time_stamp} "
                        f"has too many images ({len(current_conversation_images)} > {self.config.max_image_num}) "
                        "and will be skipped."
                    )
                    return qa_id

                # Skip a conversation that consists only of the "<begin_chat>" placeholder and one own message
                if (
                    len(current_conversation_messages) == 2
                    and current_conversation_messages[0].role == "user"
                    and current_conversation_messages[0].content == "<begin_chat>"
                ):
                    return qa_id

                system_content = self.system_prompt
                if self.c.add_time:
                    # Appended text means "the current time is <month-day hour:minute>"
                    system_content += f"\n 现在时间是{time_stamp.strftime('%m-%d %H:%M')}"
                if self.c.add_relation and talker:
                    relation = self.relations.get(talker, "")
                    if relation:
                        # Appended text means "the other party is your <relation>, you are chatting"
                        system_content += f"\n 对方是你的{relation}，你们正在聊天"

                # Work on a shallow copy so the caller's list keeps its original Message objects
                processed_messages = current_conversation_messages.copy()
                for i in range(len(processed_messages) - 1):
                    if (
                        processed_messages[i].role == "user"
                        and "<begin_chat>" in processed_messages[i].content
                        and i + 1 < len(processed_messages)
                        and processed_messages[i + 1].role == "assistant"
                    ):
                        # Embed the following assistant reply into the placeholder
                        # (the inserted text means "you should say: <reply>")
                        assistant_content = processed_messages[i + 1].content
                        processed_messages[i] = Message(
                            role="user",
                            content=processed_messages[i].content.replace(
                                "<begin_chat>", f"<begin_chat>你应该说：{assistant_content}</begin_chat>"
                            ),
                        )

                qa_pair = self.QaPair(
                    id=qa_id,
                    time=time_stamp,
                    score=0,
                    messages=processed_messages,
                    images=current_conversation_images.copy(),
                    system=system_content,
                )
                qa_res.append(qa_pair)
                return qa_id + 1
            else:
                logger.warning(
                    f"QA pair (potential id {qa_id}) with timestamp {time_stamp} "
                    f"exceeds max length ({total_length} > {self.config.messages_max_length}) "
                    "and will be skipped."
                )
                return qa_id

        for msg in messages:
            if isinstance(msg, CutMessage):
                # When encountering CutMessage, save current conversation and reset state
                if conversation_messages:
                    qa_id_counter = _save_current_qa_pair(
                        qa_id_counter,
                        last_message.CreateTime if last_message else msg.CreateTime,
                        conversation_messages,
                        conversation_images,
                        conversation_talker,
                    )
                # Reset state
                current_state = WAITING_INSTRUCTION
                current_instruction = None
                last_message = None
                conversation_messages = []
                conversation_images = []
                conversation_talker = ""
                continue

            if current_state == WAITING_INSTRUCTION:
                if msg.is_sender == 0:  # Received message from other party
                    if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg):
                        # If not the same conversation and there is a previous message, save the previous conversation
                        if conversation_messages:
                            qa_id_counter = _save_current_qa_pair(
                                qa_id_counter,
                                last_message.CreateTime,
                                conversation_messages,
                                conversation_images,
                                conversation_talker,
                            )
                            conversation_messages = []
                            conversation_images = []

                    # Regardless of whether a new conversation has just been started, this 'msg' now becomes the current instruction.
                    current_instruction = msg
                    last_message = msg
                    conversation_talker = msg.talker
                    current_state = WAITING_RESPONSE
                elif msg.is_sender == 1:  # Own message as first message
                    if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg):
                        if conversation_messages:
                            qa_id_counter = _save_current_qa_pair(
                                qa_id_counter,
                                last_message.CreateTime,
                                conversation_messages,
                                conversation_images,
                                conversation_talker,
                            )
                            conversation_messages = []
                            conversation_images = []

                    # No instruction precedes this own message: pair it with a "<begin_chat>" placeholder user turn.
                    # The state stays WAITING_INSTRUCTION.
                    conversation_messages.append(Message(role="user", content="<begin_chat>"))
                    conversation_messages.append(Message(role="assistant", content=msg.msg))
                    last_message = msg

            elif current_state == WAITING_RESPONSE:
                if msg.is_sender == 0:  # Received message from other party
                    if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg):
                        if conversation_messages:
                            qa_id_counter = _save_current_qa_pair(
                                qa_id_counter,
                                last_message.CreateTime,
                                conversation_messages,
                                conversation_images,
                                conversation_talker,
                            )
                            conversation_messages = []
                            conversation_images = []
                    # The new message replaces the pending instruction; the previous, unanswered one is not added to the conversation
                    current_instruction = msg
                    last_message = msg
                    conversation_talker = msg.talker
                    # State remains unchanged
                else:  # Own message - use strategy to determine if it belongs to the same conversation
                    if last_message and self.qa_match_strategy.is_same_conversation([last_message], msg):
                        if current_instruction is None:
                            raise ValueError("current_instruction should not be None when creating a QA pair")

                        conversation_messages.append(Message(role="user", content=current_instruction.msg))
                        conversation_messages.append(Message(role="assistant", content=msg.msg))
                        # Collect the instruction's image sources; src may be a list or a single value
                        if hasattr(current_instruction, "src") and current_instruction.src:
                            if isinstance(current_instruction.src, list):
                                valid_images = [img_src for img_src in current_instruction.src if img_src]
                                if valid_images:
                                    conversation_images.extend(valid_images)
                            elif current_instruction.src:
                                conversation_images.append(current_instruction.src)
                        last_message = msg

                    # Regardless of whether it matches, reset state
                    # (a reply outside the conversation is dropped together with the pending instruction)
                    current_state = WAITING_INSTRUCTION
                    current_instruction = None

        # Process the last conversation
        if conversation_messages and last_message:
            qa_id_counter = _save_current_qa_pair(
                qa_id_counter,
                last_message.CreateTime,
                conversation_messages,
                conversation_images,
                conversation_talker,
            )

        return qa_res

    def group_consecutive_messages(self, messages: List[ChatMessage]) -> List[ChatMessage]:
        """
        Merge runs of consecutive messages from one sender into single messages.

        A run continues while the sender flag and talker stay the same and
        ``self.single_combine_strategy`` keeps the messages in one conversation.
        Messages whose type is in ``self.cut_type_list``, and image messages sent
        by oneself, are replaced by a CutMessage; two cut markers never follow
        each other directly and the result never starts with one.

        Args:
            messages: Message list

        Returns:
            List[ChatMessage]: Combined message list (may also contain CutMessage markers)
        """
        if not messages:
            return []

        # When the merged text already ends with one of these, no line break is inserted.
        sentence_endings = ("。", ".", "！", "!", "？", "?", "…", "，", ",")
        merged_output = []

        def _merge(batch: List[ChatMessage]) -> ChatMessage:
            """Build one ChatMessage out of a run of messages from the same sender."""
            first = batch[0]
            text = first.msg
            image_sources = [first.src] if first.modality == DataModality.IMAGE else []

            for follow_up in batch[1:]:
                piece = follow_up.msg
                if not piece:
                    continue
                if text and not text.endswith(sentence_endings):
                    text += "\n"
                if follow_up.modality == DataModality.IMAGE:
                    image_sources.append(follow_up.src)
                text += piece

            if len(text) > self.c.combine_msg_max_length:
                logger.warning(
                    f"Combined message length exceeds {self.c.combine_msg_max_length}, will truncate: {text[:50]}"
                )
                text = text[: self.c.combine_msg_max_length]
                # Keep only as many image sources as "<image>" tags survived the cut.
                surviving_tags = text.count("<image>")
                if len(image_sources) > surviving_tags:
                    image_sources = image_sources[:surviving_tags]

            return ChatMessage(
                id=first.id,
                MsgSvrID=first.MsgSvrID,
                type_name=first.type_name,
                is_sender=first.is_sender,
                talker=first.talker,
                room_name=first.room_name,
                msg=text,
                src=image_sources,  # type: ignore
                CreateTime=batch[-1].CreateTime,  # the merged message carries the time of its last part
                modality=first.modality,
                is_forward=first.is_forward,
            )

        def _cut_marker(source: ChatMessage) -> CutMessage:
            """Build the CutMessage that stands in for ``source``."""
            return CutMessage(
                is_sender=source.is_sender,
                cut_type=source.type_name,
                CreateTime=source.CreateTime,
            )

        def _flush(batch):
            """Append ``batch`` to the output, merged when it holds more than one message."""
            merged_output.append(_merge(batch) if len(batch) > 1 else batch[0])

        pending = []

        for message in messages:
            needs_cut = message.type_name in self.cut_type_list or (
                message.modality == DataModality.IMAGE and message.is_sender == 1
            )
            if needs_cut:
                if pending:
                    # Close the open run first, then mark the cut.
                    _flush(pending)
                    pending = []
                    merged_output.append(_cut_marker(message))
                elif merged_output and not isinstance(merged_output[-1], CutMessage):
                    # No open run: mark the cut unless the output is empty or already ends with one.
                    merged_output.append(_cut_marker(message))
                continue

            if not pending:
                pending = [message]
                continue

            previous = pending[-1]
            continues_run = (
                message.is_sender == previous.is_sender
                and message.talker == previous.talker
                and self.single_combine_strategy.is_same_conversation([previous], message)
            )
            if continues_run:
                pending.append(message)
            else:
                _flush(pending)
                pending = [message]

        # Whatever is still open at the end forms the final run.
        if pending:
            _flush(pending)

        return merged_output

    def process_by_msgtype(self, chat_message: ChatMessage):
        if chat_message.type_name.lower() in ["文本", "text"]:
            self.process_text(chat_message)
        # elif chat_message.modality == DataModality.IMAGE:
        #     self.process_image(chat_message)

    def load_file(self, file_path) -> List[ChatMessage]:
        """
        Load one chat CSV file and apply the first round of preprocessing.

        Steps: cache the chat's relation from ``users.json`` (if present), drop
        rows of skipped types and own forwarded messages, drop text rows that
        contain PII or a blocked word, then normalise the remaining non-text rows
        (GIFs become stickers, usable incoming images become ``<image>`` messages,
        other images become cuts, every other type gets an empty message).
        """
        chat_dir = os.path.dirname(file_path)
        chat_name = os.path.basename(chat_dir)

        # The relation is looked up once per chat folder and cached in self.relations.
        if chat_name not in self.relations:
            users_json_path = os.path.join(chat_dir, "users.json")
            if os.path.exists(users_json_path):
                try:
                    with open(users_json_path, encoding="utf-8") as f:
                        relation = json.load(f).get("relation", "")
                        if relation:
                            self.relations[chat_name] = relation
                            logger.debug(f"Loaded relation for {chat_name}: {relation}")
                except (FileNotFoundError, json.JSONDecodeError) as e:
                    logger.warning(f"Failed to load users.json from {chat_dir}: {e}")

        df = pd.read_csv(
            file_path,
            encoding="utf-8",
            dtype={"msg": str, "src": str},
            escapechar=None,
            keep_default_na=False,
        )

        df = df[~df["type_name"].isin(values=self.skip_type_list)]

        # Own forwarded messages are dropped.
        if "is_forward" in df.columns:
            df = df[~((df["is_sender"] == 1) & (df["is_forward"]))]

        text_type_names = ("文本", "text")

        # Gather all text rows so PII detection can run as a single batch.
        # Line breaks are removed only in the copies that get checked.
        text_indices = [i for i in df.index if df.loc[i, "type_name"].lower() in text_type_names]  # type: ignore
        text_messages = [str(df.loc[i, "msg"]).replace("\n", "") for i in text_indices]

        # TODO Deleting directly by batch_has_pii returning true/false.
        indices_to_drop = []
        if text_messages:
            pii_flags = self.pii_detector.batch_has_pii(text_messages)
            for row_label, text, flagged in zip(text_indices, text_messages, pii_flags):
                if flagged or any(word in text for word in self.blocked_words):
                    indices_to_drop.append(row_label)

        df = df.drop(index=indices_to_drop)

        # Loop invariants for the pass over the non-text rows.
        media_platform = self.c.platform in [PlatformType.CHAT, PlatformType.TELEGRAM]
        sticker_name = "动画表情" if self.c.platform == PlatformType.CHAT else "sticker"

        for i in df.index:
            type_name = df.loc[i, "type_name"]
            if type_name.lower() in text_type_names:
                continue

            if df.loc[i, "src"].lower().endswith(".gif"):
                # GIF attachments are relabelled as stickers and lose their source.
                df.loc[i, "src"] = ""
                df.loc[i, "type_name"] = sticker_name
                continue

            if type_name.lower() in ["图片", "image"]:  # type: ignore
                if media_platform:
                    found = check_image_file_exists(str(df.loc[i, "src"]))
                    if isinstance(found, str) and df.loc[i, "is_sender"] == 0:
                        # Incoming image whose file exists: keep it as an image message.
                        df.loc[i, "src"] = found
                        df.loc[i, "msg"] = "<image>"
                        df.loc[i, "modality"] = DataModality.IMAGE
                    else:
                        # Missing file, or an image sent by oneself: becomes a cut.
                        df.loc[i, "type_name"] = "Cut"
            elif type_name in ["sticker", "动画表情"]:
                if media_platform:
                    df.loc[i, "src"] = ""
            else:
                df.loc[i, "msg"] = ""

        df = df.dropna(how="all")
        # Time format: 2021-07-07 10:27:23
        df["CreateTime"] = pd.to_datetime(df["CreateTime"])

        return [ChatMessage(**row) for row in df.to_dict("records")]  # type: ignore

    def process_text(self, chat_message: ChatMessage):
        pass

    def save_result(self, qa_res: List[QaPair]):
        """
        Serialise the QaPair objects to ``./dataset/res_csv/sft/sft-my.json``.

        Ids are renumbered from 0 in list order, and the output directory is
        created when missing.

        Args:
            qa_res: A list of QaPair objects.
        """
        output_path = "./dataset/res_csv/sft/sft-my.json"

        records = [
            {
                "id": str(position),
                "time": pair.time.isoformat() if pair.time else None,
                "score": pair.score,
                "messages": [{"role": turn.role, "content": turn.content} for turn in pair.messages],
                "images": pair.images,
                "system": pair.system,
            }
            for position, pair in enumerate(qa_res)
        ]

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(records, f, ensure_ascii=False, indent=4)
        logger.success(
            f"Chat record processing successful, {len(qa_res)} entries in total, saved to {output_path}"
        )


# Script entry point: build the dataset when this module is executed directly.
if __name__ == "__main__":
    processor = DataProcessor()
    processor.main()


================================================
FILE: weclone/data/strategies.py
================================================
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List

from .models import ChatMessage


@dataclass
class ConversationStrategy(ABC):
    """Interface for deciding whether a message continues an existing conversation."""

    # Flag supplied by the creator of the strategy; no strategy in this module reads it.
    is_single_chat: bool

    @abstractmethod
    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:
        """Return True when ``current_msg`` belongs to the conversation formed by ``history_msg``."""


@dataclass
class TimeWindowStrategy(ConversationStrategy):
    """Treat messages as one conversation when they are close enough in time."""

    # Maximum gap between two messages, in seconds (it is compared with total_seconds()).
    time_window: int

    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:
        """Return True when ``current_msg`` is within ``time_window`` of the last history message.

        The absolute difference is used, so the order of the two timestamps does not matter.
        """
        gap = current_msg.CreateTime - history_msg[-1].CreateTime
        elapsed_seconds = abs(gap).total_seconds()
        return elapsed_seconds <= self.time_window


@dataclass
class LLMStrategy(ConversationStrategy):
    """LLM based judgment strategy

    Not implemented yet: every call reports that the messages belong to
    different conversations.
    """

    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:
        """Placeholder implementation that always returns False."""
        # TODO: Implement LLM-based conversation detection logic
        return False


@dataclass
class CompositeStrategy(ConversationStrategy):
    """Composite strategy that combines multiple strategies

    Not implemented yet: the fields are stored but never consulted, and every
    call reports that the messages belong to different conversations.
    """

    # Strategies to combine.
    strategies: List[ConversationStrategy]
    # NOTE(review): presumably True means all strategies must agree and False
    # means any one suffices -- confirm once the logic is implemented.
    require_all: bool = True

    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:
        """Placeholder implementation that always returns False."""
        # TODO: Implement composite strategy logic
        return False


================================================
FILE: weclone/data/utils.py
================================================
import base64
import concurrent.futures
import os
from pathlib import Path

import requests

from weclone.utils.config_models import WCMakeDatasetConfig
from weclone.utils.log import logger
from weclone.utils.retry import retry_on_http_error


def check_image_file_exists(file_path: str) -> str | bool:
    try:
        normalized_path = os.path.normpath(file_path).replace("\\", "/")

        filename_with_ext = os.path.basename(normalized_path)
        filename_without_ext = Path(filename_with_ext).stem

        # 使用 glob 查找精确匹配该文件名的文件（不论扩展名）
        images_dir = Path("dataset") / "media" / "images"
        matching_files = list(images_dir.glob(f"{filename_without_ext}.*"))

        if len(matching_files) > 0:
            # 获取相对于dataset/media的路径，只保留images/文件名
            full_path = matching_files[0]
            relative_path = full_path.relative_to(Path("dataset") / "media")
            return str(relative_path)
        else:
            return False

    except Exception as e:
        logger.error(f"检查图片文件时出错: {file_path}, 错误: {e}")
        return False


class ImageToTextProcessor:
    """Convert images to text through a multimodal LLM behind an OpenAI-compatible API."""

    def __init__(self, api_url: str, api_key: str, model_name: str, config: WCMakeDatasetConfig):
        """Store the endpoint settings and the prompt sent with every image.

        Args:
            api_url: Base URL of the API; a trailing slash is stripped.
            api_key: Bearer token placed in the ``Authorization`` header.
            model_name: Model identifier sent in the request payload.
            config: Dataset configuration; ``media_dir`` and
                ``vision_api.max_workers`` are read from it.
        """
        self.api_url = api_url.rstrip("/")
        self.api_key = api_key
        self.model_name = model_name
        self.config = config
        # Prompt (Chinese): describe the image, focusing on UI content for screenshots,
        # structure/data for tables, key text for documents and the scene for photos,
        # in concise language of at most 100 characters.
        self.prompt = """
        请描述这张图片的内容，重点关注：
        1. 如果是截图，描述界面内容和操作
        2. 如果是表格，描述表格结构和数据
        3. 如果是文档，提取关键文字信息
        4. 如果是生活照片，简要描述场景和内容。
        请用简洁明了的语言描述，不超过100字。"""

    def _process_images_in_parallel(self, qa_list):
        """Describe every image in ``qa_list`` in parallel and inline the descriptions.

        Each ``<image>`` placeholder in a pair's messages is replaced, in order, by
        the description of the corresponding image of that same pair. Descriptions
        are matched per pair, so a pair whose placeholder count differs from its
        image count cannot shift descriptions into the following pairs.

        Args:
            qa_list: Objects exposing ``images`` (a list of paths relative to
                ``config.media_dir``, or a single path) and ``messages`` (objects
                with a string ``content``).

        Returns:
            The same ``qa_list``, modified in place; ``images`` of processed pairs
            is emptied because the images are now represented as text.
        """
        media_dir = self.config.media_dir

        # Collect full image paths and remember how many belong to each pair.
        all_image_paths = []
        pairs_with_images = []
        for qa_pair in qa_list:
            if not qa_pair.images:
                continue
            image_list = qa_pair.images if isinstance(qa_pair.images, list) else [qa_pair.images]
            all_image_paths.extend(os.path.join(media_dir, relative_path) for relative_path in image_list)
            pairs_with_images.append((qa_pair, len(image_list)))

        if not all_image_paths:
            logger.info("未在对话中找到任何图片，跳过识别。")
            return qa_list

        logger.info(f"共找到 {len(all_image_paths)} 张有效图片需要识别。")
        max_workers = self.config.vision_api.max_workers

        # executor.map keeps the results in the same order as the inputs.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            image_descriptions = list(executor.map(self.describe_image, all_image_paths))

        offset = 0
        for qa_pair, image_count in pairs_with_images:
            # Restrict each pair to its own slice of the flat description list.
            pair_descriptions = iter(image_descriptions[offset : offset + image_count])
            offset += image_count

            for message in qa_pair.messages:
                for _ in range(message.content.count("<image>")):
                    try:
                        description = next(pair_descriptions)
                        replacement = f"\n[图片描述: {description}]\n"
                    except StopIteration:
                        logger.error("图片数量与描述数量不匹配，可能存在逻辑错误。")
                        replacement = "\n[图片描述缺失]\n"
                    # count=1 so that only one placeholder is replaced per description.
                    message.content = message.content.replace("<image>", replacement, 1)

            # The images are now text; ``images`` may be a single non-list value
            # (accepted above), which has no ``clear`` method.
            if isinstance(qa_pair.images, list):
                qa_pair.images.clear()
            else:
                qa_pair.images = []

        return qa_list

    def _encode_image_to_base64(self, image_path: str) -> str:
        """Return the file content as a base64 string, or ``""`` if reading fails (logged)."""
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        except Exception as e:
            logger.error(f"编码图片失败 {image_path}: {e}")
            return ""

    def _get_image_format(self, image_path: str) -> str:
        """Return the lower-cased file extension without dots, mapping ``jpg`` to ``jpeg``."""
        suffix = Path(image_path).suffix.lower().replace(".", "")
        if suffix == "jpg":
            return "jpeg"
        return suffix

    @retry_on_http_error(
        max_retries=5,
        base_delay=15.0,
        max_delay=300.0,
        backoff_factor=2.0,
        retry_on_status=[429, 500, 502, 503, 504],
        retry_on_exceptions=[requests.exceptions.RequestException, ConnectionError, TimeoutError],
    )
    def _call_vision_api(self, image_path: str) -> str:
        """POST the image to ``{api_url}/chat/completions`` and return the description.

        Returns a bracketed placeholder string when the image cannot be encoded or
        the response carries no usable text. Non-200 responses are logged and
        ``raise_for_status()`` is called so the retry decorator can react.
        """
        base64_image = self._encode_image_to_base64(image_path)
        if not base64_image:
            return "[图片处理失败：无法编码]"

        image_format = self._get_image_format(image_path)

        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

        payload = {
            "model": self.model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/{image_format};base64,{base64_image}"},
                        },
                    ],
                }
            ],
            "max_tokens": 1000,
            "temperature": 0.1,
        }

        response = requests.post(
            f"{self.api_url}/chat/completions", headers=headers, json=payload, timeout=60
        )

        if response.status_code == 200:
            result = response.json()
            if "choices" in result and len(result["choices"]) > 0:
                content = result["choices"][0]["message"]["content"]
                # A null content would otherwise crash on .strip(); treat it as a
                # malformed response instead.
                if content is not None:
                    return content.strip()
            logger.warning(f"API响应格式异常: {result}")
            return "[图片描述获取失败：API格式错误]"
        else:
            logger.error(f"API请求失败，状态码: {response.status_code}，原因: {response.reason}")
            response.raise_for_status()  # raises for 4xx/5xx, which triggers the retry logic
            # Reached only for non-200 statuses that raise_for_status() does not treat as errors.
            return "[图片描述获取失败]"

    def describe_image(self, image_path: str) -> str:
        """Public entry point: describe a single image.

        Returns a placeholder string (and logs a warning) when ``image_path`` does
        not exist; otherwise delegates to ``_call_vision_api``.
        """
        if not os.path.exists(image_path):
            logger.warning(f"图片文件不存在: {image_path}")
            return "[图片文件不存在]"

        logger.debug(f"正在识别图片: {os.path.basename(image_path)}")
        return self._call_vision_api(image_path)


if __name__ == "__main__":
    # Raw string on purpose: in a regular literal "\202" and "\6" are octal escape
    # sequences, which silently corrupted this sample Windows-style path.
    path = r"Storage\Image\2021-08\6ce3f785b4230246639c3dd0d4a8848c.dat"
    print(check_image_file_exists(path))


================================================
FILE: weclone/eval/__init__.py
================================================


================================================
FILE: weclone/eval/cli_demo.py
================================================
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc


def main():
    """Run an interactive chat loop on the terminal.

    ``exit`` leaves the loop, ``clear`` drops the accumulated history; any other
    input is sent to the chat model and the streamed answer is echoed as it arrives.
    """
    try:
        import platform

        if platform.system() != "Windows":
            import readline  # noqa: F401
    except ImportError:
        print("Install `readline` for a better experience.")

    chat_model = ChatModel()
    messages = []
    print(
        "Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application."
    )

    while True:
        try:
            query = input("\nUser: ")
        except UnicodeDecodeError:
            print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.")
            continue

        command = query.strip()
        if command == "exit":
            break
        if command == "clear":
            messages = []
            torch_gc()
            print("History has been removed.")
            continue

        messages.append({"role": "user", "content": query})
        print("Assistant: ", end="", flush=True)

        # Echo each streamed piece immediately and keep it for the history entry.
        pieces = []
        for new_text in chat_model.stream_chat(messages):
            print(new_text, end="", flush=True)
            pieces.append(new_text)
        print()
        messages.append({"role": "assistant", "content": "".join(pieces)})


if __name__ == "__main__":
    main()


================================================
FILE: weclone/eval/eval_model.py
================================================
from llamafactory.eval.evaluator import Evaluator


def main():
    """Create a llamafactory ``Evaluator`` and run its ``eval()`` method."""
    Evaluator().eval()


if __name__ == "__main__":
    main()


================================================
FILE: weclone/eval/test_model.py
================================================
import json
from typing import List, cast  # 导入 cast

import openai
from openai import OpenAI  # 导入 OpenAI 类
from openai.types.chat import ChatCompletionMessageParam  # 导入消息参数类型
from tqdm import tqdm

from weclone.utils.config import load_config
from weclone.utils.config_models import TestModelArgs, WCInferConfig

# Configuration is loaded at import time; the casts only narrow the static type.
infer_config = cast(WCInferConfig, load_config("web_demo"))
test_config = cast(TestModelArgs, load_config("test_model"))

# Settings used for every completion request issued by this script.
completion_config = {
    "default_prompt": infer_config.default_system,
    "model": "gpt-3.5-turbo",
    "history_len": 15,
}

# Turn the dict into an object of an ad-hoc class so the values can be read as attributes.
completion_config = type("Config", (object,), completion_config)()

# Client pointed at a local OpenAI-compatible endpoint with a placeholder API key.
client = OpenAI(api_key="""sk-test""", base_url="http://127.0.0.1:8005/v1")


def handler_text(content: str, history: list, config):
    """Send one user turn to the chat endpoint and record the exchange in ``history``.

    The request consists of the system prompt from ``config.default_prompt``, the
    existing ``history`` and the new user message. On success the user and the
    assistant messages are appended to ``history`` and the reply is returned. On
    ``openai.APIError`` the user message is removed from ``history`` again and an
    error text is returned instead.
    """
    messages = [
        {"role": "system", "content": f"{config.default_prompt}"},
        *history,
        {"role": "user", "content": content},
    ]
    history.append({"role": "user", "content": content})

    try:
        response = client.chat.completions.create(
            model=config.model,
            messages=cast(List[ChatCompletionMessageParam], messages),
            max_tokens=50,
        )
    except openai.APIError as e:
        # Undo the user turn so a failed request leaves the history unchanged.
        history.pop()
        return "AI interface error, please try again\n" + str(e)

    reply = str(response.choices[0].message.content).replace("\n ", "")  # type: ignore
    history.append({"role": "assistant", "content": reply})
    return reply


def main():
    """Replay every question group from the test data and dump the dialogues to a file.

    The JSON file at ``test_config.test_data_path`` must contain a ``questions``
    list of question groups. Each group is sent turn by turn through
    ``handler_text`` with a fresh history. All message contents are then written
    to ``test_result-my.txt``, one per line, with a blank line between dialogues.
    """
    with open(test_config.test_data_path, "r", encoding="utf-8") as data_file:
        test_list = json.load(data_file)["questions"]

    res = []
    for questions in tqdm(test_list, desc=" Testing..."):
        history = []
        for q in questions:
            handler_text(q, history=history, config=completion_config)
        res.append(history)

    # Explicit UTF-8 so non-ASCII chat text does not depend on the platform's
    # default encoding; the context manager guarantees the file is flushed and closed.
    with open("test_result-my.txt", "w", encoding="utf-8") as res_file:
        for r in res:
            for i in r:
                res_file.write(i["content"] + "\n")
            res_file.write("\n")


if __name__ == "__main__":
    main()


================================================
FILE: weclone/eval/web_demo.py
================================================
from llamafactory.webui.interface import create_web_demo

from weclone.utils.config import load_config


def main():
    """Load the ``web_demo`` configuration and launch the llamafactory web demo.

    The demo is bound to all interfaces, requests a share link and opens a
    browser window, as specified by the launch options below.
    """
    load_config("web_demo")

    launch_options = {"server_name": "0.0.0.0", "share": True, "inbrowser": True}
    demo = create_web_demo()
    demo.queue()
    demo.launch(**launch_options)


if __name__ == "__main__":
    main()


================================================
FILE: weclone/prompts/__init__.py
================================================


================================================
FILE: weclone/prompts/clean_data.py
================================================
# Prompt template (Chinese) for LLM-based data cleaning. It asks the model to act as a
# data quality evaluator and to score a chat record from 1 to 5 on relevance, coherence
# and style representativeness, answering with a JSON object that holds the input `id`
# and the integer `score`.
# Placeholders: `{id}` and `{messages}`. The doubled braces `{{` / `}}` presumably escape
# literal JSON braces for str.format-style templating — confirm against the caller.
CLEAN_PROMPT = """

# 角色
你是一个数据质量评估员。

# 任务
你的任务是评估下面提供的聊天记录的**逻辑性**、**相关性**以及**风格代表性**。目标是识别并过滤掉那些回答与问题**明显不匹配**、**逻辑严重混乱**的样本，筛选出具有人类聊天风格独特性与辨识度的样本。请根据以下核心评估点给出一个1到5的整数分数，并将该分数与原始 `id` 一起输出。

**重要考量:**
1.  **简短回答的有效性:** 请注意，诸如“好的”、“是的”、“收到”、“嗯”、“知道了”等简短的肯定、确认或应答，在合适的语境下是完全**有逻辑且相关的**。**不要仅仅因为回答简短就将其评为低分。** 只有当这类简短回答与【问题/上下文 Q】**明显不符**时，才应考虑低分。
2.  **处理错别字和自我纠正:** 聊天记录中可能包含常见的打字错误（错别字）或用户先打错字随后又自行纠正的情况（例如，发送“我想去1楼”紧接着又发送“*2楼”进行更正）。在评估时，请**聚焦于用户想要表达的最终意图和信息的核心内容**，而**不应仅仅因为存在错别字或纠正过程就判定为低质量**。。

# 核心评估点 (请在心中衡量)
1.  **相关性 (Relevance):** 【回答 A】是否直接回应或恰当地衔接了【问题/上下文 Q】？它是在回答问题，还是完全跑题了？只有当【回答 A】与【问题/上下文 Q】**明显矛盾**、**完全不着边际**（即使考虑上下文也无法合理化），或简短回答**明显不适用于**该【问题/上下文 Q】时，才给予低分。
2.  **逻辑性 (Coherence):** 【回答 A】本身是否符合基本的逻辑？结合【问题/上下文 Q】来看，这个问答对是否构成了一个符合逻辑的交流片段？是否存在明显的矛盾、混乱的内容？只有当【回答 A】**自身逻辑混乱**、**与Q存在无法解释的矛盾**时，才给予低分。
3. **风格代表性**  (Style Representativeness): 评估【回答 A】是否展现了自然、独特的人类对话风格特征。回答Ａ是否带有个性化的色彩？关注点包括但不限于：是否体现了特定的语气（如友好、幽默、不耐烦、正式、脏话），是否包含口头禅、俚语、网络用语（如“yyds”、“绝绝子”）、表情符号 Emoji、颜文字、标点符号的特殊使用如“!!!”、“???”、“~”等表达、特定的缩写或短语、非标准的但一致的表达方式（如方言词汇、个人口癖）？
4. **以相关性和逻辑性为主要评判标准，风格代表性仅仅作为获得5分的必要条件。**

# 评分标准 (1-5分)
*   **1分 (极差):** 聊天记录中的问答内容完全不相关；逻辑严重混乱/矛盾。
*   **2分 (差):** 大部分问答相关性很低；存在明显的逻辑问题或不连贯。
*   **3分 (中等):** 问答相关性一般（可能部分问答跑题或回应不充分）；逻辑上勉强说得通但不够流畅或有瑕疵。
*   **4分 (良好):** 大部分问答相关性好，回答了问题或恰当衔接，逻辑清晰。
*   **5分 (优秀):** 问答相关性强，逻辑流畅，包含了显著的、具有辨识度的人类聊天的常用特征（例如情感情绪表达、口头禅、表情符号组合、特有的句子结构、鲜明的语气）
# 输出要求
请严格按照以下 JSON 格式输出，包含输入数据的 id 和你给出的1到5的整数评分 score，不要包含任何其他文字、解释或标签。
{{"id": "{id}","score": <这里填入1到5的整数评分>}}
# 输入数据
```json
{{"id": "{id}","messages": "{messages}"}}
```
"""

# ONLINE_LLM_CLEAN_PROMPT = """
# # 角色
# 你是一个数据质量评估员。

# # 任务
# 你的任务是评估下面提供的聊天记录的**逻辑性**、**相关性**以及**风格代表性**。目标是识别并过滤掉那些回答与问题**明显不匹配**、**逻辑严重混乱**的样本，筛选出具有人类聊天风格独特性与辨识度的样本。请根据以下核心评估点给出一个1到5的整数分数，并将该分数与原始 `id` 一起输出。

# **重要考量:**
# 1.  **简短回答的有效性:** 请注意，诸如“好的”、“是的”、“收到”、“嗯”、“知道了”等简短的肯定、确认或应答，在合适的语境下是完全**有逻辑且相关的**。**不要仅仅因为回答简短就将其评为低分。** 只有当这类简短回答与【问题/上下文 Q】**明显不符**时，才应考虑低分。
# 2.  **处理错别字和自我纠正:** 聊天记录中可能包含常见的打字错误（错别字）或用户先打错字随后又自行纠正的情况（例如，发送“我想去1楼”紧接着又发送“*2楼”进行更正）。在评估时，请**聚焦于用户想要表达的最终意图和信息的核心内容**，而**不应仅仅因为存在错别字或纠正过程就判定为低质量**。。

# # 核心评估点 (请在心中衡量)
# 1.  **相关性 (Relevance):** 【回答 A】是否直接回应或恰当地衔接了【问题/上下文 Q】？它是在回答问题，还是完全跑题了？只有当【回答 A】与【问题/上下文 Q】**明显矛盾**、**完全不着边际**（即使考虑上下文也无法合理化），或简短回答**明显不适用于**该【问题/上下文 Q】时，才给予低分。
# 2.  **逻辑性 (Coherence):** 【回答 A】本身是否符合基本的逻辑？结合【问题/上下文 Q】来看，这个问答对是否构成了一个符合逻辑的交流片段？是否存在明显的矛盾、混乱的内容？只有当【回答 A】**自身逻辑混乱**、**与Q存在无法解释的矛盾**时，才给予低分。
# 3. **风格代表性**  (Style Representativeness): 评估【回答 A】是否展现了自然、独特的人类对话风格特征。回答Ａ是否带有个性化的色彩？关注点包括但不限于：是否体现了特定的语气（如友好、幽默、不耐烦、正式、脏话），是否包含口头禅、俚语、网络用语（如“yyds”、“绝绝子”）、表情符号 Emoji、颜文字、标点符号的特殊使用如“!!!”、“???”、“~”等表达、特定的缩写或短语、非标准的但一致的表达方式（如方言词汇、个人口癖）？
# 4. **以相关性和逻辑性为主要评判标准，风格代表性仅仅作为获得5分的必要条件。**

# # 评分标准 (1-5分)
# *   **1分 (极差):** 聊天记录中的问答内容完全不相关；逻辑严重混乱/矛盾。
# *   **2分 (差):** 大部分问答相关性很低；存在明显的逻辑问题或不连贯。
# *   **3分 (中等):** 问答相关性一般（可能部分问答跑题或回应不充分）；逻辑上勉强说得通但不够流畅或有瑕疵。
# *   **4分 (良好):** 大部分问答相关性好，回答了问题或恰当衔接，逻辑清晰。
# *   **5分 (优秀):** 问答相关性强，逻辑流畅，包含了显著的、具有辨识度的人类聊天的常用特征（例如情感情绪表达、口头禅、表情符号组合、特有的句子结构、鲜明的语气）

# # 输入数据
# ```json
# {qa_list}

# # 输出要求
# 请严格按照以下 JSON 格式输出，包含原始的 id 和你给出的1到5的整数评分 score，不要包含任何其他文字、解释或标签！
# [
#   {{
#     "id": "<这里填入第1条输入数据的id值>",
#     "score": <1-5的整数评分>
#   }},
#   {{
#     "id": "<这里填入第2条输入数据的id值>",
#     "score": <1-5的整数评分>
#   }}
#   …
# ]
# """


================================================
FILE: weclone/server/__init__.py
================================================


================================================
FILE: weclone/server/api_service.py
================================================
import os

import uvicorn
from llamafactory.api.app import create_app
from llamafactory.chat import ChatModel

from weclone.utils.config import load_config


def main():
    """Serve the chat model through the llamafactory API application.

    The port is taken from the ``API_PORT`` environment variable and falls back
    to 8005; the server listens on all interfaces with a single worker.
    """
    config = load_config("api_service")
    chat_model = ChatModel(config.model_dump(mode="json"))
    app = create_app(chat_model)

    port = os.environ.get("API_PORT", 8005)
    print("Visit http://localhost:{}/docs for API document.".format(port))
    uvicorn.run(app, host="0.0.0.0", port=int(port), workers=1)


if __name__ == "__main__":
    main()


================================================
FILE: weclone/train/__init__.py
================================================


================================================
FILE: weclone/train/export_model.py
================================================
from llamafactory.train.tuner import export_model


def main():
    """Entry point that delegates to llamafactory's ``export_model`` without arguments."""
    export_model()


if __name__ == "__main__":
    main()


================================================
FILE: weclone/train/train_sft.py
================================================
import json
import os
from typing import cast

from llamafactory.extras.misc import get_current_device
from llamafactory.train.tuner import run_exp

from weclone.data.clean.strategies import LLMCleaningStrategy
from weclone.utils.config import load_config
from weclone.utils.config_models import WCMakeDatasetConfig, WCTrainSftConfig
from weclone.utils.log import logger


def main():
    """Validate the dataset, optionally clean it, and start SFT training.

    Steps:
        1. Load the ``train_sft`` and ``make_dataset`` configurations.
        2. Resolve the dataset file through ``dataset_info.json`` in the dataset
           directory and make sure it exists.
        3. When cleaning is enabled, run the LLM cleaning strategy and train on
           the dataset it returns.
        4. Log the final configuration and hand it to ``run_exp``.

    Raises:
        ValueError: The configured dataset has no ``file_name`` entry in
            ``dataset_info.json``.
        FileNotFoundError: The dataset file referenced there does not exist.
    """
    train_config: WCTrainSftConfig = cast(WCTrainSftConfig, load_config(arg_type="train_sft"))
    dataset_config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config(arg_type="make_dataset"))

    device = get_current_device()
    if device == "cpu":
        logger.warning("Please note you are using CPU for training, non-Mac devices may encounter issues")

    dataset_info_path = os.path.join(dataset_config.dataset_dir, "dataset_info.json")

    with open(dataset_info_path, "r", encoding="utf-8") as f:
        dataset_info = json.load(f)

    # Fail with a clear message instead of the TypeError os.path.join raises for None.
    file_name = dataset_info.get(train_config.dataset, {}).get("file_name")
    if not file_name:
        raise ValueError(
            f"Dataset '{train_config.dataset}' has no 'file_name' entry in '{dataset_info_path}'"
        )

    data_path = os.path.join(dataset_config.dataset_dir, file_name)
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"Dataset file '{data_path}' does not exist, please check if make-dataset was executed"
        )

    if not dataset_config.clean_dataset.enable_clean:
        logger.info("Data cleaning is not enabled, will use the original dataset.")
    else:
        cleaner = LLMCleaningStrategy(make_dataset_config=dataset_config)
        train_config.dataset = cleaner.clean()

    formatted_config = json.dumps(train_config.model_dump(mode="json"), indent=4, ensure_ascii=False)
    logger.info(f"Fine-tuning configuration:\n{formatted_config}")

    run_exp(train_config.model_dump(mode="json"))


if __name__ == "__main__":
    main()


================================================
FILE: weclone/utils/__init__.py
================================================


================================================
FILE: weclone/utils/config.py
================================================
import os
import sys
from typing import Any, Dict, cast

import pyjson5
from omegaconf import OmegaConf
from pydantic import BaseModel

from .config_models import (
    WcConfig,
    WCInferConfig,
    WCMakeDatasetConfig,
    WCTrainSftConfig,
)
from .log import logger
from .tools import dict_to_argv


def load_base_config() -> WcConfig:
    """Read the settings file and turn it into a validated ``WcConfig``.

    The path comes from the ``WECLONE_CONFIG_PATH`` environment variable and
    defaults to ``./settings.jsonc``. Any failure while reading, parsing or
    validating is logged and terminates the process with exit code 1.
    """
    config_path = os.environ.get("WECLONE_CONFIG_PATH", "./settings.jsonc")
    logger.info(f"Loading configuration from: {config_path}")

    try:
        with open(config_path, "r", encoding="utf-8") as config_file:
            raw_text = config_file.read()
        s_config_dict: Dict[str, Any] = pyjson5.loads(raw_text)
    except FileNotFoundError:
        logger.error(f"Configuration file not found: {config_path}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Error loading configuration file {config_path}: {e}")
        sys.exit(1)

    # Let OmegaConf resolve the raw dictionary first, then validate it with Pydantic.
    try:
        resolved = OmegaConf.to_container(OmegaConf.create(s_config_dict), resolve=True)
        if not isinstance(resolved, dict):
            raise TypeError(f"Configuration should be a dictionary, but got {type(resolved)}")
        return WcConfig(**cast(Dict[str, Any], resolved))
    except Exception as e:
        logger.error(f"Error parsing configuration with OmegaConf and WcConfig: {e}")
        sys.exit(1)


def create_config_by_arg_type(arg_type: str, wc_config: WcConfig) -> BaseModel:
    """Build the configuration object that belongs to ``arg_type``.

    ``cli_args``, ``vllm`` and ``test_model`` return the matching section of
    ``wc_config`` unchanged. The other types merge the common arguments with
    their own section (later values win) and wrap the result in the
    corresponding final configuration model.

    Raises:
        ValueError: ``arg_type`` is not one of the supported names.
    """
    if arg_type == "cli_args":
        return wc_config.cli_args

    common_config = wc_config.common_args.model_dump()

    if arg_type in ("web_demo", "api_service"):
        return WCInferConfig(**{**common_config, **wc_config.infer_args.model_dump()})

    if arg_type == "vllm":
        return wc_config.vllm_args

    if arg_type == "test_model":
        return wc_config.test_model_args

    if arg_type == "train_sft":
        common_config["include_type"] = wc_config.make_dataset_args.include_type
        return WCTrainSftConfig(**{**common_config, **wc_config.train_sft_args.model_dump()})

    if arg_type == "make_dataset":
        # TODO: Should the following three parameters be moved to common?
        sft_section = wc_config.train_sft_args
        merged = {
            **common_config,
            **wc_config.make_dataset_args.model_dump(),
            "dataset": sft_section.dataset,
            "dataset_dir": sft_section.dataset_dir,
            "cutoff_len": sft_section.cutoff_len,
        }
        return WCMakeDatasetConfig(**merged)

    raise ValueError("Unsupported argument type")


def process_config_dict_and_argv(arg_type: str, config_pydantic: BaseModel) -> None:
    """Append the JSON-mode dump of ``config_pydantic`` to ``sys.argv``.

    The conversion to arguments is done by ``dict_to_argv``. ``arg_type`` is
    accepted but not used.
    """
    extra_argv = dict_to_argv(config_pydantic.model_dump(mode="json"))
    sys.argv.extend(extra_argv)


def load_config(arg_type: str) -> BaseModel:
    """Load the settings file and return the configuration for ``arg_type``.

    As a side effect the returned configuration is also appended to ``sys.argv``.
    """
    config_pydantic = create_config_by_arg_type(arg_type, load_base_config())
    process_config_dict_and_argv(arg_type, config_pydantic)
    return config_pydantic


if __name__ == "__main__":
    load_config("train_sft")


================================================
FILE: weclone/utils/config_models.py
================================================
from enum import Enum
from typing import TYPE_CHECKING, List, Optional

from loguru import logger
from pydantic import BaseModel, Field, model_validator

# Placeholder for imports needed only by static type checkers; currently empty.
if TYPE_CHECKING:
    pass


class StrEnum(str, Enum):
    """
    String-valued enum base class that works well with Pydantic.

    Members compare equal to other members (`platform == PlatformType.CHAT`)
    as well as to plain strings (`platform == "chat"`), and `str(member)`
    yields the raw value.
    """

    def __str__(self) -> str:
        return self.value

    @classmethod
    def _missing_(cls, value):
        # Fallback lookup by value; None tells Enum that no member matches.
        return next((member for member in cls if member.value == value), None)


class BaseConfigModel(BaseModel):
    """Base configuration model with default extra='allow'"""

    # Fields that are not declared on the model are accepted instead of rejected.
    model_config = {"extra": "allow"}


class PlatformType(StrEnum):
    """Data source platform the chat records come from."""

    CHAT = "chat"
    TELEGRAM = "telegram"


class LanguageType(StrEnum):
    """Language of the data (Chinese or English)."""

    ZH = "zh"
    EN = "en"


class DataModality(StrEnum):
    """Kinds of data that can be included in a dataset."""

    TEXT = "text"
    IMAGE = "image"
    STICKER = "sticker"
    # Not available yet:
    # AUDIO = "audio"
    # VIDEO = "video"


class CombineStrategy(StrEnum):
    """Combination strategy; only the time-window strategy is defined."""

    TIME_WINDOW = "time_window"


class CleanStrategy(StrEnum):
    """Data cleaning strategy; only LLM-based cleaning is defined."""

    LLM = "llm"


class FinetuningType(StrEnum):
    """Finetuning type; only LoRA is currently enabled."""

    LORA = "lora"
    # Not available yet:
    # FULL = "full"
    # FREEZE = "freeze"


class CommonArgs(BaseConfigModel):
    """Arguments shared by the final configuration models.

    NOTE that all parameters here will be parsed by `HfArgumentParser`. Non-HfArgumentParser parameters should be placed in make_dataset_args.
    """

    # `...` marks a field as required (no default value).
    model_name_or_path: str = Field(...)
    adapter_name_or_path: Optional[str] = Field(None, description="Also as output_dir of train_sft_args")
    template: str = Field(..., description="model template")
    default_system: str = Field(..., description="default system prompt")
    finetuning_type: FinetuningType = Field(FinetuningType.LORA)
    media_dir: str = Field("dataset/media")
    image_max_pixels: int = Field(409920, description="used in llama-factory, 409920 represents 720P")
    enable_thinking: bool = Field(False, description="used in llama-factory")
    trust_remote_code: bool = Field(True, description="used in huggingface")


class CliArgs(BaseModel):
    """Command line related settings; unknown fields are rejected."""

    model_config = {"extra": "forbid"}
    full_log: bool = Field(False)
    log_level: str = Field("INFO", description="DEBUG, INFO, WARNING, ERROR, CRITICAL")


class LLMCleanConfig(BaseConfigModel):
    """Settings for LLM-based data cleaning."""

    accept_score: int = Field(
        2,
        description="Acceptable LLM scoring threshold: 1 (worst) to 5 (best). Data scoring below this threshold will not be used for training.",
    )
    enable_thinking: bool = Field(False, description="used in llama-factory")


class CleanDatasetConfig(BaseConfigModel):
    """Data cleaning configuration: on/off switch, strategy and strategy settings."""

    # Cleaning is disabled unless explicitly switched on.
    enable_clean: bool = False
    clean_strategy: CleanStrategy = CleanStrategy.LLM
    # Default settings for the LLM strategy, spelled out explicitly.
    llm: LLMCleanConfig = LLMCleanConfig(accept_score=2, enable_thinking=False)


class VisionApiConfig(BaseConfigModel):
    """Vision API specific configuration"""

    enable: bool = Field(default=False, description="Whether to enable Vision API for image recognition")
    # Connection settings; all optional and unset by default.
    api_key: Optional[str] = None
    api_url: Optional[str] = None
    model_name: Optional[str] = None
    # Worker count for parallel requests; None leaves the choice to the consumer.
    max_workers: Optional[int] = None


class TelegramArgs(BaseModel):
    """Telegram specific settings; unknown fields are rejected."""

    model_config = {"extra": "forbid"}
    # The default is a placeholder value that WCMakeDatasetConfig refuses for the Telegram platform.
    my_id: str = Field(default="user1234567890", description="Your own telegram id")


class MakeDatasetArgs(BaseConfigModel):
    """Dataset processing parameters (the `make_dataset_args` section of the settings)."""

    # Overrides the permissive base setting: unknown fields are rejected here.
    model_config = {"extra": "forbid"}

    platform: PlatformType = Field(..., description="Data source platform")
    telegram_args: Optional[TelegramArgs] = None
    language: LanguageType = Field(LanguageType.ZH, description="Common language used in chat")
    include_type: List[DataModality] = Field([DataModality.TEXT], description="Types of data to include")
    max_image_num: int = Field(2, description="Maximum number of images per single data entry")
    blocked_words: List[str] = Field([], description="List of blocked words")
    add_time: bool = Field(False, description="Whether to add time to the dataset")
    add_relation: bool = Field(False, description="Whether to add chat member relationship to the dataset")
    single_combine_strategy: CombineStrategy = Field(
        CombineStrategy.TIME_WINDOW,
        description="Strategy for combining single person's messages into a single sentence",
    )
    qa_match_strategy: CombineStrategy = Field(
        CombineStrategy.TIME_WINDOW, description="Strategy for forming QA pairs"
    )
    single_combine_time_window: int = Field(
        2, description="Time window for combining single person's messages (minutes)"
    )
    qa_match_time_window: int = Field(5, description="Time window for forming QA pairs (minutes)")
    combine_msg_max_length: int = Field(2048, description="Maximum length of combined messages")
    messages_max_length: int = Field(
        2048, description="Maximum character count for messages, used with cutoff_len"
    )
    prompt_with_history: bool = Field(
        False, description="Whether to include conversation history in prompt, invalid for multimodal data"
    )
    clean_dataset: CleanDatasetConfig = Field(CleanDatasetConfig(), description="Data cleaning configuration")
    # Settings for cleaning with an online LLM.
    online_llm_clear: bool = Field(False)
    base_url: Optional[str] = Field(None, description="Base URL for online LLM")
    llm_api_key: Optional[str] = Field(None, description="API key for online LLM")
    model_name: Optional[str] = Field(
        None, description="Model name for online LLM, recommend using larger parameter models"
    )
    clean_batch_size: int = Field(10, description="Batch size for data cleaning")
    vision_api: VisionApiConfig = Field(VisionApiConfig())


class TrainSftArgs(BaseConfigModel):
    """SFT fine-tuning parameters (the `train_sft_args` section of the settings).

    `dataset` and `lora_target` are required; every other field has a default.
    """

    stage: str = Field("sft", description="Training stage")
    dataset: str = Field(..., description="Dataset name")
    dataset_dir: str = Field("./dataset/res_csv/sft", description="Dataset directory")
    freeze_multi_modal_projector: bool = Field(
        False, description="Whether to freeze multimodal projector during MLLM training"
    )
    use_fast_tokenizer: bool = Field(True, description="Whether to use fast tokenizer")
    lora_target: str = Field(..., description="LoRA target modules")
    lora_rank: int = Field(4, description="LoRA rank")
    lora_dropout: float = Field(0.25, description="LoRA dropout")
    weight_decay: float = Field(0.1, description="Weight decay")
    overwrite_cache: bool = Field(True, description="Whether to overwrite cache")
    per_device_train_batch_size: int = Field(4, description="Training batch size per device")
    gradient_accumulation_steps: int = Field(8, description="Gradient accumulation steps")
    lr_scheduler_type: str = Field("cosine", description="Learning rate scheduler type")
    cutoff_len: int = Field(4096, description="Cutoff length")
    logging_steps: int = Field(10, description="Logging steps")
    save_steps: int = Field(100, description="Model save steps")
    learning_rate: float = Field(1e-4, description="Learning rate")
    warmup_ratio: float = Field(0.1, description="Warmup ratio")
    num_train_epochs: int = Field(2, description="Number of training epochs")
    plot_loss: bool = Field(True, description="Whether to plot loss curve")
    fp16: bool = Field(True, description="Whether to use fp16")
    flash_attn: str = Field("fa2", description="Flash Attention type")
    preprocessing_num_workers: int = Field(16, description="Number of preprocessing worker processes")
    dataloader_num_workers: int = Field(4, description="Number of dataloader worker processes")
    deepspeed: Optional[str] = Field(
        None, description="DeepSpeed configuration file path for multi-GPU training"
    )
    do_train: bool = Field(True)


class InferArgs(BaseConfigModel):
    """Inference parameters; all but `repetition_penalty` are required."""

    repetition_penalty: float = Field(1.2, description="Repetition penalty")
    temperature: float = Field(..., description="Temperature")
    top_p: float = Field(..., description="Top-p sampling")
    max_length: int = Field(..., description="Maximum generation length")


class VllmArgs(BaseConfigModel):
    """vLLM related settings."""

    gpu_memory_utilization: float = Field(default=0.9, description="vllm GPU memory utilization")


class TestModelArgs(BaseConfigModel):
    """Settings for the model test script."""

    test_data_path: str = Field(default="dataset/eval/test_data-en.json", description="Test data path")


class CommonMethods:
    """Mixin with helpers shared by the final configuration models."""

    def _parse_dataset_name(self) -> str:
        """Return the dataset name, suffixed with ``-vl`` when images are included.

        Missing ``dataset`` is treated as an empty string and missing
        ``include_type`` as an empty list.
        """
        dataset_name = getattr(self, "dataset", "")
        if "image" in getattr(self, "include_type", []):
            dataset_name = dataset_name + "-vl"
        return dataset_name


class WcConfig(BaseModel):
    """Top-level model of the settings file; one field per section, unknown sections are rejected."""

    model_config = {"extra": "forbid"}

    version: str = Field(..., description="Configuration file version")
    common_args: CommonArgs = Field(..., description="Common parameters")
    cli_args: CliArgs = Field(..., description="Command line arguments")
    make_dataset_args: MakeDatasetArgs = Field(..., description="Dataset processing parameters")
    train_sft_args: TrainSftArgs = Field(..., description="SFT fine-tuning parameters")
    infer_args: InferArgs = Field(..., description="Inference parameters")
    # Optional sections: fall back to the section's own defaults when omitted.
    vllm_args: VllmArgs = Field(VllmArgs())
    test_model_args: TestModelArgs = Field(TestModelArgs())


class WCInferConfig(CommonArgs, InferArgs):
    """Final configuration model for Web Demo

    Combines the fields of ``CommonArgs`` and ``InferArgs`` without adding any of its own.
    """

    pass


class WCTrainSftConfig(CommonArgs, TrainSftArgs, CommonMethods):
    """Final configuration model for SFT training"""

    # Training output directory, converted from adapter_name_or_path
    output_dir: Optional[str] = Field(None)
    dataset: str = Field(..., description="Dataset name")

    @model_validator(mode="after")
    def process_config(self):
        """Post-process the validated model.

        Copies a non-empty ``adapter_name_or_path`` into ``output_dir``, rewrites
        ``dataset`` through ``_parse_dataset_name`` and finally removes the
        ``adapter_name_or_path`` and ``include_type`` attributes from the instance.
        """
        adapter_name_value = getattr(self, "adapter_name_or_path", None)

        if adapter_name_value:
            self.output_dir = adapter_name_value

        # Must run before include_type is deleted below, because the helper reads it.
        self.dataset = self._parse_dataset_name()
        # Always remove adapter_name_or_path field after processing
        if hasattr(self, "adapter_name_or_path"):
            delattr(self, "adapter_name_or_path")
        if hasattr(self, "include_type"):
            delattr(self, "include_type")

        return self


class WCMakeDatasetConfig(CommonArgs, MakeDatasetArgs, CommonMethods):
    """Final configuration model for creating datasets.

    Combines the common and dataset-processing sections, adds the dataset
    location/cutoff fields, and validates platform-specific settings after
    field validation.
    """

    model_config = {"extra": "allow"}  # Explicitly set to allow

    dataset: str = Field(..., description="Dataset name")
    dataset_dir: str = Field("./dataset/res_csv/sft", description="Dataset directory")
    cutoff_len: int = Field(4096, description="Cutoff length")

    @model_validator(mode="after")
    def process_config(self):
        """Validate the Telegram settings and resolve the final dataset name.

        Raises:
            SystemExit: with exit code 1 when the platform is Telegram and
                ``telegram_args`` is missing or ``my_id`` still holds the
                placeholder value ``"user1234567890"``.
        """
        # Validate Telegram configuration
        if self.platform == PlatformType.TELEGRAM:
            if self.telegram_args is None or self.telegram_args.my_id == "user1234567890":
                logger.error(
                    "When using the Telegram platform, please set a valid `telegram_args.my_id`. The `from_id` in `result.json` for the messages you send represents your user ID."
                )
                # Raise SystemExit directly: the builtin `exit` is injected by the
                # `site` module for interactive use and is not guaranteed to exist
                # (e.g. `python -S`, frozen executables).
                raise SystemExit(1)

        self.dataset = self._parse_dataset_name()

        return self


================================================
FILE: weclone/utils/i18n.py
================================================
from typing import Dict, List, Optional


class MultiLangList:
    """Index-aligned lists of strings, one list per language code.

    Entries that share an index across languages are treated as translations
    of one another. Iteration, ``len()`` and indexing operate on the list of
    the current language; the ``translate_*`` helpers look a text up
    case-insensitively and return its counterpart in another language.
    """

    def __init__(self, translations: Dict[str, List[str]], default_lang="en"):
        """Store the translations and build the lookup table.

        Args:
            translations: Mapping of language code to its list of texts.
            default_lang: Language used initially and as the fallback in
                ``get_items``.

        Raises:
            ValueError: If ``translations`` is empty or its lists differ in length.
        """
        self.translations = translations
        self.current_lang = default_lang
        self.default_lang = default_lang
        # Validate that all translation lists have the same length
        self._validate_translations()
        # Build the reverse mapping used for fast text lookups
        self._build_reverse_mapping()

    def _validate_translations(self):
        """Validate that all translation lists have the same length"""
        if not self.translations:
            raise ValueError("Translations dictionary cannot be empty")

        # Get the length of the first list as reference
        first_lang = next(iter(self.translations))
        expected_length = len(self.translations[first_lang])

        # Check if all lists have the same length
        for lang, items in self.translations.items():
            if len(items) != expected_length:
                raise ValueError(
                    f"Translation list for '{lang}' has {len(items)} items, "
                    f"expected {expected_length} items (same as '{first_lang}')"
                )

    def _build_reverse_mapping(self):
        """Build the lowercase-text -> (language, index) lookup table.

        When two entries lowercase to the same text, the one visited last
        (later language, later index) wins.
        """
        self.text_to_index = {}  # lowercased text -> (language, index)

        for lang, items in self.translations.items():
            for index, text in enumerate(items):
                self.text_to_index[text.lower()] = (lang, index)

    def set_language(self, lang: str):
        """Switch the current language and return ``self`` for chaining.

        If ``lang`` has no translations a warning is printed and the current
        language is left unchanged. ``self`` is returned in both cases so a
        chained call such as ``ml.set_language(x).get_items()`` never fails on
        ``None``.
        """
        if lang in self.translations:
            self.current_lang = lang
        else:
            print(f"Warning: Language '{lang}' not available, using default")
        return self

    def get_items(self, lang: Optional[str] = None) -> List[str]:
        """Return the list for ``lang`` (or the current language).

        Falls back to the default language's list when the requested language
        has no translations.
        """
        target_lang = lang or self.current_lang
        return self.translations.get(target_lang, self.translations[self.default_lang])

    def get_item(self, index: int, lang: Optional[str] = None) -> str:
        """Return the entry at ``index`` for ``lang`` (or the current language).

        Raises:
            IndexError: If ``index`` is negative or past the end of the list.
        """
        items = self.get_items(lang)
        if 0 <= index < len(items):
            return items[index]
        raise IndexError("List index out of range")

    def translate_text(self, text: str, target_lang: Optional[str] = None) -> Optional[str]:
        """Translate ``text`` into another language via its index.

        Args:
            text: Text to translate; matched case-insensitively.
            target_lang: Target language code. When omitted, ``en`` text is
                translated to ``zh_CN`` and ``zh_CN`` text to ``en``.

        Returns:
            The translated text, or ``None`` when the text is unknown, the
            source language has no automatic counterpart, or the target
            language has no translations.
        """
        text_lower = text.lower()

        # Find which language and position the text belongs to
        if text_lower not in self.text_to_index:
            return None

        source_lang, index = self.text_to_index[text_lower]

        # No explicit target: pick the opposite of the en/zh_CN pair
        if target_lang is None:
            if source_lang == "en":
                target_lang = "zh_CN"  # English -> Chinese
            elif source_lang == "zh_CN":
                target_lang = "en"  # Chinese -> English
            else:
                return None

        # Fetch the entry at the same index in the target language
        if target_lang in self.translations:
            target_items = self.translations[target_lang]
            if index < len(target_items):
                return target_items[index]

        return None

    def get_translation_pair(self, text: str) -> Dict[str, str]:
        """Return the English/Chinese pair that ``text`` belongs to.

        Args:
            text: Text to look up; matched case-insensitively.

        Returns:
            A dict keyed by ``en`` and ``zh_CN`` (only the languages that
            exist), e.g. ``{'en': 'Administrator', 'zh_CN': '管理员'}``; an
            empty dict when the text is unknown.
        """
        text_lower = text.lower()

        if text_lower not in self.text_to_index:
            return {}

        source_lang, index = self.text_to_index[text_lower]

        result = {}
        for lang in ["en", "zh_CN"]:
            if lang in self.translations and index < len(self.translations[lang]):
                result[lang] = self.translations[lang][index]

        return result

    def translate_batch(self, texts: List[str], target_lang: Optional[str] = None) -> List[Optional[str]]:
        """Translate every text in ``texts`` with ``translate_text``.

        Args:
            texts: Texts to translate.
            target_lang: Target language code, forwarded unchanged.

        Returns:
            One result per input, ``None`` where no translation was found.
        """
        return [self.translate_text(text, target_lang) for text in texts]

    def __iter__(self):
        return iter(self.get_items())

    def __len__(self):
        return len(self.get_items())

    def __getitem__(self, index):
        return self.get_item(index)


if __name__ == "__main__":
    # Demo script: exercises MultiLangList with a few bilingual sample lists.
    # Define the bilingual (English / Chinese) sample data
    user_types_data = {
        "en": ["Administrator", "Regular User", "Guest", "Moderator", "Super Admin"],
        "zh_CN": ["管理员", "普通用户", "访客", "版主", "超级管理员"],
    }

    status_data = {
        "en": ["Active", "Inactive", "Pending", "Suspended", "Deleted"],
        "zh_CN": ["活跃", "非活跃", "待定", "暂停", "已删除"],
    }

    permission_data = {
        "en": ["Read", "Write", "Execute", "Delete", "Admin"],
        "zh_CN": ["读取", "写入", "执行", "删除", "管理"],
    }

    # Create the multi-language lists
    user_types = MultiLangList(user_types_data)
    status_list = MultiLangList(status_data)
    permissions = MultiLangList(permission_data)
    # Usage examples
    print("=== 基本翻译功能 ===")
    # Chinese -> English
    result1 = user_types.translate_text("管理员")
    print(f"'管理员' -> '{result1}'")  # Output: '管理员' -> 'Administrator'

    # English -> Chinese
    result2 = user_types.translate_text("Guest")
    print(f"'Guest' -> '{result2}'")  # Output: 'Guest' -> '访客'

    # Explicit target language
    result3 = user_types.translate_text("管理员", target_lang="en")
    print(f"'管理员' -> '{result3}' (指定英文)")  # Output: '管理员' -> 'Administrator' (指定英文)

    print("\n=== 获取中英文对照 ===")
    translation_pair = user_types.get_translation_pair("Administrator")
    print(f"'Administrator' 的中英文对照: {translation_pair}")
    # Output: {'en': 'Administrator', 'zh_CN': '管理员'}

    print("\n=== 批量翻译 ===")
    chinese_texts = ["管理员", "普通用户", "访客"]
    english_results = user_types.translate_batch(chinese_texts)
    print(f"批量翻译结果: {list(zip(chinese_texts, english_results))}")
    # Output: [('管理员', 'Administrator'), ('普通用户', 'Regular User'), ('访客', 'Guest')]

    print("\n=== 状态列表翻译 ===")
    status_result = status_list.translate_text("活跃")
    print(f"'活跃' -> '{status_result}'")  # Output: '活跃' -> 'Active'

    status_result2 = status_list.translate_text("Pending")
    print(f"'Pending' -> '{status_result2}'")  # Output: 'Pending' -> '待定'

    print("\n=== 权限翻译 ===")
    perm_result = permissions.translate_text("读取")
    print(f"'读取' -> '{perm_result}'")  # Output: '读取' -> 'Read'

    print("\n=== 错误处理 ===")
    not_found = user_types.translate_text("不存在的文本")
    print(f"不存在的文本翻译结果: {not_found}")  # Output: None

    print("\n=== 当前语言设置 ===")
    user_types.set_language("zh_CN")
    print(f"当前语言列表: {list(user_types)}")  # Prints the Chinese list

    user_types.set_language("en")
    print(f"切换后列表: {list(user_types)}")  # Prints the English list


================================================
FILE: weclone/utils/length_cdf.py
================================================
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict

import fire
from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.hparams import get_train_args
from llamafactory.model import load_tokenizer
from tqdm import tqdm

from weclone.utils.log import logger


def calculate_token_length(
    text: str,
    model_name_or_path: str = "./models/Qwen3-32B-AWQ",
    template: str = "qwen3",
) -> int:
    """Count the tokens the model's tokenizer produces for ``text``.

    The tokenizer is loaded through LlamaFactory's training-argument parser
    and ``load_tokenizer``; the text is encoded without special tokens.

    Args:
        text: Text to measure.
        model_name_or_path: Path or name of the model whose tokenizer is used.
        template: Chat template name passed to the argument parser.

    Returns:
        Number of tokens in the encoded text.
    """
    logger.info(f"Calculating text token length: {text[:50]}...")

    # Only model_args is needed; the remaining keys are supplied alongside it
    # to the training-argument parser.
    parser_input = {
        "stage": "sft",
        "model_name_or_path": model_name_or_path,
        "template": template,
        "dataset": "chat-sft",
        "output_dir": "dummy_dir",
        "do_train": True,
    }
    model_args, _data_args, _, _, _ = get_train_args(parser_input)

    tokenizer = load_tokenizer(model_args)["tokenizer"]

    # Encode the raw text directly, without adding special tokens
    token_length = len(tokenizer.encode(text, add_special_tokens=False))

    logger.info(f"Text token length: {token_length}")
    return token_length


def length_cdf(
    model_name_or_path: str = "./Qwen2.5-7B-Instruct",
    dataset: str = "chat-sft",
    dataset_dir: str = "./dataset/res_csv/sft",
    media_dir: str = "./dataset/media",
    template: str = "qwen",
    interval: int = 256,
    image_max_pixels: int = 768 * 768,
):
    r"""Log the cumulative distribution of input lengths in the dataset.

    Sample lengths (``len(input_ids)``) are grouped into buckets of
    ``interval`` tokens, and one line is logged per bucket with the
    cumulative sample count and percentage.

    Usage: export CUDA_VISIBLE_DEVICES=0
    python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
    """
    logger.info("Starting cutoff_len calculation......")

    parser_input = {
        "stage": "sft",
        "model_name_or_path": model_name_or_path,
        "dataset": dataset,
        "dataset_dir": dataset_dir,
        "template": template,
        # Large cutoff_len passed so the measured lengths are the ones reported below.
        "cutoff_len": 1_000_000,
        "preprocessing_num_workers": 16,
        "output_dir": "dummy_dir",
        "media_dir": media_dir,
        "image_max_pixels": int(image_max_pixels),
        "overwrite_cache": True,
        "do_train": True,
    }
    model_args, data_args, training_args, _, _ = get_train_args(parser_input)

    tokenizer_module = load_tokenizer(model_args)
    template_obj = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)  # type: ignore
    dataset_module = get_dataset(
        template_obj, model_args, data_args, training_args, "sft", **tokenizer_module
    )
    trainset = dataset_module["train_dataset"]  # type: ignore
    total_num = len(trainset)  # type: ignore

    # Bucket start (a multiple of `interval`) -> number of samples in the bucket
    bucket_counts = defaultdict(int)
    for input_ids in tqdm(trainset["input_ids"], desc="Collecting lengths"):  # type: ignore
        bucket_start = len(input_ids) // interval * interval
        bucket_counts[bucket_start] += 1

    count_accu, prob_accu = 0, 0
    logger.info(" cutoff_len configuration suggestions:")
    logger.warning("For multimodal tasks, please ensure cutoff_len is set to the maximum data length")
    for length, count in sorted(bucket_counts.items()):
        count_accu += count
        prob_accu += count / total_num * 100
        logger.info(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {length + interval}.")


if __name__ == "__main__":
    # Script entry point: hand length_cdf to python-fire, which builds the CLI from it.
    fire.Fire(length_cdf)


================================================
FILE: weclone/utils/log.py
================================================
import logging
import os
import sys
import time
from functools import wraps

from loguru import logger

# Remove every sink registered so far so the one added below is the only output.
logger.remove()

# Optional override of the initial level through the WC_LOG_LEVEL environment variable.
env_log_level = os.getenv("WC_LOG_LEVEL")

# Initialize basic log configuration, will be reconfigured later by configure_log_level_from_config
logger.add(
    sys.stderr,
    level="INFO" if not env_log_level else env_log_level.upper(),
    colorize=True,
    format="<green><b>[WeClone]</b></green> <level>{level.name[0]}</level> | <level>{time:HH:mm:ss}</level> | <level>{message}</level>",
)


class InterceptHandler(logging.Handler):
    """Standard-library logging handler that prints compact lines to stderr.

    Each emitted line has the form ``[logger name] | L | HH:MM:SS | message``,
    where ``L`` is the first letter of the level name.
    """

    def __init__(self, level=logging.INFO):
        super().__init__(level)

    def emit(self, record):
        """Print ``record`` to stderr unless it is below the handler's level."""
        # Check log level, only handle logs at specified level and above
        if record.levelno < self.level:
            return

        now = time.strftime("%H:%M:%S")
        reset = "\033[0m"
        # INFO and above get the cyan escape code; anything lower uses the reset code.
        if record.levelno >= logging.INFO:
            color = "\033[36m"
        else:
            color = "\033[0m"
        initial = record.levelname[0]
        line = f"[{record.name}] | {color}{initial}{reset} | {now} | {record.getMessage()}"
        print(line, file=sys.stderr)


# Route all standard-library logging through InterceptHandler. The root logger
# accepts every level (NOTSET); filtering is done by the handler's own level.
intercept_handler = InterceptHandler(level=logging.INFO)
logging.basicConfig(level=logging.NOTSET, force=True, handlers=[intercept_handler])


def capture_output(func):
    """Decorator that mirrors stdout/stderr into the loguru logger while ``func`` runs.

    During the call, ``sys.stdout`` and ``sys.stderr`` are replaced by tee
    objects that pass every chunk through to the original stream and also log
    each completed line (stdout at INFO, stderr at ERROR). Text after a
    carriage return replaces what came before it on the same line, so only the
    final state of a progress-bar style line is logged. The original streams
    are always restored and the temporary sink removed.

    Returns:
        The wrapped function; calling it returns whatever ``func`` returns.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Collects the plain text of INFO+ log messages for the duration of the
        # call; nothing in this function reads it back.
        log_sink_buffer = []

        def list_sink(message):
            log_sink_buffer.append(message.record["message"])

        sink_id = logger.add(list_sink, format="{message}", level="INFO")

        original_stdout = sys.stdout
        original_stderr = sys.stderr

        class OutputTeeToGlobalLog:
            """File-like object that forwards writes and logs complete lines."""

            def __init__(self, original_stream, log_method):
                self.original_stream = original_stream
                self.log_method = log_method
                self.current_line_content = ""  # Represents the current state of the line to be logged

            def write(self, data_chunk):
                self.original_stream.write(data_chunk)  # Pass through to console

                # A chunk that ends with a carriage return and has no newline
                # rewrites the current line: remember it, log nothing yet.
                if data_chunk.endswith("\r") and "\n" not in data_chunk:
                    self.current_line_content = data_chunk[:-1]  # Store without the trailing \r
                    return

                full_buffer = self.current_line_content + data_chunk
                lines_to_process = full_buffer.split("\n")

                # Every piece except the last was terminated by a newline.
                for i in range(len(lines_to_process) - 1):
                    line = lines_to_process[i]
                    final_content_of_line = line
                    # Keep only what follows the last carriage return.
                    last_cr = line.rfind("\r")
                    if last_cr != -1:
                        final_content_of_line = line[last_cr + 1 :]

                    # Braces are doubled because the keyword argument below
                    # presumably makes loguru run str.format on the message.
                    escaped_log = final_content_of_line.replace("{", "{{").replace("}", "}}")
                    if final_content_of_line.strip() or line:
                        self.log_method(escaped_log, raw=True)

                # The unterminated remainder waits for the next write or flush.
                self.current_line_content = lines_to_process[-1]

            def flush(self):
                self.original_stream.flush()
                if self.current_line_content:
                    final_content_of_line = self.current_line_content
                    last_cr = self.current_line_content.rfind("\r")
                    if last_cr != -1:
                        final_content_of_line = self.current_line_content[last_cr + 1 :]

                    escaped_log = final_content_of_line.replace("{", "{{").replace("}", "}}")
                    if final_content_of_line.strip() or self.current_line_content:
                        self.log_method(escaped_log, raw=True)
                    self.current_line_content = ""

        sys.stdout = OutputTeeToGlobalLog(original_stdout, logger.opt(raw=True).info)
        sys.stderr = OutputTeeToGlobalLog(original_stderr, logger.opt(raw=True).error)

        try:
            # Propagate the result so decorating a function does not silently
            # turn its return value into None.
            return func(*args, **kwargs)
        finally:
            sys.stdout = original_stdout
            sys.stderr = original_stderr
            logger.remove(sink_id)

    return wrapper


def configure_log_level_from_config():
    """Reconfigure logging from the ``log_level`` in the ``cli_args`` config section.

    Should be called after the config is loaded. Replaces all loguru sinks
    with a colourised stderr sink at the configured level and a rotating
    DEBUG-level file sink at ``logs/weclone.log``, then applies the same
    threshold to the standard-library ``intercept_handler``. If the config
    cannot be loaded a warning is logged and INFO is used.
    """
    log_level = "INFO"  # default value

    try:
        from weclone.utils.config import load_config

        cli_config = load_config(arg_type="cli_args")
        log_level = getattr(cli_config, "log_level", "INFO")
    except Exception as e:
        logger.warning(f"Unable to load log level from config, using default INFO level: {e}")

    level_name = log_level.upper()

    logger.remove()

    logger.add(
        sys.stderr,
        format="<green><b>[WeClone]</b></green> <level>{level.name[0]}</level> | <level>{time:HH:mm:ss}</level> | <level>{message}</level>",
        colorize=True,
        level=level_name,
    )

    logger.add(
        "logs/weclone.log",
        rotation="1 day",
        retention="7 days",
        compression="zip",
        level="DEBUG",
        format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}",
        encoding="utf-8",
        enqueue=True,
    )

    # Hand the stdlib handler loguru's numeric severity instead of the name:
    # names such as TRACE or SUCCESS are valid for loguru but unknown to the
    # logging module, whose setLevel would raise ValueError for them.
    intercept_handler.setLevel(logger.level(level_name).no)

    logger.info(f"Log level has been set to: {level_name}")


================================================
FILE: weclone/utils/retry.py
================================================
import random
import time
from functools import wraps
from typing import Callable, List, Optional

from weclone.utils.log import logger


def retry_on_http_error(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
    jitter: bool = True,
    retry_on_status: Optional[List[int]] = None,
    retry_on_exceptions: Optional[List[type]] = None,
):
    """Decorator factory that retries a call on selected HTTP statuses or exceptions.

    The wrapped function is called up to ``max_retries + 1`` times. A result
    exposing ``status_code`` is retried while that code is in
    ``retry_on_status``; once attempts run out the last response is returned.
    Exceptions of the listed types are retried and re-raised after the last
    attempt; any other exception is logged and re-raised immediately.

    Args:
        max_retries: Maximum number of retries after the first attempt.
        base_delay: Base delay in seconds.
        max_delay: Upper bound for the delay before jitter, in seconds.
        backoff_factor: Multiplier applied to the delay on each retry.
        jitter: Whether to add random jitter to the delay.
        retry_on_status: Status codes to retry; defaults to 429, 500, 502, 503, 504.
        retry_on_exceptions: Exception types to retry; defaults to
            ``ConnectionError`` and ``TimeoutError``.
    """
    if retry_on_status is None:
        retry_on_status = [429, 500, 502, 503, 504]

    if retry_on_exceptions is None:
        retry_on_exceptions = [ConnectionError, TimeoutError]

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                retries_remain = attempt < max_retries
                try:
                    result = func(*args, **kwargs)

                    # Anything that is not a retryable HTTP response is returned as-is.
                    if not hasattr(result, "status_code") or result.status_code not in retry_on_status:
                        return result

                    if not retries_remain:
                        logger.error(
                            f"HTTP请求在 {max_retries + 1} 次尝试后最终失败，状态码: {result.status_code}"
                        )
                        return result

                    delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)
                    logger.warning(
                        f"HTTP请求返回状态码 {result.status_code}，"
                        f"第 {attempt + 1}/{max_retries + 1} 次尝试，"
                        f"将在 {delay:.2f} 秒后重试..."
                    )
                    time.sleep(delay)

                except Exception as e:
                    is_retryable = any(isinstance(e, exc_type) for exc_type in retry_on_exceptions)

                    if not is_retryable:
                        logger.error(f"未知错误，不进行重试: {type(e).__name__}: {e}")
                        raise

                    if not retries_remain:
                        logger.error(f"请求在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}")
                        raise

                    delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)
                    logger.warning(
                        f"请求异常: {type(e).__name__}: {e}，"
                        f"第 {attempt + 1}/{max_retries + 1} 次尝试，"
                        f"将在 {delay:.2f} 秒后重试..."
                    )
                    time.sleep(delay)

            return None  # Only reached when the loop body never runs (max_retries < 0)

        return wrapper

    return decorator


def retry_openai_api(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
    jitter: bool = True,
):
    """Decorator factory that retries OpenAI API calls on transient failures.

    An exception counts as transient when its lowercased message contains one
    of a fixed set of markers (rate limiting, HTTP 429, server error, timeout,
    connection). Transient failures are retried with backoff up to
    ``max_retries`` times; everything else is re-raised immediately.
    """
    transient_markers = (
        "rate limit",
        "429",
        "too many requests",
        "server error",
        "timeout",
        "connection",
    )

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)

                except Exception as e:
                    # Decide from the message text whether this is a rate limit or temporary error
                    error_message = str(e).lower()
                    should_retry = any(marker in error_message for marker in transient_markers)

                    if not (should_retry and attempt < max_retries):
                        # The final-failure message is only logged once attempts are exhausted.
                        if attempt >= max_retries:
                            logger.error(
                                f"OpenAI API调用在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}"
                            )
                        raise

                    delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)
                    logger.warning(
                        f"OpenAI API调用失败: {type(e).__name__}: {e}，"
                        f"第 {attempt + 1}/{max_retries + 1} 次尝试，"
                        f"将在 {delay:.2f} 秒后重试..."
                    )
                    time.sleep(delay)

            return None

        return wrapper

    return decorator


def _calculate_delay(
    attempt: int, base_delay: float, max_delay: float, backoff_factor: float, jitter: bool
) -> float:
    """计算重试延迟时间"""
    delay = base_delay * (backoff_factor**attempt)
    delay = min(delay, max_delay)

    if jitter:
        # 添加±20%的随机抖动
        jitter_range = delay * 0.2
        delay += random.uniform(-jitter_range, jitter_range)
        delay = max(0, delay)  # 确保延迟不为负数

    return delay


class RetryConfig:
    """Bundle of retry parameters that can be applied to any function.

    Holds the same settings that ``retry_on_http_error`` accepts so that a
    retry policy can be defined once and reused.
    """

    def __init__(
        self,
        max_retries: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        backoff_factor: float = 2.0,
        jitter: bool = True,
        retry_on_status: Optional[List[int]] = None,
        retry_on_exceptions: Optional[List[type]] = None,
    ):
        """Store the retry parameters.

        Args:
            max_retries: Maximum number of retries after the first attempt.
            base_delay: Base delay in seconds.
            max_delay: Upper bound for the delay, in seconds.
            backoff_factor: Multiplier applied to the delay on each retry.
            jitter: Whether to add random jitter to the delay.
            retry_on_status: Status codes to retry; ``None`` selects
                429, 500, 502, 503, 504. An empty list is kept as given.
            retry_on_exceptions: Exception types to retry; ``None`` selects
                ``ConnectionError`` and ``TimeoutError``. An empty list is
                kept as given.
        """
        self.max_retries = max_retries
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.backoff_factor = backoff_factor
        self.jitter = jitter
        # Compare against None rather than truthiness so an explicit empty
        # list ("retry on nothing") is not replaced by the defaults; this
        # matches the None checks in retry_on_http_error.
        if retry_on_status is None:
            retry_on_status = [429, 500, 502, 503, 504]
        if retry_on_exceptions is None:
            retry_on_exceptions = [ConnectionError, TimeoutError]
        self.retry_on_status = retry_on_status
        self.retry_on_exceptions = retry_on_exceptions

    def apply_to_function(self, func: Callable) -> Callable:
        """Return ``func`` wrapped by ``retry_on_http_error`` with this configuration."""
        return retry_on_http_error(
            max_retries=self.max_retries,
            base_delay=self.base_delay,
            max_delay=self.max_delay,
            backoff_factor=self.backoff_factor,
            jitter=self.jitter,
            retry_on_status=self.retry_on_status,
            retry_on_exceptions=self.retry_on_exceptions,
        )(func)


# Predefined retry configurations
# More attempts with short, slowly growing delays (0.5 s base, x1.5, capped at 30 s).
AGGRESSIVE_RETRY = RetryConfig(
    max_retries=5,
    base_delay=0.5,
    max_delay=30.0,
    backoff_factor=1.5,
)

# Fewer attempts with longer delays (2 s base, x2, capped at 10 s).
CONSERVATIVE_RETRY = RetryConfig(
    max_retries=2,
    base_delay=2.0,
    max_delay=10.0,
    backoff_factor=2.0,
)

# Three retries (1 s base, x2, capped at 60 s) with the retryable status codes spelled out.
API_RETRY = RetryConfig(
    max_retries=3,
    base_delay=1.0,
    max_delay=60.0,
    backoff_factor=2.0,
    retry_on_status=[429, 500, 502, 503, 504],
)


================================================
FILE: weclone/utils/tools.py
================================================
def dict_to_argv(d):
    """Flatten a mapping into a command-line style argument list.

    Each key becomes ``--key``; a value that is not ``None`` follows it,
    converted with ``str``. A ``None`` value yields the bare flag only.
    """
    argv = []
    for option, value in d.items():
        flag = "--" + option
        argv.extend([flag] if value is None else [flag, str(value)])
    return argv