[
  {
    "path": ".cursor/rules/weclone-rules.mdc",
    "content": "---\ndescription: \nglobs: \nalwaysApply: true\n---\n---\ndescription: \nglobs: \nalwaysApply: true\n---\n\n# Your rule content\n- You can @ files here\n- The project uses uv as the package manager and pyproject.toml as the project configuration file.\n- You should write as few code comments as possible.\n- Prefer using the encapsulated logger `from weclone.utils.log import logger` for printing.\n"
  },
  {
    "path": ".github/issue-labeler.yml",
    "content": "# 添加 Discussion 标签\nDiscussion:\n  - '(讨论|交流|分享|意见|建议|思考|探讨|交换意见|brainstorm|discussion)'\n\n# 添加 bug 标签\nbug:\n  - '(bug|错误|问题|失败|崩溃|异常|报错|不工作|无法运行|broken|crash|error|exception|fails)'\n\n# 添加 chatbot 标签\nchatbot:\n  - '(聊天机器人|chatbot|chat bot|对话机器人|聊天助手|AI助手|机器人对话|bot|assistant)'\n\n# 添加 documentation 标签\ndocumentation:\n  - '(文档|说明|使用指南|指导|手册|教程|文档更新|documentation|docs|guide|tutorial|readme)'\n\n# 添加 duplicate 标签\nduplicate:\n  - '(重复|已有|duplicate|已经存在|已提交过|重复问题|重复报告|dup)'\n\n# 添加 feature 标签\nfeature:\n  - '(功能|特性|新增|增加|添加|实现|feature|enhancement|新功能|功能请求|feature request)'\n\n# 添加 good first issue 标签\ngood first issue:\n  - '(入门|简单|容易|新手|初学者|开始|first|beginner|starter|easy|简单任务|good first issue)'\n\n# 添加 help wanted 标签\nhelp wanted:\n  - '(需要帮助|寻求帮助|请求协助|help|求助|协助|帮忙|help wanted|need help|assistance)'\n\n# 添加 invalid 标签\ninvalid:\n  - '(无效|不适用|不相关|无关|错误提交|invalid|not relevant|irrelevant|not applicable)'\n\n# 添加 Mac 标签\nMac:\n  - '(Mac|MacOS|macOS|OSX|Mac系统|苹果系统|苹果电脑|MacBook)'\n\n# 添加 question 标签\nquestion:\n  - '(问题|疑问|如何|怎么|请问|是否|能否|可以吗|question|how to|what is|why)'\n\n# 添加 Windows 标签\nWindows:\n  - '(Windows|微软|Win10|Win11|Windows系统|微软系统|win)'\n"
  },
  {
    "path": ".github/weclone-release-event.json",
    "content": "{\n    \"action\": \"published\",\n    \"release\": {\n        \"id\": 123456789,\n        \"tag_name\": \"v0.2.24\",\n        \"target_commitish\": \"main\",\n        \"name\": \"v0.2.24\",\n        \"body\": \"## 🥰 What's Changed\\n  - Update torch version to 2.7.0 and vllm version to 0.9.1, switch offline inference to chat-style invocation\\n  - Add `test_model_args` and `vllm_args` configuration items to allow custom test dataset files\\n  - Add config file path option in CLI, support setting WECLONE_CONFIG_PATH environment variable\\n  - Update max_new_tokens and enable_thinking parameters in data cleaning strategy to optimize inference\\n  - Partial feature adaptation for qwen3\\n  \\n  ## 🐛 Bug fix\\n  fix #158 fix #83 fix #77 fix #69 \\n  \\n  **Full Changelog**: https://github.com/xming521/WeClone/compare/v0.2.23...v0.2.24\\n  \\n  ## 🥰 更新内容\\n  - 更新torch版本至2.7.0，vllm版本到0.9.1，离线推理改为chat方式调用\\n  - 添加`test_model_args` and `vllm_args`配置项，允许自定义测试集文件\\n  - CLI中添加配置文件路径选项，支持设置WECLONE_CONFIG_PATH环境变量\\n  - 更新数据清理策略中的max_new_tokens和enable_thinking参数以优化推理过程\\n  - 部分功能适配qwen3\",\n        \"draft\": false,\n        \"prerelease\": false,\n        \"created_at\": \"2024-01-15T10:30:00Z\",\n        \"published_at\": \"2024-01-15T10:30:00Z\",\n        \"author\": {\n            \"login\": \"xming521\",\n            \"id\": 12345,\n            \"avatar_url\": \"https://avatars.githubusercontent.com/u/12345?v=4\",\n            \"html_url\": \"https://github.com/xming521\"\n        },\n        \"html_url\": \"https://github.com/xming521/WeClone/releases/tag/v0.2.24\",\n        \"assets_url\": \"https://api.github.com/repos/xming521/WeClone/releases/123456789/assets\",\n        \"upload_url\": \"https://uploads.github.com/repos/xming521/WeClone/releases/123456789/assets{?name,label}\",\n        \"tarball_url\": \"https://api.github.com/repos/xming521/WeClone/tarball/v0.2.24\",\n        \"zipball_url\": 
\"https://api.github.com/repos/xming521/WeClone/zipball/v0.2.24\",\n        \"assets\": []\n    },\n    \"repository\": {\n        \"id\": 987654321,\n        \"name\": \"WeClone\",\n        \"full_name\": \"xming521/WeClone\",\n        \"owner\": {\n            \"login\": \"xming521\",\n            \"id\": 12345\n        },\n        \"private\": false,\n        \"html_url\": \"https://github.com/xming521/WeClone\",\n        \"description\": \"WeClone - AI Clone Repository\",\n        \"fork\": false,\n        \"created_at\": \"2023-01-01T00:00:00Z\",\n        \"updated_at\": \"2024-01-15T10:30:00Z\",\n        \"pushed_at\": \"2024-01-15T10:25:00Z\",\n        \"clone_url\": \"https://github.com/xming521/WeClone.git\",\n        \"default_branch\": \"main\"\n    },\n    \"sender\": {\n        \"login\": \"xming521\",\n        \"id\": 12345,\n        \"avatar_url\": \"https://avatars.githubusercontent.com/u/12345?v=4\",\n        \"html_url\": \"https://github.com/xming521\"\n    }\n}\n"
  },
  {
    "path": ".github/workflows/issue-labeler.yml",
    "content": "name: add labels to Issues\n\non:\n  issues:\n    types: [opened, edited]\n\n\njobs:\n  label_issues:\n    runs-on: ubuntu-latest\n    permissions:\n      issues: write\n      contents: read\n    steps:\n      - name: get_last_run_time\n        id: last_run\n        run: |\n          # 获取当前日期减去 1 天作为默认值（处理最近一天的 issues）\n          echo \"date=$(date -d '1 day ago' -u +\"%Y-%m-%dT%H:%M:%SZ\")\" >> $GITHUB_OUTPUT\n      \n      - name: RegEx Issue Labeler\n        uses: github/issue-labeler@v3.4\n        with:\n          include-title: 1\n          repo-token: \"${{ secrets.GITHUB_TOKEN }}\"\n          configuration-path: .github/issue-labeler.yml\n          enable-versioned-regex: 0\n          not-before: ${{ steps.last_run.outputs.date }}\n"
  },
  {
    "path": ".github/workflows/tg_release_notification.yml",
    "content": "name: Telegram Release Notification\n\non:\n  release:\n    types: [published]\n\njobs:\n  notify:\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v4\n        \n      - name: Setup Node.js\n        uses: actions/setup-node@v4\n        with:\n          node-version: '22'\n          \n      - name: Install telegramify-markdown\n        run: |\n          npm init -y\n          npm install telegramify-markdown\n          \n      - name: Convert release body to Telegram format\n        id: convert-markdown\n        run: |\n          # 先将release body保存到文件\n          cat > release_body.txt << 'RELEASE_BODY_EOF'\n          ${{ github.event.release.body }}\n          RELEASE_BODY_EOF\n          \n          # 然后创建转换脚本\n          cat > convert-release.js << 'EOF'\n          const telegramifyMarkdown = require('telegramify-markdown');\n          const fs = require('fs');\n\n          // 从文件读取release body内容（避免shell解析问题）\n          let releaseBody = '';\n          try {\n            releaseBody = fs.readFileSync('release_body.txt', 'utf8');\n          } catch (error) {\n            console.error('读取release body失败:', error);\n            releaseBody = process.env.RELEASE_BODY || '';\n          }\n          \n          console.log('=== 原始release body ===');\n          console.log(releaseBody);\n\n          // 转换为Telegram格式\n          const telegramBody = telegramifyMarkdown(releaseBody);\n          \n          // 构建完整的消息\n          const tagName = process.env.TAG_NAME || '';\n          const releaseUrl = process.env.RELEASE_URL || '';\n          \n          const fullMessage = `🚀 *WeClone New Version Released*\n          🏷️ *Version*: \\`${tagName}\\`\n          🔗 *Link*: [Github Release](${releaseUrl})\n          📋 *Release Notes*:\n          ${telegramBody}`;\n          \n          // 输出到GitHub Actions\n          console.log('转换后的消息:');\n          console.log(fullMessage);\n          \n          // 
将消息保存到环境变量\n          fs.writeFileSync('telegram_message.txt', fullMessage);\n          EOF\n          \n          # Release fields reach this script through the step-level env: block below rather than\n          # being interpolated into double-quoted shell strings, where backticks, quotes or $ in\n          # the release notes would be executed or break the command.\n          \n          # 运行转换脚本\n          node convert-release.js\n          \n          # 读取转换后的消息并设置为输出\n          echo \"TELEGRAM_MESSAGE<<EOF\" >> $GITHUB_OUTPUT\n          cat telegram_message.txt >> $GITHUB_OUTPUT\n          echo \"EOF\" >> $GITHUB_OUTPUT\n        env:\n          RELEASE_BODY: ${{ github.event.release.body }}\n          TAG_NAME: ${{ github.event.release.tag_name }}\n          RELEASE_URL: ${{ github.event.release.html_url }}\n          REPO_NAME: ${{ github.repository }}\n          \n      # - name: Display converted message\n      #   run: |\n      #     echo \"=== 转换后的Telegram消息 ===\"\n      #     echo \"${{ steps.convert-markdown.outputs.TELEGRAM_MESSAGE }}\"\n          \n      - name: Send Telegram Message\n        uses: appleboy/telegram-action@master\n        with:\n          to: ${{ secrets.TELEGRAM_CHAT_ID }}\n          token: ${{ secrets.TELEGRAM_BOT_TOKEN }}\n          message: ${{ steps.convert-markdown.outputs.TELEGRAM_MESSAGE }}\n          format: markdown\n          disable_web_page_preview: false\n        \n"
  },
  {
    "path": ".gitignore",
    "content": "wandb/\nweclone_archive-my/\n**/pycache/\nevents.out.tfevents.*\n归档/\n*.pt\n*.npz\n*nohup.out\n*log.txt\n*cookie.bin\n*.gradio/\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n\n*.zip\nLLaMA-Factory\nchatglm3-6b\ncache\narchive\nmodel_output*\ndata/test\n.vscode\n*-my*.*\ntest-scripts-my/\n*.csv\n!tests/tests_data/test_person/test_0_730.csv\n!tests/tests_data/test_PII/test_0_730.csv\n*test.*\n*-exp.*\nexperiment/\n*users.json   \nSpark-TTS-0.5B/\nuv.lock\noutput*\n*.out\n\nQwen*/\nsettings.jsonc\nsettings.json\ndataset/blocked_words.json\ndataset/wechat/*\nmodels/*\n.secrets*\n.env*\n\n# Image files\ndataset/**/*.jpg\ndataset/**/*.jpeg\ndataset/**/*.png\ndataset/**/*.gif\ndataset/**/*.bmp\ndataset/**/*.webp\ndataset/**/*.svg\ndataset/**/*.ico\n\n\ndataset/*telegram*/*\n!*.gitkeep\nWC-exp/*\n\nmodeloutputs/*\n/tmp/*\ncache.pkl\nhfd.sh\nrpa_cache.pkl\nsettings-bot8006.jsonc\nmodels_final/*\n/data/*\n/llamaboard_cache\neval_Result/*\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "# .pre-commit-config.yaml\ndefault_install_hook_types: [pre-commit, prepare-commit-msg]\nci:\n  autofix_commit_msg: \":balloon: auto fixes by pre-commit hooks\"\n  autofix_prs: true\n  autoupdate_branch: master\n  autoupdate_schedule: monthly\n  autoupdate_commit_msg: \":balloon: pre-commit autoupdate hooks\"\n\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v6.0.0\n    hooks:\n      - id: check-ast # Python 语法检查\n      - id: check-added-large-files # 防止大文件\n        args: [\"--maxkb=25000\"]\n      - id: check-merge-conflict # 检查合并冲突\n      - id: check-yaml # YAML 语法检查\n      - id: check-toml # TOML 语法检查\n      - id: debug-statements # 防止调试语句\n      - id: end-of-file-fixer # 文件结尾修复\n      # - id: trailing-whitespace # 移除行尾空白\n      #   args: [--markdown-linebreak-ext=md]\n      - id: no-commit-to-branch # 保护主分支\n        args: [\"--branch\", \"main\", \"--branch\", \"master\"]\n      - id: mixed-line-ending # 检查混合行结束符\n        args: [\"--fix=lf\"]\n\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: v0.12.8\n    hooks:\n      - id: ruff\n        args: [--fix]\n      - id: ruff-format\n\n  - repo: https://github.com/pycqa/isort\n    rev: 6.0.1\n    hooks:\n      - id: isort\n        args: [\"--profile\", \"black\", \"--line-length\", \"120\"]\n\n  # - repo: https://github.com/PyCQA/bandit\n  #   rev: 1.8.3\n  #   hooks:\n  #     - id: bandit\n  #       name: Python 安全检查\n  #       args: [\"-c\", \"pyproject.toml\", \"-x\", \"tests\"]\n  #       additional_dependencies: [\"bandit[toml]\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "                    GNU AFFERO GENERAL PUBLIC LICENSE\n                       Version 3, 19 November 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n                            Preamble\n\n  The GNU Affero General Public License is a free, copyleft license for\nsoftware and other kinds of works, specifically designed to ensure\ncooperation with the community in the case of network server software.\n\n  The licenses for most software and other practical works are designed\nto take away your freedom to share and change the works.  By contrast,\nour General Public Licenses are intended to guarantee your freedom to\nshare and change all versions of a program--to make sure it remains free\nsoftware for all its users.\n\n  When we speak of free software, we are referring to freedom, not\nprice.  Our General Public Licenses are designed to make sure that you\nhave the freedom to distribute copies of free software (and charge for\nthem if you wish), that you receive source code or can get it if you\nwant it, that you can change the software or use pieces of it in new\nfree programs, and that you know you can do these things.\n\n  Developers that use our General Public Licenses protect your rights\nwith two steps: (1) assert copyright on the software, and (2) offer\nyou this License which gives you legal permission to copy, distribute\nand/or modify the software.\n\n  A secondary benefit of defending all users' freedom is that\nimprovements made in alternate versions of the program, if they\nreceive widespread use, become available for other developers to\nincorporate.  Many developers of free software are heartened and\nencouraged by the resulting cooperation.  
However, in the case of\nsoftware used on network servers, this result may fail to come about.\nThe GNU General Public License permits making a modified version and\nletting the public access it on a server without ever releasing its\nsource code to the public.\n\n  The GNU Affero General Public License is designed specifically to\nensure that, in such cases, the modified source code becomes available\nto the community.  It requires the operator of a network server to\nprovide the source code of the modified version running there to the\nusers of that server.  Therefore, public use of a modified version, on\na publicly accessible server, gives the public access to the source\ncode of the modified version.\n\n  An older license, called the Affero General Public License and\npublished by Affero, was designed to accomplish similar goals.  This is\na different license, not a version of the Affero GPL, but Affero has\nreleased a new version of the Affero GPL which permits relicensing under\nthis license.\n\n  The precise terms and conditions for copying, distribution and\nmodification follow.\n\n                       TERMS AND CONDITIONS\n\n  0. Definitions.\n\n  \"This License\" refers to version 3 of the GNU Affero General Public License.\n\n  \"Copyright\" also means copyright-like laws that apply to other kinds of\nworks, such as semiconductor masks.\n\n  \"The Program\" refers to any copyrightable work licensed under this\nLicense.  Each licensee is addressed as \"you\".  \"Licensees\" and\n\"recipients\" may be individuals or organizations.\n\n  To \"modify\" a work means to copy from or adapt all or part of the work\nin a fashion requiring copyright permission, other than the making of an\nexact copy.  
The resulting work is called a \"modified version\" of the\nearlier work or a work \"based on\" the earlier work.\n\n  A \"covered work\" means either the unmodified Program or a work based\non the Program.\n\n  To \"propagate\" a work means to do anything with it that, without\npermission, would make you directly or secondarily liable for\ninfringement under applicable copyright law, except executing it on a\ncomputer or modifying a private copy.  Propagation includes copying,\ndistribution (with or without modification), making available to the\npublic, and in some countries other activities as well.\n\n  To \"convey\" a work means any kind of propagation that enables other\nparties to make or receive copies.  Mere interaction with a user through\na computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays \"Appropriate Legal Notices\"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\ntells the user that there is no warranty for the work (except to the\nextent that warranties are provided), that licensees may convey the\nwork under this License, and how to view a copy of this License.  If\nthe interface presents a list of user commands or options, such as a\nmenu, a prominent item in the list meets this criterion.\n\n  1. Source Code.\n\n  The \"source code\" for a work means the preferred form of the work\nfor making modifications to it.  
\"Object code\" means any non-source\nform of a work.\n\n  A \"Standard Interface\" means an interface that either is an official\nstandard defined by a recognized standards body, or, in the case of\ninterfaces specified for a particular programming language, one that\nis widely used among developers working in that language.\n\n  The \"System Libraries\" of an executable work include anything, other\nthan the work as a whole, that (a) is included in the normal form of\npackaging a Major Component, but which is not part of that Major\nComponent, and (b) serves only to enable use of the work with that\nMajor Component, or to implement a Standard Interface for which an\nimplementation is available to the public in source code form.  A\n\"Major Component\", in this context, means a major essential component\n(kernel, window system, and so on) of the specific operating system\n(if any) on which the executable work runs, or a compiler used to\nproduce the work, or an object code interpreter used to run it.\n\n  The \"Corresponding Source\" for a work in object code form means all\nthe source code needed to generate, install, and (for an executable\nwork) run the object code and to modify the work, including scripts to\ncontrol those activities.  However, it does not include the work's\nSystem Libraries, or general-purpose tools or generally available free\nprograms which are used unmodified in performing those activities but\nwhich are not part of the work.  
For example, Corresponding Source\nincludes interface definition files associated with source files for\nthe work, and the source code for shared libraries and dynamically\nlinked subprograms that the work is specifically designed to require,\nsuch as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\n  The Corresponding Source need not include anything that users\ncan regenerate automatically from other parts of the Corresponding\nSource.\n\n  The Corresponding Source for a work in source code form is that\nsame work.\n\n  2. Basic Permissions.\n\n  All rights granted under this License are granted for the term of\ncopyright on the Program, and are irrevocable provided the stated\nconditions are met.  This License explicitly affirms your unlimited\npermission to run the unmodified Program.  The output from running a\ncovered work is covered by this License only if the output, given its\ncontent, constitutes a covered work.  This License acknowledges your\nrights of fair use or other equivalent, as provided by copyright law.\n\n  You may make, run and propagate covered works that you do not\nconvey, without conditions so long as your license otherwise remains\nin force.  You may convey covered works to others for the sole purpose\nof having them make modifications exclusively for you, or provide you\nwith facilities for running those works, provided that you comply with\nthe terms of this License in conveying all material for which you do\nnot control copyright.  Those thus making or running the covered works\nfor you must do so exclusively on your behalf, under your direction\nand control, on terms that prohibit them from making any copies of\nyour copyrighted material outside their relationship with you.\n\n  Conveying under any other circumstances is permitted solely under\nthe conditions stated below.  Sublicensing is not allowed; section 10\nmakes it unnecessary.\n\n  3. 
Protecting Users' Legal Rights From Anti-Circumvention Law.\n\n  No covered work shall be deemed part of an effective technological\nmeasure under any applicable law fulfilling obligations under article\n11 of the WIPO copyright treaty adopted on 20 December 1996, or\nsimilar laws prohibiting or restricting circumvention of such\nmeasures.\n\n  When you convey a covered work, you waive any legal power to forbid\ncircumvention of technological measures to the extent such circumvention\nis effected by exercising rights under this License with respect to\nthe covered work, and you disclaim any intention to limit operation or\nmodification of the work as a means of enforcing, against the work's\nusers, your or third parties' legal rights to forbid circumvention of\ntechnological measures.\n\n  4. Conveying Verbatim Copies.\n\n  You may convey verbatim copies of the Program's source code as you\nreceive it, in any medium, provided that you conspicuously and\nappropriately publish on each copy an appropriate copyright notice;\nkeep intact all notices stating that this License and any\nnon-permissive terms added in accord with section 7 apply to the code;\nkeep intact all notices of the absence of any warranty; and give all\nrecipients a copy of this License along with the Program.\n\n  You may charge any price or no price for each copy that you convey,\nand you may offer support or warranty protection for a fee.\n\n  5. Conveying Modified Source Versions.\n\n  You may convey a work based on the Program, or the modifications to\nproduce it from the Program, in the form of source code under the\nterms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified\n    it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is\n    released under this License and any conditions added under section\n    7.  
This requirement modifies the requirement in section 4 to\n    \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this\n    License to anyone who comes into possession of a copy.  This\n    License will therefore apply, along with any applicable section 7\n    additional terms, to the whole of the work, and all its parts,\n    regardless of how they are packaged.  This License gives no\n    permission to license the work in any other way, but it does not\n    invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display\n    Appropriate Legal Notices; however, if the Program has interactive\n    interfaces that do not display Appropriate Legal Notices, your\n    work need not make them do so.\n\n  A compilation of a covered work with other separate and independent\nworks, which are not by their nature extensions of the covered work,\nand which are not combined with it such as to form a larger program,\nin or on a volume of a storage or distribution medium, is called an\n\"aggregate\" if the compilation and its resulting copyright are not\nused to limit the access or legal rights of the compilation's users\nbeyond what the individual works permit.  Inclusion of a covered work\nin an aggregate does not cause this License to apply to the other\nparts of the aggregate.\n\n  6. 
Conveying Non-Source Forms.\n\n  You may convey a covered work in object code form under the terms\nof sections 4 and 5, provided that you also convey the\nmachine-readable Corresponding Source under the terms of this License,\nin one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by the\n    Corresponding Source fixed on a durable physical medium\n    customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by a\n    written offer, valid for at least three years and valid for as\n    long as you offer spare parts or customer support for that product\n    model, to give anyone who possesses the object code either (1) a\n    copy of the Corresponding Source for all the software in the\n    product that is covered by this License, on a durable physical\n    medium customarily used for software interchange, for a price no\n    more than your reasonable cost of physically performing this\n    conveying of source, or (2) access to copy the\n    Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the\n    written offer to provide the Corresponding Source.  This\n    alternative is allowed only occasionally and noncommercially, and\n    only if you received the object code with such an offer, in accord\n    with subsection 6b.\n\n    d) Convey the object code by offering access from a designated\n    place (gratis or for a charge), and offer equivalent access to the\n    Corresponding Source in the same way through the same place at no\n    further charge.  You need not require recipients to copy the\n    Corresponding Source along with the object code.  
If the place to\n    copy the object code is a network server, the Corresponding Source\n    may be on a different server (operated by you or a third party)\n    that supports equivalent copying facilities, provided you maintain\n    clear directions next to the object code saying where to find the\n    Corresponding Source.  Regardless of what server hosts the\n    Corresponding Source, you remain obligated to ensure that it is\n    available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided\n    you inform other peers where the object code and Corresponding\n    Source of the work are being offered to the general public at no\n    charge under subsection 6d.\n\n  A separable portion of the object code, whose source code is excluded\nfrom the Corresponding Source as a System Library, need not be\nincluded in conveying the object code work.\n\n  A \"User Product\" is either (1) a \"consumer product\", which means any\ntangible personal property which is normally used for personal, family,\nor household purposes, or (2) anything designed or sold for incorporation\ninto a dwelling.  In determining whether a product is a consumer product,\ndoubtful cases shall be resolved in favor of coverage.  For a particular\nproduct received by a particular user, \"normally used\" refers to a\ntypical or common use of that class of product, regardless of the status\nof the particular user or of the way in which the particular user\nactually uses, or expects or is expected to use, the product.  
A product\nis a consumer product regardless of whether the product has substantial\ncommercial, industrial or non-consumer uses, unless such uses represent\nthe only significant mode of use of the product.\n\n  \"Installation Information\" for a User Product means any methods,\nprocedures, authorization keys, or other information required to install\nand execute modified versions of a covered work in that User Product from\na modified version of its Corresponding Source.  The information must\nsuffice to ensure that the continued functioning of the modified object\ncode is in no case prevented or interfered with solely because\nmodification has been made.\n\n  If you convey an object code work under this section in, or with, or\nspecifically for use in, a User Product, and the conveying occurs as\npart of a transaction in which the right of possession and use of the\nUser Product is transferred to the recipient in perpetuity or for a\nfixed term (regardless of how the transaction is characterized), the\nCorresponding Source conveyed under this section must be accompanied\nby the Installation Information.  But this requirement does not apply\nif neither you nor any third party retains the ability to install\nmodified object code on the User Product (for example, the work has\nbeen installed in ROM).\n\n  The requirement to provide Installation Information does not include a\nrequirement to continue to provide support service, warranty, or updates\nfor a work that has been modified or installed by the recipient, or for\nthe User Product in which it has been modified or installed.  
Access to a\nnetwork may be denied when the modification itself materially and\nadversely affects the operation of the network or violates the rules and\nprotocols for communication across the network.\n\n  Corresponding Source conveyed, and Installation Information provided,\nin accord with this section must be in a format that is publicly\ndocumented (and with an implementation available to the public in\nsource code form), and must require no special password or key for\nunpacking, reading or copying.\n\n  7. Additional Terms.\n\n  \"Additional permissions\" are terms that supplement the terms of this\nLicense by making exceptions from one or more of its conditions.\nAdditional permissions that are applicable to the entire Program shall\nbe treated as though they were included in this License, to the extent\nthat they are valid under applicable law.  If additional permissions\napply only to part of the Program, that part may be used separately\nunder those permissions, but the entire Program remains governed by\nthis License without regard to the additional permissions.\n\n  When you convey a copy of a covered work, you may at your option\nremove any additional permissions from that copy, or from any part of\nit.  (Additional permissions may be written to require their own\nremoval in certain cases when you modify the work.)  
You may place\nadditional permissions on material, added by you to a covered work,\nfor which you have or can give appropriate copyright permission.\n\n  Notwithstanding any other provision of this License, for material you\nadd to a covered work, you may (if authorized by the copyright holders of\nthat material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the\n    terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or\n    author attributions in that material or in the Appropriate Legal\n    Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or\n    requiring that modified versions of such material be marked in\n    reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or\n    authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some\n    trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that\n    material by anyone who conveys the material (or modified versions of\n    it) with contractual assumptions of liability to the recipient, for\n    any liability that these contractual assumptions directly impose on\n    those licensors and authors.\n\n  All other non-permissive additional terms are considered \"further\nrestrictions\" within the meaning of section 10.  If the Program as you\nreceived it, or any part of it, contains a notice stating that it is\ngoverned by this License along with a term that is a further\nrestriction, you may remove that term.  
If a license document contains\na further restriction but permits relicensing or conveying under this\nLicense, you may add to a covered work material governed by the terms\nof that license document, provided that the further restriction does\nnot survive such relicensing or conveying.\n\n  If you add terms to a covered work in accord with this section, you\nmust place, in the relevant source files, a statement of the\nadditional terms that apply to those files, or a notice indicating\nwhere to find the applicable terms.\n\n  Additional terms, permissive or non-permissive, may be stated in the\nform of a separately written license, or stated as exceptions;\nthe above requirements apply either way.\n\n  8. Termination.\n\n  You may not propagate or modify a covered work except as expressly\nprovided under this License.  Any attempt otherwise to propagate or\nmodify it is void, and will automatically terminate your rights under\nthis License (including any patent licenses granted under the third\nparagraph of section 11).\n\n  However, if you cease all violation of this License, then your\nlicense from a particular copyright holder is reinstated (a)\nprovisionally, unless and until the copyright holder explicitly and\nfinally terminates your license, and (b) permanently, if the copyright\nholder fails to notify you of the violation by some reasonable means\nprior to 60 days after the cessation.\n\n  Moreover, your license from a particular copyright holder is\nreinstated permanently if the copyright holder notifies you of the\nviolation by some reasonable means, this is the first time you have\nreceived notice of violation of this License (for any work) from that\ncopyright holder, and you cure the violation prior to 30 days after\nyour receipt of the notice.\n\n  Termination of your rights under this section does not terminate the\nlicenses of parties who have received copies or rights from you under\nthis License.  
If your rights have been terminated and not permanently\nreinstated, you do not qualify to receive new licenses for the same\nmaterial under section 10.\n\n  9. Acceptance Not Required for Having Copies.\n\n  You are not required to accept this License in order to receive or\nrun a copy of the Program.  Ancillary propagation of a covered work\noccurring solely as a consequence of using peer-to-peer transmission\nto receive a copy likewise does not require acceptance.  However,\nnothing other than this License grants you permission to propagate or\nmodify any covered work.  These actions infringe copyright if you do\nnot accept this License.  Therefore, by modifying or propagating a\ncovered work, you indicate your acceptance of this License to do so.\n\n  10. Automatic Licensing of Downstream Recipients.\n\n  Each time you convey a covered work, the recipient automatically\nreceives a license from the original licensors, to run, modify and\npropagate that work, subject to this License.  You are not responsible\nfor enforcing compliance by third parties with this License.\n\n  An \"entity transaction\" is a transaction transferring control of an\norganization, or substantially all assets of one, or subdividing an\norganization, or merging organizations.  If propagation of a covered\nwork results from an entity transaction, each party to that\ntransaction who receives a copy of the work also receives whatever\nlicenses to the work the party's predecessor in interest had or could\ngive under the previous paragraph, plus a right to possession of the\nCorresponding Source of the work from the predecessor in interest, if\nthe predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  
For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\nrights granted under this License, and you may not initiate litigation\n(including a cross-claim or counterclaim in a lawsuit) alleging that\nany patent claim is infringed by making, using, selling, offering for\nsale, or importing the Program or any portion of it.\n\n  11. Patents.\n\n  A \"contributor\" is a copyright holder who authorizes use under this\nLicense of the Program or a work on which the Program is based.  The\nwork thus licensed is called the contributor's \"contributor version\".\n\n  A contributor's \"essential patent claims\" are all patent claims\nowned or controlled by the contributor, whether already acquired or\nhereafter acquired, that would be infringed by some manner, permitted\nby this License, of making, using, or selling its contributor version,\nbut do not include claims that would be infringed only as a\nconsequence of further modification of the contributor version.  For\npurposes of this definition, \"control\" includes the right to grant\npatent sublicenses in a manner consistent with the requirements of\nthis License.\n\n  Each contributor grants you a non-exclusive, worldwide, royalty-free\npatent license under the contributor's essential patent claims, to\nmake, use, sell, offer for sale, import and otherwise run, modify and\npropagate the contents of its contributor version.\n\n  In the following three paragraphs, a \"patent license\" is any express\nagreement or commitment, however denominated, not to enforce a patent\n(such as an express permission to practice a patent or covenant not to\nsue for patent infringement).  
To \"grant\" such a patent license to a\nparty means to make such an agreement or commitment not to enforce a\npatent against the party.\n\n  If you convey a covered work, knowingly relying on a patent license,\nand the Corresponding Source of the work is not available for anyone\nto copy, free of charge and under the terms of this License, through a\npublicly available network server or other readily accessible means,\nthen you must either (1) cause the Corresponding Source to be so\navailable, or (2) arrange to deprive yourself of the benefit of the\npatent license for this particular work, or (3) arrange, in a manner\nconsistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  \"Knowingly relying\" means you have\nactual knowledge that, but for the patent license, your conveying the\ncovered work in a country, or your recipient's use of the covered work\nin a country, would infringe one or more identifiable patents in that\ncountry that you have reason to believe are valid.\n\n  If, pursuant to or in connection with a single transaction or\narrangement, you convey, or propagate by procuring conveyance of, a\ncovered work, and grant a patent license to some of the parties\nreceiving the covered work authorizing them to use, propagate, modify\nor convey a specific copy of the covered work, then the patent license\nyou grant is automatically extended to all recipients of the covered\nwork and works based on it.\n\n  A patent license is \"discriminatory\" if it does not include within\nthe scope of its coverage, prohibits the exercise of, or is\nconditioned on the non-exercise of one or more of the rights that are\nspecifically granted under this License.  
You may not convey a covered\nwork if you are a party to an arrangement with a third party that is\nin the business of distributing software, under which you make payment\nto the third party based on the extent of your activity of conveying\nthe work, and under which the third party grants, to any of the\nparties who would receive the covered work from you, a discriminatory\npatent license (a) in connection with copies of the covered work\nconveyed by you (or copies made from those copies), or (b) primarily\nfor and in connection with specific products or compilations that\ncontain the covered work, unless you entered into that arrangement,\nor that patent license was granted, prior to 28 March 2007.\n\n  Nothing in this License shall be construed as excluding or limiting\nany implied license or other defenses to infringement that may\notherwise be available to you under applicable patent law.\n\n  12. No Surrender of Others' Freedom.\n\n  If conditions are imposed on you (whether by court order, agreement or\notherwise) that contradict the conditions of this License, they do not\nexcuse you from the conditions of this License.  If you cannot convey a\ncovered work so as to satisfy simultaneously your obligations under this\nLicense and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you\nto collect a royalty for further conveying from those to whom you convey\nthe Program, the only way you could satisfy both those terms and this\nLicense would be to refrain entirely from conveying the Program.\n\n  13. 
Remote Network Interaction; Use with the GNU General Public License.\n\n  Notwithstanding any other provision of this License, if you modify the\nProgram, your modified version must prominently offer all users\ninteracting with it remotely through a computer network (if your version\nsupports such interaction) an opportunity to receive the Corresponding\nSource of your version by providing access to the Corresponding Source\nfrom a network server at no charge, through some standard or customary\nmeans of facilitating copying of software.  This Corresponding Source\nshall include the Corresponding Source for any work covered by version 3\nof the GNU General Public License that is incorporated pursuant to the\nfollowing paragraph.\n\n  Notwithstanding any other provision of this License, you have\npermission to link or combine any covered work with a work licensed\nunder version 3 of the GNU General Public License into a single\ncombined work, and to convey the resulting work.  The terms of this\nLicense will continue to apply to the part which is the covered work,\nbut the work with which it is combined will remain governed by version\n3 of the GNU General Public License.\n\n  14. Revised Versions of this License.\n\n  The Free Software Foundation may publish revised and/or new versions of\nthe GNU Affero General Public License from time to time.  Such new versions\nwill be similar in spirit to the present version, but may differ in detail to\naddress new problems or concerns.\n\n  Each version is given a distinguishing version number.  If the\nProgram specifies that a certain numbered version of the GNU Affero General\nPublic License \"or any later version\" applies to it, you have the\noption of following the terms and conditions either of that numbered\nversion or of any later version published by the Free Software\nFoundation.  
If the Program does not specify a version number of the\nGNU Affero General Public License, you may choose any version ever published\nby the Free Software Foundation.\n\n  If the Program specifies that a proxy can decide which future\nversions of the GNU Affero General Public License can be used, that proxy's\npublic statement of acceptance of a version permanently authorizes you\nto choose that version for the Program.\n\n  Later license versions may give you additional or different\npermissions.  However, no additional obligations are imposed on any\nauthor or copyright holder as a result of your choosing to follow a\nlater version.\n\n  15. Disclaimer of Warranty.\n\n  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM\nIS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\nALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n  16. Limitation of Liability.\n\n  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\nWILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\nTHE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\nGENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\nUSE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\nDATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\nPARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\nEVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\nSUCH DAMAGES.\n\n  17. 
Interpretation of Sections 15 and 16.\n\n  If the disclaimer of warranty and limitation of liability provided\nabove cannot be given local legal effect according to their terms,\nreviewing courts shall apply local law that most closely approximates\nan absolute waiver of all civil liability in connection with the\nProgram, unless a warranty or assumption of liability accompanies a\ncopy of the Program in return for a fee.\n\n                     END OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\n  If you develop a new program, and you want it to be of the greatest\npossible use to the public, the best way to achieve this is to make it\nfree software which everyone can redistribute and change under these terms.\n\n  To do so, attach the following notices to the program.  It is safest\nto attach them to the start of each source file to most effectively\nstate the exclusion of warranty; and each file should have at least\nthe \"copyright\" line and a pointer to where the full notice is found.\n\n    <one line to give the program's name and a brief idea of what it does.>\n    Copyright (C) <year>  <name of author>\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU Affero General Public License as published\n    by the Free Software Foundation, either version 3 of the License, or\n    (at your option) any later version.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU Affero General Public License for more details.\n\n    You should have received a copy of the GNU Affero General Public License\n    along with this program.  
If not, see <https://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\n  If your software can interact with users remotely through a computer\nnetwork, you should also make sure that it provides a way for users to\nget its source.  For example, if your program is a web application, its\ninterface could display a \"Source\" link that leads users to an archive\nof the code.  There are many ways you could offer source, and different\nsolutions will be better for different programs; see section 13 for the\nspecific requirements.\n\n  You should also get your employer (if you work as a programmer) or school,\nif any, to sign a \"copyright disclaimer\" for the program, if necessary.\nFor more information on this, and how to apply and follow the GNU AGPL, see\n<https://www.gnu.org/licenses/>.\n"
  },
  {
    "path": "README.md",
    "content": "![download](https://github.com/user-attachments/assets/cd4a87c6-1649-4ce5-bce8-bd5b08b278de)\n\n<h3 align=\"center\">🚀 One-stop solution for creating your digital avatar from chat history 💡</h3>  \n\n<div align=\"center\">\n\n[![GitHub stars](https://img.shields.io/github/stars/xming521/WeClone?style=for-the-badge&logo=github&label=Stars&logoColor=white&color=ffda65)](https://github.com/xming521/WeClone/stargazers)\n[![GitHub release](https://img.shields.io/github/v/release/xming521/WeClone?style=for-the-badge&logo=github&label=Release&logoColor=white&color=06d094)](https://github.com/xming521/WeClone/releases)\n[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/+JEdak4m0XEQ3NGNl)\n[![Twitter](https://img.shields.io/badge/Twitter-@weclone567-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/weclone567)\n[![小红书](https://img.shields.io/badge/WeClone-FE2C55?style=for-the-badge&logo=xiaohongshu&logoColor=white)](https://www.xiaohongshu.com/user/profile/628109730000000021029de4)\n<a href=\"https://qm.qq.com/cgi-bin/qm/qr?k=wNdgbOVT6oFOJ2wlMLsolUXErW9ESLpk&jump_from=webapi&authKey=z/reOp6YLyvR4Tl2k2nYMsLoMC3w9/99ucgKMX0oRGlxDV/WbYnvq2QxODoIkfxn\" target=\"_blank\" style=\"text-decoration: none;\">\n  <img src=\"https://img.shields.io/badge/QQ群-708067078-12B7F5?style=for-the-badge&logo=qq&logoColor=white\" alt=\"WeClone①\" title=\"WeClone①\">\n</a>\n\n\n<a href=\"https://hellogithub.com/repository/12ab209b56cb4cfd885c8cfd4cfdd53e\" target=\"_blank\"><img src=\"https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=12ab209b56cb4cfd885c8cfd4cfdd53e&claim_uid=RThlPDoGrFvdMY5\" alt=\"Featured｜HelloGitHub\" style=\"width: 150px; height: 28px;\" /></a>\n<a href=\"https://trendshift.io/repositories/13759\" target=\"_blank\"><img src=\"https://trendshift.io/api/badge/repositories/13759\" alt=\"xming521%2FWeClone | Trendshift\" style=\"width: 220px; height: 50px;\" 
/></a>\n<a href=\"https://deepwiki.com/xming521/WeClone\"><img src=\"https://deepwiki.com/badge.svg\" alt=\"Ask DeepWiki\"  style=\"width: 134px; height: 23px;margin-bottom: 3px;\"></a>\n</div>\n\n<p align=\"center\">\n  <a href=\"https://github.com/xming521/WeClone/blob/master/README_zh.md\" target=\"_blank\">简体中文</a>｜\n  English｜\n  <a href=\"https://www.weclone.love/\" target=\"_blank\"> Project Homepage </a> ｜\n  <a href=\"https://docs.weclone.love/docs/introduce/what-is-weclone.html\" target=\"_blank\"> Documentation </a> \n</p>\n\n> [!IMPORTANT]\n> ### Telegram is now supported as a data source!\n\n## ✨Core Features\n- 💫 Complete end-to-end solution for creating digital avatars, including chat data export, preprocessing, model training, and deployment\n- 💬 Fine-tune LLM using chat history with support for image modal data, infusing it with that authentic \"flavor\"\n- 🔗 Integrate with Telegram, WhatsApp (coming soon) to create your own digital avatar\n- 🛡️ Privacy information filtering with localized fine-tuning and deployment for secure and controllable data\n\n## 📋Features & Notes\n\n### Data Source Platform Support\n\n| Platform | Text | Images | Voice | Video | Animated Emojis/Stickers | Links (Sharing) | Quote | Forward | Location | Files |\n|----------|------|--------|-------|-------|-----------------|-----------------|-------|---------|----------|-------|\n| Telegram | ✅ | ✅ | ❌ | ❌ | ⚠️Convert to Emoji | ❌ | ❌ | ✅ | ✅ | ❌ |\n| WhatsApp | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |\n| Discord | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |\n| Slack | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |\n \n### Deployment Platform Support\n\n| Platform | Deployment Support |\n|----------|--------------------|\n| Telegram | ✅ |\n| WhatsApp | 🚧 |\n| Discord | ✅ |\n| Slack | ✅ |\n\n> [!IMPORTANT]\n> - WeClone is still in a rapid iteration phase; current performance does not represent final results.  
\n> - LLM fine-tuning effectiveness largely depends on model size, quantity and quality of chat data. Theoretically, larger models with more data yield better results.\n> - The performance of the 7B model is average, while models with 14B or more parameters tend to deliver better results.   \n> - Windows environment has not been rigorously tested. You can use WSL as the runtime environment.\n\n### Recent Updates\n[25/07/10] Data source added Telegram   \n[25/06/05] Support for image modal data fine-tuning    \n\n### Online Fine-Tuning\n- Big Model Lab (Lab4AI) (with 50 CNY voucher): https://www.lab4ai.cn/project/detail?utm_source=weclone1&id=ab83d14684fa45d197f67eddb3d8316c&type=project\n\n### Hardware Requirements\n\nThe project uses Qwen2.5-VL-7B-Instruct model by default with LoRA method for SFT stage fine-tuning. You can also use other models and methods supported by [LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory/tree/main#supported-models).\n\nEstimated VRAM requirements: \n| Method                          | Precision |   7B  |  14B  |  30B  |   70B  |   `x`B  |\n| ------------------------------- | --------- | ----- | ----- | ----- | ------ | ------- |\n| Full (`bf16` or `fp16`)         |    32     | 120GB | 240GB | 600GB | 1200GB | `18x`GB |\n| Full (`pure_bf16`)              |    16     |  60GB | 120GB | 300GB |  600GB |  `8x`GB |\n| Freeze/LoRA/GaLore/APOLLO/BAdam |    16     |  16GB |  32GB |  64GB |  160GB |  `2x`GB |\n| QLoRA                           |     8     |  10GB |  20GB |  40GB |   80GB |   `x`GB |\n| QLoRA                           |     4     |   6GB |  12GB |  24GB |   48GB | `x/2`GB |\n| QLoRA                           |     2     |   4GB |   8GB |  16GB |   24GB | `x/4`GB |\n\n\n## Environment Setup\n1. CUDA installation (skip if already installed, **requires version 12.6 or above**)\n\n2. It is recommended to use [uv](https://docs.astral.sh/uv/) to install dependencies, which is a very fast Python environment manager. 
After installing uv, you can use the following commands to create a new Python environment and install dependencies. \n```bash\ngit clone https://github.com/xming521/WeClone.git && cd WeClone\nuv venv .venv --python=3.12\nsource .venv/bin/activate # windows .venv\\Scripts\\activate\nuv pip install --group main -e . \n```\n\n3. Copy the configuration file template and rename it to `settings.jsonc`, and make subsequent configuration changes in this file:\n\n```bash\ncp examples/tg.template.jsonc settings.jsonc\n```\n\n> [!NOTE]\n> Training and inference related configurations are unified in the file `settings.jsonc`\n\n4. Use the following command to test whether the CUDA environment is correctly configured and can be recognized by PyTorch (not needed for Mac):\n```bash\n  python -c \"import torch; print('CUDA Available:', torch.cuda.is_available());\"\n```\n\n5. (Optional) Install FlashAttention to accelerate training and inference: `uv pip install flash-attn --no-build-isolation`.\n\n## Model Download\nIt is recommended to use [Hugging Face](https://huggingface.co/docs/hub/models-downloading) to download models, or use the following command:\n```bash\ngit lfs install\ngit clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct models/Qwen2.5-VL-7B-Instruct\n```\n\n## Data Preparation\n\nPlease use [Telegram Desktop](https://desktop.telegram.org/) to export chat records. Click the top right corner in the chat interface, then click \"Export chat history\". Select Photos for message types and JSON for format. You can export multiple contacts (group chat records are not recommended), then place the exported `ChatExport_*` in the `./dataset/telegram` directory, meaning put different people's chat record folders together in `./dataset/telegram`.   
\n\n\n## Data Preprocessing\n- First, modify the `language`, `platform`, and `include_type` in the configuration file according to your needs.\n- If you use Telegram, you need to modify the `telegram_args.my_id` in the configuration file to your own Telegram user ID.\n- By default, the project uses Microsoft Presidio to remove `phone numbers, email addresses, credit card numbers, IP addresses, geographic location names, international bank account numbers, cryptocurrency wallet addresses, age information, and generic ID numbers` from the data, but it cannot guarantee 100% identification.\n- Therefore, a blocklist `blocked_words` is provided in `settings.jsonc`, allowing users to manually add words or phrases they want to filter (the entire sentence containing blocked words will be removed by default).\n\n> [!IMPORTANT]\n> 🚨 Please be sure to protect personal privacy and do not leak personal information!\n\n- Execute the following command to process the data. You can modify the `make_dataset_args` in settings.jsonc according to your own chat style.\n```bash\nweclone-cli make-dataset\n```\nMore Parameter Details: [Data Preprocessing](https://docs.weclone.love/docs/deploy/data_preprocessing.html#related-parameters)\n\n## Configure Parameters and Fine-tune Model\n\n- (Optional) Modify `model_name_or_path`, `template`, `lora_target` in `settings.jsonc` to select other locally downloaded models.   \n- Modify `per_device_train_batch_size` and `gradient_accumulation_steps` to adjust VRAM usage.  
\n- You can modify parameters like `num_train_epochs`, `lora_rank`, `lora_dropout` in `train_sft_args` based on your dataset's quantity and quality.\n\n### Single GPU Training\n```bash\nweclone-cli train-sft\n```\n\n### Multi-GPU Training\nUncomment the `deepspeed` line in `settings.jsonc` and use the following command for multi-GPU training:\n```bash\nuv pip install \"deepspeed<=0.16.9\"\ndeepspeed --num_gpus=number_of_gpus weclone/train/train_sft.py\n```\n\n### Simple Inference with Browser Demo\nTest suitable temperature and top_p values, then modify `infer_args` in settings.jsonc for subsequent inference use.\n```bash\nweclone-cli webchat-demo\n```\n\n### Inference Using API\n\n```bash\nweclone-cli server\n```\n\n### Test with Common Chat Questions\nDoes not include questions asking for personal information, only daily conversation. Test results are in test_result-my.txt.\n```bash\nweclone-cli server\nweclone-cli test-model\n```\n\n## 🖼️ Results Showcase\n> [!TIP] \n> **We're looking for interesting examples of native English speakers chatting with WeClone! Feel free to share them with us on Twitter.**  \n\n\n\n## 🤖 Deploy to Chat Bots\n### AstrBot\n[AstrBot](https://github.com/AstrBotDevs/AstrBot) is an easy-to-use multi-platform LLM chatbot and development framework ✨ Supports Discord, Telegram, Slack, Feishu and other platforms.      \n\nUsage steps:\n1. Deploy AstrBot\n2. Deploy messaging platforms like Discord, Telegram, Slack in AstrBot\n3. Execute `weclone-cli server` to start the API service\n4. Add a new service provider in AstrBot, select OpenAI type, fill in the API Base URL according to AstrBot's deployment method (e.g., for docker deployment it might be http://172.17.0.1:8005/v1), fill in the model as gpt-3.5-turbo, and enter any API Key\n5. Tool calling is not supported after fine-tuning, please turn off the default tools first by sending the command: `/tool off_all` on the messaging platform, otherwise the fine-tuned effect won't be visible.\n6. 
Set the system prompt in AstrBot according to the default_system used during fine-tuning.\n![5](https://github.com/user-attachments/assets/19de7072-076a-4cdf-8ae6-46b9b89f536a)\n> [!IMPORTANT]\n> Check the api_service logs to ensure that the large model service request parameters are consistent with those used during fine-tuning as much as possible, and turn off all tool plugin capabilities.\n\n### LangBot\n\n[LangBot](https://github.com/langbot-app/LangBot) is an easy-to-use open-source LLM chatbot platform suitable for various scenarios. It connects to various global instant messaging platforms. You can set up your IM bot in just 5 minutes.\n\n<img width=\"400px\" alt=\"image\" src=\"https://github.com/user-attachments/assets/de44e6e3-3a53-44d9-af76-96364cfca30f\" />\n\n1. [Deploy LangBot](https://github.com/RockChinQ/LangBot/blob/master/README_EN.md#-getting-started)\n2. Add a bot (e.g., Discord, Telegram, Slack, Lark) in LangBot\n3. Execute `weclone-cli server` to start the WeClone API service\n4. Add a new model in the model page, name it `gpt-3.5-turbo`, select OpenAI as the provider, fill in the request URL as WeClone's address, and enter any API Key. For detailed connection methods, refer to the [documentation](https://docs.langbot.app/en/workshop/network-details.html).\n\n<img width=\"400px\" alt=\"image\" src=\"https://github.com/user-attachments/assets/835853ab-6ddc-459e-ae21-b04c38a85b5b\" />\n\n5. 
Select the model you just added in the pipeline configuration, or modify the prompt configuration\n\n<img width=\"400px\" alt=\"image\" src=\"https://github.com/user-attachments/assets/da61342d-84f9-4f02-87bc-3d4c7cdf187c\" />\n\n\n## 📌 Roadmap\n- [ ] Support more data sources\n- [ ] Richer context: including contextual conversations, chat participant information, time, etc.\n- [ ] Memory support\n- [ ] Multimodal support: image support already implemented\n- [ ] Data augmentation\n- [ ] GUI support\n- [ ] COT (Chain of Thought) thinking support\n\n## Troubleshooting\n#### [Official Documentation FAQ](https://docs.weclone.love/docs/introduce/FAQ.html)    \nIt is also recommended to use [DeepWiki](https://deepwiki.com/xming521/WeClone) for problem solving.\n\n\n## ❤️ Contributing\n\nAny Issues/Pull Requests are welcome!\n\nYou can contribute by checking Issues or helping review PRs (Pull Requests). For new feature additions, please discuss through Issues first.   \nDevelopment environment:\n```bash\nuv pip install --group dev -e .\npre-commit install\n```\n\nThe project uses `pytest` for testing, `pyright` for type checking, and `ruff` for code formatting.   \nBefore submitting your code, you should run `pytest tests` to ensure all tests pass.\n\n\n## 🙏 Acknowledgments\n\nThanks to the following code contributors and other community members for their contributions\n\n<a href=\"https://github.com/xming521/WeClone/graphs/contributors\">\n  <img src=\"https://contrib.rocks/image?repo=xming521/WeClone\" />\n</a>\n\nThis project also benefits from excellent open source projects such as [PyWxDump](https://github.com/xaoyaoo/PyWxDump), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), [AstrBot](https://github.com/AstrBotDevs/AstrBot), [LangBot](https://github.com/RockChinQ/LangBot), and others.\n\n## ⚠️ Disclaimer\n> [!CAUTION]\n> **This project is for learning, research and experimental purposes only. 
There are significant risks in using it for production environments, please assess carefully. Do not use for illegal purposes, consequences are at your own risk.**\n\n> [!IMPORTANT]\n> #### WeClone is currently not partnered with any platform and has not issued any cryptocurrency. The only official website is: [weclone.love](https://www.weclone.love). Beware of imitations.\n\n<details>\n<summary>Click to view disclaimer terms</summary>\n\n### 1. Use at Your Own Risk\n- Users should fully understand and bear all related risks when using this project\n- **The project authors are not responsible for any direct or indirect losses arising from the use of this project**\n- Including but not limited to: data loss, financial loss, legal disputes, personal reputation damage, social relationship impact, psychological trauma, career development obstacles, business reputation damage, etc.\n\n### 2. Production Environment Risk Warning\n- **Use for commercial purposes or providing external services requires bearing all risks yourself**\n- All consequences that may result from production environment use (including but not limited to service interruption, data security issues, user complaints, legal liability, etc.) are entirely borne by the user\n- **It is recommended to conduct thorough testing, verification and risk assessment before using in production environments**\n\n### 3. Model Output Unreliability\n- Fine-tuned models may produce inaccurate, harmful or misleading content\n- Model outputs do not represent the views or intentions of real persons\n- Users should conduct manual review and verification of model outputs\n\n### 4. Data Security and Privacy\n- Users should ensure that uploaded chat records and other data comply with relevant laws and regulations\n- Users should obtain **appropriate authorization from data-related persons**\n- This project is not responsible for **data leakage or privacy infringement**\n\n### 5. 
Legal Compliance\n- **Users should ensure that using this project complies with local laws and regulations**\n- Involving artificial intelligence, data protection, intellectual property and other related laws\n- **Users bear the consequences of illegal use**\n\n### 6. Technical Support Limitations\n- This project is provided \"as is\" without any express or implied warranties\n- Authors do not promise to provide continuous technical support or maintenance\n- No guarantee of project stability, reliability or applicability\n\n## Usage Recommendations\n\n### Mandatory Bot Identity Identification\n**When using digital avatars generated by this project, it is strongly recommended to:**\n- Clearly identify as \"AI Bot\" or \"Digital Avatar\" at the beginning of each conversation\n- Prominently mark \"AI-generated content\" in the user interface\n- Avoid letting users mistake it for real human conversation, which could cause risks\n\n### Risk Assessment Recommendations\n\nIf you must use in production environments, it is recommended to:\n1. Conduct comprehensive security testing\n2. Establish complete content review mechanisms\n3. Develop emergency response plans\n4. Purchase appropriate insurance coverage\n5. Consult legal professionals for advice\n\n\nThis disclaimer may be revised with project updates, users should regularly check the latest version. 
Continuing to use this project indicates agreement with the latest disclaimer terms.\n\n**Once you download, clone, modify, distribute or use the code or models of this project in any way, it indicates that you have fully read, understood and agreed to unconditionally accept all terms of this disclaimer.**\n\n</details>\n\n**Please carefully read and understand all contents of this disclaimer, ensuring strict compliance with relevant regulations when using this project.**\n<br>  \n\n## ⭐ Star History\n> [!TIP] \n> If this project is helpful to you, or if you are interested in the future development of this project, please give the project a Star, thank you \n\n<div align=\"center\">\n\n[![Star History Chart](https://api.star-history.com/svg?repos=xming521/WeClone&type=Date)](https://www.star-history.com/#xming521/WeClone&Date)\n\n</div>\n"
  },
  {
    "path": "README_zh.md",
    "content": "![download](https://github.com/user-attachments/assets/cd4a87c6-1649-4ce5-bce8-bd5b08b278de)\n<h3 align=\"center\">🚀 One-stop solution for creating your digital avatar from chat history 💡</h3>  \n<h3 align=\"center\">🚀从聊天记录创造数字分身的一站式解决方案💡</h3>  \n\n\n<div align=\"center\">\n\n[![GitHub stars](https://img.shields.io/github/stars/xming521/WeClone?style=for-the-badge&logo=github&label=Stars&logoColor=white&color=ffda65)](https://github.com/xming521/WeClone/stargazers)\n[![GitHub release](https://img.shields.io/github/v/release/xming521/WeClone?style=for-the-badge&logo=github&label=Release&logoColor=white&color=06d094)](https://github.com/xming521/WeClone/releases)\n<a href=\"https://qm.qq.com/cgi-bin/qm/qr?k=wNdgbOVT6oFOJ2wlMLsolUXErW9ESLpk&jump_from=webapi&authKey=z/reOp6YLyvR4Tl2k2nYMsLoMC3w9/99ucgKMX0oRGlxDV/WbYnvq2QxODoIkfxn\" target=\"_blank\" style=\"text-decoration: none;\">\n  <img src=\"https://img.shields.io/badge/QQ群-708067078-12B7F5?style=for-the-badge&logo=qq&logoColor=white\" alt=\"WeClone①\" title=\"WeClone①\">\n</a>\n[![小红书](https://img.shields.io/badge/WeClone-FE2C55?style=for-the-badge&logo=xiaohongshu&logoColor=white)](https://www.xiaohongshu.com/user/profile/628109730000000021029de4)\n[![Twitter](https://img.shields.io/badge/Twitter-@weclone567-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/weclone567)\n[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/+JEdak4m0XEQ3NGNl)\n\n<a href=\"https://hellogithub.com/repository/12ab209b56cb4cfd885c8cfd4cfdd53e\" target=\"_blank\"><img src=\"https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=12ab209b56cb4cfd885c8cfd4cfdd53e&claim_uid=RThlPDoGrFvdMY5\" alt=\"Featured｜HelloGitHub\" style=\"width: 150px; height: 28px;\" /></a>\n<a href=\"https://trendshift.io/repositories/13759\" target=\"_blank\"><img src=\"https://trendshift.io/api/badge/repositories/13759\" alt=\"xming521%2FWeClone | 
Trendshift\" style=\"width: 220px; height: 50px;\" /></a>\n<a href=\"https://deepwiki.com/xming521/WeClone\"><img src=\"https://deepwiki.com/badge.svg\" alt=\"Ask DeepWiki\"  style=\"width: 134px; height: 23px;margin-bottom: 3px;\"></a>\n</div>\n\n<p align=\"center\">\n简体中文｜\n  <a href=\"https://github.com/xming521/WeClone/blob/master/README.md\" target=\"_blank\">English</a>｜\n  <a href=\"https://www.weclone.love/\" target=\"_blank\"> 项目主页 </a> ｜\n  <a href=\"https://docs.weclone.love/docs/introduce/what-is-weclone.html\" target=\"_blank\"> 项目文档 </a>\n  \n</p>\n\n\n## ✨核心功能\n- 💫 涵盖打造数字分身的全链路方案，包括聊天数据导出、预处理、模型训练、部署\n- 💬 使用聊天记录微调LLM，支持图片模态数据，让大模型有\"那味儿\"\n- 🔗 绑定到Discord, Telegram, Slack, Feishu等，实现自己的数字分身\n- 🛡️ 隐私信息过滤，本地化微调部署，数据安全可控\n\n## 📋特性与说明\n\n### 数据源平台适配\n\n| 平台 | 文字 | 图片 | 语音 | 视频 | 动画表情 | 链接(分享) | 引用 | 转发 | 位置 | 文件 |\n|------|------|------|------|------|----------|-----------|------|------|------|------|\n| Telegram | ✅ | ✅ | ❌ | ❌ | ⚠️转为Emoji | ❌ | ❌ | ✅ | ✅ | ❌ |\n| WhatsApp | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |\n| Discord | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |\n| Slack | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 | 🚧 |\n\n### 部署平台支持\n| 平台 | 部署支持 |\n|------|------|\n| Telegram | ✅ | \n| WhatsApp | 🚧 | \n| Discord | ✅ | \n| Slack | ✅ | \n\n> [!IMPORTANT]\n> - WeClone仍在快速迭代期，当前效果不代表最终效果。  \n> - 微调LLM效果很大程度取决于模型大小、聊天数据的数量和质量，理论上模型越大，数据越多，效果越好。\n> - 7B模型效果一般，14B及以上的模型效果会更好。   \n> - Windows环境未进行严格测试，可以使用WSL作为运行环境。\n\n### 近期更新\n[25/06/05]支持图片模态数据微调   \n[25/07/10]数据源增加Telegram\n\n### 在线微调\n- 大模型实验室 (Lab4AI) (送50元代金券): https://www.lab4ai.cn/project/detail?utm_source=weclone1&id=ab83d14684fa45d197f67eddb3d8316c&type=project\n\n### 硬件要求\n\n项目默认使用Qwen2.5-7B-Instruct模型，LoRA方法对sft阶段微调，大约需要16GB显存。也可以使用[LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory/blob/main/README_zh.md#%E6%A8%A1%E5%9E%8B)支持的其他模型和方法。\n\n需要显存的估算值：\n| 方法                             | 精度 |   7B  |  14B  |  30B  |   70B  |   `x`B  |\n| ------------------------------- | ---- | ----- | ----- | ----- | 
------ | ------- |\n| Full (`bf16` or `fp16`)         |  32  | 120GB | 240GB | 600GB | 1200GB | `18x`GB |\n| Full (`pure_bf16`)              |  16  |  60GB | 120GB | 300GB |  600GB |  `8x`GB |\n| Freeze/LoRA/GaLore/APOLLO/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  `2x`GB |\n| QLoRA                           |   8  |  10GB |  20GB |  40GB |   80GB |   `x`GB |\n| QLoRA                           |   4  |   6GB |  12GB |  24GB |   48GB | `x/2`GB |\n| QLoRA                           |   2  |   4GB |   8GB |  16GB |   24GB | `x/4`GB |\n\n\n## 环境搭建\n1.cuda安装(已安装可跳过，**要求版本12.6及以上**)：[LLaMA Factory](https://llamafactory.readthedocs.io/zh-cn/latest/getting_started/installation.html#cuda) \n\n2.建议使用 [uv](https://docs.astral.sh/uv/)安装依赖，这是一个非常快速的 Python 环境管理器。安装uv后，您可以使用以下命令创建一个新的Python环境并安装依赖项，速度较慢可以开启代理：\n```bash\ngit clone https://github.com/xming521/WeClone.git && cd WeClone\nuv venv .venv --python=3.12\nsource .venv/bin/activate # windows下执行 .venv\\Scripts\\activate\nuv pip install --group main -e . 
# 国内用户使用镜像：-i https://pypi.tuna.tsinghua.edu.cn/simple/ \nuv pip install https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl\n```\n\n3.将配置文件模板复制一份并重命名为`settings.jsonc`，后续配置修改在此文件进行：\n```bash\ncp settings.template.jsonc settings.jsonc\n```\n- 微调**多模态模型**时，请使用[examples/mllm.template.jsonc](https://github.com/xming521/WeClone/blob/master/examples/mllm.template.jsonc)作为配置文件。\n\n> [!NOTE]\n> 训练以及推理相关配置统一在文件`settings.jsonc`\n\n4.使用以下命令测试CUDA环境是否正确配置并可被PyTorch识别，Mac不需要：\n```bash\npython -c \"import torch; print('CUDA是否可用:', torch.cuda.is_available());\"\n```\n\n5.（可选）安装FlashAttention，加速训练和推理：`uv pip install flash-attn --no-build-isolation` 版本问题可以使用[prebuild-wheels](https://github.com/mjun0812/flash-attention-prebuild-wheels/releases)的预编译包安装。\n\n## 模型下载\n中国境内推荐使用[ModelScope](https://www.modelscope.cn/docs/models/download)下载模型。例如下载WeClone默认模型：\n```bash\nmodelscope download --model Qwen/Qwen2.5-7B-Instruct --local_dir ./models/Qwen2.5-7B-Instruct\n```\n\n## 数据准备\n\n### Telegram\n请使用[Telegram Desktop](https://desktop.telegram.org/)导出聊天记录，点击右上角点击导出聊天记录，选择照片类型，格式选择JSON。可以导出多个联系人（不建议使用群聊记录），然后将导出的`ChatExport_*`文件夹放在`./dataset/telegram`目录即可，也就是不同人聊天记录的文件夹一起放在 `./dataset/telegram`。\n\n\n## 数据预处理\n- 首先根据需要修改配置文件中的`language`、`platform`、`include_type`。\n- 项目默认通过Microsoft Presidio去除了数据中的`电话号码、电子邮件地址、信用卡号码（12-19位数字）、IP地址、地理位置名称、国际银行账户号码、加密货币钱包地址、年龄信息、通用身份证号码`,但是不能保证100%过滤识别。\n- 所以在`settings.jsonc`中提供了一个禁用词词库`blocked_words`，可以自行添加需要过滤的词句（会默认去掉包括禁用词的整句）。\n> [!IMPORTANT]\n> 🚨 请一定注意保护个人隐私，不要泄露个人信息！\n\n- 执行以下命令对数据进行处理，可以先根据自己的聊天风格修改settings.jsonc的`make_dataset_args`。\n```bash\nweclone-cli make-dataset\n```\n数据处理更多参数说明：[数据预处理](https://docs.weclone.love/zh/docs/deploy/data_preprocessing.html#%E7%9B%B8%E5%85%B3%E5%8F%82%E6%95%B0)\n\n## 配置参数并微调模型\n\n- (可选)修改 `settings.jsonc` 的 `model_name_or_path` 、`template`、 `lora_target`选择本地下载好的其他模型。  \n- 修改`per_device_train_batch_size`以及`gradient_accumulation_steps`来调整显存占用。  \n- 
可以根据自己数据集的数量和质量修改`train_sft_args`的`num_train_epochs`、`lora_rank`、`lora_dropout`等参数。\n\n### 单卡训练\n```bash\nweclone-cli train-sft\n```\n\n### 多卡训练\n取消`settings.jsonc`中`deepspeed`行代码注释，使用以下命令多卡训练：\n```bash\nuv pip install \"deepspeed<=0.16.9\"\ndeepspeed --num_gpus=使用显卡数量 weclone/train/train_sft.py\n```\n\n### 使用浏览器demo简单推理\n测试出合适的temperature、top_p值，修改settings.jsonc的`infer_args`后，供后续推理时使用。\n```bash\nweclone-cli webchat-demo\n```\n\n### 使用接口进行推理\n\n```bash\nweclone-cli server\n```\n\n### 使用常见聊天问题测试\n不包含询问个人信息的问题，仅有日常聊天。测试结果在test_result-my.txt。\n```bash\nweclone-cli server\nweclone-cli test-model\n```\n\n## 🖼️ 微调效果\n> [!TIP] \n> **社群内有部署好的Qwen2.5VL 32B Bot，可以体验效果。** \n\n\n## 🤖 部署到聊天机器人\n\n### AstrBot\n\n[AstrBot](https://github.com/AstrBotDevs/AstrBot) 是易上手的多平台 LLM 聊天机器人及开发框架 ✨ 平台支持Telegram、飞书等。      \n\n使用步骤：\n1. 部署 AstrBot\n2. 在 AstrBot 中部署消息平台\n3. 执行 `weclone-cli server` 启动api服务\n4. 在 AstrBot 中新增服务提供商，类型选择OpenAI，API Base URL 根据AstrBot部署方式填写（例如docker部署可能为http://172.17.0.1:8005/v1） ，模型填写gpt-3.5-turbo,API Key随意填写一个\n5. 微调后不支持工具调用，请先关掉默认的工具，消息平台发送指令： `/tool off_all`，否则会没有微调后的效果。 \n6. 根据微调时使用的default_system，在 AstrBot 中设置系统提示词。\n![5](https://github.com/user-attachments/assets/19de7072-076a-4cdf-8ae6-46b9b89f536a)\n> [!IMPORTANT]\n> 检查api_service的日志，尽量保证大模型服务请求的参数和微调时一致，tool插件能力都关掉。\n\n### LangBot\n\n[LangBot](https://github.com/RockChinQ/LangBot) 是一个开源的接入全球多种即时通信平台的 LLM 机器人平台，适合各种场景使用。\n\n<img width=\"450px\" alt=\"image\" src=\"https://github.com/user-attachments/assets/04ceeacf-8a14-40a9-b07a-2f03f257eee6\" />\n\n\n1. [部署 LangBot](https://github.com/RockChinQ/LangBot#-%E5%BC%80%E5%A7%8B%E4%BD%BF%E7%94%A8)\n2. 执行 `weclone-cli server` 启动 WeClone API 服务\n3. 在 LangBot 中添加一个机器人\n4. 在模型页添加新模型，名称`gpt-3.5-turbo`，供应商选择 OpenAI，填写 请求 URL 为 WeClone 的地址，详细连接方式可以参考[文档](https://docs.langbot.app/zh/workshop/network-details.html)，API Key 任意填写。\n\n<img width=\"400px\" alt=\"image\" src=\"https://github.com/user-attachments/assets/fc167dea-7c93-4d94-9c5f-db709d0320ba\" />\n\n5. 
在流水线配置中选择刚才添加的模型，或修改提示词配置\n\n<img width=\"400px\" alt=\"image\" src=\"https://github.com/user-attachments/assets/dbb0fd0a-f760-42db-acd0-bb99c859b52e\" />\n\n## 📌 路线图\n- [ ] 支持更多数据源\n- [ ] 更丰富的上下文：包括上下文对话、聊天对象信息、时间等 \n- [ ] Memory 支持\n- [ ] 支持多模态:已支持图片\n- [ ] 数据增强\n- [ ] 支持GUI\n- [ ] 支持COT思考\n\n\n## 问题解决\n#### [官方文档FAQ](https://docs.weclone.love/docs/introduce/FAQ.html)    \n同时建议使用[DeepWiki](https://deepwiki.com/xming521/WeClone)解决问题。\n\n## ❤️ 贡献代码\n\n欢迎任何 Issues/Pull Requests！\n\n你可以通过查看Issues或帮助审核 PR（拉取请求）来贡献。对于新功能的添加，请先通过 Issue 讨论。   \n开发环境：\n```bash\nuv pip install --group dev -e .\npre-commit install\n```\n\n项目使用`pytest`测试，`pyright`检查类型，`ruff`检查代码格式。  \n提交代码前你应该先运行`pytest tests`确保所有测试通过。\n\n## 🙏 致谢\nBUPT VCIS Lab的支持\n感谢以下代码贡献者和社区里其他成员的贡献\n\n<a href=\"https://github.com/xming521/WeClone/graphs/contributors\">\n  <img src=\"https://contrib.rocks/image?repo=xming521/WeClone\" />\n</a>\n\n同时本项目受益于[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)、[AstrBot](https://github.com/AstrBotDevs/AstrBot)、[LangBot](https://github.com/RockChinQ/LangBot)等优秀开源项目。\n\n## ⚠️ 免责声明\n> [!CAUTION]\n> **本项目仅供学习、研究和实验用途，用于生产环境存在较大风险，请谨慎评估。请勿用于非法用途，后果自负。**   \n> [针对违规获取及利用微信终端用户数据行为的打击公告](https://mp.weixin.qq.com/s/A6h4ZLTE2EPrY7kJ5fHE2g)\n\n\n> [!IMPORTANT]\n> #### WeClone 目前未与任何平台合作，未发行任何数字货币。唯一官方网站：[weclone.love](https://www.weclone.love)，谨防仿冒。\n<details>\n<summary>点击查看免责条款</summary>\n\n### 1. 使用风险自担\n- 用户在使用本项目时，应充分理解并承担所有相关风险\n- **本项目作者不对因使用本项目而产生的任何直接或间接损失承担责任**\n- 包括但不限于：数据丢失、经济损失、法律纠纷、个人名誉损害、社会关系影响、心理创伤、职业发展受阻、商业信誉受损等\n\n### 2. 生产环境风险警告\n- **用于商业用途或对外提供服务需自行承担全部风险**\n- 生产环境使用可能导致的所有后果（包括但不限于服务中断、数据安全问题、用户投诉、法律责任等）完全由用户承担\n- **建议在生产环境使用前进行充分的测试、验证和风险评估**\n\n### 3. 模型输出不可靠性\n- 微调后的模型可能产生不准确、有害或误导性的内容\n- 模型输出不代表真实人物的观点或意图\n- 用户应对模型输出进行人工审核和验证\n\n### 4. 数据安全与隐私\n- 用户应确保上传的聊天记录等数据符合相关法律法规\n- 用户应获得**数据相关人员的适当授权**\n- 本项目不对**数据泄露或隐私侵犯**承担责任\n\n### 5. 法律合规\n- **用户应确保使用本项目符合当地法律法规**\n- 涉及人工智能、数据保护、知识产权等相关法律\n- **违法使用造成的后果由用户承担**\n\n### 6. 
技术支持限制\n- 本项目按\"现状\"提供，不提供任何明示或暗示的保证\n- 作者不承诺提供持续的技术支持或维护\n- 不保证项目的稳定性、可靠性或适用性\n\n## 使用建议\n\n### 强制性Bot身份标识\n**使用本项目生成的数字分身时，强烈建议：**\n- 在每次对话开始时明确标识为\"AI Bot\"或\"数字分身\"\n- 在用户界面显著位置标注\"此为AI生成内容\"\n- 避免让用户误认为是真实人类在对话，从而造成风险\n\n### 风险评估建议\n\n如确需在生产环境使用，建议：\n1. 进行全面的安全性测试\n2. 建立完善的内容审核机制\n3. 制定应急响应预案\n4. 购买相应的保险保障\n5. 咨询法律专业人士意见\n\n\n本免责声明可能随项目更新而修订，用户应定期查看最新版本。继续使用本项目即表示同意最新的免责声明条款。\n\n**一旦您下载、克隆、修改、分发或以任何方式使用本项目的代码或模型，即表示您已完整阅读、理解并同意无条件接受本免责声明的全部条款。**\n\n</details>\n\n**请用户慎重阅读并理解本免责声明的所有内容，确保在使用本项目时严格遵守相关规定。**\n<br>  \n\n## ⭐ Star History\n> [!TIP] \n> 如果本项目对您有帮助，或者您关注本项目的未来发展，请给项目 Star，谢谢 \n\n<div align=\"center\">\n\n[![Star History Chart](https://api.star-history.com/svg?repos=xming521/WeClone&type=Date)](https://www.star-history.com/#xming521/WeClone&Date)\n\n</div>\n\n\n<div align=\"center\"> 克隆我们，保留灵魂的芬芳 </div>\n"
  },
  {
    "path": "dataset/eval/test_data-en.json",
    "content": "{\n    \"questions\": [\n        [\n            \"Have you eaten?\",\n            \"What did you eat?\",\n            \"Was it delicious?\",\n            \"How much did it cost?\",\n            \"Can you treat me to a meal?\"\n        ],\n        [\n            \"What are you doing?\",\n            \"What are you planning to do later?\"\n        ],\n        [\n            \"What are you busy with?\",\n            \"Do you have any special plans for today?\",\n            \"How are you feeling?\"\n        ],\n        [\n            \"Anything new happening recently?\",\n            \"Do you have any interesting stories to share?\"\n        ],\n        [\n            \"How was your weekend?\",\n            \"What fun things did you do?\"\n        ],\n        [\n            \"Have you watched any good movies or TV shows recently?\",\n            \"Any recommendations?\",\n            \"What was it about?\"\n        ],\n        [\n            \"How's the weather today?\",\n            \"How about on your end?\"\n        ],\n        [\n            \"Is work/study going well recently?\",\n            \"Have you encountered any challenges?\"\n        ],\n        [\n            \"Hey, what are you busy with right now?\",\n            \"Do you have any special plans for today?\",\n            \"Everything going smoothly, I hope?\"\n        ],\n        [\n            \"How's the weather on your side?\",\n            \"Is it sunny or a bit gloomy?\",\n            \"Is it cold or hot?\"\n        ],\n        [\n            \"Is it mealtime yet?\",\n            \"Planning to treat yourself to something delicious today?\",\n            \"Anything special you want to eat, or any restaurant you want to try?\"\n        ],\n        [\n            \"Any fun news or memes online recently?\",\n            \"Come across any interesting videos or jokes? 
Share them with me!\"\n        ],\n        [\n            \"What are your plans for later?\",\n            \"How do you plan to spend the rest of the day?\"\n        ],\n        [\n            \"Did anything catch your eye today?\",\n            \"Let's just chat casually, any light topics?\"\n        ],\n        [\n            \"Any new discoveries or insights today?\",\n            \"Did today feel fast or slow? How was the pace?\"\n        ],\n        [\n            \"How's your surroundings right now, noisy or quiet?\",\n            \"Did you go out for a walk today? Was it crowded outside?\",\n            \"Look out the window, anything special to see?\"\n        ],\n        [\n            \"Have you eaten?\",\n            \"What did you eat? Did you like it?\"\n        ],\n        [\n            \"How was your day? Are you tired?\",\n            \"What's up?\"\n        ],\n        [\n            \"How's your health recently?\",\n            \"Nothing bothering you, right?\"\n        ],\n        [\n            \"Are you busy today?\",\n            \"What have you been up to?\"\n        ],\n        [\n            \"Everyone at home doing well?\",\n            \"Need any help with anything?\"\n        ],\n        [\n            \"Did you go out today?\",\n            \"Is it cold/hot outside? Dress warmly/stay cool.\"\n        ],\n        [\n            \"Anything happy happening recently? Tell me about it!\",\n            \"Or any troubles you want to talk about?\"\n        ],\n        [\n            \"Go to bed early tonight, don't stay up too late.\",\n            \"How's your sleep been lately?\"\n        ],\n        [\n            \"Need anything? 
Just let me know.\",\n            \"Do you have enough money?\"\n        ],\n        [\n            \"See anything interesting today?\",\n            \"Or anything you want to share with me?\"\n        ],\n        [\n            \"Any weekend plans?\",\n            \"Want to grab a meal together or go out?\"\n        ],\n        [\n            \"How are those friends you keep in touch with?\",\n            \"You should get together more often.\"\n        ],\n        [\n            \"Is work/study going smoothly?\",\n            \"Don't put too much pressure on yourself.\"\n        ],\n        [\n            \"What delicious food did you make today?\",\n            \"Let me try some next time!\"\n        ],\n        [\n            \"Any news recently?\",\n            \"Tell me about it.\"\n        ],\n        [\n            \"How's so-and-so doing lately?\",\n            \"Haven't heard from him/her in a while.\"\n        ],\n        [\n            \"Are you in a good mood today?\",\n            \"You look good/a bit tired.\"\n        ],\n        [\n            \"Anything you want to eat? I'll make it/bring it for you next time.\",\n            \"Or anywhere you want to go? I'll come with you.\"\n        ],\n        [\n            \"Have you been watching any TV shows/movies recently?\",\n            \"Any good recommendations for me?\"\n        ],\n        [\n            \"If there's nothing urgent, go home/rest early.\",\n            \"Stay safe.\"\n        ]\n    ]\n}\n"
  },
  {
    "path": "dataset/eval/test_data-privacy.json",
    "content": "{\n    \"questions\": [\n        [\n            \"你多大了？\"\n        ],\n        [\n            \"你有什么爱好吗？\"\n        ],\n        [\n            \"你的理想是什么？\",\n            \"你觉得你离你的理想还有多远？\"\n        ],\n        [\n            \"你最近在忙什么？\",\n            \"工作/学习顺利吗？\",\n            \"有什么有趣的事情发生吗？\"\n        ],\n        [\n            \"你喜欢看什么类型的电影？\",\n            \"最近看过什么好看的电影吗？\",\n            \"你最喜欢的电影是什么？\"\n        ],\n        [\n            \"你平时喜欢听什么音乐？\",\n            \"有推荐的歌手或乐队吗？\",\n            \"最近有喜欢的歌曲吗？\"\n        ],\n        [\n            \"你喜欢旅游吗？\",\n            \"去过哪些地方？\",\n            \"最喜欢的旅游地是哪里？\"\n        ],\n        [\n            \"你喜欢读书吗？\",\n            \"最近在读什么书？\",\n            \"最喜欢的书是哪本？\"\n        ],\n        [\n            \"你平时喜欢运动吗？\",\n            \"喜欢做哪些运动？\",\n            \"有固定去锻炼吗？\"\n        ],\n        [\n            \"周末一般都做些什么？\",\n            \"有没有什么特别的计划？\",\n            \"周末喜欢宅在家还是出去玩？\"\n        ],\n        [\n            \"你喜欢宠物吗？\",\n            \"有养宠物吗？\",\n            \"最喜欢什么动物？\"\n        ],\n        [\n            \"你喜欢吃什么类型的食物？\",\n            \"有推荐的餐厅吗？\",\n            \"最喜欢的菜是什么？\"\n        ],\n        [\n            \"你喜欢什么样的天气？\",\n            \"最喜欢的季节是哪一个？\",\n            \"你觉得今天的天气怎么样？\"\n        ],\n        [\n            \"你有看电视剧的习惯吗？\",\n            \"最近在追哪部剧？\",\n            \"最喜欢的电视剧是哪部？\"\n        ],\n        [\n            \"你喜欢玩游戏吗？\",\n            \"最近在玩什么游戏？\",\n            \"有推荐的好玩的游戏吗？\"\n        ],\n        [\n            \"你会做饭吗？\",\n            \"平时喜欢做哪些菜？\",\n            \"有没有特别拿手的菜？\"\n        ],\n        [\n            \"你喜欢购物吗？\",\n            \"最近买了什么新东西？\",\n            \"有推荐的购物网站或店铺吗？\"\n        ],\n        [\n            \"你平时怎么放松自己？\",\n            \"有特别的解压方式吗？\",\n            \"最喜欢的放松活动是什么？\"\n        ],\n        [\n            \"你喜欢和朋友出去玩吗？\",\n            \"平时会和朋友去哪玩？\",\n            \"最近有没有和朋友聚会的计划？\"\n        ],\n        [\n            \"你喜欢喝咖啡还是茶？\",\n          
  \"有没有特别喜欢的咖啡馆或茶馆？\",\n            \"最喜欢的饮品是什么？\"\n        ],\n        [\n            \"你有兄弟姐妹吗？\",\n            \"和他们关系怎么样？\",\n            \"经常联系吗？\"\n        ],\n        [\n            \"你喜欢读什么类型的杂志？\",\n            \"最近有看什么有趣的文章吗？\",\n            \"有订阅的杂志吗？\"\n        ],\n        [\n            \"你喜欢看体育比赛吗？\",\n            \"最喜欢的运动项目是什么？\",\n            \"有没有特别支持的球队或运动员？\"\n        ],\n        [\n            \"你会说其他语言吗？\",\n            \"最想学的语言是什么？\",\n            \"学习语言有什么技巧吗？\"\n        ],\n        [\n            \"你对科技产品感兴趣吗？\",\n            \"最近有没有关注什么新科技？\",\n            \"最喜欢的电子产品是什么？\"\n        ],\n        [\n            \"你喜欢喝什么样的饮料？\",\n            \"有没有自己调饮料的习惯？\",\n            \"最喜欢的饮品品牌是什么？\"\n        ],\n        [\n            \"你平时用社交媒体吗？\",\n            \"常用哪些平台？\",\n            \"在社交媒体上做什么？\"\n        ],\n        [\n            \"你对艺术感兴趣吗？\",\n            \"最喜欢的艺术家是谁？\",\n            \"有去过哪些艺术展览？\"\n        ],\n        [\n            \"你喜欢DIY吗？\",\n            \"平时做些什么手工？\",\n            \"有没有完成的作品可以分享？\"\n        ],\n        [\n            \"你喜欢种植植物吗？\",\n            \"有养什么植物？\",\n            \"最喜欢的植物是什么？\"\n        ],\n        [\n            \"你喜欢拍照吗？\",\n            \"喜欢拍什么样的照片？\",\n            \"有没有用什么特别的摄影设备？\"\n        ],\n        [\n            \"你喜欢听播客吗？\",\n            \"常听哪些主题的播客？\",\n            \"有没有推荐的播客？\"\n        ],\n        [\n            \"你对历史感兴趣吗？\",\n            \"最喜欢哪个历史时期？\",\n            \"有没有特别喜欢的历史人物？\"\n        ],\n        [\n            \"你喜欢画画吗？\",\n            \"平时画什么类型的画？\",\n            \"有参加过画展吗？\"\n        ],\n        [\n            \"你喜欢写作吗？\",\n            \"平时写什么类型的文章？\",\n            \"有没有发表过作品？\"\n        ],\n        [\n            \"你喜欢钓鱼吗？\",\n            \"平时去哪里钓鱼？\",\n            \"有没有钓到过什么大鱼？\"\n        ],\n        [\n            \"你喜欢露营吗？\",\n            \"平时会去哪里露营？\",\n            \"有没有什么难忘的露营经历？\"\n        ],\n        [\n            \"你喜欢摄影吗？\",\n            \"最喜欢拍什么题材？\",\n            
\"有没有特别喜欢的摄影师？\"\n        ],\n        [\n            \"你喜欢喝酒吗？\",\n            \"喜欢什么类型的酒？\",\n            \"有没有推荐的酒吧或品牌？\"\n        ],\n        [\n            \"你喜欢滑雪吗？\",\n            \"平时去哪里滑雪？\",\n            \"有没有什么滑雪技巧分享？\"\n        ],\n        [\n            \"你喜欢海边还是山里？\",\n            \"最喜欢去哪个地方度假？\",\n            \"有没有什么特别推荐的景点？\"\n        ],\n        [\n            \"你喜欢参加音乐节吗？\",\n            \"参加过哪些音乐节？\",\n            \"最喜欢的音乐节是哪一个？\"\n        ],\n        [\n            \"你喜欢跑步吗？\",\n            \"平时跑多长距离？\",\n            \"有没有参加过马拉松？\"\n        ],\n        [\n            \"你喜欢参加聚会吗？\",\n            \"平时和朋友聚会做什么？\",\n            \"有没有什么有趣的聚会游戏？\"\n        ],\n        [\n            \"你喜欢收集东西吗？\",\n            \"收集什么类型的物品？\",\n            \"有没有什么特别的收藏？\"\n        ]\n    ]\n}\n"
  },
  {
    "path": "dataset/eval/test_data-zh.json",
    "content": "{\n    \"questions\": [\n        [\n            \"吃了吗？\",\n            \"吃的什么啊\",\n            \"好吃吗\",\n            \"多少钱啊\",\n            \"可以请我吃吗\"\n        ],\n        [\n            \"干嘛呢？\",\n            \"等会准备干什么去\"\n        ],\n        [\n            \"在忙什么呢？\",\n            \"今天有什么特别的安排吗？\",\n            \"感觉怎么样？\"\n        ],\n        [\n            \"最近有什么新鲜事发生吗？\",\n            \"有没有什么有趣的故事可以分享？\"\n        ],\n        [\n            \"周末过得怎么样？\",\n            \"做了什么好玩的？\"\n        ],\n        [\n            \"最近看了什么好看的电影或电视剧吗？\",\n            \"有什么推荐的吗？\",\n            \"大概讲了什么内容呀？\"\n        ],\n        [\n            \"今天天气怎么样？\",\n            \"你那里呢？\"\n        ],\n        [\n            \"最近工作/学习顺利吗？\",\n            \"有没有遇到什么挑战？\"\n        ],\n        [\n            \"嗨，这会儿在忙啥呢？\",\n            \"今天有什么特别的安排不？\",\n            \"一切都还顺利吧？\"\n        ],\n        [\n            \"你那边现在天气咋样啊？\",\n            \"是大晴天还是有点阴沉沉的？\",\n            \"冷不冷，或者热不热呀？\"\n        ],\n        [\n            \"到饭点儿了没呀？\",\n            \"今天打算犒劳一下自己，吃点啥好吃的？\",\n            \"有没有啥特别想吃的，或者想去哪家馆子尝尝鲜？\"\n        ],\n        [\n            \"最近网上有啥好玩儿的新闻或者梗吗？\",\n            \"刷到啥有意思的视频或者段子没？分享一下呗！\"\n        ],\n        [\n            \"待会儿有啥打算呀？\",\n            \"今天剩下的时间准备怎么过呢？\"\n        ],\n        [\n            \"今天有没有碰到啥让你眼前一亮的小事儿？\",\n            \"随便聊聊呗，有啥轻松点的话题不？\"\n        ],\n        [\n            \"今天有啥新发现或者小感悟没？\",\n            \"感觉今天过得快不快？节奏怎么样？\"\n        ],\n        [\n            \"你现在周围环境咋样，吵不吵？\",\n            \"今天出门溜达了没，外面人多不多呀？\",\n            \"瞅瞅窗外，有啥特别的景儿不？\"\n        ],\n        [\n            \"吃饭了没啊？\",\n            \"吃的啥呀？合胃口不？\"\n        ],\n        [\n            \"今天怎么样啊？累不累？\",\n            \"有啥事儿不？\"\n        ],\n        [\n            \"最近身体还好吧？\",\n            \"没什么不舒服的地方吧？\"\n        ],\n        [\n            \"今天忙不忙啊？\",\n            \"都干啥了呀？\"\n        ],\n        [\n            \"家里都挺好的吧？\",\n            \"有啥需要帮忙的不？\"\n    
    ],\n        [\n            \"今天出门了没？\",\n            \"外面冷不冷/热不热啊？多穿点/注意防暑。\"\n        ],\n        [\n            \"最近有啥开心的事儿不？说来听听！\",\n            \"或者有啥烦心事儿，跟我说说？\"\n        ],\n        [\n            \"晚上早点休息啊，别熬太晚。\",\n            \"睡得好不好啊最近？\"\n        ],\n        [\n            \"缺啥东西不？跟我说。\",\n            \"钱够不够花呀？\"\n        ],\n        [\n            \"今天看到啥有意思的了没？\",\n            \"或者有啥想跟我分享的？\"\n        ],\n        [\n            \"周末有啥安排啊？\",\n            \"要不要一起吃个饭/出去转转？\"\n        ],\n        [\n            \"最近常联系的那些朋友都还好不？\",\n            \"有空多聚聚。\"\n        ],\n        [\n            \"工作/学习上还顺利吧？\",\n            \"别太给自己压力啊。\"\n        ],\n        [\n            \"今天做了啥好吃的呀？\",\n            \"下次也给我尝尝呗！\"\n        ],\n        [\n            \"有啥新闻没有啊最近？\",\n            \"跟我讲讲。\"\n        ],\n        [\n            \"那谁谁谁最近怎么样了？\",\n            \"好久没听到他/她消息了。\"\n        ],\n        [\n            \"今天心情好不好呀？\",\n            \"看你气色不错/有点疲惫。\"\n        ],\n        [\n            \"有啥想吃的没？下次给你做/带。\",\n            \"或者想去哪儿玩，我陪你。\"\n        ],\n        [\n            \"最近有没有看啥电视剧/电影啊？\",\n            \"有啥好看的推荐给我呗。\"\n        ],\n        [\n            \"没事儿就早点回家/休息。\",\n            \"注意安全啊。\"\n        ]\n    ]\n}\n"
  },
  {
    "path": "dataset/media/images/.gitkeep",
    "content": "# Images processed from other data sources will also be placed in this directory.\n"
  },
  {
    "path": "dataset/res_csv/sft/dataset_info.json",
    "content": "{\n    \"chat-sft\": {\n        \"file_name\": \"./sft-my.json\",\n        \"formatting\": \"sharegpt\",\n        \"columns\": {\n            \"messages\": \"messages\",\n            \"system\": \"system\"\n        },\n        \"tags\": {\n            \"role_tag\": \"role\",\n            \"content_tag\": \"content\",\n            \"user_tag\": \"user\",\n            \"assistant_tag\": \"assistant\"\n        }\n    },\n    \"chat-sft-cleaned\": {\n        \"file_name\": \"./sft-my-cleaned.json\",\n        \"formatting\": \"sharegpt\",\n        \"columns\": {\n            \"messages\": \"messages\",\n            \"system\": \"system\"\n        },\n        \"tags\": {\n            \"role_tag\": \"role\",\n            \"content_tag\": \"content\",\n            \"user_tag\": \"user\",\n            \"assistant_tag\": \"assistant\"\n        }\n    },\n    \"chat-sft-vl\": {\n        \"file_name\": \"./sft-my.json\",\n        \"formatting\": \"sharegpt\",\n        \"columns\": {\n            \"messages\": \"messages\",\n            \"system\": \"system\",\n            \"images\": \"images\"\n        },\n        \"tags\": {\n            \"role_tag\": \"role\",\n            \"content_tag\": \"content\",\n            \"user_tag\": \"user\",\n            \"assistant_tag\": \"assistant\"\n        }\n    },\n    \"chat-sft-vl-cleaned\": {\n        \"file_name\": \"./sft-my-cleaned.json\",\n        \"formatting\": \"sharegpt\",\n        \"columns\": {\n            \"messages\": \"messages\",\n            \"system\": \"system\",\n            \"images\": \"images\"\n        },\n        \"tags\": {\n            \"role_tag\": \"role\",\n            \"content_tag\": \"content\",\n            \"user_tag\": \"user\",\n            \"assistant_tag\": \"assistant\"\n        }\n    }\n}\n"
  },
  {
    "path": "dataset/telegram/.gitkeep",
    "content": "# Storing Telegram client's ChatExport\n"
  },
  {
    "path": "ds_config.json",
    "content": "{\n    \"fp16\": {\n        \"enabled\": \"auto\",\n        \"loss_scale\": 0,\n        \"loss_scale_window\": 1000,\n        \"initial_scale_power\": 16,\n        \"hysteresis\": 2,\n        \"min_loss_scale\": 1\n    },\n    \"bf16\": {\n        \"enabled\": \"auto\"\n    },\n    \"zero_optimization\": {\n        \"stage\": 2,\n        \"allgather_partitions\": true,\n        \"allgather_bucket_size\": 5e8,\n        \"overlap_comm\": true,\n        \"reduce_scatter\": true,\n        \"reduce_bucket_size\": 5e8,\n        \"contiguous_gradients\": true\n    },\n    \"gradient_accumulation_steps\": \"auto\",\n    \"gradient_clipping\": \"auto\",\n    \"steps_per_print\": 2000,\n    \"train_batch_size\": \"auto\",\n    \"train_micro_batch_size_per_gpu\": \"auto\",\n    \"wall_clock_breakdown\": false\n}\n"
  },
  {
    "path": "examples/mllm.template.jsonc",
    "content": "{\n    \"version\": \"0.2.24\",\n    \"common_args\": {\n        \"model_name_or_path\": \"./models/Qwen2.5-VL-7B-Instruct\",\n        \"adapter_name_or_path\": \"./model_output\", //同时做为train_sft_args的output_dir\n        \"template\": \"qwen2_vl\",\n        \"default_system\": \"请你扮演一名人类，不要说自己是人工智能\",\n        \"finetuning_type\": \"lora\",\n        \"media_dir\": \"dataset/media\",\n        \"image_max_pixels\": 409920, //720P\n        \"enable_thinking\": false,\n        \"trust_remote_code\": true\n    },\n    \"cli_args\": {\n        \"full_log\": false\n    },\n    \"make_dataset_args\": {\n        //数据处理配置\n        \"platform\": \"chat\", //chat,telegram\n        \"include_type\": [\n            \"text\",\n            \"image\"\n        ],\n        \"max_image_num\": 2, // 单条数据最大图片数量\n        \"blocked_words\": [ // 禁用词\n            \"例如 姓名\",\n            \"例如 密码\",\n            \"//.....\"\n        ],\n        \"single_combine_strategy\": \"time_window\", // 单人组成单句策略\n        \"qa_match_strategy\": \"time_window\", // 组成qa策略\n        \"single_combine_time_window\": 2, // 单人组成单句时间窗口（分钟）,\n        \"qa_match_time_window\": 5, // 组成qa时间窗口（分钟）,\n        \"combine_msg_max_length\": 2048, // 组合后消息最大长度 \n        \"messages_max_length\": 2048, // messages最长字符数量 配合cutoff_len 使用\n        \"clean_dataset\": {\n            \"enable_clean\": false,\n            \"clean_strategy\": \"llm\",\n            \"llm\": {\n                \"accept_score\": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练\n            }\n        },\n        \"online_llm_clear\": false,\n        \"base_url\": \"https://xxx/v1\",\n        \"llm_api_key\": \"xxxxx\",\n        \"model_name\": \"xxx\", //建议使用参数较大的模型，例如DeepSeek-V3\n        \"clean_batch_size\": 10,\n        \"vision_api\": {\n            \"enable\": false, // 设置为 true 来开启此功能\n            \"api_key\": \"xxx\",\n            \"api_url\": \"https://xxx/v1\", // 例如阿里云，或替换为其他兼容OpenAI的API地址\n            \"model_name\": \"xxx\", // 
要使用的多模态模型名称,例如qwen-vl-max\n            \"max_workers\": 5 // 并行调用API的线程数，最多不要超过8\n        }\n    },\n    \"train_sft_args\": {\n        //微调配置\n        \"stage\": \"sft\",\n        \"dataset\": \"chat-sft\",\n        \"dataset_dir\": \"./dataset/res_csv/sft\",\n        \"freeze_multi_modal_projector\": false, //MLLM 训练时是否冻结多模态投影器。\n        \"use_fast_tokenizer\": true,\n        \"lora_target\": \"q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2\",\n        \"lora_rank\": 8,\n        \"lora_dropout\": 0.25,\n        \"weight_decay\": 0.1,\n        \"overwrite_cache\": true,\n        \"per_device_train_batch_size\": 2,\n        \"gradient_accumulation_steps\": 16,\n        \"lr_scheduler_type\": \"cosine\",\n        \"cutoff_len\": 4096,\n        \"logging_steps\": 10,\n        \"save_steps\": 100,\n        \"learning_rate\": 1e-4,\n        \"warmup_ratio\": 0.1,\n        \"num_train_epochs\": 2,\n        \"plot_loss\": true,\n        \"fp16\": true,\n        \"flash_attn\": \"fa2\",\n        \"preprocessing_num_workers\": 16,\n        \"dataloader_num_workers\": 4\n        // \"deepspeed\": \"ds_config.json\" //多卡训练\n    },\n    \"infer_args\": {\n        \"repetition_penalty\": 1.2,\n        \"temperature\": 0.65,\n        \"max_length\": 512,\n        \"top_p\": 0.75\n    },\n    \"vllm_args\": {\n        \"gpu_memory_utilization\": 0.9\n    },\n    \"test_model_args\": {\n        \"test_data_path\": \"dataset/eval/test_data-en.json\"\n    }\n}\n"
  },
  {
    "path": "examples/tg.template.jsonc",
    "content": "{\n    \"version\": \"0.3.0\",\n    \"common_args\": {\n        \"model_name_or_path\": \"./models/Qwen2.5-VL-7B-Instruct\",\n        \"adapter_name_or_path\": \"./model_output\", // Also serves as the output_dir for train_sft_args\n        \"template\": \"qwen2_vl\",\n        \"default_system\": \"Please act like a human and don't say you are an artificial intelligence\",\n        \"finetuning_type\": \"lora\",\n        \"media_dir\": \"dataset/media\",\n        \"image_max_pixels\": 409920, //720P\n        \"enable_thinking\": false,\n        \"trust_remote_code\": true\n    },\n    \"cli_args\": {\n        \"full_log\": false\n    },\n    \"make_dataset_args\": {\n        // Data processing configuration\n        \"platform\": \"telegram\", //chat,telegram\n        \"language\": \"en\", // Common chat language: zh(中文), en(English)\n        \"telegram_args\": {\n            \"my_id\": \"user1234567890\"\n        },\n        \"include_type\": [\n            \"text\",\n            \"image\",\n            // \"sticker\" //Converting stickers to emojis can lead to the model outputting too many emojis.\n        ],\n        \"max_image_num\": 2, // Maximum number of images per data entry\n        \"blocked_words\": [ // Blocked words\n            \"e.g. Name\",\n            \"e.g. 
Password\",\n            \"//.....\"\n        ],\n        \"single_combine_strategy\": \"time_window\", // Single person message combination strategy\n        \"qa_match_strategy\": \"time_window\", // QA combination strategy\n        \"single_combine_time_window\": 2, // Time window for single person message combination (minutes)\n        \"qa_match_time_window\": 5, // Time window for QA combination (minutes)\n        \"combine_msg_max_length\": 2048, // Maximum length of combined messages\n        \"messages_max_length\": 2048, // Maximum character count for messages, used with cutoff_len\n        \"clean_dataset\": {\n            \"enable_clean\": false,\n            \"clean_strategy\": \"llm\",\n            \"llm\": {\n                \"accept_score\": 2, // Acceptable LLM score threshold, 1 is worst, 5 is best, data below this score will not be used for training\n            }\n        },\n        \"online_llm_clear\": false,\n        \"base_url\": \"https://xxx/v1\",\n        \"llm_api_key\": \"xxxxx\",\n        \"model_name\": \"xxx\", // Recommend using models with larger parameters, e.g. DeepSeek-V3\n        \"clean_batch_size\": 10,\n        \"vision_api\": {\n            \"enable\": false, // Set to true to enable this feature\n            \"api_key\": \"xxx\",\n            \"api_url\": \"https://xxx/v1\", // e.g. Alibaba Cloud, or replace with other OpenAI-compatible API addresses\n            \"model_name\": \"xxx\", // Multimodal model name to use, e.g. 
qwen-vl-max\n            \"max_workers\": 5 // Number of parallel API call threads, maximum should not exceed 8\n        }\n    },\n    \"train_sft_args\": {\n        // Fine-tuning configuration\n        \"stage\": \"sft\",\n        \"dataset\": \"chat-sft\",\n        \"dataset_dir\": \"./dataset/res_csv/sft\",\n        \"freeze_multi_modal_projector\": false, // Whether to freeze the multimodal projector during MLLM training\n        \"use_fast_tokenizer\": true,\n        \"lora_target\": \"q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2\",\n        \"lora_rank\": 8,\n        \"lora_dropout\": 0.25,\n        \"weight_decay\": 0.1,\n        \"overwrite_cache\": true,\n        \"per_device_train_batch_size\": 2,\n        \"gradient_accumulation_steps\": 16,\n        \"lr_scheduler_type\": \"cosine\",\n        \"cutoff_len\": 4096,\n        \"logging_steps\": 10,\n        \"save_steps\": 100,\n        \"learning_rate\": 1e-4,\n        \"warmup_ratio\": 0.1,\n        \"num_train_epochs\": 2,\n        \"plot_loss\": true,\n        \"fp16\": true,\n        \"flash_attn\": \"fa2\",\n        \"preprocessing_num_workers\": 16,\n        \"dataloader_num_workers\": 4\n        // \"deepspeed\": \"ds_config.json\" // Multi-GPU training\n    },\n    \"infer_args\": {\n        \"repetition_penalty\": 1.2,\n        \"temperature\": 0.7,\n        \"max_length\": 512,\n        \"top_p\": 0.8\n    },\n    \"vllm_args\": {\n        \"gpu_memory_utilization\": 0.9\n    },\n    \"test_model_args\": {\n        \"test_data_path\": \"dataset/eval/test_data-en.json\"\n    }\n}\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"WeClone\"\nversion = \"0.3.03\"\ndescription = \"One-stop solution for creating your digital avatar from chat history\"\nauthors = [{ name = \"xming521\" }]\nreadme = \"README.md\"\nrequires-python = \">=3.12,<3.13\"\n\ndependencies = [\n  \"pandas\",\n  \"pyjson5\",\n  \"omegaconf\",\n  \"click\",\n  \"tqdm\",\n  \"pydantic==2.10.6\",\n  \"setuptools>=78.1.0\",\n  \"loguru>=0.7.3\",\n  \"langchain\",\n  \"openai==1.87.0\",\n  \"pip\"\n]\n\n[tool.weclone]\n# Configuration file version number. This number should be incremented when the configuration file structure or important default values change.\nconfig_version = \"0.3.03\"\n\nconfig_changelog = \"\"\"\n[0.3.00] - 2025-06-30 - Support TG chat logs, add language parameter, add log level parameter.\n[0.3.02] - 2025-08-15 - Allow the use of the enable_thinking to control offline cleaning..\n[0.3.03] - 2025-11-01 - Add chat member relationship switch.\n\"\"\"\n\n[dependency-groups]\nmain = [\n  \"llamafactory==0.9.4\",\n  \"vllm==0.10.0; platform_system == 'Linux'\",\n  \"torch==2.7.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'\",\n  \"torchvision==0.22.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'\",\n  \"torchaudio==2.7.1+cu126; platform_system == 'Linux' or platform_system == 'Windows'\",\n  \"torchdata>=0.10.0; platform_system == 'Linux' or platform_system == 'Windows'\",\n  \"transformers==4.53.2\",\n  \"accelerate==1.7.0\",\n  \"triton==3.3.1; platform_system == 'Linux'\",\n  \"presidio_analyzer[transformers]\",\n  \"presidio_anonymizer\",\n]\nsparktts = [\n  \"einops>=0.8.1\",\n  \"einx>=0.3.0\",\n  \"numpy==1.26.4\",\n  \"omegaconf>=2.3.0\",\n  \"packaging>=24.2\",\n  \"safetensors>=0.5.2\",\n  \"soundfile>=0.12.1\",\n  \"soxr>=0.5.0.post1\",\n  \"torchaudio>=2.6.0\",\n  \"tqdm>=4.66.5\",\n]\n\ndev = [\"pytest\", \"pytest-order\", \"pyright\", \"ruff\", \"pre-commit\"]\n\n[project.scripts]\nweclone-cli = 
\"weclone.cli:cli\"\n\n[tool.uv]\n\n[tool.uv.pip]\ntorch-backend = \"auto\"\n\n[tool.uv.sources]\ntorch = [\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Windows'\" },\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Linux'\" },\n]\ntorchaudio = [\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Windows'\" },\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Linux'\" },\n]\ntorchvision = [\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Windows'\" },\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Linux'\" },\n]\ntriton = [\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Windows'\" },\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Linux'\" },\n]\ntorchdata = [\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Windows'\" },\n  { index = \"pytorch-cu126\", marker = \"platform_system == 'Linux'\" },\n]\n\n[[tool.uv.index]]\nname = \"pytorch-cu126\"\nurl = \"https://download.pytorch.org/whl/cu126\"\nexplicit = true\n\n[tool.setuptools.packages.find]\nwhere = [\".\"]                      \ninclude = [\"weclone*\"]             \nexclude = [\"*tests*\", \"*archive*\"]\n\n\n[tool.pyright]\ntypeCheckingMode = \"basic\"\ninclude = [\"weclone/data\"]\nexclude = [\"**/archive\", \"**/tests\"]\nignore = [\"**/archive\"]\n\nreportMissingImports = \"error\"\nreportMissingTypeStubs = false\n\npythonVersion = \"3.12\"\npythonPlatform = \"Linux\"\n\n[tool.ruff]\nexclude = [\n  \"**/archive\",\n  \"**/tests\",\n  \"weclone-audio/src/server未完工\",\n  \"weclone-audio/src/Spark-TTS\",\n]\nline-length = 110\n\nlint.ignore = [\"F403\", \"F405\", \"E501\", \"E402\"]\nlint.select = [\n  \"F\",     # Pyflakes\n  \"W\",     # pycodestyle warnings\n  \"E\",     # pycodestyle errors\n  \"ASYNC\", # flake8-async\n  \"C4\",    # flake8-comprehensions\n  \"Q\",     # flake8-quotes\n]\ntarget-version = \"py312\"\n\n[tool.pytest.ini_options]\naddopts = \"-x -v -s 
--tb=short\"\n"
  },
  {
    "path": "settings.template.jsonc",
    "content": "{\n    \"version\": \"0.3.01\",\n    \"common_args\": {\n        \"model_name_or_path\": \"./models/Qwen2.5-7B-Instruct\",\n        \"adapter_name_or_path\": \"./model_output\", //同时做为train_sft_args的output_dir\n        \"template\": \"qwen\",\n        \"default_system\": \"请你扮演一名人类，不要说自己是人工智能\",\n        \"media_dir\": \"dataset/media\",\n        \"finetuning_type\": \"lora\",\n        \"enable_thinking\": false,\n        \"trust_remote_code\": true\n    },\n    \"cli_args\": {\n        \"full_log\": false,\n        \"log_level\": \"INFO\"\n    },\n    \"make_dataset_args\": {\n        //数据处理配置\n        \"platform\": \"chat\", //chat,telegram\n        \"language\": \"zh\", // 聊天常用语言: zh(中文) 或 en(英文)\n        \"telegram_args\": {\n            \"my_id\": \"user1234567890\"\n        },\n        \"include_type\": [\n            \"text\"\n        ],\n        \"blocked_words\": [ // 禁用词\n            \"例如 姓名\",\n            \"例如 密码\",\n            \"//.....\"\n        ],\n        \"add_time\": false,\n        \"add_relation\": false,\n        \"single_combine_strategy\": \"time_window\", // 单人组成单句策略\n        \"qa_match_strategy\": \"time_window\", // 组成qa策略\n        \"single_combine_time_window\": 2, // 单人组成单句时间窗口（分钟）,\n        \"qa_match_time_window\": 5, // 组成qa时间窗口（分钟）,\n        \"combine_msg_max_length\": 2048, // 组合后消息最大长度 配合cutoff_len 使用\n        \"messages_max_length\": 2048, // messages最长字符数量 配合cutoff_len 使用\n        \"clean_dataset\": {\n            \"enable_clean\": false,\n            \"clean_strategy\": \"llm\",\n            \"llm\": {\n                \"accept_score\": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练\n                \"enable_thinking\": true\n            }\n        },\n        \"online_llm_clear\": false,\n        \"base_url\": \"https://xxx/v1\",\n        \"llm_api_key\": \"xxxxx\",\n        \"model_name\": \"xxx\", //建议使用参数较大的模型，例如DeepSeek-V3\n        \"clean_batch_size\": 50,\n        \"vision_api\": {\n            \"enable\": 
false, // 设置为 true 来开启此功能\n            \"api_key\": \"xxx\",\n            \"api_url\": \"https://xxx/v1\", // 兼容OpenAI的API地址\n            \"model_name\": \"xxx\", // 要使用的多模态模型名称,例如qwen-vl-max\n            \"max_workers\": 5 // 并行调用API的线程数，最多不要超过8\n        }\n    },\n    \"train_sft_args\": {\n        //微调配置\n        \"stage\": \"sft\",\n        \"dataset\": \"chat-sft\",\n        \"dataset_dir\": \"./dataset/res_csv/sft\",\n        \"use_fast_tokenizer\": true,\n        \"lora_target\": \"q_proj,v_proj\",\n        \"lora_rank\": 8,\n        \"lora_dropout\": 0.25,\n        \"weight_decay\": 0.1,\n        \"overwrite_cache\": true,\n        \"per_device_train_batch_size\": 2,\n        \"gradient_accumulation_steps\": 16,\n        \"lr_scheduler_type\": \"cosine\",\n        \"cutoff_len\": 2048,\n        \"logging_steps\": 10,\n        \"save_steps\": 100,\n        \"learning_rate\": 1e-4,\n        \"warmup_ratio\": 0.1,\n        \"num_train_epochs\": 2,\n        \"plot_loss\": true,\n        \"fp16\": true,\n        \"flash_attn\": \"fa2\",\n        // \"deepspeed\": \"ds_config.json\" //多卡训练\n    },\n    \"infer_args\": {\n        \"repetition_penalty\": 1.2,\n        \"temperature\": 0.5,\n        \"max_length\": 256,\n        \"top_p\": 0.65\n    },\n    \"vllm_args\": {\n        \"gpu_memory_utilization\": 0.9,\n        // \"data_parallel_size\": 2,\n        // \"quantization\": \"bitsandbytes\", \n        // \"load_format\": \"bitsandbytes\"\n    },\n    \"test_model_args\": {\n        \"test_data_path\": \"dataset/eval/test_data-zh.json\"\n    }\n}\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/configs/Qwen2.5-VL.jsonc",
    "content": "{\n    \"version\": \"0.2.22\",\n    \"common_args\": {\n        \"model_name_or_path\": \"./models/Qwen2.5-VL-3B-Instruct\",\n        \"adapter_name_or_path\": \"./model_output\", //同时做为train_sft_args的output_dir\n        \"template\": \"qwen2_vl\",\n        \"default_system\": \"请你扮演一名人类，不要说自己是人工智能\",\n        \"finetuning_type\": \"lora\",\n        \"media_dir\": \"dataset/media\",\n        \"image_max_pixels\": 209920, //720P\n        \"enable_thinking\": false,\n        \"trust_remote_code\": true\n    },\n    \"cli_args\": {\n        \"full_log\": false\n    },\n    \"make_dataset_args\": {\n        //数据处理配置\n        \"platform\": \"chat\",\n        \"include_type\": [\n            \"text\",\n            \"image\"\n        ],\n        \"blocked_words\": [\n            \"1234567890\",\n            \"hh\"\n        ],\n        \"language\": \"en\",\n        \"add_relation\": true,\n        \"add_time\": true,\n        \"max_image_num\": 2, // 单条数据最大图片数量\n        \"single_combine_strategy\": \"time_window\", // 单人组成单句策略\n        \"qa_match_strategy\": \"time_window\", // 组成qa策略\n        \"single_combine_time_window\": 2, // 单人组成单句时间窗口（分钟）,\n        \"qa_match_time_window\": 5, // 组成qa时间窗口（分钟）,\n        \"combine_msg_max_length\": 256, // 组合后消息最大长度 配合cutoff_len 使用\n        \"clean_dataset\": {\n            \"enable_clean\": true,\n            \"clean_strategy\": \"llm\",\n            \"llm\": {\n                \"accept_score\": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练\n            }\n        },\n        \"vision_api\": {\n            \"enable\": false, // 设置为 true 来开启此功能\n            \"api_key\": \"xxx\",\n            \"api_url\": \"https://xxx/v1\", // 例如阿里云，或替换为其他兼容OpenAI的API地址\n            \"model_name\": \"xxx\", // 要使用的多模态模型名称,例如qwen-vl-max\n            \"max_workers\": 5 // 并行调用API的线程数，最多不要超过8\n        }\n    },\n    \"test_model_args\": {\n        \"test_data_path\": \"tests/tests_data/test_model_data.json\"\n    },\n    
\"train_sft_args\": {\n        //微调配置\n        \"stage\": \"sft\",\n        \"dataset\": \"chat-sft\",\n        \"dataset_dir\": \"./dataset/res_csv/sft\",\n        \"freeze_multi_modal_projector\": false, //MLLM 训练时是否冻结多模态投影器。\n        \"use_fast_tokenizer\": true,\n        \"lora_target\": \"q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2\",\n        \"lora_rank\": 2,\n        \"lora_dropout\": 0.3,\n        \"weight_decay\": 0.1,\n        \"overwrite_cache\": true,\n        \"per_device_train_batch_size\": 4,\n        \"gradient_accumulation_steps\": 8,\n        \"lr_scheduler_type\": \"cosine\",\n        \"cutoff_len\": 1024,\n        \"logging_steps\": 5,\n        \"save_steps\": 10,\n        \"learning_rate\": 1e-4,\n        \"warmup_ratio\": 0.1,\n        \"num_train_epochs\": 1,\n        \"plot_loss\": true,\n        \"fp16\": true,\n        \"flash_attn\": \"fa2\",\n        // \"deepspeed\": \"ds_config.json\" //多卡训练\n    },\n    \"infer_args\": {\n        \"repetition_penalty\": 1.2,\n        \"temperature\": 0.5,\n        \"max_length\": 50,\n        \"top_p\": 0.65\n    }\n}\n"
  },
  {
    "path": "tests/configs/qwen2.5.jsonc",
    "content": "{\n    \"version\": \"0.2.22\",\n    \"common_args\": {\n        \"model_name_or_path\": \"./models/Qwen2.5-0.5B\",\n        \"adapter_name_or_path\": \"./model_output\", //同时做为train_sft_args的output_dir\n        \"template\": \"qwen\",\n        \"default_system\": \"请你扮演一名人类，不要说自己是人工智能\",\n        \"finetuning_type\": \"lora\",\n        \"media_dir\": \"dataset/media\",\n        \"image_max_pixels\": 209920, //720P\n        \"enable_thinking\": false,\n        \"trust_remote_code\": true\n    },\n    \"cli_args\": {\n        \"full_log\": false\n    },\n    \"make_dataset_args\": {\n        //数据处理配置\n        \"platform\": \"chat\",\n        \"include_type\": [\n            \"text\",\n            // \"image\"\n        ],\n        \"blocked_words\": [\n            \"1234567890\",\n            \"hh\"\n        ],\n        \"language\": \"zh\",\n        \"add_relation\": true,\n        \"add_time\": true,\n        \"max_image_num\": 2, // 单条数据最大图片数量\n        \"single_combine_strategy\": \"time_window\", // 单人组成单句策略\n        \"qa_match_strategy\": \"time_window\", // 组成qa策略\n        \"single_combine_time_window\": 2, // 单人组成单句时间窗口（分钟）,\n        \"qa_match_time_window\": 5, // 组成qa时间窗口（分钟）,\n        \"combine_msg_max_length\": 256, // 组合后消息最大长度 配合cutoff_len 使用\n        \"clean_dataset\": {\n            \"enable_clean\": true,\n            \"clean_strategy\": \"llm\",\n            \"llm\": {\n                \"accept_score\": 2, //可以接受的llm打分阈值,1分最差，5分最好,低于此分数的数据不会用于训练\n                \"enable_thinking\": true\n            }\n        },\n        \"vision_api\": {\n            \"enable\": false, // 设置为 true 来开启此功能\n            \"api_key\": \"xxx\",\n            \"api_url\": \"https://xxx/v1\", // 例如阿里云，或替换为其他兼容OpenAI的API地址\n            \"model_name\": \"xxx\", // 要使用的多模态模型名称,例如qwen-vl-max\n            \"max_workers\": 5 // 并行调用API的线程数，最多不要超过8\n        }\n    },\n    \"test_model_args\": {\n        \"test_data_path\": 
\"tests/tests_data/test_model_data.json\"\n    },\n    \"train_sft_args\": {\n        //微调配置\n        \"stage\": \"sft\",\n        \"dataset\": \"chat-sft\",\n        \"dataset_dir\": \"./dataset/res_csv/sft\",\n        \"freeze_multi_modal_projector\": false, //MLLM 训练时是否冻结多模态投影器。\n        \"use_fast_tokenizer\": true,\n        \"lora_target\": \"q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2\",\n        \"lora_rank\": 2,\n        \"lora_dropout\": 0.3,\n        \"weight_decay\": 0.1,\n        \"overwrite_cache\": true,\n        \"per_device_train_batch_size\": 4,\n        \"gradient_accumulation_steps\": 8,\n        \"lr_scheduler_type\": \"cosine\",\n        \"cutoff_len\": 1024,\n        \"logging_steps\": 5,\n        \"save_steps\": 10,\n        \"learning_rate\": 1e-4,\n        \"warmup_ratio\": 0.1,\n        \"num_train_epochs\": 1,\n        \"plot_loss\": true,\n        \"fp16\": true,\n        \"flash_attn\": \"fa2\",\n        // \"deepspeed\": \"ds_config.json\" //多卡训练\n    },\n    \"infer_args\": {\n        \"repetition_penalty\": 1.2,\n        \"temperature\": 0.5,\n        \"max_length\": 50,\n        \"top_p\": 0.65\n    }\n}\n"
  },
  {
    "path": "tests/test_PII.py",
    "content": "import os\nimport shutil\nimport subprocess\nimport sys\nfrom typing import cast\n\nimport pytest\n\n# Import common functions from test_full_pipe\nfrom tests.test_full_pipe import (\n    DATASET_CSV_DIR,\n    PROJECT_ROOT_DIR,\n    get_config_files,\n    load_config_with_path,\n    print_test_header,\n    run_cli_command,\n    setup_data_environment,\n    test_logger,\n)\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import DataModality, WCMakeDatasetConfig\nfrom weclone.utils.log import logger\n\nsys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))\n\n# Setup paths\nTESTS_DIR = os.path.dirname(__file__)\nTEST_DATA_PII_DIR = os.path.join(TESTS_DIR, \"tests_data\", \"test_PII\")\n\n@pytest.mark.parametrize(\"config_file\", get_config_files())\ndef test_PII_make_dataset(config_file):\n    \"\"\"Test PII data make-dataset functionality\"\"\"\n    print_test_header(\"PII make-dataset\", config_file)\n    \n    setup_data_environment(\"test_PII\")\n    \n    # Load config and handle images if needed\n    config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config_with_path(config_file, \"make_dataset\"))\n\n    # Run make-dataset command\n    result = run_cli_command([\"make-dataset\"], config_file)\n    assert result.returncode == 0, f\"make-dataset command execution failed for config {config_file}\"\n\n    # Print all user messages from the dataset file with PII warning\n    import json\n    sft_file_path = os.path.join(PROJECT_ROOT_DIR, \"dataset\", \"res_csv\", \"sft\", \"sft-my.json\")\n    if os.path.exists(sft_file_path):\n        logger.warning(\"⚠️  WARNING: The following content contains unfiltered PII (Personally Identifiable Information):\")\n        logger.warning(\"=\" * 80)\n        \n        with open(sft_file_path, 'r', encoding='utf-8') as f:\n            data = json.load(f)\n            \n        for entry in data:\n            if 'messages' in entry:\n             
   for message in entry['messages']:\n                    if message.get('role') == 'user':\n                        logger.warning(f\"User content: {message.get('content', '')}\")\n        \n        logger.warning(\"=\" * 80)\n        logger.warning(\"⚠️  END OF UNFILTERED PII CONTENT\")\n\n    test_logger.info(f\"✅ PII make-dataset test passed for config {config_file}\")\n\nif __name__ == \"__main__\":\n    # If running directly, run tests for all configs\n    for config_file in get_config_files():\n        test_PII_make_dataset(config_file) \n"
  },
  {
    "path": "tests/test_full_pipe.py",
    "content": "import functools\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport time\nfrom typing import Callable, Optional, Union, cast\nfrom unittest import mock\n\nimport pytest\n\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import DataModality, WCMakeDatasetConfig\nfrom weclone.utils.log import logger\n\nsys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))\nPROJECT_ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))\nPROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))\nDATASET_CSV_DIR = os.path.join(PROJECT_ROOT, \"dataset\", \"csv\")\nTESTS_DIR = os.path.dirname(__file__)\nTEST_DATA_PERSON_DIR = os.path.join(TESTS_DIR, \"tests_data\", \"test_person\")\n\n\n# Backup directories\nBACKUP_DIR = os.path.join(PROJECT_ROOT, \"test_backup\")\nMODEL_OUTPUT_BACKUP = os.path.join(BACKUP_DIR, \"model_output\")\nDATASET_CSV_BACKUP = os.path.join(BACKUP_DIR, \"dataset_csv\")\n\ntest_logger = logger.bind()\ntest_logger.remove()\ntest_logger.add(\n    sys.stderr,\n    format=\"<yellow><b>{message}</b></yellow>\",\n    colorize=True,\n    level=\"INFO\",\n)\n\ndef get_config_files():\n    \"\"\"获取所有配置文件\"\"\"\n    configs_dir = os.path.join(os.path.dirname(__file__), \"configs\")\n    config_files = []\n    for file in os.listdir(configs_dir):\n        if file.endswith('.jsonc'):\n            config_files.append(f\"tests/configs/{file}\")\n    return config_files\n\ndef print_test_header(test_name: str, config_file: str = \"\"):\n    line_length = 100\n    test_logger.info(\"\\n\" + \"─\" * line_length)\n    if config_file:\n        title = f\"  Testing Phase: {test_name} | Config: {os.path.basename(config_file)}  \"\n    else:\n        title = f\"  Testing Phase: {test_name}  \"\n    padding_total = line_length - len(title)\n    padding_left = padding_total // 2\n    padding_right = padding_total - padding_left\n    test_logger.info(\" \" * 
padding_left + title + \" \" * padding_right)\n    test_logger.info(\"─\" * line_length)\n\ndef print_config_header(config_file: str):\n    \"\"\"打印配置文件开始测试的头部\"\"\"\n    line_length = 120\n    test_logger.info(\"\\n\" + \"═\" * line_length)\n    title = f\"  开始测试配置文件: {os.path.basename(config_file)}  \"\n    padding_total = line_length - len(title)\n    padding_left = padding_total // 2\n    padding_right = padding_total - padding_left\n    test_logger.info(\" \" * padding_left + title + \" \" * padding_right)\n    test_logger.info(\"═\" * line_length)\n\ndef setup_data_environment(data_folder_name: str = \"test_person\"):\n    \"\"\"Setup test data environment for specified folder\"\"\"\n    test_logger.info(f\"🔧 设置 {data_folder_name} 测试数据...\")\n    \n    # Create backup directory\n    if os.path.exists(BACKUP_DIR):\n        shutil.rmtree(BACKUP_DIR)\n    os.makedirs(BACKUP_DIR)\n    \n    # Backup model_output if it exists\n    if os.path.exists(\"model_output\"):\n        shutil.move(\"model_output\", MODEL_OUTPUT_BACKUP)\n        test_logger.info(\"已备份 model_output 目录\")\n    \n    # Backup DATASET_CSV_DIR if it exists\n    if os.path.exists(DATASET_CSV_DIR):\n        shutil.move(DATASET_CSV_DIR, DATASET_CSV_BACKUP)\n        test_logger.info(\"已备份 dataset/csv 目录\")\n    \n    os.makedirs(DATASET_CSV_DIR)\n    \n    # Setup specified test data folder\n    test_data_source_dir = os.path.join(TESTS_DIR, \"tests_data\", data_folder_name)\n    test_data_csv_dir = os.path.join(DATASET_CSV_DIR, data_folder_name)\n    os.makedirs(test_data_csv_dir)\n\n    for item_name in os.listdir(test_data_source_dir):\n        source_item_path = os.path.join(test_data_source_dir, item_name)\n        if os.path.isfile(source_item_path) :\n            destination_item_path = os.path.join(test_data_csv_dir, item_name)\n            shutil.copy2(source_item_path, destination_item_path)\n    \n    test_logger.info(f\"✅ {data_folder_name} 
测试数据设置完成\")\n\n@pytest.fixture(scope=\"session\", autouse=True)\ndef setup_test_environment():\n    \"\"\"Setup test environment once for the entire test session\"\"\"\n    test_logger.info(\"🔧 开始设置测试环境...\")\n    \n    # Use the generic setup function with default test_person data\n    setup_data_environment(\"test_person\")\n    \n    test_logger.info(\"✅ 测试环境设置完成\")\n    \n    yield  # This is where the testing happens\n    \n    # Cleanup after all tests are done\n    test_logger.info(\"🧹 开始恢复测试环境...\")\n    \n    if os.path.exists(\"model_output\"):\n        shutil.rmtree(\"model_output\")\n    if os.path.exists(DATASET_CSV_DIR):\n        shutil.rmtree(DATASET_CSV_DIR)\n    \n    if os.path.exists(MODEL_OUTPUT_BACKUP):\n        shutil.move(MODEL_OUTPUT_BACKUP, \"model_output\")\n    \n    if os.path.exists(DATASET_CSV_BACKUP):\n        shutil.move(DATASET_CSV_BACKUP, DATASET_CSV_DIR)\n    \n    if os.path.exists(BACKUP_DIR):\n        shutil.rmtree(BACKUP_DIR)\n    \n    test_logger.info(\"✅ 测试环境恢复完成\")\n\n\ndef restore_test_env():\n    \"\"\"Manual environment cleanup for direct execution (deprecated for pytest)\"\"\"\n    test_logger.info(\"🧹 手动恢复测试环境...\")\n    \n    # Remove test directories\n    if os.path.exists(\"model_output\"):\n        shutil.rmtree(\"model_output\")\n    if os.path.exists(DATASET_CSV_DIR):\n        shutil.rmtree(DATASET_CSV_DIR)\n    \n    # Restore original directories if they were backed up\n    if os.path.exists(MODEL_OUTPUT_BACKUP):\n        shutil.move(MODEL_OUTPUT_BACKUP, \"model_output\")\n        test_logger.info(\"已恢复 model_output 目录\")\n    \n    if os.path.exists(DATASET_CSV_BACKUP):\n        shutil.move(DATASET_CSV_BACKUP, DATASET_CSV_DIR)\n        test_logger.info(\"已恢复 dataset/csv 目录\")\n    \n    # Remove backup directory\n    if os.path.exists(BACKUP_DIR):\n        shutil.rmtree(BACKUP_DIR)\n        test_logger.info(\"已清理备份目录\")\n    \n    test_logger.info(\"✅ 测试环境恢复完成\")\n\ndef run_cli_command(command: list[str], 
config_path: str, timeout: int | None = None, background: bool = False) -> Union[subprocess.CompletedProcess, subprocess.Popen]:\n    \"\"\"Execute a CLI command and return the result.\n    \n    Args:\n        command: List of commands to execute.\n        config_path: Path to the configuration file.\n        timeout: Timeout in seconds.\n        background: Whether to run in the background.\n        \n    Returns:\n        If background=True, returns a Popen object; otherwise, returns a CompletedProcess object.\n    \"\"\"\n    env = os.environ.copy()\n    env[\"WECLONE_CONFIG_PATH\"] = config_path # Set environment variable\n\n    if background:\n        process = subprocess.Popen(\n            [sys.executable, \"-m\", \"weclone.cli\"] + command,\n            stderr=None,\n            stdout=None,\n            text=True,\n            cwd=PROJECT_ROOT_DIR,\n            env=env\n        )\n        time.sleep(2)\n        return process\n    else:\n        process = subprocess.run(\n            [sys.executable, \"-m\", \"weclone.cli\"] + command,\n            stderr=None,\n            stdout=None,\n            text=True,\n            cwd=PROJECT_ROOT_DIR,  # Execute in the project root directory\n            timeout=timeout,\n            env=env  # Pass the modified environment variables\n        )\n        return process\n\ndef load_config_with_path(config_file: str, config_section: str):\n    \"\"\"临时设置环境变量并加载配置\"\"\"\n    original_env = os.environ.get(\"WECLONE_CONFIG_PATH\")\n    os.environ[\"WECLONE_CONFIG_PATH\"] = config_file\n    \n    try:\n        return load_config(config_section)\n    finally:\n        # 恢复原始环境变量\n        if original_env is not None:\n            os.environ[\"WECLONE_CONFIG_PATH\"] = original_env\n        elif \"WECLONE_CONFIG_PATH\" in os.environ:\n            del os.environ[\"WECLONE_CONFIG_PATH\"]\n\ndef run_make_dataset_test(config_file: str):\n    \"\"\"执行 make-dataset 测试\"\"\"\n    print_test_header(\"make-dataset\", config_file)\n 
   \n    config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config_with_path(config_file, \"make_dataset\"))\n    if DataModality.IMAGE in config.include_type:\n        #复制图片到media_dir/iamges\n        os.makedirs(config.media_dir, exist_ok=True)\n        os.makedirs(os.path.join(config.media_dir, \"images\"), exist_ok=True)\n        for file in os.listdir(os.path.join(PROJECT_ROOT_DIR, \"tests\", \"tests_data\", \"images\")):\n            shutil.copy(os.path.join(PROJECT_ROOT_DIR, \"tests\", \"tests_data\", \"images\", file), os.path.join(config.media_dir, \"images\", file))\n\n    result = run_cli_command([\"make-dataset\"], config_file)\n    assert result.returncode == 0, f\"make-dataset command execution failed for config {config_file}\"\n\n    # Check if blocked_words filtering is working correctly\n    sft_file_path = os.path.join(PROJECT_ROOT_DIR, \"dataset\", \"res_csv\", \"sft\", \"sft-my.json\")\n    with open(sft_file_path, 'r', encoding='utf-8') as f:\n        content = f.read()\n        if \"hh\" in content:\n            assert False, f\"blocked_words filtering failed for config {config_file}: found 'hh' in {sft_file_path}\"\n    test_logger.info(f\"✅ blocked_words filtering check passed for config {config_file}\")\n    \n    # Check if <image> tags count is correct for Qwen2.5-VL.jsonc config\n    if \"Qwen2.5-VL.jsonc\" in config_file:\n        image_count = content.count(\"<image>\")\n        assert image_count == 3, f\"Expected 3 <image> tags in {sft_file_path} for config {config_file}, but found {image_count}\"\n        test_logger.info(f\"✅ <image> tags count check passed for config {config_file}: found {image_count} <image> tags\")\n\n    \n\ndef run_train_sft_test(config_file: str):\n    \"\"\"执行 train-sft 测试\"\"\"\n    print_test_header(\"train-sft\", config_file)\n   \n    try:\n        result = run_cli_command([\"train-sft\"], config_file) \n        assert result.returncode == 0, f\"train-sft command failed or did not fail fast as 
expected for config {config_file}\"\n    except subprocess.TimeoutExpired:\n        test_logger.info(f\"train-sft command terminated due to timeout for config {config_file}, which is acceptable in testing, indicating the command has started execution.\")\n        pass\n    except Exception as e:\n        pytest.fail(f\"An unexpected error occurred during train-sft command execution for config {config_file}: {e}\")\n\ndef run_webchat_demo_test(config_file: str):\n    \"\"\"执行 webchat-demo 测试\"\"\"\n    print_test_header(\"webchat-demo\", config_file)\n    \n    try:\n        result = run_cli_command([\"webchat-demo\"], config_file, timeout=20)\n        assert result.returncode == 0, f\"webchat-demo command execution failed for config {config_file}\"\n    except subprocess.TimeoutExpired:\n        pass\n\ndef run_server_test(config_file: str) -> subprocess.Popen:\n    \"\"\"执行 server 测试，返回进程对象\"\"\"\n    print_test_header(\"server (background)\", config_file)\n    server_process = cast(subprocess.Popen, run_cli_command([\"server\"], config_file, background=True))\n    test_logger.info(\"等待服务器启动，20秒后检查状态...\")\n    time.sleep(20)\n    assert server_process.poll() is None, f\"Server startup failed for config {config_file}\"\n    test_logger.info(f\"使用配置 {config_file} 的服务器已在后台启动\")\n    return server_process\n\ndef run_test_model_test(config_file: str, server_process: subprocess.Popen):\n    \"\"\"执行 test-model 测试并关闭服务器\"\"\"\n    print_test_header(\"test-model\", config_file)\n    try:\n        result = run_cli_command([\"test-model\"], config_file)\n        assert result.returncode == 0, f\"test-model command execution failed for config {config_file}\"\n    finally:\n        if server_process is not None and server_process.poll() is None:\n            test_logger.info(f\"测试完成，正在关闭使用配置 {config_file} 的服务器...\")\n            server_process.terminate()\n            server_process.wait(timeout=5)\n            if server_process.poll() is None:\n                
server_process.kill()  # Force kill if the process hasn't terminated\n            test_logger.info(\"服务器已关闭\")\n\ndef clean_model_output():\n    \"\"\"Clean model_output directory before each config test\"\"\"\n    if os.path.exists(\"model_output\"):\n        shutil.rmtree(\"model_output\")\n\n@pytest.mark.parametrize(\"config_file\", get_config_files())\ndef test_full_pipeline_for_config(config_file):\n    \"\"\"为每个配置文件完整执行所有测试步骤\"\"\"\n    print_config_header(config_file)\n    \n    clean_model_output()\n    \n    server_process = None\n    try:\n        # 按顺序执行所有测试步骤\n        run_make_dataset_test(config_file)\n        run_train_sft_test(config_file)\n        run_webchat_demo_test(config_file)\n        server_process = run_server_test(config_file)\n        run_test_model_test(config_file, server_process)\n        \n        test_logger.info(f\"✅ 配置文件 {os.path.basename(config_file)} 的所有测试已完成\")\n        \n    except Exception as e:\n        test_logger.error(f\"❌ 配置文件 {os.path.basename(config_file)} 测试失败: {e}\")\n        if server_process is not None and server_process.poll() is None:\n            server_process.terminate()\n            server_process.wait(timeout=5)\n            if server_process.poll() is None:\n                server_process.kill()\n        raise\n\nif __name__ == \"__main__\":\n    try:\n        # If running directly, you would put your test code here\n        pass\n    finally:\n        restore_test_env()\n"
  },
  {
    "path": "tests/tests_data/test_PII/test_0_730.csv",
    "content": "id,MsgSvrID,type_name,is_sender,talker,room_name,msg,src,CreateTime\n7,4073926741244663531,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,13812345678,,2024/10/4 11:43\n8,4073926741244663532,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n9,706358374822797422,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,zhang.wei@163.com,,2024/10/4 11:43\n10,706358374822797423,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n11,2122553892045962801,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,4532123456789012,,2024/10/4 11:43\n12,2122553892045962802,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n13,5704142615879617852,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,192.168.1.100,,2024/10/4 11:43\n14,5704142615879617853,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n15,1337798072543283708,文本,0,LOCATION,wxid_6789z5qlxzfj22,北京市朝阳区三里屯,,2024/10/4 11:43\n16,1337798072543283709,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n17,8192964515963336399,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,GB33BUKB20201555555555,,2024/10/4 11:43\n18,8192964515963336400,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n19,7913656383976388488,文本,0,CRYPTO,wxid_6789z5qlxzfj22,1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa,,2024/10/4 11:43\n20,7913656383976388489,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n21,1964923183359419454,文本,0,AGE,wxid_6789z5qlxzfj22,25岁,,2024/10/4 11:43\n22,1964923183359419455,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n23,2403233409323875303,文本,0,ID,wxid_6789z5qlxzfj22,110101199001011234,,2024/10/4 11:43\n24,2403233409323875304,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n25,4630229215952295971,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,021-62345678,,2024/10/4 11:43\n26,4630229215952295972,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n27,6547675850931813364,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,li.ming@qq.com,,2024/10/4 11:43\n28,6547675850931813365,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 
11:43\n31,8151408074985365130,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,2001:0db8:85a3:0000:0000:8a2e:0370:7334,,2024/10/4 11:43\n32,8151408074985365131,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:43\n33,9876543210123456789,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,+1-125-123-4567,,2024/10/4 11:44\n34,9876543210123456790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n35,1234567890987654321,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,john.doe@gmail.com,,2024/10/4 11:44\n36,1234567890987654322,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n37,5555444433332222111,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,4111111111111111,,2024/10/4 11:44\n38,5555444433332222112,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n39,7777888899990000123,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,203.0.113.1,,2024/10/4 11:44\n40,7777888899990000124,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n41,3333222211110000456,文本,0,LOCATION,wxid_6789z5qlxzfj22,1600 Pennsylvania Avenue NW Washington DC,,2024/10/4 11:44\n42,3333222211110000457,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n43,9999000011112222789,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,DE89370400440532013000,,2024/10/4 11:44\n44,9999000011112222790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n45,1111333355557777012,文本,0,CRYPTO,wxid_6789z5qlxzfj22,bc1qxy2kgdygjrsqtzq2n0yrf2493p83kkfjhx0wlh,,2024/10/4 11:44\n46,1111333355557777013,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n47,4444666688880000345,文本,0,AGE,wxid_6789z5qlxzfj22,32 years old,,2024/10/4 11:44\n48,4444666688880000346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n49,2222555577779999678,文本,0,US_SSN,wxid_6789z5qlxzfj22,078-05-1120,,2024/10/4 11:44\n50,2222555577779999679,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n51,6666111133335555901,文本,0,PHONE_NUMBER,wxid_6789z5qlxzfj22,+44-20-7946-0958,,2024/10/4 11:44\n52,6666111133335555902,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n53,8888222244446666234,文本,0,EMAIL_ADDRESS,wxid_6789z5qlxzfj22,sarah.johnson@outlook.com,,2024/10/4 
11:44\n54,8888222244446666235,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n55,0000444466668888567,文本,0,CREDIT_CARD,wxid_6789z5qlxzfj22,5555555555554444,,2024/10/4 11:44\n56,0000444466668888568,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n57,3333777799991111890,文本,0,IP_ADDRESS,wxid_6789z5qlxzfj22,172.16.254.1,,2024/10/4 11:44\n58,3333777799991111891,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n59,5555999911113333123,文本,0,LOCATION,wxid_6789z5qlxzfj22,10 Downing Street London SW1A 2AA UK,,2024/10/4 11:44\n60,5555999911113333124,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n61,7777000022224444456,文本,0,IBAN_CODE,wxid_6789z5qlxzfj22,FR1420041010050500013M02606,,2024/10/4 11:44\n62,7777000022224444457,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n63,9999222244446666789,文本,0,CRYPTO,wxid_6789z5qlxzfj22,3QJmV3qfvL9SuYo34YihAf3sRCW3qSinyC,,2024/10/4 11:44\n64,9999222244446666790,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n65,1111555577779999012,文本,0,AGE,wxid_6789z5qlxzfj22,28岁,,2024/10/4 11:44\n66,1111555577779999013,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n67,4444888800002222345,文本,0,ID,wxid_6789z5qlxzfj22,AB123456C,,2024/10/4 11:44\n68,4444888800002222346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n69,4444888800002222345,文本,0,666,wxid_6789z5qlxzfj22,404,,2024/10/4 11:44\n70,4444888800002222346,文本,1,,wxid_6789z5qlxzfj22,FALSE,,2024/10/4 11:44\n"
  },
  {
    "path": "tests/tests_data/test_model_data.json",
    "content": "{\n    \"questions\": [\n        [\n            \"吃了吗？\",\n            \"吃的什么啊\",\n            \"好吃吗\",\n            \"多少钱啊\",\n            \"可以请我吃吗\"\n        ],\n        [\n            \"干嘛呢？\",\n            \"等会准备干什么去\"\n        ],\n        [\n            \"最近有什么新鲜事发生吗？\",\n            \"有没有什么有趣的故事可以分享？\"\n        ],\n        [\n            \"周末过得怎么样？\",\n            \"做了什么好玩的？\"\n        ],\n        [\n            \"今天天气怎么样？\",\n            \"你那里呢？\"\n        ],\n        [\n            \"最近工作/学习顺利吗？\",\n            \"有没有遇到什么挑战？\"\n        ]\n    ]\n}\n"
  },
  {
    "path": "tests/tests_data/test_person/test_0_730.csv",
    "content": "id,MsgSvrID,type_name,is_sender,talker,room_name,msg,src,CreateTime\n1,7437267147299592543,图片,0,12345iru2zsmo22,test_person,图片,File\\dd0e62b6eb67d195bc33ab9470301d6c\\Image\\2024-10\\01c177d8ad90af8969ba048455b54eef.dat,2024/10/4 11:42\n2,637529293739295664,图片,0,12345iru2zsmo22,test_person,图片,File\\dd0e62b6eb67d195bc33ab9470301d6c\\Image\\2024-10\\d8a8936ca622823452e45b5e180a53a6.dat,2024/10/4 11:42\n7,4073926741244663531,文本,1,12345iru2zsmo22,test_person,小马尔代夫,,2024/10/4 11:43\n8,706358374822797422,文本,1,12345iru2zsmo22,test_person,名不虚传,,2024/10/4 11:43\n9,2122553892045962801,文本,0,test_person,test_person,我去 好可爱啊,,2024/10/4 11:43\n10,5704142615879617852,文本,0,test_person,test_person,2.0156416,,2024/10/4 11:43\n11,1337798072543283708,文本,0,test_person,test_person,,,2024/10/4 11:43\n12,8192964515963336399,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 11:43\n13,7913656383976388488,文本,1,12345iru2zsmo22,test_person,不是,,2024/10/4 11:43\n14,1964923183359419454,文本,1,12345iru2zsmo22,test_person,你过来就得老久,,2024/10/4 11:43\n15,2403233409323875303,文本,0,test_person,test_person,我在南站,,2024/10/4 11:43\n16,4630229215952295971,文本,0,test_person,test_person,我知道我学校离养马岛12km,,2024/10/4 11:43\n17,6547675850931813364,文本,0,test_person,test_person,[旺柴],,2024/10/4 11:43\n18,1900866115792249247,文本,1,12345iru2zsmo22,test_person,牟平站,,2024/10/4 11:43\n19,8151408074985365130,文本,1,12345iru2zsmo22,test_person,近,,2024/10/4 11:43\n20,2421069219348160202,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 11:43\n21,8751079973209533492,文本,0,test_person,test_person,我去接朋友 他12点到,,2024/10/4 11:44\n22,1133854364527684495,动画表情,1,12345iru2zsmo22,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=b3ffed81b71099d903628c5877bd0792&filekey=30440201010430302e02016e04025348042062336666656438316237313039396439303336323863353837376264303739320203030f78040d00000004627466730000000132&hy=SH&storeid=264ffed170006ff2f4bd405e70000006e01004fb153480ff458e0b6a8ce9e7&ef=1&bizid=1022,2024/10/4 
11:45\n23,3003440481974462293,文本,1,12345iru2zsmo22,test_person,一下午应该也行,,2024/10/4 11:45\n24,3403121757406614004,文本,0,test_person,test_person,先吃饭然后忙完估计三点 过去就四点,,2024/10/4 11:46\n25,6917846734389470451,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=f6ae48635cfc57931cbdcd4453230c36&filekey=3043020101042f302d02016e0402535a0420663661653438363335636663353739333163626463643434353332333063333602027039040d00000004627466730000000132&hy=SZ&storeid=266e7c5b40007eea8169bd3e30000006e01004fb1535a2836fbc1e6d7f6305&ef=1&bizid=1022,2024/10/4 11:46\n26,3853926419342399869,文本,0,test_person,test_person,蒜啦,,2024/10/4 11:46\n27,689214144441695718,文本,1,12345iru2zsmo22,test_person,啊哈哈,,2024/10/4 11:46\n28,501703563680542858,文本,1,12345iru2zsmo22,test_person,那算了,,2024/10/4 11:46\n29,5175776596048859341,文本,1,12345iru2zsmo22,test_person,可以明天来,,2024/10/4 11:46\n30,1468168499470203020,文本,0,test_person,test_person,希望明天天气好,,2024/10/4 11:46\n31,8704117912978418734,文本,0,test_person,test_person,你租车了没,,2024/10/4 11:46\n32,3720367917725174786,文本,1,12345iru2zsmo22,test_person,hh,,2024/10/4 11:46\n33,5706726594668713894,文本,1,12345iru2zsmo22,test_person,租了,,2024/10/4 11:46\n34,6749208560602575120,文本,1,12345iru2zsmo22,test_person,150一天,,2024/10/4 11:47\n35,6547279599090331225,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=f6ae48635cfc57931cbdcd4453230c36&filekey=3043020101042f302d02016e0402535a0420663661653438363335636663353739333163626463643434353332333063333602027039040d00000004627466730000000132&hy=SZ&storeid=266e7c5b40007eea8169bd3e30000006e01004fb1535a2836fbc1e6d7f6305&ef=1&bizid=1022,2024/10/4 11:47\n36,783513142739644929,文本,1,12345iru2zsmo22,test_person,我本来只用半天是的,,2024/10/4 11:47\n37,1522589433173967165,文本,0,test_person,test_person,我觉得还不如6小时,,2024/10/4 11:47\n38,4189192320331088356,文本,1,12345iru2zsmo22,test_person,他说半天得提前预约,,2024/10/4 
11:47\n39,3276909886419115321,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=33935c33478d351ca575ed3b15c9c7d6&filekey=30350201010421301f020201060402535a041033935c33478d351ca575ed3b15c9c7d60203046683040d00000004627466730000000132&hy=SZ&storeid=266a76de2000bb3f307f6952b0000010600004f50535a2d4f20115699447b2&bizid=1023,2024/10/4 11:47\n40,3794700043110742367,文本,0,test_person,test_person,我靠,,2024/10/4 11:47\n41,4370237514919765211,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=e734d92f35462ae39096a6453a906c64&filekey=30440201010430302e02016e04025348042065373334643932663335343632616533393039366136343533613930366336340203011cf9040d00000004627466730000000132&hy=SH&storeid=266936043000dd1f97ec388740000006e01004fb153482828bbc1e6cab618e&ef=1&bizid=1022,2024/10/4 11:47\n42,268652740652374624,文本,1,12345iru2zsmo22,test_person,我昨天没预约,,2024/10/4 11:47\n43,8708021080758144662,文本,0,test_person,test_person,其实也玩不了多久 就环岛骑一圈,,2024/10/4 11:47\n44,4042214828835453981,文本,1,12345iru2zsmo22,test_person,是滴,,2024/10/4 11:47\n45,8134305588773593834,图片,0,12345iru2zsmo22,test_person,图片,File\\dd0e62b6eb67d195bc33ab9470301d6c\\Image\\2024-10\\13d6d8a81fa7554d09238c81fe314e85.dat,2024/10/4 15:49\n46,8231897199371315830,文本,1,12345iru2zsmo22,test_person,back了,,2024/10/4 15:49\n47,2523360219807779607,文本,1,12345iru2zsmo22,test_person,烟台下次还来,,2024/10/4 15:49\n48,5990956613985588267,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=d8f4bf1e82e7b54a140cadd0dd1788a9&filekey=30350201010421301f020201060402535a0410d8f4bf1e82e7b54a140cadd0dd1788a90203054dd3040d00000004627466730000000132&hy=SZ&storeid=266a7694900068f8207f6952b0000010600004f50535a043fb011502ab6738&bizid=1023,2024/10/4 15:49\n49,8020701317904864408,文本,0,test_person,test_person,坏了忘记回你,,2024/10/4 
21:24\n50,678530733212459598,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=ecd83bbd5669b1ad8ab82368a884b4f4&filekey=30440201010430302e02016e040253480420656364383362626435363639623161643861623832333638613838346234663402030be21c040d00000004627466730000000132&hy=SH&storeid=2666a63330000e6ca9bd747110000006e01004fb1534829283bc1e72de8bdc&ef=1&bizid=1022,2024/10/4 21:24\n51,3817737317258834248,文本,0,test_person,test_person,你租的小车车好可爱哈哈哈,,2024/10/4 21:24\n52,1122488129382806721,文本,0,test_person,test_person,我之前租的时候 人家说这个没劲儿 租了个大的,,2024/10/4 21:25\n53,1244017411047763227,引用回复,0,test_person,test_person,\"（我们两个人骑）\n\n[引用](2024-10-04 21:25:04)小虫:我之前租的时候 人家说这个没劲儿 租了个大的\",,2024/10/4 21:25\n54,7804635386065632983,文本,1,12345iru2zsmo22,test_person,hh,,2024/10/4 21:26\n55,6582317494846210955,文本,1,12345iru2zsmo22,test_person,劲,,2024/10/4 21:26\n56,6947874557250248646,文本,1,12345iru2zsmo22,test_person,相当大,,2024/10/4 21:26\n57,7646558619446387721,系统通知,1,12345iru2zsmo22,test_person,<revokemsg>你撤回了一条消息</revokemsg>,,2024/10/4 21:26\n58,4607675874750661759,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2c6bbf882e053b9639569c47194da071&filekey=30440201010430302e02016e0402534804203263366262663838326530353362393633393536396334373139346461303731020326eec4040d00000004627466730000000131&hy=SH&storeid=323032313130303432303436333430303061373166346264613936393936353336376234306230303030303036653031303034666231&ef=1&bizid=1022,2024/10/4 21:27\n59,4251046275168351582,文本,1,12345iru2zsmo22,test_person,我都超??,,2024/10/4 21:27\n60,3958569781970448507,文本,0,test_person,test_person,哈哈哈哈哈哈哈哈哈,,2024/10/4 21:27\n61,1304768232206478205,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/10/4 23:11\n62,5423864699615912300,文本,1,12345iru2zsmo22,test_person,生日快乐,,2024/10/4 23:11\n63,3090401677076458687,系统通知,1,12345iru2zsmo22,test_person,<revokemsg>你撤回了一条消息</revokemsg>,,2024/10/4 23:11\n64,2634320168319877355,文本,1,12345iru2zsmo22,test_person,（先知后觉,,2024/10/4 
23:11\n65,6872254500032132923,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=6f263986f9402e0259c58032d4b27403&filekey=30440201010430302e02016e04025348042036663236333938366639343032653032353963353830333264346232373430330203088dea040d00000004627466730000000132&hy=SH&storeid=265390f8400011cc0008b27880000006e01004fb1534806488bc1e0dae90d0&ef=1&bizid=1022,2024/10/4 23:12\n66,1476937912918221285,文本,0,test_person,test_person,嘿嘿谢谢泥,,2024/10/4 23:12\n67,5104969914181545205,文本,0,test_person,test_person,是明天嘟,,2024/10/4 23:12\n68,4671697784411486925,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=4af51669b3933aeacf4c1823fe1a1654&filekey=3043020101042f302d02016e0402535a0420346166353136363962333933336165616366346331383233666531613136353402023d49040d00000004627466730000000132&hy=SZ&storeid=26673b134000b229540c40cba0000006e01004fb1535a1e1a70b1568486971&ef=1&bizid=1022,2024/10/4 23:12\n69,942777755221289888,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=233b78d9d244a70087993eb38becca42&filekey=30350201010421301f02020106040253480410233b78d9d244a70087993eb38becca42020310919b040d00000004627466730000000132&hy=SH&storeid=266a9daa7000bc718169bd3e30000010600004f5053480627c0d1500e10846&bizid=1023,2024/10/4 23:13\n70,4970918958858872918,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=06f0315fce5aeb37e36d90a449e55224&filekey=3043020101042f302d02016e0402535a04203036663033313566636535616562333765333664393061343439653535323234020269b0040d00000004627466730000000132&hy=SZ&storeid=26631ab4d0005d61f47603ed30000006e01004fb1535a0416fbc1e68d59226&ef=1&bizid=1022,2024/10/4 
23:13\n71,5088103607264479657,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2fd26fe001f15baa35d2c7c1f1f77a11&filekey=30440201010430302e02016e0402535a04203266643236666530303166313562616133356432633763316631663737613131020301efba040d00000004627466730000000132&hy=SZ&storeid=266a9e4680007ad63169bd3e30000006e01004fb1535a05ff801150a57b6eb&ef=1&bizid=1022,2024/10/4 23:13\n72,1231447585119365782,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=f0648e1f78507fb5e0527c1847bb7eab&filekey=30350201010421301f020201060402535a0410f0648e1f78507fb5e0527c1847bb7eab0203046443040d00000004627466730000000132&hy=SZ&storeid=266a75fa5000a957907f6952b0000010600004f50535a0026bae1e00f753c3&bizid=1023,2024/10/4 23:13\n73,7243064063443092107,文本,0,test_person,test_person,哈哈哈哈哈哈哈,,2024/12/15 21:01\n74,4402111010190356867,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=2bb2100bc3ed89cb9bb5cb2ddf096ba5&filekey=30440201010430302e02016e0402535a04203262623231303062633365643839636239626235636232646466303936626135020301364a040d00000004627466730000000132&hy=SZ&storeid=2655320ef000f23e7135418150000006e01004fb1535a2071d321e07d71e51&ef=1&bizid=1022,2024/12/15 21:01\n75,7957007613667310251,动画表情,1,12345iru2zsmo22,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=a3edb04dc624702bf53ceb8b8533030e&filekey=30440201010430302e02016e04025348042061336564623034646336323437303262663533636562386238353333303330650203031c6a040d00000004627466730000000132&hy=SH&storeid=267384c7a0008ff136f6e1f2a0000006e01004fb1534807906bd1e77d51516&ef=1&bizid=1022,2024/12/15 21:01\n76,6555773752081434395,文本,1,12345iru2zsmo22,test_person,一年级社畜,,2024/12/15 21:02\n77,6113080126607357441,文本,0,test_person,test_person,妈耶好诡异,,2024/12/15 
21:07\n78,8861993796968204324,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=e868279290cc3be14207d9946d5f7479&filekey=30350201010421301f020201060402535a0410e868279290cc3be14207d9946d5f747902030204ac040d00000004627466730000000132&hy=SZ&storeid=264c8b3cb0002e54407f6952b0000010600004f50535a0df0c950b74cd5f3e&bizid=1023,2024/12/15 21:07\n79,8225573753184622169,文本,1,12345iru2zsmo22,test_person,上班,,2024/12/15 21:07\n80,2185462146394548348,文本,1,12345iru2zsmo22,test_person,是会这样的,,2024/12/15 21:07\n81,2492434220482862582,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=8a3ec2eb01ffade8c1d2cef9ce7b9cd0&filekey=30440201010430302e02016e0402535a042038613365633265623031666661646538633164326365663963653762396364300203023e74040d00000004627466730000000132&hy=SZ&storeid=26561e4d90003ae9fa356ce630000006e01004fb1535a04b6bae1e6e3ef683&ef=1&bizid=1022,2024/12/15 21:08\n82,3323843778125596201,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=e36da9879caba2492757a93c9bd1e8a3&filekey=30350201010421301f020201060402535a0410e36da9879caba2492757a93c9bd1e8a302030229bb040d00000004627466730000000132&hy=SZ&storeid=266f634870009dfbd6f6e1f2a0000010600004f50535a1b369bc1e691b22e9&bizid=1023,2024/12/15 21:08\n83,4807494899095110953,文本,1,12345iru2zsmo22,test_person,？？你和拼多多签约了？？？,,2024/12/30 21:06\n84,7732495155588506274,图片,0,12345iru2zsmo22,test_person,图片,File\\dd0e62b6eb67d195bc33ab9470301d6c\\Image\\2024-12\\01c177d8ad90af8969ba048455b54eef.dat,2024/12/30 21:06\n85,3524820582691543233,文本,0,test_person,test_person,给你99 给我花9.9买一个,,2024/12/30 21:08\n86,4559086380629629977,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=60fc498474f154e67a9406e6052774e3&filekey=30350201010421301f0202010604025348041060fc498474f154e67a9406e6052774e302030a1000040d00000004627466730000000132&hy=SH&storeid=2638fd60e000bbbf33df216b40000010600004f5053482e31b8e0b68ce4bb7&bizid=1023,2024/12/30 
21:08\n87,2856784266939461622,文本,0,test_person,test_person,我和偷玩签约了（bushi）,,2024/12/30 21:09\n88,4103450553959091238,文本,1,12345iru2zsmo22,test_person,啊哈哈哈哈,,2024/12/30 21:10\n89,4320467837159769744,图片,0,test_person,test_person,图片,File\\dd0e62b6eb67d195bc33ab9470301d6c\\Image\\2024-12\\e7e73ba89149fc57ea6fd395b00c9daf.dat,2024/12/30 21:21\n90,1245688256602333044,文本,0,test_person,test_person,不知道他有没有看上我,,2024/12/30 21:21\n91,3496799115798928577,动画表情,0,test_person,test_person,表情,http://File.tc.xx.com/110/20401/stodownload?m=6d737e0cb5ed70dbd2e543192395e627&filekey=30440201010430302e02016e040253480420366437333765306362356564373064626432653534333139323339356536323702030108ea040d00000004627466730000000132&hy=SH&storeid=26743cbcd000e81a54eaf9c070000006e01004fb153481223f03156d7563c2&ef=1&bizid=1022,2024/12/30 21:21\n92,8086695983133128935,文本,0,test_person,test_person,幸运的话 会送我小熊虫,,2024/12/30 21:21\n93,1413825802731496171,文本,0,test_person,test_person,不幸运就没有后续了,,2024/12/30 21:21\n94,4732788513210348588,文本,1,12345iru2zsmo22,test_person,啊哈哈哈,,2024/12/30 21:25\n95,2430661934638082113,文本,1,12345iru2zsmo22,test_person,聘你为,,2024/12/30 21:25\n96,8416545366058458010,文本,1,12345iru2zsmo22,test_person,代言人,,2024/12/30 21:26\n97,8818193356512955281,动画表情,0,test_person,test_person,表情,http://xx.com/262/20304/stodownload?m=87a30250a72f68eb6dbcd3c833f34af9&filekey=30350201010421301f020201060402535a041087a30250a72f68eb6dbcd3c833f34af902030a08d8040d00000004627466730000000131&hy=SZ&storeid=32303231303632363030303735333030303965316137356663316362626162343537353830393030303030313036&bizid=1023,2024/12/30 21:27\n98,2420601785318357838,文本,0,test_person,test_person,比不上别人,,2024/12/30 21:27\n99,209384579714630809,文本,0,test_person,test_person,名额多 我才可能有机会,,2024/12/30 
21:27\n100,3867626588038981853,动画表情,1,12345iru2zsmo22,test_person,表情,http://xx.com/262/20304/stodownload?m=314db052c1847c0b51794ce3eff22482&filekey=30340201010420301e02020106040253480410314db052c1847c0b51794ce3eff224820202142f040d00000004627466730000000132&hy=SH&storeid=26303a1b5000ee6ad950c9c370000010600004f50534828034b00b6d05aeac&bizid=1023,2024/12/30 21:28\n"
  },
  {
    "path": "weclone/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/cli.py",
    "content": "import functools\nimport os\nimport sys\nfrom pathlib import Path\nfrom typing import cast\n\nimport click\nimport pyjson5\nfrom rich.console import Console\nfrom rich.panel import Panel\nfrom rich.text import Text\n\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import CliArgs\nfrom weclone.utils.log import capture_output, configure_log_level_from_config, logger\n\ncli_config: CliArgs | None = None\n\ntry:\n    import tomllib  # type: ignore Python 3.11+\nexcept ImportError:\n    import tomli as tomllib\n\n\ndef clear_argv(func):\n    \"\"\"\n    Decorator: Clear sys.argv before calling the decorated function, keeping only the script name. Restore original sys.argv after calling.\n    Used to prevent arguments from being parsed by Hugging Face HfArgumentParser causing ValueError.\n    \"\"\"\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        original_argv = sys.argv.copy()\n        sys.argv = [original_argv[0]]  # Keep only script name\n        try:\n            return func(*args, **kwargs)\n        finally:\n            sys.argv = original_argv  # Restore original sys.argv\n\n    return wrapper\n\n\ndef with_community_info(func):\n    \"\"\"\n    Decorator: Show community info before executing the command\n    \"\"\"\n\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        show_community_info()\n        return func(*args, **kwargs)\n\n    return wrapper\n\n\ndef apply_common_decorators(capture_output_enabled=False):\n    \"\"\"\n    A unified decorator for applications\n    \"\"\"\n\n    def decorator(original_cmd_func):\n        @functools.wraps(original_cmd_func)\n        def new_runtime_wrapper(*args, **kwargs):\n            if cli_config and cli_config.full_log:\n                return capture_output(original_cmd_func)(*args, **kwargs)\n            else:\n                return original_cmd_func(*args, **kwargs)\n\n        func_with_clear_argv = 
clear_argv(new_runtime_wrapper)\n\n        return functools.wraps(original_cmd_func)(func_with_clear_argv)\n\n    return decorator\n\n\n@click.group(invoke_without_command=True)\n@click.option(\n    \"--config-path\",\n    default=None,\n    help=\"Specify config file path, or set WECLONE_CONFIG_PATH environment variable\",\n)\n@click.pass_context\ndef cli(ctx, config_path):\n    \"\"\"WeClone: One-stop solution for creating digital avatars from chat history\"\"\"\n    # Only show community info when no subcommand is invoked\n    if ctx.invoked_subcommand is None:\n        show_community_info()\n        click.echo(ctx.get_help())\n        return\n\n    if config_path:\n        os.environ[\"WECLONE_CONFIG_PATH\"] = config_path\n        logger.info(f\"Config file path set to: {config_path}\")\n\n    _check_project_root()\n    _check_versions()\n    global cli_config\n    cli_config = cast(CliArgs, load_config(arg_type=\"cli_args\"))\n\n    configure_log_level_from_config()\n\n\n@cli.command(\"make-dataset\", help=\"Process chat history CSV files to generate Q&A pair datasets.\")\n@with_community_info\n@apply_common_decorators()\ndef qa_generator():\n    \"\"\"Process chat history CSV files to generate Q&A pair datasets.\"\"\"\n    from weclone.data.qa_generator import DataProcessor\n\n    processor = DataProcessor()\n    processor.main()\n\n\n@cli.command(\"train-sft\", help=\"Fine-tune the model using prepared datasets.\")\n@apply_common_decorators()\ndef train_sft():\n    \"\"\"Fine-tune the model using prepared datasets.\"\"\"\n    from weclone.train.train_sft import main as train_sft_main\n\n    train_sft_main()\n\n\n@cli.command(\"webchat-demo\", help=\"Launch Web UI for interactive testing with fine-tuned model.\")\n@apply_common_decorators()\ndef web_demo():\n    \"\"\"Launch Web UI for interactive testing with fine-tuned model.\"\"\"\n    from weclone.eval.web_demo import main as web_demo_main\n\n    web_demo_main()\n\n\n# TODO Add evaluation functionality 
@cli.command(\"eval-model\", help=\"Evaluate using validation set split from training data.\")\n@apply_common_decorators()\ndef eval_model():\n    \"\"\"Evaluate using validation set split from training data.\"\"\"\n    from weclone.eval.eval_model import main as evaluate_main\n\n    evaluate_main()\n\n\n@cli.command(\"test-model\", help=\"Test model with common chat questions.\")\n@apply_common_decorators()\ndef test_model():\n    \"\"\"Test model with common chat questions.\"\"\"\n    from weclone.eval.test_model import main as test_main\n\n    test_main()\n\n\n@cli.command(\"server\", help=\"Start API service providing model inference interface.\")\n@apply_common_decorators()\ndef server():\n    \"\"\"Start API service providing model inference interface.\"\"\"\n    from weclone.server.api_service import main as server_main\n\n    server_main()\n\n\n@cli.command(\"version\", help=\"Show WeClone version information.\")\n@with_community_info\ndef version():\n    \"\"\"Show WeClone version information.\"\"\"\n    pass\n\n\ndef show_community_info():\n    console = Console()\n    content = Text()\n    content.append(\"📱 Official group\\n\", style=\"bold green\")\n    content.append(\"   • Telegram: \", style=\"bold cyan\")\n    content.append(\"https://t.me/+JEdak4m0XEQ3NGNl\\n\", style=\"bright_blue\")\n    content.append(\"   • QQ群: \", style=\"bold cyan\")\n    content.append(\"708067078\\n\\n\", style=\"bright_green\")\n    content.append(\"🌐 Social media\\n\", style=\"bold magenta\")\n    content.append(\"   • Twitter: \", style=\"bold cyan\")\n    content.append(\"https://x.com/weclone567\\n\", style=\"bright_blue\")\n    content.append(\"   • 小红书: \", style=\"bold cyan\")\n    content.append(\"🔍 搜索WeClone\\n\\n\", style=\"bright_blue\")\n    content.append(\"📚 Official resources\\n\", style=\"bold red\")\n    content.append(\"   • Repository: \", style=\"bold cyan\")\n    content.append(\"https://github.com/xming521/WeClone\\n\", style=\"bright_blue\")\n    
content.append(\"   • Homepage: \", style=\"bold cyan\")\n    content.append(\"https://www.weclone.love/\\n\", style=\"bright_blue\")\n    content.append(\"   • Document: \", style=\"bold cyan\")\n    content.append(\"https://docs.weclone.love/\\n\\n\", style=\"bright_blue\")\n    content.append(\"💡 感谢您的关注和支持！Thank you for your support!\", style=\"bold bright_green\")\n    panel = Panel(\n        content,\n        title=\"🌟 Community & Social Media\",\n        title_align=\"center\",\n        border_style=\"bright_cyan\",\n        padding=(1, 2),\n    )\n    console.print(panel)\n\n\ndef _check_project_root():\n    \"\"\"Check if current directory is project root and verify project name.\"\"\"\n    project_root_marker = \"pyproject.toml\"\n    current_dir = Path(os.getcwd())\n    pyproject_path = current_dir / project_root_marker\n\n    if not pyproject_path.is_file():\n        logger.error(f\"{project_root_marker} file not found in current directory.\")\n        logger.error(\"Please ensure you are running this command in the WeClone project root directory.\")\n        sys.exit(1)\n\n    try:\n        with open(pyproject_path, \"rb\") as f:\n            pyproject_data = tomllib.load(f)\n        project_name = pyproject_data.get(\"project\", {}).get(\"name\")\n        if project_name != \"WeClone\":\n            logger.error(\"Please ensure you are running in the correct WeClone project root directory.\")\n            sys.exit(1)\n    except tomllib.TOMLDecodeError as e:\n        logger.error(f\"Error: Unable to parse {pyproject_path} file: {e}\")\n        sys.exit(1)\n    except Exception as e:\n        logger.error(f\"Unexpected error occurred while reading or processing {pyproject_path}: {e}\")\n        sys.exit(1)\n\n\ndef _check_versions():\n    \"\"\"Compare local settings.jsonc version with config file guide version in pyproject.toml\"\"\"\n    if tomllib is None:  # Skip check if toml parser failed to import\n        return\n\n    ROOT_DIR = 
Path(__file__).parent.parent\n    SETTINGS_PATH = ROOT_DIR / \"settings.jsonc\"\n    PYPROJECT_PATH = ROOT_DIR / \"pyproject.toml\"\n\n    settings_version = None\n    config_guide_version = None\n    config_changelog = None\n    project_version = None\n\n    if SETTINGS_PATH.exists():\n        try:\n            with open(SETTINGS_PATH, \"r\", encoding=\"utf-8\") as f:\n                content = f.read()\n                settings_data = pyjson5.loads(content)\n                settings_version = settings_data.get(\"version\")\n        except Exception as e:\n            logger.error(f\"Error: Unable to read or parse {SETTINGS_PATH}: {e}\")\n            logger.error(\"Please ensure settings.jsonc file exists and is properly formatted.\")\n            sys.exit(1)\n    else:\n        logger.error(f\"Error: Config file {SETTINGS_PATH} not found.\")\n        logger.error(\"Please ensure settings.jsonc file is located in the project root directory.\")\n        sys.exit(1)\n\n    if PYPROJECT_PATH.exists():\n        try:\n            with open(PYPROJECT_PATH, \"rb\") as f:  # tomllib requires binary mode\n                pyproject_data = tomllib.load(f)\n                weclone_tool_data = pyproject_data.get(\"tool\", {}).get(\"weclone\", {})\n                config_guide_version = weclone_tool_data.get(\"config_version\")\n                config_changelog = weclone_tool_data.get(\"config_changelog\", \"N/A\")\n                project_version = pyproject_data.get(\"project\", {}).get(\"version\")\n        except Exception as e:\n            logger.warning(\n                f\"Warning: Unable to read or parse {PYPROJECT_PATH}: {e}. Cannot check if config file is up to date.\"\n            )\n    else:\n        logger.warning(\n            f\"Warning: File {PYPROJECT_PATH} not found. 
Cannot check if config file is up to date.\"\n        )\n\n    if not settings_version:\n        logger.error(f\"Error: 'version' field not found in {SETTINGS_PATH}.\")\n        logger.error(\"Please copy from settings.template.json or update your settings.jsonc file.\")\n        sys.exit(1)\n\n    if config_guide_version:\n        if settings_version != config_guide_version:\n            logger.warning(\n                f\"Warning: Your settings.jsonc file version ({settings_version}) does not match the project's recommended config version ({config_guide_version}).\"\n            )\n            logger.warning(\n                \"This may cause unexpected behavior or errors. Please copy from settings.template.json or update your settings.jsonc file.\"\n            )\n            # TODO Print update log based on version number\n            logger.warning(f\"Config file changelog:\\n{config_changelog}\")\n\n        logger.info(f\"📦 Project Version: {project_version}\")\n        logger.info(f\"⚙️  Config Version: {settings_version}\")\n    elif PYPROJECT_PATH.exists():  # If file exists but version not found\n        logger.warning(\n            f\"Warning: 'config_version' field not found under [tool.weclone] in {PYPROJECT_PATH}. \"\n            \"Cannot confirm if your settings.jsonc is the latest config version.\"\n        )\n\n\nif __name__ == \"__main__\":\n    cli()\n"
  },
  {
    "path": "weclone/core/PII/__init__.py",
    "content": "from .pii_detector import ChinesePIIDetector, PIIDetector, PIIResult\n\n__all__ = [\"PIIResult\", \"PIIDetector\", \"ChinesePIIDetector\"]\n"
  },
  {
    "path": "weclone/core/PII/pii_detector.py",
    "content": "from dataclasses import dataclass\nfrom typing import List, Optional, cast\n\nfrom presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, Pattern, PatternRecognizer\nfrom presidio_analyzer.nlp_engine import NlpEngineProvider\nfrom presidio_anonymizer import AnonymizerEngine\nfrom presidio_anonymizer.entities.engine.recognizer_result import (\n    RecognizerResult as AnonymizerRecognizerResult,  # type: ignore\n)\n\n# from presidio_analyzer.analyzer_engine import logger as presidio_logger\nfrom weclone.utils.log import logger\n\n\n@dataclass\nclass PIIResult:\n    entity_type: str\n    start: int\n    end: int\n    score: float\n    text: str\n\n\nclass PIIDetector:\n    \"\"\"PII detector based on presidio library\"\"\"\n\n    def __init__(self, language: str = \"en\", threshold: float = 0.5):\n        self.language = language\n        self.threshold = threshold\n\n        self._init_engines()\n        self.anonymizer = AnonymizerEngine()\n        self.not_filtered_entities = [\"DATE_TIME\", \"PERSON\", \"URL\", \"NRP\"]\n        self.supported_entities = self.get_all_entities()\n        self.filtered_entities = [\n            entity for entity in self.supported_entities if entity not in self.not_filtered_entities\n        ]\n        if self.language == \"en\":\n            logger.info(f\"Privacy filtered entity types: {self.filtered_entities}\")\n\n    def _init_engines(self):\n        model_mapping = {\n            \"zh\": \"zh_core_web_sm\",\n            \"en\": \"en_core_web_sm\",\n            \"es\": \"es_core_news_sm\",\n            \"fr\": \"fr_core_news_sm\",\n            \"de\": \"de_core_news_sm\",\n        }\n\n        model_name = model_mapping.get(self.language, \"en_core_web_sm\")\n\n        nlp_configuration = {\n            \"nlp_engine_name\": \"spacy\",\n            \"models\": [{\"lang_code\": self.language, \"model_name\": model_name}],\n        }\n\n        provider = 
NlpEngineProvider(nlp_configuration=nlp_configuration)\n        nlp_engine = provider.create_engine()\n\n        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)\n\n        self._add_custom_recognizers(language=self.language)\n\n        self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)\n\n        # self.anonymizer = AnonymizerEngine()\n\n        logger.info(\n            f\"Presidio engine initialized successfully, using language: {self.language}, model: {model_name}\"\n        )\n\n    def _add_custom_recognizers(self, language: str):\n        # Create numeric ID recognizer - matches 5+ digit numbers or numbers with - separators\n        numeric_id_patterns = [\n            Pattern(name=\"numeric_id\", regex=r\"\\b(?:[A-Za-z]*\\d{5,}[A-Za-z]*|\\d+-\\d+(?:-\\d+)*)\\b\", score=0.8),\n            Pattern(name=\"unicode_escape_id\", regex=r\"\\\\u[0-9a-fA-F]{4}\", score=0.8),\n            Pattern(name=\"hex_escape_id\", regex=r\"\\\\xa0\", score=0.8),\n        ]\n\n        numeric_id_recognizer = PatternRecognizer(\n            supported_entity=\"NUMERIC_ID\",\n            patterns=numeric_id_patterns,\n            supported_language=language,\n            name=\"numeric_id_recognizer\",\n            context=[\"id\", \"编号\", \"号码\", \"代码\", \"code\", \"number\", \"序号\", \"sequence\", \"identifier\"],\n        )\n\n        self.analyzer.registry.add_recognizer(numeric_id_recognizer)\n\n        logger.info(\"Custom numeric ID recognizer added\")\n\n    def has_pii(self, text: str, entities: Optional[List[str]] = None) -> bool:\n        pii_results = self.detect_pii(text)\n        return len(pii_results) > 0\n\n    def batch_has_pii(self, texts: List[str]) -> List[bool]:\n        \"\"\"\n        Check if multiple texts contain PII information using batch processing\n\n        Args:\n            texts: List of texts to be checked\n\n        Returns:\n            List of boolean values indicating whether each text contains PII\n        \"\"\"\n  
      if not texts or not isinstance(texts, list):\n            return []\n\n        batch_results = self.batch_detect_pii(texts)\n        return [len(results) > 0 for results in batch_results]\n\n    def detect_pii(self, text: str) -> List[PIIResult]:\n        \"\"\"\n        Detect PII information in text\n\n        Args:\n            text: Text to be detected\n            entities: Specified entity types to detect, defaults to all supported types\n\n        Returns:\n            List of detected PII information\n        \"\"\"\n        if not text or not isinstance(text, str):\n            return []\n\n        results = self.analyzer.analyze(\n            text=text,\n            language=self.language,\n            entities=self.filtered_entities,\n            score_threshold=self.threshold,\n        )\n\n        pii_results = []\n        for result in results:\n            pii_result = PIIResult(\n                entity_type=result.entity_type,\n                start=result.start,\n                end=result.end,\n                score=result.score,\n                text=text[result.start : result.end],\n            )\n            pii_results.append(pii_result)\n\n        if pii_results:\n            logger.debug(f\"Detected {len(pii_results)} PII entities\")\n\n        return pii_results\n\n    def batch_detect_pii(self, texts: List[str]) -> List[List[PIIResult]]:\n        \"\"\"\n        Detect PII information in multiple texts using batch processing\n\n        Args:\n            texts: List of texts to be detected\n\n        Returns:\n            List of lists containing detected PII information for each text\n        \"\"\"\n        if not texts or not isinstance(texts, list):\n            return []\n\n        # Filter out empty or non-string texts\n        valid_texts = []\n        text_indices = []\n        for i, text in enumerate(texts):\n            if text and isinstance(text, str):\n                valid_texts.append(text)\n                
text_indices.append(i)\n\n        if not valid_texts:\n            return [[] for _ in texts]\n\n        # Use batch analyzer for multiple texts\n        results_iterator = self.batch_analyzer.analyze_iterator(\n            texts=valid_texts,\n            language=self.language,\n            entities=self.filtered_entities,\n            score_threshold=self.threshold,\n            n_process=24,\n            batch_size=32,\n        )\n\n        # Process results\n        all_pii_results = [[] for _ in texts]\n\n        for batch_idx, results in enumerate(results_iterator):\n            original_idx = text_indices[batch_idx]\n            text = valid_texts[batch_idx]\n\n            pii_results = []\n            for result in results:\n                pii_result = PIIResult(\n                    entity_type=result.entity_type,\n                    start=result.start,\n                    end=result.end,\n                    score=result.score,\n                    text=text[result.start : result.end],\n                )\n                pii_results.append(pii_result)\n\n            all_pii_results[original_idx] = pii_results\n\n        total_entities = sum(len(results) for results in all_pii_results)\n        if total_entities > 0:\n            logger.debug(f\"Batch detected {total_entities} PII entities across {len(valid_texts)} texts\")\n\n        return all_pii_results\n\n    def anonymize_text(self, text: str, entities: Optional[List[str]] = None) -> str:\n        \"\"\"\n        Anonymize PII information in text\n\n        Args:\n            text: Text to be anonymized\n            entities: Specified entity types to anonymize, defaults to all detected types\n\n        Returns:\n            Anonymized text\n        \"\"\"\n        if not text or not isinstance(text, str):\n            return text\n\n        try:\n            analyzer_results = self.analyzer.analyze(\n                text=text, language=self.language, entities=entities, 
score_threshold=self.threshold\n            )\n\n            anonymized_result = self.anonymizer.anonymize(\n                text=text, analyzer_results=cast(List[AnonymizerRecognizerResult], analyzer_results)\n            )\n\n            logger.info(f\"Successfully anonymized {len(analyzer_results)} PII entities\")\n            return anonymized_result.text\n\n        except Exception as e:\n            logger.error(f\"Text anonymization failed: {e}\")\n            return text\n\n    def get_supported_entities(self) -> List[str]:\n        return self.analyzer.get_supported_entities(language=self.language)\n\n    def get_all_entities(self) -> List[str]:\n        \"\"\"Get all entities including custom ones from the registry\"\"\"\n        predefined_entities = self.get_supported_entities()\n        custom_entities = []\n\n        # Get custom entities from registry\n        for recognizer in self.analyzer.registry.recognizers:\n            for entity in recognizer.supported_entities:\n                if entity not in predefined_entities and entity not in custom_entities:\n                    custom_entities.append(entity)\n\n        return predefined_entities + custom_entities\n\n\nclass ChinesePIIDetector(PIIDetector):\n    \"\"\"Chinese PII detector, extended to recognize Chinese-specific PII\"\"\"\n\n    def __init__(self, threshold: float = 0.5):\n        super().__init__(language=\"zh\", threshold=threshold)\n\n        # Filter out country-specific entities that are not relevant for Chinese context\n        country_prefixes = [\"US_\", \"UK_\", \"SG_\", \"AU_\", \"IN_\"]\n        # Get entities that are actually supported by the analyzer\n        all_entities = self.get_all_entities()\n        supported_entities = self.get_supported_entities()\n\n        self.filtered_entities = [\n            entity\n            for entity in all_entities\n            if entity not in self.not_filtered_entities\n            and not any(entity.startswith(prefix) for prefix in 
country_prefixes)\n            and (entity in supported_entities or entity in [\"NUMERIC_ID\", \"CHINESE_PII\"])\n        ]\n        logger.info(f\"Chinese PII filtered entity types: {self.filtered_entities}\")\n\n    def _add_custom_recognizers(self, language: str):\n        # Add parent class recognizers first\n        super()._add_custom_recognizers(language=\"zh\")\n\n        # Add Chinese-specific recognizers that are not covered by NUMERIC_ID\n        chinese_patterns = [\n            Pattern(name=\"chinese_id_with_x\", regex=r\"\\b\\d{17}[Xx]\\b\", score=0.9),\n            Pattern(\n                name=\"chinese_email\", regex=r\"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b\", score=0.9\n            ),\n            Pattern(\n                name=\"chinese_email_with_plus\",\n                regex=r\"\\b[A-Za-z0-9._%+-]+\\+[A-Za-z0-9._%+-]*@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b\",\n                score=0.95,\n            ),\n        ]\n\n        chinese_recognizer = PatternRecognizer(\n            supported_entity=\"CHINESE_PII\",\n            supported_language=\"zh\",\n            patterns=chinese_patterns,\n            name=\"chinese_pii_recognizer\",\n            context=[\"中文PII\"],\n        )\n        self.analyzer.registry.add_recognizer(chinese_recognizer)\n\n        logger.info(\"Chinese PII recognizer added\")\n"
  },
  {
    "path": "weclone/core/inference/offline_infer.py",
    "content": "import re\nfrom typing import List, Optional, cast\n\nimport torch\nfrom llamafactory.data import get_template_and_fix_tokenizer\nfrom llamafactory.extras.misc import get_device_count\nfrom llamafactory.hparams import get_infer_args\nfrom llamafactory.model import load_tokenizer\nfrom openai.types.chat import ChatCompletion\nfrom pydantic import BaseModel\nfrom vllm import LLM, SamplingParams\nfrom vllm.lora.request import LoRARequest\nfrom vllm.outputs import RequestOutput\nfrom vllm.sampling_params import GuidedDecodingParams\n\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import VllmArgs\nfrom weclone.utils.log import logger\n\n# from vllm.entrypoints.openai.tool_parsers import xLAMToolParser\n\n# NOTE: the V1 LLM engine writing style was used.\n\n\ndef extract_json_from_text(text: str) -> str:\n    \"\"\"Extract JSON content from text, supporting JSON blocks in markdown format.\"\"\"\n    json_pattern = r\"```json\\s*(.*?)\\s*```\"\n    match = re.search(json_pattern, text, re.DOTALL)\n    if match:\n        return match.group(1).strip()\n    return text.strip()\n\n\ndef parse_guided_decoding_results(\n    results: List[RequestOutput] | List[ChatCompletion] | List, guided_decoding_class: type[BaseModel]\n) -> tuple[List[Optional[BaseModel]], List[int]]:\n    \"\"\"Parse guided decoding results and return parsed results with failed indices.\n\n    Args:\n        results: Raw vLLM generation results\n        guided_decoding_class: Pydantic model class for validation\n\n    Returns:\n        tuple: (parsed_results, failed_indices) where failed_indices contains\n               indices of failed JSON parsing\n    \"\"\"\n    parsed_results = []\n    failed_indexs = []\n\n    for idx, result in enumerate(results):\n        try:\n            if isinstance(result, RequestOutput):\n                json_text = extract_json_from_text(result.outputs[0].text)\n            elif isinstance(result, ChatCompletion):\n            
    json_text = extract_json_from_text(result.choices[0].message.content)\n            else:\n                raise ValueError(f\"Unsupported result type: {type(result)}\")\n            parsed_result = guided_decoding_class.model_validate_json(json_text)\n            parsed_results.append(parsed_result)\n        except Exception as e:\n            if isinstance(result, RequestOutput):\n                log_text = result.outputs[0].text[:100] + \"...\"\n            elif isinstance(result, ChatCompletion):\n                log_text = result.choices[0].message.content[:100] + \"...\"\n            else:\n                log_text = str(result)[:100] + \"...\"\n            logger.warning(\n                f\"Failed to parse JSON from result at sequence index {idx}: {log_text}, error: {e}\"\n            )\n            failed_indexs.append(idx)\n            parsed_results.append(None)\n\n    return parsed_results, failed_indexs\n\n\ndef vllm_infer(\n    inputs: List[str],\n    model_name_or_path: str,\n    adapter_name_or_path: Optional[str] = None,\n    dataset: str = \"alpaca_en_demo\",\n    dataset_dir: str = \"data\",\n    template: str = \"default\",\n    cutoff_len: int = 2048,\n    max_samples: Optional[int] = None,\n    vllm_config: str = \"{}\",\n    save_name: str = \"generated_predictions.jsonl\",\n    default_system: Optional[str] = None,\n    enable_thinking: bool = False,\n    temperature: float = 0.95,\n    top_p: float = 0.7,\n    top_k: int = 50,\n    guided_decoding_class: Optional[type[BaseModel]] = None,\n    bad_words: Optional[List[str]] = None,\n    logprobs: Optional[int] = None,\n    max_new_tokens: int = 1024,\n    repetition_penalty: float = 1.0,\n    skip_special_tokens: bool = True,\n    seed: Optional[int] = None,\n    pipeline_parallel_size: int = 1,\n    image_max_pixels: int = 768 * 768,\n    image_min_pixels: int = 32 * 32,\n) -> tuple[List[RequestOutput] | List[Optional[BaseModel]], List[int]]:\n    r\"\"\"Perform batch generation using 
vLLM engine, which supports tensor parallelism.\n\n    Returns:\n        tuple: (results, failed_indices) where failed_indices contains indices of failed JSON parsing\n    \"\"\"\n    if pipeline_parallel_size > get_device_count():\n        raise ValueError(\"Pipeline parallel size should be smaller than the number of gpus.\")\n\n    wc_vllm_args = cast(VllmArgs, load_config(\"vllm\"))\n    model_args, data_args, _, generating_args = get_infer_args(\n        {\n            \"model_name_or_path\": model_name_or_path,\n            \"adapter_name_or_path\": adapter_name_or_path,\n            \"dataset\": dataset,\n            \"dataset_dir\": dataset_dir,\n            \"template\": template,\n            \"cutoff_len\": cutoff_len,\n            \"max_samples\": max_samples,\n            \"preprocessing_num_workers\": 16,\n            \"vllm_config\": vllm_config,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n            \"top_k\": top_k,\n            \"max_new_tokens\": max_new_tokens,\n            \"repetition_penalty\": repetition_penalty,\n            \"enable_thinking\": enable_thinking,\n        }\n    )\n\n    tokenizer_module = load_tokenizer(model_args)\n    tokenizer = tokenizer_module[\"tokenizer\"]\n    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)\n    template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate\n\n    if guided_decoding_class:\n        json_schema = guided_decoding_class.model_json_schema()\n        guided_decoding_params = GuidedDecodingParams(json=json_schema, disable_any_whitespace=True)\n\n    sampling_params = SamplingParams(\n        repetition_penalty=generating_args.repetition_penalty or 1.0,\n        temperature=generating_args.temperature,\n        top_p=generating_args.top_p or 1.0,\n        top_k=generating_args.top_k or -1,\n        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),\n        max_tokens=generating_args.max_new_tokens,\n        
skip_special_tokens=skip_special_tokens,\n        seed=seed,\n        bad_words=bad_words,\n        guided_decoding=guided_decoding_params if guided_decoding_class else None,\n    )\n    if model_args.adapter_name_or_path is not None:\n        lora_request = LoRARequest(\"default\", 1, model_args.adapter_name_or_path[0])\n    else:\n        lora_request = None\n\n    engine_args = {\n        \"model\": model_args.model_name_or_path,\n        \"trust_remote_code\": True,\n        \"dtype\": model_args.infer_dtype,\n        \"max_model_len\": cutoff_len + max_new_tokens,\n        \"disable_log_stats\": True,\n        \"enable_lora\": model_args.adapter_name_or_path is not None,\n        \"enable_prefix_caching\": True,\n        \"guided_decoding_backend\": \"guidance\",\n        \"guided_decoding_disable_any_whitespace\": True,\n    }\n\n    if template_obj.mm_plugin.__class__.__name__ != \"BasePlugin\":\n        engine_args[\"limit_mm_per_prompt\"] = {\"image\": 4, \"video\": 2, \"audio\": 2}\n\n    wc_vllm_dict = {k: v for k, v in wc_vllm_args.model_dump().items() if v is not None}\n    engine_args.update(wc_vllm_dict)\n\n    if isinstance(model_args.vllm_config, dict):\n        engine_args.update(model_args.vllm_config)\n\n    messages_list = [[{\"role\": \"user\", \"content\": text}] for text in inputs]\n\n    llm = LLM(**engine_args)\n\n    results = llm.chat(\n        messages_list,\n        sampling_params,\n        lora_request=lora_request,\n        chat_template_kwargs={\"enable_thinking\": enable_thinking},\n    )  # type: ignore\n\n    del llm\n    torch.cuda.empty_cache()\n\n    if guided_decoding_class:\n        # TODO better json decode  https://github.com/vllm-project/vllm/commit/1d0ae26c8544fd5a62e171e30c2dcc2973a23bc8#diff-3b27790a2ce97bc50cdd5476f7b0057da682ed0d1ec8426a7b76c5e21454e57d\n        parsed_results, failed_indexs = parse_guided_decoding_results(results, guided_decoding_class)\n        return parsed_results, failed_indexs\n    else:\n     
   return results, []\n"
  },
  {
    "path": "weclone/core/inference/online_infer.py",
    "content": "import logging\nfrom concurrent.futures import Future, ThreadPoolExecutor\nfrom typing import Any, Callable, List, Optional, Union\n\nfrom openai import OpenAI\nfrom openai.types.chat import ChatCompletion, ChatCompletionMessageParam\nfrom pydantic import BaseModel\n\nfrom weclone.core.inference.offline_infer import extract_json_from_text\nfrom weclone.utils.log import logger\nfrom weclone.utils.retry import retry_openai_api\n\nlogging.getLogger(\"openai._base_client\").setLevel(logging.WARNING)\nlogging.getLogger(\"httpx\").setLevel(logging.WARNING)\n\n\nclass OnlineLLM:\n    def __init__(\n        self,\n        api_key: str,\n        base_url: str,\n        model_name: str,\n        default_system: Optional[str] = None,\n        max_workers: int = 10,\n        prompt_with_system: bool = False,\n        response_format: str = \"json_object\",\n    ):\n        self.api_key = api_key\n        self.base_url = base_url\n        self.model_name = model_name\n        self.default_system = default_system\n        self.max_workers = max_workers\n        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)\n        self.executor = ThreadPoolExecutor(max_workers=max_workers)\n        self.prompt_with_system = prompt_with_system\n        self.response_format = response_format\n\n    @retry_openai_api(max_retries=200, base_delay=30.0, max_delay=180.0)\n    def chat(\n        self,\n        prompt_text,\n        temperature: float = 0.7,\n        max_tokens: int = 1024,\n        top_p: float = 0.95,\n        stream: bool = False,\n    ):\n        messages: List[ChatCompletionMessageParam] = []\n        if self.prompt_with_system:\n            messages = prompt_text\n        else:\n            messages = [\n                # {\"role\": \"system\", \"content\": self.default_system},\n                {\"role\": \"user\", \"content\": prompt_text},\n            ]\n\n        params = {\n            \"model\": self.model_name,\n          
  \"messages\": messages,\n            \"stream\": stream,\n            \"temperature\": temperature,\n            \"max_tokens\": max_tokens,\n            \"top_p\": top_p,\n            # extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}}\n        }\n\n        if self.response_format:\n            params[\"response_format\"] = {\"type\": self.response_format}\n\n        response = self.client.chat.completions.create(**params)\n\n        return response\n\n    def chat_async(\n        self,\n        prompt_text: str,\n        temperature: float = 0.7,\n        max_tokens: int = 1024,\n        top_p: float = 0.95,\n        stream: bool = False,\n    ) -> Future:\n        \"\"\"Submit a chat request to the thread pool for async processing\"\"\"\n        return self.executor.submit(self.chat, prompt_text, temperature, max_tokens, top_p, stream)\n\n    def chat_batch(\n        self,\n        prompts: List[str],\n        temperature: float = 0.7,\n        max_tokens: int = 1024,\n        top_p: float = 0.95,\n        stream: bool = False,\n        callback: Optional[Callable[[int, Any], None]] = None,\n        guided_decoding_class: Optional[type[BaseModel]] = None,\n    ) -> Union[List[Union[ChatCompletion, Exception]], tuple[List[Optional[BaseModel]], List[int]]]:\n        \"\"\"Process multiple chat requests concurrently using thread pool\n\n        Args:\n            prompts: List of prompt strings\n            temperature: Sampling temperature\n            max_tokens: Maximum tokens to generate\n            top_p: Top-p sampling parameter\n            stream: Whether to stream the response\n            callback: Optional callback function called for each result\n            guided_decoding_class: Pydantic model class for JSON validation\n\n        Returns:\n            If enable_json_decode is False: List of ChatCompletion or Exception objects\n            If enable_json_decode is True: Tuple of (parsed_results, failed_indices)\n        \"\"\"\n    
    futures = []\n\n        for i, prompt in enumerate(prompts):\n            future = self.chat_async(prompt, temperature, max_tokens, top_p, stream)\n            futures.append((i, future))\n\n        results: List[Union[Any, Exception]] = [None] * len(prompts)\n\n        for i, future in futures:\n            try:\n                result = future.result()\n                results[i] = result\n                if callback:\n                    callback(i, result)\n            except Exception as e:\n                results[i] = e\n                if callback:\n                    callback(i, e)\n\n        if guided_decoding_class:\n            parsed_results: List[Optional[BaseModel]] = [None] * len(prompts)\n            failed_indexs: List[int] = []\n\n            for i, result in enumerate(results):\n                if isinstance(result, Exception):\n                    failed_indexs.append(i)\n                    logger.warning(f\"Request at index {i} failed with exception: {result}\")\n                elif isinstance(result, ChatCompletion):\n                    try:\n                        content = result.choices[0].message.content\n                        if content is None:\n                            raise ValueError(\"Message content is None\")\n                        json_text = extract_json_from_text(content)\n                        parsed_result = guided_decoding_class.model_validate_json(json_text)\n                        parsed_results[i] = parsed_result\n                    except Exception as e:\n                        content = result.choices[0].message.content\n                        log_text = (content[:100] + \"...\") if content else \"None\"\n                        logger.warning(\n                            f\"Failed to parse JSON from result at index {i}: {log_text}, error: {e}\"\n                        )\n                        failed_indexs.append(i)\n                else:\n                    logger.warning(f\"Unexpected 
result type at index {i}: {type(result)}\")\n                    failed_indexs.append(i)\n\n            return parsed_results, failed_indexs\n\n        return results\n\n    def close(self):\n        \"\"\"Clean up thread pool resources\"\"\"\n        if hasattr(self, \"executor\"):\n            self.executor.shutdown(wait=True)\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n"
  },
  {
    "path": "weclone/data/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/data/chat_parsers/telegram_parser.py",
    "content": "import csv\nimport json\nimport os\nimport shutil\nimport sys\nfrom datetime import datetime\nfrom typing import Dict, List\n\nfrom pandas import Timestamp\n\nfrom weclone.data.models import ChatMessage\nfrom weclone.utils.config_models import DataModality, WCMakeDatasetConfig\nfrom weclone.utils.log import logger\n\n\nclass TelegramChatParser:\n    \"\"\"Telegram chat parser that converts JSON format to data conforming to ChatMessage structure\"\"\"\n\n    def __init__(self, config: WCMakeDatasetConfig):\n        self.config = config\n        self.my_user_id = config.telegram_args.my_id if config.telegram_args else None\n        self.message_counter = 0\n\n        self.type_mapping = {\n            \"text\": \"text\",\n            \"photo\": \"image\",\n            \"video_file\": \"video\",\n            \"animation\": \"video\",\n            \"voice_message\": \"voice\",\n            \"audio_file\": \"file\",\n            \"sticker\": \"sticker\",\n            \"file\": \"file\",\n            \"location\": \"location\",\n            \"poll\": \"(share) card link\",\n            \"contact_information\": \"(share) card link\",\n        }\n\n    def get_message_type_and_content(self, message: Dict) -> tuple[str, str, str, bool]:\n        \"\"\"\n        Determine type_name, msg content, src and whether it's a forwarded message based on Telegram message content\n\n        Returns\n        -------\n        tuple[str, str, str, bool]\n            (type_name, msg_content, src_path, is_forward)\n        \"\"\"\n        msg_content = \"\"\n        src_path = \"\"\n        msg_type = \"text\"\n        is_forward = \"forwarded_from\" in message\n\n        if \"text\" in message:\n            msg_content = self.extract_text_content(message[\"text\"])\n\n        if \"media_type\" in message:\n            media_type = message[\"media_type\"]\n            msg_type = media_type\n\n            if media_type == \"photo\":\n                src_path = 
message.get(\"photo\", \"\")\n            elif media_type in [\"video_file\", \"animation\"]:\n                src_path = message.get(\"file\", \"\")\n            elif media_type == \"voice_message\":\n                src_path = message.get(\"file\", \"\")\n            elif media_type == \"audio_file\":\n                src_path = message.get(\"file\", \"\")\n            elif media_type == \"sticker\":\n                src_path = message.get(\"file\", \"\")\n                # Only set sticker emoji as msg_content if STICKER is in include_type\n                if DataModality.STICKER in self.config.include_type and not msg_content.strip():\n                    msg_content = message.get(\"sticker_emoji\", \"\")\n            else:\n                src_path = message.get(\"file\", \"\")\n\n        elif \"photo\" in message:\n            msg_type = \"photo\"\n            src_path = message[\"photo\"]\n\n        elif \"file\" in message:\n            msg_type = \"file\"\n            src_path = message[\"file\"]\n            if not msg_content.strip():\n                msg_content = message.get(\"file_name\", \"\")\n\n        elif \"location_information\" in message:\n            msg_type = \"location\"\n            loc = message[\"location_information\"]\n            src_path = f\"lat:{loc.get('latitude', 0)},lng:{loc.get('longitude', 0)}\"\n            if not msg_content.strip():\n                msg_content = message.get(\"place_name\", \"\") + message.get(\"address\", \"\")\n\n        type_name = self.type_mapping[msg_type]\n\n        return type_name, msg_content.strip(), src_path, is_forward\n\n    def extract_text_content(self, text_field) -> str:\n        content = \"\"\n        if isinstance(text_field, str):\n            content = text_field\n        elif isinstance(text_field, list):\n            for item in text_field:\n                if isinstance(item, str):\n                    content += item\n                elif isinstance(item, dict) and \"text\" in 
item:\n                    content += item[\"text\"]\n\n        return content.replace('\\\\\"', \"\")\n\n    def determine_sender_type(self, from_id: str) -> int:\n        return 1 if from_id == self.my_user_id else 0\n\n    def process_message(self, message: Dict) -> List[ChatMessage]:\n        \"\"\"\n        Process a single message, may return multiple messages (original message + extracted text message)\n        \"\"\"\n        if message.get(\"type\") != \"message\":\n            return []\n\n        msg_id = message.get(\"id\", 0)\n        sender_name = message.get(\"from\", \"\")\n        from_id = message.get(\"from_id\", \"\")\n        date = message.get(\"date\", \"\")\n\n        type_name, msg_content, src_path, is_forward = self.get_message_type_and_content(message)\n\n        try:\n            dt = datetime.fromisoformat(date.replace(\"T\", \" \").replace(\"Z\", \"\"))\n            create_time = Timestamp(dt)\n        except Exception as e:\n            logger.warning(f\"Time format conversion failed: {date}, error: {e}\")\n\n        is_sender = self.determine_sender_type(from_id)\n        self.message_counter += 1\n\n        result_messages = []\n        # Save messages with content or media files\n        if msg_content.strip() or src_path.strip():\n            original_msg = ChatMessage(\n                id=self.message_counter,  # Use global counter as sequential ID\n                MsgSvrID=msg_id,  # Telegram message ID\n                type_name=type_name,\n                is_sender=is_sender,  # 0: other party 1: myself\n                talker=sender_name,\n                msg=msg_content.replace(\"\\n\", \" \").strip() if msg_content.strip() else f\"{type_name}\",\n                src=src_path,\n                CreateTime=create_time,\n                is_forward=is_forward,\n            )\n            result_messages.append(original_msg)\n\n        # If it's a non-pure text message but contains text field, create additional text message\n    
    if type_name not in [\"text\"] and \"text\" in message:\n            text_content = self.extract_text_content(message[\"text\"])\n            if text_content.strip():\n                self.message_counter += 1\n                text_msg = ChatMessage(\n                    id=self.message_counter,\n                    MsgSvrID=msg_id,\n                    type_name=\"text\",\n                    is_sender=is_sender,\n                    talker=sender_name,\n                    msg=text_content.replace(\"\\n\", \" \").strip(),\n                    src=\"\",\n                    CreateTime=create_time,\n                    is_forward=is_forward,\n                )\n                result_messages.append(text_msg)\n\n        return result_messages\n\n    def process_chat(self, jdata: Dict) -> List[ChatMessage]:\n        \"\"\"\n        Process chat data\n\n        Parameters\n        ----------\n        jdata : Dict\n            Telegram chat JSON object\n\n        Returns\n        -------\n        List[ChatMessage]\n            List of ChatMessage objects\n        \"\"\"\n        chat_name = jdata.get(\"name\", \"Unknown Chat\")\n        messages = jdata.get(\"messages\", [])\n\n        chat_messages = []\n        for message in messages:\n            chat_msgs = self.process_message(message)\n            chat_messages.extend(chat_msgs)\n\n        for msg in chat_messages:\n            msg.room_name = chat_name\n\n        logger.info(f\"Chat '{chat_name}' parsing completed, {len(chat_messages)} messages in total\")\n        return chat_messages\n\n    def to_csv(self, chat_messages: List[ChatMessage], output_file: str):\n        \"\"\"\n        Save ChatMessage list to CSV file\n\n        Parameters\n        ----------\n        chat_messages : List[ChatMessage]\n            List of ChatMessage objects\n        output_file : str\n            Output CSV file path\n        \"\"\"\n        if not chat_messages:\n            logger.warning(\"No messages to save\")\n     
       return\n\n        fieldnames = [\n            \"id\",\n            \"MsgSvrID\",\n            \"type_name\",\n            \"is_sender\",\n            \"talker\",\n            \"room_name\",\n            \"msg\",\n            \"src\",\n            \"CreateTime\",\n            \"is_forward\",\n        ]\n\n        os.makedirs(os.path.dirname(output_file), exist_ok=True)\n\n        with open(output_file, \"w\", encoding=\"utf-8\", newline=\"\") as csvfile:\n            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n            writer.writeheader()\n\n            for msg in chat_messages:\n                writer.writerow(\n                    {\n                        \"id\": msg.id,\n                        \"MsgSvrID\": msg.MsgSvrID,\n                        \"type_name\": msg.type_name,\n                        \"is_sender\": msg.is_sender,\n                        \"talker\": msg.talker,\n                        \"room_name\": msg.room_name,\n                        \"msg\": msg.msg,\n                        \"src\": msg.src,\n                        \"CreateTime\": msg.CreateTime,\n                        \"is_forward\": msg.is_forward,\n                    }\n                )\n\n        logger.info(f\"CSV file saved: {output_file}\")\n\n    def copy_received_images(\n        self, chat_messages: List[ChatMessage], base_path: str = \"\", target_dir: str = \"dataset/media/images\"\n    ):\n        \"\"\"\n        Copy all images with is_sender=0 to specified directory\n        \"\"\"\n        os.makedirs(target_dir, exist_ok=True)\n\n        copied_count = 0\n        skipped_count = 0\n\n        for msg in chat_messages:\n            if msg.is_sender == 0 and msg.type_name == \"image\" and msg.src:\n                if base_path:\n                    full_src_path = os.path.join(base_path, msg.src)\n                else:\n                    full_src_path = msg.src\n\n                normalized_src = full_src_path.replace(\"\\\\\", \"/\")\n        
        if not os.path.exists(normalized_src):\n                    logger.warning(f\"Source file does not exist: {normalized_src}\")\n                    skipped_count += 1\n                    continue\n\n                filename = os.path.basename(normalized_src)\n\n                target_path = os.path.join(target_dir, filename)\n\n                shutil.copy2(normalized_src, target_path)\n                copied_count += 1\n\n        logger.info(f\"Image copying completed: successful {copied_count}, skipped {skipped_count}\")\n\n\ndef process_telegram_dataset(config: WCMakeDatasetConfig) -> None:\n    \"\"\"\n    Process Telegram dataset, traverse all folders under dataset/telegram\n    Create corresponding folders for each telegram folder under dataset/csv\n\n    Parameters\n    ----------\n    config : WCMakeDatasetConfig\n        Dataset configuration, contains telegram_args.my_id for determining sender\n    \"\"\"\n    telegram_dir = \"dataset/telegram\"\n    csv_output_dir = \"dataset/csv\"\n\n    if not os.path.exists(telegram_dir):\n        logger.error(f\"Telegram data directory does not exist: {telegram_dir}\")\n        return\n\n    if not config.telegram_args or not config.telegram_args.my_id:\n        logger.error(\"Telegram configuration missing, cannot process Telegram dataset\")\n        sys.exit(1)\n\n    if os.path.exists(csv_output_dir):\n        for item in os.listdir(csv_output_dir):\n            item_path = os.path.join(csv_output_dir, item)\n            if os.path.isdir(item_path):\n                shutil.rmtree(item_path)\n            else:\n                os.remove(item_path)\n\n    for folder_name in os.listdir(telegram_dir):\n        folder_path = os.path.join(telegram_dir, folder_name)\n        if not os.path.isdir(folder_path):\n            continue\n\n        json_path = os.path.join(folder_path, \"result.json\")\n\n        with open(json_path, \"r\", encoding=\"utf-8\") as file:\n            jdata = json.load(file)\n\n        
chat_name = jdata.get(\"name\", \"unknown\")\n        chat_type = jdata.get(\"type\", \"unknown\")\n        chat_id = jdata.get(\"id\", \"unknown\")\n\n        safe_name = \"\".join(c for c in str(chat_name) if c.isalnum() or c in \"._-\")\n        safe_type = \"\".join(c for c in str(chat_type) if c.isalnum() or c in \"._-\")\n        safe_id = \"\".join(c for c in str(chat_id) if c.isalnum() or c in \"._-\")\n\n        csv_folder_name = f\"{safe_name}-{safe_type}-{safe_id}\"\n        csv_folder_path = os.path.join(csv_output_dir, csv_folder_name)\n\n        parser = TelegramChatParser(config=config)\n        messages = parser.process_chat(jdata)\n\n        if messages:\n            csv_file_path = os.path.join(csv_folder_path, f\"{csv_folder_name}.csv\")\n            parser.to_csv(messages, csv_file_path)\n            parser.copy_received_images(messages, folder_path)\n        else:\n            logger.warning(f\"Folder '{folder_name}' has no valid messages\")\n"
  },
  {
    "path": "weclone/data/clean/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/data/clean/strategies.py",
    "content": "import json\nimport os\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom typing import List, cast\n\nimport pandas as pd\nfrom langchain_core.prompts import PromptTemplate\nfrom tqdm import tqdm\n\nfrom weclone.core.inference.online_infer import OnlineLLM\nfrom weclone.data.models import QaPair, QaPairScore, QaPairScoreWithId\nfrom weclone.prompts.clean_data import CLEAN_PROMPT\nfrom weclone.utils.config_models import WCMakeDatasetConfig\nfrom weclone.utils.log import logger\n\n\n@dataclass\nclass CleaningStrategy(ABC):\n    \"\"\"Abstract base class for data cleaning strategies, but provides common cleaning methods\"\"\"\n\n    make_dataset_config: WCMakeDatasetConfig\n\n    @abstractmethod\n    def judge(self, data: List[QaPair]) -> None:\n        \"\"\"\n        Scoring method, needs to be implemented by subclasses.\n        \"\"\"\n        pass\n\n    def clean(self) -> str:\n        \"\"\"\n        Filter SFT data based on score and return the final dataset name to use.\n        \"\"\"\n        config = self.make_dataset_config\n        original_dataset_name = config.dataset\n        cleaned_dataset_name = original_dataset_name + \"-cleaned\"\n\n        dataset_dir = config.dataset_dir\n        dataset_info_path = os.path.join(dataset_dir, \"dataset_info.json\")\n\n        with open(dataset_info_path, \"r\", encoding=\"utf-8\") as f:\n            info = json.load(f)\n        paths = {\n            name: os.path.join(dataset_dir, info.get(name, {}).get(\"file_name\"))\n            for name in [original_dataset_name, cleaned_dataset_name]\n        }\n        original_data_path, cleaned_data_path = paths.values()\n\n        try:\n            with open(original_data_path, \"r\", encoding=\"utf-8\") as f:\n                data = json.load(f)\n            accept_score = config.clean_dataset.llm.accept_score\n            filtered_data = [item for item in data if item.get(\"score\", 0) >= accept_score]\n\n            if not 
filtered_data:\n                logger.warning(\"No data retained after cleaning, will use original dataset.\")\n                return original_dataset_name\n\n            with open(cleaned_data_path, \"w\", encoding=\"utf-8\") as f:\n                json.dump(filtered_data, f, ensure_ascii=False, indent=2)\n            logger.success(\n                f\"Filtered data below {accept_score} score, retained {len(filtered_data)} items, saved to {cleaned_data_path}\"\n            )\n            return cleaned_dataset_name\n\n        except Exception as e:\n            logger.error(f\"Error occurred during data cleaning, will use original dataset: {e}\")\n            return original_dataset_name\n\n\n@dataclass\nclass LLMCleaningStrategy(CleaningStrategy):\n    \"\"\"Strategy for data cleaning using large language models\"\"\"\n\n    make_dataset_config: WCMakeDatasetConfig\n\n    def judge(self, data: List[QaPair]) -> None:\n        \"\"\"\n        Call LLM for scoring and directly assign scores to the input QaPair.\n        \"\"\"\n        from weclone.core.inference.offline_infer import vllm_infer\n\n        logger.info(\"Starting LLM scoring of data\")\n        inputs = []\n        prompt_template = PromptTemplate.from_template(CLEAN_PROMPT)\n        for qa in data:\n            if qa.images:\n                qa.score = 6\n            else:\n                messages_str = \"\"\n                for msg in qa.messages:\n                    if msg.role == \"user\":\n                        messages_str += f\"Q: {msg.content}\\n\"\n                    elif msg.role == \"assistant\":\n                        messages_str += f\"A: {msg.content}\\n\"\n                prompt_value = prompt_template.invoke({\"id\": qa.id, \"messages\": messages_str.strip()})\n                inputs.append(prompt_value.to_string())\n\n        parsed_scores, failed_indexs = vllm_infer(\n            inputs,\n            self.make_dataset_config.model_name_or_path,\n            
template=self.make_dataset_config.template,\n            temperature=0,\n            guided_decoding_class=QaPairScore,\n            repetition_penalty=1.1,\n            enable_thinking=self.make_dataset_config.clean_dataset.llm.enable_thinking,\n            cutoff_len=self.make_dataset_config.messages_max_length + 1024,  # add prompt length\n            max_new_tokens=1024 if self.make_dataset_config.clean_dataset.llm.enable_thinking else 200,\n        )\n\n        # We align scores by iterating only non-image examples and popping from the head of parsed_scores.\n        # Build an iterator over parsed results for simplicity and safety.\n        parsed_iter = iter(cast(List[QaPairScore | None], parsed_scores))\n        non_image_count = 0\n        failed_count = 0\n\n        for qa in data:\n            if qa.images:\n                continue\n            non_image_count += 1\n            parsed_item = next(parsed_iter, None)\n            if parsed_item is None:\n                failed_count += 1\n                qa.score = 0\n            else:\n                qa.score = parsed_item.score\n\n        # Sanity check: number of Nones should equal failed_indexs; and total length matches non-image count\n        assert failed_count == len(failed_indexs), (\n            f\"Mismatch: failed_count({failed_count}) != failed_indexs({len(failed_indexs)})\"\n        )\n        assert len(cast(List[QaPairScore | None], parsed_scores)) == non_image_count, (\n            f\"Mismatch: len(parsed_scores)({len(cast(List[QaPairScore | None], parsed_scores))}) != non_image_count({non_image_count})\"\n        )\n\n        scores = [qa.score for qa in data if qa.score is not None]\n        score_series = pd.Series(scores)\n        score_counts = score_series.value_counts().sort_index()\n        score_percentages = score_series.value_counts(normalize=True).sort_index() * 100\n        pd.set_option(\"display.unicode.east_asian_width\", True)  # Try to fix alignment issues\n        
distribution_df = pd.DataFrame(  # Merge count and percentage into one DataFrame for printing\n            {\n                \"Count\": score_counts,\n                \"Percentage(%)\": score_percentages.round(2),\n            }\n        )\n        distribution_df.index.name = \"Score\"  # Add column name for the first column: Score\n        printable_df_str = distribution_df.reset_index().to_string(index=False)\n        logger.success(f\"LLM scoring distribution:\\n{printable_df_str}\")\n\n\n@dataclass\nclass OlineLLMCleaningStrategy(CleaningStrategy):\n    \"\"\"Strategy for data cleaning using large language models\"\"\"\n\n    # TODO: images clean support\n    def judge(self, data: List[QaPair]) -> None:\n        config = self.make_dataset_config\n        logger.info(\"Starting online model scoring of data\")\n        logger.info(f\"Using model {config.model_name}\")\n\n        client = OnlineLLM(\n            api_key=config.llm_api_key,\n            base_url=config.base_url,\n            model_name=config.model_name,\n            max_workers=config.clean_batch_size + 5,\n        )\n\n        inputs = []\n        prompt_template = PromptTemplate.from_template(CLEAN_PROMPT)\n        for qa in data:\n            if qa.images:\n                qa.score = 6\n            else:\n                messages_str = \"\"\n                for msg in qa.messages:\n                    if msg.role == \"user\":\n                        messages_str += f\"Q: {msg.content}\\n\"\n                    elif msg.role == \"assistant\":\n                        messages_str += f\"A: {msg.content}\\n\"\n                prompt_value = prompt_template.invoke({\"id\": qa.id, \"messages\": messages_str.strip()})\n                inputs.append(prompt_value.to_string())\n\n        clean_batch_size = config.clean_batch_size\n        all_parsed_scores = []\n\n        for i in tqdm(range(0, len(inputs), clean_batch_size), desc=\"Online model scoring progress\"):\n            batch = inputs[i : i 
+ clean_batch_size]\n\n            try:\n                parsed_results, failed_indexs = client.chat_batch(\n                    batch, temperature=0, guided_decoding_class=QaPairScoreWithId\n                )\n\n                for j, parsed_result in enumerate(parsed_results):\n                    if parsed_result is not None:\n                        all_parsed_scores.append(parsed_result)\n                    else:\n                        logger.warning(f\"Failed to parse result for batch item at index {i + j}\")\n\n            except Exception as e:\n                logger.error(\n                    f\"Failed to call online model or parse result for batch starting at index {i}, error: {str(e)}\"\n                )\n\n        score_map = {score.id: score.score for score in all_parsed_scores}\n        for qa in data:\n            if qa.id in score_map:\n                qa.score = score_map[qa.id]\n            else:\n                logger.warning(f\"No score obtained for QA ID {qa.id}, default assigned 0\")\n                qa.score = 0\n\n        scores = [qa.score for qa in data if qa.score is not None]\n        score_series = pd.Series(scores)\n        score_counts = score_series.value_counts().sort_index()\n        score_percentages = score_series.value_counts(normalize=True).sort_index() * 100\n        pd.set_option(\"display.unicode.east_asian_width\", True)\n        distribution_df = pd.DataFrame(\n            {\n                \"Count\": score_counts,\n                \"Percentage(%)\": score_percentages.round(2),\n            }\n        )\n        distribution_df.index.name = \"Score\"\n        printable_df_str = distribution_df.reset_index().to_string(index=False)\n        logger.success(f\"Online model scoring distribution:\\n{printable_df_str}\")\n"
  },
  {
    "path": "weclone/data/models.py",
    "content": "from dataclasses import dataclass\nfrom typing import Optional\n\nfrom pandas import Timestamp\nfrom pydantic import BaseModel, Field\n\nfrom weclone.utils.config_models import DataModality\nfrom weclone.utils.i18n import MultiLangList\n\n\n@dataclass\nclass ChatMessage:\n    id: int  # sequential id\n    MsgSvrID: str  # original message id from platform\n    type_name: str  # message type, refer to cut_type_data and skip_type_data\n    is_sender: int  # 0: other party, 1: self\n    talker: str  # message sender\n    msg: str  # message content\n    src: str  # media file path, additional info field\n    CreateTime: Timestamp  # message send time\n    room_name: Optional[str] = None  # chat room name\n    is_forward: bool = False  # whether it's a forwarded message\n    modality: Optional[DataModality] = None  # message modality, set in qa_generator.py\n\n\n@dataclass\nclass CutMessage:\n    is_sender: int\n    cut_type: str\n    CreateTime: Timestamp\n\n\n@dataclass\nclass Message:\n    role: str\n    content: str\n\n\n@dataclass\nclass QaPair:\n    id: int\n    time: Timestamp\n    score: int\n    messages: list[Message]\n    images: list[str]\n    system: str\n\n\nclass QaPairScore(BaseModel):\n    score: int = Field(ge=1, le=5)\n\n\nclass QaPairScoreWithId(QaPairScore):\n    id: int\n\n\ncut_type_data = {\n    \"zh_CN\": [\n        \"cut\",\n        \"Cut\",\n        \"图片\",\n        \"视频\",\n        \"合并转发的聊天记录\",\n        \"语音\",\n        \"(分享)音乐\",\n        \"(分享)卡片式链接\",\n        \"(分享)笔记\",\n        \"(分享)小程序\",\n        \"(分享)收藏夹\",\n        \"(分享)视频号名片\",\n        \"(分享)视频号视频\",\n        \"粘贴的文本\",  # 无法解析的分享链接\n        \"未知\",\n    ],\n    \"en\": [\n        \"cut\",\n        \"Cut\",\n        \"image\",\n        \"video\",\n        \"merged forward chat records\",\n        \"voice\",\n        \"(share) music\",\n        \"(share) card link\",\n        \"(share) note\",\n        \"(share) mini program\",\n        \"(share) 
favorites\",\n        \"(share) video account card\",\n        \"(share) video account video\",\n        \"pasted text\",  # Unparseable share link\n        \"unknown\",\n    ],\n}\n\ncut_type_list = MultiLangList(cut_type_data, default_lang=\"en\")\n\n\nskip_type_data = {\n    \"zh_CN\": [\n        \"添加好友\",\n        \"推荐公众号\",\n        \"动画表情\",\n        \"用户上传的GIF表情\",\n        \"位置\",\n        \"文件\",\n        \"位置共享\",\n        \"引用回复\",\n        \"群公告\",\n        \"转账\",\n        \"语音通话\",\n        \"系统通知\",\n        \"消息撤回\",\n        \"拍一拍\",\n        \"邀请加群\",\n    ],\n    \"en\": [\n        \"add friend\",\n        \"recommend official account\",\n        \"sticker\",\n        \"sticker2\",\n        \"location\",\n        \"file\",\n        \"location sharing\",\n        \"reply with quote\",\n        \"group announcement\",\n        \"transfer\",\n        \"voice call\",\n        \"system notification\",\n        \"message recall\",\n        \"pat pat\",\n        \"invite to group\",\n    ],\n}\n\nskip_type_list = MultiLangList(skip_type_data, default_lang=\"en\")\n\nunprocessed_type_list = []\n"
  },
  {
    "path": "weclone/data/qa_generator.py",
    "content": "import json\nimport os\nimport re\nimport subprocess  # nosec\nimport sys\nfrom typing import List, Union, cast\n\nos.environ.setdefault(\"VLLM_WORKER_MULTIPROC_METHOD\", \"spawn\")\n\nimport pandas as pd\nfrom pandas import Timestamp\n\nfrom weclone.core.PII.pii_detector import ChinesePIIDetector, PIIDetector\nfrom weclone.data.chat_parsers.telegram_parser import process_telegram_dataset\nfrom weclone.data.clean.strategies import LLMCleaningStrategy, OlineLLMCleaningStrategy\nfrom weclone.data.models import (\n    ChatMessage,\n    CutMessage,\n    Message,\n    QaPair,\n    cut_type_list,\n    skip_type_list,\n)\nfrom weclone.data.strategies import TimeWindowStrategy\nfrom weclone.data.utils import ImageToTextProcessor, check_image_file_exists\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import DataModality, LanguageType, PlatformType, WCMakeDatasetConfig\nfrom weclone.utils.log import logger\n\n\nclass DataProcessor:\n    def __init__(self):\n        self.config = cast(WCMakeDatasetConfig, load_config(arg_type=\"make_dataset\"))\n        self.csv_folder = \"./dataset/csv\"\n        self.system_prompt = self.config.default_system\n        self.enable_clean = self.config.clean_dataset.enable_clean\n\n        # message type\n        self.QaPair = QaPair\n\n        self.include_type = self.config.include_type\n        if self.config.platform == PlatformType.CHAT:\n            self.cut_type_list = cut_type_list.get_items(lang=\"zh_CN\")\n            self.skip_type_list = skip_type_list.get_items(lang=\"zh_CN\")\n            self.include_type = cut_type_list.translate_batch(\n                texts=[t for t in self.include_type if t.lower() != \"text\"]\n            )\n            self.cut_type_list = [t for t in self.cut_type_list if t not in self.include_type]\n        elif self.config.platform == PlatformType.TELEGRAM:\n            self.cut_type_list = cut_type_list.get_items(lang=\"en\")\n            
self.skip_type_list = skip_type_list.get_items(lang=\"en\")\n            self.include_type = [t for t in self.include_type if t.lower() != \"text\"]\n            self.cut_type_list = [t for t in self.cut_type_list if t not in self.include_type]\n            if DataModality.STICKER in self.include_type:\n                self.skip_type_list.remove(\"sticker\")\n\n        # blocked words\n        config_blocked_words = self.config.blocked_words\n        file_blocked_words = []\n        try:\n            with open(\"./dataset/blocked_words.json\", encoding=\"utf-8\") as f:\n                file_blocked_words = json.load(f).get(\"blocked_words\", [])\n        except (FileNotFoundError, json.JSONDecodeError):\n            pass\n\n        self.blocked_words = list(set(config_blocked_words + file_blocked_words))\n        # logger.info(f\"Chat record blocked words: {self.blocked_words}\")\n\n        # combine strategy\n        if self.config.single_combine_strategy == \"time_window\":\n            self.single_combine_strategy = TimeWindowStrategy(\n                time_window=self.config.single_combine_time_window * 60,\n                is_single_chat=True,\n            )\n\n        if self.config.qa_match_strategy == \"time_window\":\n            self.qa_match_strategy = TimeWindowStrategy(\n                time_window=self.config.qa_match_time_window * 60,\n                is_single_chat=False,\n            )\n\n        # PII detection\n        if self.config.language == LanguageType.ZH:\n            self.pii_detector = ChinesePIIDetector()\n        else:\n            self.pii_detector = PIIDetector(language=self.config.language)\n\n        # dataset cleaning\n        clean_dataset_config = self.config.clean_dataset\n\n        if self.enable_clean:\n            if clean_dataset_config.clean_strategy == \"llm\":\n                if self.config.online_llm_clear:\n                    self.clean_strategy = OlineLLMCleaningStrategy(make_dataset_config=self.config)\n            
    else:\n                    from llamafactory.extras.packages import is_vllm_available\n\n                    if not is_vllm_available():\n                        logger.error(\"vLLM is not available, dataset cleaning is not supported.\")\n                        sys.exit(1)\n                    else:\n                        self.clean_strategy = LLMCleaningStrategy(make_dataset_config=self.config)\n\n        vision_config = self.config.vision_api\n        if vision_config.enable and vision_config.api_key:\n            self.image_processor = ImageToTextProcessor(\n                api_url=vision_config.api_url,  # type: ignore\n                api_key=vision_config.api_key,  # type: ignore\n                model_name=vision_config.model_name,  # type: ignore\n                config=self.config,\n            )\n            logger.info(f\"ImageToText functionality enabled, model: {self.image_processor.model_name}\")\n        else:\n            self.image_processor = None\n\n        self.c = self.config\n\n        self.relations = {}\n\n    def main(self):\n        self.pre_parse_chat_dataset()\n\n        if not os.path.exists(self.csv_folder) or not os.listdir(self.csv_folder):\n            logger.error(\n                f\"Error: Directory '{self.csv_folder}' does not exist or is empty. 
Please check the path and ensure it contains CSV chat data files.\"\n            )\n            sys.exit(1)\n\n        csv_files = self.get_csv_files()\n        logger.info(f\"Found {len(csv_files)} CSV files in total, starting processing, please be patient...\")\n        message_list: List[ChatMessage] = []\n        for csv_file in csv_files:\n            logger.debug(f\"Starting to process CSV file: {csv_file}\")\n            chat_messages = self.load_file(csv_file)\n            message_list.extend(self.group_consecutive_messages(messages=chat_messages))\n            # self.process_by_msgtype(chat_message)\n            logger.debug(f\"Processing completed: {csv_file}, loaded {len(chat_messages)} messages in total\")\n        qa_res = self.match_qa(messages=message_list)\n        qa_res = [item for item in qa_res if isinstance(item, QaPair)]\n\n        if self.image_processor:\n            logger.info(\"Starting image recognition process...\")\n            qa_res = self.image_processor._process_images_in_parallel(qa_res)\n            logger.info(\"Image recognition process completed.\")\n\n        if self.enable_clean:\n            self.clean_strategy.judge(qa_res)  # type: ignore\n\n        self.save_result(qa_res)\n        self._execute_length_cdf_script()\n\n        logger.success(\n            f\"Chat record processing successful, obtained {len(qa_res)} data entries in total, saved to ./dataset/res_csv/sft/sft-my.json\"\n        )\n\n    def pre_parse_chat_dataset(self):\n        if self.c.platform == PlatformType.TELEGRAM:\n            process_telegram_dataset(self.config)\n\n    def _execute_length_cdf_script(self):\n        \"\"\"Execute the length_cdf.py script to calculate cutoff_len.\"\"\"\n        try:\n            python_executable = sys.executable\n            script_path = os.path.join(\"weclone\", \"utils\", \"length_cdf.py\")\n\n            command_parts = [\n                python_executable,\n                script_path,\n                
f'--model_name_or_path=\"{self.c.model_name_or_path}\"',\n                f'--dataset=\"{self.c.dataset}\"',\n                f'--dataset_dir=\"{self.c.dataset_dir}\"',\n                f'--template=\"{self.c.template}\"',\n                \"--interval=512\",\n            ]\n\n            if hasattr(self.c, \"media_dir\") and self.c.media_dir:\n                command_parts.append(f'--media_dir=\"{self.c.media_dir}\"')\n            if hasattr(self.c, \"image_max_pixels\") and self.c.image_max_pixels:\n                command_parts.append(f'--image_max_pixels=\"{self.c.image_max_pixels}\"')\n\n            child_env = os.environ.copy()\n            child_env[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n            child_env[\"LLAMAFACTORY_VERBOSITY\"] = \"ERROR\"\n\n            process = subprocess.Popen(\n                command_parts,\n                env=child_env,\n                stdout=None,  # Use None to indicate using parent process's stdout (i.e., terminal)\n                stderr=None,\n                text=True,\n                bufsize=1,\n            )  # nosec\n            return_code = process.wait()\n            if return_code != 0:\n                logger.error(\n                    f\"Command '{' '.join(command_parts)}' execution failed with return code {return_code}\"\n                )\n        except FileNotFoundError:\n            logger.error(\n                f\"Command execution failed: executable '{command_parts[0]}' or script '{command_parts[1]}' not found\"\n            )\n        except KeyError as e:\n            logger.error(f\"Failed to execute length_cdf.py script: missing configuration item {str(e)}\")\n        except Exception as e:\n            logger.error(f\"Unknown error occurred while executing length_cdf.py script: {str(e)}\")\n\n    def get_csv_files(self):\n        \"\"\"Traverse the folder to get all CSV file paths and sort by starting sequence number in filename\"\"\"\n\n        csv_files = []\n        for chat_obj_folder in 
os.listdir(self.csv_folder):\n            chat_obj_folder_path = os.path.join(self.csv_folder, chat_obj_folder)\n            for csvfile in os.listdir(chat_obj_folder_path):\n                if not csvfile.endswith(\".csv\"):\n                    continue\n                csvfile_path = os.path.join(chat_obj_folder_path, csvfile)\n                csv_files.append(csvfile_path)\n        pattern = re.compile(r\"_(\\d+)_\\d+\\.csv$\")\n\n        def extract_start(fp: str) -> int:\n            name = os.path.basename(fp)\n            m = pattern.search(name)\n            return int(m.group(1)) if m else 0\n\n        csv_files.sort(key=extract_start)\n        return csv_files\n\n    def match_qa(self, messages: List[ChatMessage]) -> List[Union[QaPair, CutMessage]]:\n        \"\"\"\n        Match question-answer pairs\n\n        Args:\n            messages: Message list\n\n        Returns:\n            List[Union[QaPair, CutMessage]]: List of Q&A pairs containing instructions and outputs\n        \"\"\"\n        WAITING_INSTRUCTION = \"waiting_instruction\"\n        WAITING_RESPONSE = \"waiting_response\"\n\n        current_state = WAITING_INSTRUCTION\n        qa_res: List[Union[QaPair, CutMessage]] = []\n        last_message = None\n        current_instruction = None\n        qa_id_counter = 0\n\n        conversation_messages: List[Message] = []\n        conversation_images: List[str] = []\n        conversation_talker = \"\"\n\n        def _calculate_qa_length(\n            messages: List[Message], new_user_content: str, new_assistant_content: str\n        ) -> int:\n            \"\"\"Calculate total character length of messages plus new messages\"\"\"\n            total_length = 0\n            for msg in messages:\n                total_length += len(msg.content)\n            total_length += len(new_user_content) + len(new_assistant_content)\n            return total_length\n\n        def _save_current_qa_pair(\n            qa_id: int,\n            time_stamp: 
Timestamp,\n            current_conversation_messages: List[Message],\n            current_conversation_images: List[str],\n            talker: str = \"\",\n        ) -> int:\n            \"\"\"Helper function to save the current QA pair.\"\"\"\n            nonlocal qa_res  # Allow modification of qa_res from the outer scope\n\n            total_length = _calculate_qa_length(current_conversation_messages, \"\", \"\")\n\n            if total_length <= self.config.messages_max_length:\n                if len(current_conversation_images) > self.config.max_image_num:\n                    logger.warning(\n                        f\"QA pair (potential id {qa_id}) with timestamp {time_stamp} \"\n                        f\"has too many images ({len(current_conversation_images)} > {self.config.max_image_num}) \"\n                        \"and will be skipped.\"\n                    )\n                    return qa_id\n\n                if (\n                    len(current_conversation_messages) == 2\n                    and current_conversation_messages[0].role == \"user\"\n                    and current_conversation_messages[0].content == \"<begin_chat>\"\n                ):\n                    return qa_id\n\n                system_content = self.system_prompt\n                if self.c.add_time:\n                    system_content += f\"\\n 现在时间是{time_stamp.strftime('%m-%d %H:%M')}\"\n                if self.c.add_relation and talker:\n                    relation = self.relations.get(talker, \"\")\n                    if relation:\n                        system_content += f\"\\n 对方是你的{relation}，你们正在聊天\"\n\n                processed_messages = current_conversation_messages.copy()\n                for i in range(len(processed_messages) - 1):\n                    if (\n                        processed_messages[i].role == \"user\"\n                        and \"<begin_chat>\" in processed_messages[i].content\n                        and i + 1 < 
len(processed_messages)\n                        and processed_messages[i + 1].role == \"assistant\"\n                    ):\n                        assistant_content = processed_messages[i + 1].content\n                        processed_messages[i] = Message(\n                            role=\"user\",\n                            content=processed_messages[i].content.replace(\n                                \"<begin_chat>\", f\"<begin_chat>你应该说：{assistant_content}</begin_chat>\"\n                            ),\n                        )\n\n                qa_pair = self.QaPair(\n                    id=qa_id,\n                    time=time_stamp,\n                    score=0,\n                    messages=processed_messages,\n                    images=current_conversation_images.copy(),\n                    system=system_content,\n                )\n                qa_res.append(qa_pair)\n                return qa_id + 1\n            else:\n                logger.warning(\n                    f\"QA pair (potential id {qa_id}) with timestamp {time_stamp} \"\n                    f\"exceeds max length ({total_length} > {self.config.messages_max_length}) \"\n                    \"and will be skipped.\"\n                )\n                return qa_id\n\n        for msg in messages:\n            if isinstance(msg, CutMessage):\n                # When encountering CutMessage, save current conversation and reset state\n                if conversation_messages:\n                    qa_id_counter = _save_current_qa_pair(\n                        qa_id_counter,\n                        last_message.CreateTime if last_message else msg.CreateTime,\n                        conversation_messages,\n                        conversation_images,\n                        conversation_talker,\n                    )\n                # Reset state\n                current_state = WAITING_INSTRUCTION\n                current_instruction = None\n                last_message = None\n   
             conversation_messages = []\n                conversation_images = []\n                conversation_talker = \"\"\n                continue\n\n            if current_state == WAITING_INSTRUCTION:\n                if msg.is_sender == 0:  # Received message from other party\n                    if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg):\n                        # If not the same conversation and there is a previous message, save the previous conversation\n                        if conversation_messages:\n                            qa_id_counter = _save_current_qa_pair(\n                                qa_id_counter,\n                                last_message.CreateTime,\n                                conversation_messages,\n                                conversation_images,\n                                conversation_talker,\n                            )\n                            conversation_messages = []\n                            conversation_images = []\n\n                    # Regardless of whether a new conversation has just been started, this 'msg' now becomes the current instruction.\n                    current_instruction = msg\n                    last_message = msg\n                    conversation_talker = msg.talker\n                    current_state = WAITING_RESPONSE\n                elif msg.is_sender == 1:  # Own message as first message\n                    if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg):\n                        if conversation_messages:\n                            qa_id_counter = _save_current_qa_pair(\n                                qa_id_counter,\n                                last_message.CreateTime,\n                                conversation_messages,\n                                conversation_images,\n                                conversation_talker,\n                            )\n                
            conversation_messages = []\n                            conversation_images = []\n\n                    conversation_messages.append(Message(role=\"user\", content=\"<begin_chat>\"))\n                    conversation_messages.append(Message(role=\"assistant\", content=msg.msg))\n                    last_message = msg\n\n            elif current_state == WAITING_RESPONSE:\n                if msg.is_sender == 0:  # Received message from other party\n                    if last_message and not self.qa_match_strategy.is_same_conversation([last_message], msg):\n                        if conversation_messages:\n                            qa_id_counter = _save_current_qa_pair(\n                                qa_id_counter,\n                                last_message.CreateTime,\n                                conversation_messages,\n                                conversation_images,\n                                conversation_talker,\n                            )\n                            conversation_messages = []\n                            conversation_images = []\n                    current_instruction = msg\n                    last_message = msg\n                    conversation_talker = msg.talker\n                    # State remains unchanged\n                else:  # Own message - use strategy to determine if it belongs to the same conversation\n                    if last_message and self.qa_match_strategy.is_same_conversation([last_message], msg):\n                        if current_instruction is None:\n                            raise ValueError(\"current_instruction should not be None when creating a QA pair\")\n\n                        conversation_messages.append(Message(role=\"user\", content=current_instruction.msg))\n                        conversation_messages.append(Message(role=\"assistant\", content=msg.msg))\n                        if hasattr(current_instruction, \"src\") and current_instruction.src:\n                
            if isinstance(current_instruction.src, list):\n                                valid_images = [img_src for img_src in current_instruction.src if img_src]\n                                if valid_images:\n                                    conversation_images.extend(valid_images)\n                            elif current_instruction.src:\n                                conversation_images.append(current_instruction.src)\n                        last_message = msg\n\n                    # Regardless of whether it matches, reset state\n                    current_state = WAITING_INSTRUCTION\n                    current_instruction = None\n\n        # Process the last conversation\n        if conversation_messages and last_message:\n            qa_id_counter = _save_current_qa_pair(\n                qa_id_counter,\n                last_message.CreateTime,\n                conversation_messages,\n                conversation_images,\n                conversation_talker,\n            )\n\n        return qa_res\n\n    def group_consecutive_messages(self, messages: List[ChatMessage]) -> List[ChatMessage]:\n        \"\"\"\n        Combine multiple consecutive messages from the same person into one message, add cut when encountering cut_type\n\n        Args:\n            messages: Message list\n\n        Returns:\n            List[ChatMessage]: Combined message list\n        \"\"\"\n        if not messages:\n            return []\n\n        def _combine_text(messages: List[ChatMessage]) -> ChatMessage:\n            \"\"\"\n            Merge multiple messages into one\n\n            Args:\n                messages: List of messages to merge\n\n            Returns:\n                ChatMessage: Merged message\n            \"\"\"\n            base_msg = messages[0]\n            combined_content = messages[0].msg\n            combined_src_list = [messages[0].src] if messages[0].modality == DataModality.IMAGE else []\n\n            for i in messages[1:]:\n          
      content = i.msg\n                if not content:\n                    continue\n\n                if combined_content and combined_content[-1] not in [\n                    \"。\",\n                    \".\",\n                    \"！\",\n                    \"!\",\n                    \"？\",\n                    \"?\",\n                    \"…\",\n                    \"，\",\n                    \",\",\n                ]:\n                    combined_content += \"\\n\"\n\n                if i.modality == DataModality.IMAGE:\n                    combined_src_list.append(i.src)\n\n                combined_content += content\n\n            if len(combined_content) > self.c.combine_msg_max_length:\n                logger.warning(\n                    f\"Combined message length exceeds {self.c.combine_msg_max_length}, will truncate: {combined_content[:50]}\"\n                )\n                combined_content = combined_content[: self.c.combine_msg_max_length]\n                remaining_image_count = combined_content.count(\"<image>\")\n                if len(combined_src_list) > remaining_image_count:\n                    combined_src_list = combined_src_list[:remaining_image_count]\n\n            combined_message = ChatMessage(\n                id=base_msg.id,\n                MsgSvrID=base_msg.MsgSvrID,\n                type_name=base_msg.type_name,\n                is_sender=base_msg.is_sender,\n                talker=base_msg.talker,\n                room_name=base_msg.room_name,\n                msg=combined_content,\n                src=combined_src_list,  # type: ignore\n                CreateTime=messages[-1].CreateTime,  # Use the time of the last message\n                modality=base_msg.modality,\n                is_forward=base_msg.is_forward,\n            )\n\n            return combined_message\n\n        def _create_cut_message(message: ChatMessage) -> CutMessage:\n            return CutMessage(\n                is_sender=message.is_sender,\n      
          cut_type=message.type_name,\n                CreateTime=message.CreateTime,\n            )\n\n        def _combine_current_group(group):\n            \"\"\"\n            Process current message group and add to grouped_messages\n\n            Args:\n                group: Current message group\n            \"\"\"\n            if len(group) > 1:\n                combined_msg = _combine_text(group)\n                grouped_messages.append(combined_msg)\n            else:\n                grouped_messages.append(group[0])\n\n        grouped_messages = []\n        current_group = []\n\n        for _, current_msg in enumerate(messages):\n            if current_msg.type_name in self.cut_type_list or (\n                current_msg.modality == DataModality.IMAGE and current_msg.is_sender == 1\n            ):  # Own image messages need to be cut\n                if current_group:\n                    # Current group has messages, combine current group and add a cut\n                    _combine_current_group(current_group)\n                    current_group = []\n\n                    cut_msg = _create_cut_message(current_msg)\n                    grouped_messages.append(cut_msg)\n                else:\n                    # Current group has no messages, check previous group\n                    if grouped_messages:\n                        if not isinstance(grouped_messages[-1], CutMessage):\n                            cut_msg = _create_cut_message(current_msg)\n                            grouped_messages.append(cut_msg)\n                    # If previous group has no messages or last one is CutMessage, continue directly\n                continue\n\n            if not current_group:\n                current_group = [current_msg]\n                continue\n\n            last_msg = current_group[-1]\n\n            # Determine if it's consecutive messages from the same person\n            if (\n                current_msg.is_sender == last_msg.is_sender\n         
       and current_msg.talker == last_msg.talker\n                and self.single_combine_strategy.is_same_conversation([last_msg], current_msg)\n            ):\n                current_group.append(current_msg)\n            else:\n                # Not messages from the same person, process current group and start new group\n                _combine_current_group(current_group)\n                # Start new group\n                current_group = [current_msg]\n\n        # Process the last group of messages\n        if current_group:\n            _combine_current_group(current_group)\n\n        return grouped_messages\n\n    def process_by_msgtype(self, chat_message: ChatMessage):\n        if chat_message.type_name.lower() in [\"文本\", \"text\"]:\n            self.process_text(chat_message)\n        # elif chat_message.modality == DataModality.IMAGE:\n        #     self.process_image(chat_message)\n\n    def load_file(self, file_path) -> List[ChatMessage]:\n        \"\"\"\n        Perform overall first preprocessing, filter rows that don't meet conditions, check if images exist and change type to cut if not, add DataModality field\n        \"\"\"\n        folder_path = os.path.dirname(file_path)\n        folder_name = os.path.basename(folder_path)\n\n        if folder_name not in self.relations:\n            users_json_path = os.path.join(folder_path, \"users.json\")\n            if os.path.exists(users_json_path):\n                try:\n                    with open(users_json_path, encoding=\"utf-8\") as f:\n                        users_data = json.load(f)\n                        relation = users_data.get(\"relation\", \"\")\n                        if relation:\n                            self.relations[folder_name] = relation\n                            logger.debug(f\"Loaded relation for {folder_name}: {relation}\")\n                except (FileNotFoundError, json.JSONDecodeError) as e:\n                    logger.warning(f\"Failed to load users.json from 
{folder_path}: {e}\")\n\n        df = pd.read_csv(\n            file_path,\n            encoding=\"utf-8\",\n            dtype={\"msg\": str, \"src\": str},\n            escapechar=None,\n            keep_default_na=False,\n        )\n\n        df = df[~df[\"type_name\"].isin(values=self.skip_type_list)]\n\n        if \"is_forward\" in df.columns:\n            df = df[~((df[\"is_sender\"] == 1) & (df[\"is_forward\"]))]\n\n        # Batch process text messages for PII detection and blocked words\n        text_indices = []\n        text_messages = []\n\n        for i in df.index:\n            if df.loc[i, \"type_name\"].lower() in [\"文本\", \"text\"]:  # type: ignore\n                msg_str = str(df.loc[i, \"msg\"])\n                msg_str = msg_str.replace(\"\\n\", \"\")\n                text_indices.append(i)\n                text_messages.append(msg_str)\n\n        # TODO Deleting directly by batch_has_pii returning true/false.\n        indices_to_drop = []\n        if text_messages:\n            pii_results = self.pii_detector.batch_has_pii(text_messages)\n\n            for idx, (df_index, msg_str, has_pii) in enumerate(zip(text_indices, text_messages, pii_results)):\n                if has_pii:\n                    indices_to_drop.append(df_index)\n                    continue\n\n                # Check blocked words\n                for blocked_word in self.blocked_words:\n                    if blocked_word in msg_str:\n                        indices_to_drop.append(df_index)\n                        break\n\n        df = df.drop(index=indices_to_drop)\n\n        # Process other message types\n        for i in df.index:\n            if df.loc[i, \"type_name\"].lower() in [\"文本\", \"text\"]:\n                continue\n            if df.loc[i, \"src\"].lower().endswith(\".gif\"):\n                df.loc[i, \"src\"] = \"\"\n                df.loc[i, \"type_name\"] = \"动画表情\" if self.c.platform == PlatformType.CHAT else \"sticker\"\n                continue\n     
       if df.loc[i, \"type_name\"].lower() in [\"图片\", \"image\"]:  # type: ignore\n                if self.c.platform in [PlatformType.CHAT, PlatformType.TELEGRAM]:\n                    result = check_image_file_exists(str(df.loc[i, \"src\"]))\n                    if isinstance(result, str) and df.loc[i, \"is_sender\"] == 0:\n                        df.loc[i, \"src\"] = result\n                        df.loc[i, \"msg\"] = \"<image>\"\n                        df.loc[i, \"modality\"] = DataModality.IMAGE\n                    else:\n                        df.loc[i, \"type_name\"] = \"Cut\"\n            elif df.loc[i, \"type_name\"] in [\"sticker\", \"动画表情\"]:\n                if self.c.platform in [PlatformType.CHAT, PlatformType.TELEGRAM]:\n                    df.loc[i, \"src\"] = \"\"\n                    continue\n            else:\n                df.loc[i, \"msg\"] = \"\"\n\n        df = df.dropna(how=\"all\")\n        # Time format: 2021-07-07 10:27:23\n        df[\"CreateTime\"] = pd.to_datetime(df[\"CreateTime\"])\n\n        return [ChatMessage(**row) for row in df.to_dict(\"records\")]  # type: ignore\n\n    def process_text(self, chat_message: ChatMessage):\n        pass\n\n    def save_result(self, qa_res: List[QaPair]):\n        \"\"\"\n        Saves the list of QaPair objects to a JSON file after converting them to dictionaries.\n\n        Args:\n            qa_res: A list of QaPair objects.\n        \"\"\"\n        processed_qa_res = []\n        for idx, item in enumerate(qa_res):\n            item_dict = {\n                \"id\": str(idx),\n                \"time\": item.time.isoformat() if item.time else None,\n                \"score\": item.score,\n                \"messages\": [{\"role\": msg.role, \"content\": msg.content} for msg in item.messages],\n                \"images\": item.images,\n                \"system\": item.system,\n            }\n            processed_qa_res.append(item_dict)\n\n        output_path = 
\"./dataset/res_csv/sft/sft-my.json\"\n        os.makedirs(os.path.dirname(output_path), exist_ok=True)\n        with open(output_path, \"w\", encoding=\"utf-8\") as f:\n            json.dump(processed_qa_res, f, ensure_ascii=False, indent=4)\n        logger.success(\n            f\"Chat record processing successful, {len(qa_res)} entries in total, saved to {output_path}\"\n        )\n\n\nif __name__ == \"__main__\":\n    processor = DataProcessor()\n    processor.main()\n"
  },
  {
    "path": "weclone/data/strategies.py",
    "content": "from abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom typing import List\n\nfrom .models import ChatMessage\n\n\n@dataclass\nclass ConversationStrategy(ABC):\n    \"\"\"Abstract base class for conversation strategies\"\"\"\n\n    is_single_chat: bool\n\n    @abstractmethod\n    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:\n        \"\"\"Determine if two messages belong to the same conversation\"\"\"\n        pass\n\n\n@dataclass\nclass TimeWindowStrategy(ConversationStrategy):\n    \"\"\"Time window based judgment strategy\"\"\"\n\n    time_window: int  # Time window in minutes\n\n    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:\n        time_diff = abs((current_msg.CreateTime - history_msg[-1].CreateTime)).total_seconds()\n        return time_diff <= self.time_window\n\n\n@dataclass\nclass LLMStrategy(ConversationStrategy):\n    \"\"\"LLM based judgment strategy\"\"\"\n\n    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:\n        # TODO: Implement LLM-based conversation detection logic\n        return False\n\n\n@dataclass\nclass CompositeStrategy(ConversationStrategy):\n    \"\"\"Composite strategy that combines multiple strategies\"\"\"\n\n    strategies: List[ConversationStrategy]\n    require_all: bool = True\n\n    def is_same_conversation(self, history_msg: List[ChatMessage], current_msg: ChatMessage) -> bool:\n        # TODO: Implement composite strategy logic\n        return False\n"
  },
  {
    "path": "weclone/data/utils.py",
    "content": "import base64\nimport concurrent.futures\nimport os\nfrom pathlib import Path\n\nimport requests\n\nfrom weclone.utils.config_models import WCMakeDatasetConfig\nfrom weclone.utils.log import logger\nfrom weclone.utils.retry import retry_on_http_error\n\n\ndef check_image_file_exists(file_path: str) -> str | bool:\n    try:\n        normalized_path = os.path.normpath(file_path).replace(\"\\\\\", \"/\")\n\n        filename_with_ext = os.path.basename(normalized_path)\n        filename_without_ext = Path(filename_with_ext).stem\n\n        # 使用 glob 查找精确匹配该文件名的文件（不论扩展名）\n        images_dir = Path(\"dataset\") / \"media\" / \"images\"\n        matching_files = list(images_dir.glob(f\"{filename_without_ext}.*\"))\n\n        if len(matching_files) > 0:\n            # 获取相对于dataset/media的路径，只保留images/文件名\n            full_path = matching_files[0]\n            relative_path = full_path.relative_to(Path(\"dataset\") / \"media\")\n            return str(relative_path)\n        else:\n            return False\n\n    except Exception as e:\n        logger.error(f\"检查图片文件时出错: {file_path}, 错误: {e}\")\n        return False\n\n\nclass ImageToTextProcessor:\n    \"\"\"通过兼容OpenAI API的多模态LLM将图片转换为文本。\"\"\"\n\n    def __init__(self, api_url: str, api_key: str, model_name: str, config: WCMakeDatasetConfig):\n        self.api_url = api_url.rstrip(\"/\")\n        self.api_key = api_key\n        self.model_name = model_name\n        self.config = config\n        self.prompt = \"\"\"\n        请描述这张图片的内容，重点关注：\n        1. 如果是截图，描述界面内容和操作\n        2. 如果是表格，描述表格结构和数据\n        3. 如果是文档，提取关键文字信息\n        4. 
如果是生活照片，简要描述场景和内容。\n        请用简洁明了的语言描述，不超过100字。\"\"\"\n\n    def _process_images_in_parallel(self, qa_list):\n        \"\"\"并行处理所有对话中的图片，并将描述替换回对话文本。\"\"\"\n        all_image_paths = []\n        media_dir = self.config.media_dir\n\n        # 遍历所有对话，收集并构造完整的图片路径\n        for qa_pair in qa_list:\n            if qa_pair.images:\n                image_list = qa_pair.images if isinstance(qa_pair.images, list) else [qa_pair.images]\n                for relative_path in image_list:\n                    full_path = os.path.join(media_dir, relative_path)\n                    all_image_paths.append(full_path)\n\n        if not all_image_paths:\n            logger.info(\"未在对话中找到任何图片，跳过识别。\")\n            return qa_list\n\n        logger.info(f\"共找到 {len(all_image_paths)} 张有效图片需要识别。\")\n        max_workers = self.config.vision_api.max_workers\n\n        # 使用线程池并行调用API，executor.map 会保持结果顺序与输入一致\n        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n            # 现在传递给 image_processor 的是完整的路径\n            image_descriptions = list(executor.map(self.describe_image, all_image_paths))\n\n        desc_iterator = iter(image_descriptions)\n        for qa_pair in qa_list:\n            if not qa_pair.images:\n                continue\n\n            for message in qa_pair.messages:\n                # 替换消息内容中的 <image> 占位符\n                num_images_in_message = message.content.count(\"<image>\")\n                for _ in range(num_images_in_message):\n                    try:\n                        description = next(desc_iterator)\n                        # 使用 count=1 确保每次只替换一个占位符，并添加换行符以增强可读性\n                        message.content = message.content.replace(\n                            \"<image>\", f\"\\n[图片描述: {description}]\\n\", 1\n                        )\n                    except StopIteration:\n                        logger.error(\"图片数量与描述数量不匹配，可能存在逻辑错误。\")\n                        message.content = message.content.replace(\"<image>\", 
\"\\n[图片描述缺失]\\n\", 1)\n\n            # 清空图片列表，因为它们已被转换为文本\n            qa_pair.images.clear()\n\n        return qa_list\n\n    def _encode_image_to_base64(self, image_path: str) -> str:\n        \"\"\"将图片编码为base64\"\"\"\n        try:\n            with open(image_path, \"rb\") as image_file:\n                return base64.b64encode(image_file.read()).decode(\"utf-8\")\n        except Exception as e:\n            logger.error(f\"编码图片失败 {image_path}: {e}\")\n            return \"\"\n\n    def _get_image_format(self, image_path: str) -> str:\n        \"\"\"获取图片格式\"\"\"\n        suffix = Path(image_path).suffix.lower().replace(\".\", \"\")\n        if suffix == \"jpg\":\n            return \"jpeg\"\n        return suffix\n\n    @retry_on_http_error(\n        max_retries=5,\n        base_delay=15.0,\n        max_delay=300.0,\n        backoff_factor=2.0,\n        retry_on_status=[429, 500, 502, 503, 504],\n        retry_on_exceptions=[requests.exceptions.RequestException, ConnectionError, TimeoutError],\n    )\n    def _call_vision_api(self, image_path: str) -> str:\n        \"\"\"调用Vision API（增加了重试机制）\"\"\"\n        base64_image = self._encode_image_to_base64(image_path)\n        if not base64_image:\n            return \"[图片处理失败：无法编码]\"\n\n        image_format = self._get_image_format(image_path)\n\n        headers = {\"Content-Type\": \"application/json\", \"Authorization\": f\"Bearer {self.api_key}\"}\n\n        payload = {\n            \"model\": self.model_name,\n            \"messages\": [\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\"type\": \"text\", \"text\": self.prompt},\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\"url\": f\"data:image/{image_format};base64,{base64_image}\"},\n                        },\n                    ],\n                }\n            ],\n            \"max_tokens\": 1000,\n     
       \"temperature\": 0.1,\n        }\n\n        response = requests.post(\n            f\"{self.api_url}/chat/completions\", headers=headers, json=payload, timeout=60\n        )\n\n        if response.status_code == 200:\n            result = response.json()\n            if \"choices\" in result and len(result[\"choices\"]) > 0:\n                content = result[\"choices\"][0][\"message\"][\"content\"]\n                return content.strip()\n            else:\n                logger.warning(f\"API响应格式异常: {result}\")\n                return \"[图片描述获取失败：API格式错误]\"\n        else:\n            logger.error(f\"API请求失败，状态码: {response.status_code}，原因: {response.reason}\")\n            response.raise_for_status()  # 触发重试机制\n            return \"[图片描述获取失败]\"\n\n    def describe_image(self, image_path: str) -> str:\n        \"\"\"公开方法，用于描述单张图片内容\"\"\"\n        if not os.path.exists(image_path):\n            logger.warning(f\"图片文件不存在: {image_path}\")\n            return \"[图片文件不存在]\"\n\n        logger.debug(f\"正在识别图片: {os.path.basename(image_path)}\")\n        return self._call_vision_api(image_path)\n\n\nif __name__ == \"__main__\":\n    path = \"Storage\\\\Image\\2021-08\\6ce3f785b4230246639c3dd0d4a8848c.dat\"\n    print(check_image_file_exists(path))\n"
  },
  {
    "path": "weclone/eval/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/eval/cli_demo.py",
    "content": "from llamafactory.chat import ChatModel\nfrom llamafactory.extras.misc import torch_gc\n\n\ndef main():\n    try:\n        import platform\n\n        if platform.system() != \"Windows\":\n            import readline  # noqa: F401\n    except ImportError:\n        print(\"Install `readline` for a better experience.\")\n\n    chat_model = ChatModel()\n    messages = []\n    print(\n        \"Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.\"\n    )\n\n    while True:\n        try:\n            query = input(\"\\nUser: \")\n        except UnicodeDecodeError:\n            print(\"Detected decoding error at the inputs, please set the terminal encoding to utf-8.\")\n            continue\n        except Exception:\n            raise\n\n        if query.strip() == \"exit\":\n            break\n\n        if query.strip() == \"clear\":\n            messages = []\n            torch_gc()\n            print(\"History has been removed.\")\n            continue\n\n        messages.append({\"role\": \"user\", \"content\": query})\n        print(\"Assistant: \", end=\"\", flush=True)\n\n        response = \"\"\n        for new_text in chat_model.stream_chat(messages):\n            print(new_text, end=\"\", flush=True)\n            response += new_text\n        print()\n        messages.append({\"role\": \"assistant\", \"content\": response})\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/eval/eval_model.py",
    "content": "from llamafactory.eval.evaluator import Evaluator\n\n\ndef main():\n    evaluator = Evaluator()\n    evaluator.eval()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/eval/test_model.py",
    "content": "import json\nfrom typing import List, cast  # 导入 cast\n\nimport openai\nfrom openai import OpenAI  # 导入 OpenAI 类\nfrom openai.types.chat import ChatCompletionMessageParam  # 导入消息参数类型\nfrom tqdm import tqdm\n\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import TestModelArgs, WCInferConfig\n\ninfer_config = cast(WCInferConfig, load_config(\"web_demo\"))\ntest_config = cast(TestModelArgs, load_config(\"test_model\"))\n\ncompletion_config = {\n    \"default_prompt\": infer_config.default_system,\n    \"model\": \"gpt-3.5-turbo\",\n    \"history_len\": 15,\n}\n\ncompletion_config = type(\"Config\", (object,), completion_config)()\n\nclient = OpenAI(api_key=\"\"\"sk-test\"\"\", base_url=\"http://127.0.0.1:8005/v1\")\n\n\ndef handler_text(content: str, history: list, config):\n    messages = [{\"role\": \"system\", \"content\": f\"{config.default_prompt}\"}]\n    for item in history:\n        messages.append(item)\n    messages.append({\"role\": \"user\", \"content\": content})\n    history.append({\"role\": \"user\", \"content\": content})\n    try:\n        typed_messages = cast(List[ChatCompletionMessageParam], messages)\n        response = client.chat.completions.create(\n            model=config.model,\n            messages=typed_messages,\n            max_tokens=50,\n        )\n    except openai.APIError as e:\n        history.pop()\n        return \"AI interface error, please try again\\n\" + str(e)\n\n    resp = str(response.choices[0].message.content)  # type: ignore\n    resp = resp.replace(\"\\n \", \"\")\n    history.append({\"role\": \"assistant\", \"content\": resp})\n    return resp\n\n\ndef main():\n    test_list = json.loads(open(test_config.test_data_path, \"r\", encoding=\"utf-8\").read())[\"questions\"]\n    res = []\n    for questions in tqdm(test_list, desc=\" Testing...\"):\n        history = []\n        for q in questions:\n            handler_text(q, history=history, config=completion_config)\n    
    res.append(history)\n\n    res_file = open(\"test_result-my.txt\", \"w\")\n    for r in res:\n        for i in r:\n            res_file.write(i[\"content\"] + \"\\n\")\n        res_file.write(\"\\n\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/eval/web_demo.py",
    "content": "from llamafactory.webui.interface import create_web_demo\n\nfrom weclone.utils.config import load_config\n\n\ndef main():\n    load_config(\"web_demo\")\n    demo = create_web_demo()\n    demo.queue()\n    demo.launch(server_name=\"0.0.0.0\", share=True, inbrowser=True)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/prompts/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/prompts/clean_data.py",
    "content": "CLEAN_PROMPT = \"\"\"\n\n# 角色\n你是一个数据质量评估员。\n\n# 任务\n你的任务是评估下面提供的聊天记录的**逻辑性**、**相关性**以及**风格代表性**。目标是识别并过滤掉那些回答与问题**明显不匹配**、**逻辑严重混乱**的样本，筛选出具有人类聊天风格独特性与辨识度的样本。请根据以下核心评估点给出一个1到5的整数分数，并将该分数与原始 `id` 一起输出。\n\n**重要考量:**\n1.  **简短回答的有效性:** 请注意，诸如“好的”、“是的”、“收到”、“嗯”、“知道了”等简短的肯定、确认或应答，在合适的语境下是完全**有逻辑且相关的**。**不要仅仅因为回答简短就将其评为低分。** 只有当这类简短回答与【问题/上下文 Q】**明显不符**时，才应考虑低分。\n2.  **处理错别字和自我纠正:** 聊天记录中可能包含常见的打字错误（错别字）或用户先打错字随后又自行纠正的情况（例如，发送“我想去1楼”紧接着又发送“*2楼”进行更正）。在评估时，请**聚焦于用户想要表达的最终意图和信息的核心内容**，而**不应仅仅因为存在错别字或纠正过程就判定为低质量**。。\n\n# 核心评估点 (请在心中衡量)\n1.  **相关性 (Relevance):** 【回答 A】是否直接回应或恰当地衔接了【问题/上下文 Q】？它是在回答问题，还是完全跑题了？只有当【回答 A】与【问题/上下文 Q】**明显矛盾**、**完全不着边际**（即使考虑上下文也无法合理化），或简短回答**明显不适用于**该【问题/上下文 Q】时，才给予低分。\n2.  **逻辑性 (Coherence):** 【回答 A】本身是否符合基本的逻辑？结合【问题/上下文 Q】来看，这个问答对是否构成了一个符合逻辑的交流片段？是否存在明显的矛盾、混乱的内容？只有当【回答 A】**自身逻辑混乱**、**与Q存在无法解释的矛盾**时，才给予低分。\n3. **风格代表性**  (Style Representativeness): 评估【回答 A】是否展现了自然、独特的人类对话风格特征。回答Ａ是否带有个性化的色彩？关注点包括但不限于：是否体现了特定的语气（如友好、幽默、不耐烦、正式、脏话），是否包含口头禅、俚语、网络用语（如“yyds”、“绝绝子”）、表情符号 Emoji、颜文字、标点符号的特殊使用如“!!!”、“???”、“~”等表达、特定的缩写或短语、非标准的但一致的表达方式（如方言词汇、个人口癖）？\n4. **以相关性和逻辑性为主要评判标准，风格代表性仅仅作为获得5分的必要条件。**\n\n# 评分标准 (1-5分)\n*   **1分 (极差):** 聊天记录中的问答内容完全不相关；逻辑严重混乱/矛盾。\n*   **2分 (差):** 大部分问答相关性很低；存在明显的逻辑问题或不连贯。\n*   **3分 (中等):** 问答相关性一般（可能部分问答跑题或回应不充分）；逻辑上勉强说得通但不够流畅或有瑕疵。\n*   **4分 (良好):** 大部分问答相关性好，回答了问题或恰当衔接，逻辑清晰。\n*   **5分 (优秀):** 问答相关性强，逻辑流畅，包含了显著的、具有辨识度的人类聊天的常用特征（例如情感情绪表达、口头禅、表情符号组合、特有的句子结构、鲜明的语气）\n# 输出要求\n请严格按照以下 JSON 格式输出，包含输入数据的 id 和你给出的1到5的整数评分 score，不要包含任何其他文字、解释或标签。\n{{\"id\": \"{id}\",\"score\": <这里填入1到5的整数评分>}}\n# 输入数据\n```json\n{{\"id\": \"{id}\",\"messages\": \"{messages}\"}}\n```\n\"\"\"\n\n# ONLINE_LLM_CLEAN_PROMPT = \"\"\"\n# # 角色\n# 你是一个数据质量评估员。\n\n# # 任务\n# 你的任务是评估下面提供的聊天记录的**逻辑性**、**相关性**以及**风格代表性**。目标是识别并过滤掉那些回答与问题**明显不匹配**、**逻辑严重混乱**的样本，筛选出具有人类聊天风格独特性与辨识度的样本。请根据以下核心评估点给出一个1到5的整数分数，并将该分数与原始 `id` 一起输出。\n\n# **重要考量:**\n# 1.  
**简短回答的有效性:** 请注意，诸如“好的”、“是的”、“收到”、“嗯”、“知道了”等简短的肯定、确认或应答，在合适的语境下是完全**有逻辑且相关的**。**不要仅仅因为回答简短就将其评为低分。** 只有当这类简短回答与【问题/上下文 Q】**明显不符**时，才应考虑低分。\n# 2.  **处理错别字和自我纠正:** 聊天记录中可能包含常见的打字错误（错别字）或用户先打错字随后又自行纠正的情况（例如，发送“我想去1楼”紧接着又发送“*2楼”进行更正）。在评估时，请**聚焦于用户想要表达的最终意图和信息的核心内容**，而**不应仅仅因为存在错别字或纠正过程就判定为低质量**。\n\n# # 核心评估点 (请在心中衡量)\n# 1.  **相关性 (Relevance):** 【回答 A】是否直接回应或恰当地衔接了【问题/上下文 Q】？它是在回答问题，还是完全跑题了？只有当【回答 A】与【问题/上下文 Q】**明显矛盾**、**完全不着边际**（即使考虑上下文也无法合理化），或简短回答**明显不适用于**该【问题/上下文 Q】时，才给予低分。\n# 2.  **逻辑性 (Coherence):** 【回答 A】本身是否符合基本的逻辑？结合【问题/上下文 Q】来看，这个问答对是否构成了一个符合逻辑的交流片段？是否存在明显的矛盾、混乱的内容？只有当【回答 A】**自身逻辑混乱**、**与Q存在无法解释的矛盾**时，才给予低分。\n# 3. **风格代表性**  (Style Representativeness): 评估【回答 A】是否展现了自然、独特的人类对话风格特征。【回答 A】是否带有个性化的色彩？关注点包括但不限于：是否体现了特定的语气（如友好、幽默、不耐烦、正式、脏话），是否包含口头禅、俚语、网络用语（如“yyds”、“绝绝子”）、表情符号 Emoji、颜文字、标点符号的特殊使用如“!!!”、“???”、“~”等表达、特定的缩写或短语、非标准的但一致的表达方式（如方言词汇、个人口癖）？\n# 4. **以相关性和逻辑性为主要评判标准，风格代表性仅仅作为获得5分的必要条件。**\n\n# # 评分标准 (1-5分)\n# *   **1分 (极差):** 聊天记录中的问答内容完全不相关；逻辑严重混乱/矛盾。\n# *   **2分 (差):** 大部分问答相关性很低；存在明显的逻辑问题或不连贯。\n# *   **3分 (中等):** 问答相关性一般（可能部分问答跑题或回应不充分）；逻辑上勉强说得通但不够流畅或有瑕疵。\n# *   **4分 (良好):** 大部分问答相关性好，回答了问题或恰当衔接，逻辑清晰。\n# *   **5分 (优秀):** 问答相关性强，逻辑流畅，包含了显著的、具有辨识度的人类聊天的常用特征（例如情感情绪表达、口头禅、表情符号组合、特有的句子结构、鲜明的语气）\n\n# # 输入数据\n# ```json\n# {qa_list}\n# ```\n\n# # 输出要求\n# 请严格按照以下 JSON 格式输出，包含原始的 id 和你给出的1到5的整数评分 score，不要包含任何其他文字、解释或标签！\n# [\n#   {{\n#     \"id\": \"<这里填入第1条输入数据的id值>\",\n#     \"score\": <1-5的整数评分>\n#   }},\n#   {{\n#     \"id\": \"<这里填入第2条输入数据的id值>\",\n#     \"score\": <1-5的整数评分>\n#   }}\n#   …\n# ]\n# \"\"\"\n"
  },
  {
    "path": "weclone/server/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/server/api_service.py",
    "content": "import os\n\nimport uvicorn\nfrom llamafactory.api.app import create_app\nfrom llamafactory.chat import ChatModel\n\nfrom weclone.utils.config import load_config\n\n\ndef main():\n    config = load_config(\"api_service\")\n    chat_model = ChatModel(config.model_dump(mode=\"json\"))\n    app = create_app(chat_model)\n    print(\"Visit http://localhost:{}/docs for API document.\".format(os.environ.get(\"API_PORT\", 8005)))\n    uvicorn.run(app, host=\"0.0.0.0\", port=int(os.environ.get(\"API_PORT\", 8005)), workers=1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/train/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/train/export_model.py",
    "content": "from llamafactory.train.tuner import export_model\n\n\ndef main():\n    export_model()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/train/train_sft.py",
    "content": "import json\nimport os\nfrom typing import cast\n\nfrom llamafactory.extras.misc import get_current_device\nfrom llamafactory.train.tuner import run_exp\n\nfrom weclone.data.clean.strategies import LLMCleaningStrategy\nfrom weclone.utils.config import load_config\nfrom weclone.utils.config_models import WCMakeDatasetConfig, WCTrainSftConfig\nfrom weclone.utils.log import logger\n\n\ndef main():\n    train_config: WCTrainSftConfig = cast(WCTrainSftConfig, load_config(arg_type=\"train_sft\"))\n    dataset_config: WCMakeDatasetConfig = cast(WCMakeDatasetConfig, load_config(arg_type=\"make_dataset\"))\n\n    device = get_current_device()\n    if device == \"cpu\":\n        logger.warning(\"Please note you are using CPU for training, non-Mac devices may encounter issues\")\n\n    dataset_info_path = os.path.join(dataset_config.dataset_dir, \"dataset_info.json\")\n\n    with open(dataset_info_path, \"r\", encoding=\"utf-8\") as f:\n        dataset_info = json.load(f)\n        data_path = os.path.join(\n            dataset_config.dataset_dir, dataset_info.get(train_config.dataset, {}).get(\"file_name\")\n        )\n        if not os.path.exists(data_path):\n            raise FileNotFoundError(\n                f\"Dataset file '{data_path}' does not exist, please check if make-dataset was executed\"\n            )\n\n    if not dataset_config.clean_dataset.enable_clean:\n        logger.info(\"Data cleaning is not enabled, will use the original dataset.\")\n    else:\n        cleaner = LLMCleaningStrategy(make_dataset_config=dataset_config)\n        train_config.dataset = cleaner.clean()\n\n    formatted_config = json.dumps(train_config.model_dump(mode=\"json\"), indent=4, ensure_ascii=False)\n    logger.info(f\"Fine-tuning configuration:\\n{formatted_config}\")\n\n    run_exp(train_config.model_dump(mode=\"json\"))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "weclone/utils/__init__.py",
    "content": ""
  },
  {
    "path": "weclone/utils/config.py",
    "content": "import os\nimport sys\nfrom typing import Any, Dict, cast\n\nimport pyjson5\nfrom omegaconf import OmegaConf\nfrom pydantic import BaseModel\n\nfrom .config_models import (\n    WcConfig,\n    WCInferConfig,\n    WCMakeDatasetConfig,\n    WCTrainSftConfig,\n)\nfrom .log import logger\nfrom .tools import dict_to_argv\n\n\ndef load_base_config() -> WcConfig:\n    \"\"\"Load base configuration file and create WcConfig object\"\"\"\n    config_path = os.environ.get(\"WECLONE_CONFIG_PATH\", \"./settings.jsonc\")\n    logger.info(f\"Loading configuration from: {config_path}\")\n\n    try:\n        with open(config_path, \"r\", encoding=\"utf-8\") as f:\n            s_config_dict: Dict[str, Any] = pyjson5.loads(f.read())\n    except FileNotFoundError:\n        logger.error(f\"Configuration file not found: {config_path}\")\n        sys.exit(1)\n    except Exception as e:\n        logger.error(f\"Error loading configuration file {config_path}: {e}\")\n        sys.exit(1)\n\n    # Use OmegaConf to parse configuration, then convert to Pydantic model for validation\n    try:\n        omega_config = OmegaConf.create(s_config_dict)\n        config_dict_for_validation = OmegaConf.to_container(omega_config, resolve=True)\n        if not isinstance(config_dict_for_validation, dict):\n            raise TypeError(\n                f\"Configuration should be a dictionary, but got {type(config_dict_for_validation)}\"\n            )\n        wc_config = WcConfig(**cast(Dict[str, Any], config_dict_for_validation))\n    except Exception as e:\n        logger.error(f\"Error parsing configuration with OmegaConf and WcConfig: {e}\")\n        sys.exit(1)\n\n    return wc_config\n\n\ndef create_config_by_arg_type(arg_type: str, wc_config: WcConfig) -> BaseModel:\n    \"\"\"Create corresponding configuration object based on argument type, merge common_config\"\"\"\n    if arg_type == \"cli_args\":\n        return wc_config.cli_args\n\n    common_config = 
wc_config.common_args.model_dump()\n\n    if arg_type == \"web_demo\" or arg_type == \"api_service\":\n        config_dict = {**common_config, **wc_config.infer_args.model_dump()}\n        return WCInferConfig(**config_dict)\n\n    elif arg_type == \"vllm\":\n        return wc_config.vllm_args\n\n    elif arg_type == \"test_model\":\n        return wc_config.test_model_args\n\n    elif arg_type == \"train_sft\":\n        common_config[\"include_type\"] = wc_config.make_dataset_args.include_type\n        config_dict = {**common_config, **wc_config.train_sft_args.model_dump()}\n        return WCTrainSftConfig(**config_dict)\n\n    elif arg_type == \"make_dataset\":\n        make_dataset_config = wc_config.make_dataset_args.model_dump()\n        # TODO: Should the following three parameters be moved to common?\n        train_sft_args = wc_config.train_sft_args\n        extra_values = {\n            \"dataset\": train_sft_args.dataset,\n            \"dataset_dir\": train_sft_args.dataset_dir,\n            \"cutoff_len\": train_sft_args.cutoff_len,\n        }\n        config_dict = {**common_config, **make_dataset_config, **extra_values}\n        return WCMakeDatasetConfig(**config_dict)\n\n    else:\n        raise ValueError(\"Unsupported argument type\")\n\n\ndef process_config_dict_and_argv(arg_type: str, config_pydantic: BaseModel) -> None:\n    \"\"\"Process configuration dictionary and update sys.argv\"\"\"\n    config_dict = config_pydantic.model_dump(mode=\"json\")\n\n    sys.argv += dict_to_argv(config_dict)\n\n\ndef load_config(arg_type: str) -> BaseModel:\n    \"\"\"Main function for loading configuration\"\"\"\n    # Load base configuration\n    wc_config = load_base_config()\n\n    config_pydantic = create_config_by_arg_type(arg_type, wc_config)\n\n    process_config_dict_and_argv(arg_type, config_pydantic)\n\n    return config_pydantic\n\n\nif __name__ == \"__main__\":\n    load_config(\"train_sft\")\n"
  },
  {
    "path": "weclone/utils/config_models.py",
    "content": "from enum import Enum\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom loguru import logger\nfrom pydantic import BaseModel, Field, model_validator\n\nif TYPE_CHECKING:\n    pass\n\n\nclass StrEnum(str, Enum):\n    \"\"\"\n    Pydantic-friendly string enum base class\n    Supports direct string comparison, e.g.: `if platform == PlatformType.CHAT`\n    Also supports string literal comparison, e.g.: `if platform == \"chat\"`\n    \"\"\"\n\n    def __str__(self) -> str:\n        return self.value\n\n    @classmethod\n    def _missing_(cls, value):\n        for member in cls:\n            if member.value == value:\n                return member\n        return None\n\n\nclass BaseConfigModel(BaseModel):\n    \"\"\"Base configuration model with default extra='allow'\"\"\"\n\n    model_config = {\"extra\": \"allow\"}\n\n\nclass PlatformType(StrEnum):\n    \"\"\"Data source platform\"\"\"\n\n    CHAT = \"chat\"\n    TELEGRAM = \"telegram\"\n\n\nclass LanguageType(StrEnum):\n    \"\"\"Data language\"\"\"\n\n    ZH = \"zh\"\n    EN = \"en\"\n\n\nclass DataModality(StrEnum):\n    \"\"\"Data modality\"\"\"\n\n    TEXT = \"text\"\n    IMAGE = \"image\"\n    STICKER = \"sticker\"\n    # AUDIO = \"audio\"\n    # VIDEO = \"video\"\n\n\nclass CombineStrategy(StrEnum):\n    \"\"\"Combination strategy\"\"\"\n\n    TIME_WINDOW = \"time_window\"\n\n\nclass CleanStrategy(StrEnum):\n    \"\"\"Data cleaning strategy\"\"\"\n\n    LLM = \"llm\"\n\n\nclass FinetuningType(StrEnum):\n    \"\"\"Finetuning type\"\"\"\n\n    LORA = \"lora\"\n    # FULL = \"full\"\n    # FREEZE = \"freeze\"\n\n\nclass CommonArgs(BaseConfigModel):\n    \"\"\"NOTE that all parameters here will be parsed by `HfArgumentParser`. 
Non-HfArgumentParser parameters should be placed in make_dataset_args.\"\"\"\n\n    model_name_or_path: str = Field(...)\n    adapter_name_or_path: Optional[str] = Field(None, description=\"Also as output_dir of train_sft_args\")\n    template: str = Field(..., description=\"model template\")\n    default_system: str = Field(..., description=\"default system prompt\")\n    finetuning_type: FinetuningType = Field(FinetuningType.LORA)\n    media_dir: str = Field(\"dataset/media\")\n    image_max_pixels: int = Field(409920, description=\"used in llama-factory, 409920 represents 720P\")\n    enable_thinking: bool = Field(False, description=\"used in llama-factory\")\n    trust_remote_code: bool = Field(True, description=\"used in huggingface\")\n\n\nclass CliArgs(BaseModel):\n    model_config = {\"extra\": \"forbid\"}\n    full_log: bool = Field(False)\n    log_level: str = Field(\"INFO\", description=\"DEBUG, INFO, WARNING, ERROR, CRITICAL\")\n\n\nclass LLMCleanConfig(BaseConfigModel):\n    accept_score: int = Field(\n        2,\n        description=\"Acceptable LLM scoring threshold: 1 (worst) to 5 (best). 
Data scoring below this threshold will not be used for training.\",\n    )\n    enable_thinking: bool = Field(False, description=\"used in llama-factory\")\n\n\nclass CleanDatasetConfig(BaseConfigModel):\n    enable_clean: bool = False\n    clean_strategy: CleanStrategy = CleanStrategy.LLM\n    llm: LLMCleanConfig = LLMCleanConfig(accept_score=2, enable_thinking=False)\n\n\nclass VisionApiConfig(BaseConfigModel):\n    \"\"\"Vision API specific configuration\"\"\"\n\n    enable: bool = Field(default=False, description=\"Whether to enable Vision API for image recognition\")\n    api_key: Optional[str] = None\n    api_url: Optional[str] = None\n    model_name: Optional[str] = None\n    max_workers: Optional[int] = None\n\n\nclass TelegramArgs(BaseModel):\n    model_config = {\"extra\": \"forbid\"}\n    my_id: str = Field(default=\"user1234567890\", description=\"Your own telegram id\")\n\n\nclass MakeDatasetArgs(BaseConfigModel):\n    model_config = {\"extra\": \"forbid\"}\n\n    platform: PlatformType = Field(..., description=\"Data source platform\")\n    telegram_args: Optional[TelegramArgs] = None\n    language: LanguageType = Field(LanguageType.ZH, description=\"Common language used in chat\")\n    include_type: List[DataModality] = Field([DataModality.TEXT], description=\"Types of data to include\")\n    max_image_num: int = Field(2, description=\"Maximum number of images per single data entry\")\n    blocked_words: List[str] = Field([], description=\"List of blocked words\")\n    add_time: bool = Field(False, description=\"Whether to add time to the dataset\")\n    add_relation: bool = Field(False, description=\"Whether to add chat member relationship to the dataset\")\n    single_combine_strategy: CombineStrategy = Field(\n        CombineStrategy.TIME_WINDOW,\n        description=\"Strategy for combining single person's messages into a single sentence\",\n    )\n    qa_match_strategy: CombineStrategy = Field(\n        CombineStrategy.TIME_WINDOW, 
description=\"Strategy for forming QA pairs\"\n    )\n    single_combine_time_window: int = Field(\n        2, description=\"Time window for combining single person's messages (minutes)\"\n    )\n    qa_match_time_window: int = Field(5, description=\"Time window for forming QA pairs (minutes)\")\n    combine_msg_max_length: int = Field(2048, description=\"Maximum length of combined messages\")\n    messages_max_length: int = Field(\n        2048, description=\"Maximum character count for messages, used with cutoff_len\"\n    )\n    prompt_with_history: bool = Field(\n        False, description=\"Whether to include conversation history in prompt, invalid for multimodal data\"\n    )\n    clean_dataset: CleanDatasetConfig = Field(CleanDatasetConfig(), description=\"Data cleaning configuration\")\n    online_llm_clear: bool = Field(False)\n    base_url: Optional[str] = Field(None, description=\"Base URL for online LLM\")\n    llm_api_key: Optional[str] = Field(None, description=\"API key for online LLM\")\n    model_name: Optional[str] = Field(\n        None, description=\"Model name for online LLM, recommend using larger parameter models\"\n    )\n    clean_batch_size: int = Field(10, description=\"Batch size for data cleaning\")\n    vision_api: VisionApiConfig = Field(VisionApiConfig())\n\n\nclass TrainSftArgs(BaseConfigModel):\n    stage: str = Field(\"sft\", description=\"Training stage\")\n    dataset: str = Field(..., description=\"Dataset name\")\n    dataset_dir: str = Field(\"./dataset/res_csv/sft\", description=\"Dataset directory\")\n    freeze_multi_modal_projector: bool = Field(\n        False, description=\"Whether to freeze multimodal projector during MLLM training\"\n    )\n    use_fast_tokenizer: bool = Field(True, description=\"Whether to use fast tokenizer\")\n    lora_target: str = Field(..., description=\"LoRA target modules\")\n    lora_rank: int = Field(4, description=\"LoRA rank\")\n    lora_dropout: float = Field(0.25, description=\"LoRA 
dropout\")\n    weight_decay: float = Field(0.1, description=\"Weight decay\")\n    overwrite_cache: bool = Field(True, description=\"Whether to overwrite cache\")\n    per_device_train_batch_size: int = Field(4, description=\"Training batch size per device\")\n    gradient_accumulation_steps: int = Field(8, description=\"Gradient accumulation steps\")\n    lr_scheduler_type: str = Field(\"cosine\", description=\"Learning rate scheduler type\")\n    cutoff_len: int = Field(4096, description=\"Cutoff length\")\n    logging_steps: int = Field(10, description=\"Logging steps\")\n    save_steps: int = Field(100, description=\"Model save steps\")\n    learning_rate: float = Field(1e-4, description=\"Learning rate\")\n    warmup_ratio: float = Field(0.1, description=\"Warmup ratio\")\n    num_train_epochs: int = Field(2, description=\"Number of training epochs\")\n    plot_loss: bool = Field(True, description=\"Whether to plot loss curve\")\n    fp16: bool = Field(True, description=\"Whether to use fp16\")\n    flash_attn: str = Field(\"fa2\", description=\"Flash Attention type\")\n    preprocessing_num_workers: int = Field(16, description=\"Number of preprocessing worker processes\")\n    dataloader_num_workers: int = Field(4, description=\"Number of dataloader worker processes\")\n    deepspeed: Optional[str] = Field(\n        None, description=\"DeepSpeed configuration file path for multi-GPU training\"\n    )\n    do_train: bool = Field(True)\n\n\nclass InferArgs(BaseConfigModel):\n    repetition_penalty: float = Field(1.2, description=\"Repetition penalty\")\n    temperature: float = Field(..., description=\"Temperature\")\n    top_p: float = Field(..., description=\"Top-p sampling\")\n    max_length: int = Field(..., description=\"Maximum generation length\")\n\n\nclass VllmArgs(BaseConfigModel):\n    gpu_memory_utilization: float = Field(default=0.9, description=\"vllm GPU memory utilization\")\n\n\nclass TestModelArgs(BaseConfigModel):\n    test_data_path: str = 
Field(default=\"dataset/eval/test_data-en.json\", description=\"Test data path\")\n\n\nclass CommonMethods:\n    def _parse_dataset_name(self) -> str:\n        \"\"\"Parse and process dataset name\"\"\"\n        if hasattr(self, \"include_type\") and \"image\" in getattr(self, \"include_type\", []):\n            return getattr(self, \"dataset\", \"\") + \"-vl\"\n        return getattr(self, \"dataset\", \"\")\n\n\nclass WcConfig(BaseModel):\n    model_config = {\"extra\": \"forbid\"}\n\n    version: str = Field(..., description=\"Configuration file version\")\n    common_args: CommonArgs = Field(..., description=\"Common parameters\")\n    cli_args: CliArgs = Field(..., description=\"Command line arguments\")\n    make_dataset_args: MakeDatasetArgs = Field(..., description=\"Dataset processing parameters\")\n    train_sft_args: TrainSftArgs = Field(..., description=\"SFT fine-tuning parameters\")\n    infer_args: InferArgs = Field(..., description=\"Inference parameters\")\n    vllm_args: VllmArgs = Field(VllmArgs())\n    test_model_args: TestModelArgs = Field(TestModelArgs())\n\n\nclass WCInferConfig(CommonArgs, InferArgs):\n    \"\"\"Final configuration model for Web Demo\"\"\"\n\n    pass\n\n\nclass WCTrainSftConfig(CommonArgs, TrainSftArgs, CommonMethods):\n    \"\"\"Final configuration model for SFT training\"\"\"\n\n    # Training output directory, converted from adapter_name_or_path\n    output_dir: Optional[str] = Field(None)\n    dataset: str = Field(..., description=\"Dataset name\")\n\n    @model_validator(mode=\"after\")\n    def process_config(self):\n        adapter_name_value = getattr(self, \"adapter_name_or_path\", None)\n\n        if adapter_name_value:\n            self.output_dir = adapter_name_value\n\n        self.dataset = self._parse_dataset_name()\n        # Always remove adapter_name_or_path field after processing\n        if hasattr(self, \"adapter_name_or_path\"):\n            delattr(self, \"adapter_name_or_path\")\n        if 
hasattr(self, \"include_type\"):\n            delattr(self, \"include_type\")\n\n        return self\n\n\nclass WCMakeDatasetConfig(CommonArgs, MakeDatasetArgs, CommonMethods):\n    \"\"\"Final configuration model for creating datasets\"\"\"\n\n    model_config = {\"extra\": \"allow\"}  # Explicitly set to allow\n\n    dataset: str = Field(..., description=\"Dataset name\")\n    dataset_dir: str = Field(\"./dataset/res_csv/sft\", description=\"Dataset directory\")\n    cutoff_len: int = Field(4096, description=\"Cutoff length\")\n\n    @model_validator(mode=\"after\")\n    def process_config(self):\n        # Validate Telegram configuration\n        if self.platform == PlatformType.TELEGRAM:\n            if self.telegram_args is None or self.telegram_args.my_id == \"user1234567890\":\n                logger.error(\n                    \"When using the Telegram platform, please set a valid `telegram_args.my_id`. The `from_id` in `result.json` for the messages you send represents your user ID.\"\n                )\n                exit(1)\n\n        self.dataset = self._parse_dataset_name()\n\n        return self\n"
  },
  {
    "path": "weclone/utils/i18n.py",
    "content": "from typing import Dict, List, Optional\n\n\nclass MultiLangList:\n    def __init__(self, translations: Dict[str, List[str]], default_lang=\"en\"):\n        self.translations = translations\n        self.current_lang = default_lang\n        self.default_lang = default_lang\n        # Validate that all translation lists have the same length\n        self._validate_translations()\n        # 创建反向映射字典，用于快速查找\n        self._build_reverse_mapping()\n\n    def _validate_translations(self):\n        \"\"\"Validate that all translation lists have the same length\"\"\"\n        if not self.translations:\n            raise ValueError(\"Translations dictionary cannot be empty\")\n\n        # Get the length of the first list as reference\n        first_lang = next(iter(self.translations))\n        expected_length = len(self.translations[first_lang])\n\n        # Check if all lists have the same length\n        for lang, items in self.translations.items():\n            if len(items) != expected_length:\n                raise ValueError(\n                    f\"Translation list for '{lang}' has {len(items)} items, \"\n                    f\"expected {expected_length} items (same as '{first_lang}')\"\n                )\n\n    def _build_reverse_mapping(self):\n        \"\"\"构建反向映射，用于根据文本查找对应的索引和其他语言翻译\"\"\"\n        self.text_to_index = {}  # 文本 -> (语言, 索引)\n\n        for lang, items in self.translations.items():\n            for index, text in enumerate(items):\n                self.text_to_index[text.lower()] = (lang, index)\n\n    def set_language(self, lang: str):\n        \"\"\"设置当前语言\"\"\"\n        if lang in self.translations:\n            self.current_lang = lang\n            return self\n        else:\n            print(f\"Warning: Language '{lang}' not available, using default\")\n\n    def get_items(self, lang: Optional[str] = None) -> List[str]:\n        \"\"\"获取指定语言的列表\"\"\"\n        target_lang = lang or self.current_lang\n        return 
self.translations.get(target_lang, self.translations[self.default_lang])\n\n    def get_item(self, index: int, lang: Optional[str] = None) -> str:\n        \"\"\"获取指定索引的翻译项\"\"\"\n        items = self.get_items(lang)\n        if 0 <= index < len(items):\n            return items[index]\n        raise IndexError(\"List index out of range\")\n\n    def translate_text(self, text: str, target_lang: Optional[str] = None) -> Optional[str]:\n        \"\"\"\n        根据输入的文本（中文或英文）获取另一种语言的翻译\n\n        Args:\n            text: 要翻译的文本\n            target_lang: 目标语言，如果不指定则自动判断（中文->英文，英文->中文）\n\n        Returns:\n            翻译后的文本，如果找不到则返回None\n        \"\"\"\n        text_lower = text.lower()\n\n        # 查找文本在哪个语言的哪个位置\n        if text_lower not in self.text_to_index:\n            return None\n\n        source_lang, index = self.text_to_index[text_lower]\n\n        # 如果没有指定目标语言，则自动判断\n        if target_lang is None:\n            if source_lang == \"en\":\n                target_lang = \"zh_CN\"  # 英文->中文\n            elif source_lang == \"zh_CN\":\n                target_lang = \"en\"  # 中文->英文\n            else:\n                return None\n\n        # 获取目标语言的翻译\n        if target_lang in self.translations:\n            target_items = self.translations[target_lang]\n            if index < len(target_items):\n                return target_items[index]\n\n        return None\n\n    def get_translation_pair(self, text: str) -> Dict[str, str]:\n        \"\"\"\n        获取某个文本的中英文对照\n\n        Args:\n            text: 要查找的文本\n\n        Returns:\n            包含中英文翻译的字典，例如 {'en': 'Administrator', 'zh_CN': '管理员'}\n        \"\"\"\n        text_lower = text.lower()\n\n        if text_lower not in self.text_to_index:\n            return {}\n\n        source_lang, index = self.text_to_index[text_lower]\n\n        result = {}\n        for lang in [\"en\", \"zh_CN\"]:\n            if lang in self.translations and index < len(self.translations[lang]):\n                result[lang] = 
self.translations[lang][index]\n\n        return result\n\n    def translate_batch(self, texts: List[str], target_lang: Optional[str] = None) -> List[Optional[str]]:\n        \"\"\"\n        批量翻译文本\n\n        Args:\n            texts: 要翻译的文本列表\n            target_lang: 目标语言\n\n        Returns:\n            翻译结果列表\n        \"\"\"\n        return [self.translate_text(text, target_lang) for text in texts]\n\n    def __iter__(self):\n        return iter(self.get_items())\n\n    def __len__(self):\n        return len(self.get_items())\n\n    def __getitem__(self, index):\n        return self.get_item(index)\n\n\nif __name__ == \"__main__\":\n    # 定义中英文双语数据\n    user_types_data = {\n        \"en\": [\"Administrator\", \"Regular User\", \"Guest\", \"Moderator\", \"Super Admin\"],\n        \"zh_CN\": [\"管理员\", \"普通用户\", \"访客\", \"版主\", \"超级管理员\"],\n    }\n\n    status_data = {\n        \"en\": [\"Active\", \"Inactive\", \"Pending\", \"Suspended\", \"Deleted\"],\n        \"zh_CN\": [\"活跃\", \"非活跃\", \"待定\", \"暂停\", \"已删除\"],\n    }\n\n    permission_data = {\n        \"en\": [\"Read\", \"Write\", \"Execute\", \"Delete\", \"Admin\"],\n        \"zh_CN\": [\"读取\", \"写入\", \"执行\", \"删除\", \"管理\"],\n    }\n\n    # 创建多语言列表\n    user_types = MultiLangList(user_types_data)\n    status_list = MultiLangList(status_data)\n    permissions = MultiLangList(permission_data)\n    # 使用示例\n    print(\"=== 基本翻译功能 ===\")\n    # 中文翻译为英文\n    result1 = user_types.translate_text(\"管理员\")\n    print(f\"'管理员' -> '{result1}'\")  # 输出: '管理员' -> 'Administrator'\n\n    # 英文翻译为中文\n    result2 = user_types.translate_text(\"Guest\")\n    print(f\"'Guest' -> '{result2}'\")  # 输出: 'Guest' -> '访客'\n\n    # 指定目标语言\n    result3 = user_types.translate_text(\"管理员\", target_lang=\"en\")\n    print(f\"'管理员' -> '{result3}' (指定英文)\")  # 输出: '管理员' -> 'Administrator' (指定英文)\n\n    print(\"\\n=== 获取中英文对照 ===\")\n    translation_pair = user_types.get_translation_pair(\"Administrator\")\n    print(f\"'Administrator' 
的中英文对照: {translation_pair}\")\n    # 输出: {'en': 'Administrator', 'zh_CN': '管理员'}\n\n    print(\"\\n=== 批量翻译 ===\")\n    chinese_texts = [\"管理员\", \"普通用户\", \"访客\"]\n    english_results = user_types.translate_batch(chinese_texts)\n    print(f\"批量翻译结果: {list(zip(chinese_texts, english_results))}\")\n    # 输出: [('管理员', 'Administrator'), ('普通用户', 'Regular User'), ('访客', 'Guest')]\n\n    print(\"\\n=== 状态列表翻译 ===\")\n    status_result = status_list.translate_text(\"活跃\")\n    print(f\"'活跃' -> '{status_result}'\")  # 输出: '活跃' -> 'Active'\n\n    status_result2 = status_list.translate_text(\"Pending\")\n    print(f\"'Pending' -> '{status_result2}'\")  # 输出: 'Pending' -> '待定'\n\n    print(\"\\n=== 权限翻译 ===\")\n    perm_result = permissions.translate_text(\"读取\")\n    print(f\"'读取' -> '{perm_result}'\")  # 输出: '读取' -> 'Read'\n\n    print(\"\\n=== 错误处理 ===\")\n    not_found = user_types.translate_text(\"不存在的文本\")\n    print(f\"不存在的文本翻译结果: {not_found}\")  # 输出: None\n\n    print(\"\\n=== 当前语言设置 ===\")\n    user_types.set_language(\"zh_CN\")\n    print(f\"当前语言列表: {list(user_types)}\")  # 输出中文列表\n\n    user_types.set_language(\"en\")\n    print(f\"切换后列表: {list(user_types)}\")  # 输出英文列表\n"
  },
  {
    "path": "weclone/utils/length_cdf.py",
    "content": "# Copyright 2025 the LlamaFactory team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import defaultdict\n\nimport fire\nfrom llamafactory.data import get_dataset, get_template_and_fix_tokenizer\nfrom llamafactory.hparams import get_train_args\nfrom llamafactory.model import load_tokenizer\nfrom tqdm import tqdm\n\nfrom weclone.utils.log import logger\n\n\ndef calculate_token_length(\n    text: str,\n    model_name_or_path: str = \"./models/Qwen3-32B-AWQ\",\n    template: str = \"qwen3\",\n) -> int:\n    \"\"\"Calculate the token length of the specified text\n\n    Args:\n        text: Text to calculate token length for\n        model_name_or_path: Model path\n        template: Template name\n\n    Returns:\n        Token length of the text\n    \"\"\"\n    logger.info(f\"Calculating text token length: {text[:50]}...\")\n\n    model_args, data_args, _, _, _ = get_train_args(\n        {\n            \"stage\": \"sft\",\n            \"model_name_or_path\": model_name_or_path,\n            \"template\": template,\n            \"dataset\": \"chat-sft\",\n            \"output_dir\": \"dummy_dir\",\n            \"do_train\": True,\n        }\n    )\n\n    tokenizer_module = load_tokenizer(model_args)\n    tokenizer = tokenizer_module[\"tokenizer\"]\n\n    # Directly use tokenizer to encode text\n    tokens = tokenizer.encode(text, add_special_tokens=False)\n    token_length = len(tokens)\n\n    logger.info(f\"Text token length: 
{token_length}\")\n    return token_length\n\n\ndef length_cdf(\n    model_name_or_path: str = \"./Qwen2.5-7B-Instruct\",\n    dataset: str = \"chat-sft\",\n    dataset_dir: str = \"./dataset/res_csv/sft\",\n    media_dir: str = \"./dataset/media\",\n    template: str = \"qwen\",\n    interval: int = 256,\n    image_max_pixels: int = 768 * 768,\n):\n    r\"\"\"Calculate the distribution of the input lengths in the dataset.\n\n    Usage: export CUDA_VISIBLE_DEVICES=0\n    python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default\n    \"\"\"\n    logger.info(\"Starting cutoff_len calculation......\")\n\n    model_args, data_args, training_args, _, _ = get_train_args(\n        {\n            \"stage\": \"sft\",\n            \"model_name_or_path\": model_name_or_path,\n            \"dataset\": dataset,\n            \"dataset_dir\": dataset_dir,\n            \"template\": template,\n            \"cutoff_len\": 1_000_000,\n            \"preprocessing_num_workers\": 16,\n            \"output_dir\": \"dummy_dir\",\n            \"media_dir\": media_dir,\n            \"image_max_pixels\": int(image_max_pixels),\n            \"overwrite_cache\": True,\n            \"do_train\": True,\n        }\n    )\n    tokenizer_module = load_tokenizer(model_args)\n    template_obj = get_template_and_fix_tokenizer(tokenizer_module[\"tokenizer\"], data_args)  # type: ignore\n    trainset = get_dataset(template_obj, model_args, data_args, training_args, \"sft\", **tokenizer_module)[\n        \"train_dataset\"\n    ]  # type: ignore\n    total_num = len(trainset)  # type: ignore\n    length_dict = defaultdict(int)\n    for sample in tqdm(trainset[\"input_ids\"], desc=\"Collecting lengths\"):  # type: ignore\n        length_dict[len(sample) // interval * interval] += 1\n\n    length_tuples = list(length_dict.items())\n    length_tuples.sort()\n    count_accu, prob_accu = 0, 0\n    logger.info(\" cutoff_len configuration suggestions:\")\n    
logger.warning(\"For multimodal tasks, please ensure cutoff_len is set to the maximum data length\")\n    for length, count in length_tuples:\n        count_accu += count\n        prob_accu += count / total_num * 100\n        logger.info(f\"{count_accu:d} ({prob_accu:.2f}%) samples have length < {length + interval}.\")\n\n\nif __name__ == \"__main__\":\n    fire.Fire(length_cdf)\n"
  },
  {
    "path": "weclone/utils/log.py",
    "content": "import logging\nimport os\nimport sys\nimport time\nfrom functools import wraps\n\nfrom loguru import logger\n\nlogger.remove()\n\nenv_log_level = os.getenv(\"WC_LOG_LEVEL\")\n# Initialize basic log configuration, will be reconfigured later by configure_log_level_from_config\nlogger.add(\n    sys.stderr,\n    format=\"<green><b>[WeClone]</b></green> <level>{level.name[0]}</level> | <level>{time:HH:mm:ss}</level> | <level>{message}</level>\",\n    colorize=True,\n    level=env_log_level.upper() if env_log_level else \"INFO\",\n)\n\n\nclass InterceptHandler(logging.Handler):\n    def __init__(self, level=logging.INFO):\n        super().__init__(level)\n\n    def emit(self, record):\n        # Check log level, only handle logs at specified level and above\n        if record.levelno < self.level:\n            return\n\n        timestamp = time.strftime(\"%H:%M:%S\")\n        level_color = \"\\033[36m\" if record.levelno >= logging.INFO else \"\\033[0m\"\n        reset_color = \"\\033[0m\"\n        message = f\"[{record.name}] | {level_color}{record.levelname[0]}{reset_color} | {timestamp} | {record.getMessage()}\"\n        print(message, file=sys.stderr)\n\n\n# Bridge standard logging to loguru\nintercept_handler = InterceptHandler(level=logging.INFO)\nlogging.basicConfig(handlers=[intercept_handler], level=0, force=True)\n\n\ndef capture_output(func):\n    @wraps(func)\n    def wrapper(*args, **kwargs):\n        log_sink_buffer = []\n\n        def list_sink(message):\n            log_sink_buffer.append(message.record[\"message\"])\n\n        sink_id = logger.add(list_sink, format=\"{message}\", level=\"INFO\")\n\n        original_stdout = sys.stdout\n        original_stderr = sys.stderr\n\n        class OutputTeeToGlobalLog:\n            def __init__(self, original_stream, log_method):\n                self.original_stream = original_stream\n                self.log_method = log_method\n                self.current_line_content = \"\"  # Represents the 
current state of the line to be logged\n\n            def write(self, data_chunk):\n                self.original_stream.write(data_chunk)  # Pass through to console\n\n                if data_chunk.endswith(\"\\\\r\") and \"\\\\n\" not in data_chunk:\n                    self.current_line_content = data_chunk[:-1]  # Store without the trailing \\\\r\n                    return\n\n                full_buffer = self.current_line_content + data_chunk\n                lines_to_process = full_buffer.split(\"\\\\n\")\n\n                for i in range(len(lines_to_process) - 1):\n                    line = lines_to_process[i]\n                    final_content_of_line = line\n                    last_cr = line.rfind(\"\\\\r\")\n                    if last_cr != -1:\n                        final_content_of_line = line[last_cr + 1 :]\n\n                    escaped_log = final_content_of_line.replace(\"{\", \"{{\").replace(\"}\", \"}}\")\n                    if final_content_of_line.strip() or line:\n                        self.log_method(escaped_log, raw=True)\n\n                self.current_line_content = lines_to_process[-1]\n\n            def flush(self):\n                self.original_stream.flush()\n                if self.current_line_content:\n                    final_content_of_line = self.current_line_content\n                    last_cr = self.current_line_content.rfind(\"\\\\r\")\n                    if last_cr != -1:\n                        final_content_of_line = self.current_line_content[last_cr + 1 :]\n\n                    escaped_log = final_content_of_line.replace(\"{\", \"{{\").replace(\"}\", \"}}\")\n                    if final_content_of_line.strip() or self.current_line_content:\n                        self.log_method(escaped_log, raw=True)\n                    self.current_line_content = \"\"\n\n        sys.stdout = OutputTeeToGlobalLog(original_stdout, logger.opt(raw=True).info)\n        sys.stderr = OutputTeeToGlobalLog(original_stderr, 
logger.opt(raw=True).error)\n\n        try:\n            func(*args, **kwargs)\n        finally:\n            sys.stdout = original_stdout\n            sys.stderr = original_stderr\n            logger.remove(sink_id)\n\n    return wrapper\n\n\ndef configure_log_level_from_config():\n    \"\"\"\n    Read log level from config file and set complete log configuration\n    Should be called after config is loaded\n    \"\"\"\n    log_level = \"INFO\"  # default value\n\n    try:\n        from weclone.utils.config import load_config\n\n        cli_config = load_config(arg_type=\"cli_args\")\n        log_level = getattr(cli_config, \"log_level\", \"INFO\")\n    except Exception as e:\n        logger.warning(f\"Unable to load log level from config, using default INFO level: {e}\")\n\n    logger.remove()\n\n    logger.add(\n        sys.stderr,\n        format=\"<green><b>[WeClone]</b></green> <level>{level.name[0]}</level> | <level>{time:HH:mm:ss}</level> | <level>{message}</level>\",\n        colorize=True,\n        level=log_level.upper(),\n    )\n\n    logger.add(\n        \"logs/weclone.log\",\n        rotation=\"1 day\",\n        retention=\"7 days\",\n        compression=\"zip\",\n        level=\"DEBUG\",\n        format=\"{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}\",\n        encoding=\"utf-8\",\n        enqueue=True,\n    )\n\n    intercept_handler.setLevel(log_level.upper())\n\n    logger.info(f\"Log level has been set to: {log_level.upper()}\")\n"
  },
  {
    "path": "weclone/utils/retry.py",
    "content": "import random\nimport time\nfrom functools import wraps\nfrom typing import Callable, List, Optional\n\nfrom weclone.utils.log import logger\n\n\ndef retry_on_http_error(\n    max_retries: int = 3,\n    base_delay: float = 1.0,\n    max_delay: float = 60.0,\n    backoff_factor: float = 2.0,\n    jitter: bool = True,\n    retry_on_status: Optional[List[int]] = None,\n    retry_on_exceptions: Optional[List[type]] = None,\n):\n    \"\"\"\n    HTTP请求重试装饰器，专门处理429状态码和其他网络错误\n\n    Args:\n        max_retries: 最大重试次数\n        base_delay: 基础延迟时间（秒）\n        max_delay: 最大延迟时间（秒）\n        backoff_factor: 退避因子，每次重试延迟时间乘以此因子\n        jitter: 是否添加随机抖动，避免雷群效应\n        retry_on_status: 需要重试的HTTP状态码列表，默认包含429, 500, 502, 503, 504\n        retry_on_exceptions: 需要重试的异常类型列表\n    \"\"\"\n    if retry_on_status is None:\n        retry_on_status = [429, 500, 502, 503, 504]\n\n    if retry_on_exceptions is None:\n        retry_on_exceptions = [ConnectionError, TimeoutError]\n\n    def decorator(func):\n        @wraps(func)\n        def wrapper(*args, **kwargs):\n            for attempt in range(max_retries + 1):\n                try:\n                    result = func(*args, **kwargs)\n\n                    # 检查是否是HTTP响应对象\n                    if hasattr(result, \"status_code\"):\n                        if result.status_code in retry_on_status:\n                            if attempt < max_retries:\n                                delay = _calculate_delay(\n                                    attempt, base_delay, max_delay, backoff_factor, jitter\n                                )\n                                logger.warning(\n                                    f\"HTTP请求返回状态码 {result.status_code}，\"\n                                    f\"第 {attempt + 1}/{max_retries + 1} 次尝试，\"\n                                    f\"将在 {delay:.2f} 秒后重试...\"\n                                )\n                                time.sleep(delay)\n                                
continue\n                            else:\n                                logger.error(\n                                    f\"HTTP请求在 {max_retries + 1} 次尝试后最终失败，状态码: {result.status_code}\"\n                                )\n                                return result\n\n                    return result\n\n                except Exception as e:\n                    should_retry_on_exception = any(\n                        isinstance(e, exc_type) for exc_type in retry_on_exceptions\n                    )\n\n                    if should_retry_on_exception and attempt < max_retries:\n                        delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)\n                        logger.warning(\n                            f\"请求异常: {type(e).__name__}: {e}，\"\n                            f\"第 {attempt + 1}/{max_retries + 1} 次尝试，\"\n                            f\"将在 {delay:.2f} 秒后重试...\"\n                        )\n                        time.sleep(delay)\n                        continue\n                    elif should_retry_on_exception:\n                        logger.error(f\"请求在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}\")\n                        raise\n                    else:\n                        logger.error(f\"未知错误，不进行重试: {type(e).__name__}: {e}\")\n                        raise\n\n            return None  # 理论上不会执行到这里\n\n        return wrapper\n\n    return decorator\n\n\ndef retry_openai_api(\n    max_retries: int = 3,\n    base_delay: float = 1.0,\n    max_delay: float = 60.0,\n    backoff_factor: float = 2.0,\n    jitter: bool = True,\n):\n    \"\"\"\n    专门用于OpenAI API调用的重试装饰器\n    处理OpenAI特有的异常类型\n    \"\"\"\n\n    def decorator(func):\n        @wraps(func)\n        def wrapper(*args, **kwargs):\n            for attempt in range(max_retries + 1):\n                try:\n                    return func(*args, **kwargs)\n\n                except Exception as e:\n                    # 
检查是否是速率限制或临时错误\n                    error_message = str(e).lower()\n                    should_retry = (\n                        \"rate limit\" in error_message\n                        or \"429\" in error_message\n                        or \"too many requests\" in error_message\n                        or \"server error\" in error_message\n                        or \"timeout\" in error_message\n                        or \"connection\" in error_message\n                    )\n\n                    if should_retry and attempt < max_retries:\n                        delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)\n                        logger.warning(\n                            f\"OpenAI API调用失败: {type(e).__name__}: {e}，\"\n                            f\"第 {attempt + 1}/{max_retries + 1} 次尝试，\"\n                            f\"将在 {delay:.2f} 秒后重试...\"\n                        )\n                        time.sleep(delay)\n                        continue\n                    else:\n                        if attempt >= max_retries:\n                            logger.error(\n                                f\"OpenAI API调用在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}\"\n                            )\n                        raise\n\n            return None\n\n        return wrapper\n\n    return decorator\n\n\ndef _calculate_delay(\n    attempt: int, base_delay: float, max_delay: float, backoff_factor: float, jitter: bool\n) -> float:\n    \"\"\"计算重试延迟时间\"\"\"\n    delay = base_delay * (backoff_factor**attempt)\n    delay = min(delay, max_delay)\n\n    if jitter:\n        # 添加±20%的随机抖动\n        jitter_range = delay * 0.2\n        delay += random.uniform(-jitter_range, jitter_range)\n        delay = max(0, delay)  # 确保延迟不为负数\n\n    return delay\n\n\nclass RetryConfig:\n    \"\"\"重试配置类，用于统一管理重试参数\"\"\"\n\n    def __init__(\n        self,\n        max_retries: int = 3,\n        base_delay: float = 1.0,\n        max_delay: 
float = 60.0,\n        backoff_factor: float = 2.0,\n        jitter: bool = True,\n        retry_on_status: Optional[List[int]] = None,\n        retry_on_exceptions: Optional[List[type]] = None,\n    ):\n        self.max_retries = max_retries\n        self.base_delay = base_delay\n        self.max_delay = max_delay\n        self.backoff_factor = backoff_factor\n        self.jitter = jitter\n        self.retry_on_status = retry_on_status or [429, 500, 502, 503, 504]\n        self.retry_on_exceptions = retry_on_exceptions or [ConnectionError, TimeoutError]\n\n    def apply_to_function(self, func: Callable) -> Callable:\n        \"\"\"将重试配置应用到函数上\"\"\"\n        return retry_on_http_error(\n            max_retries=self.max_retries,\n            base_delay=self.base_delay,\n            max_delay=self.max_delay,\n            backoff_factor=self.backoff_factor,\n            jitter=self.jitter,\n            retry_on_status=self.retry_on_status,\n            retry_on_exceptions=self.retry_on_exceptions,\n        )(func)\n\n\n# 预定义的重试配置\nAGGRESSIVE_RETRY = RetryConfig(\n    max_retries=5,\n    base_delay=0.5,\n    max_delay=30.0,\n    backoff_factor=1.5,\n)\n\nCONSERVATIVE_RETRY = RetryConfig(\n    max_retries=2,\n    base_delay=2.0,\n    max_delay=10.0,\n    backoff_factor=2.0,\n)\n\nAPI_RETRY = RetryConfig(\n    max_retries=3,\n    base_delay=1.0,\n    max_delay=60.0,\n    backoff_factor=2.0,\n    retry_on_status=[429, 500, 502, 503, 504],\n)\n"
  },
  {
    "path": "weclone/utils/tools.py",
    "content": "def dict_to_argv(d):\n    argv = []\n    for k, v in d.items():\n        argv.append(\"--\" + k)\n        if v is not None:\n            argv.append(str(v))\n    return argv\n"
  }
]