Full Code of lipku/LiveTalking for AI

main 9db55d8ba80a cached

115 files

3.0 MB

801.9k tokens

671 symbols

1 requests

Download .txt

Showing preview only (3,206K chars total). Download the full file or copy to clipboard to get everything.

Repository: lipku/LiveTalking
Branch: main
Commit: 9db55d8ba80a
Files: 115
Total size: 3.0 MB

Directory structure:
gitextract_75g1ac60/

├── .github/
│   └── FUNDING.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README-EN.md
├── README.md
├── app.py
├── assets/
│   └── faq.md
├── baseasr.py
├── basereal.py
├── hubertasr.py
├── lightreal.py
├── lipasr.py
├── lipreal.py
├── llm.py
├── logger.py
├── museasr.py
├── musereal.py
├── musetalk/
│   ├── genavatar.py
│   ├── myutil.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── audio_processor.py
│   │   ├── blending.py
│   │   ├── dwpose/
│   │   │   ├── default_runtime.py
│   │   │   └── rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py
│   │   ├── face_detection/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── detection/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── core.py
│   │   │   │   └── sfd/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── bbox.py
│   │   │   │       ├── detect.py
│   │   │   │       ├── net_s3fd.py
│   │   │   │       └── sfd_detector.py
│   │   │   ├── models.py
│   │   │   └── utils.py
│   │   ├── face_parsing/
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── resnet.py
│   │   ├── preprocessing.py
│   │   ├── training_utils.py
│   │   └── utils.py
│   └── whisper/
│       ├── audio2feature.py
│       └── whisper/
│           ├── __init__.py
│           ├── __main__.py
│           ├── assets/
│           │   ├── gpt2/
│           │   │   ├── merges.txt
│           │   │   ├── special_tokens_map.json
│           │   │   ├── tokenizer_config.json
│           │   │   └── vocab.json
│           │   ├── mel_filters.npz
│           │   └── multilingual/
│           │       ├── added_tokens.json
│           │       ├── merges.txt
│           │       ├── special_tokens_map.json
│           │       ├── tokenizer_config.json
│           │       └── vocab.json
│           ├── audio.py
│           ├── decoding.py
│           ├── model.py
│           ├── normalizers/
│           │   ├── __init__.py
│           │   ├── basic.py
│           │   ├── english.json
│           │   └── english.py
│           ├── tokenizer.py
│           ├── transcribe.py
│           └── utils.py
├── requirements.txt
├── ttsreal.py
├── ultralight/
│   ├── audio2feature.py
│   ├── face_detect_utils/
│   │   ├── base_module.py
│   │   ├── detect_face.py
│   │   ├── get_landmark.py
│   │   ├── mean_face.txt
│   │   └── pfld_mobileone.py
│   ├── genavatar-bak.py
│   ├── genavatar.py
│   └── unet.py
├── wav2lip/
│   ├── audio.py
│   ├── face_detection/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── detection/
│   │   │   ├── __init__.py
│   │   │   ├── core.py
│   │   │   └── sfd/
│   │   │       ├── __init__.py
│   │   │       ├── bbox.py
│   │   │       ├── detect.py
│   │   │       ├── net_s3fd.py
│   │   │       └── sfd_detector.py
│   │   ├── models.py
│   │   └── utils.py
│   ├── genavatar.py
│   └── hparams.py
├── web/
│   ├── asr/
│   │   ├── index.html
│   │   ├── main.js
│   │   ├── pcm.js
│   │   ├── recorder-core.js
│   │   ├── wav.js
│   │   └── wsconnecter.js
│   ├── chat.html
│   ├── client.js
│   ├── dashboard.html
│   ├── echo.html
│   ├── echoapi.html
│   ├── rtcpush.html
│   ├── rtcpushapi-asr.html
│   ├── rtcpushapi.html
│   ├── rtcpushchat.html
│   ├── srs.sdk.js
│   ├── webrtc.html
│   ├── webrtcapi-asr.html
│   ├── webrtcapi-custom.html
│   ├── webrtcapi.html
│   ├── webrtcchat.html
│   └── whep.js
└── webrtc.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/FUNDING.yml
================================================
github: [lipku]


================================================
FILE: .gitignore
================================================
__pycache__/
build/
*.egg-info/
*.so
*.mp4

tmp*
trial*/

data
data_utils/face_tracking/3DMM/*
data_utils/face_parsing/79999_iter.pth

pretrained
*.mp4
.DS_Store
workspace/log_ngp.txt
.idea
models/
*.log
dist


================================================
FILE: Dockerfile
================================================
# Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# Image for running the LiveTalking server (app.py) inside a conda
# environment named "nerfstream". Build with the repository root as the
# build context so that requirements.txt and the sources can be copied.

ARG BASE_IMAGE=nvcr.io/nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
FROM $BASE_IMAGE

RUN apt-get update -yq --fix-missing \
 && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \
    pkg-config \
    wget \
    cmake \
    curl \
    git \
    vim

#ENV PYTHONDONTWRITEBYTECODE=1
#ENV PYTHONUNBUFFERED=1

# nvidia-container-runtime
#ENV NVIDIA_VISIBLE_DEVICES all
#ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics

# Miniconda install location. This is the same directory the original
# "~/miniconda3" resolved to, assuming the build runs as root (HOME=/root).
ENV CONDA_DIR=/root/miniconda3

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
 && sh Miniconda3-latest-Linux-x86_64.sh -b -u -p $CONDA_DIR \
 && rm Miniconda3-latest-Linux-x86_64.sh

# Every RUN starts a fresh /bin/sh, so "source ~/.bashrc" and
# "conda activate" cannot carry state into later layers (and "source" is not
# available in /bin/sh at all). Putting the env's bin directory first on PATH
# makes conda, pip and python3 resolve to the nerfstream environment in every
# later instruction and in CMD.
ENV PATH=$CONDA_DIR/envs/nerfstream/bin:$CONDA_DIR/bin:$PATH

# -y: there is no terminal to answer the confirmation prompt during a build.
RUN conda create -y -n nerfstream python=3.10

# Interactive shells (docker run -it ... bash) still get the env activated.
RUN conda init bash \
 && echo "conda activate nerfstream" >> ~/.bashrc

RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
# install depend
RUN conda install -y -n nerfstream pytorch==1.12.1 torchvision==0.13.1 cudatoolkit=11.3 -c pytorch
COPY requirements.txt ./
RUN pip install -r requirements.txt

# additional libraries
# RUN pip install "git+https://github.com/facebookresearch/pytorch3d.git"
# RUN pip install tensorflow-gpu==2.8.0

# RUN pip uninstall protobuf
# RUN pip install protobuf==3.20.1

# RUN conda install ffmpeg
# Copy ../python_rtmpstream /python_rtmpstream
# WORKDIR /python_rtmpstream/python
# RUN pip install .

# COPY sources must live inside the build context; a "../" path cannot reach
# outside of it, so copy the context (the repository root) itself.
COPY . /nerfstream
WORKDIR /nerfstream
CMD ["python3", "app.py"]


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [livetalking@lipku]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README-EN.md
================================================
# English | [中文版](./README.md)  
 <p align="center">
 <img src="./assets/LiveTalking-logo.jpg" align="middle" width = "600"/>
</p>
<p align="center">
    <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
    <a href="https://github.com/lipku/LiveTalking/releases"><img src="https://img.shields.io/github/v/release/lipku/LiveTalking?color=ffa"></a>
    <a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
    <a href="https://github.com/lipku/LiveTalking/graphs/contributors"><img src="https://img.shields.io/github/contributors/lipku/LiveTalking?color=c4f042&style=flat-square"></a>
    <a href="https://github.com/lipku/LiveTalking/network/members"><img src="https://img.shields.io/github/forks/lipku/LiveTalking?color=8ae8ff"></a>
    <a href="https://github.com/lipku/LiveTalking/stargazers"><img src="https://img.shields.io/github/stars/lipku/LiveTalking?color=ccf"></a>
</p>

A real-time interactive streaming digital human system enabling synchronized audio-video conversation, which basically meets commercial application standards.  
[wav2lip Demo](https://www.bilibili.com/video/BV1scwBeyELA/) | [ernerf Demo](https://www.bilibili.com/video/BV1G1421z73r/) | [musetalk Demo](https://www.bilibili.com/video/BV1gm421N7vQ/)  
Domestic Mirror Repository: <https://gitee.com/lipku/LiveTalking> 


## Features
1. Supports multiple digital human models: ernerf, musetalk, wav2lip, Ultralight-Digital-Human.
2. Supports voice cloning.
3. Supports interrupting the digital human while it is speaking.
4. Supports full-body video stitching.
5. Supports WebRTC and virtual camera output.
6. Supports motion choreography: plays custom videos when the digital human is not speaking.
7. Supports custom digital human avatars.

## 1. Installation

Tested on Ubuntu 24.04, Python 3.10, PyTorch 2.5.0, and CUDA 12.4.

### 1.1 Install Dependencies

```bash
conda create -n nerfstream python=3.10
conda activate nerfstream
# If your CUDA version is not 12.4 (check via "nvidia-smi"), install the corresponding PyTorch version from <https://pytorch.org/get-started/previous-versions/>
conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.4 -c pytorch -c nvidia
pip install -r requirements.txt
``` 
For common installation issues, refer to the [FAQ](https://livetalking-doc.readthedocs.io/en/latest/faq.html).  
For CUDA environment setup on Linux, refer to this article: <https://zhuanlan.zhihu.com/p/674972886>  
Troubleshooting for video connection issues: <https://mp.weixin.qq.com/s/MVUkxxhV2cgMMHalphr2cg>


## 2. Quick Start
- Download Models  
Quark Cloud Drive: <https://pan.quark.cn/s/83a750323ef0>    
Google Drive: <https://drive.google.com/drive/folders/1FOC_MD6wdogyyX_7V1d4NDIO7P9NlSAJ?usp=sharing>  
1. Copy `wav2lip256.pth` to the `models` directory of this project and rename it to `wav2lip.pth`.  
2. Extract the `wav2lip256_avatar1.tar.gz` archive and copy the entire extracted folder to `data/avatars` of this project.

- Run the Project  
Execute: `python app.py --transport webrtc --model wav2lip --avatar_id wav2lip256_avatar1`  
<font color=red>The server must open the following ports: TCP: 8010; UDP: 1-65535 </font>  

You can access the client in two ways:  
(1) Open `http://serverip:8010/webrtcapi.html` in a browser. First click "start" to play the digital human video; then enter any text in the input box and submit it. The digital human will broadcast the text.  
(2) Use the desktop client (download link: <https://pan.quark.cn/s/d7192d8ac19b>).  

- Quick Experience  
Visit <https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking> and create an instance with this image to run the project successfully immediately.

If you cannot access Hugging Face, run the following command before starting the project:
```
export HF_ENDPOINT=https://hf-mirror.com
``` 


## 3. More Usage
For detailed usage instructions: <https://livetalking-doc.readthedocs.io/>
  
## 4. Docker Run  
No prior installation is required; run directly with Docker:
```
docker run --gpus all -it --network=host --rm registry.cn-zhangjiakou.aliyuncs.com/codewithgpu3/lipku-livetalking:toza2irpHZ
```
The code is located in `/root/livetalking`. First run `git pull` to fetch the latest code, then execute commands as described in Sections 2 and 3.

The following images are available:
- AutoDL Image: <https://www.codewithgpu.com/i/lipku/livetalking/base>   
[AutoDL Tutorial](https://livetalking-doc.readthedocs.io/en/latest/autodl/README.html)
- UCloud Image: <https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking>  
Supports opening any port; no additional SRS service deployment is required.  
[UCloud Tutorial](https://livetalking-doc.readthedocs.io/en/latest/ucloud/ucloud.html) 


## 5. Performance
- Performance mainly depends on CPU and GPU: Each video stream compression consumes CPU resources, and CPU performance is positively correlated with video resolution; each lip-sync inference depends on GPU performance.  
- The number of concurrent streams when the digital human is not speaking depends on CPU performance; the number of concurrent streams when multiple digital humans are speaking simultaneously depends on GPU performance.  
- In the backend logs, `inferfps` refers to the GPU inference frame rate, and `finalfps` refers to the final streaming frame rate. Both need to be above 25 fps to achieve real-time performance. If `inferfps` is above 25 but `finalfps` is below 25, it indicates insufficient CPU performance.  

- Real-Time Inference Performance  

| Model       | GPU Model  | FPS  |
| :---------- | :--------- | :--- |
| wav2lip256  | RTX 3060   | 60   |
| wav2lip256  | RTX 3080Ti | 120  |
| musetalk    | RTX 3080Ti | 42   |
| musetalk    | RTX 3090   | 45   |
| musetalk    | RTX 4090   | 72   | 

A GPU of RTX 3060 or higher is sufficient for wav2lip256, while musetalk requires an RTX 3080Ti or higher.

## 6. Commercial Version
The following extended features are available for users who are familiar with the open-source project and need to expand product capabilities:
1. High-definition wav2lip model.
2. Full voice interaction: supports interrupting the digital human’s response via a wake word or button to ask a new question.
3. Real-time synchronized subtitles: provides the frontend with events for the start and end of each sentence spoken by the digital human.
4. Each connection can specify a corresponding avatar and voice; accelerated avatar image loading.
5. Supports avatars (digital human images) with unlimited duration.
6. Provides a real-time audio stream input interface.
7. Transparent background for the digital human, supporting dynamic background overlay.
8. Real-time avatar switching, supporting multiple digital humans in the same scene.
9. Camera‑driven digital human movements and facial expressions.

For more details: <https://livetalking-doc.readthedocs.io/en/latest/service.html>

## 7. Statement
Videos developed based on this project and published on platforms such as Bilibili, WeChat Channels, and Douyin must include the LiveTalking watermark and logo.

---
If this project is helpful to you, please give it a "Star". Contributions from developers interested in improving this project are also welcome.
* Knowledge Planet (for high-quality FAQs, best practices, and Q&A): https://t.zsxq.com/7NMyO  
* WeChat Official Account: 数字人技术 (Digital Human Technology)    
<img src="./assets/qrcode-wechat.jpg" align="middle" />

================================================
FILE: README.md
================================================
 # [English](./README-EN.md) | 中文版  
 <p align="center">
 <img src="./assets/LiveTalking-logo.jpg" align="middle" width = "300"/>
</p>
<p align="center">
    <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
    <a href="https://github.com/lipku/LiveTalking/releases"><img src="https://img.shields.io/github/v/release/lipku/LiveTalking?color=ffa"></a>
    <a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
    <a href="https://github.com/lipku/LiveTalking/graphs/contributors"><img src="https://img.shields.io/github/contributors/lipku/LiveTalking?color=c4f042&style=flat-square"></a>
    <a href="https://github.com/lipku/LiveTalking/network/members"><img src="https://img.shields.io/github/forks/lipku/LiveTalking?color=8ae8ff"></a>
    <a href="https://github.com/lipku/LiveTalking/stargazers"><img src="https://img.shields.io/github/stars/lipku/LiveTalking?color=ccf"></a>
</p>

 实时交互流式数字人，实现音视频同步对话。基本可以达到商用效果  
[wav2lip效果](https://www.bilibili.com/video/BV1scwBeyELA/) | [ernerf效果](https://www.bilibili.com/video/BV1G1421z73r/) | [musetalk效果](https://www.bilibili.com/video/BV1gm421N7vQ/)  
国内镜像地址:<https://gitee.com/lipku/LiveTalking> 

## 为避免与3d数字人混淆，原项目metahuman-stream改名为livetalking，原有链接地址继续可用

## Features
1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human
2. 支持声音克隆
3. 支持数字人说话被打断
4. 支持webrtc、虚拟摄像头输出
5. 支持动作编排：不说话时播放自定义视频
6. 支持多并发
7. 支持自定义数字人形象

## 1. Installation

Tested on Ubuntu 24.04, Python 3.10, PyTorch 2.5.0 and CUDA 12.4

### 1.1 Install dependency

```bash
conda create -n nerfstream python=3.10
conda activate nerfstream
#如果cuda版本不为12.4(运行nvidia-smi确认版本)，根据<https://pytorch.org/get-started/previous-versions/>安装对应版本的pytorch 
conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.4 -c pytorch -c nvidia
pip install -r requirements.txt
``` 
安装常见问题[FAQ](https://livetalking-doc.readthedocs.io/zh-cn/latest/faq.html)  
linux cuda环境搭建可以参考这篇文章 <https://zhuanlan.zhihu.com/p/674972886>  
视频连不上解决方法 <https://mp.weixin.qq.com/s/MVUkxxhV2cgMMHalphr2cg>


## 2. Quick Start
- 下载模型  
夸克云盘<https://pan.quark.cn/s/83a750323ef0>    
Google Drive <https://drive.google.com/drive/folders/1FOC_MD6wdogyyX_7V1d4NDIO7P9NlSAJ?usp=sharing>  
将wav2lip256.pth拷到本项目的models下, 重命名为wav2lip.pth;  
将wav2lip256_avatar1.tar.gz解压后整个文件夹拷到本项目的data/avatars下
- 运行  
python app.py --transport webrtc --model wav2lip --avatar_id wav2lip256_avatar1  
<font color=red>服务端需要开放端口 tcp:8010; udp:1-65535 </font>  
客户端可以选用以下两种方式:  
(1)用浏览器打开http://serverip:8010/webrtcapi.html , 先点'start',播放数字人视频；然后在文本框输入任意文字，提交。数字人播报该段文字  
(2)用客户端方式, 下载地址<https://pan.quark.cn/s/d7192d8ac19b>   

- 快速体验  
[在线镜像](https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking) 用该镜像创建实例即可运行成功

安装运行过程中如果访问不了huggingface，在运行前
```
export HF_ENDPOINT=https://hf-mirror.com
``` 


## 3. More Usage
使用说明: <https://livetalking-doc.readthedocs.io/>
  
## 4. Docker Run  
不需要前面的安装，直接运行。
```
docker run --gpus all -it --network=host --rm registry.cn-beijing.aliyuncs.com/codewithgpu2/lipku-metahuman-stream:2K9qaMBu8v
```
代码在/root/metahuman-stream，先git pull拉一下最新代码，然后执行命令同第2、3步 

提供如下网络镜像
- ucloud镜像: <https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking>  
[ucloud教程](https://livetalking-doc.readthedocs.io/zh-cn/latest/ucloud/ucloud.html) 
- autodl镜像: <https://www.codewithgpu.com/i/lipku/livetalking/base>   
[autodl教程](https://livetalking-doc.readthedocs.io/zh-cn/latest/autodl/README.html)，autodl由于不能开放udp端口，需要部署转发服务，如果看不到视频，请自行部署srs或turn服务



## 5. 性能
- 性能主要跟cpu和gpu相关，每路视频压缩需要消耗cpu，cpu性能与视频分辨率正相关；每路口型推理跟gpu性能相关。  
- 不说话时的并发数跟cpu相关，同时说话的并发数跟gpu相关。  
- 后端日志inferfps表示显卡推理帧率，finalfps表示最终推流帧率。两者都要在25以上才能实时。如果inferfps在25以上，finalfps达不到25表示cpu性能不足。  
- 实时推理性能  

模型    |显卡型号   |fps
:----   |:---   |:---
wav2lip256 | 3060    | 60
wav2lip256 | 3080Ti  | 120
musetalk   | 3080Ti  | 42
musetalk   | 3090    | 45
musetalk   | 4090    | 72 

wav2lip256显卡3060以上即可，musetalk需要3080Ti以上。 

## 6. 商业版
提供如下扩展功能，适用于对开源项目已经比较熟悉，需要扩展产品功能的用户
1. 高清wav2lip模型
2. 完全语音交互，数字人回答过程中支持通过唤醒词或者按钮打断提问
3. 实时同步字幕，给前端提供数字人每句话播报开始、结束事件
4. 每个连接可以指定对应avatar和音色，avatar图片加载加速
5. 支持不限时长的数字人形象avatar
6. 提供实时音频流输入接口
7. 数字人透明背景，叠加动态背景 
8. avatar实时切换, 同一个画面里支持多个数字人  
9. 摄像头驱动数字人形象动作和表情  

更多详情<https://livetalking-doc.readthedocs.io/zh-cn/latest/service.html>

## 7. 声明
基于本项目开发并发布在B站、视频号、抖音等网站上的视频需带上LiveTalking水印和标识。

---  
如果本项目对你有帮助，帮忙点个star。也欢迎感兴趣的朋友一起来完善该项目.
* 知识星球: https://t.zsxq.com/7NMyO 沉淀高质量常见问题、最佳实践经验、问题解答  
* 微信公众号：数字人技术    
<img src="./assets/qrcode-wechat.jpg" align="middle" />



================================================
FILE: app.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

# server.py
from flask import Flask, render_template,send_from_directory,request, jsonify
from flask_sockets import Sockets
import base64
import json
#import gevent
#from gevent import pywsgi
#from geventwebsocket.handler import WebSocketHandler
import re
import numpy as np
from threading import Thread,Event
#import multiprocessing
import torch.multiprocessing as mp

from aiohttp import web
import aiohttp
import aiohttp_cors
from aiortc import RTCPeerConnection, RTCSessionDescription,RTCIceServer,RTCConfiguration
from aiortc.rtcrtpsender import RTCRtpSender
from webrtc import HumanPlayer
from basereal import BaseReal
from llm import llm_response

import argparse
import random
import shutil
import asyncio
import torch
from typing import Dict
from logger import logger
import gc


# Flask application object. The HTTP routes registered in __main__ are attached
# to an aiohttp application instead, not to this object.
app = Flask(__name__)
#sockets = Sockets(app)
# Registry of live sessions, keyed by session id. offer() temporarily stores
# None for a session while its BaseReal instance is still being built.
nerfreals:Dict[int, BaseReal] = {} #sessionid:BaseReal
# Parsed command-line options; assigned in the __main__ block.
opt = None
# Model and avatar data loaded once in __main__ and passed to every session.
model = None
avatar = None
        

#####webrtc###############################
# Every RTCPeerConnection that is currently tracked; closed in on_shutdown().
pcs = set()

def randN(N)->int:
    '''Return a random integer that has exactly N decimal digits.'''
    lower_bound = 10 ** (N - 1)
    upper_bound = 10 ** N - 1
    return random.randint(lower_bound, upper_bound)

def build_nerfreal(sessionid:int)->BaseReal:
    """Create the digital-human renderer for one session.

    Args:
        sessionid: Identifier stored on the options object handed to the renderer.

    Returns:
        A LipReal, MuseReal or LightReal instance, selected by ``opt.model``.

    Raises:
        ValueError: if ``opt.model`` is not one of the supported model names.
    """
    import copy

    # Sessions are built concurrently in executor threads. Give each one its own
    # shallow copy of the options so that setting sessionid for one session
    # cannot overwrite the value another session is about to read.
    session_opt = copy.copy(opt)
    session_opt.sessionid = sessionid
    if session_opt.model == 'wav2lip':
        from lipreal import LipReal
        nerfreal = LipReal(session_opt,model,avatar)
    elif session_opt.model == 'musetalk':
        from musereal import MuseReal
        nerfreal = MuseReal(session_opt,model,avatar)
    # elif opt.model == 'ernerf':
    #     from nerfreal import NeRFReal
    #     nerfreal = NeRFReal(opt,model,avatar)
    elif session_opt.model == 'ultralight':
        from lightreal import LightReal
        nerfreal = LightReal(session_opt,model,avatar)
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(f"unsupported model type: {session_opt.model!r}")
    return nerfreal

#@app.route('/offer', methods=['POST'])
async def offer(request):
    """Handle a WebRTC SDP offer: create a session and answer the offer.

    Expects a JSON body with ``sdp`` and ``type``. Builds a renderer for a new
    session id, attaches its audio and video tracks to a new peer connection,
    and returns the SDP answer together with the ``sessionid``.
    """
    params = await request.json()
    offer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])

    # if len(nerfreals) >= opt.max_session:
    #     logger.info('reach max session')
    #     return web.Response(
    #         content_type="application/json",
    #         text=json.dumps(
    #             {"code": -1, "msg": "reach max session"}
    #         ),
    #     )
    sessionid = randN(6) #len(nerfreals)
    while sessionid in nerfreals:
        # Random ids can collide; never overwrite a live session.
        sessionid = randN(6)
    # Reserve the id while the renderer is built in a worker thread.
    nerfreals[sessionid] = None
    logger.info('sessionid=%d, session num=%d',sessionid,len(nerfreals))
    try:
        nerfreal = await asyncio.get_running_loop().run_in_executor(None, build_nerfreal,sessionid)
    except Exception:
        # Do not leave the None placeholder behind if construction fails.
        nerfreals.pop(sessionid, None)
        raise
    nerfreals[sessionid] = nerfreal
    
    #ice_server = RTCIceServer(urls='stun:stun.l.google.com:19302')
    ice_server = RTCIceServer(urls='stun:stun.freeswitch.org:3478')
    pc = RTCPeerConnection(configuration=RTCConfiguration(iceServers=[ice_server]))
    pcs.add(pc)

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        logger.info("Connection state is %s" % pc.connectionState)
        state = pc.connectionState
        if state == "failed":
            await pc.close()
        if state in ("failed", "closed"):
            # Closing a failed connection moves it to "closed", so this cleanup
            # can be reached more than once for one session; it must be
            # idempotent (a plain `del` raised KeyError the second time).
            pcs.discard(pc)
            nerfreals.pop(sessionid, None)
            # gc.collect()

    player = HumanPlayer(nerfreal)
    # Audio is added first, so the video transceiver is at index 1 below.
    pc.addTrack(player.audio)
    pc.addTrack(player.video)
    capabilities = RTCRtpSender.getCapabilities("video")
    preferences = list(filter(lambda x: x.name == "H264", capabilities.codecs))
    preferences += list(filter(lambda x: x.name == "VP8", capabilities.codecs))
    preferences += list(filter(lambda x: x.name == "rtx", capabilities.codecs))
    transceiver = pc.getTransceivers()[1]
    transceiver.setCodecPreferences(preferences)

    await pc.setRemoteDescription(offer)

    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)

    #return jsonify({"sdp": pc.localDescription.sdp, "type": pc.localDescription.type})

    return web.Response(
        content_type="application/json",
        text=json.dumps(
            {"sdp": pc.localDescription.sdp, "type": pc.localDescription.type, "sessionid":sessionid}
        ),
    )

async def human(request):
    """Accept a text message for a session.

    JSON body: ``sessionid`` (defaults to 0), ``type`` ('echo' speaks ``text``
    directly, 'chat' sends ``text`` through llm_response in an executor), and
    an optional truthy ``interrupt`` that flushes the current speech first.
    Always answers with a JSON ``code``/``msg`` envelope.
    """
    try:
        payload = await request.json()

        sid = payload.get('sessionid',0)
        if payload.get('interrupt'):
            nerfreals[sid].flush_talk()

        msg_type = payload['type']
        if msg_type=='echo':
            nerfreals[sid].put_msg_txt(payload['text'])
        elif msg_type=='chat':
            # Fire and forget: the LLM call runs in the default executor.
            loop = asyncio.get_event_loop()
            loop.run_in_executor(None, llm_response, payload['text'],nerfreals[sid])
            #nerfreals[sessionid].put_msg_txt(res)

        result = {"code": 0, "msg":"ok"}
    except Exception as e:
        logger.exception('exception:')
        result = {"code": -1, "msg": str(e)}

    return web.Response(
        content_type="application/json",
        text=json.dumps(result),
    )

async def interrupt_talk(request):
    """Flush the speech of the session named by ``sessionid`` (default 0).

    Answers with a JSON ``code``/``msg`` envelope; failures are reported as code -1.
    """
    try:
        payload = await request.json()
        sid = payload.get('sessionid',0)
        nerfreals[sid].flush_talk()
        result = {"code": 0, "msg":"ok"}
    except Exception as e:
        logger.exception('exception:')
        result = {"code": -1, "msg": str(e)}

    return web.Response(
        content_type="application/json",
        text=json.dumps(result),
    )

async def humanaudio(request):
    """Accept an uploaded audio file and pass its bytes to a session.

    Form fields: ``sessionid`` (converted to int, defaults to 0) and ``file``
    (the uploaded audio). Answers with a JSON ``code``/``msg`` envelope.
    """
    try:
        form= await request.post()
        sessionid = int(form.get('sessionid',0))
        fileobj = form["file"]
        # The upload's filename is not needed; only its content is forwarded.
        filebytes=fileobj.file.read()
        nerfreals[sessionid].put_audio_file(filebytes)

        return web.Response(
            content_type="application/json",
            text=json.dumps(
                {"code": 0, "msg":"ok"}
            ),
        )
    except Exception as e:
        logger.exception('exception:')
        return web.Response(
            content_type="application/json",
            text=json.dumps(
                {"code": -1, "msg": str(e)}
            ),
        )

async def set_audiotype(request):
    """Switch a session to a custom audio/video state.

    JSON body: ``sessionid`` (defaults to 0), ``audiotype`` (required) and
    ``reinit`` (optional, defaults to True, matching set_custom_state's own
    default). Answers with a JSON ``code``/``msg`` envelope.
    """
    try:
        params = await request.json()

        sessionid = params.get('sessionid',0)
        # `reinit` used to be mandatory here even though the callee defaults it.
        nerfreals[sessionid].set_custom_state(params['audiotype'],params.get('reinit', True))

        return web.Response(
            content_type="application/json",
            text=json.dumps(
                {"code": 0, "msg":"ok"}
            ),
        )
    except Exception as e:
        logger.exception('exception:')
        return web.Response(
            content_type="application/json",
            text=json.dumps(
                {"code": -1, "msg": str(e)}
            ),
        )

async def record(request):
    """Start or stop recording for a session.

    JSON body: ``sessionid`` (defaults to 0) and ``type``, which is
    'start_record' or 'end_record'; any other value is accepted and ignored.
    Answers with a JSON ``code``/``msg`` envelope.
    """
    try:
        payload = await request.json()

        sid = payload.get('sessionid',0)
        action = payload['type']
        if action=='start_record':
            # nerfreals[sessionid].put_msg_txt(params['text'])
            nerfreals[sid].start_recording()
        elif action=='end_record':
            nerfreals[sid].stop_recording()
        result = {"code": 0, "msg":"ok"}
    except Exception as e:
        logger.exception('exception:')
        result = {"code": -1, "msg": str(e)}

    return web.Response(
        content_type="application/json",
        text=json.dumps(result),
    )

async def is_speaking(request):
    """Report whether the session's digital human is currently speaking.

    JSON body: ``sessionid`` (defaults to 0). Returns ``{"code": 0, "data": bool}``
    on success. Like the other handlers, failures (bad JSON, unknown session,
    session still being built) are reported as ``{"code": -1, "msg": ...}``
    instead of surfacing as an unhandled server error.
    """
    try:
        params = await request.json()

        sessionid = params.get('sessionid',0)
        return web.Response(
            content_type="application/json",
            text=json.dumps(
                {"code": 0, "data": nerfreals[sessionid].is_speaking()}
            ),
        )
    except Exception as e:
        logger.exception('exception:')
        return web.Response(
            content_type="application/json",
            text=json.dumps(
                {"code": -1, "msg": str(e)}
            ),
        )


async def on_shutdown(app):
    """aiohttp shutdown hook: close all tracked peer connections, then forget them."""
    await asyncio.gather(*(pc.close() for pc in pcs))
    pcs.clear()

async def post(url,data):
    """POST ``data`` to ``url`` and return the response body as text.

    An aiohttp.ClientError is logged and swallowed; the function then returns None.
    """
    try:
        async with aiohttp.ClientSession() as session, session.post(url,data=data) as response:
            return await response.text()
    except aiohttp.ClientError as e:
        logger.info(f'Error: {e}')

async def run(push_url,sessionid):
    """Create a session and push its audio/video to ``push_url``.

    Builds the renderer, creates a local SDP offer, POSTs it to ``push_url``
    and applies the returned SDP as the remote answer.

    Raises:
        RuntimeError: if the push server could not be reached (post() returned
            None). The peer connection and session entry are released first.
    """
    nerfreal = await asyncio.get_event_loop().run_in_executor(None, build_nerfreal,sessionid)
    nerfreals[sessionid] = nerfreal

    pc = RTCPeerConnection()
    pcs.add(pc)

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        logger.info("Connection state is %s" % pc.connectionState)
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    player = HumanPlayer(nerfreal)
    pc.addTrack(player.audio)
    pc.addTrack(player.video)

    await pc.setLocalDescription(await pc.createOffer())
    answer = await post(push_url,pc.localDescription.sdp)
    if answer is None:
        # post() swallows client errors and returns None; passing that on as
        # an SDP string only fails later with an unrelated-looking error.
        await pc.close()
        pcs.discard(pc)
        nerfreals.pop(sessionid, None)
        raise RuntimeError(f'no SDP answer received from push url {push_url}')
    await pc.setRemoteDescription(RTCSessionDescription(sdp=answer,type='answer'))
##########################################
# os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
# os.environ['MULTIPROCESSING_METHOD'] = 'forkserver'                                                    
if __name__ == '__main__':
    # Entry point: parse the command line, load the selected model and avatar
    # once, then start the aiohttp server (plus push sessions for rtcpush).
    mp.set_start_method('spawn')
    parser = argparse.ArgumentParser()
    
    # audio FPS
    parser.add_argument('--fps', type=int, default=50, help="audio fps,must be 50")
    # sliding window left-middle-right length (unit: 20ms)
    parser.add_argument('-l', type=int, default=10)
    parser.add_argument('-m', type=int, default=8)
    parser.add_argument('-r', type=int, default=10)

    parser.add_argument('--W', type=int, default=450, help="GUI width")
    parser.add_argument('--H', type=int, default=450, help="GUI height")

    #musetalk opt
    parser.add_argument('--avatar_id', type=str, default='avator_1', help="define which avatar in data/avatars")
    #parser.add_argument('--bbox_shift', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=16, help="infer batch")

    parser.add_argument('--customvideo_config', type=str, default='', help="custom action json")

    parser.add_argument('--tts', type=str, default='edgetts', help="tts service type") #xtts gpt-sovits cosyvoice fishtts tencent doubao indextts2 azuretts
    parser.add_argument('--REF_FILE', type=str, default="zh-CN-YunxiaNeural",help="参考文件名或语音模型ID，默认值为 edgetts的语音模型ID zh-CN-YunxiaNeural, 若--tts指定为azuretts, 可以使用Azure语音模型ID, 如zh-CN-XiaoxiaoMultilingualNeural")
    parser.add_argument('--REF_TEXT', type=str, default=None)
    parser.add_argument('--TTS_SERVER', type=str, default='http://127.0.0.1:9880') # http://localhost:9000
    # parser.add_argument('--CHARACTER', type=str, default='test')
    # parser.add_argument('--EMOTION', type=str, default='default')

    parser.add_argument('--model', type=str, default='musetalk') #musetalk wav2lip ultralight

    parser.add_argument('--transport', type=str, default='rtcpush') #webrtc rtcpush virtualcam
    parser.add_argument('--push_url', type=str, default='http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream') #rtmp://localhost/live/livestream

    parser.add_argument('--max_session', type=int, default=1)  #multi session count
    parser.add_argument('--listenport', type=int, default=8010, help="web listen port")

    opt = parser.parse_args()
    #app.config.from_object(opt)
    #print(app.config)
    # Optional JSON file describing custom actions; stored on opt.customopt
    # (an empty list when no file is given).
    opt.customopt = []
    if opt.customvideo_config!='':
        with open(opt.customvideo_config,'r') as file:
            opt.customopt = json.load(file)

    # Load the model and avatar for the selected backend and run its warm-up.
    # These module-level objects are what build_nerfreal() passes to each session.
    # if opt.model == 'ernerf':       
    #     from nerfreal import NeRFReal,load_model,load_avatar
    #     model = load_model(opt)
    #     avatar = load_avatar(opt) 
    if opt.model == 'musetalk':
        from musereal import MuseReal,load_model,load_avatar,warm_up
        logger.info(opt)
        model = load_model()
        avatar = load_avatar(opt.avatar_id) 
        warm_up(opt.batch_size,model)      
    elif opt.model == 'wav2lip':
        from lipreal import LipReal,load_model,load_avatar,warm_up
        logger.info(opt)
        model = load_model("./models/wav2lip.pth")
        avatar = load_avatar(opt.avatar_id)
        warm_up(opt.batch_size,model,256)
    elif opt.model == 'ultralight':
        from lightreal import LightReal,load_model,load_avatar,warm_up
        logger.info(opt)
        model = load_model(opt)
        avatar = load_avatar(opt.avatar_id)
        warm_up(opt.batch_size,avatar,160)

    # if opt.transport=='rtmp':
    #     thread_quit = Event()
    #     nerfreals[0] = build_nerfreal(0)
    #     rendthrd = Thread(target=nerfreals[0].render,args=(thread_quit,))
    #     rendthrd.start()
    # virtualcam: a single session (id 0) is created and rendered in its own
    # thread right away.
    if opt.transport=='virtualcam':
        thread_quit = Event()
        nerfreals[0] = build_nerfreal(0)
        rendthrd = Thread(target=nerfreals[0].render,args=(thread_quit,))
        rendthrd.start()

    #############################################################################
    # aiohttp application: request bodies up to 100 MiB, the JSON endpoints
    # defined above, and static files served from the 'web' directory.
    appasync = web.Application(client_max_size=1024**2*100)
    appasync.on_shutdown.append(on_shutdown)
    appasync.router.add_post("/offer", offer)
    appasync.router.add_post("/human", human)
    appasync.router.add_post("/humanaudio", humanaudio)
    appasync.router.add_post("/set_audiotype", set_audiotype)
    appasync.router.add_post("/record", record)
    appasync.router.add_post("/interrupt_talk", interrupt_talk)
    appasync.router.add_post("/is_speaking", is_speaking)
    appasync.router.add_static('/',path='web')

    # Configure default CORS settings.
    cors = aiohttp_cors.setup(appasync, defaults={
            "*": aiohttp_cors.ResourceOptions(
                allow_credentials=True,
                expose_headers="*",
                allow_headers="*",
            )
        })
    # Configure CORS on all routes.
    for route in list(appasync.router.routes()):
        cors.add(route)

    # Pick the demo page named in the start-up log message for this transport.
    pagename='webrtcapi.html'
    if opt.transport=='rtmp':
        pagename='echoapi.html'
    elif opt.transport=='rtcpush':
        pagename='rtcpushapi.html'
    logger.info('start http server; http://<serverip>:'+str(opt.listenport)+'/'+pagename)
    logger.info('如果使用webrtc，推荐访问webrtc集成前端: http://<serverip>:'+str(opt.listenport)+'/dashboard.html')
    def run_server(runner):
        # Run the aiohttp site on a fresh event loop, listening on all
        # interfaces at opt.listenport, and keep the loop running forever.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(runner.setup())
        site = web.TCPSite(runner, '0.0.0.0', opt.listenport)
        loop.run_until_complete(site.start())
        if opt.transport=='rtcpush':
            # One push session per configured slot; session 0 uses push_url
            # unchanged, session k>0 appends k to it.
            for k in range(opt.max_session):
                push_url = opt.push_url
                if k!=0:
                    push_url = opt.push_url+str(k)
                loop.run_until_complete(run(push_url,k))
        loop.run_forever()    
    #Thread(target=run_server, args=(web.AppRunner(appasync),)).start()
    run_server(web.AppRunner(appasync))

    #app.on_shutdown.append(on_shutdown)
    #app.router.add_post("/offer", offer)

    # print('start websocket server')
    # server = pywsgi.WSGIServer(('0.0.0.0', 8000), app, handler_class=WebSocketHandler)
    # server.serve_forever()
    
    


================================================
FILE: assets/faq.md
================================================
1.  pytorch3d安装不成功\
    下载源码编译

```bash
git clone https://github.com/facebookresearch/pytorch3d.git
cd pytorch3d
python setup.py install
```

2.  websocket连接报错\
    修改python/site-packages/flask\_sockets.py

```python
self.url_map.add(Rule(rule, endpoint=f)) 改成 
self.url_map.add(Rule(rule, endpoint=f, websocket=True))
```

3. protobuf版本过高

```bash
pip uninstall protobuf
pip install protobuf==3.20.1
```

4. 数字人不眨眼\
训练模型时添加如下步骤

> Obtain AU45 for eyes blinking.\
> Run FeatureExtraction in OpenFace, rename and move the output CSV file to data/\<ID>/au.csv.

将au.csv拷到本项目的data目录下

5. 数字人添加背景图片

```bash
python app.py --bg_img bc.jpg
```

6. 用自己训练的模型报错维度不匹配\
训练模型时用wav2vec提取音频特征

```bash
python main.py data/ --workspace workspace/ -O --iters 100000 --asr_model cpierse/wav2vec2-large-xlsr-53-esperanto
```

7. rtmp推流时ffmpeg版本不对\
网上版友反馈是需要4.2.2版本。我也不确定具体哪些版本不行。原则是运行一下ffmpeg，打印的信息里需要有libx264，如果没有肯定不行
```
--enable-libx264
```
8. 替换自己训练的模型
```text
.
├── data
│   ├── data_kf.json （对应训练数据中的transforms_train.json）
│   ├── au.csv
│   └── pretrained
│       └── ngp_kf.pth （对应训练后的模型ngp_ep00xx.pth）
```


其他参考
https://github.com/lipku/metahuman-stream/issues/43#issuecomment-2008930101




================================================
FILE: baseasr.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import time
import numpy as np

import queue
from queue import Queue
import torch.multiprocessing as mp

from basereal import BaseReal


class BaseASR:
    """Base class for audio front-ends that buffer PCM chunks and emit features.

    Incoming audio chunks are queued by put_audio_frame(); subclasses implement
    run_step() to pull chunks with get_audio_frame(), push features onto
    ``feat_queue`` and echo the raw audio onto ``output_queue``.
    """

    def __init__(self, opt, parent:"BaseReal" = None):
        self.opt = opt
        self.parent = parent

        self.fps = opt.fps # 20 ms per frame
        self.sample_rate = 16000
        self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms * 16000 / 1000)
        self.queue = Queue()
        self.output_queue = mp.Queue()

        self.batch_size = opt.batch_size

        self.frames = []
        self.stride_left_size = opt.l
        self.stride_right_size = opt.r
        #self.context_size = 10
        self.feat_queue = mp.Queue(2)

        #self.warm_up()

    def flush_talk(self):
        """Discard all audio chunks that are queued but not yet consumed."""
        # Hold the queue's own lock while touching its internal deque, so a
        # concurrent put()/get() cannot interleave with the clear.
        with self.queue.mutex:
            self.queue.queue.clear()

    def put_audio_frame(self,audio_chunk,datainfo:dict): #16khz 20ms pcm
        """Queue one audio chunk together with its event info."""
        self.queue.put((audio_chunk,datainfo))

    #return frame:audio pcm; type: 0-normal speak, 1-silence; eventpoint:custom event sync with audio
    def get_audio_frame(self):
        """Return the next ``(frame, type, eventpoint)`` triple.

        Waits up to 10 ms for a queued chunk (type 0). If none arrives, returns
        custom audio from the parent when its ``curr_state`` is above 1 (type is
        that state), otherwise a zero-filled chunk (type 1). ``eventpoint`` is
        None in both fallback cases.
        """
        try:
            frame,eventpoint = self.queue.get(block=True,timeout=0.01)
            frame_type = 0
            #print(f'[INFO] get frame {frame.shape}')
        except queue.Empty:
            if self.parent and self.parent.curr_state>1: # play custom audio
                frame = self.parent.get_audio_stream(self.parent.curr_state)
                frame_type = self.parent.curr_state
            else:
                frame = np.zeros(self.chunk, dtype=np.float32)
                frame_type = 1
            eventpoint = None

        return frame,frame_type,eventpoint 

    #return frame:audio pcm; type: 0-normal speak, 1-silence; eventpoint:custom event sync with audio
    def get_audio_out(self): 
        """Block until an item is available on ``output_queue`` and return it."""
        return self.output_queue.get()
    
    def warm_up(self):
        """Pre-fill the frame buffer with left+right stride frames.

        Every fetched frame is also put on ``output_queue``; the first
        ``stride_left_size`` of them are then removed from that queue again.
        """
        for _ in range(self.stride_left_size + self.stride_right_size):
            audio_frame,frame_type,eventpoint=self.get_audio_frame()
            self.frames.append(audio_frame)
            self.output_queue.put((audio_frame,frame_type,eventpoint))
        for _ in range(self.stride_left_size):
            self.output_queue.get()

    def run_step(self):
        """Process one batch of audio; a no-op here, overridden by subclasses."""
        pass

    def get_next_feat(self,block,timeout):
        """Return the next item from ``feat_queue`` (arguments as for Queue.get)."""
        return self.feat_queue.get(block,timeout)

================================================
FILE: basereal.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import math
import torch
import numpy as np

import subprocess
import os
import time
import cv2
import glob
import resampy

import queue
from queue import Queue
from threading import Thread, Event
from io import BytesIO
import soundfile as sf

import asyncio
from av import AudioFrame, VideoFrame

import av
from fractions import Fraction

from ttsreal import EdgeTTS,SovitsTTS,XTTS,CosyVoiceTTS,FishTTS,TencentTTS,DoubaoTTS,IndexTTS2,AzureTTS
from logger import logger

from tqdm import tqdm
def read_imgs(img_list):
    """Load every path in ``img_list`` with cv2.imread, showing a progress bar.

    Returns the loaded images as a list, in the same order as ``img_list``.
    """
    logger.info('reading images...')
    return [cv2.imread(img_path) for img_path in tqdm(img_list)]

def play_audio(quit_event,queue):
    """Play byte chunks from ``queue`` through PyAudio until ``quit_event`` is set.

    Args:
        quit_event: threading.Event-like object; playback stops once it is set.
        queue: queue of byte strings written to the output stream as 16 kHz mono
            samples in PyAudio format 8 (pyaudio.paInt16).
    """
    import pyaudio
    # The `queue` parameter shadows the queue module, so import Empty directly.
    from queue import Empty

    p = pyaudio.PyAudio()
    stream = p.open(
        rate=16000,
        channels=1,
        format=pyaudio.paInt16,
        output=True,
        output_device_index=1,  # NOTE(review): hard-coded output device index
    )
    stream.start_stream()
    # while queue.qsize() <= 0:
    #     time.sleep(0.1)
    try:
        while not quit_event.is_set():
            try:
                # Poll with a timeout: a fully blocking get() would never
                # re-check quit_event once the producer stops, so the thread
                # (and anyone joining it) would hang forever.
                chunk = queue.get(block=True, timeout=0.1)
            except Empty:
                continue
            stream.write(chunk)
    finally:
        stream.close()
        p.terminate()

class BaseReal:
    def __init__(self, opt):
        """Set up per-session state: TTS backend, recording state, custom clips.

        ``opt`` must provide ``fps``, ``sessionid``, ``tts`` and ``customopt``.
        If ``opt.tts`` names no known backend, ``self.tts`` is left unset.
        """
        self.opt = opt
        self.sample_rate = 16000
        self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)
        self.sessionid = self.opt.sessionid

        # Map each supported --tts value to the class implementing it.
        tts_backends = {
            "edgetts": EdgeTTS,
            "gpt-sovits": SovitsTTS,
            "xtts": XTTS,
            "cosyvoice": CosyVoiceTTS,
            "fishtts": FishTTS,
            "tencent": TencentTTS,
            "doubao": DoubaoTTS,
            "indextts2": IndexTTS2,
            "azuretts": AzureTTS,
        }
        tts_cls = tts_backends.get(opt.tts)
        if tts_cls is not None:
            self.tts = tts_cls(opt,self)

        self.speaking = False

        # Recording state; the ffmpeg pipes are created by start_recording().
        self.recording = False
        self._record_video_pipe = None
        self._record_audio_pipe = None
        self.width = self.height = 0

        # Custom clip state, each dict keyed by audiotype.
        self.curr_state=0
        self.custom_img_cycle = {}
        self.custom_audio_cycle = {}
        self.custom_audio_index = {}
        self.custom_index = {}
        self.custom_opt = {}
        self.__loadcustom()

    def put_msg_txt(self,msg,datainfo:dict={}):
        self.tts.put_msg_txt(msg,datainfo)
    
    def put_audio_frame(self,audio_chunk,datainfo:dict={}): #16khz 20ms pcm
        self.asr.put_audio_frame(audio_chunk,datainfo)

    def put_audio_file(self,filebyte,datainfo:dict={}): 
        input_stream = BytesIO(filebyte)
        stream = self.__create_bytes_stream(input_stream)
        streamlen = stream.shape[0]
        idx=0
        while streamlen >= self.chunk:  #and self.state==State.RUNNING
            self.put_audio_frame(stream[idx:idx+self.chunk],datainfo)
            streamlen -= self.chunk
            idx += self.chunk
    
    def __create_bytes_stream(self,byte_stream):
        """Decode ``byte_stream`` with soundfile into a float32 sample array.

        Multi-channel audio is reduced to its first channel, and non-empty audio
        whose rate differs from ``self.sample_rate`` is resampled to that rate.
        """
        #byte_stream=BytesIO(buffer)
        samples, src_rate = sf.read(byte_stream) # [T*sample_rate,] float64
        logger.info(f'[INFO]put audio stream {src_rate}: {samples.shape}')
        samples = samples.astype(np.float32)

        has_multiple_channels = samples.ndim > 1
        if has_multiple_channels:
            logger.info(f'[WARN] audio has {samples.shape[1]} channels, only use the first.')
            samples = samples[:, 0]

        needs_resample = src_rate != self.sample_rate and samples.shape[0]>0
        if needs_resample:
            logger.info(f'[WARN] audio sample rate is {src_rate}, resampling into {self.sample_rate}.')
            samples = resampy.resample(x=samples, sr_orig=src_rate, sr_new=self.sample_rate)

        return samples

    def flush_talk(self):
        """Interrupt speech by flushing both the TTS backend and the ASR front-end."""
        self.tts.flush_talk()
        self.asr.flush_talk()

    def is_speaking(self)->bool:
        """Return the ``speaking`` flag, which process_frames() updates per frame."""
        return self.speaking
    
    def __loadcustom(self):
        for item in self.opt.customopt:
            logger.info(item)
            input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
            input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
            self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
            self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
            self.custom_audio_index[item['audiotype']] = 0
            self.custom_index[item['audiotype']] = 0
            self.custom_opt[item['audiotype']] = item

    def init_customindex(self):
        self.curr_state=0
        for key in self.custom_audio_index:
            self.custom_audio_index[key]=0
        for key in self.custom_index:
            self.custom_index[key]=0

    def notify(self,eventpoint):
        """Log the given event point."""
        logger.info("notify:%s",eventpoint)

    def start_recording(self):
        """开始录制视频"""
        if self.recording:
            return

        command = ['ffmpeg',
                    '-y', '-an',
                    '-f', 'rawvideo',
                    '-vcodec','rawvideo',
                    '-pix_fmt', 'bgr24', #像素格式
                    '-s', "{}x{}".format(self.width, self.height),
                    '-r', str(25),
                    '-i', '-',
                    '-pix_fmt', 'yuv420p', 
                    '-vcodec', "h264",
                    #'-f' , 'flv',                  
                    f'temp{self.opt.sessionid}.mp4']
        self._record_video_pipe = subprocess.Popen(command, shell=False, stdin=subprocess.PIPE)

        acommand = ['ffmpeg',
                    '-y', '-vn',
                    '-f', 's16le',
                    #'-acodec','pcm_s16le',
                    '-ac', '1',
                    '-ar', '16000',
                    '-i', '-',
                    '-acodec', 'aac',
                    #'-f' , 'wav',                  
                    f'temp{self.opt.sessionid}.aac']
        self._record_audio_pipe = subprocess.Popen(acommand, shell=False, stdin=subprocess.PIPE)

        self.recording = True
        # self.recordq_video.queue.clear()
        # self.recordq_audio.queue.clear()
        # self.container = av.open(path, mode="w")
    
        # process_thread = Thread(target=self.record_frame, args=())
        # process_thread.start()
    
    def record_video_data(self,image):
        if self.width == 0:
            print("image.shape:",image.shape)
            self.height,self.width,_ = image.shape
        if self.recording:
            self._record_video_pipe.stdin.write(image.tostring())

    def record_audio_data(self,frame):
        if self.recording:
            self._record_audio_pipe.stdin.write(frame.tostring())
    
    # def record_frame(self): 
    #     videostream = self.container.add_stream("libx264", rate=25)
    #     videostream.codec_context.time_base = Fraction(1, 25)
    #     audiostream = self.container.add_stream("aac")
    #     audiostream.codec_context.time_base = Fraction(1, 16000)
    #     init = True
    #     framenum = 0       
    #     while self.recording:
    #         try:
    #             videoframe = self.recordq_video.get(block=True, timeout=1)
    #             videoframe.pts = framenum #int(round(framenum*0.04 / videostream.codec_context.time_base))
    #             videoframe.dts = videoframe.pts
    #             if init:
    #                 videostream.width = videoframe.width
    #                 videostream.height = videoframe.height
    #                 init = False
    #             for packet in videostream.encode(videoframe):
    #                 self.container.mux(packet)
    #             for k in range(2):
    #                 audioframe = self.recordq_audio.get(block=True, timeout=1)
    #                 audioframe.pts = int(round((framenum*2+k)*0.02 / audiostream.codec_context.time_base))
    #                 audioframe.dts = audioframe.pts
    #                 for packet in audiostream.encode(audioframe):
    #                     self.container.mux(packet)
    #             framenum += 1
    #         except queue.Empty:
    #             print('record queue empty,')
    #             continue
    #         except Exception as e:
    #             print(e)
    #             #break
    #     for packet in videostream.encode(None):
    #         self.container.mux(packet)
    #     for packet in audiostream.encode(None):
    #         self.container.mux(packet)
    #     self.container.close()
    #     self.recordq_video.queue.clear()
    #     self.recordq_audio.queue.clear()
    #     print('record thread stop')
		
    def stop_recording(self):
        """停止录制视频"""
        if not self.recording:
            return
        self.recording = False 
        self._record_video_pipe.stdin.close()  #wait() 
        self._record_video_pipe.wait()
        self._record_audio_pipe.stdin.close()
        self._record_audio_pipe.wait()
        cmd_combine_audio = f"ffmpeg -y -i temp{self.opt.sessionid}.aac -i temp{self.opt.sessionid}.mp4 -c:v copy -c:a copy data/record.mp4"
        os.system(cmd_combine_audio) 
        #os.remove(output_path)

    def mirror_index(self,size, index):
        """Map an ever-growing ``index`` onto ``0..size-1`` in ping-pong order.

        Even passes run forward (0, 1, ..., size-1) and odd passes run backward
        (size-1, ..., 0).
        """
        #size = len(self.coord_list_cycle)
        turn, res = divmod(index, size)
        return res if turn % 2 == 0 else size - res - 1
    
    def get_audio_stream(self,audiotype):
        idx = self.custom_audio_index[audiotype]
        stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
        self.custom_audio_index[audiotype] += self.chunk
        if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]:
            self.curr_state = 1  #当前视频不循环播放，切换到静音状态
        return stream
    
    def set_custom_state(self,audiotype, reinit=True):
        print('set_custom_state:',audiotype)
        if self.custom_audio_index.get(audiotype) is None:
            return
        self.curr_state = audiotype
        if reinit:
            self.custom_audio_index[audiotype] = 0
            self.custom_index[audiotype] = 0

    def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
        """Consume rendered results and emit video/audio frames until told to quit.

        Reads ``(res_frame, idx, audio_frames)`` tuples from ``self.res_frame_queue``,
        picks or composes the output image, stamps the watermark, and sends the
        image and its audio frames either to a virtual camera plus a PyAudio
        thread (``opt.transport == 'virtualcam'``) or to the given WebRTC track
        queues. Every frame is also offered to the recording pipes.

        Args:
            quit_event: Event that ends the loop once it is set.
            loop: asyncio event loop on which the track queues are fed
                (used for the non-virtualcam transports).
            audio_track: Track object whose ``_queue`` receives audio frames.
            video_track: Track object whose ``_queue`` receives video frames.
        """
        enable_transition = False  # False disables the cross-fade transition, True enables it
        
        if enable_transition:
            _last_speaking = False
            _transition_start = time.time()
            _transition_duration = 0.1  # transition duration, compared against time.time() differences
            _last_silent_frame = None  # cache of the last silent frame
            _last_speaking_frame = None  # cache of the last speaking frame
        
        if self.opt.transport=='virtualcam':
            import pyvirtualcam
            # The camera is created lazily once the first frame's size is known.
            vircam = None

            audio_tmp = queue.Queue(maxsize=3000)
            audio_thread = Thread(target=play_audio, args=(quit_event,audio_tmp,), daemon=True, name="pyaudio_stream")
            audio_thread.start()
        
        while not quit_event.is_set():
            try:
                # The 1 s timeout lets the loop re-check quit_event regularly.
                res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            
            if enable_transition:
                # Detect a change between the speaking and silent states
                current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0)
                if current_speaking != _last_speaking:
                    logger.info(f"状态切换：{'说话' if _last_speaking else '静音'} → {'说话' if current_speaking else '静音'}")
                    _transition_start = time.time()
                _last_speaking = current_speaking

            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # both audio frames are non-speech, so only the full image is needed
                self.speaking = False
                audiotype = audio_frames[0][1]
                if self.custom_index.get(audiotype) is not None: # a custom video exists for this audiotype
                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
                    target_frame = self.custom_img_cycle[audiotype][mirindex]
                    self.custom_index[audiotype] += 1
                else:
                    target_frame = self.frame_list_cycle[idx]
                
                if enable_transition:
                    # Speaking -> silent transition
                    if time.time() - _transition_start < _transition_duration and _last_speaking_frame is not None:
                        alpha = min(1.0, (time.time() - _transition_start) / _transition_duration)
                        combine_frame = cv2.addWeighted(_last_speaking_frame, 1-alpha, target_frame, alpha, 0)
                    else:
                        combine_frame = target_frame
                    # Cache the silent frame
                    _last_silent_frame = combine_frame.copy()
                else:
                    combine_frame = target_frame
            else:
                self.speaking = True
                try:
                    current_frame = self.paste_back_frame(res_frame,idx)
                except Exception as e:
                    # A frame that cannot be composed is skipped, together with
                    # its audio frames.
                    logger.warning(f"paste_back_frame error: {e}")
                    continue
                if enable_transition:
                    # Silent -> speaking transition
                    if time.time() - _transition_start < _transition_duration and _last_silent_frame is not None:
                        alpha = min(1.0, (time.time() - _transition_start) / _transition_duration)
                        combine_frame = cv2.addWeighted(_last_silent_frame, 1-alpha, current_frame, alpha, 0)
                    else:
                        combine_frame = current_frame
                    # Cache the speaking frame
                    _last_speaking_frame = combine_frame.copy()
                else:
                    combine_frame = current_frame

            # NOTE(review): with transitions disabled, combine_frame can be the
            # cached frame object itself, so the watermark is drawn onto it in place.
            cv2.putText(combine_frame, "LiveTalking", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (128,128,128), 1)
            if self.opt.transport=='virtualcam':
                if vircam==None:
                    height, width,_= combine_frame.shape
                    vircam = pyvirtualcam.Camera(width=width, height=height, fps=25, fmt=pyvirtualcam.PixelFormat.BGR,print_fps=True)
                vircam.send(combine_frame)
            else: #webrtc
                image = combine_frame
                new_frame = VideoFrame.from_ndarray(image, format="bgr24")
                # This method runs outside the event loop's thread, so the frame
                # is handed to the loop thread-safely.
                asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame,None)), loop)
            self.record_video_data(combine_frame)

            for audio_frame in audio_frames:
                frame,type,eventpoint = audio_frame
                # Scale the samples by 32767 and convert them to int16.
                frame = (frame * 32767).astype(np.int16)

                if self.opt.transport=='virtualcam':
                    audio_tmp.put(frame.tobytes()) #TODO
                else: #webrtc
                    new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
                    new_frame.planes[0].update(frame.tobytes())
                    new_frame.sample_rate=16000
                    asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame,eventpoint)), loop)
                self.record_audio_data(frame)
            if self.opt.transport=='virtualcam':
                vircam.sleep_until_next_frame()
        if self.opt.transport=='virtualcam':
            # NOTE(review): vircam is still None here if no frame was ever
            # processed, in which case vircam.close() would fail - confirm.
            audio_thread.join()
            vircam.close()
        logger.info('basereal process_frames thread stop') 
    
    # def process_custom(self,audiotype:int,idx:int):
    #     if self.curr_state!=audiotype: #从推理切到口播
    #         if idx in self.switch_pos:  #在卡点位置可以切换
    #             self.curr_state=audiotype
    #             self.custom_index=0
    #     else:
    #         self.custom_index+=1

================================================
FILE: hubertasr.py
================================================
import time
import torch
import numpy as np
from baseasr import BaseASR
from ultralight.audio2feature import Audio2Feature

# hubert audio feature
class HubertASR(BaseASR):
    """Audio front end that converts buffered audio frames into HuBERT feature chunks.

    Args:
        opt: Options object forwarded to ``BaseASR``.
        parent: Owner object forwarded to ``BaseASR``.
        audio_processor: ``Audio2Feature`` instance used for feature extraction.
        audio_feat_length: Forwarded unchanged to ``feature2chunks``; per the original
            comment it selects the audio features before and after. Defaults to ``[8, 8]``.
    """

    def __init__(self, opt, parent, audio_processor:Audio2Feature,audio_feat_length = None):
        super().__init__(opt, parent)
        self.audio_processor = audio_processor
        #self.stride_left_size = 32
        #self.stride_right_size = 32
        # A list literal in the signature would be a single object shared by every
        # instance that relies on the default, so build a fresh one per instance.
        self.audio_feat_length = [8, 8] if audio_feat_length is None else audio_feat_length

    def run_step(self):
        """Pull ``batch_size * 2`` audio frames, forward them, and enqueue feature chunks.

        Every pulled frame is appended to ``self.frames`` and mirrored to
        ``self.output_queue``. Features are only computed once more frames are buffered
        than the left + right stride context.
        """
        for _ in range(self.batch_size * 2):
            audio_frame, frame_type, eventpoint = self.get_audio_frame()
            self.frames.append(audio_frame)
            self.output_queue.put((audio_frame, frame_type, eventpoint))

        context = self.stride_left_size + self.stride_right_size
        if len(self.frames) <= context:
            # Not enough context buffered yet; skip feature extraction this round.
            return

        inputs = np.concatenate(self.frames)  # [N * chunk]

        mel = self.audio_processor.get_hubert_from_16k_speech(inputs)
        mel_chunks = self.audio_processor.feature2chunks(
            feature_array=mel,
            fps=self.fps/2,
            batch_size=self.batch_size,
            audio_feat_length=self.audio_feat_length,
            start=self.stride_left_size/2,
        )

        self.feat_queue.put(mel_chunks)
        # Keep only the trailing stride context for the next step.
        self.frames = self.frames[-context:]



================================================
FILE: lightreal.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import math
import torch
import numpy as np

#from .utils import *
import os
import time
import cv2
import glob
import pickle
import copy

import queue
from queue import Queue
from threading import Thread, Event
import torch.multiprocessing as mp


from hubertasr import HubertASR
import asyncio
from av import AudioFrame, VideoFrame
from basereal import BaseReal

#from imgcache import ImgCache

from tqdm import tqdm

#new
import os
import cv2
import torch
import numpy as np
import torch.nn as nn
from torch import optim
from tqdm import tqdm
from transformers import Wav2Vec2Processor, HubertModel
from torch.utils.data import DataLoader
from ultralight.unet import Model
from ultralight.audio2feature import Audio2Feature
from logger import logger

# Choose the inference backend once at import time: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print('Using {} for inference.'.format(device))

def load_model(opt):
    """Build and return a default-constructed ``Audio2Feature`` audio processor.

    ``opt`` is accepted but not read by this function.
    """
    return Audio2Feature()

def load_avatar(avatar_id):
    """Load the model weights, full frames, face crops and face boxes of one avatar.

    Everything is read from ``./data/avatars/<avatar_id>``: ``ultralight.pth``,
    ``coords.pkl``, and the numerically named images under ``full_imgs`` and ``face_imgs``.

    Returns:
        ``(model, frame_list_cycle, face_list_cycle, coord_list_cycle)`` with the model
        already moved to ``device`` and switched to eval mode.
    """
    avatar_path = f"./data/avatars/{avatar_id}"
    full_imgs_path = f"{avatar_path}/full_imgs"
    face_imgs_path = f"{avatar_path}/face_imgs"
    coords_path = f"{avatar_path}/coords.pkl"

    model = Model(6, 'hubert').to(device)  # Model is the project's own network class
    # map_location lets a checkpoint saved on a GPU load on MPS/CPU-only hosts too;
    # without it torch.load tries to restore tensors onto the device they were saved from.
    model.load_state_dict(torch.load(f"{avatar_path}/ultralight.pth", map_location=device))

    # NOTE(security): pickle.load can execute arbitrary code; only load avatar
    # directories that come from a trusted source.
    with open(coords_path, 'rb') as f:
        coord_list_cycle = pickle.load(f)
    input_img_list = glob.glob(os.path.join(full_imgs_path, '*.[jpJP][pnPN]*[gG]'))
    # File names are expected to be integers; sort by that number, not lexicographically.
    input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    frame_list_cycle = read_imgs(input_img_list)
    #self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
    input_face_list = glob.glob(os.path.join(face_imgs_path, '*.[jpJP][pnPN]*[gG]'))
    input_face_list = sorted(input_face_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    face_list_cycle = read_imgs(input_face_list)

    return model.eval(),frame_list_cycle,face_list_cycle,coord_list_cycle


@torch.no_grad()
def warm_up(batch_size,avatar,modelres):
    """Run one forward pass on all-ones inputs so the model is exercised before real use.

    ``avatar`` is the 4-tuple returned by ``load_avatar``; only its first element
    (the model) is used.
    """
    logger.info('warmup model...')
    net, _frames, _faces, _coords = avatar
    dummy_images = torch.ones(batch_size, 6, modelres, modelres).to(device)
    dummy_audio = torch.ones(batch_size, 16, 32, 32).to(device)
    net(dummy_images, dummy_audio)

def read_imgs(img_list):
    """Read every path in ``img_list`` with ``cv2.imread`` and return the results in order.

    Progress is shown through ``tqdm``.
    """
    logger.info('reading images...')
    return [cv2.imread(path) for path in tqdm(img_list)]

def get_audio_features(features, index):
    """Return the 16-row window ``features[index-8:index+8]``, zero-padded at the edges.

    Args:
        features: NumPy array indexed along axis 0.
        index: Centre position of the window; may lie near or beyond either end.

    Returns:
        A tensor with exactly 16 rows and the trailing shape and dtype of ``features``.
        Rows that fall outside ``features`` are zeros. When the window lies fully inside
        the array, the returned tensor is the plain ``torch.from_numpy`` slice.
    """
    half_window = 8
    total = features.shape[0]
    start = index - half_window
    stop = index + half_window

    # Clamp the requested window to the rows that actually exist.
    lo = min(max(start, 0), total)
    hi = max(lo, min(stop, total))
    auds = torch.from_numpy(features[lo:hi])
    if lo == start and hi == stop:
        return auds

    # Size the padding from the window itself rather than from the available rows:
    # a slice shorter than the pad would otherwise yield fewer than 16 rows.
    window = auds.new_zeros((stop - start,) + tuple(auds.shape[1:]))
    offset = lo - start
    window[offset:offset + (hi - lo)] = auds
    return window


def read_lms(lms_list):
    """Parse landmark text files into one list of float32 arrays per file.

    Each non-empty line of a file is split on single spaces (empty tokens dropped)
    and converted to a ``np.float32`` array. Lines with no tokens are skipped.
    """
    logger.info('reading lms...')
    all_landmarks = []
    for lms_path in tqdm(lms_list):
        with open(lms_path, "r") as handle:
            rows = handle.read().splitlines()
        tokenised = ([tok for tok in row.split(" ") if tok] for row in rows)
        per_file = [np.array(tokens, dtype=np.float32) for tokens in tokenised if tokens]
        all_landmarks.append(per_file)  # one entry per input file, in input order
    return all_landmarks

def __mirror_index(size, index):
    """Map a growing counter onto a back-and-forth position inside ``range(size)``.

    Even passes over the sequence run forwards, odd passes run backwards.
    """
    sweep, offset = divmod(index, size)
    return offset if sweep % 2 == 0 else size - offset - 1


def inference(quit_event, batch_size, face_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
    """Worker loop: consume audio feature batches and emit predicted face frames.

    Per iteration it takes one feature batch from ``audio_feat_queue`` and
    ``batch_size * 2`` audio frames from ``audio_out_queue``. If none of those audio
    frames has type ``0`` the batch is treated as silence and ``None`` is emitted in
    place of a prediction. Otherwise the model is run on ``batch_size`` face crops.
    Each result is put on ``res_frame_queue`` as
    ``(frame_or_None, mirrored_index, two_audio_frames)``.

    The loop ends once ``quit_event`` is set (checked at least once per second).
    """
    length = len(face_list_cycle)
    index = 0
    count = 0
    counttime = 0
    logger.info('start inference')

    while not quit_event.is_set():
        try:
            mel_batch = audio_feat_queue.get(block=True, timeout=1)
        except queue.Empty:
            # Timed out: loop back so quit_event is re-checked.
            continue
        is_all_silence=True
        audio_frames = []
        for _ in range(batch_size*2):
            frame,type_,eventpoint = audio_out_queue.get()
            audio_frames.append((frame,type_,eventpoint))
            if type_==0:
                is_all_silence=False
        if is_all_silence:
            # Nothing to lip-sync: hand downstream the frame index and audio only.
            for i in range(batch_size):
                res_frame_queue.put((None,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
                index = index + 1
        else:
            t = time.perf_counter()
            img_batch = []

            for i in range(batch_size):
                idx = __mirror_index(length, index + i)
                crop_img = face_list_cycle[idx]
                # Inner 160x160 region of the face crop (4px border removed).
                img_real_ex = crop_img[4:164, 4:164].copy()
                img_real_ex_ori = img_real_ex.copy()
                # Black out the rectangle (5,5,150,145) to form the masked input.
                img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)

                # HWC -> CHW, float32
                img_masked = img_masked.transpose(2,0,1).astype(np.float32)
                img_real_ex = img_real_ex.transpose(2,0,1).astype(np.float32)

                img_real_ex_T = torch.from_numpy(img_real_ex / 255.0)
                img_masked_T = torch.from_numpy(img_masked / 255.0)
                # 6 channels: reference crop followed by masked crop.
                img_concat_T = torch.cat([img_real_ex_T, img_masked_T], axis=0)[None]
                img_batch.append(img_concat_T)

            reshaped_mel_batch = [arr.reshape(16, 32, 32) for arr in mel_batch]
            mel_batch = torch.stack([torch.from_numpy(arr) for arr in reshaped_mel_batch])
            img_batch = torch.stack(img_batch).squeeze(1)

            with torch.no_grad():
                # Use the module-level device instead of a hard-coded .cuda() so that
                # MPS/CPU hosts (where the model was moved via .to(device)) also work.
                pred = model(img_batch.to(device), mel_batch.to(device))
            pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

            counttime += (time.perf_counter() - t)
            count += batch_size
            if count >= 100:
                logger.info(f"------actual avg infer fps:{count / counttime:.4f}")
                count = 0
                counttime = 0
            for i,res_frame in enumerate(pred):
                res_frame_queue.put((res_frame,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
                index = index + 1

    logger.info('lightreal inference processor stop')


class LightReal(BaseReal):
    """Renderer that drives the ultralight model from HuBERT audio features.

    ``render`` runs the audio step loop in the calling thread and starts two worker
    threads: the module-level ``inference`` function and ``self.process_frames``.
    """

    @torch.no_grad()
    def __init__(self, opt, model, avatar):
        """Store options, unpack the avatar tuple and create the HuBERT audio front end.

        Args:
            opt: Options object; ``fps`` and ``batch_size`` are read here.
            model: The audio processor returned by ``load_model``.
            avatar: ``(model, frame_list_cycle, face_list_cycle, coord_list_cycle)``
                as returned by ``load_avatar``.
        """
        super().__init__(opt)
        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
        # self.W = opt.W
        # self.H = opt.H

        self.fps = opt.fps

        self.batch_size = opt.batch_size
        self.idx = 0
        # Bounded queue between the inference thread and the frame processing thread.
        self.res_frame_queue = Queue(self.batch_size*2)  #mp.Queue
        audio_processor = model
        self.model,self.frame_list_cycle,self.face_list_cycle,self.coord_list_cycle = avatar

        self.asr = HubertASR(opt,self,audio_processor,audio_feat_length =[4,4])
        self.asr.warm_up()

        self.render_event = mp.Event()

    # def __del__(self):
    #     logger.info(f'lightreal({self.sessionid}) delete')

    def paste_back_frame(self,pred_frame,idx:int):
        """Return a copy of full frame ``idx`` with the predicted face pasted in.

        The prediction replaces the inner ``[4:164, 4:164]`` region of the stored face
        crop, which is then resized to the bounding box ``(x1, y1, x2, y2)`` and written
        into the copied full frame.
        """
        bbox = self.coord_list_cycle[idx]
        combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
        x1, y1, x2, y2 = bbox

        crop_img = self.face_list_cycle[idx]
        crop_img_ori = crop_img.copy()

        crop_img_ori[4:164, 4:164] = pred_frame.astype(np.uint8)
        crop_img_ori = cv2.resize(crop_img_ori, (x2-x1,y2-y1))
        combine_frame[y1:y2, x1:x2] = crop_img_ori
        return combine_frame

    def render(self,quit_event,loop=None,audio_track=None,video_track=None):
        """Run the audio step loop until ``quit_event`` is set, then stop the workers.

        Args:
            quit_event: Event that ends the loop when set.
            loop, audio_track, video_track: Forwarded to ``self.process_frames``.
                ``video_track`` is also used for back-pressure when provided.
        """
        self.init_customindex()
        self.tts.render(quit_event)

        infer_quit_event = Event()
        infer_thread = Thread(target=inference, args=(infer_quit_event,self.batch_size,self.face_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
                                           self.model,))  #mp.Process
        infer_thread.start()

        process_quit_event = Event()
        process_thread = Thread(target=self.process_frames, args=(process_quit_event,loop,audio_track,video_track))
        process_thread.start()

        try:
            while not quit_event.is_set():
                self.asr.run_step()

                # Back off while the outgoing video queue is filling up.
                if video_track and video_track._queue.qsize()>=5:
                    logger.debug('sleep qsize=%d',video_track._queue.qsize())
                    time.sleep(0.04*video_track._queue.qsize()*0.8)
            logger.info('lightreal thread stop')
        finally:
            # Always signal and join the workers, including when run_step raises;
            # otherwise the two threads would keep running with nobody to stop them.
            infer_quit_event.set()
            infer_thread.join()

            process_quit_event.set()
            process_thread.join()
            



================================================
FILE: lipasr.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import time
import torch
import numpy as np

import queue
from queue import Queue
#import multiprocessing as mp

from baseasr import BaseASR
from wav2lip import audio

class LipASR(BaseASR):
    """Audio front end that converts buffered audio frames into mel-spectrogram chunks."""

    def run_step(self):
        """Pull ``batch_size*2`` audio frames, forward them, and enqueue mel chunks.

        Every pulled frame is buffered in ``self.frames`` and mirrored to
        ``self.output_queue``. Once more frames are buffered than the stride context,
        a mel spectrogram is computed over the whole buffer and cut into 16-column
        windows that are put on ``self.feat_queue`` as one list.
        """
        # ---- collect audio ----
        for _ in range(self.batch_size*2):
            frame,kind,eventpoint = self.get_audio_frame()
            self.frames.append(frame)
            self.output_queue.put((frame,kind,eventpoint))

        # Not enough context yet: do not run the feature extraction.
        if len(self.frames) <= self.stride_left_size + self.stride_right_size:
            return

        # ---- extract mel spectrogram over the whole buffer ----
        inputs = np.concatenate(self.frames) # [N * chunk]
        mel = audio.melspectrogram(inputs)
        mel_len = len(mel[0])

        # Stride trimming bounds. `right` is computed like in the original but is not
        # consumed below.
        left = max(0, self.stride_left_size*80/50)
        right = min(mel_len, mel_len - self.stride_right_size*80/50)
        mel_idx_multiplier = 80.*2/self.fps
        mel_step_size = 16

        # ---- slice into fixed-width windows ----
        limit = (len(self.frames)-self.stride_left_size-self.stride_right_size)/2
        mel_chunks = []
        i = 0
        while i < limit:
            start_idx = int(left + i * mel_idx_multiplier)
            if start_idx + mel_step_size > mel_len:
                # Window would run past the end: take the last columns instead.
                window = slice(mel_len - mel_step_size, None)
            else:
                window = slice(start_idx, start_idx + mel_step_size)
            mel_chunks.append(mel[:, window])
            i += 1
        self.feat_queue.put(mel_chunks)

        # Keep only the trailing stride context to bound memory use.
        self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]


================================================
FILE: lipreal.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import math
import torch
import numpy as np

#from .utils import *
import os
import time
import cv2
import glob
import pickle
import copy

import queue
from queue import Queue
from threading import Thread, Event
import torch.multiprocessing as mp


from lipasr import LipASR
import asyncio
from av import AudioFrame, VideoFrame
from wav2lip.models import Wav2Lip
from basereal import BaseReal

#from imgcache import ImgCache

from tqdm import tqdm
from logger import logger

# Choose the inference backend once at import time: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print('Using {} for inference.'.format(device))

def _load(checkpoint_path):
    """Load a checkpoint with ``torch.load``.

    On any backend other than CUDA, every storage is kept where it was deserialized
    (the identity ``map_location``); on CUDA the checkpoint is loaded with defaults.
    """
    if device != 'cuda':
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path) #,weights_only=True

def load_model(path):
    """Build a ``Wav2Lip`` network, load weights from ``path`` and return it in eval mode.

    Every ``'module.'`` substring is removed from the checkpoint's ``state_dict`` keys
    before loading. The model is moved to the module-level ``device``.
    """
    model = Wav2Lip()
    logger.info("Load checkpoint from: {}".format(path))
    state = _load(path)["state_dict"]
    cleaned = {name.replace('module.', ''): tensor for name, tensor in state.items()}
    model.load_state_dict(cleaned)
    return model.to(device).eval()

def load_avatar(avatar_id):
    """Load the full frames, face crops and face boxes of one avatar.

    Data is read from ``./data/avatars/<avatar_id>``: ``coords.pkl`` plus the
    numerically named images under ``full_imgs`` and ``face_imgs``.

    Returns:
        ``(frame_list_cycle, face_list_cycle, coord_list_cycle)``.
    """
    def _numbered_images(folder):
        # Collect jpg/jpeg/png style files and order them by their integer file name.
        paths = glob.glob(os.path.join(folder, '*.[jpJP][pnPN]*[gG]'))
        paths.sort(key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
        return paths

    avatar_path = f"./data/avatars/{avatar_id}"

    with open(f"{avatar_path}/coords.pkl", 'rb') as f:
        coord_list_cycle = pickle.load(f)

    frame_list_cycle = read_imgs(_numbered_images(f"{avatar_path}/full_imgs"))
    #self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
    face_list_cycle = read_imgs(_numbered_images(f"{avatar_path}/face_imgs"))

    return frame_list_cycle,face_list_cycle,coord_list_cycle

@torch.no_grad()
def warm_up(batch_size,model,modelres):
    """Warm-up: run one forward pass of ``model`` on all-ones inputs."""
    logger.info('warmup model...')
    face_input = torch.ones((batch_size, 6, modelres, modelres)).to(device)
    audio_input = torch.ones((batch_size, 1, 80, 16)).to(device)
    # The model is called with the audio tensor first, then the image tensor.
    model(audio_input, face_input)

def read_imgs(img_list):
    """Read every path in ``img_list`` with ``cv2.imread`` and return the results in order.

    Progress is shown through ``tqdm``.
    """
    logger.info('reading images...')
    return [cv2.imread(path) for path in tqdm(img_list)]

def __mirror_index(size, index):
    """Map a growing counter onto a back-and-forth position inside ``range(size)``.

    Even passes over the sequence run forwards, odd passes run backwards.
    """
    sweep, offset = divmod(index, size)
    return offset if sweep % 2 == 0 else size - offset - 1

def inference(quit_event,batch_size,face_list_cycle,audio_feat_queue,audio_out_queue,res_frame_queue,model):
    """Worker loop: consume mel batches and emit predicted face frames.

    Per iteration it takes one feature batch from ``audio_feat_queue`` and
    ``batch_size*2`` audio frames from ``audio_out_queue``. If no audio frame has type
    ``0`` the batch is treated as silence and ``None`` is emitted in place of a
    prediction. Otherwise the model is run on ``batch_size`` face crops whose lower half
    is zeroed in the masked copy. Results go to ``res_frame_queue`` as
    ``(frame_or_None, mirrored_index, two_audio_frames)``.
    """
    length = len(face_list_cycle)
    index = 0
    count=0
    counttime=0
    logger.info('start inference')
    while not quit_event.is_set():
        starttime=time.perf_counter()
        try:
            mel_batch = audio_feat_queue.get(block=True, timeout=1)
        except queue.Empty:
            # Timed out: loop back so quit_event is re-checked.
            continue

        audio_frames = []
        for _ in range(batch_size*2):
            chunk,kind,eventpoint = audio_out_queue.get()
            audio_frames.append((chunk,kind,eventpoint))
        is_all_silence = not any(kind==0 for _chunk,kind,_ev in audio_frames)

        if is_all_silence:
            for i in range(batch_size):
                res_frame_queue.put((None,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
                index = index + 1
            continue

        t=time.perf_counter()
        faces = [face_list_cycle[__mirror_index(length,index+i)] for i in range(batch_size)]
        face = faces[-1]
        img_batch = np.asarray(faces)
        mel_batch = np.asarray(mel_batch)

        # Masked copy: zero everything from the vertical midpoint of the face downwards.
        img_masked = img_batch.copy()
        img_masked[:, face.shape[0]//2:] = 0

        # Channel-wise concat (masked first, then original), scaled to [0, 1].
        img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
        mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

        # NHWC -> NCHW for both inputs.
        img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
        mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

        with torch.no_grad():
            pred = model(mel_batch, img_batch)
        pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

        counttime += (time.perf_counter() - t)
        count += batch_size
        if count>=100:
            logger.info(f"------actual avg infer fps:{count/counttime:.4f}")
            count=0
            counttime=0
        for i,res_frame in enumerate(pred):
            res_frame_queue.put((res_frame,__mirror_index(length,index),audio_frames[i*2:i*2+2]))
            index = index + 1
        #print('total batch time:',time.perf_counter()-starttime)
    logger.info('lipreal inference processor stop')

class LipReal(BaseReal):
    """Renderer that drives the Wav2Lip model from mel-spectrogram audio features.

    ``render`` runs the audio step loop in the calling thread and starts two worker
    threads: the module-level ``inference`` function and ``self.process_frames``.
    """

    @torch.no_grad()
    def __init__(self, opt, model, avatar):
        """Store options, keep the model, unpack the avatar tuple and create the ASR.

        Args:
            opt: Options object; ``fps`` and ``batch_size`` are read here.
            model: The network returned by ``load_model``.
            avatar: ``(frame_list_cycle, face_list_cycle, coord_list_cycle)`` as
                returned by ``load_avatar``.
        """
        super().__init__(opt)
        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
        # self.W = opt.W
        # self.H = opt.H

        self.fps = opt.fps

        self.batch_size = opt.batch_size
        self.idx = 0
        # Bounded queue between the inference thread and the frame processing thread.
        self.res_frame_queue = Queue(self.batch_size*2)  #mp.Queue
        self.model = model
        self.frame_list_cycle,self.face_list_cycle,self.coord_list_cycle = avatar

        self.asr = LipASR(opt,self)
        self.asr.warm_up()

        self.render_event = mp.Event()

    # def __del__(self):
    #     logger.info(f'lipreal({self.sessionid}) delete')

    def paste_back_frame(self,pred_frame,idx:int):
        """Return a copy of full frame ``idx`` with the predicted face pasted in.

        The bounding box is stored as ``(y1, y2, x1, x2)``; the prediction is cast to
        ``uint8`` and resized to that box before being written into the copy.
        """
        bbox = self.coord_list_cycle[idx]
        combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
        #combine_frame = copy.deepcopy(self.imagecache.get_img(idx))
        y1, y2, x1, x2 = bbox
        res_frame = cv2.resize(pred_frame.astype(np.uint8),(x2-x1,y2-y1))
        combine_frame[y1:y2, x1:x2] = res_frame
        return combine_frame

    def render(self,quit_event,loop=None,audio_track=None,video_track=None):
        """Run the audio step loop until ``quit_event`` is set, then stop the workers.

        Args:
            quit_event: Event that ends the loop when set.
            loop, audio_track, video_track: Forwarded to ``self.process_frames``.
                ``video_track`` is also used for back-pressure when provided.
        """
        self.init_customindex()
        self.tts.render(quit_event)

        infer_quit_event = Event()
        infer_thread = Thread(target=inference, args=(infer_quit_event,self.batch_size,self.face_list_cycle,
                                           self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
                                           self.model,))  #mp.Process
        infer_thread.start()

        process_quit_event = Event()
        process_thread = Thread(target=self.process_frames, args=(process_quit_event,loop,audio_track,video_track))
        process_thread.start()

        try:
            while not quit_event.is_set():
                self.asr.run_step()

                # Back off while the outgoing video queue is filling up.
                if video_track and video_track._queue.qsize()>=5:
                    logger.debug('sleep qsize=%d',video_track._queue.qsize())
                    time.sleep(0.04*video_track._queue.qsize()*0.8)
            logger.info('lipreal thread stop')
        finally:
            # Always signal and join the workers, including when run_step raises;
            # otherwise the two threads would keep running with nobody to stop them.
            infer_quit_event.set()
            infer_thread.join()

            process_quit_event.set()
            process_thread.join()
            

================================================
FILE: llm.py
================================================
import time
import os
from basereal import BaseReal
from logger import logger

def llm_response(message,nerfreal:BaseReal):
    """Stream a chat completion for ``message`` and feed it to ``nerfreal`` in pieces.

    The streamed text is cut after each punctuation mark in ``",.!;:，。！？：；"``.
    Whenever the accumulated text is longer than 10 characters at such a cut, it is
    passed to ``nerfreal.put_msg_txt`` and the accumulator is reset. Whatever remains
    when the stream ends is sent with one final ``put_msg_txt`` call.

    Args:
        message: User message sent to the model.
        nerfreal: Object whose ``put_msg_txt`` receives the text pieces.
    """
    start = time.perf_counter()
    from openai import OpenAI
    client = OpenAI(
        # If the environment variable is not configured, put your API key here instead.
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        # base_url of the DashScope OpenAI-compatible endpoint.
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
    end = time.perf_counter()
    logger.info(f"llm Time init: {end-start}s")
    completion = client.chat.completions.create(
        model="qwen-plus",
        messages=[{'role': 'system', 'content': 'You are a helpful assistant.'},
                  {'role': 'user', 'content': message}],
        stream=True,
        # With this setting, token usage is reported in the last chunk of the stream.
        stream_options={"include_usage": True}
    )
    result=""
    first = True
    for chunk in completion:
        # Chunks without choices are skipped.
        if len(chunk.choices)>0:
            if first:
                end = time.perf_counter()
                logger.info(f"llm Time to first chunk: {end-start}s")
                first = False
            # delta.content may be None for a chunk that carries no text;
            # treat that as empty text instead of failing in enumerate().
            msg = chunk.choices[0].delta.content or ""
            lastpos=0
            for i, char in enumerate(msg):
                if char in ",.!;:，。！？：；" :
                    result = result+msg[lastpos:i+1]
                    lastpos = i+1
                    if len(result)>10:
                        logger.info(result)
                        nerfreal.put_msg_txt(result)
                        result=""
            # Carry the unterminated tail over to the next chunk.
            result = result+msg[lastpos:]
    end = time.perf_counter()
    logger.info(f"llm Time to last chunk: {end-start}s")
    nerfreal.put_msg_txt(result)

================================================
FILE: logger.py
================================================
import logging
 
# Configure the logger: it accepts DEBUG and above, while the file handler
# only writes INFO and above to 'livetalking.log'.
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler = logging.FileHandler('livetalking.log')  # could be a StreamHandler (console) or combined with other handlers
fhandler.setLevel(logging.INFO)
fhandler.setFormatter(formatter)
logger.setLevel(logging.DEBUG)
logger.addHandler(fhandler)

# handler = logging.StreamHandler()
# handler.setLevel(logging.DEBUG)
# sformatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler.setFormatter(sformatter)
# logger.addHandler(handler)

================================================
FILE: museasr.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import time
import numpy as np

import queue
from queue import Queue
#import multiprocessing as mp
from baseasr import BaseASR
from musetalk.whisper.audio2feature import Audio2Feature

class MuseASR(BaseASR):
    """Audio front end that converts buffered audio frames into whisper feature chunks."""

    def __init__(self, opt, parent,audio_processor:Audio2Feature):
        """Initialise the base class and keep the ``Audio2Feature`` processor."""
        super().__init__(opt,parent)
        self.audio_processor = audio_processor

    def run_step(self):
        """Pull ``batch_size*2`` audio frames, forward them, and enqueue feature chunks.

        Every pulled frame is buffered in ``self.frames`` and mirrored to
        ``self.output_queue``. Features are only computed once more frames are buffered
        than the left + right stride context.
        """
        start_time = time.time()
        pulls = self.batch_size*2
        for _ in range(pulls):
            item = self.get_audio_frame()
            audio_frame,kind,eventpoint = item
            self.frames.append(audio_frame)
            self.output_queue.put((audio_frame,kind,eventpoint))

        if not len(self.frames) > self.stride_left_size + self.stride_right_size:
            # Not enough context buffered yet.
            return

        inputs = np.concatenate(self.frames) # [N * chunk]
        whisper_feature = self.audio_processor.audio2feat(inputs)
        #print(f"processing audio costs {(time.time() - start_time) * 1000}ms, inputs shape:{inputs.shape} whisper_feature len:{len(whisper_feature)}")
        whisper_chunks = self.audio_processor.feature2chunks(
            feature_array=whisper_feature,
            fps=self.fps/2,
            batch_size=self.batch_size,
            start=self.stride_left_size/2,
        )
        self.feat_queue.put(whisper_chunks)
        # Keep only the trailing stride context to bound memory use.
        self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]


================================================
FILE: musereal.py
================================================
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

import math
import torch
import numpy as np

#from .utils import *
import subprocess
import os
import time
import torch.nn.functional as F
import cv2
import glob
import pickle
import copy

import queue
from queue import Queue
from threading import Thread, Event
import torch.multiprocessing as mp

from musetalk.utils.utils import get_file_type,get_video_fps,datagen
#from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
from musetalk.myutil import get_image_blending
from musetalk.utils.utils import load_all_model
from musetalk.whisper.audio2feature import Audio2Feature

from museasr import MuseASR
import asyncio
from av import AudioFrame, VideoFrame
from basereal import BaseReal

from tqdm import tqdm
from logger import logger

def load_model():
    """Load the MuseTalk networks plus the Whisper feature extractor.

    The positional encoder, the VAE and the UNet are converted to half
    precision and moved to the selected device.

    Returns:
        tuple: ``(vae, unet, pe, timesteps, audio_processor)`` where
        ``timesteps`` is a one-element tensor holding ``0`` on the selected
        device.
    """
    vae, unet, pe = load_all_model()

    # Device preference: CUDA, then Apple MPS (only if this torch build exposes it), then CPU.
    if torch.cuda.is_available():
        device_name = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    device = torch.device(device_name)

    timesteps = torch.tensor([0], device=device)
    pe = pe.half().to(device)
    vae.vae = vae.vae.half().to(device)
    unet.model = unet.model.half().to(device)

    # Audio front-end: Whisper-based feature extractor loaded from the local model folder.
    audio_processor = Audio2Feature(model_path="./models/whisper")
    return vae, unet, pe, timesteps, audio_processor

def load_avatar(avatar_id):
    """Load the pre-computed assets of one avatar from ``./data/avatars/<avatar_id>``.

    Args:
        avatar_id: Name of the avatar folder below ``./data/avatars``.

    Returns:
        tuple: ``(frame_list_cycle, mask_list_cycle, coord_list_cycle,
        mask_coords_list_cycle, input_latent_list_cycle)``.

    Note:
        ``coords.pkl`` and ``mask_coords.pkl`` are read with ``pickle`` and
        ``latents.pt`` with ``torch.load``; unpickling can execute arbitrary
        code, so only load avatar folders from a trusted source.
    """
    avatar_path = f"./data/avatars/{avatar_id}"

    def _numbered_images(folder):
        # Image files are expected to be named "<integer>.<ext>"; order them numerically.
        found = glob.glob(os.path.join(folder, '*.[jpJP][pnPN]*[gG]'))
        found.sort(key=lambda path: int(os.path.splitext(os.path.basename(path))[0]))
        return found

    input_latent_list_cycle = torch.load(f"{avatar_path}/latents.pt")  #,weights_only=True
    with open(f"{avatar_path}/coords.pkl", 'rb') as f:
        coord_list_cycle = pickle.load(f)
    frame_list_cycle = read_imgs(_numbered_images(f"{avatar_path}/full_imgs"))
    with open(f"{avatar_path}/mask_coords.pkl", 'rb') as f:
        mask_coords_list_cycle = pickle.load(f)
    mask_list_cycle = read_imgs(_numbered_images(f"{avatar_path}/mask"))
    return frame_list_cycle,mask_list_cycle,coord_list_cycle,mask_coords_list_cycle,input_latent_list_cycle

@torch.no_grad()
def warm_up(batch_size,model):
    """Run one dummy forward pass through the positional encoder, UNet and VAE decoder.

    Args:
        batch_size: Number of dummy samples in the warm-up batch.
        model: Tuple ``(vae, unet, pe, timesteps, audio_processor)``; the
            audio processor is not used here.
    """
    logger.info('warmup model...')
    vae, unet, pe, timesteps, _audio_processor = model
    target_dtype = unet.model.dtype

    # All-ones placeholders shaped like a real batch: audio (B, 50, 384), latents (B, 8, 32, 32).
    dummy_latents = torch.ones(batch_size, 8, 32, 32).to(unet.device)
    dummy_audio = torch.from_numpy(np.ones((batch_size, 50, 384), dtype=np.uint8))
    dummy_audio = pe(dummy_audio.to(device=unet.device, dtype=target_dtype))
    dummy_latents = dummy_latents.to(dtype=target_dtype)

    predicted = unet.model(dummy_latents,
                           timesteps,
                           encoder_hidden_states=dummy_audio).sample
    vae.decode_latents(predicted)

def read_imgs(img_list):
    """Read every image path in ``img_list`` with ``cv2.imread``.

    Args:
        img_list: Iterable of image file paths.

    Returns:
        list: One entry per path, in input order, holding whatever
        ``cv2.imread`` returned for that path.
    """
    logger.info('reading images...')
    return [cv2.imread(img_path) for img_path in tqdm(img_list)]

def __mirror_index(size, index):
    """Map a growing counter onto ``0..size-1`` in a back-and-forth (ping-pong) order.

    Even passes over the sequence run forwards, odd passes run backwards, so
    the end frames are repeated once at each turn-around.

    Args:
        size: Length of the sequence being cycled through.
        index: Non-wrapping running counter.

    Returns:
        int: Position inside the sequence for this counter value.
    """
    turn, offset = divmod(index, size)
    return offset if turn % 2 == 0 else size - offset - 1

@torch.no_grad()
def inference(quit_event,batch_size,input_latent_list_cycle,audio_feat_queue,audio_out_queue,res_frame_queue,
              vae, unet, pe,timesteps):
    """Worker loop that turns batches of audio features into predicted face frames.

    Each iteration takes one batch of Whisper feature chunks from
    ``audio_feat_queue`` and ``batch_size*2`` audio frames from
    ``audio_out_queue`` (two audio frames per video frame). One tuple
    ``(frame_or_None, avatar_index, [audio_frame, audio_frame])`` is put on
    ``res_frame_queue`` for every video frame.

    Args:
        quit_event: Event-like object; the loop ends once ``is_set()`` is true.
        batch_size: Number of video frames produced per iteration.
        input_latent_list_cycle: Per-avatar-frame latents, indexed in ping-pong order.
        audio_feat_queue: Queue yielding the Whisper chunks of one batch.
        audio_out_queue: Queue yielding ``(frame, type, eventpoint)`` tuples.
        res_frame_queue: Output queue for the result tuples.
        vae, unet, pe, timesteps: Model pieces as returned by ``load_model``.
    """
    cycle_len = len(input_latent_list_cycle)
    cursor = 0            # running video-frame counter, never wrapped
    timed_frames = 0      # frames included in the current fps measurement window
    timed_seconds = 0     # inference time accumulated in that window
    logger.info('start inference')
    while not quit_event.is_set():
        try:
            whisper_chunks = audio_feat_queue.get(block=True, timeout=1)
        except queue.Empty:
            # Nothing to do yet; loop around so the quit event is re-checked.
            continue

        # Collect the audio belonging to this batch; a frame of type 0 means the batch is not silent.
        audio_frames = []
        has_speech = False
        for _ in range(batch_size*2):
            frame, frame_type, eventpoint = audio_out_queue.get()
            audio_frames.append((frame, frame_type, eventpoint))
            if frame_type == 0:
                has_speech = True

        if not has_speech:
            # Silent batch: skip the networks and emit placeholders (None) for every frame.
            for slot in range(batch_size):
                res_frame_queue.put((None,__mirror_index(cycle_len,cursor),audio_frames[slot*2:slot*2+2]))
                cursor += 1
            continue

        infer_start = time.perf_counter()
        whisper_batch = np.stack(whisper_chunks)
        latent_batch = torch.cat(
            [input_latent_list_cycle[__mirror_index(cycle_len, cursor + offset)]
             for offset in range(batch_size)],
            dim=0)

        audio_feature_batch = torch.from_numpy(whisper_batch)
        audio_feature_batch = audio_feature_batch.to(device=unet.device,
                                                     dtype=unet.model.dtype)
        audio_feature_batch = pe(audio_feature_batch)
        latent_batch = latent_batch.to(dtype=unet.model.dtype)

        pred_latents = unet.model(latent_batch,
                                  timesteps,
                                  encoder_hidden_states=audio_feature_batch).sample
        recon = vae.decode_latents(pred_latents)

        # Report the average inference rate roughly every 100 generated frames.
        timed_seconds += (time.perf_counter() - infer_start)
        timed_frames += batch_size
        if timed_frames >= 100:
            logger.info(f"------actual avg infer fps:{timed_frames/timed_seconds:.4f}")
            timed_frames = 0
            timed_seconds = 0

        for slot, res_frame in enumerate(recon):
            res_frame_queue.put((res_frame,__mirror_index(cycle_len,cursor),audio_frames[slot*2:slot*2+2]))
            cursor += 1
    logger.info('musereal inference processor stop')

class MuseReal(BaseReal):
    """MuseTalk-based real-time renderer.

    Holds the loaded networks and avatar assets, feeds audio through
    ``MuseASR`` and runs the module-level ``inference`` loop plus the frame
    post-processing in worker threads while ``render`` is active.
    """

    @torch.no_grad()
    def __init__(self, opt, model, avatar):
        """Store options, unpack the model/avatar tuples and create the ASR helper.

        Args:
            opt: Options object; ``fps`` and ``batch_size`` are read here.
            model: Tuple ``(vae, unet, pe, timesteps, audio_processor)``.
            avatar: Tuple ``(frame_list_cycle, mask_list_cycle, coord_list_cycle,
                mask_coords_list_cycle, input_latent_list_cycle)``.
        """
        super().__init__(opt)

        # NOTE(review): the original comment here read "20 ms per frame", which would
        # correspond to opt.fps == 50 - confirm against the option defaults.
        self.fps = opt.fps

        self.batch_size = opt.batch_size
        self.idx = 0
        # Bounded hand-off queue between the inference thread and the frame-processing thread.
        self.res_frame_queue = mp.Queue(self.batch_size*2)

        self.vae, self.unet, self.pe, self.timesteps, self.audio_processor = model
        self.frame_list_cycle,self.mask_list_cycle,self.coord_list_cycle,self.mask_coords_list_cycle, self.input_latent_list_cycle = avatar

        self.asr = MuseASR(opt,self,self.audio_processor)
        self.asr.warm_up()

        self.render_event = mp.Event()

    def __mirror_index(self, index):
        """Map a running counter onto the avatar frames in back-and-forth order."""
        size = len(self.coord_list_cycle)
        turn = index // size
        res = index % size
        if turn % 2 == 0:
            return res
        else:
            return size - res - 1

    def __warm_up(self):
        """Push one real ASR batch through the positional encoder, UNet and VAE decoder."""
        self.asr.run_step()
        whisper_chunks = self.asr.get_next_feat()
        whisper_batch = np.stack(whisper_chunks)
        latent_batch = []
        for i in range(self.batch_size):
            idx = self.__mirror_index(self.idx+i)
            latent = self.input_latent_list_cycle[idx]
            latent_batch.append(latent)
        latent_batch = torch.cat(latent_batch, dim=0)
        logger.info('infer=======')
        audio_feature_batch = torch.from_numpy(whisper_batch)
        audio_feature_batch = audio_feature_batch.to(device=self.unet.device,
                                                        dtype=self.unet.model.dtype)
        audio_feature_batch = self.pe(audio_feature_batch)
        latent_batch = latent_batch.to(dtype=self.unet.model.dtype)

        pred_latents = self.unet.model(latent_batch,
                                    self.timesteps,
                                    encoder_hidden_states=audio_feature_batch).sample
        # The decoded result is discarded; only the forward pass matters here.
        self.vae.decode_latents(pred_latents)

    def paste_back_frame(self,pred_frame,idx:int):
        """Blend a predicted face crop back into avatar frame ``idx``.

        Args:
            pred_frame: Predicted face image; cast to ``uint8`` and resized to the face box.
            idx: Index into the avatar frame/coordinate/mask lists.

        Returns:
            The blended full frame.
        """
        bbox = self.coord_list_cycle[idx]
        # Work on a copy: get_image_blending writes into the image it is given.
        ori_frame = copy.deepcopy(self.frame_list_cycle[idx])
        x1, y1, x2, y2 = bbox

        res_frame = cv2.resize(pred_frame.astype(np.uint8),(x2-x1,y2-y1))
        mask = self.mask_list_cycle[idx]
        mask_crop_box = self.mask_coords_list_cycle[idx]

        combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
        return combine_frame

    def render(self,quit_event,loop=None,audio_track=None,video_track=None):
        """Drive the audio pipeline until ``quit_event`` is set.

        Starts the inference thread and the frame-processing thread, then
        repeatedly calls ``self.asr.run_step()``. When the video track's queue
        is at least ``1.5 * batch_size`` deep the loop sleeps to let the
        consumer catch up.

        The worker threads are signalled and joined in a ``finally`` block so
        they are also stopped when the loop exits through an exception.

        Args:
            quit_event: Event-like object that ends rendering when set.
            loop: Passed through to ``self.process_frames``.
            audio_track: Passed through to ``self.process_frames``.
            video_track: Passed through to ``self.process_frames``; its
                ``_queue`` size is used for back-pressure.
        """
        self.init_customindex()
        self.tts.render(quit_event)

        infer_quit_event = Event()
        infer_thread = Thread(target=inference, args=(infer_quit_event,self.batch_size,self.input_latent_list_cycle,
                                           self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
                                           self.vae, self.unet, self.pe,self.timesteps)) #mp.Process
        infer_thread.start()

        process_quit_event = Event()
        process_thread = Thread(target=self.process_frames, args=(process_quit_event,loop,audio_track,video_track))
        process_thread.start()

        try:
            while not quit_event.is_set(): #todo
                self.asr.run_step()
                # Back-pressure: pause while the outgoing video queue is well ahead of the consumer.
                if video_track and video_track._queue.qsize()>=1.5*self.opt.batch_size:
                    logger.debug('sleep qsize=%d',video_track._queue.qsize())
                    time.sleep(0.04*video_track._queue.qsize()*0.8)
        finally:
            logger.info('musereal thread stop')

            # Stop the producer first, then the consumer, so queued frames can still drain.
            infer_quit_event.set()
            infer_thread.join()

            process_quit_event.set()
            process_thread.join()
            


================================================
FILE: musetalk/genavatar.py
================================================
import argparse
import glob
import json
import os
import pickle
import shutil

import cv2
import numpy as np
import torch
# import torchvision.transforms as transforms
# from PIL import Image
# from diffusers import AutoencoderKL
# from face_alignment import NetworkSize
# from mmpose.apis import inference_topdown, init_model
# from mmpose.structures import merge_data_samples
from tqdm import tqdm

from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs
from musetalk.utils.blending import get_image_prepare_material
from musetalk.utils.utils import load_all_model

try:
    from utils.face_parsing import FaceParsing
except ModuleNotFoundError:
    from musetalk.utils.face_parsing import FaceParsing


def video2imgs(vid_path, save_path, ext='.png', cut_frame=10000000):
    """Dump the frames of a video as numbered image files.

    Every frame gets a small "LiveTalking" text stamp and is written to
    ``<save_path>/<8-digit index><ext>``.

    Args:
        vid_path: Path of the video to read.
        save_path: Existing directory that receives the images.
        ext: Image file extension, with or without the leading dot
            (``'png'`` and ``'.png'`` are equivalent). An empty value falls
            back to ``'.png'``.
        cut_frame: Extraction stops once the frame counter exceeds this
            value, i.e. at most ``cut_frame + 1`` frames are written.
    """
    if not ext:
        ext = '.png'
    suffix = ext if ext.startswith('.') else f'.{ext}'

    cap = cv2.VideoCapture(vid_path)
    try:
        count = 0
        while count <= cut_frame:
            ret, frame = cap.read()
            if not ret:
                # End of stream or read failure.
                break
            cv2.putText(frame, "LiveTalking", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (128,128,128), 1)
            cv2.imwrite(f"{save_path}/{count:08d}{suffix}", frame)
            count += 1
    finally:
        # Always give the capture handle back, even if writing a frame fails.
        cap.release()

'''
def read_imgs(img_list):
    frames = []
    print('reading images...')
    for img_path in tqdm(img_list):
        frame = cv2.imread(img_path)
        frames.append(frame)
    return frames


def get_landmark_and_bbox(img_list, upperbondrange=0):
    frames = read_imgs(img_list)
    batch_size_fa = 1
    batches = [frames[i:i + batch_size_fa] for i in range(0, len(frames), batch_size_fa)]
    coords_list = []
    landmarks = []
    if upperbondrange != 0:
        print('get key_landmark and face bounding boxes with the bbox_shift:', upperbondrange)
    else:
        print('get key_landmark and face bounding boxes with the default value')
    average_range_minus = []
    average_range_plus = []
    coord_placeholder = (0.0, 0.0, 0.0, 0.0)
    for fb in tqdm(batches):
        results = inference_topdown(model, np.asarray(fb)[0])
        results = merge_data_samples(results)
        keypoints = results.pred_instances.keypoints
        face_land_mark = keypoints[0][23:91]
        face_land_mark = face_land_mark.astype(np.int32)

        # get bounding boxes by face detetion
        bbox = fa.get_detections_for_batch(np.asarray(fb))

        # adjust the bounding box refer to landmark
        # Add the bounding box to a tuple and append it to the coordinates list
        for j, f in enumerate(bbox):
            if f is None:  # no face in the image
                coords_list += [coord_placeholder]
                continue

            half_face_coord = face_land_mark[29]  # np.mean([face_land_mark[28], face_land_mark[29]], axis=0)
            range_minus = (face_land_mark[30] - face_land_mark[29])[1]
            range_plus = (face_land_mark[29] - face_land_mark[28])[1]
            average_range_minus.append(range_minus)
            average_range_plus.append(range_plus)
            if upperbondrange != 0:
                half_face_coord[1] = upperbondrange + half_face_coord[1]  # 手动调整  + 向下（偏29）  - 向上（偏28）
            half_face_dist = np.max(face_land_mark[:, 1]) - half_face_coord[1]
            upper_bond = half_face_coord[1] - half_face_dist

            f_landmark = (
                np.min(face_land_mark[:, 0]), int(upper_bond), np.max(face_land_mark[:, 0]),
                np.max(face_land_mark[:, 1]))
            x1, y1, x2, y2 = f_landmark

            if y2 - y1 <= 0 or x2 - x1 <= 0 or x1 < 0:  # if the landmark bbox is not suitable, reuse the bbox
                coords_list += [f]
                w, h = f[2] - f[0], f[3] - f[1]
                print("error bbox:", f)
            else:
                coords_list += [f_landmark]
    return coords_list, frames


class FaceAlignment:
    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
                 device='cuda', flip_input=False, face_detector='sfd', verbose=False):
        self.device = device
        self.flip_input = flip_input
        self.landmarks_type = landmarks_type
        self.verbose = verbose

        network_size = int(network_size)
        if 'cuda' in device:
            torch.backends.cudnn.benchmark = True
            #             torch.backends.cuda.matmul.allow_tf32 = False
            #             torch.backends.cudnn.benchmark = True
            #             torch.backends.cudnn.deterministic = False
            #             torch.backends.cudnn.allow_tf32 = True
            print('cuda start')

        # Get the face detector
        face_detector_module = __import__('face_detection.detection.' + face_detector,
                                          globals(), locals(), [face_detector], 0)

        self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose)

    def get_detections_for_batch(self, images):
        images = images[..., ::-1]
        detected_faces = self.face_detector.detect_from_batch(images.copy())
        results = []

        for i, d in enumerate(detected_faces):
            if len(d) == 0:
                results.append(None)
                continue
            d = d[0]
            d = np.clip(d, 0, None)

            x1, y1, x2, y2 = map(int, d[:-1])
            results.append((x1, y1, x2, y2))
        return results


def get_mask_tensor():
    """
    Creates a mask tensor for image processing.
    :return: A mask tensor.
    """
    mask_tensor = torch.zeros((256, 256))
    mask_tensor[:256 // 2, :] = 1
    mask_tensor[mask_tensor < 0.5] = 0
    mask_tensor[mask_tensor >= 0.5] = 1
    return mask_tensor


def preprocess_img(img_name, half_mask=False):
    window = []
    if isinstance(img_name, str):
        window_fnames = [img_name]
        for fname in window_fnames:
            img = cv2.imread(fname)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (256, 256),
                             interpolation=cv2.INTER_LANCZOS4)
            window.append(img)
    else:
        img = cv2.cvtColor(img_name, cv2.COLOR_BGR2RGB)
        window.append(img)
    x = np.asarray(window) / 255.
    x = np.transpose(x, (3, 0, 1, 2))
    x = torch.squeeze(torch.FloatTensor(x))
    if half_mask:
        x = x * (get_mask_tensor() > 0.5)
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    x = normalize(x)
    x = x.unsqueeze(0)  # [1, 3, 256, 256] torch tensor
    x = x.to(device)
    return x


def encode_latents(image):
    with torch.no_grad():
        init_latent_dist = vae.encode(image.to(vae.dtype)).latent_dist
    init_latents = vae.config.scaling_factor * init_latent_dist.sample()
    return init_latents


def get_latents_for_unet(img):
    ref_image = preprocess_img(img, half_mask=True)  # [1, 3, 256, 256] RGB, torch tensor
    masked_latents = encode_latents(ref_image)  # [1, 4, 32, 32], torch tensor
    ref_image = preprocess_img(img, half_mask=False)  # [1, 3, 256, 256] RGB, torch tensor
    ref_latents = encode_latents(ref_image)  # [1, 4, 32, 32], torch tensor
    latent_model_input = torch.cat([masked_latents, ref_latents], dim=1)
    return latent_model_input


def get_crop_box(box, expand):
    x, y, x1, y1 = box
    x_c, y_c = (x + x1) // 2, (y + y1) // 2
    w, h = x1 - x, y1 - y
    s = int(max(w, h) // 2 * expand)
    crop_box = [x_c - s, y_c - s, x_c + s, y_c + s]
    return crop_box, s


def face_seg(image):
    seg_image = fp(image)
    if seg_image is None:
        print("error, no person_segment")
        return None

    seg_image = seg_image.resize(image.size)
    return seg_image


def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand=1.2):
    body = Image.fromarray(image[:, :, ::-1])

    x, y, x1, y1 = face_box
    # print(x1-x,y1-y)
    crop_box, s = get_crop_box(face_box, expand)
    x_s, y_s, x_e, y_e = crop_box

    face_large = body.crop(crop_box)
    ori_shape = face_large.size

    mask_image = face_seg(face_large)
    mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))
    mask_image = Image.new('L', ori_shape, 0)
    mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))

    # keep upper_boundary_ratio of talking area
    width, height = mask_image.size
    top_boundary = int(height * upper_boundary_ratio)
    modified_mask_image = Image.new('L', ori_shape, 0)
    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))

    blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
    return mask_array, crop_box
'''

## todo: this only looks at the file suffix; switch to a content-based check (e.g. magic) if more accuracy is needed
def is_video_file(file_path):
    """Return True when ``file_path`` ends in one of the known video extensions.

    The comparison is case-insensitive. Extend the tuple below to recognise
    more container formats.
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() in ('.mp4', '.mkv', '.flv', '.avi', '.mov')


def create_dir(dir_path):
    """Create ``dir_path`` (and any missing parents) unless something already exists there.

    Args:
        dir_path: Directory path to create.
    """
    if not os.path.exists(dir_path):
        # exist_ok=True covers the window between the check above and the creation:
        # another process creating the same directory in between no longer raises.
        os.makedirs(dir_path, exist_ok=True)


# Absolute path of the directory that contains this script; create_musetalk_human
# builds its output locations relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))


def create_musetalk_human(file, avatar_id):
    """Build the on-disk assets of a MuseTalk avatar from a video, an image or a folder of PNGs.

    Output goes to ``<this script's dir>/data/avatars/<avatar_id>``: numbered
    frames in ``full_imgs``, blending masks in ``mask``, plus ``coords.pkl``,
    ``mask_coords.pkl``, ``latents.pt`` and ``avator_info.json``.

    Frames in which no face was found (their bounding box equals the
    placeholder) are dropped from *all* saved lists, so frames, coordinates,
    masks and latents stay index-aligned.

    Relies on the module-level ``args``, ``vae`` and ``fp`` objects that the
    ``__main__`` section of this script creates.

    Args:
        file: Path to a video file, a single image file, or a directory of ``.png`` files.
        avatar_id: Name of the avatar folder to create.
    """
    # Output locations (normally no need to change these).
    save_path = os.path.join(current_dir, f'./data/avatars/{avatar_id}')
    save_full_path = os.path.join(current_dir, f'./data/avatars/{avatar_id}/full_imgs')
    mask_out_path = os.path.join(current_dir, f'./data/avatars/{avatar_id}/mask')
    create_dir(save_path)
    create_dir(save_full_path)
    create_dir(mask_out_path)

    mask_coords_path = os.path.join(save_path, 'mask_coords.pkl')
    coords_path = os.path.join(save_path, 'coords.pkl')
    latents_out_path = os.path.join(save_path, 'latents.pt')

    with open(os.path.join(save_path, 'avator_info.json'), "w") as f:
        json.dump({
            "avatar_id": avatar_id,
            "video_path": file,
            "bbox_shift": args.bbox_shift
        }, f)

    # Collect the source frames into full_imgs.
    if os.path.isfile(file):
        if is_video_file(file):
            video2imgs(file, save_full_path, ext='png')
        else:
            shutil.copyfile(file, f"{save_full_path}/{os.path.basename(file)}")
    else:
        for filename in sorted(os.listdir(file)):
            if filename.split(".")[-1] == "png":
                shutil.copyfile(f"{file}/{filename}", f"{save_full_path}/{filename}")

    input_img_list = sorted(glob.glob(os.path.join(save_full_path, '*.[jpJP][pnPN]*[gG]')))
    print("extracting landmarks...")
    coord_list, frame_list = get_landmark_and_bbox(input_img_list, args.bbox_shift)

    # Marker used by get_landmark_and_bbox for frames without a usable face box.
    coord_placeholder = (0.0, 0.0, 0.0, 0.0)

    frame_list_cycle = []
    coord_list_cycle = []
    input_latent_list_cycle = []
    skipped = 0
    for bbox, frame in zip(coord_list, frame_list):
        if bbox == coord_placeholder:
            # No face: drop the frame everywhere instead of only skipping its latent,
            # otherwise latents would no longer line up with frames/coords/masks.
            skipped += 1
            continue
        x1, y1, x2, y2 = bbox
        if args.version == "v15":
            # v15 extends the box downwards, clamped to the frame height.
            y2 = min(y2 + args.extra_margin, frame.shape[0])
            bbox = [x1, y1, x2, y2]
        crop_frame = frame[y1:y2, x1:x2]
        resized_crop_frame = cv2.resize(crop_frame, (256, 256), interpolation=cv2.INTER_LANCZOS4)
        input_latent_list_cycle.append(vae.get_latents_for_unet(resized_crop_frame))
        frame_list_cycle.append(frame)
        coord_list_cycle.append(bbox)

    if skipped:
        # Previously extracted frames with higher numbers are left in full_imgs untouched.
        print(f"warning: skipped {skipped} frame(s) without a detected face")

    mode = args.parsing_mode if args.version == "v15" else "raw"
    mask_coords_list_cycle = []
    mask_list_cycle = []
    for i, frame in enumerate(tqdm(frame_list_cycle)):
        cv2.imwrite(f"{save_full_path}/{str(i).zfill(8)}.png", frame)

        x1, y1, x2, y2 = coord_list_cycle[i]
        mask, crop_box = get_image_prepare_material(frame, [x1, y1, x2, y2], fp=fp, mode=mode)
        cv2.imwrite(f"{mask_out_path}/{str(i).zfill(8)}.png", mask)

        mask_coords_list_cycle.append(crop_box)
        mask_list_cycle.append(mask)

    with open(mask_coords_path, 'wb') as f:
        pickle.dump(mask_coords_list_cycle, f)

    with open(coords_path, 'wb') as f:
        pickle.dump(coord_list_cycle, f)
    torch.save(input_latent_list_cycle, latents_out_path)


# initialize the mmpose model
# device = "cuda" if torch.cuda.is_available() else ("mps" if (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) else "cpu")
# fa = FaceAlignment(1, flip_input=False, device=device)
# config_file = os.path.join(current_dir, 'utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py')
# checkpoint_file = os.path.abspath(os.path.join(current_dir, '../models/dwpose/dw-ll_ucoco_384.pth'))
# model = init_model(config_file, checkpoint_file, device=device)
# vae = AutoencoderKL.from_pretrained(os.path.abspath(os.path.join(current_dir, '../models/sd-vae-ft-mse')))
# vae.to(device)
# fp = FaceParsing(os.path.abspath(os.path.join(current_dir, '../models/face-parse-bisent/resnet18-5c106cde.pth')),
#                  os.path.abspath(os.path.join(current_dir, '../models/face-parse-bisent/79999_iter.pth')))
if __name__ == '__main__':
    # Command-line options; --file is the source video / image / image folder.
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", type=str, default=r'D:\ok\00000000.png')
    parser.add_argument("--avatar_id", type=str, default='musetalk_avatar1')
    parser.add_argument("--version", type=str, default="v15", choices=["v1", "v15"], help="Version of MuseTalk: v1 or v15")
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID to use")
    parser.add_argument("--left_cheek_width", type=int, default=90, help="Width of left cheek region")
    parser.add_argument("--right_cheek_width", type=int, default=90, help="Width of right cheek region")
    parser.add_argument("--bbox_shift", type=int, default=0, help="Bounding box shift value")
    parser.add_argument("--extra_margin", type=int, default=10, help="Extra margin for face cropping")
    parser.add_argument("--parsing_mode", default='jaw', help="Face blending parsing mode")
    args = parser.parse_args()

    # Use the requested GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    # Load the networks; only the VAE is converted to half precision and moved here.
    vae, unet, pe = load_all_model(device=device)
    vae.vae = vae.vae.half().to(device)

    # The v15 face parser takes the cheek widths; v1 uses the parser defaults.
    fp = (
        FaceParsing(left_cheek_width=args.left_cheek_width,
                    right_cheek_width=args.right_cheek_width)
        if args.version == "v15"
        else FaceParsing()
    )

    create_musetalk_human(args.file, args.avatar_id)


================================================
FILE: musetalk/myutil.py
================================================
import numpy as np
import cv2
import copy

def get_image_blending(image,face,face_box,mask_array,crop_box):
    """Paste ``face`` into ``image`` and soften the seam with a blending mask.

    ``image`` is modified in place and also returned.

    The crop box may extend past the image border; it is clamped to the
    image, and the mask is cropped by the same amount so both regions keep
    identical shapes.

    Args:
        image: Full frame as an ``(H, W, C)`` numpy array; written to in place.
        face: Face patch whose size matches ``face_box``.
        face_box: ``(x, y, x1, y1)`` position of the face inside ``image``.
        mask_array: Blend mask covering ``crop_box``; it is converted with
            ``COLOR_BGR2GRAY``, so a 3-channel image is expected.
        crop_box: ``(x_s, y_s, x_e, y_e)`` enlarged region around the face.

    Returns:
        The same ``image`` object with the blended region written back.
    """
    body = image
    x, y, x1, y1 = face_box
    x_s, y_s, x_e, y_e = crop_box

    # Clamp the crop box to the image; negative or oversized bounds would
    # otherwise be interpreted as wrap-around / truncated numpy slices.
    img_h, img_w = body.shape[:2]
    cx_s, cy_s = max(x_s, 0), max(y_s, 0)
    cx_e, cy_e = min(x_e, img_w), min(y_e, img_h)

    face_large = body[cy_s:cy_e, cx_s:cx_e].copy()
    face_large[y-cy_s:y1-cy_s, x-cx_s:x1-cx_s]=face

    mask_image = cv2.cvtColor(mask_array,cv2.COLOR_BGR2GRAY)
    mask_image = (mask_image/255).astype(np.float32)
    # Keep only the part of the mask that corresponds to the clamped region.
    mask_image = np.ascontiguousarray(mask_image[cy_s-y_s:cy_e-y_s, cx_s-x_s:cx_e-x_s])

    # Weighted mix: mask selects the pasted face, (1 - mask) the original pixels.
    body[cy_s:cy_e, cx_s:cx_e] = cv2.blendLinear(face_large,body[cy_s:cy_e, cx_s:cx_e],mask_image,1-mask_image)

    return body

================================================
FILE: musetalk/utils/__init__.py
================================================
import sys
from os.path import abspath, dirname
# Directory that contains this __init__.py.
current_dir = dirname(abspath(__file__))
# One level above that directory.
parent_dir = dirname(current_dir)
# Add "<parent_dir>/utils" to the import search path so modules inside it can be imported by bare name.
sys.path.append(parent_dir+'/utils')


================================================
FILE: musetalk/utils/audio_processor.py
================================================
import math
import os

import librosa
import numpy as np
import torch
from einops import rearrange
from transformers import AutoFeatureExtractor


class AudioProcessor:
    """Turns audio files into Whisper encoder features sliced per video frame."""

    def __init__(self, feature_extractor_path="openai/whisper-tiny/"):
        """Load the feature extractor from ``feature_extractor_path``."""
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(feature_extractor_path)

    def get_audio_feature(self, wav_path, start_index=0, weight_dtype=None):
        """Load a wav file at 16 kHz and extract input features in 30-second segments.

        Args:
            wav_path: Path of the audio file.
            start_index: Unused.
            weight_dtype: Optional dtype the extracted features are cast to.

        Returns:
            ``None`` when ``wav_path`` does not exist, otherwise a tuple
            ``(features, num_samples)`` where ``features`` is a list with one
            tensor per 30-second segment.
        """
        if not os.path.exists(wav_path):
            return None
        waveform, sampling_rate = librosa.load(wav_path, sr=16000)
        assert sampling_rate == 16000

        samples_per_segment = 30 * sampling_rate
        features = []
        for offset in range(0, len(waveform), samples_per_segment):
            segment_feature = self.feature_extractor(
                waveform[offset:offset + samples_per_segment],
                return_tensors="pt",
                sampling_rate=sampling_rate
            ).input_features
            if weight_dtype is not None:
                segment_feature = segment_feature.to(dtype=weight_dtype)
            features.append(segment_feature)

        return features, len(waveform)

    def get_whisper_chunk(
        self,
        whisper_input_features,
        device,
        weight_dtype,
        whisper,
        librosa_length,
        fps=25,
        audio_padding_length_left=2,
        audio_padding_length_right=2,
    ):
        """Encode the input features with Whisper and cut one feature window per video frame.

        Args:
            whisper_input_features: Iterable of per-segment input feature tensors.
            device: Device the features are moved to before encoding.
            weight_dtype: Dtype the features are cast to before encoding.
            whisper: Model exposing ``encoder(..., output_hidden_states=True)``.
            librosa_length: Number of audio samples (at 16 kHz) in the source audio.
            fps: Video frame rate; truncated to an int.
            audio_padding_length_left: Context frames before the current frame.
            audio_padding_length_right: Context frames after the current frame.

        Returns:
            Tensor with one row per video frame; the time and hidden-state-layer
            axes of each window are merged into one.
        """
        window = 2 * (audio_padding_length_left + audio_padding_length_right + 1)

        # Encode every 30s segment and stack all hidden states on a new axis 2.
        encoded_segments = []
        for input_feature in whisper_input_features:
            input_feature = input_feature.to(device).to(weight_dtype)
            hidden_states = whisper.encoder(input_feature, output_hidden_states=True).hidden_states
            encoded_segments.append(torch.stack(hidden_states, dim=2))
        whisper_feature = torch.cat(encoded_segments, dim=1)

        # Drop the encoder output that belongs to the padding of the last segment.
        sr = 16000
        audio_fps = 50
        fps = int(fps)
        whisper_idx_multiplier = audio_fps / fps
        duration = librosa_length / sr
        num_frames = math.floor(duration * fps)
        actual_length = math.floor(duration * audio_fps)
        whisper_feature = whisper_feature[:,:actual_length,...]

        # Zero-pad both ends; the right side gets extra padding to prevent out-of-bounds windows.
        padding_nums = math.ceil(whisper_idx_multiplier)
        left_pad = torch.zeros_like(whisper_feature[:, :padding_nums * audio_padding_length_left])
        right_pad = torch.zeros_like(whisper_feature[:, :padding_nums * 3 * audio_padding_length_right])
        whisper_feature = torch.cat([left_pad, whisper_feature, right_pad], 1)

        audio_prompts = []
        for frame_index in range(num_frames):
            try:
                audio_index = math.floor(frame_index * whisper_idx_multiplier)
                audio_clip = whisper_feature[:, audio_index: audio_index + window]
                assert audio_clip.shape[1] == window
                audio_prompts.append(audio_clip)
            except Exception as e:
                print(f"Error occurred: {e}")
                print(f"whisper_feature.shape: {whisper_feature.shape}")
                print(f"audio_clip.shape: {audio_clip.shape}")
                print(f"num frames: {num_frames}, fps: {fps}, whisper_idx_multiplier: {whisper_idx_multiplier}")
                print(f"frame_index: {frame_index}, audio_index: {audio_index}-{audio_index + window}")
                exit()

        audio_prompts = torch.cat(audio_prompts, dim=0)  # T, 10, 5, 384
        return rearrange(audio_prompts, 'b c h w -> b (c h) w')

if __name__ == "__main__":
    # Small manual check: extract features from a local wav file and report their sizes.
    audio_processor = AudioProcessor()
    wav_path = "./2.wav"
    result = audio_processor.get_audio_feature(wav_path)
    if result is None:
        # get_audio_feature returns None when the file does not exist.
        print(f"Audio file not found: {wav_path}")
    else:
        audio_feature, librosa_feature_length = result
        # audio_feature is a list with one tensor per 30-second segment, so report each shape.
        print("Audio Feature shape:", [tuple(feature.shape) for feature in audio_feature])
        print("librosa_feature_length:", librosa_feature_length)



================================================
FILE: musetalk/utils/blending.py
================================================
from PIL import Image
import numpy as np
import cv2
import copy


def get_crop_box(box, expand):
    """Build a square crop box centred on ``box``.

    Args:
        box: Bounding box as ``(x, y, x1, y1)``.
        expand: Multiplier applied to half of the longer box side.

    Returns:
        tuple: ``(crop_box, s)`` where ``crop_box`` is the list
        ``[x_c - s, y_c - s, x_c + s, y_c + s]`` and ``s`` is the integer
        half side length of the square.
    """
    left, top, right, bottom = box
    center_x = (left + right) // 2
    center_y = (top + bottom) // 2
    longest_side = max(right - left, bottom - top)
    # Halve with floor division first, then scale, then truncate to int.
    half_side = int(longest_side // 2 * expand)
    square = [
        center_x - half_side,
        center_y - half_side,
        center_x + half_side,
        center_y + half_side,
    ]
    return square, half_side


def face_seg(image, mode="raw", fp=None):
    """
    Run the face parser on an image and return a mask sized like the input.

    Args:
        image (PIL.Image): Input image.
        mode: Forwarded to the parser as the ``mode`` keyword.
        fp: Face-parsing callable, invoked as ``fp(image, mode=mode)``.

    Returns:
        PIL.Image: The parser output resized to ``image.size``, or ``None``
        (after printing an error) when the parser returns ``None``.
    """
    parsed = fp(image, mode=mode)
    if parsed is not None:
        # Bring the mask back to the resolution of the input image.
        return parsed.resize(image.size)

    print("error, no person_segment")
    return None


def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1.5, mode="raw", fp=None):
    """
    Paste a face crop back into the full frame, blended through a blurred
    face-parsing mask.

    Args:
        image (numpy.ndarray): Original full frame (the "body" image).
        face (numpy.ndarray): Face crop to paste back.
        face_box (tuple): Face bounding box (x, y, x1, y1) in frame coordinates.
        upper_boundary_ratio (float): Fraction of the expanded crop height,
            measured from the top, in which the mask is forced to 0; only mask
            rows at or below that boundary are kept.
        expand (float): Expansion factor passed to ``get_crop_box``.
        mode: Mask construction mode forwarded to the face parser.
        fp: Face-parsing callable used by ``face_seg``.

    Returns:
        numpy.ndarray: The blended frame, with the channel axis reversed back
        to the order of the input.
    """
    # Convert the numpy arrays to PIL images, reversing the channel axis
    # (presumably BGR -> RGB for OpenCV-style input; confirm against callers).
    body = Image.fromarray(image[:, :, ::-1])  # full frame
    face = Image.fromarray(face[:, :, ::-1])  # face crop

    x, y, x1, y1 = face_box  # face bounding box coordinates
    crop_box, s = get_crop_box(face_box, expand)  # expanded square crop box
    x_s, y_s, x_e, y_e = crop_box  # crop box coordinates
    face_position = (x, y)  # face position in the original frame (not used below)

    # Crop the expanded face region out of the frame, leaving a margin around
    # the face box.
    face_large = body.crop(crop_box)
        
    ori_shape = face_large.size  # (width, height) of the expanded crop

    # Run face parsing on the expanded crop to obtain a mask.
    # NOTE(review): face_seg returns None when the parser finds nothing; the
    # .crop call below would then raise AttributeError.
    mask_image = face_seg(face_large, mode=mode, fp=fp)
    
    mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))  # mask restricted to the face box
    
    mask_image = Image.new('L', ori_shape, 0)  # all-black mask the size of the crop
    mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))  # put the face-box mask back in place
    
    
    # Keep only the part of the mask at or below top_boundary; everything
    # above it stays black.
    width, height = mask_image.size
    top_boundary = int(height * upper_boundary_ratio)  # first row that is kept
    modified_mask_image = Image.new('L', ori_shape, 0)  # fresh all-black mask
    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))  # copy the lower part
    
    
    # Gaussian-blur the mask so the blend edge is soft.
    blur_kernel_size = int(0.05 * ori_shape[0] // 2 * 2) + 1  # odd kernel size, about 5% of the crop width
    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)  # blur
    #mask_array = np.array(modified_mask_image)
    mask_image = Image.fromarray(mask_array)  # blurred mask back to a PIL image
    
    # Paste the face crop into the expanded crop at the face-box location.
    face_large.paste(face, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
    
    # Paste the expanded crop into the frame, using the blurred mask as the
    # paste mask.
    body.paste(face_large, crop_box[:2], mask_image)
    
    body = np.array(body)  # PIL image back to a numpy array

    return body[:, :, ::-1]  # reverse the channel axis again, restoring the input order


def get_image_blending(image, face, face_box, mask_array, crop_box):
    """Blend a face crop into a frame using a precomputed mask and crop box.

    Args:
        image: Full frame as an array; its channel axis is reversed before
            conversion to a PIL image.
        face: Face crop as an array, handled the same way.
        face_box: ``(x, y, x1, y1)`` face box in frame coordinates.
        mask_array: Blend mask array; converted to a single-channel ("L")
            PIL image.
        crop_box: ``(x_s, y_s, x_e, y_e)`` region of the frame that the mask
            corresponds to.

    Returns:
        The blended frame as an array, channel axis reversed back to the
        input order.
    """
    frame_pil = Image.fromarray(image[:, :, ::-1])
    face_pil = Image.fromarray(face[:, :, ::-1])

    left, top, right, bottom = face_box
    crop_left, crop_top, _crop_right, _crop_bottom = crop_box
    # Face box expressed relative to the top-left corner of the crop.
    face_in_crop = (left - crop_left, top - crop_top, right - crop_left, bottom - crop_top)

    patch = frame_pil.crop(crop_box)
    blend_mask = Image.fromarray(mask_array).convert("L")

    patch.paste(face_pil, face_in_crop)
    frame_pil.paste(patch, crop_box[:2], blend_mask)

    return np.array(frame_pil)[:, :, ::-1]


def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand=1.5, fp=None, mode="raw"):
    """Precompute the blend mask and crop box for a frame.

    Args:
        image: Full frame as an array; its channel axis is reversed before
            conversion to a PIL image.
        face_box: ``(x, y, x1, y1)`` face box in frame coordinates.
        upper_boundary_ratio: Fraction of the crop height, measured from the
            top, in which the mask is forced to 0.
        expand: Expansion factor passed to ``get_crop_box``.
        fp: Face-parsing callable used by ``face_seg``.
        mode: Mask construction mode forwarded to the face parser.

    Returns:
        tuple: ``(mask_array, crop_box)`` - the Gaussian-blurred mask as an
        array and the expanded crop box from ``get_crop_box``.
    """
    body = Image.fromarray(image[:,:,::-1])

    x, y, x1, y1 = face_box
    #print(x1-x,y1-y)
    crop_box, s = get_crop_box(face_box, expand)
    x_s, y_s, x_e, y_e = crop_box

    face_large = body.crop(crop_box)
    ori_shape = face_large.size  # (width, height) of the expanded crop

    # NOTE(review): face_seg returns None when the parser finds nothing; the
    # .crop call below would then raise AttributeError.
    mask_image = face_seg(face_large, mode=mode, fp=fp)
    # Keep the parser mask only inside the face box; the rest of the crop is black.
    mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s))
    mask_image = Image.new('L', ori_shape, 0)
    mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s))

    # Zero the mask above top_boundary; only rows at or below it are kept.
    width, height = mask_image.size
    top_boundary = int(height * upper_boundary_ratio)
    modified_mask_image = Image.new('L', ori_shape, 0)
    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))

    # Odd kernel size of about 10% of the crop width (get_image uses 0.05).
    blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
    return mask_array, crop_box


================================================
FILE: musetalk/utils/dwpose/default_runtime.py
================================================
# Runtime defaults for the pose config in this directory, which pulls this
# file in through `_base_ = ['default_runtime.py']`.
default_scope = 'mmpose'

# hooks
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=10),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    # visualization and bad-case analysis are both switched off here
    visualization=dict(type='PoseVisualizationHook', enable=False),
    badcase=dict(
        type='BadCaseAnalysisHook',
        enable=False,
        out_dir='badcase',
        metric_type='loss',
        badcase_thr=5))

# custom hooks
custom_hooks = [
    # Synchronize model buffers such as running_mean and running_var in BN
    # at the end of each epoch
    dict(type='SyncBuffersHook')
]

# multi-processing backend
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'),
)

# visualizer
vis_backends = [
    dict(type='LocalVisBackend'),
    # dict(type='TensorboardVisBackend'),
    # dict(type='WandbVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# logger
log_processor = dict(
    type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
log_level = 'INFO'
# no checkpoint is loaded and no run is resumed by default
load_from = None
resume = False

# file I/O backend
backend_args = dict(backend='local')

# training/validation/testing progress
train_cfg = dict(by_epoch=True)
val_cfg = dict()
test_cfg = dict()


================================================
FILE: musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py
================================================
# Inherit the runtime defaults from the sibling file in this directory
# (the commented line is the previous relative path).
#_base_ = ['../../../_base_/default_runtime.py']
_base_ = ['default_runtime.py']

# runtime
max_epochs = 270
stage2_num_epochs = 30  # length of the final stage that uses train_pipeline_stage2
base_lr = 4e-3
train_batch_size = 32
val_batch_size = 32

train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)

# optimizer
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))

# learning rate
param_scheduler = [
    # linear warm-up, iteration based (by_epoch=False), over iterations 0-1000
    dict(
        type='LinearLR',
        start_factor=1.0e-5,
        by_epoch=False,
        begin=0,
        end=1000),
    dict(
        # cosine annealing over the second half of training:
        # epochs max_epochs // 2 to max_epochs (135 to 270 with the values above)
        type='CosineAnnealingLR',
        eta_min=base_lr * 0.05,
        begin=max_epochs // 2,
        end=max_epochs,
        T_max=max_epochs // 2,
        by_epoch=True,
        convert_to_iter_based=True),
]

# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=512)

# codec settings
# The same dict is reused below as the head's decoder and as the encoder of
# the GenerateTarget pipeline step.
codec = dict(
    type='SimCCLabel',
    input_size=(288, 384),
    sigma=(6., 6.93),
    simcc_split_ratio=2.0,
    normalize=False,
    use_dark=False)

# model settings
model = dict(
    type='TopdownPoseEstimator',
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True),
    backbone=dict(
        _scope_='mmdet',  # the backbone type is looked up in the mmdet scope
        type='CSPNeXt',
        arch='P5',
        expand_ratio=0.5,
        deepen_factor=1.,
        widen_factor=1.,
        out_indices=(4, ),
        channel_attention=True,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU'),
        init_cfg=dict(
            type='Pretrained',
            prefix='backbone.',
            checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
            'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth'  # noqa: E501
        )),
    head=dict(
        type='RTMCCHead',
        in_channels=1024,
        out_channels=133,  # presumably the whole-body keypoint count - TODO confirm
        input_size=codec['input_size'],
        in_featuremap_size=(9, 12),  # equals input_size (288, 384) divided by 32
        simcc_split_ratio=codec['simcc_split_ratio'],
        final_layer_kernel_size=7,
        gau_cfg=dict(
            hidden_dims=256,
            s=128,
            expansion_factor=2,
            dropout_rate=0.,
            drop_path=0.,
            act_fn='SiLU',
            use_rel_bias=False,
            pos_enc=False),
        loss=dict(
            type='KLDiscretLoss',
            use_target_weight=True,
            beta=10.,
            label_softmax=True),
        decoder=codec),
    test_cfg=dict(flip_test=True, ))

# base dataset settings
dataset_type = 'UBody2dDataset'
data_mode = 'topdown'
data_root = 'data/UBody/'

backend_args = dict(backend='local')

# UBody scene names; one training dataset entry is generated per scene below.
scenes = [
    'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
    'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
    'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
]

# The list starts with the COCO-WholeBody training set ...
train_datasets = [
    dict(
        type='CocoWholeBodyDataset',
        data_root='data/coco/',
        data_mode=data_mode,
        ann_file='annotations/coco_wholebody_train_v1.0.json',
        data_prefix=dict(img='train2017/'),
        pipeline=[])
]

# ... and gets one UBody dataset appended per scene, each with its own
# annotation file and sample_interval=10.
for scene in scenes:
    train_dataset = dict(
        type=dataset_type,
        data_root=data_root,
        data_mode=data_mode,
        ann_file=f'annotations/{scene}/train_annotations.json',
        data_prefix=dict(img='images/'),
        pipeline=[],
        sample_interval=10)
    train_datasets.append(train_dataset)

# pipelines
# Stage-1 training pipeline: geometric and colour augmentation, with
# CoarseDropout applied at p=1.0.
train_pipeline = [
    dict(type='LoadImage', backend_args=backend_args),
    dict(type='GetBBoxCenterScale'),
    dict(type='RandomFlip', direction='horizontal'),
    dict(type='RandomHalfBody'),
    dict(
        type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='mmdet.YOLOXHSVRandomAug'),
    dict(
        type='Albumentation',
        transforms=[
            dict(type='Blur', p=0.1),
            dict(type='MedianBlur', p=0.1),
            dict(
                type='CoarseDropout',
                max_holes=1,
                max_height=0.4,
                max_width=0.4,
                min_holes=1,
                min_height=0.2,
                min_width=0.2,
                p=1.0),
        ]),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs')
]
# Validation pipeline: load, centre/scale, affine warp to the codec input
# size, pack - no augmentation steps.
val_pipeline = [
    dict(type='LoadImage', backend_args=backend_args),
    dict(type='GetBBoxCenterScale'),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='PackPoseInputs')
]

# Stage-2 training pipeline. Differences from train_pipeline: the bbox
# transform sets shift_factor=0. and CoarseDropout runs at p=0.5.
train_pipeline_stage2 = [
    dict(type='LoadImage', backend_args=backend_args),
    dict(type='GetBBoxCenterScale'),
    dict(type='RandomFlip', direction='horizontal'),
    dict(type='RandomHalfBody'),
    dict(
        type='RandomBBoxTransform',
        shift_factor=0.,
        scale_factor=[0.5, 1.5],
        rotate_factor=90),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='mmdet.YOLOXHSVRandomAug'),
    dict(
        type='Albumentation',
        transforms=[
            dict(type='Blur', p=0.1),
            dict(type='MedianBlur', p=0.1),
            dict(
                type='CoarseDropout',
                max_holes=1,
                max_height=0.4,
                max_width=0.4,
                min_holes=1,
                min_height=0.2,
                min_width=0.2,
                p=0.5),
        ]),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs')
]

# data loaders
# Training draws from the combined COCO-WholeBody + UBody list built above.
train_dataloader = dict(
    batch_size=train_batch_size,
    num_workers=10,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='CombinedDataset',
        metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
        datasets=train_datasets,
        pipeline=train_pipeline,
        test_mode=False,
    ))

# NOTE(review): data_root here is the UBody root ('data/UBody/') while
# ann_file, bbox_file and data_prefix point at COCO paths - confirm how these
# paths are meant to be joined before running validation.
val_dataloader = dict(
    batch_size=val_batch_size,
    num_workers=10,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoWholeBodyDataset',
        data_root=data_root,
        data_mode=data_mode,
        ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
        bbox_file='data/coco/person_detection_results/'
        'COCO_val2017_detections_AP_H_56_person.json',
        data_prefix=dict(img='coco/val2017/'),
        test_mode=True,
        pipeline=val_pipeline,
    ))
# testing reuses the validation loader unchanged
test_dataloader = val_dataloader

# hooks
# Only the checkpoint hook is configured here; presumably it is merged with
# the inherited default_hooks - confirm against the config inheritance rules.
default_hooks = dict(
    checkpoint=dict(
        save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))

custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        priority=49),
    # switch to train_pipeline_stage2 for the last stage2_num_epochs epochs
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - stage2_num_epochs,
        switch_pipeline=train_pipeline_stage2)
]

# evaluators
val_evaluator = dict(
    type='CocoWholeBodyMetric',
    ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
# testing reuses the validation evaluator unchanged
test_evaluator = val_evaluator


================================================
FILE: musetalk/utils/face_detection/README.md
================================================
The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time. 

================================================
FILE: musetalk/utils/face_detection/__init__.py
================================================
# -*- coding: utf-8 -*-

__author__ = """Adrian Bulat"""
__email__ = 'adrian.bulat@nottingham.ac.uk'
__version__ = '1.0.1'

from .api import FaceAlignment, LandmarksType, NetworkSize, YOLOv8_face


================================================
FILE: musetalk/utils/face_detection/api.py
================================================
from __future__ import print_function
import os
import torch
from torch.utils.model_zoo import load_url
from enum import Enum
import numpy as np
import cv2
try:
    import urllib.request as request_file
except BaseException:
    import urllib as request_file

from .models import FAN, ResNetDepth
from .utils import *


class LandmarksType(Enum):
    """Enum class defining the type of landmarks to detect.

    ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
    ``_2halfD`` - these points represent the projection of the 3D points into 2D
    ``_3D`` - detect the points ``(x,y,z)`` in a 3D space

    """
    _2D = 1
    _2halfD = 2
    _3D = 3


class NetworkSize(Enum):
    """Network size selector; only ``LARGE`` is currently enabled.

    Members convert to their numeric value with ``int()``.
    """
    # Sizes that are currently disabled:
    # TINY = 1
    # SMALL = 2
    # MEDIUM = 3
    LARGE = 4

    def __new__(cls, value):
        # Build the member by hand and record its value explicitly.
        instance = object.__new__(cls)
        instance._value_ = value
        return instance

    def __int__(self):
        return self._value_



class FaceAlignment:
    """Thin wrapper that loads a face detector module by name and exposes
    batched face detection."""

    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
                 device='cuda', flip_input=False, face_detector='sfd', verbose=False):
        """Store the options and instantiate the requested face detector.

        Args:
            landmarks_type: Stored on the instance as ``landmarks_type``.
            network_size: Converted with ``int()``; the result is not used
                afterwards.
            device: Device string; when it contains ``'cuda'``, cuDNN
                benchmark mode is enabled.
            flip_input: Stored on the instance as ``flip_input``.
            face_detector: Name of the submodule of
                ``face_detection.detection`` that provides ``FaceDetector``.
            verbose: Stored and forwarded to the detector.
        """
        self.device = device
        self.flip_input = flip_input
        self.landmarks_type = landmarks_type
        self.verbose = verbose

        # Conversion only; the integer is not read again in this class.
        network_size = int(network_size)

        if 'cuda' in device:
            torch.backends.cudnn.benchmark = True
            # Other backend switches that were tried and left disabled:
            #   torch.backends.cuda.matmul.allow_tf32 = False
            #   torch.backends.cudnn.benchmark = True
            #   torch.backends.cudnn.deterministic = False
            #   torch.backends.cudnn.allow_tf32 = True
            print('cuda start')

        # Resolve the detector implementation from its dotted module name.
        detector_module_name = 'face_detection.detection.' + face_detector
        detector_module = __import__(detector_module_name,
                                     globals(), locals(), [face_detector], 0)

        self.face_detector = detector_module.FaceDetector(device=device, verbose=verbose)

    def get_detections_for_batch(self, images):
        """Detect one face per image in a batch.

        Args:
            images: Batch of images; the last axis is reversed before the
                batch is handed to the detector.

        Returns:
            list: One entry per image - ``None`` when the detector returned
            nothing for it, otherwise ``(x1, y1, x2, y2)`` as ints taken from
            the first detection, with negative values clamped to 0.
        """
        reversed_channels = images[..., ::-1]
        per_image = self.face_detector.detect_from_batch(reversed_channels.copy())

        boxes = []
        for candidates in per_image:
            if len(candidates) == 0:
                boxes.append(None)
                continue

            # Only the first detection is used; its last element is dropped
            # before the remaining four values are converted to ints.
            first = np.clip(candidates[0], 0, None)
            x1, y1, x2, y2 = map(int, first[:-1])
            boxes.append((x1, y1, x2, y2))

        return boxes
    
    
class YOLOv8_face:
    """YOLOv8-face detector executed through OpenCV's ``cv2.dnn`` module.

    ``detect`` letterboxes a frame to the network input size, runs the
    network and post-processes the raw outputs into boxes (x, y, w, h),
    confidences, class ids and five facial keypoints per detection.
    """

    def __init__(self, path = 'face_detection/weights/yolov8n-face.onnx', conf_thres=0.2, iou_thres=0.5):
        """Load the network and precompute the anchor grids.

        Args:
            path: Path of the model file handed to ``cv2.dnn.readNet``.
            conf_thres: Minimum confidence for a detection to be kept.
            iou_thres: IoU threshold used by non-maximum suppression.
        """
        # Imported locally so the class does not rely on `math` reaching this
        # module's namespace through a wildcard import.
        import math

        self.conf_threshold = conf_thres
        self.iou_threshold = iou_thres
        self.class_names = ['face']
        self.num_classes = len(self.class_names)
        # Initialize model
        self.net = cv2.dnn.readNet(path)
        self.input_height = 640
        self.input_width = 640
        self.reg_max = 16

        # Bin indices 0..reg_max-1, used to turn each per-side distribution
        # into its expected value in post_process.
        self.project = np.arange(self.reg_max)
        self.strides = (8, 16, 32)
        self.feats_hw = [(math.ceil(self.input_height / self.strides[i]), math.ceil(self.input_width / self.strides[i])) for i in range(len(self.strides))]
        self.anchors = self.make_anchors(self.feats_hw)

    def make_anchors(self, feats_hw, grid_cell_offset=0.5):
        """Generate anchor points for every stride.

        Args:
            feats_hw: One ``(h, w)`` feature-map size per entry of
                ``self.strides``.
            grid_cell_offset: Offset added to every grid index.

        Returns:
            dict: stride -> array of shape ``(h * w, 2)`` holding ``(x, y)``
            anchor coordinates in feature-map units.
        """
        anchor_points = {}
        for i, stride in enumerate(self.strides):
            h,w = feats_hw[i]
            x = np.arange(0, w) + grid_cell_offset  # shift x
            y = np.arange(0, h) + grid_cell_offset  # shift y
            sx, sy = np.meshgrid(x, y)
            # sy, sx = np.meshgrid(y, x)
            anchor_points[stride] = np.stack((sx, sy), axis=-1).reshape(-1, 2)
        return anchor_points

    def softmax(self, x, axis=1):
        """Softmax along ``axis``.

        The per-slice maximum is subtracted before exponentiation. This does
        not change the mathematical result but keeps ``np.exp`` from
        overflowing to ``inf`` (and the quotient from becoming NaN) on large
        inputs.
        """
        shifted = x - np.max(x, axis=axis, keepdims=True)
        x_exp = np.exp(shifted)
        # Use axis=0 for a column vector.
        x_sum = np.sum(x_exp, axis=axis, keepdims=True)
        return x_exp / x_sum

    def resize_image(self, srcimg, keep_ratio=True):
        """Resize (and, when keeping the aspect ratio, pad) to the input size.

        Args:
            srcimg: Source image array (height first, width second).
            keep_ratio: When true and the image is not square, scale the
                longer side to the input size and pad the shorter side with
                black borders split evenly on both sides.

        Returns:
            tuple: ``(img, newh, neww, top, left)`` - the network-sized image,
            the size of the resized content inside it and the padding added
            at the top and at the left.
        """
        # Defaults describe the "no padding" case: content fills the input.
        top, left, newh, neww = 0, 0, self.input_height, self.input_width
        if keep_ratio and srcimg.shape[0] != srcimg.shape[1]:
            hw_scale = srcimg.shape[0] / srcimg.shape[1]
            if hw_scale > 1:
                # Taller than wide: full height, pad left and right.
                newh, neww = self.input_height, int(self.input_width / hw_scale)
                img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
                left = int((self.input_width - neww) * 0.5)
                img = cv2.copyMakeBorder(img, 0, 0, left, self.input_width - neww - left, cv2.BORDER_CONSTANT,
                                         value=(0, 0, 0))  # add border
            else:
                # Wider than tall: full width, pad top and bottom.
                newh, neww = int(self.input_height * hw_scale), self.input_width
                img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
                top = int((self.input_height - newh) * 0.5)
                img = cv2.copyMakeBorder(img, top, self.input_height - newh - top, 0, 0, cv2.BORDER_CONSTANT,
                                         value=(0, 0, 0))
        else:
            img = cv2.resize(srcimg, (self.input_width, self.input_height), interpolation=cv2.INTER_AREA)
        return img, newh, neww, top, left

    def detect(self, srcimg):
        """Run detection on one image.

        Args:
            srcimg: Image array; it is converted with ``COLOR_BGR2RGB`` before
                being resized.

        Returns:
            tuple: ``(det_bboxes, det_conf, det_classid, landmarks)`` as
            produced by ``post_process``.
        """
        input_img, newh, neww, padh, padw = self.resize_image(cv2.cvtColor(srcimg, cv2.COLOR_BGR2RGB))
        # Factors that map coordinates of the resized content back to srcimg.
        scale_h, scale_w = srcimg.shape[0]/newh, srcimg.shape[1]/neww
        input_img = input_img.astype(np.float32) / 255.0

        blob = cv2.dnn.blobFromImage(input_img)
        self.net.setInput(blob)
        outputs = self.net.forward(self.net.getUnconnectedOutLayersNames())
        # if isinstance(outputs, tuple):
        #     outputs = list(outputs)
        # if float(cv2.__version__[:3])>=4.7:
        #     outputs = [outputs[2], outputs[0], outputs[1]] ### original author's note: needed for opencv 4.7, not for opencv 4.5
        # Perform inference on the image
        det_bboxes, det_conf, det_classid, landmarks = self.post_process(outputs, scale_h, scale_w, padh, padw)
        return det_bboxes, det_conf, det_classid, landmarks

    def post_process(self, preds, scale_h, scale_w, padh, padw):
        """Decode raw network outputs into final detections.

        Args:
            preds: Network outputs, one array per detection scale.
            scale_h, scale_w: Factors mapping resized-content coordinates back
                to the source image.
            padh, padw: Top and left padding added by ``resize_image``.

        Returns:
            tuple: ``(boxes, confidences, class_ids, landmarks)``. Boxes are
            ``(x, y, w, h)``; each landmark row holds 15 values
            ``(x, y, score)`` x 5. Four empty arrays are returned when nothing
            survives thresholding and NMS.
        """
        bboxes, scores, landmarks = [], [], []
        for i, pred in enumerate(preds):
            stride = int(self.input_height/pred.shape[2])
            pred = pred.transpose((0, 2, 3, 1))
            
            # Channel layout: 4 * reg_max box bins, class logits, 15 keypoint values.
            box = pred[..., :self.reg_max * 4]
            cls = 1 / (1 + np.exp(-pred[..., self.reg_max * 4:-15])).reshape((-1,1))
            kpts = pred[..., -15:].reshape((-1,15)) ### x1,y1,score1, ..., x5,y5,score5

            # tmp = box.reshape(self.feats_hw[i][0], self.feats_hw[i][1], 4, self.reg_max)
            tmp = box.reshape(-1, 4, self.reg_max)
            # Expected value of each side's distribution over the reg_max bins.
            bbox_pred = self.softmax(tmp, axis=-1)
            bbox_pred = np.dot(bbox_pred, self.project).reshape((-1,4))

            bbox = self.distance2bbox(self.anchors[stride], bbox_pred, max_shape=(self.input_height, self.input_width)) * stride
            kpts[:, 0::3] = (kpts[:, 0::3] * 2.0 + (self.anchors[stride][:, 0].reshape((-1,1)) - 0.5)) * stride
            kpts[:, 1::3] = (kpts[:, 1::3] * 2.0 + (self.anchors[stride][:, 1].reshape((-1,1)) - 0.5)) * stride
            kpts[:, 2::3] = 1 / (1+np.exp(-kpts[:, 2::3]))

            # Undo the letterbox: remove the padding, then rescale to the
            # source image (relies on broadcasting).
            bbox -= np.array([[padw, padh, padw, padh]])
            bbox *= np.array([[scale_w, scale_h, scale_w, scale_h]])
            kpts -= np.tile(np.array([padw, padh, 0]), 5).reshape((1,15))
            kpts *= np.tile(np.array([scale_w, scale_h, 1]), 5).reshape((1,15))

            bboxes.append(bbox)
            scores.append(cls)
            landmarks.append(kpts)

        bboxes = np.concatenate(bboxes, axis=0)
        scores = np.concatenate(scores, axis=0)
        landmarks = np.concatenate(landmarks, axis=0)
    
        bboxes_wh = bboxes.copy()
        bboxes_wh[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]  #### xywh
        classIds = np.argmax(scores, axis=1)
        confidences = np.max(scores, axis=1)  #### max_class_confidence
        
        mask = confidences>self.conf_threshold
        bboxes_wh = bboxes_wh[mask]
        confidences = confidences[mask]
        classIds = classIds[mask]
        landmarks = landmarks[mask]
        
        indices = cv2.dnn.NMSBoxes(bboxes_wh.tolist(), confidences.tolist(), self.conf_threshold,
                                   self.iou_threshold)
        # NMSBoxes may hand back an empty tuple when nothing is kept, and a
        # tuple has no .flatten(); normalise every return shape to a flat
        # integer index array first.
        indices = np.asarray(indices, dtype=int).reshape(-1)
        if len(indices) > 0:
            mlvl_bboxes = bboxes_wh[indices]
            confidences = confidences[indices]
            classIds = classIds[indices]
            landmarks = landmarks[indices]
            return mlvl_bboxes, confidences, classIds, landmarks
        else:
            print('nothing detect')
            return np.array([]), np.array([]), np.array([]), np.array([])

    def distance2bbox(self, points, distance, max_shape=None):
        """Convert anchor points plus side distances into corner boxes.

        Args:
            points: Array of ``(x, y)`` anchor points, one per row.
            distance: Array of ``(left, top, right, bottom)`` distances, one
                row per point.
            max_shape: Optional ``(height, width)``; when given, coordinates
                are clipped to ``[0, width]`` / ``[0, height]``.

        Returns:
            Array with one ``(x1, y1, x2, y2)`` row per point.
        """
        x1 = points[:, 0] - distance[:, 0]
        y1 = points[:, 1] - distance[:, 1]
        x2 = points[:, 0] + distance[:, 2]
        y2 = points[:, 1] + distance[:, 3]
        if max_shape is not None:
            x1 = np.clip(x1, 0, max_shape[1])
            y1 = np.clip(y1, 0, max_shape[0])
            x2 = np.clip(x2, 0, max_shape[1])
            y2 = np.clip(y2, 0, max_shape[0])
        return np.stack([x1, y1, x2, y2], axis=-1)
    
    def draw_detections(self, image, boxes, scores, kpts):
        """Draw boxes, scores and the five keypoints onto ``image`` in place.

        Args:
            image: Image array to draw on.
            boxes: ``(x, y, w, h)`` boxes.
            scores: One confidence per box.
            kpts: One 15-value keypoint row per box.

        Returns:
            The same ``image`` object, after drawing.
        """
        for box, score, kp in zip(boxes, scores, kpts):
            x, y, w, h = box.astype(int)
            # Draw rectangle
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), thickness=3)
            cv2.putText(image, "face:"+str(round(score,2)), (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), thickness=2)
            for i in range(5):
                cv2.circle(image, (int(kp[i * 3]), int(kp[i * 3 + 1])), 4, (0, 255, 0), thickness=-1)
                # cv2.putText(image, str(i), (int(kp[i * 3]), int(kp[i * 3 + 1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), thickness=1)
        return image
    
# Absolute path of the directory that contains this module.
ROOT = os.path.dirname(os.path.abspath(__file__))

================================================
FILE: musetalk/utils/face_detection/detection/__init__.py
================================================
from .core import FaceDetector

================================================
FILE: musetalk/utils/face_detection/detection/core.py
================================================
import logging
import glob
from tqdm import tqdm
import numpy as np
import torch
import cv2


class FaceDetector(object):
    """An abstract class representing a face detector.

    Any other face detection implementation must subclass it. All subclasses
    must implement ``detect_from_image``, that return a list of detected
    bounding boxes. Optionally, for speed considerations detect from path is
    recommended.
    """

    def __init__(self, device, verbose):
        """Store the device and verbosity and validate the device name.

        Arguments:
            device {string} -- must contain ``cpu`` or ``cuda``.
            verbose {bool} -- when true, log a warning for CPU execution and
            an error for an unsupported device.

        Raises:
            ValueError -- if ``device`` contains neither ``cpu`` nor ``cuda``.
        """
        self.device = device
        self.verbose = verbose

        # Bind the logger unconditionally. It used to be created only inside
        # the verbose+CPU branch, so the error path below hit an unbound
        # local instead of raising the intended ValueError.
        logger = logging.getLogger(__name__)

        if verbose and 'cpu' in device:
            logger.warning("Detection running on CPU, this may be potentially slow.")

        if 'cpu' not in device and 'cuda' not in device:
            if verbose:
                logger.error("Expected values for device are: {cpu, cuda} but got: %s", device)
            raise ValueError("Expected values for device are: {cpu, cuda} but got: %s" % (device,))

    def detect_from_image(self, tensor_or_path):
        """Detects faces in a given image.

        This function detects the faces present in a provided BGR(usually)
        image. The input can be either the image itself or the path to it.

        Arguments:
            tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
            to an image or the image itself.

        Example::

            >>> path_to_image = 'data/image_01.jpg'
            ...   detected_faces = detect_from_image(path_to_image)
            [A list of bounding boxes (x1, y1, x2, y2)]
            >>> image = cv2.imread(path_to_image)
            ...   detected_faces = detect_from_image(image)
            [A list of bounding boxes (x1, y1, x2, y2)]

        """
        raise NotImplementedError

    def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True):
        """Detects faces from all the images present in a given directory.

        Arguments:
            path {string} -- a string containing a path that points to the folder containing the images

        Keyword Arguments:
            extensions {list} -- list of strings containing the extensions to
            be considered, in the format ``.extension_name`` (default:
            {['.jpg', '.png']}). The list is only read, never modified.
            recursive {bool} -- whether to scan the folder recursively
            (default: {False})
            show_progress_bar {bool} -- display a progress bar (default: {True})

        Raises:
            ValueError -- if ``extensions`` is empty.

        Example:
        >>> directory = 'data'
        ...   detected_faces = detect_from_directory(directory)
        {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}

        """
        # Always bound; every use below is still gated on self.verbose.
        logger = logging.getLogger(__name__)

        if len(extensions) == 0:
            if self.verbose:
                logger.error("Expected at list one extension, but none was received.")
            raise ValueError("Expected at least one extension, but none was received.")

        if self.verbose:
            logger.info("Constructing the list of images.")
        additional_pattern = '/**/*' if recursive else '/*'
        files = []
        for extension in extensions:
            files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))

        if self.verbose:
            logger.info("Finished searching for images. %s images found", len(files))
            logger.info("Preparing to run the detection.")

        # Maps every image path to whatever detect_from_image returns for it.
        predictions = {}
        for image_path in tqdm(files, disable=not show_progress_bar):
            if self.verbose:
                logger.info("Running the face detector on image: %s", image_path)
            predictions[image_path] = self.detect_from_image(image_path)

        if self.verbose:
            logger.info("The detector was successfully run on all %s images", len(files))

        return predictions

    @property
    def reference_scale(self):
        """Abstract; subclasses must override."""
        raise NotImplementedError

    @property
    def reference_x_shift(self):
        """Abstract; subclasses must override."""
        raise NotImplementedError

    @property
    def reference_y_shift(self):
        """Abstract; subclasses must override."""
        raise NotImplementedError

    @staticmethod
    def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
        """Convert path (represented as a string) or torch.tensor to a numpy.ndarray

        Arguments:
            tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself

        Keyword Arguments:
            rgb {bool} -- for a path, the image read by ``cv2.imread`` has its
            last axis reversed when true. For a tensor or array, the input is
            returned as-is when true and with its last axis reversed (as a
            copy) when false. (default: {True})

        Raises:
            TypeError -- if the input is not a string, tensor or ndarray.
        """
        if isinstance(tensor_or_path, str):
            # NOTE(review): cv2.imread returns None for an unreadable path, so
            # the rgb=True branch then fails on the subscript.
            return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
        elif torch.is_tensor(tensor_or_path):
            # Call cpu in case its coming from cuda
            return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
        elif isinstance(tensor_or_path, np.ndarray):
            return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
        else:
            raise TypeError


================================================
FILE: musetalk/utils/face_detection/detection/sfd/__init__.py
================================================
from .sfd_detector import SFDDetector as FaceDetector

================================================
FILE: musetalk/utils/face_detection/detection/sfd/bbox.py
================================================
from __future__ import print_function
import os
import sys
import cv2
import random
import datetime
import time
import math
import argparse
import numpy as np
import torch

try:
    from iou import IOU
except Exception:
    # Pure-Python fallback, used when the optional compiled `iou` module
    # cannot be imported (original note: the Cython version is ~10x faster).
    # Catching Exception rather than BaseException keeps KeyboardInterrupt
    # and SystemExit from being swallowed during import.
    def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
        """Intersection over union of boxes a and b, each given as corners.

        Returns 0.0 when the boxes do not overlap, and also when the union
        area is zero (both boxes degenerate), which would otherwise divide
        by zero.
        """
        sa = abs((ax2 - ax1) * (ay2 - ay1))
        sb = abs((bx2 - bx1) * (by2 - by1))
        x1, y1 = max(ax1, bx1), max(ay1, by1)
        x2, y2 = min(ax2, bx2), min(ay2, by2)
        w = x2 - x1
        h = y2 - y1
        if w < 0 or h < 0:
            return 0.0
        union = sa + sb - w * h
        if union == 0:
            return 0.0
        return 1.0 * w * h / union


def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
    """Encode a corner-form box relative to an anchor.

    The box is ``(x1, y1, x2, y2)``; the anchor is given by its centre
    ``(axc, ayc)`` and size ``(aww, ahh)``.

    Returns:
        tuple: ``(dx, dy, dw, dh)`` - centre offsets divided by the anchor
        size, and natural logs of the width and height ratios.
    """
    box_w = x2 - x1
    box_h = y2 - y1
    center_x = (x2 + x1) / 2
    center_y = (y2 + y1) / 2
    return (
        (center_x - axc) / aww,
        (center_y - ayc) / ahh,
        math.log(box_w / aww),
        math.log(box_h / ahh),
    )


def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
    """Decode ``(dx, dy, dw, dh)`` offsets back into a corner-form box.

    The anchor is given by its centre ``(axc, ayc)`` and size ``(aww, ahh)``.

    Returns:
        tuple: ``(x1, y1, x2, y2)``.
    """
    center_x = dx * aww + axc
    center_y = dy * ahh + ayc
    width = math.exp(dw) * aww
    height = math.exp(dh) * ahh

    left = center_x - width / 2
    right = center_x + width / 2
    top = center_y - height / 2
    bottom = center_y + height / 2
    return left, top, right, bottom


def nms(dets, thresh):
    """Greedy non-maximum suppression.

    Args:
        dets: 2-D array whose first five columns are
            ``x1, y1, x2, y2, score``.
        thresh: A box is dropped when its overlap ratio with an already
            selected box is greater than this value.

    Returns:
        list: Row indices of the kept boxes, highest score first; an empty
        list when ``dets`` is empty.
    """
    if len(dets) == 0:
        return []

    left, top, right, bottom, conf = (dets[:, col] for col in range(5))
    # Areas use the inclusive-pixel convention (+1 on each side length).
    box_area = (right - left + 1) * (bottom - top + 1)
    remaining = conf.argsort()[::-1]

    selected = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        selected.append(best)

        inter_left = np.maximum(left[best], left[rest])
        inter_top = np.maximum(top[best], top[rest])
        inter_right = np.minimum(right[best], right[rest])
        inter_bottom = np.minimum(bottom[best], bottom[rest])

        inter_w = np.maximum(0.0, inter_right - inter_left + 1)
        inter_h = np.maximum(0.0, inter_bottom - inter_top + 1)
        inter_area = inter_w * inter_h
        overlap = inter_area / (box_area[best] + box_area[rest] - inter_area)

        # Continue with the candidates that do not overlap the pick too much.
        survivors = np.where(overlap <= thresh)[0]
        remaining = rest[survivors]

    return selected


def encode(matched, priors, variances):
    """Turn matched ground-truth boxes into regression targets.

    Args:
        matched: (tensor) ground-truth boxes in point form (x1, y1, x2, y2),
            one per prior. Shape: [num_priors, 4].
        priors: (tensor) prior boxes in centre-size form (cx, cy, w, h).
            Shape: [num_priors, 4].
        variances: (list[float]) two scaling factors; index 0 is applied to the
            centre offsets, index 1 to the log size ratios.
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """
    prior_centers, prior_sizes = priors[:, :2], priors[:, 2:]
    gt_min, gt_max = matched[:, :2], matched[:, 2:]

    # Centre displacement, normalised by prior size and variance.
    center_offset = (gt_min + gt_max) / 2 - prior_centers
    center_offset = center_offset / (variances[0] * prior_sizes)

    # Log of the size ratio, scaled by the second variance.
    size_ratio = (gt_max - gt_min) / prior_sizes
    log_size = torch.log(size_ratio) / variances[1]

    return torch.cat([center_offset, log_size], 1)  # [num_priors, 4]


def decode(loc, priors, variances):
    """Convert predicted offsets back into corner-form boxes.

    Args:
        loc (tensor): predicted offsets (dx, dy, dw, dh).
            Shape: [num_priors, 4]
        priors (tensor): prior boxes in centre-size form (cx, cy, w, h).
            Shape: [num_priors, 4].
        variances: (list[float]) the two scaling factors used when encoding.
    Return:
        tensor of boxes as (x1, y1, x2, y2), Shape: [num_priors, 4]
    """
    prior_centers, prior_sizes = priors[:, :2], priors[:, 2:]

    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, 2:] * variances[1])

    # centre/size -> top-left/bottom-right
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)

def batch_decode(loc, priors, variances):
    """Batched variant of ``decode``: offsets -> corner-form boxes.

    Both inputs are indexed with three dimensions, the last one holding the
    four box values; the leading dimensions broadcast against each other.

    Args:
        loc (tensor): predicted offsets (dx, dy, dw, dh) in the last dimension.
            Shape: [batch, num_priors, 4]
        priors (tensor): prior boxes in centre-size form (cx, cy, w, h).
            Shape: [batch or 1, num_priors, 4].
        variances: (list[float]) the two scaling factors used when encoding.
    Return:
        tensor of boxes as (x1, y1, x2, y2) in the last dimension
    """
    prior_centers, prior_sizes = priors[:, :, :2], priors[:, :, 2:]

    centers = prior_centers + loc[:, :, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, :, 2:] * variances[1])

    # centre/size -> top-left/bottom-right
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 2)


================================================
FILE: musetalk/utils/face_detection/detection/sfd/detect.py
================================================
import torch
import torch.nn.functional as F

import os
import sys
import cv2
import random
import datetime
import math
import argparse
import numpy as np

import scipy.io as sio
import zipfile
from .net_s3fd import s3fd
from .bbox import *


def detect(net, img, device):
    """Run the detector network on one image and return raw candidate boxes.

    Args:
        net: callable returning a sequence of alternating
            (class scores, box regression) tensors, one pair per scale.
        img: numpy image with the 3 channels in the last axis (H x W x 3).
        device: device string (e.g. ``'cpu'``, ``'cuda:0'``) or
            ``torch.device``.

    Returns:
        numpy array with one row ``[x1, y1, x2, y2, score]`` per location whose
        face score exceeds 0.05, or a single all-zero row when there is none.
        No NMS is applied here.
    """
    # Per-channel mean subtraction (presumably the training-set mean).
    img = img - np.array([104, 117, 123])
    img = img.transpose(2, 0, 1)
    img = img.reshape((1,) + img.shape)

    # str() so that torch.device objects work as well as plain strings.
    if 'cuda' in str(device):
        torch.backends.cudnn.benchmark = True

    img = torch.from_numpy(img).float().to(device)
    with torch.no_grad():
        # list() so the in-place softmax below also works for tuple outputs.
        olist = list(net(img))

    num_scales = len(olist) // 2
    for i in range(num_scales):
        olist[i * 2] = F.softmax(olist[i * 2], dim=1)
    olist = [oelem.data.cpu() for oelem in olist]

    variances = [0.1, 0.2]
    bboxlist = []
    for i in range(num_scales):
        ocls, oreg = olist[i * 2], olist[i * 2 + 1]
        stride = 2**(i + 2)    # 4,8,16,32,64,128
        # Channel 1 holds the face probability after the softmax.
        poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
        for Iindex, hindex, windex in poss:
            # Anchor centre in input pixels; the anchor side is 4 * stride.
            axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
            score = ocls[0, 1, hindex, windex]
            loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
            priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
            box = decode(loc, priors, variances)
            x1, y1, x2, y2 = box[0] * 1.0
            bboxlist.append([x1, y1, x2, y2, score])
    bboxlist = np.array(bboxlist)
    if 0 == len(bboxlist):
        # Keep a 2-D result so callers can always index columns.
        bboxlist = np.zeros((1, 5))

    return bboxlist

def batch_detect(net, imgs, device):
    """Run the detector network on a batch of images.

    Args:
        net: callable returning a sequence of alternating
            (class scores, box regression) tensors, one pair per scale.
        imgs: numpy array of images with channels last (B x H x W x 3).
        device: device string (e.g. ``'cpu'``, ``'cuda:0'``) or
            ``torch.device``.

    Returns:
        numpy array of shape (N, B, 5). Each of the N entries corresponds to a
        feature-map location where some image scored above 0.05 and holds
        ``[x1, y1, x2, y2, score]`` for every image in the batch at that
        location. When nothing passes the threshold the result is all zeros
        with shape (1, B, 5). No NMS is applied here.
    """
    # Per-channel mean subtraction (presumably the training-set mean).
    imgs = imgs - np.array([104, 117, 123])
    imgs = imgs.transpose(0, 3, 1, 2)

    # str() so that torch.device objects work as well as plain strings.
    if 'cuda' in str(device):
        torch.backends.cudnn.benchmark = True

    imgs = torch.from_numpy(imgs).float().to(device)
    BB = imgs.size(0)
    with torch.no_grad():
        # list() so the in-place softmax below also works for tuple outputs.
        olist = list(net(imgs))

    num_scales = len(olist) // 2
    for i in range(num_scales):
        olist[i * 2] = F.softmax(olist[i * 2], dim=1)

    olist = [oelem.cpu() for oelem in olist]

    variances = [0.1, 0.2]
    bboxlist = []
    for i in range(num_scales):
        ocls, oreg = olist[i * 2], olist[i * 2 + 1]
        stride = 2**(i + 2)    # 4,8,16,32,64,128
        # Channel 1 holds the face probability after the softmax.
        poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
        for Iindex, hindex, windex in poss:
            # Anchor centre in input pixels; the anchor side is 4 * stride.
            axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
            # Scores and offsets are taken for the whole batch at this location.
            score = ocls[:, 1, hindex, windex]
            loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
            priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
            box = batch_decode(loc, priors, variances)
            box = box[:, 0] * 1.0
            bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
    bboxlist = np.array(bboxlist)
    if 0 == len(bboxlist):
        # Keep a 3-D result so callers can always slice per image.
        bboxlist = np.zeros((1, BB, 5))

    return bboxlist

def flip_detect(net, img, device):
    """Detect on the horizontally mirrored image and map boxes back.

    The image is flipped around the vertical axis, passed to ``detect``, and
    the x coordinates of the resulting boxes are mirrored again so they refer
    to the original image. y coordinates and scores are copied unchanged.
    """
    mirrored = cv2.flip(img, 1)
    dets = detect(net, mirrored, device)
    width = mirrored.shape[1]

    bboxlist = np.zeros(dets.shape)
    # Mirroring swaps the roles of the left and right edges.
    bboxlist[:, [0, 2]] = width - dets[:, [2, 0]]
    bboxlist[:, [1, 3, 4]] = dets[:, [1, 3, 4]]
    return bboxlist


def pts_to_bb(pts):
    """Return the tight bounding box ``[min_x, min_y, max_x, max_y]`` of 2-D points.

    ``pts`` is reduced along axis 0, so each row is one (x, y) point.
    """
    (left, top), (right, bottom) = np.min(pts, axis=0), np.max(pts, axis=0)
    return np.array([left, top, right, bottom])


================================================
FILE: musetalk/utils/face_detection/detection/sfd/net_s3fd.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel scale.

    Args:
        n_channels: number of channels of the input (dimension 1).
        scale: initial value of every per-channel weight.
    """

    def __init__(self, n_channels, scale=1.0):
        super(L2Norm, self).__init__()
        self.n_channels = n_channels
        self.scale = scale
        self.eps = 1e-10
        # Build the weight directly from the scale value. Zeroing an
        # uninitialised tensor by multiplication would keep any NaN/Inf that
        # happened to be in the raw memory.
        self.weight = nn.Parameter(torch.full((self.n_channels,), float(self.scale)))

    def forward(self, x):
        """Divide ``x`` by its L2 norm over dim 1, then scale each channel."""
        # eps is added after the sqrt to avoid dividing by zero.
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        x = x / norm * self.weight.view(1, -1, 1, 1)
        return x


class s3fd(nn.Module):
    """Face detection network with six detection heads.

    A stack of 3x3 convolution stages is followed by ``fc6``/``fc7`` and two
    extra stride-2 stages. Six feature maps are tapped; each feeds one
    classification conv and one box-regression conv. The attribute names are
    the keys of the pretrained state dict and must not be changed.
    """

    def __init__(self):
        super(s3fd, self).__init__()

        def conv3(c_in, c_out):
            # 3x3 convolution that preserves the spatial size
            return nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1)

        def conv1(c_in, c_out):
            # 1x1 convolution
            return nn.Conv2d(c_in, c_out, kernel_size=1, stride=1, padding=0)

        # Backbone stages
        self.conv1_1 = conv3(3, 64)
        self.conv1_2 = conv3(64, 64)

        self.conv2_1 = conv3(64, 128)
        self.conv2_2 = conv3(128, 128)

        self.conv3_1 = conv3(128, 256)
        self.conv3_2 = conv3(256, 256)
        self.conv3_3 = conv3(256, 256)

        self.conv4_1 = conv3(256, 512)
        self.conv4_2 = conv3(512, 512)
        self.conv4_3 = conv3(512, 512)

        self.conv5_1 = conv3(512, 512)
        self.conv5_2 = conv3(512, 512)
        self.conv5_3 = conv3(512, 512)

        # fc6 pads by 3 with a 3x3 kernel, so it enlarges the feature map.
        self.fc6 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=3)
        self.fc7 = conv1(1024, 1024)

        # Extra stages; the second conv of each halves the resolution.
        self.conv6_1 = conv1(1024, 256)
        self.conv6_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv7_1 = conv1(512, 128)
        self.conv7_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        # Normalisation of the three shallow taps
        self.conv3_3_norm = L2Norm(256, scale=10)
        self.conv4_3_norm = L2Norm(512, scale=8)
        self.conv5_3_norm = L2Norm(512, scale=5)

        # Detection heads. The first classifier has 4 outputs (reduced to 2 in
        # forward); every other classifier has 2. Regressors have 4 outputs.
        self.conv3_3_norm_mbox_conf = conv3(256, 4)
        self.conv3_3_norm_mbox_loc = conv3(256, 4)
        self.conv4_3_norm_mbox_conf = conv3(512, 2)
        self.conv4_3_norm_mbox_loc = conv3(512, 4)
        self.conv5_3_norm_mbox_conf = conv3(512, 2)
        self.conv5_3_norm_mbox_loc = conv3(512, 4)

        self.fc7_mbox_conf = conv3(1024, 2)
        self.fc7_mbox_loc = conv3(1024, 4)
        self.conv6_2_mbox_conf = conv3(512, 2)
        self.conv6_2_mbox_loc = conv3(512, 4)
        self.conv7_2_mbox_conf = conv3(256, 2)
        self.conv7_2_mbox_loc = conv3(256, 4)

    def forward(self, x):
        """Return ``[cls1, reg1, ..., cls6, reg6]`` for the six tapped scales."""

        def run(stage, t):
            # conv + ReLU for every layer of a stage
            for conv in stage:
                t = F.relu(conv(t))
            return t

        def halve(t):
            return F.max_pool2d(t, 2, 2)

        h = halve(run((self.conv1_1, self.conv1_2), x))
        h = halve(run((self.conv2_1, self.conv2_2), h))

        f3_3 = run((self.conv3_1, self.conv3_2, self.conv3_3), h)
        f4_3 = run((self.conv4_1, self.conv4_2, self.conv4_3), halve(f3_3))
        f5_3 = run((self.conv5_1, self.conv5_2, self.conv5_3), halve(f4_3))

        ffc7 = run((self.fc6, self.fc7), halve(f5_3))
        f6_2 = run((self.conv6_1, self.conv6_2), ffc7)
        f7_2 = run((self.conv7_1, self.conv7_2), f6_2)

        f3_3 = self.conv3_3_norm(f3_3)
        f4_3 = self.conv4_3_norm(f4_3)
        f5_3 = self.conv5_3_norm(f5_3)

        cls1, reg1 = self.conv3_3_norm_mbox_conf(f3_3), self.conv3_3_norm_mbox_loc(f3_3)
        cls2, reg2 = self.conv4_3_norm_mbox_conf(f4_3), self.conv4_3_norm_mbox_loc(f4_3)
        cls3, reg3 = self.conv5_3_norm_mbox_conf(f5_3), self.conv5_3_norm_mbox_loc(f5_3)
        cls4, reg4 = self.fc7_mbox_conf(ffc7), self.fc7_mbox_loc(ffc7)
        cls5, reg5 = self.conv6_2_mbox_conf(f6_2), self.conv6_2_mbox_loc(f6_2)
        cls6, reg6 = self.conv7_2_mbox_conf(f7_2), self.conv7_2_mbox_loc(f7_2)

        # max-out background label: the first three channels of cls1 are
        # collapsed to their element-wise maximum, the fourth is kept as is.
        bg_a, bg_b, bg_c, fg = torch.chunk(cls1, 4, 1)
        background = torch.max(torch.max(bg_a, bg_b), bg_c)
        cls1 = torch.cat([background, fg], dim=1)

        return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6]


================================================
FILE: musetalk/utils/face_detection/detection/sfd/sfd_detector.py
================================================
import os
import cv2
from torch.utils.model_zoo import load_url

from ..core import FaceDetector

from .net_s3fd import s3fd
from .bbox import *
from .detect import *

# Download locations of pretrained weights, keyed by model name.
# SFDDetector fetches the 's3fd' entry when no local checkpoint file exists.
models_urls = {
    's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth',
}


class SFDDetector(FaceDetector):
    """Face detector backed by the ``s3fd`` network.

    Args:
        device: device the network is moved to and inference runs on.
        path_to_detector: checkpoint file; defaults to ``s3fd.pth`` next to
            this module. When the file does not exist the weights are
            downloaded from ``models_urls['s3fd']``.
        verbose: forwarded to ``FaceDetector``.
    """

    def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False):
        super(SFDDetector, self).__init__(device, verbose)

        # Initialise the face detector.
        # Weights are always deserialised onto the CPU so that a checkpoint
        # saved from CUDA tensors still loads on a CPU-only host; the module
        # is moved to the requested device afterwards.
        if not os.path.isfile(path_to_detector):
            model_weights = load_url(models_urls['s3fd'], map_location='cpu')
        else:
            model_weights = torch.load(path_to_detector, map_location='cpu')

        self.face_detector = s3fd()
        self.face_detector.load_state_dict(model_weights)
        self.face_detector.to(device)
        self.face_detector.eval()

    def detect_from_image(self, tensor_or_path):
        """Detect faces in one image.

        Returns:
            list of rows ``[x1, y1, x2, y2, score]`` that survive NMS
            (overlap threshold 0.3) and have a score above 0.5.
        """
        image = self.tensor_or_path_to_ndarray(tensor_or_path)

        bboxlist = detect(self.face_detector, image, device=self.device)
        keep = nms(bboxlist, 0.3)
        bboxlist = bboxlist[keep, :]
        bboxlist = [x for x in bboxlist if x[-1] > 0.5]

        return bboxlist

    def detect_from_batch(self, images):
        """Detect faces in a batch of images.

        Returns:
            one list per image, each filtered like ``detect_from_image``
            (NMS at 0.3, score above 0.5).
        """
        bboxlists = batch_detect(self.face_detector, images, device=self.device)
        # Axis 1 of the raw result indexes the image within the batch.
        keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
        bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
        bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]

        return bboxlists

    # NOTE(review): the three constants below are presumably consumed by the
    # landmark cropping code of the caller -- confirm against FaceDetector.
    @property
    def reference_scale(self):
        return 195

    @property
    def reference_x_shift(self):
        return 0

    @property
    def reference_y_shift(self):
        return 0


================================================
FILE: musetalk/utils/face_detection/models.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
    """Build a 3x3 ``nn.Conv2d`` (padding 1, stride 1 and no bias by default)."""
    layer = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=strd,
        padding=padding,
        bias=bias,
    )
    return layer


class ConvBlock(nn.Module):
    """Residual block built from three chained BN -> ReLU -> 3x3 conv branches.

    The branches output out_planes/2, out_planes/4 and out_planes/4 channels
    and are concatenated. The input is added back, going through a
    BN -> ReLU -> 1x1 conv projection first when the channel count changes.
    """

    def __init__(self, in_planes, out_planes):
        super(ConvBlock, self).__init__()
        half = int(out_planes / 2)
        quarter = int(out_planes / 4)

        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = conv3x3(in_planes, half)
        self.bn2 = nn.BatchNorm2d(half)
        self.conv2 = conv3x3(half, quarter)
        self.bn3 = nn.BatchNorm2d(quarter)
        self.conv3 = conv3x3(quarter, quarter)

        if in_planes == out_planes:
            # Identity shortcut
            self.downsample = None
        else:
            # Projection shortcut to match the channel count
            self.downsample = nn.Sequential(
                nn.BatchNorm2d(in_planes),
                nn.ReLU(True),
                nn.Conv2d(in_planes, out_planes,
                          kernel_size=1, stride=1, bias=False),
            )

    def forward(self, x):
        # Each branch consumes the output of the previous one.
        branch1 = self.conv1(F.relu(self.bn1(x), True))
        branch2 = self.conv2(F.relu(self.bn2(branch1), True))
        branch3 = self.conv3(F.relu(self.bn3(branch2), True))

        merged = torch.cat((branch1, branch2, branch3), 1)

        shortcut = x if self.downsample is None else self.downsample(x)
        merged += shortcut

        return merged


class Bottleneck(nn.Module):
    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 convolutions, each followed by BN.

    The last convolution outputs ``planes * 4`` channels. ``downsample``, when
    given, is applied to the input before it is added back.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        widened = planes * 4

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # Only the 3x3 convolution carries the stride.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class HourGlass(nn.Module):
    """Recursive hourglass module made of ``ConvBlock`` units.

    Every level has a full-resolution branch (``b1_<level>``) and a
    half-resolution branch (``b2_<level>`` -> next level -> ``b3_<level>``);
    the innermost level uses ``b2_plus_1`` instead of recursing. The
    half-resolution result is upsampled and added to the other branch.
    """

    def __init__(self, num_modules, depth, num_features):
        super(HourGlass, self).__init__()
        self.num_modules = num_modules
        self.depth = depth
        self.features = num_features

        self._generate_network(self.depth)

    def _generate_network(self, level):
        """Register the blocks of ``level`` and, recursively, of all lower levels."""
        tag = str(level)
        width = self.features

        self.add_module('b1_' + tag, ConvBlock(width, width))

        self.add_module('b2_' + tag, ConvBlock(width, width))

        if level <= 1:
            self.add_module('b2_plus_' + tag, ConvBlock(width, width))
        else:
            self._generate_network(level - 1)

        self.add_module('b3_' + tag, ConvBlock(width, width))

    def _forward(self, level, inp):
        tag = str(level)

        # Full-resolution branch
        skip = self._modules['b1_' + tag](inp)

        # Half-resolution branch
        pooled = F.avg_pool2d(inp, 2, stride=2)
        inner = self._modules['b2_' + tag](pooled)

        if level <= 1:
            inner = self._modules['b2_plus_' + tag](inner)
        else:
            inner = self._forward(level - 1, inner)

        inner = self._modules['b3_' + tag](inner)

        # Back to the resolution of the full-resolution branch
        restored = F.interpolate(inner, scale_factor=2, mode='nearest')

        return skip + restored

    def forward(self, x):
        return self._forward(self.depth, x)


class FAN(nn.Module):
    """Stack of hourglass modules that predicts 68-channel heatmaps.

    A convolutional stem brings the 3-channel input to 256 feature channels.
    Each of the ``num_modules`` stages then emits one 68-channel output; every
    stage but the last feeds its features and its prediction back into the
    input of the next stage.
    """

    def __init__(self, num_modules=1):
        super(FAN, self).__init__()
        self.num_modules = num_modules

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = ConvBlock(64, 128)
        self.conv3 = ConvBlock(128, 128)
        self.conv4 = ConvBlock(128, 256)

        # Hourglass stages
        last_stage = self.num_modules - 1
        for stage in range(self.num_modules):
            tag = str(stage)
            self.add_module('m' + tag, HourGlass(1, 4, 256))
            self.add_module('top_m_' + tag, ConvBlock(256, 256))
            self.add_module('conv_last' + tag,
                            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
            self.add_module('bn_end' + tag, nn.BatchNorm2d(256))
            self.add_module('l' + tag,
                            nn.Conv2d(256, 68, kernel_size=1, stride=1, padding=0))

            if stage < last_stage:
                # Projections used to feed this stage back into the next one
                self.add_module('bl' + tag,
                                nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
                self.add_module('al' + tag,
                                nn.Conv2d(68, 256, kernel_size=1, stride=1, padding=0))

    def forward(self, x):
        """Return the list of per-stage heatmap tensors."""
        x = F.relu(self.bn1(self.conv1(x)), True)
        x = F.avg_pool2d(self.conv2(x), 2, stride=2)
        x = self.conv4(self.conv3(x))

        previous = x
        outputs = []
        last_stage = self.num_modules - 1
        for stage in range(self.num_modules):
            tag = str(stage)

            feats = self._modules['m' + tag](previous)
            feats = self._modules['top_m_' + tag](feats)
            feats = self._modules['conv_last' + tag](feats)
            feats = F.relu(self._modules['bn_end' + tag](feats), True)

            # Predict heatmaps
            heatmaps = self._modules['l' + tag](feats)
            outputs.append(heatmaps)

            if stage < last_stage:
                feats = self._modules['bl' + tag](feats)
                remapped = self._modules['al' + tag](heatmaps)
                previous = previous + feats + remapped

        return outputs


class ResNetDepth(nn.Module):
    """ResNet-style regressor over an image stacked with 68 extra channels.

    The first convolution takes ``3 + 68`` input channels and the final linear
    layer outputs ``num_classes`` values.
    """

    def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes=68):
        # Running channel count; _make_layer reads and updates it.
        self.inplanes = 64
        super(ResNetDepth, self).__init__()
        self.conv1 = nn.Conv2d(3 + 68, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight initialisation: normal with std sqrt(2 / fan_out) for
        # convolutions, unit weight and zero bias for batch norm.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks."""
        out_planes = planes * block.expansion

        # The first block needs a projection shortcut whenever it changes the
        # resolution or the channel count.
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            downsample = None

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))

        return nn.Sequential(*stage)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        # Flatten everything but the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        return self.fc(x)


================================================
FILE: musetalk/utils/face_detection/utils.py
================================================
from __future__ import print_function
import os
import sys
import time
import torch
import math
import numpy as np
import cv2


def _gaussian(
        size=3, sigma=0.25, amplitude=1, normalize=False, width=None,
        height=None, sigma_horz=None, sigma_vert=None, mean_horz=0.5,
        mean_vert=0.5):
    # handle some defaults
    if width is None:
        width = size
    if height is None:
        height = size
    if sigma_horz is None:
        sigma_horz = sigma
    if sigma_vert is None:
        sigma_vert = sigma
    center_x = mean_horz * width + 0.5
    center_y = mean_vert * height + 0.5
    gauss = np.empty((height, width), dtype=np.float32)
    # generate kernel
    for i in range(height):
        for j in range(width):
            gauss[i][j] = amplitude * math.exp(-(math.pow((j + 1 - center_x) / (
                sigma_horz * width), 2) / 2.0 + math.pow((i + 1 - center_y) / (sigma_vert * height), 2) / 2.0))
    if normalize:
        gauss = gauss / np.sum(gauss)
    return gauss


def draw_gaussian(image, point, sigma):
    """Add a Gaussian blob centred on ``point`` to ``image`` and clip at 1.

    The blob spans 3 * sigma on each side of the point. All box coordinates
    are 1-based and inclusive. ``image`` is modified in place and returned;
    it is returned untouched when the blob lies entirely outside it.
    """
    reach = 3 * sigma
    left, top = math.floor(point[0] - reach), math.floor(point[1] - reach)
    right, bottom = math.floor(point[0] + reach), math.floor(point[1] + reach)
    img_h, img_w = image.shape[0], image.shape[1]

    # Check if the gaussian is inside
    if (left > img_w or top > img_h or right < 1 or bottom < 1):
        return image

    g = _gaussian(6 * sigma + 1)

    # Part of the image that the blob covers
    img_x0, img_x1 = int(max(1, left)), int(min(right, img_w))
    img_y0, img_y1 = int(max(1, top)), int(min(bottom, img_h))

    # Matching part of the kernel
    g_x0 = int(max(1, -left))
    g_y0 = int(max(1, -top))
    g_x1 = img_x1 - img_x0 + g_x0
    g_y1 = img_y1 - img_y0 + g_y0
    assert (g_x0 > 0 and g_y1 > 0)

    target = image[img_y0 - 1:img_y1, img_x0 - 1:img_x1]
    image[img_y0 - 1:img_y1, img_x0 - 1:img_x1] = target + g[g_y0 - 1:g_y1, g_x0 - 1:g_x1]
    image[image > 1] = 1
    return image


def transform(point, center, scale, resolution, invert=False):
    """Map a 2D point through a scale-and-translate affine transformation.

    The transformation is built from a center, a scale and a target
    resolution: a box of side ``200 * scale`` around ``center`` is mapped onto
    ``[0, resolution]``. If invert is ``True`` the inverse mapping is applied.

    Arguments:
        point {torch.tensor} -- the input 2D point
        center {torch.tensor or numpy.array} -- the center around which to perform the transformations
        scale {float} -- the scale of the face/object
        resolution {float} -- the output resolution

    Keyword Arguments:
        invert {bool} -- whether to apply the direct or the inverse
        transformation (default: {False})

    Returns:
        integer tensor with the two transformed coordinates (truncated)
    """
    homogeneous = torch.ones(3)
    homogeneous[0] = point[0]
    homogeneous[1] = point[1]

    box_side = 200.0 * scale
    matrix = torch.eye(3)
    matrix[0, 0] = resolution / box_side
    matrix[1, 1] = resolution / box_side
    matrix[0, 2] = resolution * (-center[0] / box_side + 0.5)
    matrix[1, 2] = resolution * (-center[1] / box_side + 0.5)

    if invert:
        matrix = torch.inverse(matrix)

    mapped = torch.matmul(matrix, homogeneous)
    return mapped[0:2].int()


def crop(image, center, scale, resolution=256.0):
    """Center crops an image or set of heatmaps

    The crop window is obtained by mapping the corners of the output square
    back to the input through the inverse ``transform``. Parts of the window
    that fall outside the image are left black, and the result is resized to
    ``resolution`` x ``resolution``.

    Arguments:
        image {numpy.array} -- an rgb (H x W x C) or single-channel (H x W) image
        center {numpy.array} -- the center of the object, usually the same as of the bounding box
        scale {float} -- scale of the face

    Keyword Arguments:
        resolution {float} -- the size of the output cropped image (default: {256.0})

    Returns:
        numpy.array -- the uint8 crop, ``resolution`` pixels on each side
    """
    ul = transform([1, 1], center, scale, resolution, True)
    br = transform([resolution, resolution], center, scale, resolution, True)
    # Plain Python ints keep the arithmetic below free of tensor/NumPy mixing.
    ul_x, ul_y = int(ul[0]), int(ul[1])
    br_x, br_y = int(br[0]), int(br[1])

    out_h, out_w = br_y - ul_y, br_x - ul_x
    if image.ndim > 2:
        newImg = np.zeros((out_h, out_w, image.shape[2]), dtype=np.uint8)
    else:
        newImg = np.zeros((out_h, out_w), dtype=np.uint8)

    ht = image.shape[0]
    wd = image.shape[1]
    # 1-based, inclusive ranges: where to write in the crop ...
    new_x0, new_x1 = max(1, -ul_x + 1), min(br_x, wd) - ul_x
    new_y0, new_y1 = max(1, -ul_y + 1), min(br_y, ht) - ul_y
    # ... and where to read from in the source image.
    old_x0, old_x1 = max(1, ul_x + 1), min(br_x, wd)
    old_y0, old_y1 = max(1, ul_y + 1), min(br_y, ht)

    # No trailing channel index, so 2-D and 3-D images are both handled.
    newImg[new_y0 - 1:new_y1, new_x0 - 1:new_x1] = \
        image[old_y0 - 1:old_y1, old_x0 - 1:old_x1]
    newImg = cv2.resize(newImg, dsize=(int(resolution), int(resolution)),
                        interpolation=cv2.INTER_LINEAR)
    return newImg


def _heatmap_peaks(hm):
    """Locate the peak of every heatmap with a quarter-pixel refinement.

    Arguments:
        hm {torch.tensor} -- heatmaps of shape [B, N, H, W]

    Returns:
        float tensor of shape [B, N, 2] holding (x, y) per heatmap: the 1-based
        peak position, nudged by 0.25 towards the larger neighbour, minus 0.5.
    """
    n_rows, n_cols = hm.size(2), hm.size(3)
    _, idx = torch.max(
        hm.view(hm.size(0), hm.size(1), n_rows * n_cols), 2)
    preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float()
    # The flat index is row * n_cols + col, so both coordinates are recovered
    # with the row length (the width), then shifted to 1-based positions.
    preds[..., 0].remainder_(n_cols).add_(1)
    preds[..., 1].div_(n_cols).floor_().add_(1)

    for i in range(preds.size(0)):
        for j in range(preds.size(1)):
            hm_ = hm[i, j, :]
            pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1
            # Refine only where both neighbours exist in each direction.
            if 0 < pX < n_cols - 1 and 0 < pY < n_rows - 1:
                diff = torch.FloatTensor(
                    [hm_[pY, pX + 1] - hm_[pY, pX - 1],
                     hm_[pY + 1, pX] - hm_[pY - 1, pX]])
                preds[i, j].add_(diff.sign_().mul_(.25))

    preds.add_(-.5)
    return preds


def get_preds_fromhm(hm, center=None, scale=None):
    """Obtain (x,y) coordinates given a set of N heatmaps. If the center
    and the scale is provided the function will return the points also in
    the original coordinate frame.

    Arguments:
        hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, H, W]

    Keyword Arguments:
        center {torch.tensor} -- the center of the bounding box (default: {None})
        scale {float} -- face scale (default: {None})

    Returns:
        (preds, preds_orig) -- both of shape [B, N, 2]; preds_orig stays zero
        unless both center and scale are given
    """
    preds = _heatmap_peaks(hm)

    preds_orig = torch.zeros(preds.size())
    if center is not None and scale is not None:
        for i in range(hm.size(0)):
            for j in range(hm.size(1)):
                preds_orig[i, j] = transform(
                    preds[i, j], center, scale, hm.size(2), True)

    return preds, preds_orig

def get_preds_fromhm_batch(hm, centers=None, scales=None):
    """Obtain (x,y) coordinates given a set of N heatmaps. If the centers
    and the scales is provided the function will return the points also in
    the original coordinate frame.

    Arguments:
        hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, H, W]

    Keyword Arguments:
        centers {torch.tensor} -- the centers of the bounding box, one per batch item (default: {None})
        scales {float} -- face scales, one per batch item (default: {None})

    Returns:
        (preds, preds_orig) -- both of shape [B, N, 2]; preds_orig stays zero
        unless both centers and scales are given
    """
    preds = _heatmap_peaks(hm)

    preds_orig = torch.zeros(preds.size())
    if centers is not None and scales is not None:
        for i in range(hm.size(0)):
            for j in range(hm.size(1)):
                preds_orig[i, j] = transform(
                    preds[i, j], centers[i], scales[i], hm.size(2), True)

    return preds, preds_orig

def shuffle_lr(parts, pairs=None):
    """Reorder the point channels so that left and right points swap places.

    Arguments:
        parts {torch.tensor} -- a 3D or 4D object containing the
        heatmaps. The point channel is dimension 0 for 3D input and
        dimension 1 otherwise.

    Keyword Arguments:
        pairs {list of integers} -- order of the flipped points; defaults to a
        built-in permutation of 68 points (default: {None})
    """
    order = pairs
    if order is None:
        order = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35,
                 34, 33, 32, 31, 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41,
                 40, 54, 53, 52, 51, 50, 49, 48, 59, 58, 57, 56, 55, 64, 63,
                 62, 61, 60, 67, 66, 65]

    if parts.ndimension() == 3:
        return parts[order, ...]
    return parts[:, order, ...]


def flip(tensor, is_label=False):
    """Mirror an image or a set of heatmaps along its last dimension.

    Arguments:
        tensor {numpy.array or torch.tensor} -- the input image or heatmaps;
        a non-tensor input is converted with ``torch.from_numpy`` first.

    Keyword Arguments:
        is_label {bool} -- when True the input is treated as heatmaps and
        its point entries are additionally reordered with ``shuffle_lr``
        before mirroring (default: {False})

    Returns:
        torch.tensor -- the mirrored data.
    """
    if not torch.is_tensor(tensor):
        tensor = torch.from_numpy(tensor)

    last_dim = tensor.ndimension() - 1
    # Heatmaps also need their channels permuted so each channel keeps
    # describing the same side after the mirror.
    source = shuffle_lr(tensor) if is_label else tensor
    return source.flip(last_dim)

# From pyzolib/paths.py (https://bitbucket.org/pyzo/pyzolib/src/tip/paths.py)


def appdata_dir(appname=None, roaming=False):
    """ appdata_dir(appname=None, roaming=False)

    Get the path to the application directory, where applications are allowed
    to write user specific files (e.g. configurations).
    If appname is given, a subdir is appended (and created if necessary,
    including any missing parent directories).
    If roaming is True, will prefer a roaming directory (Windows Vista/7).

    The base user directory is taken from the FACEALIGNMENT_USERDIR
    environment variable when set, otherwise from the expanded '~'
    (falling back to '/var/tmp' when that is not a directory).

    Returns the selected path as a string.
    """

    # Define default user directory
    userDir = os.getenv('FACEALIGNMENT_USERDIR', None)
    if userDir is None:
        userDir = os.path.expanduser('~')
        if not os.path.isdir(userDir):  # pragma: no cover
            userDir = '/var/tmp'  # issue #54

    # Get system app data dir
    path = None
    if sys.platform.startswith('win'):
        path1, path2 = os.getenv('LOCALAPPDATA'), os.getenv('APPDATA')
        path = (path2 or path1) if roaming else (path1 or path2)
    elif sys.platform.startswith('darwin'):
        path = os.path.join(userDir, 'Library', 'Application Support')
    # On Linux and as fallback
    if not (path and os.path.isdir(path)):
        path = userDir

    # Maybe we should store things local to the executable (in case of a
    # portable distro or a frozen application that wants to be portable)
    prefix = sys.prefix
    if getattr(sys, 'frozen', None):
        prefix = os.path.abspath(os.path.dirname(sys.executable))
    for reldir in ('settings', '../settings'):
        localpath = os.path.abspath(os.path.join(prefix, reldir))
        if os.path.isdir(localpath):  # pragma: no cover
            # Probe writability by creating and removing a scratch file.
            probe = os.path.join(localpath, 'test.write')
            try:
                with open(probe, 'wb'):
                    pass
                os.remove(probe)
            except IOError:
                pass  # We cannot write in this directory
            else:
                path = localpath
                break

    # Get path specific for this app
    if appname:
        if path == userDir:
            appname = '.' + appname.lstrip('.')  # Make it a hidden directory
        path = os.path.join(path, appname)
        # makedirs(exist_ok=True) avoids the check-then-create race between
        # concurrent processes and also copes with a base directory (e.g.
        # from FACEALIGNMENT_USERDIR) that has not been created yet.
        os.makedirs(path, exist_ok=True)

    # Done
    return path


================================================
FILE: musetalk/utils/face_parsing/__init__.py
================================================
import torch
import time
import os
import cv2
import numpy as np
from PIL import Image
from .model import BiSeNet
import torchvision.transforms as transforms

class FaceParsing():
    def __init__(self, left_cheek_width=80, right_cheek_width=80):
        """Load the parsing network and precompute kernels and the cheek mask.

        Args:
            left_cheek_width: forwarded to ``_create_cheek_mask``.
            right_cheek_width: forwarded to ``_create_cheek_mask``.

        Sets ``net``, ``preprocess``, ``kernel``, ``cheek_kernel`` and
        ``cheek_mask`` on the instance, in that order.
        """
        self.net = self.model_init()
        self.preprocess = self.image_preprocess()

        # Geometry of the square structuring element: a triangular part
        # stacked on top of a constant-width column.
        cone_rows = 21
        tail_rows = 12
        side = cone_rows + tail_rows
        mid_col = side // 2
        first_cone_row = cone_rows // 2

        shape_kernel = np.zeros((side, side), dtype=np.uint8)

        # Triangle: rows above first_cone_row stay empty; from there on each
        # row is one pixel wider on both sides than the previous one.
        for row in range(first_cone_row, cone_rows):
            reach = row - first_cone_row
            shape_kernel[row, mid_col - reach:mid_col + reach + 1] = 1

        # Column: continue downwards with the width of the triangle's last row.
        column_width = int(shape_kernel[cone_rows - 1].sum()) if cone_rows > 0 else 1
        col_start = max(0, int(mid_col - (column_width // 2)))
        col_end = min(side, int(mid_col + (column_width // 2) + 1))
        shape_kernel[cone_rows:side, col_start:col_end] = 1
        self.kernel = shape_kernel

        # Cheek erosion kernel: a wide, flat ellipse.
        self.cheek_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (35, 3))

        # Cheek area mask (protect chin area).
        self.cheek_mask = self._create_cheek_mask(
            left_cheek_width=left_cheek_width,
            right_cheek_width=right_cheek_width,
        )
        
    def _create_cheek_mask(self, left_cheek_width=80, right_cheek_width=80):
        """Create a 512x512 uint8 mask marking a band on each side of the image.

        Two filled rectangles with value 255 are drawn on a zeroed canvas:
        one from the left edge to ``256 - left_cheek_width`` and one from
        ``256 + right_cheek_width`` to the right edge, both over the full
        height. The strip between them stays 0.

        Args:
            left_cheek_width: horizontal distance from the image centre to
                the inner edge of the left band, in pixels.
            right_cheek_width: horizontal distance from the image centre to
                the inner edge of the right band, in pixels.

        Returns:
            numpy.ndarray of shape (512, 512) and dtype uint8.
        """
        side = 512
        middle = side // 2
        canvas = np.zeros((side, side), dtype=np.uint8)

        left_band_end = (middle - left_cheek_width, side)
        right_band_start = (middle + right_cheek_width, 0)

        # Thickness -1 asks OpenCV for a filled rectangle.
        cv2.rectangle(canvas, (0, 0), left_band_end, 255, -1)    # Left cheek
        cv2.rectangle(canvas, right_band_start, (side, side), 255, -1)  # Right cheek
        return canvas

    def model_init(self,
                   resnet_path='./models/face-parse-bisent/resnet18-5c106cde.pth',
                   model_pth='./models/face-parse-bisent/79999_iter.pth'):
        """Build the BiSeNet parsing network and load its weights.

        Args:
            resnet_path: path handed to the ``BiSeNet`` constructor.
            model_pth: checkpoint file whose state dict is loaded into the net.

        Returns:
            The network in eval mode. It is moved to CUDA when CUDA is
            available; otherwise the checkpoint is mapped onto the CPU.
        """
        net = BiSeNet(resnet_path)
        if torch.cuda.is_available():
            net.cuda()
            state_dict = torch.load(model_pth)
        else:
            # No GPU: remap any CUDA tensors in the checkpoint to the CPU.
            state_dict = torch.load(model_pth, map_location=torch.device('cpu'))
        net.load_state_dict(state_dict)
        net.eval()
        return net

    def image_preprocess(self):
        """Return the torchvision pipeline applied to images before inference.

        The pipeline converts the input to a tensor and then normalises the
        three channels with fixed per-channel mean and std values
        (presumably the standard ImageNet statistics -- confirm).
        """
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        return transforms.Compose([to_tensor, normalize])

    def __call__(self, image, size=(512, 512), mode="raw"):
        if isinstance(image, str):
            image = Image.open(image)

        width, height = image.size
        with torch.no_grad():

Download .txt

gitextract_75g1ac60/

├── .github/
│   └── FUNDING.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README-EN.md
├── README.md
├── app.py
├── assets/
│   └── faq.md
├── baseasr.py
├── basereal.py
├── hubertasr.py
├── lightreal.py
├── lipasr.py
├── lipreal.py
├── llm.py
├── logger.py
├── museasr.py
├── musereal.py
├── musetalk/
│   ├── genavatar.py
│   ├── myutil.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── audio_processor.py
│   │   ├── blending.py
│   │   ├── dwpose/
│   │   │   ├── default_runtime.py
│   │   │   └── rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py
│   │   ├── face_detection/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── detection/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── core.py
│   │   │   │   └── sfd/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── bbox.py
│   │   │   │       ├── detect.py
│   │   │   │       ├── net_s3fd.py
│   │   │   │       └── sfd_detector.py
│   │   │   ├── models.py
│   │   │   └── utils.py
│   │   ├── face_parsing/
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── resnet.py
│   │   ├── preprocessing.py
│   │   ├── training_utils.py
│   │   └── utils.py
│   └── whisper/
│       ├── audio2feature.py
│       └── whisper/
│           ├── __init__.py
│           ├── __main__.py
│           ├── assets/
│           │   ├── gpt2/
│           │   │   ├── merges.txt
│           │   │   ├── special_tokens_map.json
│           │   │   ├── tokenizer_config.json
│           │   │   └── vocab.json
│           │   ├── mel_filters.npz
│           │   └── multilingual/
│           │       ├── added_tokens.json
│           │       ├── merges.txt
│           │       ├── special_tokens_map.json
│           │       ├── tokenizer_config.json
│           │       └── vocab.json
│           ├── audio.py
│           ├── decoding.py
│           ├── model.py
│           ├── normalizers/
│           │   ├── __init__.py
│           │   ├── basic.py
│           │   ├── english.json
│           │   └── english.py
│           ├── tokenizer.py
│           ├── transcribe.py
│           └── utils.py
├── requirements.txt
├── ttsreal.py
├── ultralight/
│   ├── audio2feature.py
│   ├── face_detect_utils/
│   │   ├── base_module.py
│   │   ├── detect_face.py
│   │   ├── get_landmark.py
│   │   ├── mean_face.txt
│   │   └── pfld_mobileone.py
│   ├── genavatar-bak.py
│   ├── genavatar.py
│   └── unet.py
├── wav2lip/
│   ├── audio.py
│   ├── face_detection/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── detection/
│   │   │   ├── __init__.py
│   │   │   ├── core.py
│   │   │   └── sfd/
│   │   │       ├── __init__.py
│   │   │       ├── bbox.py
│   │   │       ├── detect.py
│   │   │       ├── net_s3fd.py
│   │   │       └── sfd_detector.py
│   │   ├── models.py
│   │   └── utils.py
│   ├── genavatar.py
│   └── hparams.py
├── web/
│   ├── asr/
│   │   ├── index.html
│   │   ├── main.js
│   │   ├── pcm.js
│   │   ├── recorder-core.js
│   │   ├── wav.js
│   │   └── wsconnecter.js
│   ├── chat.html
│   ├── client.js
│   ├── dashboard.html
│   ├── echo.html
│   ├── echoapi.html
│   ├── rtcpush.html
│   ├── rtcpushapi-asr.html
│   ├── rtcpushapi.html
│   ├── rtcpushchat.html
│   ├── srs.sdk.js
│   ├── webrtc.html
│   ├── webrtcapi-asr.html
│   ├── webrtcapi-custom.html
│   ├── webrtcapi.html
│   ├── webrtcchat.html
│   └── whep.js
└── webrtc.py

Download .txt

SYMBOL INDEX (671 symbols across 65 files)

FILE: app.py
  function randN (line 62) | def randN(N)->int:
  function build_nerfreal (line 68) | def build_nerfreal(sessionid:int)->BaseReal:
  function offer (line 85) | async def offer(request):
  function human (line 144) | async def human(request):
  function interrupt_talk (line 173) | async def interrupt_talk(request):
  function humanaudio (line 195) | async def humanaudio(request):
  function set_audiotype (line 219) | async def set_audiotype(request):
  function record (line 241) | async def record(request):
  function is_speaking (line 266) | async def is_speaking(request):
  function on_shutdown (line 278) | async def on_shutdown(app):
  function post (line 284) | async def post(url,data):
  function run (line 292) | async def run(push_url,sessionid):
  function run_server (line 425) | def run_server(runner):

FILE: baseasr.py
  class BaseASR (line 28) | class BaseASR:
    method __init__ (line 29) | def __init__(self, opt, parent:BaseReal = None):
    method flush_talk (line 49) | def flush_talk(self):
    method put_audio_frame (line 52) | def put_audio_frame(self,audio_chunk,datainfo:dict): #16khz 20ms pcm
    method get_audio_frame (line 56) | def get_audio_frame(self):
    method get_audio_out (line 73) | def get_audio_out(self):
    method warm_up (line 76) | def warm_up(self):
    method run_step (line 84) | def run_step(self):
    method get_next_feat (line 87) | def get_next_feat(self,block,timeout):

FILE: basereal.py
  function read_imgs (line 45) | def read_imgs(img_list):
  function play_audio (line 53) | def play_audio(quit_event,queue):
  class BaseReal (line 70) | class BaseReal:
    method __init__ (line 71) | def __init__(self, opt):
    method put_msg_txt (line 111) | def put_msg_txt(self,msg,datainfo:dict={}):
    method put_audio_frame (line 114) | def put_audio_frame(self,audio_chunk,datainfo:dict={}): #16khz 20ms pcm
    method put_audio_file (line 117) | def put_audio_file(self,filebyte,datainfo:dict={}):
    method __create_bytes_stream (line 127) | def __create_bytes_stream(self,byte_stream):
    method flush_talk (line 143) | def flush_talk(self):
    method is_speaking (line 147) | def is_speaking(self)->bool:
    method __loadcustom (line 150) | def __loadcustom(self):
    method init_customindex (line 161) | def init_customindex(self):
    method notify (line 168) | def notify(self,eventpoint):
    method start_recording (line 171) | def start_recording(self):
    method record_video_data (line 210) | def record_video_data(self,image):
    method record_audio_data (line 217) | def record_audio_data(self,frame):
    method stop_recording (line 261) | def stop_recording(self):
    method mirror_index (line 274) | def mirror_index(self,size, index):
    method get_audio_stream (line 283) | def get_audio_stream(self,audiotype):
    method set_custom_state (line 291) | def set_custom_state(self,audiotype, reinit=True):
    method process_frames (line 300) | def process_frames(self,quit_event,loop=None,audio_track=None,video_tr...

FILE: hubertasr.py
  class HubertASR (line 8) | class HubertASR(BaseASR):
    method __init__ (line 10) | def __init__(self, opt, parent, audio_processor:Audio2Feature,audio_fe...
    method run_step (line 18) | def run_step(self):

FILE: lightreal.py
  function load_model (line 62) | def load_model(opt):
  function load_avatar (line 66) | def load_avatar(avatar_id):
  function warm_up (line 89) | def warm_up(batch_size,avatar,modelres):
  function read_imgs (line 96) | def read_imgs(img_list):
  function get_audio_features (line 104) | def get_audio_features(features, index):
  function read_lms (line 123) | def read_lms(lms_list):
  function __mirror_index (line 138) | def __mirror_index(size, index):
  function inference (line 148) | def inference(quit_event, batch_size, face_list_cycle, audio_feat_queue,...
  class LightReal (line 225) | class LightReal(BaseReal):
    method __init__ (line 227) | def __init__(self, opt, model, avatar):
    method paste_back_frame (line 251) | def paste_back_frame(self,pred_frame,idx:int):
    method render (line 265) | def render(self,quit_event,loop=None,audio_track=None,video_track=None):

FILE: lipasr.py
  class LipASR (line 29) | class LipASR(BaseASR):
    method run_step (line 31) | def run_step(self):

FILE: lipreal.py
  function _load (line 50) | def _load(checkpoint_path):
  function load_model (line 58) | def load_model(path):
  function load_avatar (line 71) | def load_avatar(avatar_id):
  function warm_up (line 90) | def warm_up(batch_size,model,modelres):
  function read_imgs (line 97) | def read_imgs(img_list):
  function __mirror_index (line 105) | def __mirror_index(size, index):
  function inference (line 114) | def inference(quit_event,batch_size,face_list_cycle,audio_feat_queue,aud...
  class LipReal (line 184) | class LipReal(BaseReal):
    method __init__ (line 186) | def __init__(self, opt, model, avatar):
    method paste_back_frame (line 209) | def paste_back_frame(self,pred_frame,idx:int):
    method render (line 220) | def render(self,quit_event,loop=None,audio_track=None,video_track=None):

FILE: llm.py
  function llm_response (line 6) | def llm_response(message,nerfreal:BaseReal):

FILE: museasr.py
  class MuseASR (line 27) | class MuseASR(BaseASR):
    method __init__ (line 28) | def __init__(self, opt, parent,audio_processor:Audio2Feature):
    method run_step (line 32) | def run_step(self):

FILE: musereal.py
  function load_model (line 51) | def load_model():
  function load_avatar (line 65) | def load_avatar(avatar_id):
  function warm_up (line 96) | def warm_up(batch_size,model):
  function read_imgs (line 114) | def read_imgs(img_list):
  function __mirror_index (line 122) | def __mirror_index(size, index):
  function inference (line 132) | def inference(quit_event,batch_size,input_latent_list_cycle,audio_feat_q...
  class MuseReal (line 211) | class MuseReal(BaseReal):
    method __init__ (line 213) | def __init__(self, opt, model, avatar):
    method __mirror_index (line 238) | def __mirror_index(self, index):
    method __warm_up (line 247) | def __warm_up(self):
    method paste_back_frame (line 271) | def paste_back_frame(self,pred_frame,idx:int):
    method render (line 283) | def render(self,quit_event,loop=None,audio_track=None,video_track=None):

FILE: musetalk/genavatar.py
  function video2imgs (line 29) | def video2imgs(vid_path, save_path, ext='.png', cut_frame=10000000):
  function is_video_file (line 247) | def is_video_file(file_path):
  function create_dir (line 253) | def create_dir(dir_path):
  function create_musetalk_human (line 261) | def create_musetalk_human(file, avatar_id):

FILE: musetalk/myutil.py
  function get_image_blending (line 5) | def get_image_blending(image,face,face_box,mask_array,crop_box):

FILE: musetalk/utils/audio_processor.py
  class AudioProcessor (line 11) | class AudioProcessor:
    method __init__ (line 12) | def __init__(self, feature_extractor_path="openai/whisper-tiny/"):
    method get_audio_feature (line 15) | def get_audio_feature(self, wav_path, start_index=0, weight_dtype=None):
    method get_whisper_chunk (line 37) | def get_whisper_chunk(

FILE: musetalk/utils/blending.py
  function get_crop_box (line 7) | def get_crop_box(box, expand):
  function face_seg (line 16) | def face_seg(image, mode="raw", fp=None):
  function get_image (line 35) | def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1....
  function get_image_blending (line 96) | def get_image_blending(image, face, face_box, mask_array, crop_box):
  function get_image_prepare_material (line 112) | def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5...

FILE: musetalk/utils/face_detection/api.py
  class LandmarksType (line 17) | class LandmarksType(Enum):
  class NetworkSize (line 30) | class NetworkSize(Enum):
    method __new__ (line 36) | def __new__(cls, value):
    method __int__ (line 41) | def __int__(self):
  class FaceAlignment (line 46) | class FaceAlignment:
    method __init__ (line 47) | def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
    method get_detections_for_batch (line 71) | def get_detections_for_batch(self, images):
  class YOLOv8_face (line 89) | class YOLOv8_face:
    method __init__ (line 90) | def __init__(self, path = 'face_detection/weights/yolov8n-face.onnx', ...
    method make_anchors (line 106) | def make_anchors(self, feats_hw, grid_cell_offset=0.5):
    method softmax (line 118) | def softmax(self, x, axis=1):
    method resize_image (line 125) | def resize_image(self, srcimg, keep_ratio=True):
    method detect (line 145) | def detect(self, srcimg):
    method post_process (line 161) | def post_process(self, preds, scale_h, scale_w, padh, padw):
    method distance2bbox (line 217) | def distance2bbox(self, points, distance, max_shape=None):
    method draw_detections (line 229) | def draw_detections(self, image, boxes, scores, kpts):

FILE: musetalk/utils/face_detection/detection/core.py
  class FaceDetector (line 9) | class FaceDetector(object):
    method __init__ (line 18) | def __init__(self, device, verbose):
    method detect_from_image (line 32) | def detect_from_image(self, tensor_or_path):
    method detect_from_directory (line 54) | def detect_from_directory(self, path, extensions=['.jpg', '.png'], rec...
    method reference_scale (line 104) | def reference_scale(self):
    method reference_x_shift (line 108) | def reference_x_shift(self):
    method reference_y_shift (line 112) | def reference_y_shift(self):
    method tensor_or_path_to_ndarray (line 116) | def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):

FILE: musetalk/utils/face_detection/detection/sfd/bbox.py
  function IOU (line 17) | def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
  function bboxlog (line 30) | def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
  function bboxloginv (line 37) | def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
  function nms (line 44) | def nms(dets, thresh):
  function encode (line 67) | def encode(matched, priors, variances):
  function decode (line 91) | def decode(loc, priors, variances):
  function batch_decode (line 111) | def batch_decode(loc, priors, variances):

FILE: musetalk/utils/face_detection/detection/sfd/detect.py
  function detect (line 19) | def detect(net, img, device):
  function batch_detect (line 58) | def batch_detect(net, imgs, device):
  function flip_detect (line 98) | def flip_detect(net, img, device):
  function pts_to_bb (line 111) | def pts_to_bb(pts):

FILE: musetalk/utils/face_detection/detection/sfd/net_s3fd.py
  class L2Norm (line 6) | class L2Norm(nn.Module):
    method __init__ (line 7) | def __init__(self, n_channels, scale=1.0):
    method forward (line 16) | def forward(self, x):
  class s3fd (line 22) | class s3fd(nn.Module):
    method __init__ (line 23) | def __init__(self):
    method forward (line 70) | def forward(self, x):

FILE: musetalk/utils/face_detection/detection/sfd/sfd_detector.py
  class SFDDetector (line 16) | class SFDDetector(FaceDetector):
    method __init__ (line 17) | def __init__(self, device, path_to_detector=os.path.join(os.path.dirna...
    method detect_from_image (line 31) | def detect_from_image(self, tensor_or_path):
    method detect_from_batch (line 41) | def detect_from_batch(self, images):
    method reference_scale (line 50) | def reference_scale(self):
    method reference_x_shift (line 54) | def reference_x_shift(self):
    method reference_y_shift (line 58) | def reference_y_shift(self):

FILE: musetalk/utils/face_detection/models.py
  function conv3x3 (line 7) | def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
  class ConvBlock (line 13) | class ConvBlock(nn.Module):
    method __init__ (line 14) | def __init__(self, in_planes, out_planes):
    method forward (line 33) | def forward(self, x):
  class Bottleneck (line 58) | class Bottleneck(nn.Module):
    method __init__ (line 62) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 75) | def forward(self, x):
  class HourGlass (line 98) | class HourGlass(nn.Module):
    method __init__ (line 99) | def __init__(self, num_modules, depth, num_features):
    method _generate_network (line 107) | def _generate_network(self, level):
    method _forward (line 119) | def _forward(self, level, inp):
    method forward (line 141) | def forward(self, x):
  class FAN (line 145) | class FAN(nn.Module):
    method __init__ (line 147) | def __init__(self, num_modules=1):
    method forward (line 174) | def forward(self, x):
  class ResNetDepth (line 204) | class ResNetDepth(nn.Module):
    method __init__ (line 206) | def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes...
    method _make_layer (line 229) | def _make_layer(self, block, planes, blocks, stride=1):
    method forward (line 246) | def forward(self, x):

FILE: musetalk/utils/face_detection/utils.py
  function _gaussian (line 11) | def _gaussian(
  function draw_gaussian (line 37) | def draw_gaussian(image, point, sigma):
  function transform (line 56) | def transform(point, center, scale, resolution, invert=False):
  function crop (line 92) | def crop(image, center, scale, resolution=256.0):
  function get_preds_fromhm (line 132) | def get_preds_fromhm(hm, center=None, scale=None):
  function get_preds_fromhm_batch (line 172) | def get_preds_fromhm_batch(hm, centers=None, scales=None):
  function shuffle_lr (line 212) | def shuffle_lr(parts, pairs=None):
  function flip (line 237) | def flip(tensor, is_label=False):
  function appdata_dir (line 259) | def appdata_dir(appname=None, roaming=False):

FILE: musetalk/utils/face_parsing/__init__.py
  class FaceParsing (line 10) | class FaceParsing():
    method __init__ (line 11) | def __init__(self, left_cheek_width=80, right_cheek_width=80):
    method _create_cheek_mask (line 51) | def _create_cheek_mask(self, left_cheek_width=80, right_cheek_width=80):
    method model_init (line 59) | def model_init(self,
    method image_preprocess (line 71) | def image_preprocess(self):
    method __call__ (line 77) | def __call__(self, image, size=(512, 512), mode="raw"):

FILE: musetalk/utils/face_parsing/model.py
  class ConvBNReLU (line 14) | class ConvBNReLU(nn.Module):
    method __init__ (line 15) | def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1, *args...
    method forward (line 26) | def forward(self, x):
    method init_weight (line 31) | def init_weight(self):
  class BiSeNetOutput (line 37) | class BiSeNetOutput(nn.Module):
    method __init__ (line 38) | def __init__(self, in_chan, mid_chan, n_classes, *args, **kwargs):
    method forward (line 44) | def forward(self, x):
    method init_weight (line 49) | def init_weight(self):
    method get_params (line 55) | def get_params(self):
  class AttentionRefinementModule (line 67) | class AttentionRefinementModule(nn.Module):
    method __init__ (line 68) | def __init__(self, in_chan, out_chan, *args, **kwargs):
    method forward (line 76) | def forward(self, x):
    method init_weight (line 85) | def init_weight(self):
  class ContextPath (line 92) | class ContextPath(nn.Module):
    method __init__ (line 93) | def __init__(self, resnet_path, *args, **kwargs):
    method forward (line 104) | def forward(self, x):
    method init_weight (line 127) | def init_weight(self):
    method get_params (line 133) | def get_params(self):
  class SpatialPath (line 146) | class SpatialPath(nn.Module):
    method __init__ (line 147) | def __init__(self, *args, **kwargs):
    method forward (line 155) | def forward(self, x):
    method init_weight (line 162) | def init_weight(self):
    method get_params (line 168) | def get_params(self):
  class FeatureFusionModule (line 180) | class FeatureFusionModule(nn.Module):
    method __init__ (line 181) | def __init__(self, in_chan, out_chan, *args, **kwargs):
    method forward (line 200) | def forward(self, fsp, fcp):
    method init_weight (line 212) | def init_weight(self):
    method get_params (line 218) | def get_params(self):
  class BiSeNet (line 230) | class BiSeNet(nn.Module):
    method __init__ (line 231) | def __init__(self, resnet_path='models/resnet18-5c106cde.pth', n_class...
    method forward (line 241) | def forward(self, x):
    method init_weight (line 256) | def init_weight(self):
    method get_params (line 262) | def get_params(self):

FILE: musetalk/utils/face_parsing/resnet.py
  function conv3x3 (line 14) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 20) | class BasicBlock(nn.Module):
    method __init__ (line 21) | def __init__(self, in_chan, out_chan, stride=1):
    method forward (line 36) | def forward(self, x):
  function create_layer_basic (line 51) | def create_layer_basic(in_chan, out_chan, bnum, stride=1):
  class Resnet18 (line 58) | class Resnet18(nn.Module):
    method __init__ (line 59) | def __init__(self, model_path):
    method forward (line 71) | def forward(self, x):
    method init_weight (line 82) | def init_weight(self, model_path):
    method get_params (line 90) | def get_params(self):

FILE: musetalk/utils/preprocessing.py
  function resize_landmark (line 28) | def resize_landmark(landmark, w, h, new_w, new_h):
  function read_imgs (line 35) | def read_imgs(img_list):
  function get_bbox_range (line 43) | def get_bbox_range(img_list,upperbondrange =0):
  function get_landmark_and_bbox (line 84) | def get_landmark_and_bbox(img_list,upperbondrange =0):

FILE: musetalk/utils/training_utils.py
  class Net (line 25) | class Net(nn.Module):
    method __init__ (line 26) | def __init__(
    method forward (line 33) | def forward(
  function initialize_models_and_optimizers (line 48) | def initialize_models_and_optimizers(cfg, accelerator, weight_dtype):
  function initialize_dataloaders (line 144) | def initialize_dataloaders(cfg):
  function initialize_loss_functions (line 201) | def initialize_loss_functions(cfg, accelerator, scheduler_max_steps):
  function initialize_syncnet (line 251) | def initialize_syncnet(cfg, accelerator, weight_dtype):
  function initialize_vgg (line 272) | def initialize_vgg(cfg, accelerator):
  function validation (line 284) | def validation(

FILE: musetalk/utils/utils.py
  function load_all_model (line 15) | def load_all_model(
  function get_file_type (line 33) | def get_file_type(video_path):
  function get_video_fps (line 43) | def get_video_fps(video_path):
  function datagen (line 49) | def datagen(
  function cast_training_params (line 76) | def cast_training_params(
  function rand_log_normal (line 88) | def rand_log_normal(
  function get_mouth_region (line 102) | def get_mouth_region(frames, image_pred, pixel_values_face_mask):
  function get_image_pred (line 140) | def get_image_pred(pixel_values,
  function process_audio_features (line 177) | def process_audio_features(cfg, batch, wav2vec, bsz, num_frames, weight_...
  function save_checkpoint (line 206) | def save_checkpoint(model, save_dir, ckpt_num, name="appearance_net", to...
  function save_models (line 234) | def save_models(accelerator, net, save_dir, global_step, cfg, logger=None):
  function delete_additional_ckpt (line 245) | def delete_additional_ckpt(base_path, num_keep):
  function seed_everything (line 260) | def seed_everything(seed):
  function process_and_save_images (line 270) | def process_and_save_images(

FILE: musetalk/whisper/audio2feature.py
  class Audio2Feature (line 14) | class Audio2Feature():
    method __init__ (line 15) | def __init__(self,
    method get_sliced_feature (line 25) | def get_sliced_feature(self,
    method get_sliced_feature_sparse (line 56) | def get_sliced_feature_sparse(self,feature_array, vid_idx, audio_feat_...
    method feature2chunks (line 91) | def feature2chunks(self,feature_array,fps,batch_size,audio_feat_length...
    method audio2feat (line 106) | def audio2feat(self, wav_data): #, weight_dtype=None

FILE: musetalk/whisper/whisper/__init__.py
  function _download (line 33) | def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
  function available_models (line 66) | def available_models() -> List[str]:
  function load_model (line 71) | def load_model(name: str, device: Optional[Union[str, torch.device]] = N...

FILE: musetalk/whisper/whisper/audio.py
  function load_audio (line 22) | def load_audio(file: str, sr: int = SAMPLE_RATE):
  function pad_or_trim (line 52) | def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
  function mel_filters (line 77) | def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
  function log_mel_spectrogram (line 92) | def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_m...

FILE: musetalk/whisper/whisper/decoding.py
  function detect_language (line 19) | def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer ...
  class DecodingOptions (line 72) | class DecodingOptions:
  class DecodingResult (line 104) | class DecodingResult:
  class Inference (line 118) | class Inference:
    method logits (line 119) | def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
    method rearrange_kv_cache (line 123) | def rearrange_kv_cache(self, source_indices) -> None:
    method cleanup_caching (line 127) | def cleanup_caching(self) -> None:
  class PyTorchInference (line 132) | class PyTorchInference(Inference):
    method __init__ (line 133) | def __init__(self, model: "Whisper", initial_token_length: int):
    method logits (line 139) | def logits(self, tokens: Tensor, audio_features: Tensor, include_embed...
    method cleanup_caching (line 151) | def cleanup_caching(self):
    method rearrange_kv_cache (line 158) | def rearrange_kv_cache(self, source_indices):
  class SequenceRanker (line 164) | class SequenceRanker:
    method rank (line 165) | def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[flo...
  class MaximumLikelihoodRanker (line 173) | class MaximumLikelihoodRanker(SequenceRanker):
    method __init__ (line 179) | def __init__(self, length_penalty: Optional[float]):
    method rank (line 182) | def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[flo...
  class TokenDecoder (line 199) | class TokenDecoder:
    method reset (line 200) | def reset(self):
    method update (line 203) | def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor)...
    method finalize (line 228) | def finalize(
  class GreedyDecoder (line 253) | class GreedyDecoder(TokenDecoder):
    method __init__ (line 254) | def __init__(self, temperature: float, eot: int):
    method update (line 258) | def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor)...
    method finalize (line 275) | def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
  class BeamSearchDecoder (line 281) | class BeamSearchDecoder(TokenDecoder):
    method __init__ (line 282) | def __init__(self, beam_size: int, eot: int, inference: Inference, pat...
    method reset (line 292) | def reset(self):
    method update (line 295) | def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor)...
    method finalize (line 351) | def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
  class LogitFilter (line 371) | class LogitFilter:
    method apply (line 372) | def apply(self, logits: Tensor, tokens: Tensor) -> None:
  class SuppressBlank (line 387) | class SuppressBlank(LogitFilter):
    method __init__ (line 388) | def __init__(self, tokenizer: Tokenizer, sample_begin: int):
    method apply (line 392) | def apply(self, logits: Tensor, tokens: Tensor):
  class SuppressTokens (line 397) | class SuppressTokens(LogitFilter):
    method __init__ (line 398) | def __init__(self, suppress_tokens: Sequence[int]):
    method apply (line 401) | def apply(self, logits: Tensor, tokens: Tensor):
  class ApplyTimestampRules (line 405) | class ApplyTimestampRules(LogitFilter):
    method __init__ (line 406) | def __init__(
    method apply (line 413) | def apply(self, logits: Tensor, tokens: Tensor):
  class DecodingTask (line 444) | class DecodingTask:
    method __init__ (line 450) | def __init__(self, model: "Whisper", options: DecodingOptions):
    method _verify_options (line 499) | def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
    method _get_initial_tokens (line 512) | def _get_initial_tokens(self) -> Tuple[int]:
    method _get_suppress_tokens (line 534) | def _get_suppress_tokens(self) -> Tuple[int]:
    method _get_audio_features (line 557) | def _get_audio_features(self, mel: Tensor, include_embeddings: bool = ...
    method _detect_language (line 579) | def _detect_language(self, audio_features: Tensor, tokens: Tensor):
    method _main_loop (line 591) | def _main_loop(self, audio_features: Tensor, tokens: Tensor):
    method run (line 631) | def run(self, mel: Tensor) -> List[DecodingResult]:
  function decode (line 700) | def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = Dec...

FILE: musetalk/whisper/whisper/model.py
  class ModelDimensions (line 16) | class ModelDimensions:
  class LayerNorm (line 29) | class LayerNorm(nn.LayerNorm):
    method forward (line 30) | def forward(self, x: Tensor) -> Tensor:
  class Linear (line 34) | class Linear(nn.Linear):
    method forward (line 35) | def forward(self, x: Tensor) -> Tensor:
  class Conv1d (line 41) | class Conv1d(nn.Conv1d):
    method _conv_forward (line 42) | def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tens...
  function sinusoids (line 48) | def sinusoids(length, channels, max_timescale=10000):
  class MultiHeadAttention (line 57) | class MultiHeadAttention(nn.Module):
    method __init__ (line 58) | def __init__(self, n_state: int, n_head: int):
    method forward (line 66) | def forward(
    method qkv_attention (line 88) | def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optiona...
  class ResidualAttentionBlock (line 103) | class ResidualAttentionBlock(nn.Module):
    method __init__ (line 104) | def __init__(self, n_state: int, n_head: int, cross_attention: bool = ...
    method forward (line 117) | def forward(
  class AudioEncoder (line 131) | class AudioEncoder(nn.Module):
    method __init__ (line 132) | def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int,...
    method forward (line 143) | def forward(self, x: Tensor, include_embeddings: bool = False):
  class TextDecoder (line 174) | class TextDecoder(nn.Module):
    method __init__ (line 175) | def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int...
    method forward (line 189) | def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = No...
  class Whisper (line 220) | class Whisper(nn.Module):
    method __init__ (line 221) | def __init__(self, dims: ModelDimensions):
    method embed_audio (line 239) | def embed_audio(self, mel: torch.Tensor):
    method logits (line 242) | def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
    method forward (line 245) | def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str...
    method device (line 249) | def device(self):
    method is_multilingual (line 253) | def is_multilingual(self):
    method install_kv_cache_hooks (line 256) | def install_kv_cache_hooks(self, cache: Optional[dict] = None):

FILE: musetalk/whisper/whisper/normalizers/basic.py
  function remove_symbols_and_diacritics (line 27) | def remove_symbols_and_diacritics(s: str, keep=""):
  function remove_symbols (line 46) | def remove_symbols(s: str):
  class BasicTextNormalizer (line 55) | class BasicTextNormalizer:
    method __init__ (line 56) | def __init__(self, remove_diacritics: bool = False, split_letters: boo...
    method __call__ (line 60) | def __call__(self, s: str):

FILE: musetalk/whisper/whisper/normalizers/english.py
  class EnglishNumberNormalizer (line 12) | class EnglishNumberNormalizer:
    method __init__ (line 23) | def __init__(self):
    method process_words (line 160) | def process_words(self, words: List[str]) -> Iterator[str]:
    method preprocess (line 381) | def preprocess(self, s: str):
    method postprocess (line 410) | def postprocess(self, s: str):
    method __call__ (line 435) | def __call__(self, s: str):
  class EnglishSpellingNormalizer (line 443) | class EnglishSpellingNormalizer:
    method __init__ (line 450) | def __init__(self):
    method __call__ (line 454) | def __call__(self, s: str):
  class EnglishTextNormalizer (line 458) | class EnglishTextNormalizer:
    method __init__ (line 459) | def __init__(self):
    method __call__ (line 519) | def __call__(self, s: str):

FILE: musetalk/whisper/whisper/tokenizer.py
  class Tokenizer (line 130) | class Tokenizer:
    method encode (line 137) | def encode(self, text, **kwargs):
    method decode (line 140) | def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Te...
    method decode_with_timestamps (line 143) | def decode_with_timestamps(self, tokens) -> str:
    method eot (line 161) | def eot(self) -> int:
    method sot (line 166) | def sot(self) -> int:
    method sot_lm (line 171) | def sot_lm(self) -> int:
    method sot_prev (line 176) | def sot_prev(self) -> int:
    method no_speech (line 181) | def no_speech(self) -> int:
    method no_timestamps (line 186) | def no_timestamps(self) -> int:
    method timestamp_begin (line 191) | def timestamp_begin(self) -> int:
    method language_token (line 196) | def language_token(self) -> int:
    method all_language_tokens (line 215) | def all_language_tokens(self) -> Tuple[int]:
    method all_language_codes (line 227) | def all_language_codes(self) -> Tuple[str]:
    method sot_sequence_including_notimestamps (line 232) | def sot_sequence_including_notimestamps(self) -> Tuple[int]:
    method non_speech_tokens (line 237) | def non_speech_tokens(self) -> Tuple[int]:
    method _get_single_token_id (line 267) | def _get_single_token_id(self, text) -> int:
  function build_tokenizer (line 274) | def build_tokenizer(name: str = "gpt2"):
  function get_tokenizer (line 295) | def get_tokenizer(

FILE: musetalk/whisper/whisper/transcribe.py
  function transcribe (line 19) | def transcribe(
  function cli (line 133) | def cli():

FILE: musetalk/whisper/whisper/utils.py
  function exact_div (line 5) | def exact_div(x, y):
  function str2bool (line 10) | def str2bool(string):
  function optional_int (line 18) | def optional_int(string):
  function optional_float (line 22) | def optional_float(string):
  function compression_ratio (line 26) | def compression_ratio(text) -> float:
  function format_timestamp (line 30) | def format_timestamp(seconds: float, always_include_hours: bool = False,...
  function write_txt (line 47) | def write_txt(transcript: Iterator[dict], file: TextIO):
  function write_vtt (line 52) | def write_vtt(transcript: Iterator[dict], file: TextIO):
  function write_srt (line 63) | def write_srt(transcript: Iterator[dict], file: TextIO):

FILE: ttsreal.py
  class State (line 50) | class State(Enum):
  class BaseTTS (line 54) | class BaseTTS:
    method __init__ (line 55) | def __init__(self, opt, parent:BaseReal):
    method flush_talk (line 67) | def flush_talk(self):
    method put_msg_txt (line 71) | def put_msg_txt(self,msg:str,datainfo:dict={}):
    method render (line 75) | def render(self,quit_event):
    method process_tts (line 79) | def process_tts(self,quit_event):
    method txt_to_audio (line 89) | def txt_to_audio(self,msg:tuple[str, dict]):
  class EdgeTTS (line 94) | class EdgeTTS(BaseTTS):
    method txt_to_audio (line 95) | def txt_to_audio(self,msg:tuple[str, dict]):
    method __create_bytes_stream (line 125) | def __create_bytes_stream(self,byte_stream):
    method __main (line 141) | async def __main(self,voicename: str, text: str):
  class FishTTS (line 160) | class FishTTS(BaseTTS):
    method txt_to_audio (line 161) | def txt_to_audio(self,msg:tuple[str, dict]):
    method fish_speech (line 174) | def fish_speech(self, text, reffile, reftext,language, server_url) -> ...
    method stream_tts (line 213) | def stream_tts(self,audio_stream,msg:tuple[str, dict]):
  class SovitsTTS (line 238) | class SovitsTTS(BaseTTS):
    method txt_to_audio (line 239) | def txt_to_audio(self,msg:tuple[str, dict]):
    method gpt_sovits (line 252) | def gpt_sovits(self, text, reffile, reftext,language, server_url) -> I...
    method __create_bytes_stream (line 296) | def __create_bytes_stream(self,byte_stream):
    method stream_tts (line 312) | def stream_tts(self,audio_stream,msg:tuple[str, dict]):
  class CosyVoiceTTS (line 337) | class CosyVoiceTTS(BaseTTS):
    method txt_to_audio (line 338) | def txt_to_audio(self,msg:tuple[str, dict]):
    method cosy_voice (line 351) | def cosy_voice(self, text, reffile, reftext,language, server_url) -> I...
    method stream_tts (line 380) | def stream_tts(self,audio_stream,msg:tuple[str, dict]):
  class TencentTTS (line 410) | class TencentTTS(BaseTTS):
    method __init__ (line 411) | def __init__(self, opt, parent):
    method __gen_signature (line 422) | def __gen_signature(self, params):
    method __gen_params (line 434) | def __gen_params(self, session_id, text):
    method txt_to_audio (line 453) | def txt_to_audio(self,msg:tuple[str, dict]):
    method tencent_voice (line 466) | def tencent_voice(self, text, reffile, reftext,language, server_url) -...
    method stream_tts (line 503) | def stream_tts(self,audio_stream,msg:tuple[str, dict]):
  class DoubaoTTS (line 533) | class DoubaoTTS(BaseTTS):
    method __init__ (line 534) | def __init__(self, opt, parent):
    method doubao_voice (line 568) | async def doubao_voice(self, text): # -> Iterator[bytes]:
    method txt_to_audio (line 626) | def txt_to_audio(self, msg:tuple[str, dict]):
    method stream_tts (line 635) | async def stream_tts(self, audio_stream, msg:tuple[str, dict]):
  class IndexTTS2 (line 663) | class IndexTTS2(BaseTTS):
    method __init__ (line 664) | def __init__(self, opt, parent):
    method txt_to_audio (line 684) | def txt_to_audio(self, msg):
    method split_text (line 713) | def split_text(self, text):
    method indextts2_generate (line 747) | def indextts2_generate(self, text):
    method file_to_stream (line 796) | def file_to_stream(self, audio_file, msg, is_first=False, is_last=False):
  class XTTS (line 852) | class XTTS(BaseTTS):
    method __init__ (line 853) | def __init__(self, opt, parent):
    method txt_to_audio (line 857) | def txt_to_audio(self,msg:tuple[str, dict]):
    method get_speaker (line 870) | def get_speaker(self,ref_audio,server_url):
    method xtts (line 875) | def xtts(self,text, speaker, language, server_url, stream_chunk_size) ...
    method stream_tts (line 905) | def stream_tts(self,audio_stream,msg:tuple[str, dict]):
  class AzureTTS (line 933) | class AzureTTS(BaseTTS):
    method __init__ (line 935) | def __init__(self, opt, parent):
    method txt_to_audio (line 950) | def txt_to_audio(self,msg:tuple[str, dict]):
    method _on_synthesizing (line 967) | def _on_synthesizing(self, evt: speechsdk.SpeechSynthesisEventArgs):

FILE: ultralight/audio2feature.py
  class Audio2Feature (line 6) | class Audio2Feature():
    method __init__ (line 7) | def __init__(self):
    method get_hubert_from_16k_speech (line 14) | def get_hubert_from_16k_speech(self, speech):
    method get_sliced_feature (line 51) | def get_sliced_feature(self,
    method feature2chunks (line 82) | def feature2chunks(self,feature_array,fps,batch_size,audio_feat_length...

FILE: ultralight/face_detect_utils/base_module.py
  function Conv_Block (line 12) | def Conv_Block(in_channel, out_channel, kernel_size, stride, padding, gr...
  class InvertedResidual (line 20) | class InvertedResidual(Module):
    method __init__ (line 21) | def __init__(self, in_channel, out_channel, stride, use_res_connect, e...
    method forward (line 36) | def forward(self, x):
  class GhostModule (line 43) | class GhostModule(Module):
    method __init__ (line 44) | def __init__(self, in_channel, out_channel, is_linear=False):
    method forward (line 53) | def forward(self, x):
  class GhostBottleneck (line 60) | class GhostBottleneck(Module):
    method __init__ (line 61) | def __init__(self, in_channel, hidden_channel, out_channel, stride):
    method forward (line 83) | def forward(self, x):
  class GhostOneModule (line 87) | class GhostOneModule(Module):
    method __init__ (line 88) | def __init__(self, in_channel, out_channel, is_linear=False, inference...
    method forward (line 117) | def forward(self, x):
  class GhostOneBottleneck (line 124) | class GhostOneBottleneck(Module):
    method __init__ (line 125) | def __init__(self, in_channel, hidden_channel, out_channel, stride, in...
    method forward (line 150) | def forward(self, x):
  class SEBlock (line 154) | class SEBlock(nn.Module):
    method __init__ (line 161) | def __init__(self,
    method forward (line 181) | def forward(self, inputs: torch.Tensor) -> torch.Tensor:
  class MobileOneBlock (line 193) | class MobileOneBlock(nn.Module):
    method __init__ (line 203) | def __init__(self,
    method forward (line 275) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method reparameterize (line 299) | def reparameterize(self):
    method _get_kernel_bias (line 329) | def _get_kernel_bias(self) -> Tuple[torch.Tensor, torch.Tensor]:
    method _fuse_bn_tensor (line 363) | def _fuse_bn_tensor(self, branch) -> Tuple[torch.Tensor, torch.Tensor]:
    method _conv_bn (line 402) | def _conv_bn(self,

FILE: ultralight/face_detect_utils/detect_face.py
  class SCRFD (line 6) | class SCRFD():
    method __init__ (line 7) | def __init__(self, onnxmodel, confThreshold=0.5, nmsThreshold=0.5):
    method resize_image (line 17) | def resize_image(self, srcimg):
    method distance2bbox (line 35) | def distance2bbox(self, points, distance, max_shape=None):
    method distance2kps (line 46) | def distance2kps(self, points, distance, max_shape=None):
    method detect (line 57) | def detect(self, srcimg):

FILE: ultralight/face_detect_utils/get_landmark.py
  function face_det (line 14) | def face_det(img, model):
  class Landmark (line 70) | class Landmark:
    method __init__ (line 71) | def __init__(self):
    method detect (line 83) | def detect(self, img_path):

FILE: ultralight/face_detect_utils/pfld_mobileone.py
  class PFLD_GhostOne (line 12) | class PFLD_GhostOne(Module):
    method __init__ (line 13) | def __init__(self, width_factor=0.5, input_size=192, landmark_number=1...
    method forward (line 99) | def forward(self, x):
  class PFLD_GhostOne_WithSTN (line 136) | class PFLD_GhostOne_WithSTN(Module):
    method __init__ (line 137) | def __init__(self, width_factor=0.5, input_size=112, landmark_number=1...
    method forward (line 216) | def forward(self, x):
  class AuxiliaryNet (line 252) | class AuxiliaryNet(Module):
    method __init__ (line 253) | def __init__(self, width_factor=1):
    method forward (line 266) | def forward(self, out1, out2, out3, out4):
  function check_onnx (line 306) | def check_onnx(torch_out, torch_in):

FILE: ultralight/genavatar-bak.py
  function osmakedirs (line 16) | def osmakedirs(path_list):

FILE: ultralight/genavatar.py
  function osmakedirs (line 17) | def osmakedirs(path_list):
  function video2imgs (line 29) | def video2imgs(vid_path, save_path, ext = '.png',cut_frame = 10000000):
  function read_imgs (line 44) | def read_imgs(img_list):

FILE: ultralight/unet.py
  class InvertedResidual (line 7) | class InvertedResidual(nn.Module):
    method __init__ (line 8) | def __init__(self, inp, oup, stride, use_res_connect, expand_ratio=6):
    method forward (line 32) | def forward(self, x):
  class DoubleConvDW (line 38) | class DoubleConvDW(nn.Module):
    method __init__ (line 40) | def __init__(self, in_channels, out_channels, stride=2):
    method forward (line 48) | def forward(self, x):
  class InConvDw (line 51) | class InConvDw(nn.Module):
    method __init__ (line 52) | def __init__(self, in_channels, out_channels):
    method forward (line 57) | def forward(self, x):
  class Down (line 60) | class Down(nn.Module):
    method __init__ (line 62) | def __init__(self, in_channels, out_channels):
    method forward (line 69) | def forward(self, x):
  class Up (line 72) | class Up(nn.Module):
    method __init__ (line 74) | def __init__(self, in_channels, out_channels):
    method forward (line 79) | def forward(self, x1, x2):
  class OutConv (line 89) | class OutConv(nn.Module):
    method __init__ (line 90) | def __init__(self, in_channels, out_channels):
    method forward (line 93) | def forward(self, x):
  class AudioConvWenet (line 96) | class AudioConvWenet(nn.Module):
    method __init__ (line 97) | def __init__(self):
    method forward (line 116) | def forward(self, x):
  class AudioConvHubert (line 132) | class AudioConvHubert(nn.Module):
    method __init__ (line 133) | def __init__(self):
    method forward (line 152) | def forward(self, x):
  class Model (line 168) | class Model(nn.Module):
    method __init__ (line 169) | def __init__(self,n_channels=6, mode='wenet'):
    method forward (line 198) | def forward(self, x, audio_feat):
  function reparameterize_model (line 226) | def reparameterize_model(model: torch.nn.Module) -> torch.nn.Module:
  function check_onnx (line 240) | def check_onnx(torch_out, torch_in, audio):

FILE: wav2lip/audio.py
  function load_wav (line 9) | def load_wav(path, sr):
  function save_wav (line 12) | def save_wav(wav, path, sr):
  function save_wavenet_wav (line 17) | def save_wavenet_wav(wav, path, sr):
  function preemphasis (line 20) | def preemphasis(wav, k, preemphasize=True):
  function inv_preemphasis (line 25) | def inv_preemphasis(wav, k, inv_preemphasize=True):
  function get_hop_size (line 30) | def get_hop_size():
  function linearspectrogram (line 37) | def linearspectrogram(wav):
  function melspectrogram (line 45) | def melspectrogram(wav):
  function _lws_processor (line 53) | def _lws_processor():
  function _stft (line 57) | def _stft(y):
  function num_frames (line 65) | def num_frames(length, fsize, fshift):
  function pad_lr (line 76) | def pad_lr(x, fsize, fshift):
  function librosa_pad_lr (line 86) | def librosa_pad_lr(x, fsize, fshift):
  function _linear_to_mel (line 92) | def _linear_to_mel(spectogram):
  function _build_mel_basis (line 98) | def _build_mel_basis():
  function _amp_to_db (line 103) | def _amp_to_db(x):
  function _db_to_amp (line 107) | def _db_to_amp(x):
  function _normalize (line 110) | def _normalize(S):
  function _denormalize (line 124) | def _denormalize(D):

FILE: wav2lip/face_detection/api.py
  class LandmarksType (line 17) | class LandmarksType(Enum):
  class NetworkSize (line 30) | class NetworkSize(Enum):
    method __new__ (line 36) | def __new__(cls, value):
    method __int__ (line 41) | def __int__(self):
  class FaceAlignment (line 46) | class FaceAlignment:
    method __init__ (line 47) | def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
    method get_detections_for_batch (line 64) | def get_detections_for_batch(self, images):

FILE: wav2lip/face_detection/detection/core.py
  class FaceDetector (line 9) | class FaceDetector(object):
    method __init__ (line 18) | def __init__(self, device, verbose):
    method detect_from_image (line 32) | def detect_from_image(self, tensor_or_path):
    method detect_from_directory (line 54) | def detect_from_directory(self, path, extensions=['.jpg', '.png'], rec...
    method reference_scale (line 104) | def reference_scale(self):
    method reference_x_shift (line 108) | def reference_x_shift(self):
    method reference_y_shift (line 112) | def reference_y_shift(self):
    method tensor_or_path_to_ndarray (line 116) | def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):

FILE: wav2lip/face_detection/detection/sfd/bbox.py
  function IOU (line 17) | def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
  function bboxlog (line 30) | def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
  function bboxloginv (line 37) | def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
  function nms (line 44) | def nms(dets, thresh):
  function encode (line 67) | def encode(matched, priors, variances):
  function decode (line 91) | def decode(loc, priors, variances):
  function batch_decode (line 111) | def batch_decode(loc, priors, variances):

FILE: wav2lip/face_detection/detection/sfd/detect.py
  function detect (line 19) | def detect(net, img, device):
  function batch_detect (line 58) | def batch_detect(net, imgs, device):
  function flip_detect (line 96) | def flip_detect(net, img, device):
  function pts_to_bb (line 109) | def pts_to_bb(pts):

FILE: wav2lip/face_detection/detection/sfd/net_s3fd.py
  class L2Norm (line 6) | class L2Norm(nn.Module):
    method __init__ (line 7) | def __init__(self, n_channels, scale=1.0):
    method forward (line 16) | def forward(self, x):
  class s3fd (line 22) | class s3fd(nn.Module):
    method __init__ (line 23) | def __init__(self):
    method forward (line 70) | def forward(self, x):

FILE: wav2lip/face_detection/detection/sfd/sfd_detector.py
  class SFDDetector (line 16) | class SFDDetector(FaceDetector):
    method __init__ (line 17) | def __init__(self, device, path_to_detector=os.path.join(os.path.dirna...
    method detect_from_image (line 31) | def detect_from_image(self, tensor_or_path):
    method detect_from_batch (line 41) | def detect_from_batch(self, images):
    method reference_scale (line 50) | def reference_scale(self):
    method reference_x_shift (line 54) | def reference_x_shift(self):
    method reference_y_shift (line 58) | def reference_y_shift(self):

FILE: wav2lip/face_detection/models.py
  function conv3x3 (line 7) | def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
  class ConvBlock (line 13) | class ConvBlock(nn.Module):
    method __init__ (line 14) | def __init__(self, in_planes, out_planes):
    method forward (line 33) | def forward(self, x):
  class Bottleneck (line 58) | class Bottleneck(nn.Module):
    method __init__ (line 62) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 75) | def forward(self, x):
  class HourGlass (line 98) | class HourGlass(nn.Module):
    method __init__ (line 99) | def __init__(self, num_modules, depth, num_features):
    method _generate_network (line 107) | def _generate_network(self, level):
    method _forward (line 119) | def _forward(self, level, inp):
    method forward (line 141) | def forward(self, x):
  class FAN (line 145) | class FAN(nn.Module):
    method __init__ (line 147) | def __init__(self, num_modules=1):
    method forward (line 174) | def forward(self, x):
  class ResNetDepth (line 204) | class ResNetDepth(nn.Module):
    method __init__ (line 206) | def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes...
    method _make_layer (line 229) | def _make_layer(self, block, planes, blocks, stride=1):
    method forward (line 246) | def forward(self, x):

FILE: wav2lip/face_detection/utils.py
  function _gaussian (line 11) | def _gaussian(
  function draw_gaussian (line 37) | def draw_gaussian(image, point, sigma):
  function transform (line 56) | def transform(point, center, scale, resolution, invert=False):
  function crop (line 92) | def crop(image, center, scale, resolution=256.0):
  function get_preds_fromhm (line 132) | def get_preds_fromhm(hm, center=None, scale=None):
  function get_preds_fromhm_batch (line 172) | def get_preds_fromhm_batch(hm, centers=None, scales=None):
  function shuffle_lr (line 212) | def shuffle_lr(parts, pairs=None):
  function flip (line 237) | def flip(tensor, is_label=False):
  function appdata_dir (line 259) | def appdata_dir(appname=None, roaming=False):

FILE: wav2lip/genavatar.py
  function osmakedirs (line 27) | def osmakedirs(path_list):
  function video2imgs (line 31) | def video2imgs(vid_path, save_path, ext = '.png',cut_frame = 10000000):
  function read_imgs (line 45) | def read_imgs(img_list):
  function get_smoothened_boxes (line 53) | def get_smoothened_boxes(boxes, T):
  function face_detect (line 62) | def face_detect(images):

FILE: wav2lip/hparams.py
  function get_image_list (line 4) | def get_image_list(data_root, split):
  class HParams (line 15) | class HParams:
    method __init__ (line 16) | def __init__(self, **kwargs):
    method __getattr__ (line 22) | def __getattr__(self, key):
    method set_hparam (line 27) | def set_hparam(self, key, value):
  function hparams_debug_string (line 98) | def hparams_debug_string():

FILE: web/asr/main.js
  function addresschange (line 61) | function addresschange()
  function play_file (line 186) | function play_file()
  function start_file_send (line 194) | function start_file_send()
  function on_recoder_mode_change (line 221) | function on_recoder_mode_change()
  function getHotwords (line 260) | function getHotwords(){
  function getAsrMode (line 289) | function getAsrMode(){
  function handleWithTimestamp (line 310) | function handleWithTimestamp(tmptext,tmptime)
  function is_speaking (line 348) | async function is_speaking() {
  function waitSpeakingEnd (line 363) | async function waitSpeakingEnd() {
  function getJsonMessage (line 384) | function getJsonMessage( jsonMsg ) {
  function getConnState (line 435) | function getConnState( connState ) {
  function record (line 466) | function record()
  function start (line 482) | function start() {
  function stop (line 513) | function stop() {
  function clear (line 582) | function clear() {
  function recProcess (line 593) | function recProcess( buffer, powerLevel, bufferDuration, bufferSampleRat...
  function getUseITN (line 617) | function getUseITN() {

FILE: web/asr/recorder-core.js
  function initFn (line 598) | function initFn(set){

FILE: web/asr/wsconnecter.js
  function WebSocketConnectMethod (line 7) | function WebSocketConnectMethod( config ) { //定义socket连接方法类

FILE: web/client.js
  function negotiate (line 3) | function negotiate() {
  function start (line 45) | function start() {
  function stop (line 70) | function stop() {

FILE: web/srs.sdk.js
  function SrsError (line 10) | function SrsError(name, message) {
  function SrsRtcPublisherAsync (line 20) | function SrsRtcPublisherAsync() {
  function SrsRtcPlayerAsync (line 275) | function SrsRtcPlayerAsync() {
  function SrsRtcWhipWhepAsync (line 515) | function SrsRtcWhipWhepAsync() {
  function SrsRtcFormatSenders (line 671) | function SrsRtcFormatSenders(senders, kind) {

FILE: web/whep.js
  function negotiate (line 3) | function negotiate() {
  function start (line 43) | function start() {
  function stop (line 68) | function stop() {

FILE: webrtc.py
  class PlayerStreamTrack (line 48) | class PlayerStreamTrack(MediaStreamTrack):
    method __init__ (line 53) | def __init__(self, player, kind):
    method next_timestamp (line 68) | async def next_timestamp(self) -> Tuple[int, fractions.Fraction]:
    method recv (line 110) | async def recv(self) -> Union[Frame, Packet]:
    method stop (line 148) | def stop(self):
  function player_worker_thread (line 158) | def player_worker_thread(
  class HumanPlayer (line 167) | class HumanPlayer:
    method __init__ (line 169) | def __init__(
    method notify (line 185) | def notify(self,eventpoint):
    method audio (line 190) | def audio(self) -> MediaStreamTrack:
    method video (line 197) | def video(self) -> MediaStreamTrack:
    method _start (line 203) | def _start(self, track: PlayerStreamTrack) -> None:
    method _stop (line 221) | def _stop(self, track: PlayerStreamTrack) -> None:
    method __log_debug (line 234) | def __log_debug(self, msg: str, *args) -> None:

Download .json

Condensed preview — 115 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (3,536K chars).

[
  {
    "path": ".github/FUNDING.yml",
    "chars": 16,
    "preview": "github: [lipku]\n"
  },
  {
    "path": ".gitignore",
    "chars": 209,
    "preview": "__pycache__/\nbuild/\n*.egg-info/\n*.so\n*.mp4\n\ntmp*\ntrial*/\n\ndata\ndata_utils/face_tracking/3DMM/*\ndata_utils/face_parsing/7"
  },
  {
    "path": "Dockerfile",
    "chars": 1797,
    "preview": "# Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors retain all"
  },
  {
    "path": "LICENSE",
    "chars": 11344,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README-EN.md",
    "chars": 7752,
    "preview": "# English | [中文版](./README.md)  \n <p align=\"center\">\n <img src=\"./assets/LiveTalking-logo.jpg\" align=\"middle\" width = \"6"
  },
  {
    "path": "README.md",
    "chars": 4784,
    "preview": " # [English](./README-EN.md) | 中文版  \n <p align=\"center\">\n <img src=\"./assets/LiveTalking-logo.jpg\" align=\"middle\" width "
  },
  {
    "path": "app.py",
    "chars": 16145,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "assets/faq.md",
    "chars": 1174,
    "preview": "1.  pytorch3d安装不成功\\\n    下载源码编译\n\n```bash\ngit clone https://github.com/facebookresearch/pytorch3d.git\npython setup.py inst"
  },
  {
    "path": "baseasr.py",
    "chars": 3106,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "basereal.py",
    "chars": 16458,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "hubertasr.py",
    "chars": 1509,
    "preview": "import time\r\nimport torch\r\nimport numpy as np\r\nfrom baseasr import BaseASR\r\nfrom ultralight.audio2feature import Audio2F"
  },
  {
    "path": "lightreal.py",
    "chars": 11422,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "lipasr.py",
    "chars": 2615,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "lipreal.py",
    "chars": 10079,
    "preview": "###############################################################################\r\n#  Copyright (C) 2024 LiveTalking@lipku"
  },
  {
    "path": "llm.py",
    "chars": 1750,
    "preview": "import time\nimport os\nfrom basereal import BaseReal\nfrom logger import logger\n\ndef llm_response(message,nerfreal:BaseRea"
  },
  {
    "path": "logger.py",
    "chars": 577,
    "preview": "import logging\n \n# 配置日志器\nlogger = logging.getLogger(__name__)\nlogger.setLevel(logging.DEBUG)\nformatter = logging.Formatt"
  },
  {
    "path": "museasr.py",
    "chars": 2668,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "musereal.py",
    "chars": 13867,
    "preview": "###############################################################################\r\n#  Copyright (C) 2024 LiveTalking@lipku"
  },
  {
    "path": "musetalk/genavatar.py",
    "chars": 14948,
    "preview": "import argparse\nimport glob\nimport json\nimport os\nimport pickle\nimport shutil\n\nimport cv2\nimport numpy as np\nimport torc"
  },
  {
    "path": "musetalk/myutil.py",
    "chars": 965,
    "preview": "import numpy as np\nimport cv2\nimport copy\n\ndef get_image_blending(image,face,face_box,mask_array,crop_box):\n    body = i"
  },
  {
    "path": "musetalk/utils/__init__.py",
    "chars": 160,
    "preview": "import sys\nfrom os.path import abspath, dirname\ncurrent_dir = dirname(abspath(__file__))\nparent_dir = dirname(current_di"
  },
  {
    "path": "musetalk/utils/audio_processor.py",
    "chars": 4227,
    "preview": "import math\nimport os\n\nimport librosa\nimport numpy as np\nimport torch\nfrom einops import rearrange\nfrom transformers imp"
  },
  {
    "path": "musetalk/utils/blending.py",
    "chars": 4459,
    "preview": "from PIL import Image\nimport numpy as np\nimport cv2\nimport copy\n\n\ndef get_crop_box(box, expand):\n    x, y, x1, y1 = box\n"
  },
  {
    "path": "musetalk/utils/dwpose/default_runtime.py",
    "chars": 1431,
    "preview": "default_scope = 'mmpose'\n\n# hooks\ndefault_hooks = dict(\n    timer=dict(type='IterTimerHook'),\n    logger=dict(type='Logg"
  },
  {
    "path": "musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py",
    "chars": 7364,
    "preview": "#_base_ = ['../../../_base_/default_runtime.py']\n_base_ = ['default_runtime.py']\n\n# runtime\nmax_epochs = 270\nstage2_num_"
  },
  {
    "path": "musetalk/utils/face_detection/README.md",
    "chars": 209,
    "preview": "The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrian"
  },
  {
    "path": "musetalk/utils/face_detection/__init__.py",
    "chars": 196,
    "preview": "# -*- coding: utf-8 -*-\n\n__author__ = \"\"\"Adrian Bulat\"\"\"\n__email__ = 'adrian.bulat@nottingham.ac.uk'\n__version__ = '1.0."
  },
  {
    "path": "musetalk/utils/face_detection/api.py",
    "chars": 10034,
    "preview": "from __future__ import print_function\nimport os\nimport torch\nfrom torch.utils.model_zoo import load_url\nfrom enum import"
  },
  {
    "path": "musetalk/utils/face_detection/detection/__init__.py",
    "chars": 30,
    "preview": "from .core import FaceDetector"
  },
  {
    "path": "musetalk/utils/face_detection/detection/core.py",
    "chars": 4868,
    "preview": "import logging\nimport glob\nfrom tqdm import tqdm\nimport numpy as np\nimport torch\nimport cv2\n\n\nclass FaceDetector(object)"
  },
  {
    "path": "musetalk/utils/face_detection/detection/sfd/__init__.py",
    "chars": 53,
    "preview": "from .sfd_detector import SFDDetector as FaceDetector"
  },
  {
    "path": "musetalk/utils/face_detection/detection/sfd/bbox.py",
    "chars": 4279,
    "preview": "from __future__ import print_function\nimport os\nimport sys\nimport cv2\nimport random\nimport datetime\nimport time\nimport m"
  },
  {
    "path": "musetalk/utils/face_detection/detection/sfd/detect.py",
    "chars": 3792,
    "preview": "import torch\nimport torch.nn.functional as F\n\nimport os\nimport sys\nimport cv2\nimport random\nimport datetime\nimport math\n"
  },
  {
    "path": "musetalk/utils/face_detection/detection/sfd/net_s3fd.py",
    "chars": 5291,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass L2Norm(nn.Module):\n    def __init__(self, n_c"
  },
  {
    "path": "musetalk/utils/face_detection/detection/sfd/sfd_detector.py",
    "chars": 1809,
    "preview": "import os\nimport cv2\nfrom torch.utils.model_zoo import load_url\n\nfrom ..core import FaceDetector\n\nfrom .net_s3fd import "
  },
  {
    "path": "musetalk/utils/face_detection/models.py",
    "chars": 8619,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport math\n\n\ndef conv3x3(in_planes, out_planes, strd"
  },
  {
    "path": "musetalk/utils/face_detection/utils.py",
    "chars": 11808,
    "preview": "from __future__ import print_function\nimport os\nimport sys\nimport time\nimport torch\nimport math\nimport numpy as np\nimpor"
  },
  {
    "path": "musetalk/utils/face_parsing/__init__.py",
    "chars": 4700,
    "preview": "import torch\nimport time\nimport os\nimport cv2\nimport numpy as np\nfrom PIL import Image\nfrom .model import BiSeNet\nimport"
  },
  {
    "path": "musetalk/utils/face_parsing/model.py",
    "chars": 10676,
    "preview": "#!/usr/bin/python\n# -*- encoding: utf-8 -*-\n\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport "
  },
  {
    "path": "musetalk/utils/face_parsing/resnet.py",
    "chars": 3706,
    "preview": "#!/usr/bin/python\n# -*- encoding: utf-8 -*-\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport t"
  },
  {
    "path": "musetalk/utils/preprocessing.py",
    "chars": 6899,
    "preview": "import sys\nfrom face_detection import FaceAlignment,LandmarksType\nfrom os import listdir, path\nimport subprocess\nimport "
  },
  {
    "path": "musetalk/utils/training_utils.py",
    "chars": 12167,
    "preview": "import os\nimport json\nimport logging\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nfrom torch.optim.lr_"
  },
  {
    "path": "musetalk/utils/utils.py",
    "chars": 11737,
    "preview": "import os\nimport cv2\nimport numpy as np\nimport torch\nfrom typing import Union, List\nimport torch.nn.functional as F\nfrom"
  },
  {
    "path": "musetalk/whisper/audio2feature.py",
    "chars": 6263,
    "preview": "import os\nfrom .whisper import load_model\nimport soundfile as sf\nimport numpy as np\nimport time\nimport sys\nfrom transfor"
  },
  {
    "path": "musetalk/whisper/whisper/__init__.py",
    "chars": 5529,
    "preview": "import hashlib\nimport io\nimport os\nimport urllib\nimport warnings\nfrom typing import List, Optional, Union\n\nimport torch\n"
  },
  {
    "path": "musetalk/whisper/whisper/__main__.py",
    "chars": 36,
    "preview": "from .transcribe import cli\n\n\ncli()\n"
  },
  {
    "path": "musetalk/whisper/whisper/assets/gpt2/merges.txt",
    "chars": 420610,
    "preview": "#version: 0.2 - Trained by `huggingface/tokenizers`\nĠ t\nĠ a\nh e\ni n\nr e\no n\nĠt he\ne r\nĠ s\na t\nĠ w\nĠ o\ne n\nĠ c\ni t\ni s\na "
  },
  {
    "path": "musetalk/whisper/whisper/assets/gpt2/special_tokens_map.json",
    "chars": 90,
    "preview": "{\"bos_token\": \"<|endoftext|>\", \"eos_token\": \"<|endoftext|>\", \"unk_token\": \"<|endoftext|>\"}"
  },
  {
    "path": "musetalk/whisper/whisper/assets/gpt2/tokenizer_config.json",
    "chars": 236,
    "preview": "{\"unk_token\": \"<|endoftext|>\", \"bos_token\": \"<|endoftext|>\", \"eos_token\": \"<|endoftext|>\", \"add_prefix_space\": false, \"m"
  },
  {
    "path": "musetalk/whisper/whisper/assets/gpt2/vocab.json",
    "chars": 762248,
    "preview": "{\"!\":0,\"\\\"\":1,\"#\":2,\"$\":3,\"%\":4,\"&\":5,\"'\":6,\"(\":7,\")\":8,\"*\":9,\"+\":10,\",\":11,\"-\":12,\".\":13,\"/\":14,\"0\":15,\"1\":16,\"2\":17,\"3"
  },
  {
    "path": "musetalk/whisper/whisper/assets/multilingual/added_tokens.json",
    "chars": 25,
    "preview": "{\"<|endoftext|>\": 50257}\n"
  },
  {
    "path": "musetalk/whisper/whisper/assets/multilingual/merges.txt",
    "chars": 406801,
    "preview": "Ġ t\nĠ a\nĠt h\ni n\ne r\nĠ w\nĠ s\no u\nĠth e\nr e\no n\na t\ne n\nĠ c\ni t\ni s\nĠ b\nn d\nĠ d\nĠ m\nĠ h\nĠ o\nin g\ne s\nĠ p\nĠt o\na n\nĠ f\no r"
  },
  {
    "path": "musetalk/whisper/whisper/assets/multilingual/special_tokens_map.json",
    "chars": 90,
    "preview": "{\"bos_token\": \"<|endoftext|>\", \"eos_token\": \"<|endoftext|>\", \"unk_token\": \"<|endoftext|>\"}"
  },
  {
    "path": "musetalk/whisper/whisper/assets/multilingual/tokenizer_config.json",
    "chars": 604,
    "preview": "{\"unk_token\": {\"content\": \"<|endoftext|>\", \"single_word\": false, \"lstrip\": false, \"rstrip\": false, \"normalized\": true, \""
  },
  {
    "path": "musetalk/whisper/whisper/assets/multilingual/vocab.json",
    "chars": 848825,
    "preview": "{\"!\": 0, \"\\\"\": 1, \"#\": 2, \"$\": 3, \"%\": 4, \"&\": 5, \"'\": 6, \"(\": 7, \")\": 8, \"*\": 9, \"+\": 10, \",\": 11, \"-\": 12, \".\": 13, \"/"
  },
  {
    "path": "musetalk/whisper/whisper/audio.py",
    "chars": 4071,
    "preview": "import os\nfrom functools import lru_cache\nfrom typing import Union\n\nimport ffmpeg\nimport numpy as np\nimport torch\nimport"
  },
  {
    "path": "musetalk/whisper/whisper/decoding.py",
    "chars": 30930,
    "preview": "from dataclasses import dataclass, field\nfrom typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE"
  },
  {
    "path": "musetalk/whisper/whisper/model.py",
    "chars": 10591,
    "preview": "from dataclasses import dataclass\nfrom typing import Dict\nfrom typing import Iterable, Optional\n\nimport numpy as np\nimpo"
  },
  {
    "path": "musetalk/whisper/whisper/normalizers/__init__.py",
    "chars": 82,
    "preview": "from .basic import BasicTextNormalizer\nfrom .english import EnglishTextNormalizer\n"
  },
  {
    "path": "musetalk/whisper/whisper/normalizers/basic.py",
    "chars": 1865,
    "preview": "import re\nimport unicodedata\n\nimport regex\n\n# non-ASCII letters that are not separated by \"NFKD\" normalization\nADDITIONA"
  },
  {
    "path": "musetalk/whisper/whisper/normalizers/english.json",
    "chars": 56145,
    "preview": "{\n    \"accessorise\": \"accessorize\",\n    \"accessorised\": \"accessorized\",\n    \"accessorises\": \"accessorizes\",\n    \"accesso"
  },
  {
    "path": "musetalk/whisper/whisper/normalizers/english.py",
    "chars": 20768,
    "preview": "import json\nimport os\nimport re\nfrom fractions import Fraction\nfrom typing import Iterator, List, Match, Optional, Union"
  },
  {
    "path": "musetalk/whisper/whisper/tokenizer.py",
    "chars": 9799,
    "preview": "import os\nfrom dataclasses import dataclass\nfrom functools import lru_cache\nfrom typing import List, Optional, Tuple, Un"
  },
  {
    "path": "musetalk/whisper/whisper/transcribe.py",
    "chars": 10728,
    "preview": "import argparse\nimport os\nimport warnings\nfrom typing import List, Optional, Tuple, Union, TYPE_CHECKING\n\nimport numpy a"
  },
  {
    "path": "musetalk/whisper/whisper/utils.py",
    "chars": 2660,
    "preview": "import zlib\nfrom typing import Iterator, TextIO\n\n\ndef exact_div(x, y):\n    assert x % y == 0\n    return x // y\n\n\ndef str"
  },
  {
    "path": "requirements.txt",
    "chars": 483,
    "preview": "torch-ema\nninja\ntrimesh\nopencv-python\ntensorboardX\nnumpy \npandas\ntqdm\nmatplotlib\nPyMCubes\nrich\ndearpygui\npackaging\nscipy"
  },
  {
    "path": "ttsreal.py",
    "chars": 39744,
    "preview": "###############################################################################\n#  Copyright (C) 2024 LiveTalking@lipku "
  },
  {
    "path": "ultralight/audio2feature.py",
    "chars": 3956,
    "preview": "from transformers import Wav2Vec2Processor, HubertModel\nimport torch\nimport numpy as np\n\n\nclass Audio2Feature():\n    def"
  },
  {
    "path": "ultralight/face_detect_utils/base_module.py",
    "chars": 17497,
    "preview": "#!/usr/bin/env python3\n# -*- coding:utf-8 -*-\n\nimport torch\nfrom torch.nn import Module, Sequential, Conv2d, BatchNorm2d"
  },
  {
    "path": "ultralight/face_detect_utils/detect_face.py",
    "chars": 5136,
    "preview": "import cv2\nimport time\nimport argparse\nimport numpy as np\n\nclass SCRFD():\n    def __init__(self, onnxmodel, confThreshol"
  },
  {
    "path": "ultralight/face_detect_utils/get_landmark.py",
    "chars": 3904,
    "preview": "import argparse\nfrom os import wait3\n\nimport numpy as np\nimport cv2\nimport math\n\nimport torch\nimport torchvision\nfrom .d"
  },
  {
    "path": "ultralight/face_detect_utils/mean_face.txt",
    "chars": 2394,
    "preview": "0.07823661 0.22561455 0.07775262 0.28360514 0.07767719 0.34125846 0.07962388 0.39897107 0.0852785 0.45675877 0.0948296 0"
  },
  {
    "path": "ultralight/face_detect_utils/pfld_mobileone.py",
    "chars": 16986,
    "preview": "#!/usr/bin/env python3\n# -*- coding:utf-8 -*-\n\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom "
  },
  {
    "path": "ultralight/genavatar-bak.py",
    "chars": 3857,
    "preview": "import argparse\nimport os\nimport cv2\nimport torch\nimport numpy as np\nimport torch.nn as nn\nfrom torch import optim\nfrom "
  },
  {
    "path": "ultralight/genavatar.py",
    "chars": 3072,
    "preview": "import argparse\nimport os\nimport cv2\nimport torch\nimport numpy as np\nimport torch.nn as nn\nfrom torch import optim\nfrom "
  },
  {
    "path": "ultralight/unet.py",
    "chars": 10132,
    "preview": "import time\nimport math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass InvertedResidual(nn.Mo"
  },
  {
    "path": "wav2lip/audio.py",
    "chars": 4516,
    "preview": "import librosa\nimport librosa.filters\nimport numpy as np\n# import tensorflow as tf\nfrom scipy import signal\nfrom scipy.i"
  },
  {
    "path": "wav2lip/face_detection/README.md",
    "chars": 209,
    "preview": "The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrian"
  },
  {
    "path": "wav2lip/face_detection/__init__.py",
    "chars": 183,
    "preview": "# -*- coding: utf-8 -*-\n\n__author__ = \"\"\"Adrian Bulat\"\"\"\n__email__ = 'adrian.bulat@nottingham.ac.uk'\n__version__ = '1.0."
  },
  {
    "path": "wav2lip/face_detection/api.py",
    "chars": 2266,
    "preview": "from __future__ import print_function\nimport os\nimport torch\nfrom torch.utils.model_zoo import load_url\nfrom enum import"
  },
  {
    "path": "wav2lip/face_detection/detection/__init__.py",
    "chars": 30,
    "preview": "from .core import FaceDetector"
  },
  {
    "path": "wav2lip/face_detection/detection/core.py",
    "chars": 4868,
    "preview": "import logging\nimport glob\nfrom tqdm import tqdm\nimport numpy as np\nimport torch\nimport cv2\n\n\nclass FaceDetector(object)"
  },
  {
    "path": "wav2lip/face_detection/detection/sfd/__init__.py",
    "chars": 53,
    "preview": "from .sfd_detector import SFDDetector as FaceDetector"
  },
  {
    "path": "wav2lip/face_detection/detection/sfd/bbox.py",
    "chars": 4279,
    "preview": "from __future__ import print_function\nimport os\nimport sys\nimport cv2\nimport random\nimport datetime\nimport time\nimport m"
  },
  {
    "path": "wav2lip/face_detection/detection/sfd/detect.py",
    "chars": 3769,
    "preview": "import torch\nimport torch.nn.functional as F\n\nimport os\nimport sys\nimport cv2\nimport random\nimport datetime\nimport math\n"
  },
  {
    "path": "wav2lip/face_detection/detection/sfd/net_s3fd.py",
    "chars": 5291,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass L2Norm(nn.Module):\n    def __init__(self, n_c"
  },
  {
    "path": "wav2lip/face_detection/detection/sfd/sfd_detector.py",
    "chars": 1809,
    "preview": "import os\nimport cv2\nfrom torch.utils.model_zoo import load_url\n\nfrom ..core import FaceDetector\n\nfrom .net_s3fd import "
  },
  {
    "path": "wav2lip/face_detection/models.py",
    "chars": 8619,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport math\n\n\ndef conv3x3(in_planes, out_planes, strd"
  },
  {
    "path": "wav2lip/face_detection/utils.py",
    "chars": 11808,
    "preview": "from __future__ import print_function\nimport os\nimport sys\nimport time\nimport torch\nimport math\nimport numpy as np\nimpor"
  },
  {
    "path": "wav2lip/genavatar.py",
    "chars": 4447,
    "preview": "from os import listdir, path\nimport numpy as np\nimport scipy, cv2, os, sys, argparse\nimport json, subprocess, random, st"
  },
  {
    "path": "wav2lip/hparams.py",
    "chars": 3556,
    "preview": "from glob import glob\nimport os\n\ndef get_image_list(data_root, split):\n\tfilelist = []\n\n\twith open('filelists/{}.txt'.for"
  },
  {
    "path": "web/asr/index.html",
    "chars": 2910,
    "preview": "<!DOCTYPE html>\r\n<html>\r\n\t<head>\r\n\t\t<meta charset=\"utf-8\" />\r\n\t\t<meta name=\"viewport\" content=\"width=device-width,initia"
  },
  {
    "path": "web/asr/main.js",
    "chars": 15288,
    "preview": "/**\r\n * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights\r\n * Reserved. MIT License  (https:"
  },
  {
    "path": "web/asr/pcm.js",
    "chars": 2115,
    "preview": "/*\npcm编码器+编码引擎\nhttps://github.com/xiangyuecn/Recorder\n\n编码原理：本编码器输出的pcm格式数据其实就是Recorder中的buffers原始数据（经过了重新采样），16位时为LE小端模式"
  },
  {
    "path": "web/asr/recorder-core.js",
    "chars": 40747,
    "preview": "/*\n录音\nhttps://github.com/xiangyuecn/Recorder\n*/\n(function(factory){\n\tfactory(window);\n\t//umd returnExports.js\n\tif(typeof"
  },
  {
    "path": "web/asr/wav.js",
    "chars": 2218,
    "preview": "/*\nwav编码器+编码引擎\nhttps://github.com/xiangyuecn/Recorder\n\n当然最佳推荐使用mp3、wav格式，代码也是优先照顾这两种格式\n浏览器支持情况\nhttps://developer.mozilla"
  },
  {
    "path": "web/asr/wsconnecter.js",
    "chars": 2457,
    "preview": "/**\r\n * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights\r\n * Reserved. MIT License  (https:"
  },
  {
    "path": "web/chat.html",
    "chars": 2107,
    "preview": "<!-- index.html -->\n<html>\n<head>\n  <script type=\"text/javascript\" src=\"mpegts-1.7.3.min.js\"></script>\n  <script type=\"t"
  },
  {
    "path": "web/client.js",
    "chars": 2735,
    "preview": "var pc = null;\n\nfunction negotiate() {\n    pc.addTransceiver('video', { direction: 'recvonly' });\n    pc.addTransceiver("
  },
  {
    "path": "web/dashboard.html",
    "chars": 29201,
    "preview": "<!DOCTYPE html>\n<html lang=\"zh-CN\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-wi"
  },
  {
    "path": "web/echo.html",
    "chars": 1880,
    "preview": "<!-- index.html -->\n<html>\n<head>\n  <script type=\"text/javascript\" src=\"mpegts-1.7.3.min.js\"></script>\n  <script type=\"t"
  },
  {
    "path": "web/echoapi.html",
    "chars": 2107,
    "preview": "<!-- index.html -->\n<html>\n<head>\n  <script type=\"text/javascript\" src=\"mpegts-1.7.3.min.js\"></script>\n  <script type=\"t"
  },
  {
    "path": "web/rtcpush.html",
    "chars": 4007,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/rtcpushapi-asr.html",
    "chars": 4451,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/rtcpushapi.html",
    "chars": 4339,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/rtcpushchat.html",
    "chars": 4339,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/srs.sdk.js",
    "chars": 27565,
    "preview": "\n//\n// Copyright (c) 2013-2021 Winlin\n//\n// SPDX-License-Identifier: MIT\n//\n\n'use strict';\n\nfunction SrsError(name, mess"
  },
  {
    "path": "web/webrtc.html",
    "chars": 2171,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/webrtcapi-asr.html",
    "chars": 5906,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/webrtcapi-custom.html",
    "chars": 3271,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/webrtcapi.html",
    "chars": 6009,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/webrtcchat.html",
    "chars": 5841,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta charset=\"UTF-8\"/>\n    <meta name=\"viewport\" content=\"width=device-width, initial"
  },
  {
    "path": "web/whep.js",
    "chars": 2279,
    "preview": "var pc = null;\n\nfunction negotiate() {\n    var host = window.location.hostname\n    pc.addTransceiver('video', { directio"
  },
  {
    "path": "webrtc.py",
    "chars": 8864,
    "preview": "###############################################################################\r\n#  Copyright (C) 2024 LiveTalking@lipku"
  }
]

// ... and 1 more files (download for full content)

About this extraction

This page contains the full source code of the lipku/LiveTalking GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 115 files (3.0 MB), approximately 801.9k tokens, and a symbol index with 671 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo