[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n**/__pycache__/\n*.py[cod]\n**/*.py[cod]\n*$py.class\n\n# Model weights\n**/*.pth\n**/*.onnx\n\npretrained_weights/*.md\npretrained_weights/docs\npretrained_weights/liveportrait\npretrained_weights/liveportrait_animals\n\n# Ipython notebook\n*.ipynb\n\n# Temporary files or benchmark resources\nanimations/*\ntmp/*\n.vscode/launch.json\n**/*.DS_Store\ngradio_temp/**\n\n# Windows dependencies\nffmpeg/\nLivePortrait_env/\n\n# XPose build files\nsrc/utils/dependencies/XPose/models/UniPose/ops/build\nsrc/utils/dependencies/XPose/models/UniPose/ops/dist\nsrc/utils/dependencies/XPose/models/UniPose/ops/MultiScaleDeformableAttention.egg-info\n"
  },
  {
    "path": ".vscode/settings.json",
    "content": "{\n    \"[python]\": {\n        \"editor.tabSize\": 4\n    },\n    \"files.eol\": \"\\n\",\n    \"files.insertFinalNewline\": true,\n    \"files.trimFinalNewlines\": true,\n    \"files.trimTrailingWhitespace\": true,\n    \"files.exclude\": {\n        \"**/.git\": true,\n        \"**/.svn\": true,\n        \"**/.hg\": true,\n        \"**/CVS\": true,\n        \"**/.DS_Store\": true,\n        \"**/Thumbs.db\": true,\n        \"**/*.crswap\": true,\n        \"**/__pycache__\": true\n    }\n}\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2024 Kuaishou Visual Generation and Interaction Center\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n---\n\nThe code of InsightFace is released under the MIT License.\nThe models of InsightFace are for non-commercial research purposes only.\n\nIf you want to use the LivePortrait project for commercial purposes, you \nshould remove and replace InsightFace’s detection models to fully comply with \nthe MIT license.\n"
  },
  {
    "path": "app.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThe entrance of the gradio for human\n\"\"\"\n\nimport os\nimport tyro\nimport subprocess\nimport gradio as gr\nimport os.path as osp\nfrom src.utils.helper import load_description\nfrom src.gradio_pipeline import GradioPipeline\nfrom src.config.crop_config import CropConfig\nfrom src.config.argument_config import ArgumentConfig\nfrom src.config.inference_config import InferenceConfig\n\n\ndef partial_fields(target_class, kwargs):\n    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})\n\n\ndef fast_check_ffmpeg():\n    try:\n        subprocess.run([\"ffmpeg\", \"-version\"], capture_output=True, check=True)\n        return True\n    except:\n        return False\n\n\n# set tyro theme\ntyro.extras.set_accent_color(\"bright_cyan\")\nargs = tyro.cli(ArgumentConfig)\n\nffmpeg_dir = os.path.join(os.getcwd(), \"ffmpeg\")\nif osp.exists(ffmpeg_dir):\n    os.environ[\"PATH\"] += (os.pathsep + ffmpeg_dir)\n\nif not fast_check_ffmpeg():\n    raise ImportError(\n        \"FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. 
https://ffmpeg.org/download.html\"\n    )\n# specify configs for inference\ninference_cfg = partial_fields(InferenceConfig, args.__dict__)  # use attribute of args to initial InferenceConfig\ncrop_cfg = partial_fields(CropConfig, args.__dict__)  # use attribute of args to initial CropConfig\n# global_tab_selection = None\n\ngradio_pipeline = GradioPipeline(\n    inference_cfg=inference_cfg,\n    crop_cfg=crop_cfg,\n    args=args\n)\n\nif args.gradio_temp_dir not in (None, ''):\n    os.environ[\"GRADIO_TEMP_DIR\"] = args.gradio_temp_dir\n    os.makedirs(args.gradio_temp_dir, exist_ok=True)\n\n\ndef gpu_wrapped_execute_video(*args, **kwargs):\n    return gradio_pipeline.execute_video(*args, **kwargs)\n\n\ndef gpu_wrapped_execute_image_retargeting(*args, **kwargs):\n    return gradio_pipeline.execute_image_retargeting(*args, **kwargs)\n\n\ndef gpu_wrapped_execute_video_retargeting(*args, **kwargs):\n    return gradio_pipeline.execute_video_retargeting(*args, **kwargs)\n\n\ndef reset_sliders(*args, **kwargs):\n    return 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5, True, True\n\n\n# assets\ntitle_md = \"assets/gradio/gradio_title.md\"\nexample_portrait_dir = \"assets/examples/source\"\nexample_video_dir = \"assets/examples/driving\"\ndata_examples_i2v = [\n    [osp.join(example_portrait_dir, \"s9.jpg\"), osp.join(example_video_dir, \"d0.mp4\"), True, True, True, False],\n    [osp.join(example_portrait_dir, \"s6.jpg\"), osp.join(example_video_dir, \"d0.mp4\"), True, True, True, False],\n    [osp.join(example_portrait_dir, \"s10.jpg\"), osp.join(example_video_dir, \"d0.mp4\"), True, True, True, False],\n    [osp.join(example_portrait_dir, \"s5.jpg\"), osp.join(example_video_dir, \"d18.mp4\"), True, True, True, False],\n    [osp.join(example_portrait_dir, \"s7.jpg\"), osp.join(example_video_dir, \"d19.mp4\"), True, True, True, False],\n    [osp.join(example_portrait_dir, \"s2.jpg\"), osp.join(example_video_dir, \"d13.mp4\"), True, True, 
True, True],\n]\ndata_examples_v2v = [\n    [osp.join(example_portrait_dir, \"s13.mp4\"), osp.join(example_video_dir, \"d0.mp4\"), True, True, True, False, 3e-7],\n    # [osp.join(example_portrait_dir, \"s14.mp4\"), osp.join(example_video_dir, \"d18.mp4\"), True, True, True, False, False, 3e-7],\n    # [osp.join(example_portrait_dir, \"s15.mp4\"), osp.join(example_video_dir, \"d19.mp4\"), True, True, True, False, False, 3e-7],\n    [osp.join(example_portrait_dir, \"s18.mp4\"), osp.join(example_video_dir, \"d6.mp4\"), True, True, True, False, 3e-7],\n    # [osp.join(example_portrait_dir, \"s19.mp4\"), osp.join(example_video_dir, \"d6.mp4\"), True, True, True, False, False, 3e-7],\n    [osp.join(example_portrait_dir, \"s20.mp4\"), osp.join(example_video_dir, \"d0.mp4\"), True, True, True, False, 3e-7],\n]\n#################### interface logic ####################\n\n# Define components first\nretargeting_source_scale = gr.Number(minimum=1.8, maximum=3.2, value=2.5, step=0.05, label=\"crop scale\")\nvideo_retargeting_source_scale = gr.Number(minimum=1.8, maximum=3.2, value=2.3, step=0.05, label=\"crop scale\")\ndriving_smooth_observation_variance_retargeting = gr.Number(value=3e-6, label=\"motion smooth strength\", minimum=1e-11, maximum=1e-2, step=1e-8)\nvideo_retargeting_silence = gr.Checkbox(value=False, label=\"keeping the lip silent\")\neye_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label=\"target eyes-open ratio\")\nlip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label=\"target lip-open ratio\")\nvideo_lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label=\"target lip-open ratio\")\nhead_pitch_slider = gr.Slider(minimum=-15.0, maximum=15.0, value=0, step=1, label=\"relative pitch\")\nhead_yaw_slider = gr.Slider(minimum=-25, maximum=25, value=0, step=1, label=\"relative yaw\")\nhead_roll_slider = gr.Slider(minimum=-15.0, maximum=15.0, value=0, step=1, label=\"relative roll\")\nmov_x = 
gr.Slider(minimum=-0.19, maximum=0.19, value=0.0, step=0.01, label=\"x-axis movement\")\nmov_y = gr.Slider(minimum=-0.19, maximum=0.19, value=0.0, step=0.01, label=\"y-axis movement\")\nmov_z = gr.Slider(minimum=0.9, maximum=1.2, value=1.0, step=0.01, label=\"z-axis movement\")\nlip_variation_zero = gr.Slider(minimum=-0.09, maximum=0.09, value=0, step=0.01, label=\"pouting\")\nlip_variation_one = gr.Slider(minimum=-20.0, maximum=15.0, value=0, step=0.01, label=\"pursing 😐\")\nlip_variation_two = gr.Slider(minimum=0.0, maximum=15.0, value=0, step=0.01, label=\"grin 😁\")\nlip_variation_three = gr.Slider(minimum=-90.0, maximum=120.0, value=0, step=1.0, label=\"lip close <-> open\")\nsmile = gr.Slider(minimum=-0.3, maximum=1.3, value=0, step=0.01, label=\"smile 😄\")\nwink = gr.Slider(minimum=0, maximum=39, value=0, step=0.01, label=\"wink 😉\")\neyebrow = gr.Slider(minimum=-30, maximum=30, value=0, step=0.01, label=\"eyebrow 🤨\")\neyeball_direction_x = gr.Slider(minimum=-30.0, maximum=30.0, value=0, step=0.01, label=\"eye gaze (horizontal) 👀\")\neyeball_direction_y = gr.Slider(minimum=-63.0, maximum=63.0, value=0, step=0.01, label=\"eye gaze (vertical) 🙄\")\nretargeting_input_image = gr.Image(type=\"filepath\")\nretargeting_input_video = gr.Video()\noutput_image = gr.Image(type=\"numpy\")\noutput_image_paste_back = gr.Image(type=\"numpy\")\nretargeting_output_image = gr.Image(type=\"numpy\")\nretargeting_output_image_paste_back = gr.Image(type=\"numpy\")\noutput_video = gr.Video(autoplay=False)\noutput_video_paste_back = gr.Video(autoplay=False)\n\nwith gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont(\"Plus Jakarta Sans\")])) as demo:\n    gr.HTML(load_description(title_md))\n\n    gr.Markdown(load_description(\"assets/gradio/gradio_description_upload.md\"))\n    with gr.Row():\n        with gr.Column():\n            with gr.Tabs():\n                with gr.TabItem(\"🖼️ Source Image\") as tab_image:\n                    with gr.Accordion(open=True, 
label=\"Source Image\"):\n                        source_image_input = gr.Image(type=\"filepath\")\n                        gr.Examples(\n                            examples=[\n                                [osp.join(example_portrait_dir, \"s9.jpg\")],\n                                [osp.join(example_portrait_dir, \"s6.jpg\")],\n                                [osp.join(example_portrait_dir, \"s10.jpg\")],\n                                [osp.join(example_portrait_dir, \"s5.jpg\")],\n                                [osp.join(example_portrait_dir, \"s7.jpg\")],\n                                [osp.join(example_portrait_dir, \"s12.jpg\")],\n                                [osp.join(example_portrait_dir, \"s22.jpg\")],\n                                [osp.join(example_portrait_dir, \"s23.jpg\")],\n                            ],\n                            inputs=[source_image_input],\n                            cache_examples=False,\n                        )\n\n                with gr.TabItem(\"🎞️ Source Video\") as tab_video:\n                    with gr.Accordion(open=True, label=\"Source Video\"):\n                        source_video_input = gr.Video()\n                        gr.Examples(\n                            examples=[\n                                [osp.join(example_portrait_dir, \"s13.mp4\")],\n                                # [osp.join(example_portrait_dir, \"s14.mp4\")],\n                                # [osp.join(example_portrait_dir, \"s15.mp4\")],\n                                [osp.join(example_portrait_dir, \"s18.mp4\")],\n                                # [osp.join(example_portrait_dir, \"s19.mp4\")],\n                                [osp.join(example_portrait_dir, \"s20.mp4\")],\n                            ],\n                            inputs=[source_video_input],\n                            cache_examples=False,\n                        )\n\n                tab_selection = gr.Textbox(visible=False)\n                
tab_image.select(lambda: \"Image\", None, tab_selection)\n                tab_video.select(lambda: \"Video\", None, tab_selection)\n            with gr.Accordion(open=True, label=\"Cropping Options for Source Image or Video\"):\n                with gr.Row():\n                    flag_do_crop_input = gr.Checkbox(value=True, label=\"do crop (source)\")\n                    scale = gr.Number(value=2.3, label=\"source crop scale\", minimum=1.8, maximum=3.2, step=0.05)\n                    vx_ratio = gr.Number(value=0.0, label=\"source crop x\", minimum=-0.5, maximum=0.5, step=0.01)\n                    vy_ratio = gr.Number(value=-0.125, label=\"source crop y\", minimum=-0.5, maximum=0.5, step=0.01)\n\n        with gr.Column():\n            with gr.Tabs():\n                with gr.TabItem(\"🎞️ Driving Video\") as v_tab_video:\n                    with gr.Accordion(open=True, label=\"Driving Video\"):\n                        driving_video_input = gr.Video()\n                        gr.Examples(\n                            examples=[\n                                [osp.join(example_video_dir, \"d0.mp4\")],\n                                [osp.join(example_video_dir, \"d18.mp4\")],\n                                [osp.join(example_video_dir, \"d19.mp4\")],\n                                [osp.join(example_video_dir, \"d14.mp4\")],\n                                [osp.join(example_video_dir, \"d6.mp4\")],\n                                [osp.join(example_video_dir, \"d20.mp4\")],\n                            ],\n                            inputs=[driving_video_input],\n                            cache_examples=False,\n                        )\n                with gr.TabItem(\"🖼️ Driving Image\") as v_tab_image:\n                    with gr.Accordion(open=True, label=\"Driving Image\"):\n                        driving_image_input = gr.Image(type=\"filepath\")\n                        gr.Examples(\n                            examples=[\n                        
        [osp.join(example_video_dir, \"d30.jpg\")],\n                                [osp.join(example_video_dir, \"d9.jpg\")],\n                                [osp.join(example_video_dir, \"d19.jpg\")],\n                                [osp.join(example_video_dir, \"d8.jpg\")],\n                                [osp.join(example_video_dir, \"d12.jpg\")],\n                                [osp.join(example_video_dir, \"d38.jpg\")],\n                            ],\n                            inputs=[driving_image_input],\n                            cache_examples=False,\n                        )\n\n                with gr.TabItem(\"📁 Driving Pickle\") as v_tab_pickle:\n                    with gr.Accordion(open=True, label=\"Driving Pickle\"):\n                        driving_video_pickle_input = gr.File(type=\"filepath\", file_types=[\".pkl\"])\n                        gr.Examples(\n                            examples=[\n                                [osp.join(example_video_dir, \"d1.pkl\")],\n                                [osp.join(example_video_dir, \"d2.pkl\")],\n                                [osp.join(example_video_dir, \"d5.pkl\")],\n                                [osp.join(example_video_dir, \"d7.pkl\")],\n                                [osp.join(example_video_dir, \"d8.pkl\")],\n                            ],\n                            inputs=[driving_video_pickle_input],\n                            cache_examples=False,\n                        )\n\n                v_tab_selection = gr.Textbox(visible=False)\n                v_tab_video.select(lambda: \"Video\", None, v_tab_selection)\n                v_tab_image.select(lambda: \"Image\", None, v_tab_selection)\n                v_tab_pickle.select(lambda: \"Pickle\", None, v_tab_selection)\n            # with gr.Accordion(open=False, label=\"Animation Instructions\"):\n                # gr.Markdown(load_description(\"assets/gradio/gradio_description_animation.md\"))\n            with 
gr.Accordion(open=True, label=\"Cropping Options for Driving Video\"):\n                with gr.Row():\n                    flag_crop_driving_video_input = gr.Checkbox(value=False, label=\"do crop (driving)\")\n                    scale_crop_driving_video = gr.Number(value=2.2, label=\"driving crop scale\", minimum=1.8, maximum=3.2, step=0.05)\n                    vx_ratio_crop_driving_video = gr.Number(value=0.0, label=\"driving crop x\", minimum=-0.5, maximum=0.5, step=0.01)\n                    vy_ratio_crop_driving_video = gr.Number(value=-0.1, label=\"driving crop y\", minimum=-0.5, maximum=0.5, step=0.01)\n\n    with gr.Row():\n        with gr.Accordion(open=True, label=\"Animation Options\"):\n            with gr.Row():\n                flag_normalize_lip = gr.Checkbox(value=False, label=\"normalize lip\")\n                flag_relative_input = gr.Checkbox(value=True, label=\"relative motion\")\n                flag_remap_input = gr.Checkbox(value=True, label=\"paste-back\")\n                flag_stitching_input = gr.Checkbox(value=True, label=\"stitching\")\n                animation_region = gr.Radio([\"exp\", \"pose\", \"lip\", \"eyes\", \"all\"], value=\"all\", label=\"animation region\")\n                driving_option_input = gr.Radio(['expression-friendly', 'pose-friendly'], value=\"expression-friendly\", label=\"driving option (i2v)\")\n                driving_multiplier = gr.Number(value=1.0, label=\"driving multiplier (i2v)\", minimum=0.0, maximum=2.0, step=0.02)\n                driving_smooth_observation_variance = gr.Number(value=3e-7, label=\"motion smooth strength (v2v)\", minimum=1e-11, maximum=1e-2, step=1e-8)\n\n    gr.Markdown(load_description(\"assets/gradio/gradio_description_animate_clear.md\"))\n    with gr.Row():\n        process_button_animation = gr.Button(\"🚀 Animate\", variant=\"primary\")\n    with gr.Row():\n        with gr.Column():\n            output_video_i2v = gr.Video(autoplay=False, label=\"The animated video in the 
original image space\")\n        with gr.Column():\n            output_video_concat_i2v = gr.Video(autoplay=False, label=\"The animated video\")\n    with gr.Row():\n        with gr.Column():\n            output_image_i2i = gr.Image(type=\"numpy\", label=\"The animated image in the original image space\", visible=False)\n        with gr.Column():\n            output_image_concat_i2i = gr.Image(type=\"numpy\", label=\"The animated image\", visible=False)\n    with gr.Row():\n        process_button_reset = gr.ClearButton([source_image_input, source_video_input, driving_video_pickle_input, driving_video_input, driving_image_input, output_video_i2v, output_video_concat_i2v, output_image_i2i, output_image_concat_i2i], value=\"🧹 Clear\")\n\n    with gr.Row():\n        # Examples\n        gr.Markdown(\"## You could also choose the examples below by one click ⬇️\")\n    with gr.Row():\n        with gr.Tabs():\n            with gr.TabItem(\"🖼️ Portrait Animation\"):\n                gr.Examples(\n                    examples=data_examples_i2v,\n                    fn=gpu_wrapped_execute_video,\n                    inputs=[\n                        source_image_input,\n                        driving_video_input,\n                        flag_relative_input,\n                        flag_do_crop_input,\n                        flag_remap_input,\n                        flag_crop_driving_video_input,\n                    ],\n                    outputs=[output_image, output_image_paste_back],\n                    examples_per_page=len(data_examples_i2v),\n                    cache_examples=False,\n                )\n            with gr.TabItem(\"🎞️ Portrait Video Editing\"):\n                gr.Examples(\n                    examples=data_examples_v2v,\n                    fn=gpu_wrapped_execute_video,\n                    inputs=[\n                        source_video_input,\n                        driving_video_input,\n                        flag_relative_input,\n         
               flag_do_crop_input,\n                        flag_remap_input,\n                        flag_crop_driving_video_input,\n                        driving_smooth_observation_variance,\n                    ],\n                    outputs=[output_image, output_image_paste_back],\n                    examples_per_page=len(data_examples_v2v),\n                    cache_examples=False,\n                )\n\n    # Retargeting Image\n    gr.Markdown(load_description(\"assets/gradio/gradio_description_retargeting.md\"), visible=True)\n    with gr.Row(visible=True):\n        flag_do_crop_input_retargeting_image = gr.Checkbox(value=True, label=\"do crop (source)\")\n        flag_stitching_retargeting_input = gr.Checkbox(value=True, label=\"stitching\")\n        retargeting_source_scale.render()\n        eye_retargeting_slider.render()\n        lip_retargeting_slider.render()\n    with gr.Row(visible=True):\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Facial movement sliders\"):\n                with gr.Row(visible=True):\n                    head_pitch_slider.render()\n                    head_yaw_slider.render()\n                    head_roll_slider.render()\n                with gr.Row(visible=True):\n                    mov_x.render()\n                    mov_y.render()\n                    mov_z.render()\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Facial expression sliders\"):\n                with gr.Row(visible=True):\n                    lip_variation_zero.render()\n                    lip_variation_one.render()\n                    lip_variation_two.render()\n                with gr.Row(visible=True):\n                    lip_variation_three.render()\n                    smile.render()\n                    wink.render()\n                with gr.Row(visible=True):\n                    eyebrow.render()\n                    eyeball_direction_x.render()\n                    
eyeball_direction_y.render()\n    with gr.Row(visible=True):\n        reset_button = gr.Button(\"🔄 Reset\")\n        reset_button.click(\n            fn=reset_sliders,\n            inputs=None,\n            outputs=[\n                head_pitch_slider, head_yaw_slider, head_roll_slider, mov_x, mov_y, mov_z,\n                lip_variation_zero, lip_variation_one, lip_variation_two, lip_variation_three, smile, wink, eyebrow, eyeball_direction_x, eyeball_direction_y,\n                retargeting_source_scale, flag_stitching_retargeting_input, flag_do_crop_input_retargeting_image\n            ]\n        )\n    with gr.Row(visible=True):\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Retargeting Image Input\"):\n                retargeting_input_image.render()\n                gr.Examples(\n                    examples=[\n                        [osp.join(example_portrait_dir, \"s9.jpg\")],\n                        [osp.join(example_portrait_dir, \"s6.jpg\")],\n                        [osp.join(example_portrait_dir, \"s10.jpg\")],\n                        [osp.join(example_portrait_dir, \"s5.jpg\")],\n                        [osp.join(example_portrait_dir, \"s7.jpg\")],\n                        [osp.join(example_portrait_dir, \"s12.jpg\")],\n                        [osp.join(example_portrait_dir, \"s22.jpg\")],\n                        # [osp.join(example_portrait_dir, \"s23.jpg\")],\n                        [osp.join(example_portrait_dir, \"s42.jpg\")],\n                    ],\n                    inputs=[retargeting_input_image],\n                    cache_examples=False,\n                )\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Retargeting Result\"):\n                retargeting_output_image.render()\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Paste-back Result\"):\n                retargeting_output_image_paste_back.render()\n    with gr.Row(visible=True):\n        
process_button_reset_retargeting = gr.ClearButton(\n            [\n                retargeting_input_image,\n                retargeting_output_image,\n                retargeting_output_image_paste_back,\n            ],\n            value=\"🧹 Clear\"\n        )\n\n    # Retargeting Video\n    gr.Markdown(load_description(\"assets/gradio/gradio_description_retargeting_video.md\"), visible=True)\n    with gr.Row(visible=True):\n        flag_do_crop_input_retargeting_video = gr.Checkbox(value=True, label=\"do crop (source)\")\n        video_retargeting_source_scale.render()\n        video_lip_retargeting_slider.render()\n        driving_smooth_observation_variance_retargeting.render()\n        video_retargeting_silence.render()\n    with gr.Row(visible=True):\n        process_button_retargeting_video = gr.Button(\"🚗 Retargeting Video\", variant=\"primary\")\n    with gr.Row(visible=True):\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Retargeting Video Input\"):\n                retargeting_input_video.render()\n                gr.Examples(\n                    examples=[\n                        [osp.join(example_portrait_dir, \"s13.mp4\")],\n                        # [osp.join(example_portrait_dir, \"s18.mp4\")],\n                        # [osp.join(example_portrait_dir, \"s20.mp4\")],\n                        [osp.join(example_portrait_dir, \"s29.mp4\")],\n                        [osp.join(example_portrait_dir, \"s32.mp4\")],\n                        [osp.join(example_video_dir, \"d3.mp4\")],\n                    ],\n                    inputs=[retargeting_input_video],\n                    cache_examples=False,\n                )\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Retargeting Result\"):\n                output_video.render()\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"Paste-back Result\"):\n                output_video_paste_back.render()\n    with 
gr.Row(visible=True):\n        process_button_reset_retargeting = gr.ClearButton(\n            [\n                video_lip_retargeting_slider,\n                retargeting_input_video,\n                output_video,\n                output_video_paste_back\n            ],\n            value=\"🧹 Clear\"\n        )\n\n    # binding functions for buttons\n    process_button_animation.click(\n        fn=gpu_wrapped_execute_video,\n        inputs=[\n            source_image_input,\n            source_video_input,\n            driving_video_input,\n            driving_image_input,\n            driving_video_pickle_input,\n            flag_normalize_lip,\n            flag_relative_input,\n            flag_do_crop_input,\n            flag_remap_input,\n            flag_stitching_input,\n            animation_region,\n            driving_option_input,\n            driving_multiplier,\n            flag_crop_driving_video_input,\n            scale,\n            vx_ratio,\n            vy_ratio,\n            scale_crop_driving_video,\n            vx_ratio_crop_driving_video,\n            vy_ratio_crop_driving_video,\n            driving_smooth_observation_variance,\n            tab_selection,\n            v_tab_selection,\n        ],\n        outputs=[output_video_i2v, output_video_i2v, output_video_concat_i2v, output_video_concat_i2v, output_image_i2i, output_image_i2i, output_image_concat_i2i, output_image_concat_i2i],\n        show_progress=True\n    )\n\n\n    retargeting_input_image.change(\n        fn=gradio_pipeline.init_retargeting_image,\n        inputs=[retargeting_source_scale, eye_retargeting_slider, lip_retargeting_slider, retargeting_input_image],\n        outputs=[eye_retargeting_slider, lip_retargeting_slider]\n    )\n\n    sliders = [eye_retargeting_slider, lip_retargeting_slider, head_pitch_slider, head_yaw_slider, head_roll_slider, mov_x, mov_y, mov_z, lip_variation_zero, lip_variation_one, lip_variation_two, lip_variation_three, smile, wink, eyebrow, 
eyeball_direction_x, eyeball_direction_y]\n    for slider in sliders:\n        # NOTE: gradio >= 4.0.0 may cause slow response\n        slider.change(\n            fn=gpu_wrapped_execute_image_retargeting,\n            inputs=[\n                eye_retargeting_slider, lip_retargeting_slider, head_pitch_slider, head_yaw_slider, head_roll_slider, mov_x, mov_y, mov_z,\n                lip_variation_zero, lip_variation_one, lip_variation_two, lip_variation_three, smile, wink, eyebrow, eyeball_direction_x, eyeball_direction_y,\n                retargeting_input_image, retargeting_source_scale, flag_stitching_retargeting_input, flag_do_crop_input_retargeting_image\n            ],\n            outputs=[retargeting_output_image, retargeting_output_image_paste_back],\n        )\n\n    process_button_retargeting_video.click(\n        fn=gpu_wrapped_execute_video_retargeting,\n        inputs=[video_lip_retargeting_slider, retargeting_input_video, video_retargeting_source_scale, driving_smooth_observation_variance_retargeting, video_retargeting_silence, flag_do_crop_input_retargeting_video],\n        outputs=[output_video, output_video_paste_back],\n        show_progress=True\n    )\n\ndemo.launch(\n    server_port=args.server_port,\n    share=args.share,\n    server_name=args.server_name\n)\n"
  },
  {
    "path": "app_animals.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThe entrance of the gradio for animal\n\"\"\"\n\nimport os\nimport tyro\nimport subprocess\nimport gradio as gr\nimport os.path as osp\nfrom src.utils.helper import load_description\nfrom src.gradio_pipeline import GradioPipelineAnimal\nfrom src.config.crop_config import CropConfig\nfrom src.config.argument_config import ArgumentConfig\nfrom src.config.inference_config import InferenceConfig\n\n\ndef partial_fields(target_class, kwargs):\n    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})\n\n\ndef fast_check_ffmpeg():\n    try:\n        subprocess.run([\"ffmpeg\", \"-version\"], capture_output=True, check=True)\n        return True\n    except:\n        return False\n\n\n# set tyro theme\ntyro.extras.set_accent_color(\"bright_cyan\")\nargs = tyro.cli(ArgumentConfig)\n\nffmpeg_dir = os.path.join(os.getcwd(), \"ffmpeg\")\nif osp.exists(ffmpeg_dir):\n    os.environ[\"PATH\"] += (os.pathsep + ffmpeg_dir)\n\nif not fast_check_ffmpeg():\n    raise ImportError(\n        \"FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. 
https://ffmpeg.org/download.html\"\n    )\n# specify configs for inference\ninference_cfg = partial_fields(InferenceConfig, args.__dict__)  # use attribute of args to initial InferenceConfig\ncrop_cfg = partial_fields(CropConfig, args.__dict__)  # use attribute of args to initial CropConfig\n\ngradio_pipeline_animal: GradioPipelineAnimal = GradioPipelineAnimal(\n    inference_cfg=inference_cfg,\n    crop_cfg=crop_cfg,\n    args=args\n)\n\nif args.gradio_temp_dir not in (None, ''):\n    os.environ[\"GRADIO_TEMP_DIR\"] = args.gradio_temp_dir\n    os.makedirs(args.gradio_temp_dir, exist_ok=True)\n\ndef gpu_wrapped_execute_video(*args, **kwargs):\n    return gradio_pipeline_animal.execute_video(*args, **kwargs)\n\n\n# assets\ntitle_md = \"assets/gradio/gradio_title.md\"\nexample_portrait_dir = \"assets/examples/source\"\nexample_video_dir = \"assets/examples/driving\"\ndata_examples_i2v = [\n    [osp.join(example_portrait_dir, \"s41.jpg\"), osp.join(example_video_dir, \"d3.mp4\"), True, False, False, False],\n    [osp.join(example_portrait_dir, \"s40.jpg\"), osp.join(example_video_dir, \"d6.mp4\"), True, False, False, False],\n    [osp.join(example_portrait_dir, \"s25.jpg\"), osp.join(example_video_dir, \"d19.mp4\"), True, False, False, False],\n]\ndata_examples_i2v_pickle = [\n    [osp.join(example_portrait_dir, \"s25.jpg\"), osp.join(example_video_dir, \"wink.pkl\"), True, False, False, False],\n    [osp.join(example_portrait_dir, \"s40.jpg\"), osp.join(example_video_dir, \"talking.pkl\"), True, False, False, False],\n    [osp.join(example_portrait_dir, \"s41.jpg\"), osp.join(example_video_dir, \"aggrieved.pkl\"), True, False, False, False],\n]\n#################### interface logic ####################\n\n# Define components first\noutput_image = gr.Image(type=\"numpy\")\noutput_image_paste_back = gr.Image(type=\"numpy\")\noutput_video_i2v = gr.Video(autoplay=False)\noutput_video_concat_i2v = gr.Video(autoplay=False)\noutput_video_i2v_gif = 
gr.Image(type=\"numpy\")\n\n\nwith gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont(\"Plus Jakarta Sans\")])) as demo:\n    gr.HTML(load_description(title_md))\n\n    gr.Markdown(load_description(\"assets/gradio/gradio_description_upload_animal.md\"))\n    with gr.Row():\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"🐱 Source Animal Image\"):\n                source_image_input = gr.Image(type=\"filepath\")\n                gr.Examples(\n                    examples=[\n                        [osp.join(example_portrait_dir, \"s25.jpg\")],\n                        [osp.join(example_portrait_dir, \"s30.jpg\")],\n                        [osp.join(example_portrait_dir, \"s31.jpg\")],\n                        [osp.join(example_portrait_dir, \"s32.jpg\")],\n                        [osp.join(example_portrait_dir, \"s33.jpg\")],\n                        [osp.join(example_portrait_dir, \"s39.jpg\")],\n                        [osp.join(example_portrait_dir, \"s40.jpg\")],\n                        [osp.join(example_portrait_dir, \"s41.jpg\")],\n                        [osp.join(example_portrait_dir, \"s38.jpg\")],\n                        [osp.join(example_portrait_dir, \"s36.jpg\")],\n                    ],\n                    inputs=[source_image_input],\n                    cache_examples=False,\n                )\n\n            with gr.Accordion(open=True, label=\"Cropping Options for Source Image\"):\n                with gr.Row():\n                    flag_do_crop_input = gr.Checkbox(value=True, label=\"do crop (source)\")\n                    scale = gr.Number(value=2.3, label=\"source crop scale\", minimum=1.8, maximum=3.2, step=0.05)\n                    vx_ratio = gr.Number(value=0.0, label=\"source crop x\", minimum=-0.5, maximum=0.5, step=0.01)\n                    vy_ratio = gr.Number(value=-0.125, label=\"source crop y\", minimum=-0.5, maximum=0.5, step=0.01)\n\n        with gr.Column():\n            with gr.Tabs():\n      
          with gr.TabItem(\"📁 Driving Pickle\") as tab_pickle:\n                    with gr.Accordion(open=True, label=\"Driving Pickle\"):\n                        driving_video_pickle_input = gr.File()\n                        gr.Examples(\n                            examples=[\n                                [osp.join(example_video_dir, \"wink.pkl\")],\n                                [osp.join(example_video_dir, \"shy.pkl\")],\n                                [osp.join(example_video_dir, \"aggrieved.pkl\")],\n                                [osp.join(example_video_dir, \"open_lip.pkl\")],\n                                [osp.join(example_video_dir, \"laugh.pkl\")],\n                                [osp.join(example_video_dir, \"talking.pkl\")],\n                                [osp.join(example_video_dir, \"shake_face.pkl\")],\n                            ],\n                            inputs=[driving_video_pickle_input],\n                            cache_examples=False,\n                        )\n                with gr.TabItem(\"🎞️ Driving Video\") as tab_video:\n                    with gr.Accordion(open=True, label=\"Driving Video\"):\n                        driving_video_input = gr.Video()\n                        gr.Examples(\n                            examples=[\n                                # [osp.join(example_video_dir, \"d0.mp4\")],\n                                # [osp.join(example_video_dir, \"d18.mp4\")],\n                                [osp.join(example_video_dir, \"d19.mp4\")],\n                                [osp.join(example_video_dir, \"d14.mp4\")],\n                                [osp.join(example_video_dir, \"d6.mp4\")],\n                                [osp.join(example_video_dir, \"d3.mp4\")],\n                            ],\n                            inputs=[driving_video_input],\n                            cache_examples=False,\n                        )\n\n                    tab_selection = 
gr.Textbox(visible=False)\n                    tab_pickle.select(lambda: \"Pickle\", None, tab_selection)\n                    tab_video.select(lambda: \"Video\", None, tab_selection)\n            with gr.Accordion(open=True, label=\"Cropping Options for Driving Video\"):\n                with gr.Row():\n                    flag_crop_driving_video_input = gr.Checkbox(value=False, label=\"do crop (driving)\")\n                    scale_crop_driving_video = gr.Number(value=2.2, label=\"driving crop scale\", minimum=1.8, maximum=3.2, step=0.05)\n                    vx_ratio_crop_driving_video = gr.Number(value=0.0, label=\"driving crop x\", minimum=-0.5, maximum=0.5, step=0.01)\n                    vy_ratio_crop_driving_video = gr.Number(value=-0.1, label=\"driving crop y\", minimum=-0.5, maximum=0.5, step=0.01)\n\n    with gr.Row():\n        with gr.Accordion(open=False, label=\"Animation Options\"):\n            with gr.Row():\n                flag_stitching = gr.Checkbox(value=False, label=\"stitching (not recommended)\")\n                flag_remap_input = gr.Checkbox(value=False, label=\"paste-back (not recommended)\")\n                driving_multiplier = gr.Number(value=1.0, label=\"driving multiplier\", minimum=0.0, maximum=2.0, step=0.02)\n\n    gr.Markdown(load_description(\"assets/gradio/gradio_description_animate_clear.md\"))\n    with gr.Row():\n        process_button_animation = gr.Button(\"🚀 Animate\", variant=\"primary\")\n    with gr.Row():\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"The animated video in the cropped image space\"):\n                output_video_i2v.render()\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"The animated gif in the cropped image space\"):\n                output_video_i2v_gif.render()\n        with gr.Column():\n            with gr.Accordion(open=True, label=\"The animated video\"):\n                output_video_concat_i2v.render()\n    with gr.Row():\n        
process_button_reset = gr.ClearButton([source_image_input, driving_video_input, output_video_i2v, output_video_concat_i2v, output_video_i2v_gif], value=\"🧹 Clear\")\n\n    with gr.Row():\n        # Examples\n        gr.Markdown(\"## You could also choose the examples below by one click ⬇️\")\n    with gr.Row():\n        with gr.Tabs():\n            with gr.TabItem(\"📁 Driving Pickle\") as tab_video:\n                gr.Examples(\n                    examples=data_examples_i2v_pickle,\n                    fn=gpu_wrapped_execute_video,\n                    inputs=[\n                        source_image_input,\n                        driving_video_pickle_input,\n                        flag_do_crop_input,\n                        flag_stitching,\n                        flag_remap_input,\n                        flag_crop_driving_video_input,\n                    ],\n                    outputs=[output_image, output_image_paste_back, output_video_i2v_gif],\n                    examples_per_page=len(data_examples_i2v_pickle),\n                    cache_examples=False,\n                )\n            with gr.TabItem(\"🎞️ Driving Video\") as tab_video:\n                gr.Examples(\n                    examples=data_examples_i2v,\n                    fn=gpu_wrapped_execute_video,\n                    inputs=[\n                        source_image_input,\n                        driving_video_input,\n                        flag_do_crop_input,\n                        flag_stitching,\n                        flag_remap_input,\n                        flag_crop_driving_video_input,\n                    ],\n                    outputs=[output_image, output_image_paste_back, output_video_i2v_gif],\n                    examples_per_page=len(data_examples_i2v),\n                    cache_examples=False,\n                )\n\n    process_button_animation.click(\n        fn=gpu_wrapped_execute_video,\n        inputs=[\n            source_image_input,\n            
driving_video_input,\n            driving_video_pickle_input,\n            flag_do_crop_input,\n            flag_remap_input,\n            driving_multiplier,\n            flag_stitching,\n            flag_crop_driving_video_input,\n            scale,\n            vx_ratio,\n            vy_ratio,\n            scale_crop_driving_video,\n            vx_ratio_crop_driving_video,\n            vy_ratio_crop_driving_video,\n            tab_selection,\n        ],\n        outputs=[output_video_i2v, output_video_concat_i2v, output_video_i2v_gif],\n        show_progress=True\n    )\n\ndemo.launch(\n    server_port=args.server_port,\n    share=args.share,\n    server_name=args.server_name\n)\n"
  },
  {
    "path": "assets/.gitignore",
    "content": "examples/driving/*.pkl\nexamples/driving/*_crop.mp4\n"
  },
  {
    "path": "assets/docs/changelog/2024-07-10.md",
    "content": "## 2024/07/10\n\n**First, thank you all for your attention, support, sharing, and contributions to LivePortrait!** ❤️\nThe popularity of LivePortrait has exceeded our expectations. If you encounter any issues or other problems and we do not respond promptly, please accept our apologies. We are still actively updating and improving this repository.\n\n### Updates\n\n- <strong>Audio and video concatenating: </strong> If the driving video contains audio, it will automatically be included in the generated video. Additionally, the generated video will maintain the same FPS as the driving video. If you run LivePortrait on Windows, you need to install `ffprobe` and `ffmpeg` exe, see issue [#94](https://github.com/KlingTeam/LivePortrait/issues/94).\n\n- <strong>Driving video auto-cropping: </strong> Implemented automatic cropping for driving videos by tracking facial landmarks and calculating a global cropping box with a 1:1 aspect ratio. Alternatively, you can crop using video editing software or other tools to achieve a 1:1 ratio. Auto-cropping is not enbaled by default, you can specify it by `--flag_crop_driving_video`.\n\n- <strong>Motion template making: </strong> Added the ability to create motion templates to protect privacy. The motion template is a `.pkl` file that only contains the motions of the driving video. Theoretically, it is impossible to reconstruct the original face from the template. These motion templates can be used to generate videos without needing the original driving video. By default, the motion template will be generated and saved as a `.pkl` file with the same name as the driving video, e.g., `d0.mp4` -> `d0.pkl`. 
Once generated, you can specify it using the `-d` or `--driving` option.\n\n\n### About driving video\n\n- For a guide on using your own driving video, see the [driving video auto-cropping](https://github.com/KlingTeam/LivePortrait/tree/main?tab=readme-ov-file#driving-video-auto-cropping) section.\n\n\n### Others\n\n- If you encounter a black box problem, disable half-precision inference by using `--no_flag_use_half_precision`, reported by issue [#40](https://github.com/KlingTeam/LivePortrait/issues/40), [#48](https://github.com/KlingTeam/LivePortrait/issues/48), [#62](https://github.com/KlingTeam/LivePortrait/issues/62).\n"
  },
  {
    "path": "assets/docs/changelog/2024-07-19.md",
    "content": "## 2024/07/19\n\n**Once again, we would like to express our heartfelt gratitude for your love, attention, and support for LivePortrait! 🎉**\nWe are excited to announce the release of an implementation of Portrait Video Editing (aka v2v) today! Special thanks to the hard work of the LivePortrait team: [Dingyun Zhang](https://github.com/Mystery099), [Zhizhou Zhong](https://github.com/zzzweakman), and [Jianzhu Guo](https://github.com/cleardusk).\n\n### Updates\n\n- <strong>Portrait video editing (v2v):</strong> Implemented a version of Portrait Video Editing (aka v2v). Ensure you have `pykalman` package installed, which has been added in [`requirements_base.txt`](../../../requirements_base.txt). You can specify the source video using the `-s` or `--source` option,  adjust the temporal smoothness of motion with `--driving_smooth_observation_variance`, enable head pose motion transfer with `--flag_video_editing_head_rotation`, and ensure the eye-open scalar of each source frame matches the first source frame before animation with `--flag_source_video_eye_retargeting`.\n\n- <strong>More options in Gradio:</strong> We have upgraded the Gradio interface and added more options. 
These include `Cropping Options for Source Image or Video` and `Cropping Options for Driving Video`, providing greater flexibility and control.\n\n<p align=\"center\">\n  <img src=\"../LivePortrait-Gradio-2024-07-19.jpg\" alt=\"LivePortrait\" width=\"800px\">\n  <br>\n  The Gradio Interface for LivePortrait\n</p>\n\n\n### Community Contributions\n\n- **ONNX/TensorRT Versions of LivePortrait:** Explore optimized versions of LivePortrait for faster performance:\n  - [FasterLivePortrait](https://github.com/warmshao/FasterLivePortrait) by [warmshao](https://github.com/warmshao) ([#150](https://github.com/KlingTeam/LivePortrait/issues/150))\n  - [Efficient-Live-Portrait](https://github.com/aihacker111/Efficient-Live-Portrait) by [aihacker111](https://github.com/aihacker111/Efficient-Live-Portrait) ([#126](https://github.com/KlingTeam/LivePortrait/issues/126), [#142](https://github.com/KlingTeam/LivePortrait/issues/142))\n- **LivePortrait with [X-Pose](https://github.com/IDEA-Research/X-Pose) Detection:** Check out [LivePortrait](https://github.com/ShiJiaying/LivePortrait) by [ShiJiaying](https://github.com/ShiJiaying) for enhanced detection capabilities using X-pose, see [#119](https://github.com/KlingTeam/LivePortrait/issues/119).\n"
  },
  {
    "path": "assets/docs/changelog/2024-07-24.md",
    "content": "## 2024/07/24\n\n### Updates\n\n- **Portrait pose editing:** You can change the `relative pitch`, `relative yaw`, and `relative roll` in the Gradio interface to adjust the pose of the source portrait.\n- **Detection threshold:** We have added a `--det_thresh` argument with a default value of 0.15 to increase recall, meaning more types of faces (e.g., monkeys, human-like) will be detected. You can set it to other values, e.g., 0.5, by using `python app.py --det_thresh 0.5`.\n\n<p align=\"center\">\n  <img src=\"../pose-edit-2024-07-24.jpg\" alt=\"LivePortrait\" width=\"960px\">\n  <br>\n  Pose Editing in the Gradio Interface\n</p>\n"
  },
  {
    "path": "assets/docs/changelog/2024-08-02.md",
    "content": "## 2024/08/02\n\n<table class=\"center\" style=\"width: 80%; margin-left: auto; margin-right: auto;\">\n<tr>\n    <td style=\"text-align: center\"><b>Animals Singing Dance Monkey 🎤</b></td>\n</tr>\n\n<tr>\n    <td style=\"border: none; text-align: center;\">\n        <video controls loop src=\"https://github.com/user-attachments/assets/38d5b6e5-d29b-458d-9f2c-4dd52546cb41\" muted=\"false\" style=\"width: 60%;\"></video>\n    </td>\n</tr>\n</table>\n\n\n🎉 We are excited to announce the release of a new version featuring animals mode, along with several other updates. Special thanks to the dedicated efforts of the LivePortrait team. 💪 We also provided an one-click installer for Windows users, checkout the details [here](./2024-08-05.md).\n\n### Updates on Animals mode\nWe are pleased to announce the release of the animals mode, which is fine-tuned on approximately 230K frames of various animals (mostly cats and dogs). The trained weights have been updated in the `liveportrait_animals` subdirectory, available on [HuggingFace](https://huggingface.co/KlingTeam/LivePortrait/tree/main/) or [Google Drive](https://drive.google.com/drive/u/0/folders/1UtKgzKjFAOmZkhNK-OYT0caJ_w2XAnib). You should [download the weights](https://github.com/KlingTeam/LivePortrait?tab=readme-ov-file#2-download-pretrained-weights) before running. There are two ways to run this mode.\n\n> Please note that we have not trained the stitching and retargeting modules for the animals model due to several technical issues. _This may be addressed in future updates._ Therefore, we recommend **disabling stitching by setting the `--no_flag_stitching`** option when running the model. Additionally, `paste-back` is also not recommended.\n\n#### Install X-Pose\nWe have chosen [X-Pose](https://github.com/IDEA-Research/X-Pose) as the keypoints detector for animals. 
This relies on `transformers==4.22.0` and `pillow>=10.2.0` (which are already updated in `requirements.txt`) and requires building an OP named `MultiScaleDeformableAttention`.\n\nRefer to the [PyTorch installation](https://github.com/KlingTeam/LivePortrait?tab=readme-ov-file#for-linux-or-windows-users) for Linux and Windows users.\n\n\nNext, build the OP `MultiScaleDeformableAttention` by running:\n```bash\ncd src/utils/dependencies/XPose/models/UniPose/ops\npython setup.py build install\ncd - # this returns to the previous directory\n```\n\nTo run the model, use the `inference_animals.py` script:\n```bash\npython inference_animals.py -s assets/examples/source/s39.jpg -d assets/examples/driving/wink.pkl --no_flag_stitching --driving_multiplier 1.75\n```\n\nAlternatively, you can use Gradio for a more user-friendly interface. Launch it with:\n```bash\npython app_animals.py # --server_port 8889 --server_name \"0.0.0.0\" --share\n```\n\n> [!WARNING]\n> [X-Pose](https://github.com/IDEA-Research/X-Pose) is only for Non-commercial Scientific Research Purposes, you should remove and replace it with other detectors if you use it for commercial purposes.\n\n### Updates on Humans mode\n\n- **Driving Options**: We have introduced an `expression-friendly` driving option to **reduce head wobbling**, now set as the default. While it may be less effective with large head poses, you can also select the `pose-friendly` option, which is the same as the previous version. This can be set using `--driving_option` or selected in the Gradio interface. Additionally, we added a `--driving_multiplier` option to adjust driving intensity, with a default value of 1, which can also be set in the Gradio interface.\n\n- **Retargeting Video in Gradio**: We have implemented a video retargeting feature. You can specify a `target lip-open ratio` to adjust the mouth movement in the source video. 
For instance, setting it to 0 will close the mouth in the source video 🤐.\n\n### Others\n\n- [**Poe supports LivePortrait**](https://poe.com/LivePortrait). Check out the news on [X](https://x.com/poe_platform/status/1816136105781256260).\n- [ComfyUI-LivePortraitKJ](https://github.com/kijai/ComfyUI-LivePortraitKJ) (1.1K 🌟) now includes MediaPipe as an alternative to InsightFace, ensuring the license remains under MIT and Apache 2.0.\n- [ComfyUI-AdvancedLivePortrait](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait) features real-time portrait pose/expression editing and animation, and is registered with ComfyUI-Manager.\n\n\n\n**Below are some screenshots of the new features and improvements:**\n\n| ![The Gradio Interface of Animals Mode](../animals-mode-gradio-2024-08-02.jpg) |\n|:---:|\n| **The Gradio Interface of Animals Mode** |\n\n| ![Driving Options and Multiplier](../driving-option-multiplier-2024-08-02.jpg) |\n|:---:|\n| **Driving Options and Multiplier** |\n\n| ![The Feature of Retargeting Video](../retargeting-video-2024-08-02.jpg) |\n|:---:|\n| **The Feature of Retargeting Video** |\n"
  },
  {
    "path": "assets/docs/changelog/2024-08-05.md",
    "content": "## One-click Windows Installer\n\n### Download the installer from HuggingFace\n```bash\n# !pip install -U \"huggingface_hub[cli]\"\nhuggingface-cli download cleardusk/LivePortrait-Windows LivePortrait-Windows-v20240806.zip --local-dir ./\n```\n\nIf you cannot access to Huggingface, you can use [hf-mirror](https://hf-mirror.com/) to download:\n```bash\n# !pip install -U \"huggingface_hub[cli]\"\nexport HF_ENDPOINT=https://hf-mirror.com\nhuggingface-cli download cleardusk/LivePortrait-Windows LivePortrait-Windows-v20240806.zip --local-dir ./\n```\n\nAlternatively, you can manually download it from the [HuggingFace](https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240806.zip) page.\n\nThen, simply unzip the package `LivePortrait-Windows-v20240806.zip` and double-click `run_windows_human.bat` for the Humans mode, or `run_windows_animal.bat` for the **Animals mode**.\n"
  },
  {
    "path": "assets/docs/changelog/2024-08-06.md",
    "content": "## Precise Portrait Editing\n\nInspired by [ComfyUI-AdvancedLivePortrait](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait) ([@PowerHouseMan](https://github.com/PowerHouseMan)), we have implemented a version of Precise Portrait Editing in the Gradio interface. With each adjustment of the slider, the edited image updates in real-time. You can click the `🔄 Reset` button to reset all slider parameters. However, the performance may not be as fast as the ComfyUI plugin.\n\n<p align=\"center\">\n  <img src=\"../editing-portrait-2024-08-06.jpg\" alt=\"LivePortrait\" width=\"960px\">\n  <br>\n  Preciese Portrait Editing in the Gradio Interface\n</p>\n"
  },
  {
    "path": "assets/docs/changelog/2024-08-19.md",
    "content": "## Image Driven and Regional Control\n\n<p align=\"center\">\n  <img src=\"../image-driven-image-2024-08-19.jpg\" alt=\"LivePortrait\" width=\"512px\">\n  <br>\n  <strong>Image Drives an Image</strong>\n</p>\n\nYou can now **use an image as a driving signal** to drive the source image or video! Additionally, we **have refined the driving options to support expressions, pose, lips, eyes, or all** (all is consistent with the previous default method), which we name it regional control. The control is becoming more and more precise! 🎯\n\n> Please note that image-based driving or regional control may not perform well in certain cases. Feel free to try different options, and be patient. 😊\n\n> [!Note]\n> We recognize that the project now offers more options, which have become increasingly complex, but due to our limited team capacity and resources, we haven’t fully documented them yet. We ask for your understanding and will work to improve the documentation over time. Contributions via PRs are welcome! If anyone is considering donating or sponsoring, feel free to leave a message in the GitHub Issues or Discussions. We will set up a payment account to reward the team members or support additional efforts in maintaining the project. 💖\n\n\n### CLI Usage\nIt's very simple to use an image as a driving reference. Just set the `-d` argument to the driving image:\n\n```bash\npython inference.py -s assets/examples/source/s5.jpg -d assets/examples/driving/d30.jpg\n```\n\nTo change the `animation_region` option, you can use the `--animation_region` argument to `exp`, `pose`, `lip`, `eyes`, or `all`. 
For example, to only drive the lip region, you can run by:\n\n```bash\n# only driving the lip region\npython inference.py -s assets/examples/source/s5.jpg -d assets/examples/driving/d0.mp4 --animation_region lip\n```\n\n### Gradio Interface\n\n<p align=\"center\">\n  <img src=\"../image-driven-portrait-animation-2024-08-19.jpg\" alt=\"LivePortrait\" width=\"960px\">\n  <br>\n  <strong>Image-driven Portrait Animation and Regional Control</strong>\n</p>\n\n### More Detailed Explanation\n\n**flag_relative_motion**:\nWhen using an image as the driving input, setting `--flag_relative_motion` to true will apply the motion deformation between the driving image and its canonical form. If set to false, the absolute motion of the driving image is used, which may amplify expression driving strength but could also cause identity leakage. This option corresponds to the `relative motion` toggle in the Gradio interface. Additionally, if both source and driving inputs are images, the output will be an image. If the source is a video and the driving input is an image, the output will be a video, with each frame driven by the image's motion. The Gradio interface automatically saves and displays the output in the appropriate format.\n\n**animation_region**:\nThis argument offers five options:\n\n- `exp`: Only the expression of the driving input influences the source.\n- `pose`: Only the head pose drives the source.\n- `lip`: Only lip movement drives the source.\n- `eyes`: Only eye movement drives the source.\n- `all`: All motions from the driving input are applied.\n\nYou can also select these options directly in the Gradio interface.\n\n**Editing the Lip Region of the Source Video to a Neutral Expression**:\nIn response to requests for a more neutral lip region in the `Retargeting Video` of the Gradio interface, we've added a `keeping the lip silent` option. When selected, the animated video's lip region will adopt a neutral expression. 
However, this may cause inter-frame jitter or identity leakage, as it uses a mode similar to absolute driving. Note that the neutral expression may sometimes feature a slightly open mouth.\n\n**Others**:\nWhen both source and driving inputs are videos, the output motion may be a blend of both, due to the default setting of `--flag_relative_motion`. This option uses relative driving, where the motion offset of the current driving frame relative to the first driving frame is added to the source frame's motion. In contrast, `--no_flag_relative_motion` applies the driving frame's motion directly as the final driving motion.\n\nFor CLI usage, to retain only the driving video's motion in the output, use:\n```bash\npython inference.py --no_flag_relative_motion\n```\nIn the Gradio interface, simply uncheck the relative motion option. Note that absolute driving may cause jitter or identity leakage in the animated video.\n"
  },
  {
    "path": "assets/docs/changelog/2025-01-01.md",
    "content": "## 2025/01/01\n\n**We’re thrilled that cats 🐱 are now speaking and singing across the internet!**  🎶\n\nIn this update, we’ve improved the [Animals model](https://huggingface.co/KlingTeam/LivePortrait/tree/main/liveportrait_animals/base_models_v1.1) with more data. While you might notice only a slight improvement for cats (if at all 😼), dogs have gotten a slightly better upgrade. For example, the model is now better at recognizing their mouths instead of mistaking them for noses. 🐶\n\n<table class=\"center\" style=\"width: 80%; margin-left: auto; margin-right: auto;\">\n<tr>\n    <td style=\"text-align: center\"><b>Before vs. After (v1.1)</b></td>\n</tr>\n\n<tr>\n    <td style=\"border: none; text-align: center;\">\n        <video controls loop src=\"https://github.com/user-attachments/assets/59fc09b9-6cb7-4265-833f-eebb27ed9511\" muted=\"false\" style=\"width: 60%;\"></video>\n    </td>\n</tr>\n</table>\n\n\nThe new version (v1.1) Animals Model has been updated on [HuggingFace](https://huggingface.co/KlingTeam/LivePortrait/tree/main/liveportrait_animals/base_models_v1.1). The new version is enabled by default.\n\n> [!IMPORTANT]\n> Note: Make sure to update your weights to use the new version.\n\nIf you prefer to use the original version, simply modify the configuration in [inference_config.py](../../../src/config/inference_config.py#L29)\n```python\nversion_animals = \"\" # old version\n# version_animals = \"_v1.1\" # new (v1.1) version\n```\n"
  },
  {
    "path": "assets/docs/directory-structure.md",
    "content": "## The directory structure of `pretrained_weights`\n\n```text\npretrained_weights\n├── insightface\n│   └── models\n│       └── buffalo_l\n│           ├── 2d106det.onnx\n│           └── det_10g.onnx\n├── liveportrait\n│   ├── base_models\n│   │   ├── appearance_feature_extractor.pth\n│   │   ├── motion_extractor.pth\n│   │   ├── spade_generator.pth\n│   │   └── warping_module.pth\n│   ├── landmark.onnx\n│   └── retargeting_models\n│       └── stitching_retargeting_module.pth\n└── liveportrait_animals\n    ├── base_models\n    │   ├── appearance_feature_extractor.pth\n    │   ├── motion_extractor.pth\n    │   ├── spade_generator.pth\n    │   └── warping_module.pth\n    ├── retargeting_models\n    │   └── stitching_retargeting_module.pth\n    └── xpose.pth\n```\n"
  },
  {
    "path": "assets/docs/how-to-install-ffmpeg.md",
    "content": "## Install FFmpeg\n\nMake sure you have `ffmpeg` and `ffprobe` installed on your system. If you don't have them installed, follow the instructions below.\n\n> [!Note]\n> The installation is copied from [SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) 🤗\n\n### Conda Users\n\n```bash\nconda install ffmpeg\n```\n\n### Ubuntu/Debian Users\n\n```bash\nsudo apt install ffmpeg\nsudo apt install libsox-dev\nconda install -c conda-forge 'ffmpeg<7'\n```\n\n### Windows Users\n\nDownload and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root.\n\n### MacOS Users\n```bash\nbrew install ffmpeg\n```\n"
  },
  {
    "path": "assets/docs/speed.md",
    "content": "### Speed\n\nBelow are the results of inferring one frame on an RTX 4090 GPU using the native PyTorch framework with `torch.compile`:\n\n| Model                             | Parameters(M) | Model Size(MB) | Inference(ms) |\n|-----------------------------------|:-------------:|:--------------:|:-------------:|\n| Appearance Feature Extractor      |     0.84      |       3.3      |     0.82      |\n| Motion Extractor                  |     28.12     |       108      |     0.84      |\n| Spade Generator                   |     55.37     |       212      |     7.59      |\n| Warping Module                    |     45.53     |       174      |     5.21      |\n| Stitching and Retargeting Modules |     0.23      |       2.3      |     0.31      |\n\n*Note: The values for the Stitching and Retargeting Modules represent the combined parameter counts and total inference time of three sequential MLP networks.*\n"
  },
  {
    "path": "assets/gradio/gradio_description_animate_clear.md",
    "content": "<div style=\"font-size: 1.2em; text-align: center;\">\n    Step 3: Click the <strong>🚀 Animate</strong> button below to generate, or click <strong>🧹 Clear</strong> to erase the results\n</div>\n<!-- <div style=\"font-size: 1.1em; text-align: center;\">\n    <strong style=\"color: red;\">Note:</strong>  If both <strong>Source Image</strong> and <strong>Video</strong> are uploaded, the <strong>Source Image</strong> will be used. Please click the <strong>🧹 Clear</strong> button, then re-upload the <strong>Source Image</strong> or <strong>Video</strong>.\n</div> -->\n"
  },
  {
    "path": "assets/gradio/gradio_description_animation.md",
    "content": "<span style=\"font-size: 1.2em;\">🔥 To animate the source image or video with the driving video, please follow these steps:</span>\n<div style=\"font-size: 1.2em; margin-left: 20px;\">\n1. In the <strong>Animation Options for Source Image or Video</strong> section, we recommend enabling the <code>do crop (source)</code> option if faces occupy a small portion of your source image or video.\n</div>\n<div style=\"font-size: 1.2em; margin-left: 20px;\">\n2. In the <strong>Animation Options for Driving Video</strong> section, the <code>relative head rotation</code> and <code>smooth strength</code> options only take effect if the source input is a video.\n</div>\n<div style=\"font-size: 1.2em; margin-left: 20px;\">\n3. Press the <strong>🚀 Animate</strong> button and wait for a moment. Your animated video will appear in the result block. This may take a few moments. If the input is a source video, the length of the animated video is the minimum of the length of the source video and the driving video.\n</div>\n<div style=\"font-size: 1.2em; margin-left: 20px;\">\n4. If you want to upload your own driving video, <strong>the best practice</strong>:\n\n - Crop it to a 1:1 aspect ratio (e.g., 512x512 or 256x256 pixels), or enable auto-driving by checking `do crop (driving video)`.\n - Focus on the head area, similar to the example videos.\n - Minimize shoulder movement.\n - Make sure the first frame of driving video is a frontal face with **neutral expression**.\n\n</div>\n"
  },
  {
    "path": "assets/gradio/gradio_description_retargeting.md",
    "content": "<br>\n\n<!-- ## Retargeting -->\n<!-- <span style=\"font-size: 1.2em;\">🔥 To edit the eyes and lip open ratio of the source portrait, drag the sliders and click the <strong>🚗 Retargeting</strong> button. You can try running it multiple times. <strong>😊 Set both ratios to 0.8 to see what's going on!</strong> </span> -->\n\n\n<div style=\"display: flex; justify-content: center; align-items: center; text-align: center; font-size: 1.2em;\">\n  <div>\n    <h2>Retargeting and Editing Portraits</h2>\n    <p>Upload a source portrait, and the <code>eyes-open ratio</code> and <code>lip-open ratio</code> will be auto-calculated. Adjust the sliders to see instant edits. Feel free to experiment! 🎨\n    <br>\n    <strong>😊 Set both target eyes-open and lip-open ratios to 0.8 to see what's going on!</strong></p>\n  </div>\n</div>\n"
  },
  {
    "path": "assets/gradio/gradio_description_retargeting_video.md",
    "content": "<br>\n<div style=\"display: flex; justify-content: center; align-items: center; text-align: center; font-size: 1.2em;\">\n  <div>\n    <h2>Retargeting Video</h2>\n    <p>Upload a Source Video as Retargeting Input, then drag the sliders and click the <strong>🚗 Retargeting Video</strong> button. You can try running it multiple times.\n    <br>\n    <strong>🤐 Set target lip-open ratio to 0 to see what's going on!</strong></p>\n  </div>\n</div>\n"
  },
  {
    "path": "assets/gradio/gradio_description_upload.md",
    "content": "<br>\n<div style=\"font-size: 1.2em; display: flex; justify-content: space-between;\">\n    <div style=\"flex: 1; text-align: center; margin-right: 20px;\">\n        <div style=\"display: inline-block;\">\n            Step 1: Upload a <strong>Source Image</strong> or <strong>Video</strong> (any aspect ratio) ⬇️\n        </div>\n        <div style=\"display: inline-block; font-size: 0.8em;\">\n            <strong>Note:</strong> Better if Source Video has <strong>the same FPS</strong> as the Driving Video.\n        </div>\n    </div>\n    <div style=\"flex: 1; text-align: center; margin-left: 20px;\">\n        <div style=\"display: inline-block;\">\n            Step 2: Upload a <strong>Driving Video</strong> (any aspect ratio) ⬇️\n        </div>\n        <div style=\"display: inline-block; font-size: 0.8em;\">\n            <strong>Tips:</strong> Focus on the head, minimize shoulder movement, <strong>neutral expression</strong> in first frame.\n        </div>\n    </div>\n</div>\n"
  },
  {
    "path": "assets/gradio/gradio_description_upload_animal.md",
    "content": "<br>\n<div style=\"font-size: 1.2em; display: flex; justify-content: space-between;\">\n    <div style=\"flex: 1; text-align: center; margin-right: 20px;\">\n        <div style=\"display: inline-block;\">\n            Step 1: Upload a <strong>Source Animal Image</strong> (any aspect ratio) ⬇️\n        </div>\n    </div>\n    <div style=\"flex: 1; text-align: center; margin-left: 20px;\">\n        <div style=\"display: inline-block;\">\n            Step 2: Upload a <strong>Driving Pickle</strong> or <strong>Driving Video</strong> (any aspect ratio) ⬇️\n        </div>\n        <div style=\"display: inline-block; font-size: 0.8em;\">\n            <strong>Tips:</strong> Focus on the head, minimize shoulder movement, <strong>neutral expression</strong> in first frame.\n        </div>\n    </div>\n</div>\n"
  },
  {
    "path": "assets/gradio/gradio_title.md",
    "content": "<div style=\"display: flex; justify-content: center; align-items: center; text-align: center;\">\n  <div>\n    <h1>LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>\n    <!-- <span>Add mimics and lip sync to your static portrait driven by a video</span> -->\n    <!-- <span>Efficient Portrait Animation with Stitching and Retargeting Control</span> -->\n    <!-- <br> -->\n    <div style=\"display: flex; justify-content: center; align-items: center; text-align: center;\">\n      <a href=\"https://arxiv.org/pdf/2407.03168\"><img src=\"https://img.shields.io/badge/arXiv-2407.03168-red\"></a>\n      &nbsp;\n      <a href=\"https://liveportrait.github.io\"><img src=\"https://img.shields.io/badge/Project_Page-LivePortrait-green\" alt=\"Project Page\"></a>\n      &nbsp;\n      <a href='https://huggingface.co/spaces/KlingTeam/LivePortrait'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>\n      &nbsp;\n      <a href=\"https://github.com/KlingTeam/LivePortrait\"><img src=\"https://img.shields.io/badge/Github-Code-blue\"></a>\n      &nbsp;\n      <a href=\"https://github.com/KlingTeam/LivePortrait\"><img src=\"https://img.shields.io/github/stars/KlingTeam/LivePortrait\"></a>\n    </div>\n  </div>\n</div>\n"
  },
  {
    "path": "inference.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThe entrance of humans\n\"\"\"\n\nimport os\nimport os.path as osp\nimport tyro\nimport subprocess\nfrom src.config.argument_config import ArgumentConfig\nfrom src.config.inference_config import InferenceConfig\nfrom src.config.crop_config import CropConfig\nfrom src.live_portrait_pipeline import LivePortraitPipeline\n\n\ndef partial_fields(target_class, kwargs):\n    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})\n\n\ndef fast_check_ffmpeg():\n    try:\n        subprocess.run([\"ffmpeg\", \"-version\"], capture_output=True, check=True)\n        return True\n    except:\n        return False\n\n\ndef fast_check_args(args: ArgumentConfig):\n    if not osp.exists(args.source):\n        raise FileNotFoundError(f\"source info not found: {args.source}\")\n    if not osp.exists(args.driving):\n        raise FileNotFoundError(f\"driving info not found: {args.driving}\")\n\n\ndef main():\n    # set tyro theme\n    tyro.extras.set_accent_color(\"bright_cyan\")\n    args = tyro.cli(ArgumentConfig)\n\n    ffmpeg_dir = os.path.join(os.getcwd(), \"ffmpeg\")\n    if osp.exists(ffmpeg_dir):\n        os.environ[\"PATH\"] += (os.pathsep + ffmpeg_dir)\n\n    if not fast_check_ffmpeg():\n        raise ImportError(\n            \"FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. https://ffmpeg.org/download.html\"\n        )\n\n    fast_check_args(args)\n\n    # specify configs for inference\n    inference_cfg = partial_fields(InferenceConfig, args.__dict__)\n    crop_cfg = partial_fields(CropConfig, args.__dict__)\n\n    live_portrait_pipeline = LivePortraitPipeline(\n        inference_cfg=inference_cfg,\n        crop_cfg=crop_cfg\n    )\n\n    # run\n    live_portrait_pipeline.execute(args)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "inference_animals.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThe entrance of animal\n\"\"\"\n\nimport os\nimport os.path as osp\nimport tyro\nimport subprocess\nfrom src.config.argument_config import ArgumentConfig\nfrom src.config.inference_config import InferenceConfig\nfrom src.config.crop_config import CropConfig\nfrom src.live_portrait_pipeline_animal import LivePortraitPipelineAnimal\n\n\ndef partial_fields(target_class, kwargs):\n    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})\n\n\ndef fast_check_ffmpeg():\n    try:\n        subprocess.run([\"ffmpeg\", \"-version\"], capture_output=True, check=True)\n        return True\n    except:\n        return False\n\n\ndef fast_check_args(args: ArgumentConfig):\n    if not osp.exists(args.source):\n        raise FileNotFoundError(f\"source info not found: {args.source}\")\n    if not osp.exists(args.driving):\n        raise FileNotFoundError(f\"driving info not found: {args.driving}\")\n\n\ndef main():\n    # set tyro theme\n    tyro.extras.set_accent_color(\"bright_cyan\")\n    args = tyro.cli(ArgumentConfig)\n\n    ffmpeg_dir = os.path.join(os.getcwd(), \"ffmpeg\")\n    if osp.exists(ffmpeg_dir):\n        os.environ[\"PATH\"] += (os.pathsep + ffmpeg_dir)\n\n    if not fast_check_ffmpeg():\n        raise ImportError(\n            \"FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. https://ffmpeg.org/download.html\"\n        )\n\n    fast_check_args(args)\n\n    # specify configs for inference\n    inference_cfg = partial_fields(InferenceConfig, args.__dict__)\n    crop_cfg = partial_fields(CropConfig, args.__dict__)\n\n    live_portrait_pipeline_animal = LivePortraitPipelineAnimal(\n        inference_cfg=inference_cfg,\n        crop_cfg=crop_cfg\n    )\n\n    # run\n    live_portrait_pipeline_animal.execute(args)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "pretrained_weights/.gitkeep",
    "content": ""
  },
  {
    "path": "readme.md",
    "content": "<h1 align=\"center\">LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>\n\n<div align='center'>\n    <a href='https://github.com/cleardusk' target='_blank'><strong>Jianzhu Guo</strong></a><sup> 1*†</sup>&emsp;\n    <a href='https://github.com/Mystery099' target='_blank'><strong>Dingyun Zhang</strong></a><sup> 1,2*</sup>&emsp;\n    <a href='https://github.com/KwaiVGI' target='_blank'><strong>Xiaoqiang Liu</strong></a><sup> 1</sup>&emsp;\n    <a href='https://github.com/zzzweakman' target='_blank'><strong>Zhizhou Zhong</strong></a><sup> 1,3</sup>&emsp;\n    <a href='https://scholar.google.com.hk/citations?user=_8k1ubAAAAAJ' target='_blank'><strong>Yuan Zhang</strong></a><sup> 1</sup>&emsp;\n</div>\n\n<div align='center'>\n    <a href='https://scholar.google.com/citations?user=P6MraaYAAAAJ' target='_blank'><strong>Pengfei Wan</strong></a><sup> 1</sup>&emsp;\n    <a href='https://openreview.net/profile?id=~Di_ZHANG3' target='_blank'><strong>Di Zhang</strong></a><sup> 1</sup>&emsp;\n</div>\n\n<div align='center'>\n    <sup>1 </sup>Kuaishou Technology&emsp; <sup>2 </sup>University of Science and Technology of China&emsp; <sup>3 </sup>Fudan University&emsp;\n</div>\n<div align='center'>\n    <small><sup>*</sup> Equal contributions</small>\n    <small><sup>†</sup> Project lead</small>\n</div>\n<br>\n\n<!-- ===== LivePortrait – Quick Start & Links ===== -->\n<div align=\"center\">\n\n  <!-- 🚀 Quick Start buttons -->\n  <p>\n    <a href=\"https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240829.zip\" target=\"_blank\"><img src=\"https://img.shields.io/badge/🖥 Windows Installer-v20240829-00BFFF?style=for-the-badge&logo=windows&logoColor=white\" alt=\"Windows one-click installer\"></a>&nbsp;\n    <a href=\"https://huggingface.co/spaces/KlingTeam/LivePortrait\" target=\"_blank\"><img src=\"https://img.shields.io/badge/🌐 Try Online 
Demo-FF6F00?style=for-the-badge&logo=huggingface&logoColor=white\" alt=\"HuggingFace online demo\"></a>\n  </p>\n\n  <!-- 📄 Paper / project / GitHub stats -->\n  <p>\n    <a href=\"https://arxiv.org/pdf/2407.03168\" target=\"_blank\"><img src=\"https://img.shields.io/badge/arXiv-LivePortrait-red\" alt=\"arXiv link\"></a>&nbsp;\n    <a href=\"https://liveportrait.github.io\" target=\"_blank\"><img src=\"https://img.shields.io/badge/Project-Homepage-green\" alt=\"project homepage\"></a>&nbsp;\n    <a href=\"https://huggingface.co/spaces/KlingTeam/LivePortrait\" target=\"_blank\"><img src=\"https://img.shields.io/badge/🤗 Hugging Face-Spaces-blue\" alt=\"HF space\"></a>&nbsp;\n    <a href=\"https://hellogithub.com/repository/bed652ef02154dd7a434e0720125639e\" target=\"_blank\"><img src=\"https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=bed652ef02154dd7a434e0720125639e&claim_uid=XyBT2K9QJ7RZhej&theme=small\" alt=\"Featured by HelloGitHub\"></a>&nbsp;\n    <a href=\"https://github.com/KlingTeam/LivePortrait\" target=\"_blank\"><img src=\"https://img.shields.io/github/stars/KlingTeam/LivePortrait?style=social\" alt=\"GitHub stars\"></a>\n  </p>\n\n  <!-- 🌏 Language switch -->\n  <p><strong>English</strong> | <a href=\"./readme_zh_cn.md\"><strong>简体中文</strong></a></p>\n\n  <!-- 🎬 Showcase GIF -->\n  <p><img src=\"./assets/docs/showcase2.gif\" alt=\"LivePortrait showcase GIF\"></p>\n  <p>🔥 For more results, visit our <a href=\"https://liveportrait.github.io/\" target=\"_blank\"><strong>homepage</strong></a> 🔥</p>\n\n</div>\n<!-- ===== /LivePortrait ===== -->\n\n\n## 🔥 Updates\n- **`2025/06/01`**: 🌍 Over the past year, **LivePortrait** has 🚀 become an efficient portrait-animation (humans, cats and dogs) solution adopted by major video platforms—Kuaishou, Douyin, Jianying, WeChat Channels—as well as numerous startups and creators. 
🎉\n- **`2025/01/01`**: 🐶 We updated a new version of the Animals model with more data, see [**here**](./assets/docs/changelog/2025-01-01.md).\n- **`2024/10/18`**: ❗ We have updated the versions of the `transformers` and `gradio` libraries to avoid security vulnerabilities. Details [here](https://github.com/KlingTeam/LivePortrait/pull/421/files).\n- **`2024/08/29`**: 📦 We update the Windows [one-click installer](https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240829.zip) and support auto-updates, see [changelog](https://huggingface.co/cleardusk/LivePortrait-Windows#20240829).\n- **`2024/08/19`**: 🖼️ We support **image driven mode** and **regional control**. For details, see [**here**](./assets/docs/changelog/2024-08-19.md).\n- **`2024/08/06`**: 🎨 We support **precise portrait editing** in the Gradio interface, inspired by [ComfyUI-AdvancedLivePortrait](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait). See [**here**](./assets/docs/changelog/2024-08-06.md).\n- **`2024/08/05`**: 📦 Windows users can now download the [one-click installer](https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240806.zip) for Humans mode and **Animals mode** now! For details, see [**here**](./assets/docs/changelog/2024-08-05.md).\n- **`2024/08/02`**: 😸 We released a version of the **Animals model**, along with several other updates and improvements. Check out the details [**here**](./assets/docs/changelog/2024-08-02.md)!\n- **`2024/07/25`**: 📦 Windows users can now download the package from [HuggingFace](https://huggingface.co/cleardusk/LivePortrait-Windows/tree/main). Simply unzip and double-click `run_windows.bat` to enjoy!\n- **`2024/07/24`**: 🎨 We support pose editing for source portraits in the Gradio interface. We’ve also lowered the default detection threshold to increase recall. 
[Have fun](assets/docs/changelog/2024-07-24.md)!\n- **`2024/07/19`**: ✨ We support 🎞️ **portrait video editing (aka v2v)**! More to see [here](assets/docs/changelog/2024-07-19.md).\n- **`2024/07/17`**: 🍎 We support macOS with Apple Silicon, modified from [jeethu](https://github.com/jeethu)'s PR [#143](https://github.com/KlingTeam/LivePortrait/pull/143).\n- **`2024/07/10`**: 💪 We support audio and video concatenating, driving video auto-cropping, and template making to protect privacy. More to see [here](assets/docs/changelog/2024-07-10.md).\n- **`2024/07/09`**: 🤗 We released the [HuggingFace Space](https://huggingface.co/spaces/KlingTeam/LivePortrait), thanks to the HF team and [Gradio](https://github.com/gradio-app/gradio)!\n- **`2024/07/04`**: 😊 We released the initial version of the inference code and models. Continuous updates, stay tuned!\n- **`2024/07/04`**: 🔥 We released the [homepage](https://liveportrait.github.io) and technical report on [arXiv](https://arxiv.org/pdf/2407.03168).\n\n\n\n## Introduction 📖\nThis repo, named **LivePortrait**, contains the official PyTorch implementation of our paper [LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control](https://arxiv.org/pdf/2407.03168).\nWe are actively updating and improving this repository. If you find any bugs or have suggestions, welcome to raise issues or submit pull requests (PR) 💖.\n\n## Getting Started 🏁\n### 1. Clone the code and prepare the environment 🛠️\n\n> [!Note]\n> Make sure your system has [`git`](https://git-scm.com/), [`conda`](https://anaconda.org/anaconda/conda), and [`FFmpeg`](https://ffmpeg.org/download.html) installed. 
For details on FFmpeg installation, see [**how to install FFmpeg**](assets/docs/how-to-install-ffmpeg.md).\n\n```bash\ngit clone https://github.com/KlingTeam/LivePortrait\ncd LivePortrait\n\n# create env using conda\nconda create -n LivePortrait python=3.10\nconda activate LivePortrait\n```\n\n#### For Linux 🐧 or Windows 🪟 Users\n[X-Pose](https://github.com/IDEA-Research/X-Pose), required by Animals mode, is a dependency that needs to be installed. The step of `Check your CUDA versions` is **optional** if you only want to run Humans mode.\n\n<details>\n  <summary>Check your CUDA versions</summary>\n\n  Firstly, check your current CUDA version by:\n  ```bash\n  nvcc -V # example versions: 11.1, 11.8, 12.1, etc.\n  ```\n\n  Then, install the corresponding torch version. Here are examples for different CUDA versions. Visit the [PyTorch Official Website](https://pytorch.org/get-started/previous-versions) for installation commands if your CUDA version is not listed:\n  ```bash\n  # for CUDA 11.1\n  pip install torch==1.10.1+cu111 torchvision==0.11.2 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html\n  # for CUDA 11.8\n  pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu118\n  # for CUDA 12.1\n  pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121\n  # ...\n  ```\n\n  **Note**: On Windows systems, some higher versions of CUDA (such as 12.4, 12.6, etc.) may lead to unknown issues. You may consider downgrading CUDA to version 11.8 for stability. 
See the [downgrade guide](https://github.com/dimitribarbot/sd-webui-live-portrait/blob/main/assets/docs/how-to-install-xpose.md#cuda-toolkit-118) by [@dimitribarbot](https://github.com/dimitribarbot).\n</details>\n\n\nFinally, install the remaining dependencies:\n```bash\npip install -r requirements.txt\n```\n\n#### For macOS  with Apple Silicon Users\nThe [X-Pose](https://github.com/IDEA-Research/X-Pose) dependency does not support macOS, so you can skip its installation. While Humans mode works as usual, Animals mode is not supported. Use the provided requirements file for macOS with Apple Silicon:\n```bash\n# for macOS with Apple Silicon users\npip install -r requirements_macOS.txt\n```\n\n### 2. Download pretrained weights 📥\n\nThe easiest way to download the pretrained weights is from HuggingFace:\n```bash\n# !pip install -U \"huggingface_hub[cli]\"\nhuggingface-cli download KlingTeam/LivePortrait --local-dir pretrained_weights --exclude \"*.git*\" \"README.md\" \"docs\"\n```\n\nIf you cannot access to Huggingface, you can use [hf-mirror](https://hf-mirror.com/) to download:\n```bash\n# !pip install -U \"huggingface_hub[cli]\"\nexport HF_ENDPOINT=https://hf-mirror.com\nhuggingface-cli download KlingTeam/LivePortrait --local-dir pretrained_weights --exclude \"*.git*\" \"README.md\" \"docs\"\n```\n\nAlternatively, you can download all pretrained weights from [Google Drive](https://drive.google.com/drive/folders/1UtKgzKjFAOmZkhNK-OYT0caJ_w2XAnib) or [Baidu Yun](https://pan.baidu.com/s/1MGctWmNla_vZxDbEp2Dtzw?pwd=z5cn). Unzip and place them in `./pretrained_weights`.\n\nEnsuring the directory structure is as or contains [**this**](assets/docs/directory-structure.md).\n\n### 3. Inference 🚀\n\n#### Fast hands-on (humans) 👤\n```bash\n# For Linux and Windows users\npython inference.py\n\n# For macOS users with Apple Silicon (Intel is not tested). 
NOTE: this maybe 20x slower than RTX 4090\nPYTORCH_ENABLE_MPS_FALLBACK=1 python inference.py\n```\n\nIf the script runs successfully, you will get an output mp4 file named `animations/s6--d0_concat.mp4`. This file includes the following results: driving video, input image or video, and generated result.\n\n<p align=\"center\">\n  <img src=\"./assets/docs/inference.gif\" alt=\"image\">\n</p>\n\nOr, you can change the input by specifying the `-s` and `-d` arguments:\n\n```bash\n# source input is an image\npython inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4\n\n# source input is a video ✨\npython inference.py -s assets/examples/source/s13.mp4 -d assets/examples/driving/d0.mp4\n\n# more options to see\npython inference.py -h\n```\n\n#### Fast hands-on (animals) 🐱🐶\nAnimals mode is ONLY tested on Linux and Windows with NVIDIA GPU.\n\nYou need to build an OP named `MultiScaleDeformableAttention` first (refer to the <a href=\"#for-linux--or-windows--users\">Check your CUDA versions</a> if needed), which is used by [X-Pose](https://github.com/IDEA-Research/X-Pose), a general keypoint detection framework.\n\n```bash\ncd src/utils/dependencies/XPose/models/UniPose/ops\npython setup.py build install\ncd - # equal to cd ../../../../../../../\n```\n\nThen\n```bash\npython inference_animals.py -s assets/examples/source/s39.jpg -d assets/examples/driving/wink.pkl --driving_multiplier 1.75 --no_flag_stitching\n```\nIf the script runs successfully, you will get an output mp4 file named `animations/s39--wink_concat.mp4`.\n<p align=\"center\">\n  <img src=\"./assets/docs/inference-animals.gif\" alt=\"image\">\n</p>\n\n#### Driving video auto-cropping 📢📢📢\n> [!IMPORTANT]\n> To use your own driving video, we **recommend**: ⬇️\n> - Crop it to a **1:1** aspect ratio (e.g., 512x512 or 256x256 pixels), or enable auto-cropping by `--flag_crop_driving_video`.\n> - Focus on the head area, similar to the example videos.\n> - Minimize shoulder movement.\n> - 
Make sure the first frame of driving video is a frontal face with **neutral expression**.\n\nBelow is an auto-cropping case by `--flag_crop_driving_video`:\n```bash\npython inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d13.mp4 --flag_crop_driving_video\n```\n\nIf you find the results of auto-cropping is not well, you can modify the `--scale_crop_driving_video`, `--vy_ratio_crop_driving_video` options to adjust the scale and offset, or do it manually.\n\n#### Motion template making\nYou can also use the auto-generated motion template files ending with `.pkl` to speed up inference, and **protect privacy**, such as:\n```bash\npython inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d5.pkl # portrait animation\npython inference.py -s assets/examples/source/s13.mp4 -d assets/examples/driving/d5.pkl # portrait video editing\n```\n\n### 4. Gradio interface 🤗\n\nWe also provide a Gradio <a href='https://github.com/gradio-app/gradio'><img src='https://img.shields.io/github/stars/gradio-app/gradio'></a> interface for a better experience, just run by:\n\n```bash\n# For Linux and Windows users (and macOS with Intel??)\npython app.py # humans mode\n\n# For macOS with Apple Silicon users, Intel not supported, this maybe 20x slower than RTX 4090\nPYTORCH_ENABLE_MPS_FALLBACK=1 python app.py # humans mode\n```\n\nWe also provide a Gradio interface of animals mode, which is only tested on Linux with NVIDIA GPU:\n```bash\npython app_animals.py # animals mode 🐱🐶\n```\n\nYou can specify the `--server_port`, `--share`, `--server_name` arguments to satisfy your needs!\n\n🚀 We also provide an acceleration option `--flag_do_torch_compile`. The first-time inference triggers an optimization process (about one minute), making subsequent inferences 20-30% faster. 
Performance gains may vary with different CUDA versions.\n```bash\n# enable torch.compile for faster inference\npython app.py --flag_do_torch_compile\n```\n**Note**: This method is not supported on Windows and macOS.\n\n**Or, try it out effortlessly on [HuggingFace](https://huggingface.co/spaces/KlingTeam/LivePortrait) 🤗**\n\n### 5. Inference speed evaluation 🚀🚀🚀\nWe have also provided a script to evaluate the inference speed of each module:\n\n```bash\n# For NVIDIA GPU\npython speed.py\n```\n\nThe results are [**here**](./assets/docs/speed.md).\n\n## Community Resources 🤗\n\nDiscover the invaluable resources contributed by our community to enhance your LivePortrait experience.\n\n\n### Community-developed Projects\n\n| Repo | Description | Author / Links |\n|------|------|--------|\n| [**ditto-talkinghead**](https://github.com/antgroup/ditto-talkinghead) | Real-time audio-driven talking head. | [ArXiv](https://arxiv.org/abs/2411.19509), [Homepage](https://digital-avatar.github.io/ai/Ditto/)  |\n| [**FasterLivePortrait**](https://github.com/warmshao/FasterLivePortrait) | Faster real-time version using TensorRT. | [@warmshao](https://github.com/warmshao) |\n| [**AdvancedLivePortrait-WebUI**](https://github.com/jhj0517/AdvancedLivePortrait-WebUI) | Dedicated gradio based WebUI started from [ComfyUI-AdvancedLivePortrait](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait). | [@jhj0517](https://github.com/jhj0517) |\n| [**FacePoke**](https://github.com/jbilcke-hf/FacePoke) | A real-time head transformation app, controlled by your mouse! | [@jbilcke-hf](https://github.com/jbilcke-hf) |\n| [**FaceFusion**](https://github.com/facefusion/facefusion) | FaceFusion 3.0 integregates LivePortrait as `expression_restorer` and `face_editor` processors. 
| [@henryruhs](https://github.com/henryruhs) |\n| [**sd-webui-live-portrait**](https://github.com/dimitribarbot/sd-webui-live-portrait) | WebUI extension of LivePortrait, adding atab to the original Stable Diffusion WebUI to benefit from LivePortrait features. | [@dimitribarbot](https://github.com/dimitribarbot) |\n| [**ComfyUI-LivePortraitKJ**](https://github.com/kijai/ComfyUI-LivePortraitKJ) | A ComfyUI node to use LivePortrait, with MediaPipe as as an alternative to Insightface. | [@kijai](https://github.com/kijai) |\n| [**ComfyUI-AdvancedLivePortrait**](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait) | A faster ComfyUI node with real-time preview that has inspired many other community-developed tools and projects. | [@PowerHouseMan](https://github.com/PowerHouseMan) |\n| [**comfyui-liveportrait**](https://github.com/shadowcz007/comfyui-liveportrait) | A ComfyUI node to use LivePortrait, supporting multi-faces, expression interpolation etc, with a [tutorial](https://www.bilibili.com/video/BV1JW421R7sP). 
| [@shadowcz007](https://github.com/shadowcz007) |\n\n### Playgrounds, 🤗 HuggingFace Spaces and Others\n- [FacePoke Space](https://huggingface.co/spaces/jbilcke-hf/FacePoke)\n- [Expression Editor Space](https://huggingface.co/spaces/fffiloni/expression-editor)\n- [Expression Editor Replicate](https://replicate.com/fofr/expression-editor)\n- [Face Control Realtime Demo](https://fal.ai/demos/face-control) on FAL\n- [Replicate Playground](https://replicate.com/fofr/live-portrait)\n- Nuke can use LivePortrait through CompyUI node, details [here](https://x.com/bilawalsidhu/status/1837349806475276338)\n- LivePortrait lives on [Poe](https://poe.com/LivePortrait)\n\n### Video Tutorials\n- [Workflow of LivePortrait Video to Video](https://youtu.be/xfzK_6cTs58?si=aYjgypeJBkhc46VL) by [@curiousrefuge](https://www.youtube.com/@curiousrefuge)\n- [Google Colab tutorial](https://youtu.be/59Y9ePAXTp0?si=KzEWhklBlporW7D8) by [@Planet Ai](https://www.youtube.com/@planetai217)\n- [Paper reading](https://youtu.be/fD0P6UWSu8I?si=Vn5wxUa8qSu1jv4l) by [@TwoMinutePapers](https://www.youtube.com/@TwoMinutePapers)\n- [ComfyUI Advanced LivePortrait](https://youtu.be/q0Vf-ZZsbzI?si=nbs3npleH-dVCt28) by [TutoView](https://www.youtube.com/@TutoView)\n- [LivePortarit exploration](https://www.youtube.com/watch?v=vsvlbTEqgXQ) and [A deep dive into LivePortrait](https://youtu.be/cucaEEDYmsw?si=AtPaDWc5G-a4E8dD) by [TheoreticallyMedia](https://www.youtube.com/@TheoreticallyMedia)\n- [LivePortrait hands-on tutorial](https://www.youtube.com/watch?v=uyjSTAOY7yI) by [@AI Search](https://www.youtube.com/@theAIsearch)\n- [ComfyUI tutorial](https://www.youtube.com/watch?v=8-IcDDmiUMM) by [@Sebastian Kamph](https://www.youtube.com/@sebastiankamph)\n- A [tutorial](https://www.bilibili.com/video/BV1cf421i7Ly) on BiliBili\n\nAnd so MANY amazing contributions from our community, too many to list them all 💖\n\n## Acknowledgements 💐\nWe would like to thank the contributors of 
[FOMM](https://github.com/AliaksandrSiarohin/first-order-model), [Open Facevid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis), [SPADE](https://github.com/NVlabs/SPADE), [InsightFace](https://github.com/deepinsight/insightface) and [X-Pose](https://github.com/IDEA-Research/X-Pose) repositories, for their open research and contributions.\n\n## Ethics Considerations 🛡️\nPortrait animation technologies come with social risks, particularly the potential for misuse in creating deepfakes. To mitigate these risks, it’s crucial to follow ethical guidelines and adopt responsible usage practices. At present, the synthesized results contain visual artifacts that may help in detecting deepfakes. Please note that we do not assume any legal responsibility for the use of the results generated by this project.\n\n## Citation 💖\nIf you find LivePortrait useful for your project or research, welcome to 🌟 this repo and cite our work using the following BibTeX:\n```bibtex\n@article{guo2024liveportrait,\n  title   = {LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control},\n  author  = {Guo, Jianzhu and Zhang, Dingyun and Liu, Xiaoqiang and Zhong, Zhizhou and Zhang, Yuan and Wan, Pengfei and Zhang, Di},\n  journal = {arXiv preprint arXiv:2407.03168},\n  year    = {2024}\n}\n```\n\n*Long live in arXiv.*\n\n## Contact 📧\n[**Jianzhu Guo (郭建珠)**](https://guojianzhu.com); **guojianzhu1994@gmail.com**\n\n## Star History 🌟\n<details>\n  <summary>Click to view Star chart</summary>\n  <p align=\"center\">\n    <a href=\"https://www.star-history.com/#KlingAIResearch/LivePortrait&Timeline\" target=\"_blank\">\n      <picture>\n        <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://api.star-history.com/svg?repos=KlingAIResearch/LivePortrait&type=Timeline&theme=dark\" />\n        <source media=\"(prefers-color-scheme: light)\" 
srcset=\"https://api.star-history.com/svg?repos=KlingAIResearch/LivePortrait&type=Timeline\" />\n        <img alt=\"Star History Chart\" src=\"https://api.star-history.com/svg?repos=KlingAIResearch/LivePortrait&type=Timeline\" width=\"90%\" />\n      </picture>\n    </a>\n  </p>\n</details>\n"
  },
  {
    "path": "readme_zh_cn.md",
    "content": "<h1 align=\"center\">LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>\n\n<!-- ===== 作者信息 ===== -->\n<div align='center'>\n  <a href='https://github.com/cleardusk' target='_blank'><strong>郭建珠</strong></a><sup> 1*†</sup>&emsp;\n  <a href='https://github.com/Mystery099' target='_blank'><strong>张丁芸</strong></a><sup> 1,2*</sup>&emsp;\n  <a href='https://github.com/KwaiVGI' target='_blank'><strong>刘晓强</strong></a><sup> 1</sup>&emsp;\n  <a href='https://github.com/zzzweakman' target='_blank'><strong>钟智舟</strong></a><sup> 1,3</sup>&emsp;\n  <a href='https://scholar.google.com.hk/citations?user=_8k1ubAAAAAJ' target='_blank'><strong>张渊</strong></a><sup> 1</sup>&emsp;\n  <a href='https://scholar.google.com/citations?user=P6MraaYAAAAJ' target='_blank'><strong>万鹏飞</strong></a><sup> 1</sup>&emsp;\n  <a href='https://openreview.net/profile?id=~Di_ZHANG3' target='_blank'><strong>张迪</strong></a><sup> 1</sup>&emsp;\n</div>\n\n\n<div align='center'>\n  <sup>1 </sup>快手科技&emsp; <sup>2 </sup>中国科学技术大学&emsp; <sup>3 </sup>复旦大学&emsp;\n</div>\n<div align='center'>\n  <small><sup>*</sup> Equal contributions</small>\n  <small><sup>†</sup> Project lead</small>\n</div>\n<br>\n\n<!-- ===== LivePortrait – 快速上手 & 链接 ===== -->\n<div align=\"center\">\n\n  <!-- 🚀 快速体验按钮 -->\n  <p>\n    <a href=\"https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240829.zip\" target=\"_blank\"><img src=\"https://img.shields.io/badge/🖥 Windows 一键安装-v20240829-00BFFF?style=for-the-badge&logo=windows&logoColor=white\" alt=\"Windows 一键安装包\"></a>&nbsp;\n    <a href=\"https://huggingface.co/spaces/KlingTeam/LivePortrait\" target=\"_blank\"><img src=\"https://img.shields.io/badge/🌐 在线体验-FF6F00?style=for-the-badge&logo=huggingface&logoColor=white\" alt=\"HuggingFace 在线体验\"></a>\n  </p>\n\n  <!-- 📄 论文 / 项目 / Star 等徽章 -->\n  <p>\n    <a href=\"https://arxiv.org/pdf/2407.03168\" target=\"_blank\"><img 
src=\"https://img.shields.io/badge/arXiv-LivePortrait-red\" alt=\"arXiv link\"></a>&nbsp;\n    <a href=\"https://liveportrait.github.io\" target=\"_blank\"><img src=\"https://img.shields.io/badge/Project-Homepage-green\" alt=\"project homepage\"></a>&nbsp;\n    <a href=\"https://huggingface.co/spaces/KlingTeam/LivePortrait\" target=\"_blank\"><img src=\"https://img.shields.io/badge/🤗 Hugging Face-Spaces-blue\" alt=\"HF space\"></a>&nbsp;\n    <a href=\"https://hellogithub.com/repository/bed652ef02154dd7a434e0720125639e\" target=\"_blank\"><img src=\"https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=bed652ef02154dd7a434e0720125639e&claim_uid=XyBT2K9QJ7RZhej&theme=small\" alt=\"Featured by HelloGitHub\"></a>&nbsp;\n    <a href=\"https://github.com/KlingTeam/LivePortrait\" target=\"_blank\"><img src=\"https://img.shields.io/github/stars/KlingTeam/LivePortrait?style=social\" alt=\"GitHub stars\"></a>\n  </p>\n\n  <!-- 🌏 语言切换 -->\n  <p><a href=\"./readme.md\"><strong>English</strong></a> | <strong>简体中文</strong></p>\n\n  <!-- 🎬 Showcase GIF -->\n  <p><img src=\"./assets/docs/showcase2.gif\" alt=\"LivePortrait 效果展示 GIF\"></p>\n  <p>🔥 更多效果，请访问我们的 <a href=\"https://liveportrait.github.io/\" target=\"_blank\"><strong>主页</strong></a> 🔥</p>\n\n</div>\n<!-- ===== /LivePortrait ===== -->\n\n\n## 🔥 更新日志\n- **`2025/06/01`**：🌍 过去一年里，LivePortrait 🚀 已成为高效的人像与宠物（猫狗）动画解决方案，被快手、抖音、剪映、视频号等主流视频平台，以及众多初创公司和创作者所采用。🎉\n- **`2025/01/01`**：🐶 我们更新了一版动物模型（使用了更多动物数据），具体查看[**这里**](./assets/docs/changelog/2025-01-01.md).\n- **`2024/10/18`**：❗ 我们更新了`transformers`，`gradio`库的版本避免安全漏洞，具体查看[这里](https://github.com/KlingTeam/LivePortrait/pull/421/files).\n- **`2024/08/29`**：📦 我们更新了Windows[一键安装程序](https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240829.zip)并支持自动更新, 详情建[这里](https://huggingface.co/cleardusk/LivePortrait-Windows#20240829)。\n- **`2024/08/19`**：🖼️ 我们支持了**图像驱动模式**和**区域控制**。详情请见[**这里**](./assets/docs/changelog/2024-08-19.md)。\n- **`2024/08/06`**：🎨 
我们在Gradio界面支持**精确的人像编辑**, 受到[ComfyUI-AdvancedLivePortrait](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait)启发。详见[**这里**](./assets/docs/changelog/2024-08-06.md)。\n- **`2024/08/05`**：📦 Windows用户现在可以下载[一键安装程序](https://huggingface.co/cleardusk/LivePortrait-Windows/blob/main/LivePortrait-Windows-v20240806.zip)，支持**人类模式**和**动物模式**！详情见[**这里**](./assets/docs/changelog/2024-08-05.md)。\n- **`2024/08/02`**：😸 我们发布了**动物模型**版本，以及其他一些更新和改进。查看详情[**这里**](./assets/docs/changelog/2024-08-02.md)！\n- **`2024/07/25`**：📦 Windows用户现在可以从 [HuggingFace](https://huggingface.co/cleardusk/LivePortrait-Windows/tree/main) 或 [百度云](https://pan.baidu.com/s/1FWsWqKe0eNfXrwjEhhCqlw?pwd=86q2) 下载软件包。解压并双击`run_windows.bat`即可享受！\n- **`2024/07/24`**：🎨 我们在Gradio界面支持源人像的姿势编辑。我们还降低了默认检测阈值以增加召回率。[玩得开心](assets/docs/changelog/2024-07-24.md)！\n- **`2024/07/19`**：✨ 我们支持🎞️ **人像视频编辑（aka v2v）**！更多信息见[**这里**](assets/docs/changelog/2024-07-19.md)。\n- **`2024/07/17`**：🍎 我们支持macOS搭载Apple Silicon，修改来自 [jeethu](https://github.com/jeethu) 的PR [#143](https://github.com/KlingTeam/LivePortrait/pull/143) 。\n- **`2024/07/10`**：💪我们支持音频和视频拼接、驱动视频自动裁剪以及制作模板以保护隐私。更多信息见[这里](assets/docs/changelog/2024-07-10.md)。\n- **`2024/07/09`**：🤗 我们发布了[HuggingFace Space](https://huggingface.co/spaces/KlingTeam/LivePortrait)，感谢HF团队和[Gradio](https://github.com/gradio-app/gradio)！\n- **`2024/07/04`**：😊 我们发布了初始版本的推理代码和模型。持续更新，敬请关注！\n- **`2024/07/04`**：🔥 我们发布了[主页](https://liveportrait.github.io)和在[arXiv](https://arxiv.org/pdf/2407.03168)上的技术报告。\n\n\n\n## 介绍 📖\n此仓库名为**LivePortrait**，包含我们论文（[LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control](https://arxiv.org/pdf/2407.03168)）的官方PyTorch实现。 我们正在积极更新和改进此仓库。如果您发现任何错误或有建议，欢迎提出问题或提交合并请求💖。\n\n## 上手指南 🏁\n### 1. 
克隆代码和安装运行环境 🛠️\n\n> [!Note]\n> 确保您的系统已安装[`git`](https://git-scm.com/)、[`conda`](https://anaconda.org/anaconda/conda)和[`FFmpeg`](https://ffmpeg.org/download.html)。有关FFmpeg安装的详细信息，见[**如何安装FFmpeg**](assets/docs/how-to-install-ffmpeg.md)。\n\n```bash\ngit clone https://github.com/KlingTeam/LivePortrait\ncd LivePortrait\n\n# 使用conda创建环境\nconda create -n LivePortrait python=3.10\nconda activate LivePortrait\n```\n\n#### 对于Linux或Windows用户\n\n[X-Pose](https://github.com/IDEA-Research/X-Pose)需要您的`torch`版本与CUDA版本兼容。\n\n首先，通过以下命令检查您当前的CUDA版本：\n\n```bash\nnvcc -V # example versions: 11.1, 11.8, 12.1, etc.\n```\n\n然后，安装相应版本的torch。以下是不同CUDA版本的示例。如果您的CUDA版本未列出，请访问[PyTorch官方网站](https://pytorch.org/get-started/previous-versions)获取安装命令：\n```bash\n# for CUDA 11.1\npip install torch==1.10.1+cu111 torchvision==0.11.2 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html\n# for CUDA 11.8\npip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu118\n# for CUDA 12.1\npip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121\n# ...\n```\n\n**注意**：在Windows系统上，一些过高版本的CUDA(12.4、12.6等)可能会导致未知的问题，您可以考虑降低您的CUDA版本到11.8，这是我们测试的一个较为稳定的版本。降级方法可以参考 [@dimitribarbot](https://github.com/dimitribarbot)\n提供的[文档](https://github.com/dimitribarbot/sd-webui-live-portrait/blob/main/assets/docs/how-to-install-xpose.md#cuda-toolkit-118).\n\n最后，安装其余依赖项：\n\n```bash\npip install -r requirements.txt\n```\n\n#### 对于搭载Apple Silicon的macOS用户\n\n[X-Pose](https://github.com/IDEA-Research/X-Pose)依赖项不支持macOS，因此您可以跳过其安装。人类模式照常工作，但不支持动物模式。使用为搭载Apple Silicon的macOS提供的requirements文件：\n\n```bash\n# 对于搭载Apple Silicon的macOS用户\npip install -r requirements_macOS.txt\n```\n\n### 2. 
下载预训练权重(Pretrained weights) 📥\n\n从HuggingFace下载预训练权重的最简单方法是：\n```bash\n# !pip install -U \"huggingface_hub[cli]\"\nhuggingface-cli download KlingTeam/LivePortrait --local-dir pretrained_weights --exclude \"*.git*\" \"README.md\" \"docs\"\n```\n\n若您不能访问HuggingFace平台，你可以访问其镜像网站[hf-mirror](https://hf-mirror.com/)进行下载操作：\n\n```bash\n# !pip install -U \"huggingface_hub[cli]\"\nexport HF_ENDPOINT=https://hf-mirror.com\nhuggingface-cli download KlingTeam/LivePortrait --local-dir pretrained_weights --exclude \"*.git*\" \"README.md\" \"docs\"\n```\n\n或者，您可以从[Google Drive](https://drive.google.com/drive/folders/1UtKgzKjFAOmZkhNK-OYT0caJ_w2XAnib)或[百度云](https://pan.baidu.com/s/1MGctWmNla_vZxDbEp2Dtzw?pwd=z5cn)（进行中）下载所有预训练权重。解压并将它们放置在`./pretrained_weights`目录下。\n\n确保目录结构如所示包含[**本仓库该路径**](assets/docs/directory-structure.md)其中展示的内容。\n\n### 3. 推理 🚀\n\n#### 快速上手（人类模型）👤\n\n```bash\n# 对于Linux和Windows用户\npython inference.py\n\n# 对于搭载Apple Silicon的macOS用户（Intel未测试）。注意：这可能比RTX 4090慢20倍\nPYTORCH_ENABLE_MPS_FALLBACK=1 python inference.py\n```\n\n如果脚本成功运行，您将得到一个名为`animations/s6--d0_concat.mp4`的输出mp4文件。此文件包含以下结果：驱动视频、输入图像或视频以及生成结果。\n\n<p align=\"center\">\n  <img src=\"./assets/docs/inference.gif\" alt=\"image\">\n</p>\n或者，您可以通过指定`-s`和`-d`参数来更改输入：\n\n```bash\n# 源输入是图像\npython inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4\n\n# 源输入是视频 ✨\npython inference.py -s assets/examples/source/s13.mp4 -d assets/examples/driving/d0.mp4\n\n# 更多选项请见\npython inference.py -h\n```\n\n#### 快速上手（动物模型） 🐱🐶\n\n动物模式仅在Linux和Windows上经过测试，并且需要NVIDIA GPU。\n\n您需要首先构建一个名为`MultiScaleDeformableAttention`的OP，该OP由[X-Pose](https://github.com/IDEA-Research/X-Pose)使用，这是一个通用的关键点检测框架。\n\n```bash\ncd src/utils/dependencies/XPose/models/UniPose/ops\npython setup.py build install\ncd - # 等同于 cd ../../../../../../../\n```\n\n然后执行\n```bash\npython inference_animals.py -s assets/examples/source/s39.jpg -d assets/examples/driving/wink.pkl --driving_multiplier 1.75 
--no_flag_stitching\n```\n如果脚本成功运行，您将得到一个名为`animations/s39--wink_concat.mp4`的输出mp4文件。\n<p align=\"center\">\n  <img src=\"./assets/docs/inference-animals.gif\" alt=\"image\">\n</p>\n\n#### 驱动视频自动裁剪 📢📢📢\n\n> [!IMPORTANT]\n> 使用您自己的驱动视频时，我们**推荐**： ⬇️\n>\n> - 将其裁剪为**1:1**的宽高比（例如，512x512或256x256像素），或通过`--flag_crop_driving_video`启用自动裁剪。\n> - 专注于头部区域，类似于示例视频。\n> - 最小化肩部运动。\n> - 确保驱动视频的第一帧是具有**中性表情**的正面面部。\n\n以下是通过`--flag_crop_driving_video`自动裁剪的示例：\n\n```bash\npython inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d13.mp4 --flag_crop_driving_video\n```\n\n如果自动裁剪的结果不理想，您可以修改`--scale_crop_driving_video`、`--vy_ratio_crop_driving_video`选项来调整比例和偏移，或者手动进行调整。\n\n#### 动作模板制作\n\n您也可以使用以`.pkl`结尾的自动生成的动作模板文件来加快推理速度，并**保护隐私**，例如：\n```bash\npython inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d5.pkl # 人像动画\npython inference.py -s assets/examples/source/s13.mp4 -d assets/examples/driving/d5.pkl # 人像视频编辑\n```\n\n### 4. Gradio 界面 🤗\n\n我们还提供了Gradio界面 <a href='https://github.com/gradio-app/gradio'><img src='https://img.shields.io/github/stars/gradio-app/gradio'></a>，以获得更好的体验，只需运行：\n\n```bash\n# 对于Linux和Windows用户（以及搭载Intel的macOS？？）\npython app.py # 人类模型模式\n\n# 对于搭载Apple Silicon的macOS用户，不支持Intel，这可能比RTX 4090慢20倍\nPYTORCH_ENABLE_MPS_FALLBACK=1 python app.py # 人类模型模式\n```\n\n我们还为动物模式提供了Gradio界面，这仅在Linux上经过NVIDIA GPU测试：\n```bash\npython app_animals.py # animals mode 🐱🐶\n```\n\n您可以指定`--server_port`、`--share`、`--server_name`参数以满足您的需求！\n\n🚀我们还提供了一个加速选项`--flag_do_torch_compile`。第一次推理触发优化过程（约一分钟），使后续推理速度提高20-30%。不同CUDA版本的性能提升可能有所不同。\n\n```bash\n# 启用torch.compile以进行更快的推理\npython app.py --flag_do_torch_compile\n```\n**注意**：此方法在Windows和macOS上不受支持。\n\n**或者，在[HuggingFace](https://huggingface.co/spaces/KlingTeam/LivePortrait)上轻松尝试**🤗。\n\n### 5. 
推理速度预估 🚀🚀🚀\n我们还提供了一个脚本来评估每个模块的推理速度：\n\n```bash\n# 对于NVIDIA GPU\npython speed.py\n```\n\n结果在[**本仓库该文件展示**](./assets/docs/speed.md).\n\n## 社区资源 🤗\n\n### 社区项目\n\n| 仓库 | 描述 | 作者 / 链接 |\n|------|------|--------|\n| [**ditto-talkinghead**](https://github.com/antgroup/ditto-talkinghead) | 实时音频驱动。 | [论文](https://arxiv.org/abs/2411.19509), [主页](https://digital-avatar.github.io/ai/Ditto/) |\n| [**FasterLivePortrait**](https://github.com/warmshao/FasterLivePortrait) | 基于TensorRT加速更快的实时版本。 | [@warmshao](https://github.com/warmshao) |\n| [**AdvancedLivePortrait-WebUI**](https://github.com/jhj0517/AdvancedLivePortrait-WebUI) | Dedicated gradio based WebUI started from [ComfyUI-AdvancedLivePortrait](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait) | [@jhj0517](https://github.com/jhj0517) |\n| [**FacePoke**](https://github.com/jbilcke-hf/FacePoke) | 一个实时的头部姿态表情控制应用，通过鼠标控制！ | [@jbilcke-hf](https://github.com/jbilcke-hf) |\n| [**FaceFusion**](https://github.com/facefusion/facefusion) | FaceFusion 3.0 集成了 LivePortrait 作为 `expression_restorer` 和 `face_editor` 处理器。 | [@henryruhs](https://github.com/henryruhs) |\n| [**sd-webui-live-portrait**](https://github.com/dimitribarbot/sd-webui-live-portrait) | LivePortrait 的 WebUI 扩展，在原版 Stable Diffusion WebUI 中添加了一个标签以使用 LivePortrait 的功能。 | [@dimitribarbot](https://github.com/dimitribarbot) |\n| [**ComfyUI-LivePortraitKJ**](https://github.com/kijai/ComfyUI-LivePortraitKJ) | 一个用于 LivePortrait 的 ComfyUI 节点，使用 MediaPipe 作为 Insightface 的替代方案。 | [@kijai](https://github.com/kijai) |\n| [**ComfyUI-AdvancedLivePortrait**](https://github.com/PowerHouseMan/ComfyUI-AdvancedLivePortrait) | 一个更快的 ComfyUI 节点，具有实时预览功能，启发了许多社区开发的工具和项目。 | [@PowerHouseMan](https://github.com/PowerHouseMan) |\n| [**comfyui-liveportrait**](https://github.com/shadowcz007/comfyui-liveportrait) | 一个用于 LivePortrait 的 ComfyUI 节点，支持多面部、表情插值等功能，并有[教程](https://www.bilibili.com/video/BV1JW421R7sP)。 | [@shadowcz007](https://github.com/shadowcz007) |\n\n### Playgrounds, 🤗 
HuggingFace Spaces 以及其它\n- [FacePoke Space](https://huggingface.co/spaces/jbilcke-hf/FacePoke)\n- [Expression Editor Space](https://huggingface.co/spaces/fffiloni/expression-editor)\n- [Expression Editor Replicate](https://replicate.com/fofr/expression-editor)\n- [Face Control Realtime Demo](https://fal.ai/demos/face-control) on FAL\n- [Replicate Playground](https://replicate.com/fofr/live-portrait)\n- Nuke 可以通过 CompyUI 节点使用 LivePortrait，详情见[这里](https://x.com/bilawalsidhu/status/1837349806475276338)\n- LivePortrait 在 [Poe](https://poe.com/LivePortrait) 上运行\n\n### 视频教程\n- [LivePortrait 视频转视频的工作流程](https://youtu.be/xfzK_6cTs58?si=aYjgypeJBkhc46VL) 由 [@curiousrefuge](https://www.youtube.com/@curiousrefuge) 制作\n- [Google Colab 教程](https://youtu.be/59Y9ePAXTp0?si=KzEWhklBlporW7D8) 由 [@Planet Ai](https://www.youtube.com/@planetai217) 制作\n- [论文解读](https://youtu.be/fD0P6UWSu8I?si=Vn5wxUa8qSu1jv4l) 由 [@TwoMinutePapers](https://www.youtube.com/@TwoMinutePapers) 制作\n- [ComfyUI 高级 LivePortrait 教程](https://youtu.be/q0Vf-ZZsbzI?si=nbs3npleH-dVCt28) 由 [TutoView](https://www.youtube.com/@TutoView) 制作\n- [LivePortrait 探索](https://www.youtube.com/watch?v=vsvlbTEqgXQ) 和 [LivePortrait 深入探讨](https://youtu.be/cucaEEDYmsw?si=AtPaDWc5G-a4E8dD) 由 [TheoreticallyMedia](https://www.youtube.com/@TheoreticallyMedia) 制作\n- [LivePortrait 实战教程](https://www.youtube.com/watch?v=uyjSTAOY7yI) 由 [@AI Search](https://www.youtube.com/@theAIsearch) 制作\n- [ComfyUI 教程](https://www.youtube.com/watch?v=8-IcDDmiUMM) 由 [@Sebastian Kamph](https://www.youtube.com/@sebastiankamph) 制作\n- B 站上的[教程](https://www.bilibili.com/video/BV1cf421i7Ly)\n\n还有来自社区的无数令人惊叹的贡献，未能一一列举 💖\n\n## 致谢 💐\n\n我们要感谢[FOMM](https://github.com/AliaksandrSiarohin/first-order-model)、[Open 
Facevid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis)、[SPADE](https://github.com/NVlabs/SPADE)、[InsightFace](https://github.com/deepinsight/insightface)和[X-Pose](https://github.com/IDEA-Research/X-Pose)仓库的贡献者，感谢他们的开放研究和贡献。\n\n## 道德考量 🛡️\n肖像动画技术伴随着社会风险，特别是在创建深度伪造（deepfakes）时可能被滥用。为了减轻这些风险，遵循道德指南并采取负责任的使用实践至关重要。目前，生成的结果包含一些视觉伪影，这些伪影可能有助于检测深度伪造。请注意，我们不对本项目生成的结果的使用承担任何法律责任。\n\n## 引用 💖\n\n如果您发现LivePortrait对您的研究有用，欢迎引用我们的工作，使用以下BibTeX：\n\n```bibtex\n@article{guo2024liveportrait,\n  title   = {LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control},\n  author  = {Guo, Jianzhu and Zhang, Dingyun and Liu, Xiaoqiang and Zhong, Zhizhou and Zhang, Yuan and Wan, Pengfei and Zhang, Di},\n  journal = {arXiv preprint arXiv:2407.03168},\n  year    = {2024}\n}\n```\n\n## 联系方式 📧\n\n[**Jianzhu Guo (郭建珠)**](https://guojianzhu.com); **guojianzhu1994@gmail.com**；\n\n## Star History 🌟\n<details>\n  <summary>点击展开查看项目 Star 曲线</summary>\n  <p align=\"center\">\n    <a href=\"https://www.star-history.com/#KlingAIResearch/LivePortrait&Timeline\" target=\"_blank\">\n      <picture>\n        <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://api.star-history.com/svg?repos=KlingAIResearch/LivePortrait&type=Timeline&theme=dark\" />\n        <source media=\"(prefers-color-scheme: light)\" srcset=\"https://api.star-history.com/svg?repos=KlingAIResearch/LivePortrait&type=Timeline\" />\n        <img alt=\"Star History Chart\" src=\"https://api.star-history.com/svg?repos=KlingAIResearch/LivePortrait&type=Timeline\" width=\"90%\" />\n      </picture>\n    </a>\n  </p>\n</details>\n"
  },
  {
    "path": "requirements.txt",
    "content": "-r requirements_base.txt\n\nonnxruntime-gpu==1.18.0\ntransformers==4.38.0\n"
  },
  {
    "path": "requirements_base.txt",
    "content": "numpy==1.26.4\npyyaml==6.0.1\nopencv-python==4.10.0.84\nscipy==1.13.1\nimageio==2.34.2\nlmdb==1.4.1\ntqdm==4.66.4\nrich==13.7.1\nffmpeg-python==0.2.0\nonnx==1.16.1\nscikit-image==0.24.0\nalbumentations==1.4.10\nmatplotlib==3.9.0\nimageio-ffmpeg==0.5.1\ntyro==0.8.5\ngradio==5.1.0\npykalman==0.9.7\npillow>=10.2.0"
  },
  {
    "path": "requirements_macOS.txt",
    "content": "-r requirements_base.txt\n\n--extra-index-url https://download.pytorch.org/whl/cpu\ntorch==2.3.0\ntorchvision==0.18.0\ntorchaudio==2.3.0\nonnxruntime-silicon==1.16.3\n"
  },
  {
    "path": "speed.py",
    "content": "# coding: utf-8\n\n\"\"\"\nBenchmark the inference speed of each module in LivePortrait.\n\nTODO: heavy GPT style, need to refactor\n\"\"\"\n\nimport torch\ntorch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution\n\nimport yaml\nimport time\nimport numpy as np\n\nfrom src.utils.helper import load_model, concat_feat\nfrom src.config.inference_config import InferenceConfig\n\n\ndef initialize_inputs(batch_size=1, device_id=0):\n    \"\"\"\n    Generate random input tensors and move them to GPU\n    \"\"\"\n    feature_3d = torch.randn(batch_size, 32, 16, 64, 64).to(device_id).half()\n    kp_source = torch.randn(batch_size, 21, 3).to(device_id).half()\n    kp_driving = torch.randn(batch_size, 21, 3).to(device_id).half()\n    source_image = torch.randn(batch_size, 3, 256, 256).to(device_id).half()\n    generator_input = torch.randn(batch_size, 256, 64, 64).to(device_id).half()\n    eye_close_ratio = torch.randn(batch_size, 3).to(device_id).half()\n    lip_close_ratio = torch.randn(batch_size, 2).to(device_id).half()\n    feat_stitching = concat_feat(kp_source, kp_driving).half()\n    feat_eye = concat_feat(kp_source, eye_close_ratio).half()\n    feat_lip = concat_feat(kp_source, lip_close_ratio).half()\n\n    inputs = {\n        'feature_3d': feature_3d,\n        'kp_source': kp_source,\n        'kp_driving': kp_driving,\n        'source_image': source_image,\n        'generator_input': generator_input,\n        'feat_stitching': feat_stitching,\n        'feat_eye': feat_eye,\n        'feat_lip': feat_lip\n    }\n\n    return inputs\n\n\ndef load_and_compile_models(cfg, model_config):\n    \"\"\"\n    Load and compile models for inference\n    \"\"\"\n    appearance_feature_extractor = load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor')\n    motion_extractor = load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor')\n    warping_module = 
load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module')\n    spade_generator = load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator')\n    stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module')\n\n    models_with_params = [\n        ('Appearance Feature Extractor', appearance_feature_extractor),\n        ('Motion Extractor', motion_extractor),\n        ('Warping Network', warping_module),\n        ('SPADE Decoder', spade_generator)\n    ]\n\n    compiled_models = {}\n    for name, model in models_with_params:\n        model = model.half()\n        model = torch.compile(model, mode='max-autotune')  # Optimize for inference\n        model.eval()  # Switch to evaluation mode\n        compiled_models[name] = model\n\n    retargeting_models = ['stitching', 'eye', 'lip']\n    for retarget in retargeting_models:\n        module = stitching_retargeting_module[retarget].half()\n        module = torch.compile(module, mode='max-autotune')  # Optimize for inference\n        module.eval()  # Switch to evaluation mode\n        stitching_retargeting_module[retarget] = module\n\n    return compiled_models, stitching_retargeting_module\n\n\ndef warm_up_models(compiled_models, stitching_retargeting_module, inputs):\n    \"\"\"\n    Warm up models to prepare them for benchmarking\n    \"\"\"\n    print(\"Warm up start!\")\n    with torch.no_grad():\n        for _ in range(10):\n            compiled_models['Appearance Feature Extractor'](inputs['source_image'])\n            compiled_models['Motion Extractor'](inputs['source_image'])\n            compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])\n            compiled_models['SPADE Decoder'](inputs['generator_input'])  # Adjust input as required\n            stitching_retargeting_module['stitching'](inputs['feat_stitching'])\n            
stitching_retargeting_module['eye'](inputs['feat_eye'])\n            stitching_retargeting_module['lip'](inputs['feat_lip'])\n    print(\"Warm up end!\")\n\n\ndef measure_inference_times(compiled_models, stitching_retargeting_module, inputs):\n    \"\"\"\n    Measure inference times for each model\n    \"\"\"\n    times = {name: [] for name in compiled_models.keys()}\n    times['Stitching and Retargeting Modules'] = []\n\n    overall_times = []\n\n    with torch.no_grad():\n        for _ in range(100):\n            torch.cuda.synchronize()\n            overall_start = time.time()\n\n            start = time.time()\n            compiled_models['Appearance Feature Extractor'](inputs['source_image'])\n            torch.cuda.synchronize()\n            times['Appearance Feature Extractor'].append(time.time() - start)\n\n            start = time.time()\n            compiled_models['Motion Extractor'](inputs['source_image'])\n            torch.cuda.synchronize()\n            times['Motion Extractor'].append(time.time() - start)\n\n            start = time.time()\n            compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])\n            torch.cuda.synchronize()\n            times['Warping Network'].append(time.time() - start)\n\n            start = time.time()\n            compiled_models['SPADE Decoder'](inputs['generator_input'])  # Adjust input as required\n            torch.cuda.synchronize()\n            times['SPADE Decoder'].append(time.time() - start)\n\n            start = time.time()\n            stitching_retargeting_module['stitching'](inputs['feat_stitching'])\n            stitching_retargeting_module['eye'](inputs['feat_eye'])\n            stitching_retargeting_module['lip'](inputs['feat_lip'])\n            torch.cuda.synchronize()\n            times['Stitching and Retargeting Modules'].append(time.time() - start)\n\n            overall_times.append(time.time() - overall_start)\n\n    return times, 
overall_times\n\n\ndef print_benchmark_results(compiled_models, stitching_retargeting_module, retargeting_models, times, overall_times):\n    \"\"\"\n    Print benchmark results with average and standard deviation of inference times\n    \"\"\"\n    average_times = {name: np.mean(times[name]) * 1000 for name in times.keys()}\n    std_times = {name: np.std(times[name]) * 1000 for name in times.keys()}\n\n    for name, model in compiled_models.items():\n        num_params = sum(p.numel() for p in model.parameters())\n        num_params_in_millions = num_params / 1e6\n        print(f\"Number of parameters for {name}: {num_params_in_millions:.2f} M\")\n\n    for index, retarget in enumerate(retargeting_models):\n        num_params = sum(p.numel() for p in stitching_retargeting_module[retarget].parameters())\n        num_params_in_millions = num_params / 1e6\n        print(f\"Number of parameters for part_{index} in Stitching and Retargeting Modules: {num_params_in_millions:.2f} M\")\n\n    for name, avg_time in average_times.items():\n        std_time = std_times[name]\n        print(f\"Average inference time for {name} over 100 runs: {avg_time:.2f} ms (std: {std_time:.2f} ms)\")\n\n\ndef main():\n    \"\"\"\n    Main function to benchmark speed and model parameters\n    \"\"\"\n    # Load configuration\n    cfg = InferenceConfig()\n    model_config_path = cfg.models_config\n    with open(model_config_path, 'r') as file:\n        model_config = yaml.safe_load(file)\n\n    # Sample input tensors\n    inputs = initialize_inputs(device_id = cfg.device_id)\n\n    # Load and compile models\n    compiled_models, stitching_retargeting_module = load_and_compile_models(cfg, model_config)\n\n    # Warm up models\n    warm_up_models(compiled_models, stitching_retargeting_module, inputs)\n\n    # Measure inference times\n    times, overall_times = measure_inference_times(compiled_models, stitching_retargeting_module, inputs)\n\n    # Print benchmark results\n    
print_benchmark_results(compiled_models, stitching_retargeting_module, ['stitching', 'eye', 'lip'], times, overall_times)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/config/__init__.py",
    "content": ""
  },
  {
    "path": "src/config/argument_config.py",
    "content": "# coding: utf-8\n\n\"\"\"\nAll configs for user\n\"\"\"\nfrom dataclasses import dataclass\nimport tyro\nfrom typing_extensions import Annotated\nfrom typing import Optional, Literal\nfrom .base_config import PrintableConfig, make_abs_path\n\n\n@dataclass(repr=False)  # use repr from PrintableConfig\nclass ArgumentConfig(PrintableConfig):\n    ########## input arguments ##########\n    source: Annotated[str, tyro.conf.arg(aliases=[\"-s\"])] = make_abs_path('../../assets/examples/source/s0.jpg')  # path to the source portrait (human/animal) or video (human)\n    driving:  Annotated[str, tyro.conf.arg(aliases=[\"-d\"])] = make_abs_path('../../assets/examples/driving/d0.mp4')  # path to driving video or template (.pkl format)\n    output_dir: Annotated[str, tyro.conf.arg(aliases=[\"-o\"])] = 'animations/'  # directory to save output video\n\n    ########## inference arguments ##########\n    flag_use_half_precision: bool = True  # whether to use half precision (FP16). If black boxes appear, it might be due to GPU incompatibility; set to False.\n    flag_crop_driving_video: bool = False  # whether to crop the driving video, if the given driving info is a video\n    device_id: int = 0  # gpu device id\n    flag_force_cpu: bool = False  # force cpu inference, WIP!\n    flag_normalize_lip: bool = False  # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False\n    flag_source_video_eye_retargeting: bool = False  # when the input is a source video, whether to let the eye-open scalar of each frame to be the same as the first source frame before the animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False, may cause the inter-frame jittering\n    flag_eye_retargeting: bool = False  # not recommend to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame\n    
flag_lip_retargeting: bool = False  # not recommend to be True, WIP; whether to transfer the lip-open ratio of each driving frame to the source image or the corresponding source frame\n    flag_stitching: bool = True  # recommend to True if head movement is small, False if head movement is large or the source image is an animal\n    flag_relative_motion: bool = True # whether to use relative motion\n    flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space\n    flag_do_crop: bool = True  # whether to crop the source portrait or video to the face-cropping space\n    driving_option: Literal[\"expression-friendly\", \"pose-friendly\"] = \"expression-friendly\" # \"expression-friendly\" or \"pose-friendly\"; \"expression-friendly\" would adapt the driving motion with the global multiplier, and could be used when the source is a human image\n    driving_multiplier: float = 1.0 # be used only when driving_option is \"expression-friendly\"\n    driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy\n    audio_priority: Literal['source', 'driving'] = 'driving'  # whether to use the audio from source or driving video\n    animation_region: Literal[\"exp\", \"pose\", \"lip\", \"eyes\", \"all\"] = \"all\" # the region where the animation was performed, \"exp\" means the expression, \"pose\" means the head pose, \"all\" means all regions\n    ########## source crop arguments ##########\n    det_thresh: float = 0.15 # detection threshold\n    scale: float = 2.3  # the ratio of face area is smaller if scale is larger\n    vx_ratio: float = 0  # the ratio to move the face to left or right in cropping space\n    vy_ratio: float = -0.125  # the ratio to move the face to up or down in cropping space\n    
flag_do_rot: bool = True  # whether to conduct the rotation when flag_do_crop is True\n    source_max_dim: int = 1280 # the max dim of height and width of source image or video, you can change it to a larger number, e.g., 1920\n    source_division: int = 2 # make sure the height and width of source image or video can be divided by this number\n\n    ########## driving crop arguments ##########\n    scale_crop_driving_video: float = 2.2  # scale factor for cropping driving video\n    vx_ratio_crop_driving_video: float = 0.  # adjust x offset\n    vy_ratio_crop_driving_video: float = -0.1  # adjust y offset\n\n    ########## gradio arguments ##########\n    server_port: Annotated[int, tyro.conf.arg(aliases=[\"-p\"])] = 8890  # port for gradio server\n    share: bool = False  # whether to share the server to public\n    server_name: Optional[str] = \"127.0.0.1\"  # set the local server name, \"0.0.0.0\" to broadcast all\n    flag_do_torch_compile: bool = False  # whether to use torch.compile to accelerate generation\n    gradio_temp_dir: Optional[str] = None  # directory to save gradio temp files\n"
  },
  {
    "path": "src/config/base_config.py",
    "content": "# coding: utf-8\n\n\"\"\"\npretty printing class\n\"\"\"\n\nfrom __future__ import annotations\nimport os.path as osp\nfrom typing import Tuple\n\n\ndef make_abs_path(fn):\n    return osp.join(osp.dirname(osp.realpath(__file__)), fn)\n\n\nclass PrintableConfig:  # pylint: disable=too-few-public-methods\n    \"\"\"Printable Config defining str function\"\"\"\n\n    def __repr__(self):\n        lines = [self.__class__.__name__ + \":\"]\n        for key, val in vars(self).items():\n            if isinstance(val, Tuple):\n                flattened_val = \"[\"\n                for item in val:\n                    flattened_val += str(item) + \"\\n\"\n                flattened_val = flattened_val.rstrip(\"\\n\")\n                val = flattened_val + \"]\"\n            lines += f\"{key}: {str(val)}\".split(\"\\n\")\n        return \"\\n    \".join(lines)\n"
  },
  {
    "path": "src/config/crop_config.py",
    "content": "# coding: utf-8\n\n\"\"\"\nparameters used for crop faces\n\"\"\"\n\nfrom dataclasses import dataclass\n\nfrom .base_config import PrintableConfig, make_abs_path\n\n\n@dataclass(repr=False)  # use repr from PrintableConfig\nclass CropConfig(PrintableConfig):\n    insightface_root: str = make_abs_path(\"../../pretrained_weights/insightface\")\n    landmark_ckpt_path: str = make_abs_path(\"../../pretrained_weights/liveportrait/landmark.onnx\")\n    xpose_config_file_path: str = make_abs_path(\"../utils/dependencies/XPose/config_model/UniPose_SwinT.py\")\n    xpose_embedding_cache_path: str = make_abs_path('../utils/resources/clip_embedding')\n\n    xpose_ckpt_path: str = make_abs_path(\"../../pretrained_weights/liveportrait_animals/xpose.pth\")\n    device_id: int = 0  # gpu device id\n    flag_force_cpu: bool = False  # force cpu inference, WIP\n    det_thresh: float = 0.1 # detection threshold\n    ########## source image or video cropping option ##########\n    dsize: int = 512  # crop size\n    scale: float = 2.3  # scale factor\n    vx_ratio: float = 0  # vx ratio\n    vy_ratio: float = -0.125  # vy ratio +up, -down\n    max_face_num: int = 0  # max face number, 0 mean no limit\n    flag_do_rot: bool = True # whether to conduct the rotation when flag_do_crop is True\n    animal_face_type: str = \"animal_face_9\"  # animal_face_68 -> 68 landmark points, animal_face_9 -> 9 landmarks\n    ########## driving video auto cropping option ##########\n    scale_crop_driving_video: float = 2.2  # 2.0 # scale factor for cropping driving video\n    vx_ratio_crop_driving_video: float = 0.0  # adjust y offset\n    vy_ratio_crop_driving_video: float = -0.1  # adjust x offset\n    direction: str = \"large-small\"  # direction of cropping\n"
  },
  {
    "path": "src/config/inference_config.py",
    "content": "# coding: utf-8\n\n\"\"\"\nconfig dataclass used for inference\n\"\"\"\n\nimport cv2\nfrom numpy import ndarray\nimport pickle as pkl\nfrom dataclasses import dataclass, field\nfrom typing import Literal, Tuple\nfrom .base_config import PrintableConfig, make_abs_path\n\ndef load_lip_array():\n    with open(make_abs_path('../utils/resources/lip_array.pkl'), 'rb') as f:\n        return pkl.load(f)\n\n@dataclass(repr=False)  # use repr from PrintableConfig\nclass InferenceConfig(PrintableConfig):\n    # HUMAN MODEL CONFIG, NOT EXPORTED PARAMS\n    models_config: str = make_abs_path('./models.yaml')  # portrait animation config\n    checkpoint_F: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/appearance_feature_extractor.pth')  # path to checkpoint of F\n    checkpoint_M: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/motion_extractor.pth')  # path to checkpoint pf M\n    checkpoint_G: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/spade_generator.pth')  # path to checkpoint of G\n    checkpoint_W: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/warping_module.pth')  # path to checkpoint of W\n    checkpoint_S: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth')  # path to checkpoint to S and R_eyes, R_lip\n\n    # ANIMAL MODEL CONFIG, NOT EXPORTED PARAMS\n    # version_animals = \"\" # old version\n    version_animals = \"_v1.1\" # new (v1.1) version\n    checkpoint_F_animal: str = make_abs_path(f'../../pretrained_weights/liveportrait_animals/base_models{version_animals}/appearance_feature_extractor.pth')  # path to checkpoint of F\n    checkpoint_M_animal: str = make_abs_path(f'../../pretrained_weights/liveportrait_animals/base_models{version_animals}/motion_extractor.pth')  # path to checkpoint pf M\n    checkpoint_G_animal: str = 
make_abs_path(f'../../pretrained_weights/liveportrait_animals/base_models{version_animals}/spade_generator.pth')  # path to checkpoint of G\n    checkpoint_W_animal: str = make_abs_path(f'../../pretrained_weights/liveportrait_animals/base_models{version_animals}/warping_module.pth')  # path to checkpoint of W\n    checkpoint_S_animal: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth')  # path to checkpoint to S and R_eyes, R_lip, NOTE: use human temporarily!\n\n    # EXPORTED PARAMS\n    flag_use_half_precision: bool = True\n    flag_crop_driving_video: bool = False\n    device_id: int = 0\n    flag_normalize_lip: bool = True\n    flag_source_video_eye_retargeting: bool = False\n    flag_eye_retargeting: bool = False\n    flag_lip_retargeting: bool = False\n    flag_stitching: bool = True\n    flag_relative_motion: bool = True\n    flag_pasteback: bool = True\n    flag_do_crop: bool = True\n    flag_do_rot: bool = True\n    flag_force_cpu: bool = False\n    flag_do_torch_compile: bool = False\n    driving_option: str = \"pose-friendly\" # \"expression-friendly\" or \"pose-friendly\"\n    driving_multiplier: float = 1.0\n    driving_smooth_observation_variance: float = 3e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy\n    source_max_dim: int = 1280 # the max dim of height and width of source image or video\n    source_division: int = 2 # make sure the height and width of source image or video can be divided by this number\n    animation_region: Literal[\"exp\", \"pose\", \"lip\", \"eyes\", \"all\"] = \"all\" # the region where the animation was performed, \"exp\" means the expression, \"pose\" means the head pose\n\n    # NOT EXPORTED PARAMS\n    lip_normalize_threshold: float = 0.03 # threshold for flag_normalize_lip\n    
source_video_eye_retargeting_threshold: float = 0.18 # threshold for eyes retargeting if the input is a source video\n    anchor_frame: int = 0 # TO IMPLEMENT\n\n    input_shape: Tuple[int, int] = (256, 256)  # input shape\n    output_format: Literal['mp4', 'gif'] = 'mp4'  # output video format\n    crf: int = 15  # crf for output video\n    output_fps: int = 25 # default output fps\n\n    mask_crop: ndarray = field(default_factory=lambda: cv2.imread(make_abs_path('../utils/resources/mask_template.png'), cv2.IMREAD_COLOR))\n    lip_array: ndarray = field(default_factory=load_lip_array)\n    size_gif: int = 256 # default gif size, TO IMPLEMENT\n"
  },
  {
    "path": "src/config/models.yaml",
    "content": "model_params:\n  appearance_feature_extractor_params: # the F in the paper\n    image_channel: 3\n    block_expansion: 64\n    num_down_blocks: 2\n    max_features: 512\n    reshape_channel: 32\n    reshape_depth: 16\n    num_resblocks: 6\n  motion_extractor_params: # the M in the paper\n    num_kp: 21\n    backbone: convnextv2_tiny\n  warping_module_params: # the W in the paper\n    num_kp: 21\n    block_expansion: 64\n    max_features: 512\n    num_down_blocks: 2\n    reshape_channel: 32\n    estimate_occlusion_map: True\n    dense_motion_params:\n      block_expansion: 32\n      max_features: 1024\n      num_blocks: 5\n      reshape_depth: 16\n      compress: 4\n  spade_generator_params: # the G in the paper\n    upscale: 2 # represents upsample factor 256x256 -> 512x512\n    block_expansion: 64\n    max_features: 512\n    num_down_blocks: 2\n  stitching_retargeting_module_params: # the S in the paper\n    stitching:\n      input_size: 126 # (21*3)*2\n      hidden_sizes: [128, 128, 64]\n      output_size: 65 # (21*3)+2(tx,ty)\n    lip:\n      input_size: 65 # (21*3)+2\n      hidden_sizes: [128, 128, 64]\n      output_size: 63 # (21*3)\n    eye:\n      input_size: 66 # (21*3)+3\n      hidden_sizes: [256, 256, 128, 128, 64]\n      output_size: 63 # (21*3)\n"
  },
  {
    "path": "src/gradio_pipeline.py",
    "content": "# coding: utf-8\n\n\"\"\"\nPipeline for gradio\n\"\"\"\n\nimport os.path as osp\nimport os\nimport cv2\nfrom rich.progress import track\nimport gradio as gr\nimport numpy as np\nimport torch\n\nfrom .config.argument_config import ArgumentConfig\nfrom .live_portrait_pipeline import LivePortraitPipeline\nfrom .live_portrait_pipeline_animal import LivePortraitPipelineAnimal\nfrom .utils.io import load_img_online, load_video, resize_to_limit\nfrom .utils.filter import smooth\nfrom .utils.rprint import rlog as log\nfrom .utils.crop import prepare_paste_back, paste_back\nfrom .utils.camera import get_rotation_matrix\nfrom .utils.video import get_fps, has_audio_stream, concat_frames, images2video, add_audio_to_video\nfrom .utils.helper import is_square_video, mkdir, dct2device, basename\nfrom .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio\n\n\ndef update_args(args, user_args):\n    \"\"\"update the args according to user inputs\n    \"\"\"\n    for k, v in user_args.items():\n        if hasattr(args, k):\n            setattr(args, k, v)\n    return args\n\n\nclass GradioPipeline(LivePortraitPipeline):\n    \"\"\"gradio for human\n    \"\"\"\n\n    def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):\n        super().__init__(inference_cfg, crop_cfg)\n        # self.live_portrait_wrapper = self.live_portrait_wrapper\n        self.args = args\n\n    @torch.no_grad()\n    def update_delta_new_eyeball_direction(self, eyeball_direction_x, eyeball_direction_y, delta_new, **kwargs):\n        if eyeball_direction_x > 0:\n                delta_new[0, 11, 0] += eyeball_direction_x * 0.0007\n                delta_new[0, 15, 0] += eyeball_direction_x * 0.001\n        else:\n            delta_new[0, 11, 0] += eyeball_direction_x * 0.001\n            delta_new[0, 15, 0] += eyeball_direction_x * 0.0007\n\n        delta_new[0, 11, 1] += eyeball_direction_y * -0.001\n        delta_new[0, 15, 1] += eyeball_direction_y * -0.001\n     
   blink = -eyeball_direction_y / 2.\n\n        delta_new[0, 11, 1] += blink * -0.001\n        delta_new[0, 13, 1] += blink * 0.0003\n        delta_new[0, 15, 1] += blink * -0.001\n        delta_new[0, 16, 1] += blink * 0.0003\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_smile(self, smile, delta_new, **kwargs):\n        delta_new[0, 20, 1] += smile * -0.01\n        delta_new[0, 14, 1] += smile * -0.02\n        delta_new[0, 17, 1] += smile * 0.0065\n        delta_new[0, 17, 2] += smile * 0.003\n        delta_new[0, 13, 1] += smile * -0.00275\n        delta_new[0, 16, 1] += smile * -0.00275\n        delta_new[0, 3, 1] += smile * -0.0035\n        delta_new[0, 7, 1] += smile * -0.0035\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_wink(self, wink, delta_new, **kwargs):\n        delta_new[0, 11, 1] += wink * 0.001\n        delta_new[0, 13, 1] += wink * -0.0003\n        delta_new[0, 17, 0] += wink * 0.0003\n        delta_new[0, 17, 1] += wink * 0.0003\n        delta_new[0, 3, 1] += wink * -0.0003\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_eyebrow(self, eyebrow, delta_new, **kwargs):\n        if eyebrow > 0:\n            delta_new[0, 1, 1] += eyebrow * 0.001\n            delta_new[0, 2, 1] += eyebrow * -0.001\n        else:\n            delta_new[0, 1, 0] += eyebrow * -0.001\n            delta_new[0, 2, 0] += eyebrow * 0.001\n            delta_new[0, 1, 1] += eyebrow * 0.0003\n            delta_new[0, 2, 1] += eyebrow * -0.0003\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_lip_variation_zero(self, lip_variation_zero, delta_new, **kwargs):\n        delta_new[0, 19, 0] += lip_variation_zero\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_lip_variation_one(self, lip_variation_one, delta_new, **kwargs):\n        delta_new[0, 14, 1] += lip_variation_one * 0.001\n        delta_new[0, 3, 1] += lip_variation_one * -0.0005\n 
       delta_new[0, 7, 1] += lip_variation_one * -0.0005\n        delta_new[0, 17, 2] += lip_variation_one * -0.0005\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_lip_variation_two(self, lip_variation_two, delta_new, **kwargs):\n        delta_new[0, 20, 2] += lip_variation_two * -0.001\n        delta_new[0, 20, 1] += lip_variation_two * -0.001\n        delta_new[0, 14, 1] += lip_variation_two * -0.001\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_lip_variation_three(self, lip_variation_three, delta_new, **kwargs):\n        delta_new[0, 19, 1] += lip_variation_three * 0.001\n        delta_new[0, 19, 2] += lip_variation_three * 0.0001\n        delta_new[0, 17, 1] += lip_variation_three * -0.0001\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_mov_x(self, mov_x, delta_new, **kwargs):\n        delta_new[0, 5, 0] += mov_x\n\n        return delta_new\n\n    @torch.no_grad()\n    def update_delta_new_mov_y(self, mov_y, delta_new, **kwargs):\n        delta_new[0, 5, 1] += mov_y\n\n        return delta_new\n\n    @torch.no_grad()\n    def execute_video(\n        self,\n        input_source_image_path=None,\n        input_source_video_path=None,\n        input_driving_video_path=None,\n        input_driving_image_path=None,\n        input_driving_video_pickle_path=None,\n        flag_normalize_lip=False,\n        flag_relative_input=True,\n        flag_do_crop_input=True,\n        flag_remap_input=True,\n        flag_stitching_input=True,\n        animation_region=\"all\",\n        driving_option_input=\"pose-friendly\",\n        driving_multiplier=1.0,\n        flag_crop_driving_video_input=True,\n        # flag_video_editing_head_rotation=False,\n        scale=2.3,\n        vx_ratio=0.0,\n        vy_ratio=-0.125,\n        scale_crop_driving_video=2.2,\n        vx_ratio_crop_driving_video=0.0,\n        vy_ratio_crop_driving_video=-0.1,\n        
driving_smooth_observation_variance=3e-7,\n        tab_selection=None,\n        v_tab_selection=None\n    ):\n        \"\"\" for video-driven portrait animation or video editing\n        \"\"\"\n        if tab_selection == 'Image':\n            input_source_path = input_source_image_path\n        elif tab_selection == 'Video':\n            input_source_path = input_source_video_path\n        else:\n            input_source_path = input_source_image_path\n\n        if v_tab_selection == 'Video':\n            input_driving_path = input_driving_video_path\n        elif v_tab_selection == 'Image':\n            input_driving_path = input_driving_image_path\n        elif v_tab_selection == 'Pickle':\n            input_driving_path = input_driving_video_pickle_path\n        else:\n            input_driving_path = input_driving_video_path\n\n        if input_source_path is not None and input_driving_path is not None:\n            if osp.exists(input_driving_path) and v_tab_selection == 'Video' and not flag_crop_driving_video_input and is_square_video(input_driving_path) is False:\n                flag_crop_driving_video_input = True\n                log(\"The driving video is not square, it will be cropped to square automatically.\")\n                gr.Info(\"The driving video is not square, it will be cropped to square automatically.\", duration=2)\n\n            args_user = {\n                'source': input_source_path,\n                'driving': input_driving_path,\n                'flag_normalize_lip' : flag_normalize_lip,\n                'flag_relative_motion': flag_relative_input,\n                'flag_do_crop': flag_do_crop_input,\n                'flag_pasteback': flag_remap_input,\n                'flag_stitching': flag_stitching_input,\n                'animation_region': animation_region,\n                'driving_option': driving_option_input,\n                'driving_multiplier': driving_multiplier,\n                'flag_crop_driving_video': 
flag_crop_driving_video_input,\n                'scale': scale,\n                'vx_ratio': vx_ratio,\n                'vy_ratio': vy_ratio,\n                'scale_crop_driving_video': scale_crop_driving_video,\n                'vx_ratio_crop_driving_video': vx_ratio_crop_driving_video,\n                'vy_ratio_crop_driving_video': vy_ratio_crop_driving_video,\n                'driving_smooth_observation_variance': driving_smooth_observation_variance,\n            }\n            # update config from user input\n            self.args = update_args(self.args, args_user)\n            self.live_portrait_wrapper.update_config(self.args.__dict__)\n            self.cropper.update_config(self.args.__dict__)\n\n            output_path, output_path_concat = self.execute(self.args)\n            gr.Info(\"Run successfully!\", duration=2)\n            if output_path.endswith(\".jpg\"):\n                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), output_path, gr.update(visible=True), output_path_concat, gr.update(visible=True)\n            else:\n                return output_path, gr.update(visible=True), output_path_concat, gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)\n        else:\n            raise gr.Error(\"Please upload the source portrait or source video, and driving video 🤗🤗🤗\", duration=5)\n\n    @torch.no_grad()\n    def execute_image_retargeting(\n        self,\n        input_eye_ratio: float,\n        input_lip_ratio: float,\n        input_head_pitch_variation: float,\n        input_head_yaw_variation: float,\n        input_head_roll_variation: float,\n        mov_x: float,\n        mov_y: float,\n        mov_z: float,\n        lip_variation_zero: float,\n        lip_variation_one: float,\n        lip_variation_two: float,\n        lip_variation_three: float,\n        smile: float,\n        wink: float,\n        
eyebrow: float,\n        eyeball_direction_x: float,\n        eyeball_direction_y: float,\n        input_image,\n        retargeting_source_scale: float,\n        flag_stitching_retargeting_input=True,\n        flag_do_crop_input_retargeting_image=True):\n        \"\"\" for single image retargeting\n        \"\"\"\n        if input_head_pitch_variation is None or input_head_yaw_variation is None or input_head_roll_variation is None:\n            raise gr.Error(\"Invalid relative pose input 💥!\", duration=5)\n        # disposable feature\n        f_s_user, x_s_user, R_s_user, R_d_user, x_s_info, source_lmk_user, crop_M_c2o, mask_ori, img_rgb = \\\n            self.prepare_retargeting_image(\n                input_image, input_head_pitch_variation, input_head_yaw_variation, input_head_roll_variation, retargeting_source_scale, flag_do_crop=flag_do_crop_input_retargeting_image)\n\n        if input_eye_ratio is None or input_lip_ratio is None:\n            raise gr.Error(\"Invalid ratio input 💥!\", duration=5)\n        else:\n            device = self.live_portrait_wrapper.device\n            # inference_cfg = self.live_portrait_wrapper.inference_cfg\n            x_s_user = x_s_user.to(device)\n            f_s_user = f_s_user.to(device)\n            R_s_user = R_s_user.to(device)\n            R_d_user = R_d_user.to(device)\n            mov_x = torch.tensor(mov_x).to(device)\n            mov_y = torch.tensor(mov_y).to(device)\n            mov_z = torch.tensor(mov_z).to(device)\n            eyeball_direction_x = torch.tensor(eyeball_direction_x).to(device)\n            eyeball_direction_y = torch.tensor(eyeball_direction_y).to(device)\n            smile = torch.tensor(smile).to(device)\n            wink = torch.tensor(wink).to(device)\n            eyebrow = torch.tensor(eyebrow).to(device)\n            lip_variation_zero = torch.tensor(lip_variation_zero).to(device)\n            lip_variation_one = torch.tensor(lip_variation_one).to(device)\n            lip_variation_two 
= torch.tensor(lip_variation_two).to(device)\n            lip_variation_three = torch.tensor(lip_variation_three).to(device)\n\n            x_c_s = x_s_info['kp'].to(device)\n            delta_new = x_s_info['exp'].to(device)\n            scale_new = x_s_info['scale'].to(device)\n            t_new = x_s_info['t'].to(device)\n            R_d_new = (R_d_user @ R_s_user.permute(0, 2, 1)) @ R_s_user\n\n            if eyeball_direction_x != 0 or eyeball_direction_y != 0:\n                delta_new = self.update_delta_new_eyeball_direction(eyeball_direction_x, eyeball_direction_y, delta_new)\n            if smile != 0:\n                delta_new = self.update_delta_new_smile(smile, delta_new)\n            if wink != 0:\n                delta_new = self.update_delta_new_wink(wink, delta_new)\n            if eyebrow != 0:\n                delta_new = self.update_delta_new_eyebrow(eyebrow, delta_new)\n            if lip_variation_zero != 0:\n                delta_new = self.update_delta_new_lip_variation_zero(lip_variation_zero, delta_new)\n            if lip_variation_one !=  0:\n                delta_new = self.update_delta_new_lip_variation_one(lip_variation_one, delta_new)\n            if lip_variation_two != 0:\n                delta_new = self.update_delta_new_lip_variation_two(lip_variation_two, delta_new)\n            if lip_variation_three != 0:\n                delta_new = self.update_delta_new_lip_variation_three(lip_variation_three, delta_new)\n            if mov_x != 0:\n                delta_new = self.update_delta_new_mov_x(-mov_x, delta_new)\n            if mov_y !=0 :\n                delta_new = self.update_delta_new_mov_y(mov_y, delta_new)\n\n            x_d_new = mov_z * scale_new * (x_c_s @ R_d_new + delta_new) + t_new\n            eyes_delta, lip_delta = None, None\n            if input_eye_ratio != self.source_eye_ratio:\n                combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio([[float(input_eye_ratio)]], 
source_lmk_user)\n                eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s_user, combined_eye_ratio_tensor)\n            if input_lip_ratio != self.source_lip_ratio:\n                combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio([[float(input_lip_ratio)]], source_lmk_user)\n                lip_delta = self.live_portrait_wrapper.retarget_lip(x_s_user, combined_lip_ratio_tensor)\n                print(lip_delta)\n            x_d_new = x_d_new + \\\n                    (eyes_delta if eyes_delta is not None else 0) + \\\n                    (lip_delta if lip_delta is not None else 0)\n\n            if flag_stitching_retargeting_input:\n                x_d_new = self.live_portrait_wrapper.stitching(x_s_user, x_d_new)\n            out = self.live_portrait_wrapper.warp_decode(f_s_user, x_s_user, x_d_new)\n            out = self.live_portrait_wrapper.parse_output(out['out'])[0]\n            if flag_do_crop_input_retargeting_image:\n                out_to_ori_blend = paste_back(out, crop_M_c2o, img_rgb, mask_ori)\n            else:\n                out_to_ori_blend = out\n            return out, out_to_ori_blend\n\n    @torch.no_grad()\n    def prepare_retargeting_image(\n        self,\n        input_image,\n        input_head_pitch_variation, input_head_yaw_variation, input_head_roll_variation,\n        retargeting_source_scale,\n        flag_do_crop=True):\n        \"\"\" for single image retargeting\n        \"\"\"\n        if input_image is not None:\n            # gr.Info(\"Upload successfully!\", duration=2)\n            args_user = {'scale': retargeting_source_scale}\n            self.args = update_args(self.args, args_user)\n            self.cropper.update_config(self.args.__dict__)\n            inference_cfg = self.live_portrait_wrapper.inference_cfg\n            ######## process source portrait ########\n            img_rgb = load_img_online(input_image, mode='rgb', max_dim=1280, n=2)\n            if flag_do_crop:\n 
               crop_info = self.cropper.crop_source_image(img_rgb, self.cropper.crop_cfg)\n                I_s = self.live_portrait_wrapper.prepare_source(crop_info['img_crop_256x256'])\n                source_lmk_user = crop_info['lmk_crop']\n                crop_M_c2o = crop_info['M_c2o']\n                mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))\n            else:\n                I_s = self.live_portrait_wrapper.prepare_source(img_rgb)\n                source_lmk_user = self.cropper.calc_lmk_from_cropped_image(img_rgb)\n                crop_M_c2o = None\n                mask_ori = None\n            x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)\n            x_d_info_user_pitch = x_s_info['pitch'] + input_head_pitch_variation\n            x_d_info_user_yaw = x_s_info['yaw'] + input_head_yaw_variation\n            x_d_info_user_roll = x_s_info['roll'] + input_head_roll_variation\n            R_s_user = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])\n            R_d_user = get_rotation_matrix(x_d_info_user_pitch, x_d_info_user_yaw, x_d_info_user_roll)\n            ############################################\n            f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)\n            x_s_user = self.live_portrait_wrapper.transform_keypoint(x_s_info)\n            return f_s_user, x_s_user, R_s_user, R_d_user, x_s_info, source_lmk_user, crop_M_c2o, mask_ori, img_rgb\n        else:\n            raise gr.Error(\"Please upload a source portrait as the retargeting input 🤗🤗🤗\", duration=5)\n\n    @torch.no_grad()\n    def init_retargeting_image(self, retargeting_source_scale: float, source_eye_ratio: float, source_lip_ratio:float, input_image = None):\n        \"\"\" initialize the retargeting slider\n        \"\"\"\n        if input_image != None:\n            args_user = {'scale': retargeting_source_scale}\n            self.args = 
update_args(self.args, args_user)\n            self.cropper.update_config(self.args.__dict__)\n            # inference_cfg = self.live_portrait_wrapper.inference_cfg\n            ######## process source portrait ########\n            img_rgb = load_img_online(input_image, mode='rgb', max_dim=1280, n=16)\n            log(f\"Load source image from {input_image}.\")\n            crop_info = self.cropper.crop_source_image(img_rgb, self.cropper.crop_cfg)\n            if crop_info is None:\n                raise gr.Error(\"Source portrait NO face detected\", duration=2)\n            source_eye_ratio = calc_eye_close_ratio(crop_info['lmk_crop'][None])\n            source_lip_ratio = calc_lip_close_ratio(crop_info['lmk_crop'][None])\n            self.source_eye_ratio = round(float(source_eye_ratio.mean()), 2)\n            self.source_lip_ratio = round(float(source_lip_ratio[0][0]), 2)\n            log(\"Calculating eyes-open and lip-open ratios successfully!\")\n            return self.source_eye_ratio, self.source_lip_ratio\n        else:\n            return source_eye_ratio, source_lip_ratio\n\n    @torch.no_grad()\n    def execute_video_retargeting(self, input_lip_ratio: float, input_video, retargeting_source_scale: float, driving_smooth_observation_variance_retargeting: float, video_retargeting_silence=False, flag_do_crop_input_retargeting_video=True):\n        \"\"\" retargeting the lip-open ratio of each source frame\n        \"\"\"\n        # disposable feature\n        device = self.live_portrait_wrapper.device\n\n        if not video_retargeting_silence:\n            f_s_user_lst, x_s_user_lst, source_lmk_crop_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, lip_delta_retargeting_lst_smooth, source_fps, n_frames = \\\n                self.prepare_retargeting_video(input_video, retargeting_source_scale, device, input_lip_ratio, driving_smooth_observation_variance_retargeting, flag_do_crop=flag_do_crop_input_retargeting_video)\n            
if input_lip_ratio is None:\n                raise gr.Error(\"Invalid ratio input 💥!\", duration=5)\n            else:\n                inference_cfg = self.live_portrait_wrapper.inference_cfg\n\n                I_p_pstbk_lst = None\n                if flag_do_crop_input_retargeting_video:\n                    I_p_pstbk_lst = []\n                I_p_lst = []\n                for i in track(range(n_frames), description='Retargeting video...', total=n_frames):\n                    x_s_user_i = x_s_user_lst[i].to(device)\n                    f_s_user_i = f_s_user_lst[i].to(device)\n\n                    lip_delta_retargeting = lip_delta_retargeting_lst_smooth[i]\n                    x_d_i_new = x_s_user_i + lip_delta_retargeting\n                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s_user_i, x_d_i_new)\n                    out = self.live_portrait_wrapper.warp_decode(f_s_user_i, x_s_user_i, x_d_i_new)\n                    I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]\n                    I_p_lst.append(I_p_i)\n\n                    if flag_do_crop_input_retargeting_video:\n                        I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_lst[i])\n                        I_p_pstbk_lst.append(I_p_pstbk)\n        else:\n            inference_cfg = self.live_portrait_wrapper.inference_cfg\n            f_s_user_lst, x_s_user_lst, x_d_i_new_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, source_fps, n_frames = \\\n                self.prepare_video_lip_silence(input_video, device, flag_do_crop=flag_do_crop_input_retargeting_video)\n\n            I_p_pstbk_lst = None\n            if flag_do_crop_input_retargeting_video:\n                I_p_pstbk_lst = []\n            I_p_lst = []\n            for i in track(range(n_frames), description='Silencing lip...', total=n_frames):\n                x_s_user_i = x_s_user_lst[i].to(device)\n                f_s_user_i = 
f_s_user_lst[i].to(device)\n                x_d_i_new = x_d_i_new_lst[i]\n                x_d_i_new = self.live_portrait_wrapper.stitching(x_s_user_i, x_d_i_new)\n                out = self.live_portrait_wrapper.warp_decode(f_s_user_i, x_s_user_i, x_d_i_new)\n                I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]\n                I_p_lst.append(I_p_i)\n\n                if flag_do_crop_input_retargeting_video:\n                    I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_lst[i])\n                    I_p_pstbk_lst.append(I_p_pstbk)\n\n        mkdir(self.args.output_dir)\n        flag_source_has_audio = has_audio_stream(input_video)\n\n        ######### build the final concatenation result #########\n        # source frame | generation\n        frames_concatenated = concat_frames(driving_image_lst=None, source_image_lst=img_crop_256x256_lst, I_p_lst=I_p_lst)\n        wfp_concat = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting_concat.mp4')\n        images2video(frames_concatenated, wfp=wfp_concat, fps=source_fps)\n\n        if flag_source_has_audio:\n            # final result with concatenation\n            wfp_concat_with_audio = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting_concat_with_audio.mp4')\n            add_audio_to_video(wfp_concat, input_video, wfp_concat_with_audio)\n            os.replace(wfp_concat_with_audio, wfp_concat)\n            log(f\"Replace {wfp_concat_with_audio} with {wfp_concat}\")\n\n        # save the animated result\n        wfp = osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting.mp4')\n        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:\n            images2video(I_p_pstbk_lst, wfp=wfp, fps=source_fps)\n        else:\n            images2video(I_p_lst, wfp=wfp, fps=source_fps)\n\n        ######### build the final result #########\n        if flag_source_has_audio:\n            wfp_with_audio = 
osp.join(self.args.output_dir, f'{basename(input_video)}_retargeting_with_audio.mp4')\n            add_audio_to_video(wfp, input_video, wfp_with_audio)\n            os.replace(wfp_with_audio, wfp)\n            log(f\"Replace {wfp_with_audio} with {wfp}\")\n        gr.Info(\"Run successfully!\", duration=2)\n        return wfp_concat, wfp\n\n    @torch.no_grad()\n    def prepare_retargeting_video(self, input_video, retargeting_source_scale, device, input_lip_ratio, driving_smooth_observation_variance_retargeting, flag_do_crop=True):\n        \"\"\" for video retargeting\n        \"\"\"\n        if input_video is not None:\n            # gr.Info(\"Upload successfully!\", duration=2)\n            args_user = {'scale': retargeting_source_scale}\n            self.args = update_args(self.args, args_user)\n            self.cropper.update_config(self.args.__dict__)\n            inference_cfg = self.live_portrait_wrapper.inference_cfg\n            ######## process source video ########\n            source_rgb_lst = load_video(input_video)\n            source_rgb_lst = [resize_to_limit(img, inference_cfg.source_max_dim, inference_cfg.source_division) for img in source_rgb_lst]\n            source_fps = int(get_fps(input_video))\n            n_frames = len(source_rgb_lst)\n            log(f\"Load source video from {input_video}. 
FPS is {source_fps}\")\n\n            if flag_do_crop:\n                ret_s = self.cropper.crop_source_video(source_rgb_lst, self.cropper.crop_cfg)\n                log(f'Source video is cropped, {len(ret_s[\"frame_crop_lst\"])} frames are processed.')\n                if len(ret_s[\"frame_crop_lst\"]) != n_frames:\n                    n_frames = min(len(source_rgb_lst), len(ret_s[\"frame_crop_lst\"]))\n                img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']\n                mask_ori_lst = [prepare_paste_back(inference_cfg.mask_crop, source_M_c2o, dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0])) for source_M_c2o in source_M_c2o_lst]\n            else:\n                source_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(source_rgb_lst)\n                img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in source_rgb_lst]  # force to resize to 256x256\n                source_M_c2o_lst, mask_ori_lst = None, None\n\n            c_s_eyes_lst, c_s_lip_lst = self.live_portrait_wrapper.calc_ratio(source_lmk_crop_lst)\n            # save the motion template\n            I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)\n            source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)\n\n            c_d_lip_retargeting = [input_lip_ratio]\n            f_s_user_lst, x_s_user_lst, lip_delta_retargeting_lst = [], [], []\n            for i in track(range(n_frames), description='Preparing retargeting video...', total=n_frames):\n                x_s_info = source_template_dct['motion'][i]\n                x_s_info = dct2device(x_s_info, device)\n                x_s_user = x_s_info['x_s']\n\n                source_lmk = source_lmk_crop_lst[i]\n                img_crop_256x256 = img_crop_256x256_lst[i]\n                I_s = I_s_lst[i]\n                f_s_user = 
self.live_portrait_wrapper.extract_feature_3d(I_s)\n\n                combined_lip_ratio_tensor_retargeting = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_retargeting, source_lmk)\n                lip_delta_retargeting = self.live_portrait_wrapper.retarget_lip(x_s_user, combined_lip_ratio_tensor_retargeting)\n                f_s_user_lst.append(f_s_user); x_s_user_lst.append(x_s_user); lip_delta_retargeting_lst.append(lip_delta_retargeting.cpu().numpy().astype(np.float32))\n            lip_delta_retargeting_lst_smooth = smooth(lip_delta_retargeting_lst, lip_delta_retargeting_lst[0].shape, device, driving_smooth_observation_variance_retargeting)\n\n            return f_s_user_lst, x_s_user_lst, source_lmk_crop_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, lip_delta_retargeting_lst_smooth, source_fps, n_frames\n        else:\n            # when press the clear button, go here\n            raise gr.Error(\"Please upload a source video as the retargeting input 🤗🤗🤗\", duration=5)\n\n    @torch.no_grad()\n    def prepare_video_lip_silence(self, input_video, device, flag_do_crop=True):\n        \"\"\" for keeping lips in the source video silent\n        \"\"\"\n        if input_video is not None:\n            inference_cfg = self.live_portrait_wrapper.inference_cfg\n            ######## process source video ########\n            source_rgb_lst = load_video(input_video)\n            source_rgb_lst = [resize_to_limit(img, inference_cfg.source_max_dim, inference_cfg.source_division) for img in source_rgb_lst]\n            source_fps = int(get_fps(input_video))\n            n_frames = len(source_rgb_lst)\n            log(f\"Load source video from {input_video}. 
FPS is {source_fps}\")\n\n            if flag_do_crop:\n                ret_s = self.cropper.crop_source_video(source_rgb_lst, self.cropper.crop_cfg)\n                log(f'Source video is cropped, {len(ret_s[\"frame_crop_lst\"])} frames are processed.')\n                if len(ret_s[\"frame_crop_lst\"]) != n_frames:\n                    n_frames = min(len(source_rgb_lst), len(ret_s[\"frame_crop_lst\"]))\n                img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']\n                mask_ori_lst = [prepare_paste_back(inference_cfg.mask_crop, source_M_c2o, dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0])) for source_M_c2o in source_M_c2o_lst]\n            else:\n                source_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(source_rgb_lst)\n                img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in source_rgb_lst]  # force to resize to 256x256\n                source_M_c2o_lst, mask_ori_lst = None, None\n\n            c_s_eyes_lst, c_s_lip_lst = self.live_portrait_wrapper.calc_ratio(source_lmk_crop_lst)\n            # save the motion template\n            I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)\n            source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)\n\n            f_s_user_lst, x_s_user_lst, x_d_i_new_lst = [], [], []\n            for i in track(range(n_frames), description='Preparing silencing lip...', total=n_frames):\n                x_s_info = source_template_dct['motion'][i]\n                x_s_info = dct2device(x_s_info, device)\n                scale_s = x_s_info['scale']\n                x_s_user = x_s_info['x_s']\n                x_c_s = x_s_info['kp']\n                R_s = x_s_info['R']\n                t_s = x_s_info['t']\n                delta_new = torch.zeros_like(x_s_info['exp']) + 
torch.from_numpy(inference_cfg.lip_array).to(dtype=torch.float32, device=device)\n                for eyes_idx in [11, 13, 15, 16, 18]:\n                    delta_new[:, eyes_idx, :] = x_s_info['exp'][:, eyes_idx, :]\n                source_lmk = source_lmk_crop_lst[i]\n                img_crop_256x256 = img_crop_256x256_lst[i]\n                I_s = I_s_lst[i]\n                f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)\n                x_d_i_new = scale_s * (x_c_s @ R_s + delta_new) + t_s\n                f_s_user_lst.append(f_s_user); x_s_user_lst.append(x_s_user); x_d_i_new_lst.append(x_d_i_new)\n            return f_s_user_lst, x_s_user_lst, x_d_i_new_lst, source_M_c2o_lst, mask_ori_lst, source_rgb_lst, img_crop_256x256_lst, source_fps, n_frames\n        else:\n            # when press the clear button, go here\n            raise gr.Error(\"Please upload a source video as the input 🤗🤗🤗\", duration=5)\n\nclass GradioPipelineAnimal(LivePortraitPipelineAnimal):\n    \"\"\"gradio for animal\n    \"\"\"\n    def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):\n        inference_cfg.flag_crop_driving_video = True # ensure the face_analysis_wrapper is enabled\n        super().__init__(inference_cfg, crop_cfg)\n        # self.live_portrait_wrapper_animal = self.live_portrait_wrapper_animal\n        self.args = args\n\n    @torch.no_grad()\n    def execute_video(\n        self,\n        input_source_image_path=None,\n        input_driving_video_path=None,\n        input_driving_video_pickle_path=None,\n        flag_do_crop_input=False,\n        flag_remap_input=False,\n        driving_multiplier=1.0,\n        flag_stitching=False,\n        flag_crop_driving_video_input=False,\n        scale=2.3,\n        vx_ratio=0.0,\n        vy_ratio=-0.125,\n        scale_crop_driving_video=2.2,\n        vx_ratio_crop_driving_video=0.0,\n        vy_ratio_crop_driving_video=-0.1,\n        tab_selection=None,\n    ):\n        \"\"\" for video-driven 
portrait animation\n        \"\"\"\n        input_source_path = input_source_image_path\n\n        if tab_selection == 'Video':\n            input_driving_path = input_driving_video_path\n        elif tab_selection == 'Pickle':\n            input_driving_path = input_driving_video_pickle_path\n        else:\n            input_driving_path = input_driving_video_pickle_path\n\n        if input_source_path is not None and input_driving_path is not None:\n            if osp.exists(input_driving_path) and tab_selection == 'Video' and is_square_video(input_driving_path) is False:\n                flag_crop_driving_video_input = True\n                log(\"The driving video is not square, it will be cropped to square automatically.\")\n                gr.Info(\"The driving video is not square, it will be cropped to square automatically.\", duration=2)\n\n            args_user = {\n                'source': input_source_path,\n                'driving': input_driving_path,\n                'flag_do_crop': flag_do_crop_input,\n                'flag_pasteback': flag_remap_input,\n                'driving_multiplier': driving_multiplier,\n                'flag_stitching': flag_stitching,\n                'flag_crop_driving_video': flag_crop_driving_video_input,\n                'scale': scale,\n                'vx_ratio': vx_ratio,\n                'vy_ratio': vy_ratio,\n                'scale_crop_driving_video': scale_crop_driving_video,\n                'vx_ratio_crop_driving_video': vx_ratio_crop_driving_video,\n                'vy_ratio_crop_driving_video': vy_ratio_crop_driving_video,\n            }\n            # update config from user input\n            self.args = update_args(self.args, args_user)\n            self.live_portrait_wrapper_animal.update_config(self.args.__dict__)\n            self.cropper.update_config(self.args.__dict__)\n            # video driven animation\n            video_path, video_path_concat, video_gif_path = self.execute(self.args)\n          
  gr.Info(\"Run successfully!\", duration=2)\n            return video_path, video_path_concat, video_gif_path\n        else:\n            raise gr.Error(\"Please upload the source animal image, and driving video 🤗🤗🤗\", duration=5)\n"
  },
  {
    "path": "src/live_portrait_pipeline.py",
    "content": "# coding: utf-8\n\n\"\"\"\nPipeline of LivePortrait (Human)\n\"\"\"\n\nimport torch\ntorch.backends.cudnn.benchmark = True # disable CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR warning\n\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)\nimport numpy as np\nimport os\nimport os.path as osp\nfrom rich.progress import track\n\nfrom .config.argument_config import ArgumentConfig\nfrom .config.inference_config import InferenceConfig\nfrom .config.crop_config import CropConfig\nfrom .utils.cropper import Cropper\nfrom .utils.camera import get_rotation_matrix\nfrom .utils.video import images2video, concat_frames, get_fps, add_audio_to_video, has_audio_stream\nfrom .utils.crop import prepare_paste_back, paste_back\nfrom .utils.io import load_image_rgb, load_video, resize_to_limit, dump, load\nfrom .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image, is_square_video, calc_motion_multiplier\nfrom .utils.filter import smooth\nfrom .utils.rprint import rlog as log\n# from .utils.viz import viz_lmk\nfrom .live_portrait_wrapper import LivePortraitWrapper\n\n\ndef make_abs_path(fn):\n    return osp.join(osp.dirname(osp.realpath(__file__)), fn)\n\n\nclass LivePortraitPipeline(object):\n\n    def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):\n        self.live_portrait_wrapper: LivePortraitWrapper = LivePortraitWrapper(inference_cfg=inference_cfg)\n        self.cropper: Cropper = Cropper(crop_cfg=crop_cfg)\n\n    def make_motion_template(self, I_lst, c_eyes_lst, c_lip_lst, **kwargs):\n        n_frames = I_lst.shape[0]\n        template_dct = {\n            'n_frames': n_frames,\n            'output_fps': kwargs.get('output_fps', 25),\n            'motion': [],\n            'c_eyes_lst': [],\n            'c_lip_lst': [],\n        }\n\n        for i in track(range(n_frames), description='Making motion templates...', total=n_frames):\n            # collect s, R, δ and t for inference\n        
    I_i = I_lst[i]\n            x_i_info = self.live_portrait_wrapper.get_kp_info(I_i)\n            x_s = self.live_portrait_wrapper.transform_keypoint(x_i_info)\n            R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])\n\n            item_dct = {\n                'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),\n                'R': R_i.cpu().numpy().astype(np.float32),\n                'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),\n                't': x_i_info['t'].cpu().numpy().astype(np.float32),\n                'kp': x_i_info['kp'].cpu().numpy().astype(np.float32),\n                'x_s': x_s.cpu().numpy().astype(np.float32),\n            }\n\n            template_dct['motion'].append(item_dct)\n\n            c_eyes = c_eyes_lst[i].astype(np.float32)\n            template_dct['c_eyes_lst'].append(c_eyes)\n\n            c_lip = c_lip_lst[i].astype(np.float32)\n            template_dct['c_lip_lst'].append(c_lip)\n\n        return template_dct\n\n    def execute(self, args: ArgumentConfig):\n        # for convenience\n        inf_cfg = self.live_portrait_wrapper.inference_cfg\n        device = self.live_portrait_wrapper.device\n        crop_cfg = self.cropper.crop_cfg\n\n        ######## load source input ########\n        flag_is_source_video = False\n        source_fps = None\n        if is_image(args.source):\n            flag_is_source_video = False\n            img_rgb = load_image_rgb(args.source)\n            img_rgb = resize_to_limit(img_rgb, inf_cfg.source_max_dim, inf_cfg.source_division)\n            log(f\"Load source image from {args.source}\")\n            source_rgb_lst = [img_rgb]\n        elif is_video(args.source):\n            flag_is_source_video = True\n            source_rgb_lst = load_video(args.source)\n            source_rgb_lst = [resize_to_limit(img, inf_cfg.source_max_dim, inf_cfg.source_division) for img in source_rgb_lst]\n            source_fps = int(get_fps(args.source))\n 
           log(f\"Load source video from {args.source}, FPS is {source_fps}\")\n        else:  # source input is an unknown format\n            raise Exception(f\"Unknown source format: {args.source}\")\n\n        ######## process driving info ########\n        flag_load_from_template = is_template(args.driving)\n        driving_rgb_crop_256x256_lst = None\n        wfp_template = None\n\n        if flag_load_from_template:\n            # NOTE: load from template, it is fast, but the cropping video is None\n            log(f\"Load from template: {args.driving}, NOT the video, so the cropping video and audio are both NULL.\", style='bold green')\n            driving_template_dct = load(args.driving)\n            c_d_eyes_lst = driving_template_dct['c_eyes_lst'] if 'c_eyes_lst' in driving_template_dct.keys() else driving_template_dct['c_d_eyes_lst'] # compatible with previous keys\n            c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst']\n            driving_n_frames = driving_template_dct['n_frames']\n            flag_is_driving_video = True if driving_n_frames > 1 else False\n            if flag_is_source_video and flag_is_driving_video:\n                n_frames = min(len(source_rgb_lst), driving_n_frames)  # minimum number as the number of the animated frames\n            elif flag_is_source_video and not flag_is_driving_video:\n                n_frames = len(source_rgb_lst)\n            else:\n                n_frames = driving_n_frames\n\n            # set output_fps\n            output_fps = driving_template_dct.get('output_fps', inf_cfg.output_fps)\n            log(f'The FPS of template: {output_fps}')\n\n            if args.flag_crop_driving_video:\n                log(\"Warning: flag_crop_driving_video is True, but the driving info is a template, so it is ignored.\")\n\n        elif osp.exists(args.driving):\n            if is_video(args.driving):\n                
flag_is_driving_video = True\n                # load from video file, AND make motion template\n                output_fps = int(get_fps(args.driving))\n                log(f\"Load driving video from: {args.driving}, FPS is {output_fps}\")\n                driving_rgb_lst = load_video(args.driving)\n            elif is_image(args.driving):\n                flag_is_driving_video = False\n                driving_img_rgb = load_image_rgb(args.driving)\n                output_fps = 25\n                log(f\"Load driving image from {args.driving}\")\n                driving_rgb_lst = [driving_img_rgb]\n            else:\n                raise Exception(f\"{args.driving} is not a supported type!\")\n            ######## make motion template ########\n            log(\"Start making driving motion template...\")\n            driving_n_frames = len(driving_rgb_lst)\n            if flag_is_source_video and flag_is_driving_video:\n                n_frames = min(len(source_rgb_lst), driving_n_frames)  # minimum number as the number of the animated frames\n                driving_rgb_lst = driving_rgb_lst[:n_frames]\n            elif flag_is_source_video and not flag_is_driving_video:\n                n_frames = len(source_rgb_lst)\n            else:\n                n_frames = driving_n_frames\n            if inf_cfg.flag_crop_driving_video or (not is_square_video(args.driving)):\n                ret_d = self.cropper.crop_driving_video(driving_rgb_lst)\n                log(f'Driving video is cropped, {len(ret_d[\"frame_crop_lst\"])} frames are processed.')\n                if len(ret_d[\"frame_crop_lst\"]) is not n_frames and flag_is_driving_video:\n                    n_frames = min(n_frames, len(ret_d[\"frame_crop_lst\"]))\n                driving_rgb_crop_lst, driving_lmk_crop_lst = ret_d['frame_crop_lst'], ret_d['lmk_crop_lst']\n                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_crop_lst]\n            else:\n                
driving_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(driving_rgb_lst)\n                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256\n            #######################################\n\n            c_d_eyes_lst, c_d_lip_lst = self.live_portrait_wrapper.calc_ratio(driving_lmk_crop_lst)\n            # save the motion template\n            I_d_lst = self.live_portrait_wrapper.prepare_videos(driving_rgb_crop_256x256_lst)\n            driving_template_dct = self.make_motion_template(I_d_lst, c_d_eyes_lst, c_d_lip_lst, output_fps=output_fps)\n\n            wfp_template = remove_suffix(args.driving) + '.pkl'\n            dump(wfp_template, driving_template_dct)\n            log(f\"Dump motion template to {wfp_template}\")\n        else:\n            raise Exception(f\"{args.driving} does not exist!\")\n        if not flag_is_driving_video:\n            c_d_eyes_lst = c_d_eyes_lst*n_frames\n            c_d_lip_lst = c_d_lip_lst*n_frames\n\n        ######## prepare for pasteback ########\n        I_p_pstbk_lst = None\n        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:\n            I_p_pstbk_lst = []\n            log(\"Prepared pasteback mask done.\")\n\n        I_p_lst = []\n        R_d_0, x_d_0_info = None, None\n        flag_normalize_lip = inf_cfg.flag_normalize_lip  # not overwrite\n        flag_source_video_eye_retargeting = inf_cfg.flag_source_video_eye_retargeting  # not overwrite\n        lip_delta_before_animation, eye_delta_before_animation = None, None\n\n        ######## process source info ########\n        if flag_is_source_video:\n            log(f\"Start making source motion template...\")\n\n            source_rgb_lst = source_rgb_lst[:n_frames]\n            if inf_cfg.flag_do_crop:\n                ret_s = self.cropper.crop_source_video(source_rgb_lst, crop_cfg)\n                log(f'Source video is cropped, 
{len(ret_s[\"frame_crop_lst\"])} frames are processed.')\n                if len(ret_s[\"frame_crop_lst\"]) is not n_frames:\n                    n_frames = min(n_frames, len(ret_s[\"frame_crop_lst\"]))\n                img_crop_256x256_lst, source_lmk_crop_lst, source_M_c2o_lst = ret_s['frame_crop_lst'], ret_s['lmk_crop_lst'], ret_s['M_c2o_lst']\n            else:\n                source_lmk_crop_lst = self.cropper.calc_lmks_from_cropped_video(source_rgb_lst)\n                img_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in source_rgb_lst]  # force to resize to 256x256\n\n            c_s_eyes_lst, c_s_lip_lst = self.live_portrait_wrapper.calc_ratio(source_lmk_crop_lst)\n            # save the motion template\n            I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)\n            source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)\n\n            key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'  # compatible with previous keys\n            if inf_cfg.flag_relative_motion:\n                if flag_is_driving_video:\n                    x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]\n                    x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)\n                else:\n                    x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + (driving_template_dct['motion'][0]['exp'] - inf_cfg.lip_array) for i in range(n_frames)]\n                    x_d_exp_lst_smooth = [torch.tensor(x_d_exp[0], dtype=torch.float32, device=device) for x_d_exp in x_d_exp_lst]\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"pose\":\n                    if flag_is_driving_video:\n                        x_d_r_lst = 
[(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]\n                        x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)\n                    else:\n                        x_d_r_lst = [source_template_dct['motion'][i]['R'] for i in range(n_frames)]\n                        x_d_r_lst_smooth = [torch.tensor(x_d_r[0], dtype=torch.float32, device=device) for x_d_r in x_d_r_lst]\n            else:\n                if flag_is_driving_video:\n                    x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]\n                    x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)\n                else:\n                    x_d_exp_lst = [driving_template_dct['motion'][0]['exp']]\n                    x_d_exp_lst_smooth = [torch.tensor(x_d_exp[0], dtype=torch.float32, device=device) for x_d_exp in x_d_exp_lst]*n_frames\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"pose\":\n                    if flag_is_driving_video:\n                        x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]\n                        x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)\n                    else:\n                        x_d_r_lst = [driving_template_dct['motion'][0][key_r]]\n                        x_d_r_lst_smooth = [torch.tensor(x_d_r[0], dtype=torch.float32, device=device) for x_d_r in x_d_r_lst]*n_frames\n\n        else:  # if the input is a source image, process it only once\n            if inf_cfg.flag_do_crop:\n                crop_info = 
self.cropper.crop_source_image(source_rgb_lst[0], crop_cfg)\n                if crop_info is None:\n                    raise Exception(\"No face detected in the source image!\")\n                source_lmk = crop_info['lmk_crop']\n                img_crop_256x256 = crop_info['img_crop_256x256']\n            else:\n                source_lmk = self.cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])\n                img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256))  # force to resize to 256x256\n            I_s = self.live_portrait_wrapper.prepare_source(img_crop_256x256)\n            x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)\n            x_c_s = x_s_info['kp']\n            R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])\n            f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)\n            x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)\n\n            # let lip-open scalar to be 0 at first\n            if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:\n                c_d_lip_before_animation = [0.]\n                combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)\n                if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:\n                    lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)\n\n            if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:\n                mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))\n\n        ######## animate ########\n        if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):\n            log(f\"The animated video consists of {n_frames} frames.\")\n        else:\n    
        log(f\"The output of image-driven portrait animation is an image.\")\n        for i in track(range(n_frames), description='🚀Animating...', total=n_frames):\n            if flag_is_source_video:  # source video\n                x_s_info = source_template_dct['motion'][i]\n                x_s_info = dct2device(x_s_info, device)\n\n                source_lmk = source_lmk_crop_lst[i]\n                img_crop_256x256 = img_crop_256x256_lst[i]\n                I_s = I_s_lst[i]\n                f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)\n\n                x_c_s = x_s_info['kp']\n                R_s = x_s_info['R']\n                x_s =x_s_info['x_s']\n\n                # let lip-open scalar to be 0 at first if the input is a video\n                if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:\n                    c_d_lip_before_animation = [0.]\n                    combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)\n                    if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:\n                        lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)\n                    else:\n                        lip_delta_before_animation = None\n\n                # let eye-open scalar to be the same as the first frame if the latter is eye-open state\n                if flag_source_video_eye_retargeting and source_lmk is not None:\n                    if i == 0:\n                        combined_eye_ratio_tensor_frame_zero = c_s_eyes_lst[0]\n                        c_d_eye_before_animation_frame_zero = [[combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]\n                        if c_d_eye_before_animation_frame_zero[0][0] < inf_cfg.source_video_eye_retargeting_threshold:\n                            
c_d_eye_before_animation_frame_zero = [[0.39]]\n                    combined_eye_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eye_before_animation_frame_zero, source_lmk)\n                    eye_delta_before_animation = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)\n\n                if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:  # prepare for paste back\n                    mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, source_M_c2o_lst[i], dsize=(source_rgb_lst[i].shape[1], source_rgb_lst[i].shape[0]))\n            if flag_is_source_video and not flag_is_driving_video:\n                x_d_i_info = driving_template_dct['motion'][0]\n            else:\n                x_d_i_info = driving_template_dct['motion'][i]\n            x_d_i_info = dct2device(x_d_i_info, device)\n            R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys\n\n            if i == 0:  # cache the first frame\n                R_d_0 = R_d_i\n                x_d_0_info = x_d_i_info.copy()\n\n            delta_new = x_s_info['exp'].clone()\n            if inf_cfg.flag_relative_motion:\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"pose\":\n                    R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s\n                else:\n                    R_new = R_s\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"exp\":\n                    if flag_is_source_video:\n                        for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:\n                            delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :]\n                        delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1]\n                        delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2]\n                     
   delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2]\n                        delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:]\n                    else:\n                        if flag_is_driving_video:\n                            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])\n                        else:\n                            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(inf_cfg.lip_array).to(dtype=torch.float32, device=device))\n                elif inf_cfg.animation_region == \"lip\":\n                    for lip_idx in [6, 12, 14, 17, 19, 20]:\n                        if flag_is_source_video:\n                            delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :]\n                        elif flag_is_driving_video:\n                            delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]\n                        else:\n                            delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(inf_cfg.lip_array).to(dtype=torch.float32, device=device)))[:, lip_idx, :]\n                elif inf_cfg.animation_region == \"eyes\":\n                    for eyes_idx in [11, 13, 15, 16, 18]:\n                        if flag_is_source_video:\n                            delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :]\n                        elif flag_is_driving_video:\n                            delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]\n                        else:\n                            delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - 0))[:, eyes_idx, :]\n                if inf_cfg.animation_region == \"all\":\n                    scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])\n                else:\n                    
scale_new = x_s_info['scale']\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"pose\":\n                    t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])\n                else:\n                    t_new = x_s_info['t']\n            else:\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"pose\":\n                    R_new = x_d_r_lst_smooth[i] if flag_is_source_video else R_d_i\n                else:\n                    R_new = R_s\n                if inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"exp\":\n                    for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:\n                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :] if flag_is_source_video else x_d_i_info['exp'][:, idx, :]\n                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1] if flag_is_source_video else x_d_i_info['exp'][:, 3:5, 1]\n                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2] if flag_is_source_video else x_d_i_info['exp'][:, 5, 2]\n                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2] if flag_is_source_video else x_d_i_info['exp'][:, 8, 2]\n                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:] if flag_is_source_video else x_d_i_info['exp'][:, 9, 1:]\n                elif inf_cfg.animation_region == \"lip\":\n                    for lip_idx in [6, 12, 14, 17, 19, 20]:\n                        delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, lip_idx, :]\n                elif inf_cfg.animation_region == \"eyes\":\n                    for eyes_idx in [11, 13, 15, 16, 18]:\n                        delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, eyes_idx, :]\n                scale_new = x_s_info['scale']\n                if 
inf_cfg.animation_region == \"all\" or inf_cfg.animation_region == \"pose\":\n                    t_new = x_d_i_info['t']\n                else:\n                    t_new = x_s_info['t']\n\n            t_new[..., 2].fill_(0)  # zero tz\n            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new\n\n            if inf_cfg.flag_relative_motion and inf_cfg.driving_option == \"expression-friendly\" and not flag_is_source_video and flag_is_driving_video:\n                if i == 0:\n                    x_d_0_new = x_d_i_new\n                    motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)\n                    # motion_multiplier *= inf_cfg.driving_multiplier\n                x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier\n                x_d_i_new = x_d_diff + x_s\n\n            # Algorithm 1:\n            if not inf_cfg.flag_stitching and not inf_cfg.flag_eye_retargeting and not inf_cfg.flag_lip_retargeting:\n                # without stitching or retargeting\n                if flag_normalize_lip and lip_delta_before_animation is not None:\n                    x_d_i_new += lip_delta_before_animation\n                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:\n                    x_d_i_new += eye_delta_before_animation\n                else:\n                    pass\n            elif inf_cfg.flag_stitching and not inf_cfg.flag_eye_retargeting and not inf_cfg.flag_lip_retargeting:\n                # with stitching and without retargeting\n                if flag_normalize_lip and lip_delta_before_animation is not None:\n                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new) + lip_delta_before_animation\n                else:\n                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)\n                if flag_source_video_eye_retargeting and eye_delta_before_animation is not None:\n                    x_d_i_new += eye_delta_before_animation\n  
          else:\n                eyes_delta, lip_delta = None, None\n                if inf_cfg.flag_eye_retargeting and source_lmk is not None:\n                    c_d_eyes_i = c_d_eyes_lst[i]\n                    combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)\n                    # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)\n                    eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor)\n                if inf_cfg.flag_lip_retargeting and source_lmk is not None:\n                    c_d_lip_i = c_d_lip_lst[i]\n                    combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_i, source_lmk)\n                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)\n                    lip_delta = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor)\n\n                if inf_cfg.flag_relative_motion:  # use x_s\n                    x_d_i_new = x_s + \\\n                        (eyes_delta if eyes_delta is not None else 0) + \\\n                        (lip_delta if lip_delta is not None else 0)\n                else:  # use x_d,i\n                    x_d_i_new = x_d_i_new + \\\n                        (eyes_delta if eyes_delta is not None else 0) + \\\n                        (lip_delta if lip_delta is not None else 0)\n\n                if inf_cfg.flag_stitching:\n                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)\n\n            x_d_i_new = x_s + (x_d_i_new - x_s) * inf_cfg.driving_multiplier\n            out = self.live_portrait_wrapper.warp_decode(f_s, x_s, x_d_i_new)\n            I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]\n            I_p_lst.append(I_p_i)\n\n            if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:\n                # TODO: the paste back procedure is slow, considering optimize it using multi-threading or GPU\n  
              if flag_is_source_video:\n                    I_p_pstbk = paste_back(I_p_i, source_M_c2o_lst[i], source_rgb_lst[i], mask_ori_float)\n                else:\n                    I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], source_rgb_lst[0], mask_ori_float)\n                I_p_pstbk_lst.append(I_p_pstbk)\n\n        mkdir(args.output_dir)\n        wfp_concat = None\n        ######### build the final concatenation result #########\n        # driving frame | source frame | generation\n        if flag_is_source_video and flag_is_driving_video:\n            frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, img_crop_256x256_lst, I_p_lst)\n        elif flag_is_source_video and not flag_is_driving_video:\n            if flag_load_from_template:\n                frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, img_crop_256x256_lst, I_p_lst)\n            else:\n                frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst*n_frames, img_crop_256x256_lst, I_p_lst)\n        else:\n            frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)\n\n        if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):\n            flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source)\n            flag_driving_has_audio = (not flag_load_from_template) and has_audio_stream(args.driving)\n\n            wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')\n\n            # NOTE: update output fps\n            output_fps = source_fps if flag_is_source_video else output_fps\n            images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)\n\n            if flag_source_has_audio or flag_driving_has_audio:\n                # final result with concatenation\n                wfp_concat_with_audio = osp.join(args.output_dir, 
f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')\n                audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source\n                log(f\"Audio is selected from {audio_from_which_video}, concat mode\")\n                add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)\n                os.replace(wfp_concat_with_audio, wfp_concat)\n                log(f\"Replace {wfp_concat_with_audio} with {wfp_concat}\")\n\n            # save the animated result\n            wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')\n            if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:\n                images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)\n            else:\n                images2video(I_p_lst, wfp=wfp, fps=output_fps)\n\n            ######### build the final result #########\n            if flag_source_has_audio or flag_driving_has_audio:\n                wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')\n                audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source\n                log(f\"Audio is selected from {audio_from_which_video}\")\n                add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)\n                os.replace(wfp_with_audio, wfp)\n                log(f\"Replace {wfp_with_audio} with {wfp}\")\n\n            # final log\n            if wfp_template not in (None, ''):\n                log(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')\n            log(f'Animated video: {wfp}')\n            log(f'Animated video with concat: 
{wfp_concat}')\n        else:\n            wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.jpg')\n            cv2.imwrite(wfp_concat, frames_concatenated[0][..., ::-1])\n            wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.jpg')\n            if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:\n                cv2.imwrite(wfp, I_p_pstbk_lst[0][..., ::-1])\n            else:\n                cv2.imwrite(wfp, frames_concatenated[0][..., ::-1])\n            # final log\n            log(f'Animated image: {wfp}')\n            log(f'Animated image with concat: {wfp_concat}')\n\n        return wfp, wfp_concat\n"
  },
  {
    "path": "src/live_portrait_pipeline_animal.py",
    "content": "# coding: utf-8\n\n\"\"\"\nPipeline of LivePortrait (Animal)\n\"\"\"\n\nimport warnings\nwarnings.filterwarnings(\"ignore\", message=\"torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument.\")\nwarnings.filterwarnings(\"ignore\", message=\"torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly.\")\nwarnings.filterwarnings(\"ignore\", message=\"None of the inputs have requires_grad=True. Gradients will be None\")\n\nimport torch\ntorch.backends.cudnn.benchmark = True # disable CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR warning\n\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)\nimport numpy as np\nimport os\nimport os.path as osp\nfrom rich.progress import track\n\nfrom .config.argument_config import ArgumentConfig\nfrom .config.inference_config import InferenceConfig\nfrom .config.crop_config import CropConfig\nfrom .utils.cropper import Cropper\nfrom .utils.camera import get_rotation_matrix\nfrom .utils.video import images2video, concat_frames, get_fps, add_audio_to_video, has_audio_stream, video2gif\nfrom .utils.crop import _transform_img, prepare_paste_back, paste_back\nfrom .utils.io import load_image_rgb, load_video, resize_to_limit, dump, load\nfrom .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image, calc_motion_multiplier\nfrom .utils.rprint import rlog as log\n# from .utils.viz import viz_lmk\nfrom .live_portrait_wrapper import LivePortraitWrapperAnimal\n\n\ndef make_abs_path(fn):\n    return osp.join(osp.dirname(osp.realpath(__file__)), fn)\n\nclass LivePortraitPipelineAnimal(object):\n\n    def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):\n        self.live_portrait_wrapper_animal: LivePortraitWrapperAnimal = LivePortraitWrapperAnimal(inference_cfg=inference_cfg)\n        self.cropper: Cropper = Cropper(crop_cfg=crop_cfg, image_type='animal_face', 
flag_use_half_precision=inference_cfg.flag_use_half_precision)\n\n    def make_motion_template(self, I_lst, **kwargs):\n        n_frames = I_lst.shape[0]\n        template_dct = {\n            'n_frames': n_frames,\n            'output_fps': kwargs.get('output_fps', 25),\n            'motion': [],\n        }\n\n        for i in track(range(n_frames), description='Making driving motion templates...', total=n_frames):\n            # collect s, R, δ and t for inference\n            I_i = I_lst[i]\n            x_i_info = self.live_portrait_wrapper_animal.get_kp_info(I_i)\n            R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])\n\n            item_dct = {\n                'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),\n                'R': R_i.cpu().numpy().astype(np.float32),\n                'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),\n                't': x_i_info['t'].cpu().numpy().astype(np.float32),\n            }\n\n            template_dct['motion'].append(item_dct)\n\n        return template_dct\n\n    def execute(self, args: ArgumentConfig):\n        # for convenience\n        inf_cfg = self.live_portrait_wrapper_animal.inference_cfg\n        device = self.live_portrait_wrapper_animal.device\n        crop_cfg = self.cropper.crop_cfg\n\n        ######## load source input ########\n        if is_image(args.source):\n            img_rgb = load_image_rgb(args.source)\n            img_rgb = resize_to_limit(img_rgb, inf_cfg.source_max_dim, inf_cfg.source_division)\n            log(f\"Load source image from {args.source}\")\n        else:  # source input is an unknown format\n            raise Exception(f\"Unknown source format: {args.source}\")\n\n        ######## process driving info ########\n        flag_load_from_template = is_template(args.driving)\n        driving_rgb_crop_256x256_lst = None\n        wfp_template = None\n\n        if flag_load_from_template:\n            # NOTE: load from template, 
it is fast, but the cropping video is None\n            log(f\"Load from template: {args.driving}, NOT the video, so the cropping video and audio are both NULL.\", style='bold green')\n            driving_template_dct = load(args.driving)\n            n_frames = driving_template_dct['n_frames']\n\n            # set output_fps\n            output_fps = driving_template_dct.get('output_fps', inf_cfg.output_fps)\n            log(f'The FPS of template: {output_fps}')\n\n            if args.flag_crop_driving_video:\n                log(\"Warning: flag_crop_driving_video is True, but the driving info is a template, so it is ignored.\")\n\n        elif osp.exists(args.driving) and is_video(args.driving):\n            # load from video file, AND make motion template\n            output_fps = int(get_fps(args.driving))\n            log(f\"Load driving video from: {args.driving}, FPS is {output_fps}\")\n\n            driving_rgb_lst = load_video(args.driving)\n            n_frames = len(driving_rgb_lst)\n\n            ######## make motion template ########\n            log(\"Start making driving motion template...\")\n            if inf_cfg.flag_crop_driving_video:\n                ret_d = self.cropper.crop_driving_video(driving_rgb_lst)\n                log(f'Driving video is cropped, {len(ret_d[\"frame_crop_lst\"])} frames are processed.')\n                if len(ret_d[\"frame_crop_lst\"]) is not n_frames:\n                    n_frames = min(n_frames, len(ret_d[\"frame_crop_lst\"]))\n                driving_rgb_crop_lst = ret_d['frame_crop_lst']\n                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_crop_lst]\n            else:\n                driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256\n            #######################################\n\n            # save the motion template\n            I_d_lst = 
self.live_portrait_wrapper_animal.prepare_videos(driving_rgb_crop_256x256_lst)\n            driving_template_dct = self.make_motion_template(I_d_lst, output_fps=output_fps)\n\n            wfp_template = remove_suffix(args.driving) + '.pkl'\n            dump(wfp_template, driving_template_dct)\n            log(f\"Dump motion template to {wfp_template}\")\n\n        else:\n            raise Exception(f\"{args.driving} not exists or unsupported driving info types!\")\n\n        ######## prepare for pasteback ########\n        I_p_pstbk_lst = None\n        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:\n            I_p_pstbk_lst = []\n            log(\"Prepared pasteback mask done.\")\n\n        ######## process source info ########\n        if inf_cfg.flag_do_crop:\n            crop_info = self.cropper.crop_source_image(img_rgb, crop_cfg)\n            if crop_info is None:\n                raise Exception(\"No animal face detected in the source image!\")\n            img_crop_256x256 = crop_info['img_crop_256x256']\n        else:\n            img_crop_256x256 = cv2.resize(img_rgb, (256, 256))  # force to resize to 256x256\n        I_s = self.live_portrait_wrapper_animal.prepare_source(img_crop_256x256)\n        x_s_info = self.live_portrait_wrapper_animal.get_kp_info(I_s)\n        x_c_s = x_s_info['kp']\n        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])\n        f_s = self.live_portrait_wrapper_animal.extract_feature_3d(I_s)\n        x_s = self.live_portrait_wrapper_animal.transform_keypoint(x_s_info)\n\n        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:\n            mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))\n\n        ######## animate ########\n        I_p_lst = []\n        for i in track(range(n_frames), description='🚀Animating...', total=n_frames):\n\n            x_d_i_info = 
driving_template_dct['motion'][i]\n            x_d_i_info = dct2device(x_d_i_info, device)\n\n            R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys\n            delta_new = x_d_i_info['exp']\n            t_new = x_d_i_info['t']\n            t_new[..., 2].fill_(0)  # zero tz\n            scale_new = x_s_info['scale']\n\n            x_d_i = scale_new * (x_c_s @ R_d_i + delta_new) + t_new\n\n            if i == 0:\n                x_d_0 = x_d_i\n                motion_multiplier = calc_motion_multiplier(x_s, x_d_0)\n\n            x_d_diff = (x_d_i - x_d_0) * motion_multiplier\n            x_d_i = x_d_diff + x_s\n\n            if not inf_cfg.flag_stitching:\n                pass\n            else:\n                x_d_i = self.live_portrait_wrapper_animal.stitching(x_s, x_d_i)\n\n            x_d_i = x_s + (x_d_i - x_s) * inf_cfg.driving_multiplier\n            out = self.live_portrait_wrapper_animal.warp_decode(f_s, x_s, x_d_i)\n            I_p_i = self.live_portrait_wrapper_animal.parse_output(out['out'])[0]\n            I_p_lst.append(I_p_i)\n\n            if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:\n                I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], img_rgb, mask_ori_float)\n                I_p_pstbk_lst.append(I_p_pstbk)\n\n        mkdir(args.output_dir)\n        wfp_concat = None\n        flag_driving_has_audio = (not flag_load_from_template) and has_audio_stream(args.driving)\n\n        ######### build the final concatenation result #########\n        # driving frame | source image | generation\n        frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)\n        wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')\n        images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)\n\n        if flag_driving_has_audio:\n            # final result 
with concatenation\n            wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')\n            audio_from_which_video = args.driving\n            add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)\n            os.replace(wfp_concat_with_audio, wfp_concat)\n            log(f\"Replace {wfp_concat_with_audio} with {wfp_concat}\")\n\n        # save the animated result\n        wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')\n        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:\n            images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)\n        else:\n            images2video(I_p_lst, wfp=wfp, fps=output_fps)\n\n        ######### build the final result #########\n        if flag_driving_has_audio:\n            wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')\n            audio_from_which_video = args.driving\n            add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)\n            os.replace(wfp_with_audio, wfp)\n            log(f\"Replace {wfp_with_audio} with {wfp}\")\n\n        # final log\n        if wfp_template not in (None, ''):\n            log(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')\n        log(f'Animated video: {wfp}')\n        log(f'Animated video with concat: {wfp_concat}')\n\n        # build the gif\n        wfp_gif = video2gif(wfp)\n        log(f'Animated gif: {wfp_gif}')\n\n\n        return wfp, wfp_concat, wfp_gif\n"
  },
  {
    "path": "src/live_portrait_wrapper.py",
    "content": "# coding: utf-8\n\n\"\"\"\nWrappers for LivePortrait core functions\n\"\"\"\n\nimport contextlib\nimport os.path as osp\nimport numpy as np\nimport cv2\nimport torch\nimport yaml\n\nfrom .utils.timer import Timer\nfrom .utils.helper import load_model, concat_feat\nfrom .utils.camera import headpose_pred_to_degree, get_rotation_matrix\nfrom .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio\nfrom .config.inference_config import InferenceConfig\nfrom .utils.rprint import rlog as log\n\n\nclass LivePortraitWrapper(object):\n    \"\"\"\n    Wrapper for Human\n    \"\"\"\n\n    def __init__(self, inference_cfg: InferenceConfig):\n\n        self.inference_cfg = inference_cfg\n        self.device_id = inference_cfg.device_id\n        self.compile = inference_cfg.flag_do_torch_compile\n        if inference_cfg.flag_force_cpu:\n            self.device = 'cpu'\n        else:\n            try:\n                if torch.backends.mps.is_available():\n                    self.device = 'mps'\n                else:\n                    self.device = 'cuda:' + str(self.device_id)\n            except:\n                self.device = 'cuda:' + str(self.device_id)\n\n        model_config = yaml.load(open(inference_cfg.models_config, 'r'), Loader=yaml.SafeLoader)\n        # init F\n        self.appearance_feature_extractor = load_model(inference_cfg.checkpoint_F, model_config, self.device, 'appearance_feature_extractor')\n        log(f'Load appearance_feature_extractor from {osp.realpath(inference_cfg.checkpoint_F)} done.')\n        # init M\n        self.motion_extractor = load_model(inference_cfg.checkpoint_M, model_config, self.device, 'motion_extractor')\n        log(f'Load motion_extractor from {osp.realpath(inference_cfg.checkpoint_M)} done.')\n        # init W\n        self.warping_module = load_model(inference_cfg.checkpoint_W, model_config, self.device, 'warping_module')\n        log(f'Load warping_module from 
{osp.realpath(inference_cfg.checkpoint_W)} done.')\n        # init G\n        self.spade_generator = load_model(inference_cfg.checkpoint_G, model_config, self.device, 'spade_generator')\n        log(f'Load spade_generator from {osp.realpath(inference_cfg.checkpoint_G)} done.')\n        # init S and R\n        if inference_cfg.checkpoint_S is not None and osp.exists(inference_cfg.checkpoint_S):\n            self.stitching_retargeting_module = load_model(inference_cfg.checkpoint_S, model_config, self.device, 'stitching_retargeting_module')\n            log(f'Load stitching_retargeting_module from {osp.realpath(inference_cfg.checkpoint_S)} done.')\n        else:\n            self.stitching_retargeting_module = None\n        # Optimize for inference\n        if self.compile:\n            torch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution\n            self.warping_module = torch.compile(self.warping_module, mode='max-autotune')\n            self.spade_generator = torch.compile(self.spade_generator, mode='max-autotune')\n\n        self.timer = Timer()\n\n    def inference_ctx(self):\n        if self.device == \"mps\":\n            ctx = contextlib.nullcontext()\n        else:\n            ctx = torch.autocast(device_type=self.device[:4], dtype=torch.float16,\n                                 enabled=self.inference_cfg.flag_use_half_precision)\n        return ctx\n\n    def update_config(self, user_args):\n        for k, v in user_args.items():\n            if hasattr(self.inference_cfg, k):\n                setattr(self.inference_cfg, k, v)\n\n    def prepare_source(self, img: np.ndarray) -> torch.Tensor:\n        \"\"\" construct the input as standard\n        img: HxWx3, uint8, 256x256\n        \"\"\"\n        h, w = img.shape[:2]\n        if h != self.inference_cfg.input_shape[0] or w != self.inference_cfg.input_shape[1]:\n            x = cv2.resize(img, (self.inference_cfg.input_shape[0], 
self.inference_cfg.input_shape[1]))\n        else:\n            x = img.copy()\n\n        if x.ndim == 3:\n            x = x[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1\n        elif x.ndim == 4:\n            x = x.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1\n        else:\n            raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')\n        x = np.clip(x, 0, 1)  # clip to 0~1\n        x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW\n        x = x.to(self.device)\n        return x\n\n    def prepare_videos(self, imgs) -> torch.Tensor:\n        \"\"\" construct the input as standard\n        imgs: list of T frames (each HxWx3, uint8), or an ndarray already shaped TxHxWx3x1\n        \"\"\"\n        if isinstance(imgs, list):\n            _imgs = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1\n        elif isinstance(imgs, np.ndarray):\n            _imgs = imgs\n        else:\n            raise ValueError(f'imgs type error: {type(imgs)}')\n\n        y = _imgs.astype(np.float32) / 255.\n        y = np.clip(y, 0, 1)  # clip to 0~1\n        y = torch.from_numpy(y).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW\n        y = y.to(self.device)\n\n        return y\n\n    def extract_feature_3d(self, x: torch.Tensor) -> torch.Tensor:\n        \"\"\" get the appearance feature of the image by F\n        x: Bx3xHxW, normalized to 0~1\n        \"\"\"\n        with torch.no_grad(), self.inference_ctx():\n            feature_3d = self.appearance_feature_extractor(x)\n\n        return feature_3d.float()\n\n    def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:\n        \"\"\" get the implicit keypoint information\n        x: Bx3xHxW, normalized to 0~1\n        flag_refine_info: whether to convert pitch/yaw/roll to degrees and reshape kp and exp to BxNx3 (default True)\n        return: a dict containing the keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'\n        \"\"\"\n        with torch.no_grad(), self.inference_ctx():\n            kp_info = self.motion_extractor(x)\n\n        
    if self.inference_cfg.flag_use_half_precision:\n                # float the dict\n                for k, v in kp_info.items():\n                    if isinstance(v, torch.Tensor):\n                        kp_info[k] = v.float()\n\n        flag_refine_info: bool = kwargs.get('flag_refine_info', True)\n        if flag_refine_info:\n            bs = kp_info['kp'].shape[0]\n            kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None]  # Bx1\n            kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None]  # Bx1\n            kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None]  # Bx1\n            kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3)  # BxNx3\n            kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3)  # BxNx3\n\n        return kp_info\n\n    def get_pose_dct(self, kp_info: dict) -> dict:\n        pose_dct = dict(\n            pitch=headpose_pred_to_degree(kp_info['pitch']).item(),\n            yaw=headpose_pred_to_degree(kp_info['yaw']).item(),\n            roll=headpose_pred_to_degree(kp_info['roll']).item(),\n        )\n        return pose_dct\n\n    def get_fs_and_kp_info(self, source_prepared, driving_first_frame):\n\n        # get the canonical keypoints of source image by M\n        source_kp_info = self.get_kp_info(source_prepared, flag_refine_info=True)\n        source_rotation = get_rotation_matrix(source_kp_info['pitch'], source_kp_info['yaw'], source_kp_info['roll'])\n\n        # get the canonical keypoints of first driving frame by M\n        driving_first_frame_kp_info = self.get_kp_info(driving_first_frame, flag_refine_info=True)\n        driving_first_frame_rotation = get_rotation_matrix(\n            driving_first_frame_kp_info['pitch'],\n            driving_first_frame_kp_info['yaw'],\n            driving_first_frame_kp_info['roll']\n        )\n\n        # get feature volume by F\n        source_feature_3d = self.extract_feature_3d(source_prepared)\n\n        return source_kp_info, 
source_rotation, source_feature_3d, driving_first_frame_kp_info, driving_first_frame_rotation\n\n    def transform_keypoint(self, kp_info: dict):\n        \"\"\"\n        transform the implicit keypoints with the pose, shift, and expression deformation\n        kp: BxNx3\n        \"\"\"\n        kp = kp_info['kp']    # (bs, k, 3)\n        pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']\n\n        t, exp = kp_info['t'], kp_info['exp']\n        scale = kp_info['scale']\n\n        pitch = headpose_pred_to_degree(pitch)\n        yaw = headpose_pred_to_degree(yaw)\n        roll = headpose_pred_to_degree(roll)\n\n        bs = kp.shape[0]\n        if kp.ndim == 2:\n            num_kp = kp.shape[1] // 3  # Bx(num_kpx3)\n        else:\n            num_kp = kp.shape[1]  # Bxnum_kpx3\n\n        rot_mat = get_rotation_matrix(pitch, yaw, roll)    # (bs, 3, 3)\n\n        # Eqn.2: s * (R * x_c,s + exp) + t\n        kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)\n        kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)\n        kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty\n\n        return kp_transformed\n\n    def retarget_eye(self, kp_source: torch.Tensor, eye_close_ratio: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        kp_source: BxNx3\n        eye_close_ratio: Bx3\n        Return: BxNx3 (the module output reshaped to one 3-vector per keypoint)\n        \"\"\"\n        feat_eye = concat_feat(kp_source, eye_close_ratio)\n\n        with torch.no_grad():\n            delta = self.stitching_retargeting_module['eye'](feat_eye)\n\n        return delta.reshape(-1, kp_source.shape[1], 3)\n\n    def retarget_lip(self, kp_source: torch.Tensor, lip_close_ratio: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        kp_source: BxNx3\n        lip_close_ratio: Bx2\n        Return: BxNx3 (the module output reshaped to one 3-vector per keypoint)\n        \"\"\"\n        feat_lip = concat_feat(kp_source, lip_close_ratio)\n\n        with torch.no_grad():\n            delta = 
self.stitching_retargeting_module['lip'](feat_lip)\n\n        return delta.reshape(-1, kp_source.shape[1], 3)\n\n    def stitch(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        kp_source: BxNx3\n        kp_driving: BxNx3\n        Return: Bx(3*num_kp+2)\n        \"\"\"\n        feat_stiching = concat_feat(kp_source, kp_driving)\n\n        with torch.no_grad():\n            delta = self.stitching_retargeting_module['stitching'](feat_stiching)\n\n        return delta\n\n    def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:\n        \"\"\" conduct the stitching\n        kp_source: Bxnum_kpx3\n        kp_driving: Bxnum_kpx3\n        \"\"\"\n\n        if self.stitching_retargeting_module is not None:\n\n            bs, num_kp = kp_source.shape[:2]\n\n            kp_driving_new = kp_driving.clone()\n            delta = self.stitch(kp_source, kp_driving_new)\n\n            delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3)  # Bxnum_kpx3\n            delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2)  # Bx1x2\n\n            kp_driving_new += delta_exp\n            kp_driving_new[..., :2] += delta_tx_ty\n\n            return kp_driving_new\n\n        return kp_driving\n\n    def warp_decode(self, feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:\n        \"\"\" get the image after the warping of the implicit keypoints\n        feature_3d: Bx32x16x64x64, feature volume\n        kp_source: BxNx3\n        kp_driving: BxNx3\n        \"\"\"\n        # Line 18 of Algorithm 1: D(W(f_s; x_s, x′_d,i))\n        with torch.no_grad(), self.inference_ctx():\n            if self.compile:\n                # Mark the beginning of a new CUDA Graph step\n                torch.compiler.cudagraph_mark_step_begin()\n            # get decoder input\n            ret_dct = self.warping_module(feature_3d, kp_source=kp_source, kp_driving=kp_driving)\n  
          # decode\n            ret_dct['out'] = self.spade_generator(feature=ret_dct['out'])\n\n            # float the dict\n            if self.inference_cfg.flag_use_half_precision:\n                for k, v in ret_dct.items():\n                    if isinstance(v, torch.Tensor):\n                        ret_dct[k] = v.float()\n\n        return ret_dct\n\n    def parse_output(self, out: torch.Tensor) -> np.ndarray:\n        \"\"\" construct the output as standard\n        return: 1xHxWx3, uint8\n        \"\"\"\n        out = np.transpose(out.data.cpu().numpy(), [0, 2, 3, 1])  # 1x3xHxW -> 1xHxWx3\n        out = np.clip(out, 0, 1)  # clip to 0~1\n        out = np.clip(out * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255\n\n        return out\n\n    def calc_ratio(self, lmk_lst):\n        input_eye_ratio_lst = []\n        input_lip_ratio_lst = []\n        for lmk in lmk_lst:\n            # for eyes retargeting\n            input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))\n            # for lip retargeting\n            input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))\n        return input_eye_ratio_lst, input_lip_ratio_lst\n\n    def calc_combined_eye_ratio(self, c_d_eyes_i, source_lmk):\n        c_s_eyes = calc_eye_close_ratio(source_lmk[None])\n        c_s_eyes_tensor = torch.from_numpy(c_s_eyes).float().to(self.device)\n        c_d_eyes_i_tensor = torch.Tensor([c_d_eyes_i[0][0]]).reshape(1, 1).to(self.device)\n        # [c_s,eyes, c_d,eyes,i]\n        combined_eye_ratio_tensor = torch.cat([c_s_eyes_tensor, c_d_eyes_i_tensor], dim=1)\n        return combined_eye_ratio_tensor\n\n    def calc_combined_lip_ratio(self, c_d_lip_i, source_lmk):\n        c_s_lip = calc_lip_close_ratio(source_lmk[None])\n        c_s_lip_tensor = torch.from_numpy(c_s_lip).float().to(self.device)\n        c_d_lip_i_tensor = torch.Tensor([c_d_lip_i[0]]).to(self.device).reshape(1, 1) # 1x1\n        # [c_s,lip, c_d,lip,i]\n        combined_lip_ratio_tensor = 
torch.cat([c_s_lip_tensor, c_d_lip_i_tensor], dim=1) # 1x2\n        return combined_lip_ratio_tensor\n\n\nclass LivePortraitWrapperAnimal(LivePortraitWrapper):\n    \"\"\"\n    Wrapper for Animal\n    \"\"\"\n    def __init__(self, inference_cfg: InferenceConfig):\n        # super().__init__(inference_cfg)  # 调用父类的初始化方法\n\n        self.inference_cfg = inference_cfg\n        self.device_id = inference_cfg.device_id\n        self.compile = inference_cfg.flag_do_torch_compile\n        if inference_cfg.flag_force_cpu:\n            self.device = 'cpu'\n        else:\n            try: \n                if torch.backends.mps.is_available():\n                    self.device = 'mps'\n                else:\n                    self.device = 'cuda:' + str(self.device_id)\n            except:\n                    self.device = 'cuda:' + str(self.device_id)\n\n        model_config = yaml.load(open(inference_cfg.models_config, 'r'), Loader=yaml.SafeLoader)\n        # init F\n        self.appearance_feature_extractor = load_model(inference_cfg.checkpoint_F_animal, model_config, self.device, 'appearance_feature_extractor')\n        log(f'Load appearance_feature_extractor from {osp.realpath(inference_cfg.checkpoint_F_animal)} done.')\n        # init M\n        self.motion_extractor = load_model(inference_cfg.checkpoint_M_animal, model_config, self.device, 'motion_extractor')\n        log(f'Load motion_extractor from {osp.realpath(inference_cfg.checkpoint_M_animal)} done.')\n        # init W\n        self.warping_module = load_model(inference_cfg.checkpoint_W_animal, model_config, self.device, 'warping_module')\n        log(f'Load warping_module from {osp.realpath(inference_cfg.checkpoint_W_animal)} done.')\n        # init G\n        self.spade_generator = load_model(inference_cfg.checkpoint_G_animal, model_config, self.device, 'spade_generator')\n        log(f'Load spade_generator from {osp.realpath(inference_cfg.checkpoint_G_animal)} done.')\n        # init S and R\n        if 
inference_cfg.checkpoint_S_animal is not None and osp.exists(inference_cfg.checkpoint_S_animal):\n            self.stitching_retargeting_module = load_model(inference_cfg.checkpoint_S_animal, model_config, self.device, 'stitching_retargeting_module')\n            log(f'Load stitching_retargeting_module from {osp.realpath(inference_cfg.checkpoint_S_animal)} done.')\n        else:\n            self.stitching_retargeting_module = None\n\n        # Optimize for inference\n        if self.compile:\n            torch._dynamo.config.suppress_errors = True  # Suppress errors and fall back to eager execution\n            self.warping_module = torch.compile(self.warping_module, mode='max-autotune')\n            self.spade_generator = torch.compile(self.spade_generator, mode='max-autotune')\n\n        self.timer = Timer()\n"
  },
  {
    "path": "src/modules/__init__.py",
    "content": ""
  },
  {
    "path": "src/modules/appearance_feature_extractor.py",
    "content": "# coding: utf-8\n\n\"\"\"\nAppearance extractor(F) defined in paper, which maps the source image s to a 3D appearance feature volume.\n\"\"\"\n\nimport torch\nfrom torch import nn\nfrom .util import SameBlock2d, DownBlock2d, ResBlock3d\n\n\nclass AppearanceFeatureExtractor(nn.Module):\n\n    def __init__(self, image_channel, block_expansion, num_down_blocks, max_features, reshape_channel, reshape_depth, num_resblocks):\n        super(AppearanceFeatureExtractor, self).__init__()\n        self.image_channel = image_channel\n        self.block_expansion = block_expansion\n        self.num_down_blocks = num_down_blocks\n        self.max_features = max_features\n        self.reshape_channel = reshape_channel\n        self.reshape_depth = reshape_depth\n\n        self.first = SameBlock2d(image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1))\n\n        down_blocks = []\n        for i in range(num_down_blocks):\n            in_features = min(max_features, block_expansion * (2 ** i))\n            out_features = min(max_features, block_expansion * (2 ** (i + 1)))\n            down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))\n        self.down_blocks = nn.ModuleList(down_blocks)\n\n        self.second = nn.Conv2d(in_channels=out_features, out_channels=max_features, kernel_size=1, stride=1)\n\n        self.resblocks_3d = torch.nn.Sequential()\n        for i in range(num_resblocks):\n            self.resblocks_3d.add_module('3dr' + str(i), ResBlock3d(reshape_channel, kernel_size=3, padding=1))\n\n    def forward(self, source_image):\n        out = self.first(source_image)  # Bx3x256x256 -> Bx64x256x256\n\n        for i in range(len(self.down_blocks)):\n            out = self.down_blocks[i](out)\n        out = self.second(out)\n        bs, c, h, w = out.shape  # ->Bx512x64x64\n\n        f_s = out.view(bs, self.reshape_channel, self.reshape_depth, h, w)  # ->Bx32x16x64x64\n        f_s = 
self.resblocks_3d(f_s)  # ->Bx32x16x64x64\n        return f_s\n"
  },
  {
    "path": "src/modules/convnextv2.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThis module is adapted to the ConvNeXtV2 version for the extraction of implicit keypoints, poses, and expression deformation.\n\"\"\"\n\nimport torch\nimport torch.nn as nn\n# from timm.models.layers import trunc_normal_, DropPath\nfrom .util import LayerNorm, DropPath, trunc_normal_, GRN\n\n__all__ = ['convnextv2_tiny']\n\n\nclass Block(nn.Module):\n    \"\"\" ConvNeXtV2 Block.\n\n    Args:\n        dim (int): Number of input channels.\n        drop_path (float): Stochastic depth rate. Default: 0.0\n    \"\"\"\n\n    def __init__(self, dim, drop_path=0.):\n        super().__init__()\n        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv\n        self.norm = LayerNorm(dim, eps=1e-6)\n        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers\n        self.act = nn.GELU()\n        self.grn = GRN(4 * dim)\n        self.pwconv2 = nn.Linear(4 * dim, dim)\n        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()\n\n    def forward(self, x):\n        input = x\n        x = self.dwconv(x)\n        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)\n        x = self.norm(x)\n        x = self.pwconv1(x)\n        x = self.act(x)\n        x = self.grn(x)\n        x = self.pwconv2(x)\n        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)\n\n        x = input + self.drop_path(x)\n        return x\n\n\nclass ConvNeXtV2(nn.Module):\n    \"\"\" ConvNeXt V2\n\n    Args:\n        in_chans (int): Number of input image channels. Default: 3\n        num_classes (int): Number of classes for classification head. Default: 1000\n        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]\n        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]\n        drop_path_rate (float): Stochastic depth rate. 
Default: 0.\n        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.\n    \"\"\"\n\n    def __init__(\n        self,\n        in_chans=3,\n        depths=[3, 3, 9, 3],\n        dims=[96, 192, 384, 768],\n        drop_path_rate=0.,\n        **kwargs\n    ):\n        super().__init__()\n        self.depths = depths\n        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers\n        stem = nn.Sequential(\n            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),\n            LayerNorm(dims[0], eps=1e-6, data_format=\"channels_first\")\n        )\n        self.downsample_layers.append(stem)\n        for i in range(3):\n            downsample_layer = nn.Sequential(\n                LayerNorm(dims[i], eps=1e-6, data_format=\"channels_first\"),\n                nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),\n            )\n            self.downsample_layers.append(downsample_layer)\n\n        self.stages = nn.ModuleList()  # 4 feature resolution stages, each consisting of multiple residual blocks\n        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]\n        cur = 0\n        for i in range(4):\n            stage = nn.Sequential(\n                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]\n            )\n            self.stages.append(stage)\n            cur += depths[i]\n\n        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)  # final norm layer\n\n        # NOTE: the output semantic items\n        num_bins = kwargs.get('num_bins', 66)\n        num_kp = kwargs.get('num_kp', 24)  # the number of implicit keypoints\n        self.fc_kp = nn.Linear(dims[-1], 3 * num_kp)  # implicit keypoints\n\n        # print('dims[-1]: ', dims[-1])\n        self.fc_scale = nn.Linear(dims[-1], 1)  # scale\n        self.fc_pitch = nn.Linear(dims[-1], num_bins)  # pitch bins\n        self.fc_yaw = nn.Linear(dims[-1], num_bins) 
 # yaw bins\n        self.fc_roll = nn.Linear(dims[-1], num_bins)  # roll bins\n        self.fc_t = nn.Linear(dims[-1], 3)  # translation\n        self.fc_exp = nn.Linear(dims[-1], 3 * num_kp)  # expression / delta\n\n    def _init_weights(self, m):\n        if isinstance(m, (nn.Conv2d, nn.Linear)):\n            trunc_normal_(m.weight, std=.02)\n            nn.init.constant_(m.bias, 0)\n\n    def forward_features(self, x):\n        for i in range(4):\n            x = self.downsample_layers[i](x)\n            x = self.stages[i](x)\n        return self.norm(x.mean([-2, -1]))  # global average pooling, (N, C, H, W) -> (N, C)\n\n    def forward(self, x):\n        x = self.forward_features(x)\n\n        # implicit keypoints\n        kp = self.fc_kp(x)\n\n        # pose and expression deformation\n        pitch = self.fc_pitch(x)\n        yaw = self.fc_yaw(x)\n        roll = self.fc_roll(x)\n        t = self.fc_t(x)\n        exp = self.fc_exp(x)\n        scale = self.fc_scale(x)\n\n        ret_dct = {\n            'pitch': pitch,\n            'yaw': yaw,\n            'roll': roll,\n            't': t,\n            'exp': exp,\n            'scale': scale,\n\n            'kp': kp,  # canonical keypoint\n        }\n\n        return ret_dct\n\n\ndef convnextv2_tiny(**kwargs):\n    model = ConvNeXtV2(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)\n    return model\n"
  },
  {
    "path": "src/modules/dense_motion.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThe module that predicts a dense motion from the sparse motion representation given by kp_source and kp_driving\n\"\"\"\n\nfrom torch import nn\nimport torch.nn.functional as F\nimport torch\nfrom .util import Hourglass, make_coordinate_grid, kp2gaussian\n\n\nclass DenseMotionNetwork(nn.Module):\n    def __init__(self, block_expansion, num_blocks, max_features, num_kp, feature_channel, reshape_depth, compress, estimate_occlusion_map=True):\n        super(DenseMotionNetwork, self).__init__()\n        self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp+1)*(compress+1), max_features=max_features, num_blocks=num_blocks)  # ~60+G\n\n        self.mask = nn.Conv3d(self.hourglass.out_filters, num_kp + 1, kernel_size=7, padding=3)  # 65G! NOTE: computation cost is large\n        self.compress = nn.Conv3d(feature_channel, compress, kernel_size=1)  # 0.8G\n        self.norm = nn.BatchNorm3d(compress, affine=True)\n        self.num_kp = num_kp\n        self.flag_estimate_occlusion_map = estimate_occlusion_map\n\n        if self.flag_estimate_occlusion_map:\n            self.occlusion = nn.Conv2d(self.hourglass.out_filters*reshape_depth, 1, kernel_size=7, padding=3)\n        else:\n            self.occlusion = None\n\n    def create_sparse_motions(self, feature, kp_driving, kp_source):\n        bs, _, d, h, w = feature.shape  # (bs, 4, 16, 64, 64)\n        identity_grid = make_coordinate_grid((d, h, w), ref=kp_source)  # (16, 64, 64, 3)\n        identity_grid = identity_grid.view(1, 1, d, h, w, 3)  # (1, 1, d=16, h=64, w=64, 3)\n        coordinate_grid = identity_grid - kp_driving.view(bs, self.num_kp, 1, 1, 1, 3)\n\n        k = coordinate_grid.shape[1]\n\n        # NOTE: there lacks an one-order flow\n        driving_to_source = coordinate_grid + kp_source.view(bs, self.num_kp, 1, 1, 1, 3)    # (bs, num_kp, d, h, w, 3)\n\n        # adding background feature\n        identity_grid = identity_grid.repeat(bs, 1, 
1, 1, 1, 1)\n        sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1)  # (bs, 1+num_kp, d, h, w, 3)\n        return sparse_motions\n\n    def create_deformed_feature(self, feature, sparse_motions):\n        bs, _, d, h, w = feature.shape\n        feature_repeat = feature.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp+1, 1, 1, 1, 1, 1)      # (bs, num_kp+1, 1, c, d, h, w)\n        feature_repeat = feature_repeat.view(bs * (self.num_kp+1), -1, d, h, w)                         # (bs*(num_kp+1), c, d, h, w)\n        sparse_motions = sparse_motions.view((bs * (self.num_kp+1), d, h, w, -1))                       # (bs*(num_kp+1), d, h, w, 3)\n        sparse_deformed = F.grid_sample(feature_repeat, sparse_motions, align_corners=False)\n        sparse_deformed = sparse_deformed.view((bs, self.num_kp+1, -1, d, h, w))                        # (bs, num_kp+1, c, d, h, w)\n\n        return sparse_deformed\n\n    def create_heatmap_representations(self, feature, kp_driving, kp_source):\n        spatial_size = feature.shape[3:]  # (d=16, h=64, w=64)\n        gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)\n        gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)\n        heatmap = gaussian_driving - gaussian_source  # (bs, num_kp, d, h, w)\n\n        # adding background feature\n        zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1], spatial_size[2]).type(heatmap.dtype).to(heatmap.device)\n        heatmap = torch.cat([zeros, heatmap], dim=1)\n        heatmap = heatmap.unsqueeze(2)         # (bs, 1+num_kp, 1, d, h, w)\n        return heatmap\n\n    def forward(self, feature, kp_driving, kp_source):\n        bs, _, d, h, w = feature.shape  # (bs, 32, 16, 64, 64)\n\n        feature = self.compress(feature)  # (bs, 4, 16, 64, 64)\n        feature = self.norm(feature)  # (bs, 4, 16, 64, 64)\n        feature = 
F.relu(feature)  # (bs, 4, 16, 64, 64)\n\n        out_dict = dict()\n\n        # 1. deform 3d feature\n        sparse_motion = self.create_sparse_motions(feature, kp_driving, kp_source)  # (bs, 1+num_kp, d, h, w, 3)\n        deformed_feature = self.create_deformed_feature(feature, sparse_motion)  # (bs, 1+num_kp, c=4, d=16, h=64, w=64)\n\n        # 2. (bs, 1+num_kp, d, h, w)\n        heatmap = self.create_heatmap_representations(deformed_feature, kp_driving, kp_source)  # (bs, 1+num_kp, 1, d, h, w)\n\n        input = torch.cat([heatmap, deformed_feature], dim=2)  # (bs, 1+num_kp, c=5, d=16, h=64, w=64)\n        input = input.view(bs, -1, d, h, w)  # (bs, (1+num_kp)*c=105, d=16, h=64, w=64)\n\n        prediction = self.hourglass(input)\n\n        mask = self.mask(prediction)\n        mask = F.softmax(mask, dim=1)  # (bs, 1+num_kp, d=16, h=64, w=64)\n        out_dict['mask'] = mask\n        mask = mask.unsqueeze(2)                                   # (bs, num_kp+1, 1, d, h, w)\n        sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4)    # (bs, num_kp+1, 3, d, h, w)\n        deformation = (sparse_motion * mask).sum(dim=1)            # (bs, 3, d, h, w)  mask take effect in this place\n        deformation = deformation.permute(0, 2, 3, 4, 1)           # (bs, d, h, w, 3)\n\n        out_dict['deformation'] = deformation\n\n        if self.flag_estimate_occlusion_map:\n            bs, _, d, h, w = prediction.shape\n            prediction_reshape = prediction.view(bs, -1, h, w)\n            occlusion_map = torch.sigmoid(self.occlusion(prediction_reshape))  # Bx1x64x64\n            out_dict['occlusion_map'] = occlusion_map\n\n        return out_dict\n"
  },
  {
    "path": "src/modules/motion_extractor.py",
    "content": "# coding: utf-8\n\n\"\"\"\nMotion extractor(M), which directly predicts the canonical keypoints, head pose and expression deformation of the input image\n\"\"\"\n\nfrom torch import nn\nimport torch\n\nfrom .convnextv2 import convnextv2_tiny\nfrom .util import filter_state_dict\n\nmodel_dict = {\n    'convnextv2_tiny': convnextv2_tiny,\n}\n\n\nclass MotionExtractor(nn.Module):\n    def __init__(self, **kwargs):\n        super(MotionExtractor, self).__init__()\n\n        # default is convnextv2_tiny\n        backbone = kwargs.get('backbone', 'convnextv2_tiny')\n        self.detector = model_dict.get(backbone)(**kwargs)\n\n    def load_pretrained(self, init_path: str):\n        if init_path not in (None, ''):\n            state_dict = torch.load(init_path, map_location=lambda storage, loc: storage)['model']\n            state_dict = filter_state_dict(state_dict, remove_name='head')\n            ret = self.detector.load_state_dict(state_dict, strict=False)\n            print(f'Load pretrained model from {init_path}, ret: {ret}')\n\n    def forward(self, x):\n        out = self.detector(x)\n        return out\n"
  },
  {
    "path": "src/modules/spade_generator.py",
    "content": "# coding: utf-8\n\n\"\"\"\nSpade decoder(G) defined in the paper, which takes the warped feature as input to generate the animated image.\n\"\"\"\n\nimport torch\nfrom torch import nn\nimport torch.nn.functional as F\nfrom .util import SPADEResnetBlock\n\n\nclass SPADEDecoder(nn.Module):\n    def __init__(self, upscale=1, max_features=256, block_expansion=64, out_channels=64, num_down_blocks=2):\n        for i in range(num_down_blocks):\n            input_channels = min(max_features, block_expansion * (2 ** (i + 1)))\n        self.upscale = upscale\n        super().__init__()\n        norm_G = 'spadespectralinstance'\n        label_num_channels = input_channels  # 256\n\n        self.fc = nn.Conv2d(input_channels, 2 * input_channels, 3, padding=1)\n        self.G_middle_0 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)\n        self.G_middle_1 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)\n        self.G_middle_2 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)\n        self.G_middle_3 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)\n        self.G_middle_4 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)\n        self.G_middle_5 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)\n        self.up_0 = SPADEResnetBlock(2 * input_channels, input_channels, norm_G, label_num_channels)\n        self.up_1 = SPADEResnetBlock(input_channels, out_channels, norm_G, label_num_channels)\n        self.up = nn.Upsample(scale_factor=2)\n\n        if self.upscale is None or self.upscale <= 1:\n            self.conv_img = nn.Conv2d(out_channels, 3, 3, padding=1)\n        else:\n            self.conv_img = nn.Sequential(\n                nn.Conv2d(out_channels, 3 * (2 * 2), kernel_size=3, padding=1),\n                
nn.PixelShuffle(upscale_factor=2)\n            )\n\n    def forward(self, feature):\n        seg = feature  # Bx256x64x64\n        x = self.fc(feature)  # Bx512x64x64\n        x = self.G_middle_0(x, seg)\n        x = self.G_middle_1(x, seg)\n        x = self.G_middle_2(x, seg)\n        x = self.G_middle_3(x, seg)\n        x = self.G_middle_4(x, seg)\n        x = self.G_middle_5(x, seg)\n\n        x = self.up(x)  # Bx512x64x64 -> Bx512x128x128\n        x = self.up_0(x, seg)  # Bx512x128x128 -> Bx256x128x128\n        x = self.up(x)  # Bx256x128x128 -> Bx256x256x256\n        x = self.up_1(x, seg)  # Bx256x256x256 -> Bx64x256x256\n\n        x = self.conv_img(F.leaky_relu(x, 2e-1))  # Bx64x256x256 -> Bx3xHxW\n        x = torch.sigmoid(x)  # Bx3xHxW\n\n        return x"
  },
  {
    "path": "src/modules/stitching_retargeting_network.py",
    "content": "# coding: utf-8\n\n\"\"\"\nStitching module(S) and two retargeting modules(R) defined in the paper.\n\n- The stitching module pastes the animated portrait back into the original image space without pixel misalignment, such as in\nthe stitching region.\n\n- The eyes retargeting module is designed to address the issue of incomplete eye closure during cross-id reenactment, especially\nwhen a person with small eyes drives a person with larger eyes.\n\n- The lip retargeting module is designed similarly to the eye retargeting module, and can also normalize the input by ensuring that\nthe lips are in a closed state, which facilitates better animation driving.\n\"\"\"\nfrom torch import nn\n\n\nclass StitchingRetargetingNetwork(nn.Module):\n    def __init__(self, input_size, hidden_sizes, output_size):\n        super(StitchingRetargetingNetwork, self).__init__()\n        layers = []\n        for i in range(len(hidden_sizes)):\n            if i == 0:\n                layers.append(nn.Linear(input_size, hidden_sizes[i]))\n            else:\n                layers.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))\n            layers.append(nn.ReLU(inplace=True))\n        layers.append(nn.Linear(hidden_sizes[-1], output_size))\n        self.mlp = nn.Sequential(*layers)\n\n    def initialize_weights_to_zero(self):\n        for m in self.modules():\n            if isinstance(m, nn.Linear):\n                nn.init.zeros_(m.weight)\n                nn.init.zeros_(m.bias)\n\n    def forward(self, x):\n        return self.mlp(x)\n"
  },
  {
    "path": "src/modules/util.py",
    "content": "# coding: utf-8\n\n\"\"\"\nThis file defines various neural network modules and utility functions, including convolutional and residual blocks,\nnormalizations, and functions for spatial transformation and tensor manipulation.\n\"\"\"\n\nfrom torch import nn\nimport torch.nn.functional as F\nimport torch\nimport torch.nn.utils.spectral_norm as spectral_norm\nimport math\nimport warnings\nimport collections.abc\nfrom itertools import repeat\n\ndef kp2gaussian(kp, spatial_size, kp_variance):\n    \"\"\"\n    Transform a keypoint into gaussian like representation\n    \"\"\"\n    mean = kp\n\n    coordinate_grid = make_coordinate_grid(spatial_size, mean)\n    number_of_leading_dimensions = len(mean.shape) - 1\n    shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape\n    coordinate_grid = coordinate_grid.view(*shape)\n    repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 1)\n    coordinate_grid = coordinate_grid.repeat(*repeats)\n\n    # Preprocess kp shape\n    shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 3)\n    mean = mean.view(*shape)\n\n    mean_sub = (coordinate_grid - mean)\n\n    out = torch.exp(-0.5 * (mean_sub ** 2).sum(-1) / kp_variance)\n\n    return out\n\n\ndef make_coordinate_grid(spatial_size, ref, **kwargs):\n    d, h, w = spatial_size\n    x = torch.arange(w).type(ref.dtype).to(ref.device)\n    y = torch.arange(h).type(ref.dtype).to(ref.device)\n    z = torch.arange(d).type(ref.dtype).to(ref.device)\n\n    # NOTE: must be right-down-in\n    x = (2 * (x / (w - 1)) - 1)  # the x axis faces to the right\n    y = (2 * (y / (h - 1)) - 1)  # the y axis faces to the bottom\n    z = (2 * (z / (d - 1)) - 1)  # the z axis faces to the inner\n\n    yy = y.view(1, -1, 1).repeat(d, 1, w)\n    xx = x.view(1, 1, -1).repeat(d, h, 1)\n    zz = z.view(-1, 1, 1).repeat(1, h, w)\n\n    meshed = torch.cat([xx.unsqueeze_(3), yy.unsqueeze_(3), zz.unsqueeze_(3)], 3)\n\n    return meshed\n\n\nclass 
ConvT2d(nn.Module):\n    \"\"\"\n    Upsampling block for use in decoder.\n    \"\"\"\n\n    def __init__(self, in_features, out_features, kernel_size=3, stride=2, padding=1, output_padding=1):\n        super(ConvT2d, self).__init__()\n\n        self.convT = nn.ConvTranspose2d(in_features, out_features, kernel_size=kernel_size, stride=stride,\n                                        padding=padding, output_padding=output_padding)\n        self.norm = nn.InstanceNorm2d(out_features)\n\n    def forward(self, x):\n        out = self.convT(x)\n        out = self.norm(out)\n        out = F.leaky_relu(out)\n        return out\n\n\nclass ResBlock3d(nn.Module):\n    \"\"\"\n    Res block, preserve spatial resolution.\n    \"\"\"\n\n    def __init__(self, in_features, kernel_size, padding):\n        super(ResBlock3d, self).__init__()\n        self.conv1 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)\n        self.conv2 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)\n        self.norm1 = nn.BatchNorm3d(in_features, affine=True)\n        self.norm2 = nn.BatchNorm3d(in_features, affine=True)\n\n    def forward(self, x):\n        out = self.norm1(x)\n        out = F.relu(out)\n        out = self.conv1(out)\n        out = self.norm2(out)\n        out = F.relu(out)\n        out = self.conv2(out)\n        out += x\n        return out\n\n\nclass UpBlock3d(nn.Module):\n    \"\"\"\n    Upsampling block for use in decoder.\n    \"\"\"\n\n    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):\n        super(UpBlock3d, self).__init__()\n\n        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,\n                              padding=padding, groups=groups)\n        self.norm = nn.BatchNorm3d(out_features, affine=True)\n\n    def forward(self, x):\n        out = F.interpolate(x, 
scale_factor=(1, 2, 2))\n        out = self.conv(out)\n        out = self.norm(out)\n        out = F.relu(out)\n        return out\n\n\nclass DownBlock2d(nn.Module):\n    \"\"\"\n    Downsampling block for use in encoder.\n    \"\"\"\n\n    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):\n        super(DownBlock2d, self).__init__()\n        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)\n        self.norm = nn.BatchNorm2d(out_features, affine=True)\n        self.pool = nn.AvgPool2d(kernel_size=(2, 2))\n\n    def forward(self, x):\n        out = self.conv(x)\n        out = self.norm(out)\n        out = F.relu(out)\n        out = self.pool(out)\n        return out\n\n\nclass DownBlock3d(nn.Module):\n    \"\"\"\n    Downsampling block for use in encoder.\n    \"\"\"\n\n    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):\n        super(DownBlock3d, self).__init__()\n        '''\n        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,\n                                padding=padding, groups=groups, stride=(1, 2, 2))\n        '''\n        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,\n                              padding=padding, groups=groups)\n        self.norm = nn.BatchNorm3d(out_features, affine=True)\n        self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))\n\n    def forward(self, x):\n        out = self.conv(x)\n        out = self.norm(out)\n        out = F.relu(out)\n        out = self.pool(out)\n        return out\n\n\nclass SameBlock2d(nn.Module):\n    \"\"\"\n    Simple block, preserve spatial resolution.\n    \"\"\"\n\n    def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1, lrelu=False):\n        super(SameBlock2d, self).__init__()\n        self.conv = 
nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)\n        self.norm = nn.BatchNorm2d(out_features, affine=True)\n        if lrelu:\n            self.ac = nn.LeakyReLU()\n        else:\n            self.ac = nn.ReLU()\n\n    def forward(self, x):\n        out = self.conv(x)\n        out = self.norm(out)\n        out = self.ac(out)\n        return out\n\n\nclass Encoder(nn.Module):\n    \"\"\"\n    Hourglass Encoder\n    \"\"\"\n\n    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):\n        super(Encoder, self).__init__()\n\n        down_blocks = []\n        for i in range(num_blocks):\n            down_blocks.append(DownBlock3d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)), min(max_features, block_expansion * (2 ** (i + 1))), kernel_size=3, padding=1))\n        self.down_blocks = nn.ModuleList(down_blocks)\n\n    def forward(self, x):\n        outs = [x]\n        for down_block in self.down_blocks:\n            outs.append(down_block(outs[-1]))\n        return outs\n\n\nclass Decoder(nn.Module):\n    \"\"\"\n    Hourglass Decoder\n    \"\"\"\n\n    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):\n        super(Decoder, self).__init__()\n\n        up_blocks = []\n\n        for i in range(num_blocks)[::-1]:\n            in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))\n            out_filters = min(max_features, block_expansion * (2 ** i))\n            up_blocks.append(UpBlock3d(in_filters, out_filters, kernel_size=3, padding=1))\n\n        self.up_blocks = nn.ModuleList(up_blocks)\n        self.out_filters = block_expansion + in_features\n\n        self.conv = nn.Conv3d(in_channels=self.out_filters, out_channels=self.out_filters, kernel_size=3, padding=1)\n        self.norm = nn.BatchNorm3d(self.out_filters, affine=True)\n\n    def forward(self, x):\n    
    out = x.pop()\n        for up_block in self.up_blocks:\n            out = up_block(out)\n            skip = x.pop()\n            out = torch.cat([out, skip], dim=1)\n        out = self.conv(out)\n        out = self.norm(out)\n        out = F.relu(out)\n        return out\n\n\nclass Hourglass(nn.Module):\n    \"\"\"\n    Hourglass architecture.\n    \"\"\"\n\n    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):\n        super(Hourglass, self).__init__()\n        self.encoder = Encoder(block_expansion, in_features, num_blocks, max_features)\n        self.decoder = Decoder(block_expansion, in_features, num_blocks, max_features)\n        self.out_filters = self.decoder.out_filters\n\n    def forward(self, x):\n        return self.decoder(self.encoder(x))\n\n\nclass SPADE(nn.Module):\n    def __init__(self, norm_nc, label_nc):\n        super().__init__()\n\n        self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)\n        nhidden = 128\n\n        self.mlp_shared = nn.Sequential(\n            nn.Conv2d(label_nc, nhidden, kernel_size=3, padding=1),\n            nn.ReLU())\n        self.mlp_gamma = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)\n        self.mlp_beta = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)\n\n    def forward(self, x, segmap):\n        normalized = self.param_free_norm(x)\n        segmap = F.interpolate(segmap, size=x.size()[2:], mode='nearest')\n        actv = self.mlp_shared(segmap)\n        gamma = self.mlp_gamma(actv)\n        beta = self.mlp_beta(actv)\n        out = normalized * (1 + gamma) + beta\n        return out\n\n\nclass SPADEResnetBlock(nn.Module):\n    def __init__(self, fin, fout, norm_G, label_nc, use_se=False, dilation=1):\n        super().__init__()\n        # Attributes\n        self.learned_shortcut = (fin != fout)\n        fmiddle = min(fin, fout)\n        self.use_se = use_se\n        # create conv layers\n        self.conv_0 = nn.Conv2d(fin, fmiddle, 
kernel_size=3, padding=dilation, dilation=dilation)\n        self.conv_1 = nn.Conv2d(fmiddle, fout, kernel_size=3, padding=dilation, dilation=dilation)\n        if self.learned_shortcut:\n            self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)\n        # apply spectral norm if specified\n        if 'spectral' in norm_G:\n            self.conv_0 = spectral_norm(self.conv_0)\n            self.conv_1 = spectral_norm(self.conv_1)\n            if self.learned_shortcut:\n                self.conv_s = spectral_norm(self.conv_s)\n        # define normalization layers\n        self.norm_0 = SPADE(fin, label_nc)\n        self.norm_1 = SPADE(fmiddle, label_nc)\n        if self.learned_shortcut:\n            self.norm_s = SPADE(fin, label_nc)\n\n    def forward(self, x, seg1):\n        x_s = self.shortcut(x, seg1)\n        dx = self.conv_0(self.actvn(self.norm_0(x, seg1)))\n        dx = self.conv_1(self.actvn(self.norm_1(dx, seg1)))\n        out = x_s + dx\n        return out\n\n    def shortcut(self, x, seg1):\n        if self.learned_shortcut:\n            x_s = self.conv_s(self.norm_s(x, seg1))\n        else:\n            x_s = x\n        return x_s\n\n    def actvn(self, x):\n        return F.leaky_relu(x, 2e-1)\n\n\ndef filter_state_dict(state_dict, remove_name='fc'):\n    new_state_dict = {}\n    for key in state_dict:\n        if remove_name in key:\n            continue\n        new_state_dict[key] = state_dict[key]\n    return new_state_dict\n\n\nclass GRN(nn.Module):\n    \"\"\" GRN (Global Response Normalization) layer\n    \"\"\"\n\n    def __init__(self, dim):\n        super().__init__()\n        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))\n        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))\n\n    def forward(self, x):\n        Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)\n        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)\n        return self.gamma * (x * Nx) + self.beta + x\n\n\nclass LayerNorm(nn.Module):\n    
r\"\"\" LayerNorm that supports two data formats: channels_last (default) or channels_first.\n    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with\n    shape (batch_size, height, width, channels) while channels_first corresponds to inputs\n    with shape (batch_size, channels, height, width).\n    \"\"\"\n\n    def __init__(self, normalized_shape, eps=1e-6, data_format=\"channels_last\"):\n        super().__init__()\n        self.weight = nn.Parameter(torch.ones(normalized_shape))\n        self.bias = nn.Parameter(torch.zeros(normalized_shape))\n        self.eps = eps\n        self.data_format = data_format\n        if self.data_format not in [\"channels_last\", \"channels_first\"]:\n            raise NotImplementedError\n        self.normalized_shape = (normalized_shape, )\n\n    def forward(self, x):\n        if self.data_format == \"channels_last\":\n            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)\n        elif self.data_format == \"channels_first\":\n            u = x.mean(1, keepdim=True)\n            s = (x - u).pow(2).mean(1, keepdim=True)\n            x = (x - u) / torch.sqrt(s + self.eps)\n            x = self.weight[:, None, None] * x + self.bias[:, None, None]\n            return x\n\n\ndef _no_grad_trunc_normal_(tensor, mean, std, a, b):\n    # Cut & paste from PyTorch official master until it's in a few official releases - RW\n    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf\n    def norm_cdf(x):\n        # Computes standard normal cumulative distribution function\n        return (1. + math.erf(x / math.sqrt(2.))) / 2.\n\n    if (mean < a - 2 * std) or (mean > b + 2 * std):\n        warnings.warn(\"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
\"\n                      \"The distribution of values may be incorrect.\",\n                      stacklevel=2)\n\n    with torch.no_grad():\n        # Values are generated by using a truncated uniform distribution and\n        # then using the inverse CDF for the normal distribution.\n        # Get upper and lower cdf values\n        l = norm_cdf((a - mean) / std)\n        u = norm_cdf((b - mean) / std)\n\n        # Uniformly fill tensor with values from [l, u], then translate to\n        # [2l-1, 2u-1].\n        tensor.uniform_(2 * l - 1, 2 * u - 1)\n\n        # Use inverse cdf transform for normal distribution to get truncated\n        # standard normal\n        tensor.erfinv_()\n\n        # Transform to proper mean, std\n        tensor.mul_(std * math.sqrt(2.))\n        tensor.add_(mean)\n\n        # Clamp to ensure it's in the proper range\n        tensor.clamp_(min=a, max=b)\n        return tensor\n\n\ndef drop_path(x, drop_prob=0., training=False, scale_by_keep=True):\n    \"\"\" Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n\n    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,\n    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for\n    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use\n    'survival rate' as the argument.\n\n    \"\"\"\n    if drop_prob == 0. 
or not training:\n        return x\n    keep_prob = 1 - drop_prob\n    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets\n    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)\n    if keep_prob > 0.0 and scale_by_keep:\n        random_tensor.div_(keep_prob)\n    return x * random_tensor\n\n\nclass DropPath(nn.Module):\n    \"\"\" Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).\n    \"\"\"\n\n    def __init__(self, drop_prob=None, scale_by_keep=True):\n        super(DropPath, self).__init__()\n        self.drop_prob = drop_prob\n        self.scale_by_keep = scale_by_keep\n\n    def forward(self, x):\n        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)\n\n\ndef trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):\n    return _no_grad_trunc_normal_(tensor, mean, std, a, b)\n\n# From PyTorch internals\ndef _ntuple(n):\n    def parse(x):\n        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):\n            return tuple(x)\n        return tuple(repeat(x, n))\n    return parse\n\nto_2tuple = _ntuple(2)\n"
  },
  {
    "path": "src/modules/warping_network.py",
    "content": "# coding: utf-8\n\n\"\"\"\nWarping field estimator(W) defined in the paper, which generates a warping field using the implicit\nkeypoint representations x_s and x_d, and employs this flow field to warp the source feature volume f_s.\n\"\"\"\n\nfrom torch import nn\nimport torch.nn.functional as F\nfrom .util import SameBlock2d\nfrom .dense_motion import DenseMotionNetwork\n\n\nclass WarpingNetwork(nn.Module):\n    def __init__(\n        self,\n        num_kp,\n        block_expansion,\n        max_features,\n        num_down_blocks,\n        reshape_channel,\n        estimate_occlusion_map=False,\n        dense_motion_params=None,\n        **kwargs\n    ):\n        super(WarpingNetwork, self).__init__()\n\n        self.upscale = kwargs.get('upscale', 1)\n        self.flag_use_occlusion_map = kwargs.get('flag_use_occlusion_map', True)\n\n        if dense_motion_params is not None:\n            self.dense_motion_network = DenseMotionNetwork(\n                num_kp=num_kp,\n                feature_channel=reshape_channel,\n                estimate_occlusion_map=estimate_occlusion_map,\n                **dense_motion_params\n            )\n        else:\n            self.dense_motion_network = None\n\n        self.third = SameBlock2d(max_features, block_expansion * (2 ** num_down_blocks), kernel_size=(3, 3), padding=(1, 1), lrelu=True)\n        self.fourth = nn.Conv2d(in_channels=block_expansion * (2 ** num_down_blocks), out_channels=block_expansion * (2 ** num_down_blocks), kernel_size=1, stride=1)\n\n        self.estimate_occlusion_map = estimate_occlusion_map\n\n    def deform_input(self, inp, deformation):\n        return F.grid_sample(inp, deformation, align_corners=False)\n\n    def forward(self, feature_3d, kp_driving, kp_source):\n        if self.dense_motion_network is not None:\n            # Feature warper, Transforming feature representation according to deformation and occlusion\n            dense_motion = self.dense_motion_network(\n      
          feature=feature_3d, kp_driving=kp_driving, kp_source=kp_source\n            )\n            if 'occlusion_map' in dense_motion:\n                occlusion_map = dense_motion['occlusion_map']  # Bx1x64x64\n            else:\n                occlusion_map = None\n\n            deformation = dense_motion['deformation']  # Bx16x64x64x3\n            out = self.deform_input(feature_3d, deformation)  # Bx32x16x64x64\n\n            bs, c, d, h, w = out.shape  # Bx32x16x64x64\n            out = out.view(bs, c * d, h, w)  # -> Bx512x64x64\n            out = self.third(out)  # -> Bx256x64x64\n            out = self.fourth(out)  # -> Bx256x64x64\n\n            if self.flag_use_occlusion_map and (occlusion_map is not None):\n                out = out * occlusion_map\n\n        ret_dct = {\n            'occlusion_map': occlusion_map,\n            'deformation': deformation,\n            'out': out,\n        }\n\n        return ret_dct\n"
  },
  {
    "path": "src/utils/__init__.py",
    "content": ""
  },
  {
    "path": "src/utils/animal_landmark_runner.py",
    "content": "# coding: utf-8\n\n\"\"\"\nface detection and alignment using XPose\n\"\"\"\n\nimport os\nimport pickle\nimport torch\nimport numpy as np\nfrom PIL import Image\nfrom torchvision.ops import nms\n\nfrom .timer import Timer\nfrom .rprint import rlog as log\nfrom .helper import clean_state_dict\n\nfrom .dependencies.XPose import transforms as T\nfrom .dependencies.XPose.models import build_model\nfrom .dependencies.XPose.predefined_keypoints import *\nfrom .dependencies.XPose.util import box_ops\nfrom .dependencies.XPose.util.config import Config\n\n\nclass XPoseRunner(object):\n    def __init__(self, model_config_path, model_checkpoint_path, embeddings_cache_path=None, cpu_only=False, **kwargs):\n        self.device_id = kwargs.get(\"device_id\", 0)\n        self.flag_use_half_precision = kwargs.get(\"flag_use_half_precision\", True)\n        self.device = f\"cuda:{self.device_id}\" if not cpu_only else \"cpu\"\n        self.model = self.load_animal_model(model_config_path, model_checkpoint_path, self.device)\n        self.timer = Timer()\n        # Load cached embeddings if available\n        try:\n            with open(f'{embeddings_cache_path}_9.pkl', 'rb') as f:\n                self.ins_text_embeddings_9, self.kpt_text_embeddings_9 = pickle.load(f)\n            with open(f'{embeddings_cache_path}_68.pkl', 'rb') as f:\n                self.ins_text_embeddings_68, self.kpt_text_embeddings_68 = pickle.load(f)\n            print(\"Loaded cached embeddings from file.\")\n        except Exception:\n            raise ValueError(\"Could not load clip embeddings from file, please check your file path.\")\n\n    def load_animal_model(self, model_config_path, model_checkpoint_path, device):\n        args = Config.fromfile(model_config_path)\n        args.device = device\n        model = build_model(args)\n        checkpoint = torch.load(model_checkpoint_path, map_location=lambda storage, loc: storage, weights_only=False)\n        load_res = 
model.load_state_dict(clean_state_dict(checkpoint[\"model\"]), strict=False)\n        model.eval()\n        return model\n\n    def load_image(self, input_image):\n        image_pil = input_image.convert(\"RGB\")\n        transform = T.Compose([\n            T.RandomResize([800], max_size=1333),  # NOTE: fixed size to 800\n            T.ToTensor(),\n            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),\n        ])\n        image, _ = transform(image_pil, None)\n        return image_pil, image\n\n    def get_unipose_output(self, image, instance_text_prompt, keypoint_text_prompt, box_threshold, IoU_threshold):\n        instance_list = instance_text_prompt.split(',')\n\n        if len(keypoint_text_prompt) == 9:\n            # torch.Size([1, 512]) torch.Size([9, 512])\n            ins_text_embeddings, kpt_text_embeddings = self.ins_text_embeddings_9, self.kpt_text_embeddings_9\n        elif len(keypoint_text_prompt) ==68:\n            # torch.Size([1, 512]) torch.Size([68, 512])\n            ins_text_embeddings, kpt_text_embeddings = self.ins_text_embeddings_68, self.kpt_text_embeddings_68\n        else:\n            raise ValueError(\"Invalid number of keypoint embeddings.\")\n        target = {\n            \"instance_text_prompt\": instance_list,\n            \"keypoint_text_prompt\": keypoint_text_prompt,\n            \"object_embeddings_text\": ins_text_embeddings.float(),\n            \"kpts_embeddings_text\": torch.cat((kpt_text_embeddings.float(), torch.zeros(100 - kpt_text_embeddings.shape[0], 512, device=self.device)), dim=0),\n            \"kpt_vis_text\": torch.cat((torch.ones(kpt_text_embeddings.shape[0], device=self.device), torch.zeros(100 - kpt_text_embeddings.shape[0], device=self.device)), dim=0)\n        }\n\n        self.model = self.model.to(self.device)\n        image = image.to(self.device)\n\n        with torch.no_grad():\n            with torch.autocast(device_type=self.device[:4], dtype=torch.float16, 
enabled=self.flag_use_half_precision):\n                outputs = self.model(image[None], [target])\n\n        logits = outputs[\"pred_logits\"].sigmoid()[0]\n        boxes = outputs[\"pred_boxes\"][0]\n        keypoints = outputs[\"pred_keypoints\"][0][:, :2 * len(keypoint_text_prompt)]\n\n        logits_filt = logits.cpu().clone()\n        boxes_filt = boxes.cpu().clone()\n        keypoints_filt = keypoints.cpu().clone()\n        filt_mask = logits_filt.max(dim=1)[0] > box_threshold\n        logits_filt = logits_filt[filt_mask]\n        boxes_filt = boxes_filt[filt_mask]\n        keypoints_filt = keypoints_filt[filt_mask]\n\n        keep_indices = nms(box_ops.box_cxcywh_to_xyxy(boxes_filt), logits_filt.max(dim=1)[0], iou_threshold=IoU_threshold)\n\n        filtered_boxes = boxes_filt[keep_indices]\n        filtered_keypoints = keypoints_filt[keep_indices]\n\n        return filtered_boxes, filtered_keypoints\n\n    def run(self, input_image, instance_text_prompt, keypoint_text_example, box_threshold, IoU_threshold):\n        if keypoint_text_example in globals():\n            keypoint_dict = globals()[keypoint_text_example]\n        elif instance_text_prompt in globals():\n            keypoint_dict = globals()[instance_text_prompt]\n        else:\n            keypoint_dict = globals()[\"animal\"]\n\n        keypoint_text_prompt = keypoint_dict.get(\"keypoints\")\n        keypoint_skeleton = keypoint_dict.get(\"skeleton\")\n\n        image_pil, image = self.load_image(input_image)\n        boxes_filt, keypoints_filt = self.get_unipose_output(image, instance_text_prompt, keypoint_text_prompt, box_threshold, IoU_threshold)\n\n        size = image_pil.size\n        H, W = size[1], size[0]\n        keypoints_filt = keypoints_filt[0].squeeze(0)\n        kp = np.array(keypoints_filt.cpu())\n        num_kpts = len(keypoint_text_prompt)\n        Z = kp[:num_kpts * 2] * np.array([W, H] * num_kpts)\n        Z = Z.reshape(num_kpts * 2)\n        x = Z[0::2]\n        y = 
Z[1::2]\n        return np.stack((x, y), axis=1)\n\n    def warmup(self):\n        self.timer.tic()\n\n        img_rgb = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))\n        self.run(img_rgb, 'face', 'face', box_threshold=0.0, IoU_threshold=0.0)\n\n        elapse = self.timer.toc()\n        log(f'XPoseRunner warmup time: {elapse:.3f}s')\n"
  },
  {
    "path": "src/utils/camera.py",
    "content": "# coding: utf-8\n\n\"\"\"\nfunctions for processing and transforming 3D facial keypoints\n\"\"\"\n\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\n\nPI = np.pi\n\n\ndef headpose_pred_to_degree(pred):\n    \"\"\"\n    pred: (bs, 66) or (bs, 1) or others\n    \"\"\"\n    if pred.ndim > 1 and pred.shape[1] == 66:\n        # NOTE: note that the average is modified to 97.5\n        device = pred.device\n        idx_tensor = [idx for idx in range(0, 66)]\n        idx_tensor = torch.FloatTensor(idx_tensor).to(device)\n        pred = F.softmax(pred, dim=1)\n        degree = torch.sum(pred*idx_tensor, axis=1) * 3 - 97.5\n\n        return degree\n\n    return pred\n\n\ndef get_rotation_matrix(pitch_, yaw_, roll_):\n    \"\"\" the input is in degree\n    \"\"\"\n    # transform to radian\n    pitch = pitch_ / 180 * PI\n    yaw = yaw_ / 180 * PI\n    roll = roll_ / 180 * PI\n\n    device = pitch.device\n\n    if pitch.ndim == 1:\n        pitch = pitch.unsqueeze(1)\n    if yaw.ndim == 1:\n        yaw = yaw.unsqueeze(1)\n    if roll.ndim == 1:\n        roll = roll.unsqueeze(1)\n\n    # calculate the euler matrix\n    bs = pitch.shape[0]\n    ones = torch.ones([bs, 1]).to(device)\n    zeros = torch.zeros([bs, 1]).to(device)\n    x, y, z = pitch, yaw, roll\n\n    rot_x = torch.cat([\n        ones, zeros, zeros,\n        zeros, torch.cos(x), -torch.sin(x),\n        zeros, torch.sin(x), torch.cos(x)\n    ], dim=1).reshape([bs, 3, 3])\n\n    rot_y = torch.cat([\n        torch.cos(y), zeros, torch.sin(y),\n        zeros, ones, zeros,\n        -torch.sin(y), zeros, torch.cos(y)\n    ], dim=1).reshape([bs, 3, 3])\n\n    rot_z = torch.cat([\n        torch.cos(z), -torch.sin(z), zeros,\n        torch.sin(z), torch.cos(z), zeros,\n        zeros, zeros, ones\n    ], dim=1).reshape([bs, 3, 3])\n\n    rot = rot_z @ rot_y @ rot_x\n    return rot.permute(0, 2, 1)  # transpose\n"
  },
  {
    "path": "src/utils/check_windows_port.py",
    "content": "import socket\nimport sys\n\nif len(sys.argv) != 2:\n    print(\"Usage: python check_windows_port.py <port>\")\n    sys.exit(1)\n\nport = int(sys.argv[1])\n\nsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\nsock.settimeout(1)\nresult = sock.connect_ex(('127.0.0.1', port))\n\nif result == 0:\n    print(\"LISTENING\")\nelse:\n    print(\"NOT LISTENING\")\nsock.close()\n"
  },
  {
    "path": "src/utils/crop.py",
    "content": "# coding: utf-8\n\n\"\"\"\ncropping function and the related preprocess functions for cropping\n\"\"\"\n\nimport numpy as np\nimport os.path as osp\nfrom math import sin, cos, acos, degrees\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) # NOTE: enforce single thread\nfrom .rprint import rprint as print\n\nDTYPE = np.float32\nCV2_INTERP = cv2.INTER_LINEAR\n\ndef make_abs_path(fn):\n    return osp.join(osp.dirname(osp.realpath(__file__)), fn)\n\ndef _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):\n    \"\"\" conduct similarity or affine transformation to the image, do not do border operation!\n    img:\n    M: 2x3 matrix or 3x3 matrix\n    dsize: target shape (width, height)\n    \"\"\"\n    if isinstance(dsize, tuple) or isinstance(dsize, list):\n        _dsize = tuple(dsize)\n    else:\n        _dsize = (dsize, dsize)\n\n    if borderMode is not None:\n        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))\n    else:\n        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)\n\n\ndef _transform_pts(pts, M):\n    \"\"\" conduct similarity or affine transformation to the pts\n    pts: Nx2 ndarray\n    M: 2x3 matrix or 3x3 matrix\n    return: Nx2\n    \"\"\"\n    return pts @ M[:2, :2].T + M[:2, 2]\n\n\ndef parse_pt2_from_pt101(pt101, use_lip=True):\n    \"\"\"\n    parsing the 2 points according to the 101 points, which cancels the roll\n    \"\"\"\n    # the former version use the eye center, but it is not robust, now use interpolation\n    pt_left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0)  # left eye center\n    pt_right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0)  # right eye center\n\n    if use_lip:\n        # use lip\n        pt_center_eye = (pt_left_eye + pt_right_eye) / 2\n        pt_center_lip = (pt101[75] + pt101[81]) / 2\n        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)\n    else:\n        pt2 = 
np.stack([pt_left_eye, pt_right_eye], axis=0)\n    return pt2\n\n\ndef parse_pt2_from_pt106(pt106, use_lip=True):\n    \"\"\"\n    parsing the 2 points according to the 106 points, which cancels the roll\n    \"\"\"\n    pt_left_eye = np.mean(pt106[[33, 35, 40, 39]], axis=0)  # left eye center\n    pt_right_eye = np.mean(pt106[[87, 89, 94, 93]], axis=0)  # right eye center\n\n    if use_lip:\n        # use lip\n        pt_center_eye = (pt_left_eye + pt_right_eye) / 2\n        pt_center_lip = (pt106[52] + pt106[61]) / 2\n        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)\n    else:\n        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)\n    return pt2\n\n\ndef parse_pt2_from_pt203(pt203, use_lip=True):\n    \"\"\"\n    parsing the 2 points according to the 203 points, which cancels the roll\n    \"\"\"\n    pt_left_eye = np.mean(pt203[[0, 6, 12, 18]], axis=0)  # left eye center\n    pt_right_eye = np.mean(pt203[[24, 30, 36, 42]], axis=0)  # right eye center\n    if use_lip:\n        # use lip\n        pt_center_eye = (pt_left_eye + pt_right_eye) / 2\n        pt_center_lip = (pt203[48] + pt203[66]) / 2\n        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)\n    else:\n        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)\n    return pt2\n\n\ndef parse_pt2_from_pt68(pt68, use_lip=True):\n    \"\"\"\n    parsing the 2 points according to the 68 points, which cancels the roll\n    \"\"\"\n    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1\n    if use_lip:\n        pt5 = np.stack([\n            np.mean(pt68[lm_idx[[1, 2]], :], 0),  # left eye\n            np.mean(pt68[lm_idx[[3, 4]], :], 0),  # right eye\n            pt68[lm_idx[0], :],  # nose\n            pt68[lm_idx[5], :],  # lip\n            pt68[lm_idx[6], :]   # lip\n        ], axis=0)\n\n        pt2 = np.stack([\n            (pt5[0] + pt5[1]) / 2,\n            (pt5[3] + pt5[4]) / 2\n        ], axis=0)\n    else:\n        pt2 = np.stack([\n            
np.mean(pt68[lm_idx[[1, 2]], :], 0),  # left eye\n            np.mean(pt68[lm_idx[[3, 4]], :], 0),  # right eye\n        ], axis=0)\n\n    return pt2\n\n\ndef parse_pt2_from_pt5(pt5, use_lip=True):\n    \"\"\"\n    parsing the 2 points according to the 5 points, which cancels the roll\n    \"\"\"\n    if use_lip:\n        pt2 = np.stack([\n            (pt5[0] + pt5[1]) / 2,\n            (pt5[3] + pt5[4]) / 2\n        ], axis=0)\n    else:\n        pt2 = np.stack([\n            pt5[0],\n            pt5[1]\n        ], axis=0)\n    return pt2\n\ndef parse_pt2_from_pt9(pt9, use_lip=True):\n    '''\n    parsing the 2 points according to the 9 points, which cancels the roll\n    ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip']\n    '''\n    if use_lip:\n        pt9 = np.stack([\n            (pt9[2] + pt9[3]) / 2, # left eye\n            (pt9[0] + pt9[1]) / 2, # right eye\n            pt9[4],\n            (pt9[5] + pt9[6] ) / 2 # lip\n        ], axis=0)\n        pt2 = np.stack([\n            (pt9[0] + pt9[1]) / 2, # eye\n            pt9[3] # lip\n        ], axis=0)\n    else:\n        pt2 = np.stack([\n            (pt9[2] + pt9[3]) / 2,\n            (pt9[0] + pt9[1]) / 2,\n        ], axis=0)\n\n    return pt2\n\ndef parse_pt2_from_pt_x(pts, use_lip=True):\n    if pts.shape[0] == 101:\n        pt2 = parse_pt2_from_pt101(pts, use_lip=use_lip)\n    elif pts.shape[0] == 106:\n        pt2 = parse_pt2_from_pt106(pts, use_lip=use_lip)\n    elif pts.shape[0] == 68:\n        pt2 = parse_pt2_from_pt68(pts, use_lip=use_lip)\n    elif pts.shape[0] == 5:\n        pt2 = parse_pt2_from_pt5(pts, use_lip=use_lip)\n    elif pts.shape[0] == 203:\n        pt2 = parse_pt2_from_pt203(pts, use_lip=use_lip)\n    elif pts.shape[0] > 101:\n        # take the first 101 points\n        pt2 = parse_pt2_from_pt101(pts[:101], use_lip=use_lip)\n    elif pts.shape[0] == 9:\n        pt2 = parse_pt2_from_pt9(pts, 
use_lip=use_lip)\n    else:\n        raise Exception(f'Unknow shape: {pts.shape}')\n\n    if not use_lip:\n        # NOTE: to compile with the latter code, need to rotate the pt2 90 degrees clockwise manually\n        v = pt2[1] - pt2[0]\n        pt2[1, 0] = pt2[0, 0] - v[1]\n        pt2[1, 1] = pt2[0, 1] + v[0]\n\n    return pt2\n\n\ndef parse_rect_from_landmark(\n    pts,\n    scale=1.5,\n    need_square=True,\n    vx_ratio=0,\n    vy_ratio=0,\n    use_deg_flag=False,\n    **kwargs\n):\n    \"\"\"parsing center, size, angle from 101/68/5/x landmarks\n    vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size\n    vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size, which is used to contain more forehead area\n\n    judge with pts.shape\n    \"\"\"\n    pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))\n\n    uy = pt2[1] - pt2[0]\n    l = np.linalg.norm(uy)\n    if l <= 1e-3:\n        uy = np.array([0, 1], dtype=DTYPE)\n    else:\n        uy /= l\n    ux = np.array((uy[1], -uy[0]), dtype=DTYPE)\n\n    # the rotation degree of the x-axis, the clockwise is positive, the counterclockwise is negative (image coordinate system)\n    # print(uy)\n    # print(ux)\n    angle = acos(ux[0])\n    if ux[1] < 0:\n        angle = -angle\n\n    # rotation matrix\n    M = np.array([ux, uy])\n\n    # calculate the size which contains the angle degree of the bbox, and the center\n    center0 = np.mean(pts, axis=0)\n    rpts = (pts - center0) @ M.T  # (M @ P.T).T = P @ M.T\n    lt_pt = np.min(rpts, axis=0)\n    rb_pt = np.max(rpts, axis=0)\n    center1 = (lt_pt + rb_pt) / 2\n\n    size = rb_pt - lt_pt\n    if need_square:\n        m = max(size[0], size[1])\n        size[0] = m\n        size[1] = m\n\n    size *= scale  # scale size\n    center = center0 + ux * center1[0] + uy * center1[1]  # counterclockwise rotation, equivalent to M.T @ center1.T\n    center = center + ux * (vx_ratio * size) + uy * \\\n        (vy_ratio * 
size)  # considering the offset in vx and vy direction\n\n    if use_deg_flag:\n        angle = degrees(angle)\n\n    return center, size, angle\n\n\ndef parse_bbox_from_landmark(pts, **kwargs):\n    center, size, angle = parse_rect_from_landmark(pts, **kwargs)\n    cx, cy = center\n    w, h = size\n\n    # calculate the vertex positions before rotation\n    bbox = np.array([\n        [cx-w/2, cy-h/2],  # left, top\n        [cx+w/2, cy-h/2],\n        [cx+w/2, cy+h/2],  # right, bottom\n        [cx-w/2, cy+h/2]\n    ], dtype=DTYPE)\n\n    # construct rotation matrix\n    bbox_rot = bbox.copy()\n    R = np.array([\n        [np.cos(angle), -np.sin(angle)],\n        [np.sin(angle),  np.cos(angle)]\n    ], dtype=DTYPE)\n\n    # calculate the relative position of each vertex from the rotation center, then rotate these positions, and finally add the coordinates of the rotation center\n    bbox_rot = (bbox_rot - center) @ R.T + center\n\n    return {\n        'center': center,  # 2x1\n        'size': size,  # scalar\n        'angle': angle,  # rad, counterclockwise\n        'bbox': bbox,  # 4x2\n        'bbox_rot': bbox_rot,  # 4x2\n    }\n\n\ndef crop_image_by_bbox(img, bbox, lmk=None, dsize=512, angle=None, flag_rot=False, **kwargs):\n    left, top, right, bot = bbox\n    if int(right - left) != int(bot - top):\n        print(f'right-left {right-left} != bot-top {bot-top}')\n    size = right - left\n\n    src_center = np.array([(left + right) / 2, (top + bot) / 2], dtype=DTYPE)\n    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)\n\n    s = dsize / size  # scale\n    if flag_rot and angle is not None:\n        costheta, sintheta = cos(angle), sin(angle)\n        cx, cy = src_center[0], src_center[1]  # ori center\n        tcx, tcy = tgt_center[0], tgt_center[1]  # target center\n        # need to infer\n        M_o2c = np.array(\n            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],\n             [-s * sintheta, s * costheta, 
tcy - s * (-sintheta * cx + costheta * cy)]],\n            dtype=DTYPE\n        )\n    else:\n        M_o2c = np.array(\n            [[s, 0, tgt_center[0] - s * src_center[0]],\n             [0, s, tgt_center[1] - s * src_center[1]]],\n            dtype=DTYPE\n        )\n\n    # if flag_rot and angle is None:\n        # print('angle is None, but flag_rotate is True', style=\"bold yellow\")\n\n    img_crop = _transform_img(img, M_o2c, dsize=dsize, borderMode=kwargs.get('borderMode', None))\n    lmk_crop = _transform_pts(lmk, M_o2c) if lmk is not None else None\n\n    M_o2c = np.vstack([M_o2c, np.array([0, 0, 1], dtype=DTYPE)])\n    M_c2o = np.linalg.inv(M_o2c)\n\n    # cv2.imwrite('crop.jpg', img_crop)\n\n    return {\n        'img_crop': img_crop,\n        'lmk_crop': lmk_crop,\n        'M_o2c': M_o2c,\n        'M_c2o': M_c2o,\n    }\n\n\ndef _estimate_similar_transform_from_pts(\n    pts,\n    dsize,\n    scale=1.5,\n    vx_ratio=0,\n    vy_ratio=-0.1,\n    flag_do_rot=True,\n    **kwargs\n):\n    \"\"\" calculate the affine matrix of the cropped image from sparse points, the original image to the cropped image, the inverse is the cropped image to the original image\n    pts: landmark, 101 or 68 points or other points, Nx2\n    scale: the larger scale factor, the smaller face ratio\n    vx_ratio: x shift\n    vy_ratio: y shift, the smaller the y shift, the lower the face region\n    rot_flag: if it is true, conduct correction\n    \"\"\"\n    center, size, angle = parse_rect_from_landmark(\n        pts, scale=scale, vx_ratio=vx_ratio, vy_ratio=vy_ratio,\n        use_lip=kwargs.get('use_lip', True)\n    )\n\n    s = dsize / size[0]  # scale\n    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)  # center of dsize\n\n    if flag_do_rot:\n        costheta, sintheta = cos(angle), sin(angle)\n        cx, cy = center[0], center[1]  # ori center\n        tcx, tcy = tgt_center[0], tgt_center[1]  # target center\n        # need to infer\n        M_INV = 
np.array(\n            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],\n             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],\n            dtype=DTYPE\n        )\n    else:\n        M_INV = np.array(\n            [[s, 0, tgt_center[0] - s * center[0]],\n             [0, s, tgt_center[1] - s * center[1]]],\n            dtype=DTYPE\n        )\n\n    M_INV_H = np.vstack([M_INV, np.array([0, 0, 1])])\n    M = np.linalg.inv(M_INV_H)\n\n    # M_INV is from the original image to the cropped image, M is from the cropped image to the original image\n    return M_INV, M[:2, ...]\n\n\ndef crop_image(img, pts: np.ndarray, **kwargs):\n    dsize = kwargs.get('dsize', 224)\n    scale = kwargs.get('scale', 1.5)  # 1.5 | 1.6\n    vy_ratio = kwargs.get('vy_ratio', -0.1)  # -0.0625 | -0.1\n\n    M_INV, _ = _estimate_similar_transform_from_pts(\n        pts,\n        dsize=dsize,\n        scale=scale,\n        vy_ratio=vy_ratio,\n        flag_do_rot=kwargs.get('flag_do_rot', True),\n    )\n\n    img_crop = _transform_img(img, M_INV, dsize)  # origin to crop\n    pt_crop = _transform_pts(pts, M_INV)\n\n    M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])\n    M_c2o = np.linalg.inv(M_o2c)\n\n    ret_dct = {\n        'M_o2c': M_o2c,  # from the original image to the cropped image 3x3\n        'M_c2o': M_c2o,  # from the cropped image to the original image 3x3\n        'img_crop': img_crop,  # the cropped image\n        'pt_crop': pt_crop,  # the landmarks of the cropped image\n    }\n\n    return ret_dct\n\ndef average_bbox_lst(bbox_lst):\n    if len(bbox_lst) == 0:\n        return None\n    bbox_arr = np.array(bbox_lst)\n    return np.mean(bbox_arr, axis=0).tolist()\n\ndef prepare_paste_back(mask_crop, crop_M_c2o, dsize):\n    \"\"\"prepare mask for later image paste back\n    \"\"\"\n    mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)\n    mask_ori = mask_ori.astype(np.float32) / 255.\n    return 
mask_ori\n\ndef paste_back(img_crop, M_c2o, img_ori, mask_ori):\n    \"\"\"paste back the image\n    \"\"\"\n    dsize = (img_ori.shape[1], img_ori.shape[0])\n    result = _transform_img(img_crop, M_c2o, dsize=dsize)\n    result = np.clip(mask_ori * result + (1 - mask_ori) * img_ori, 0, 255).astype(np.uint8)\n    return result\n"
  },
  {
    "path": "src/utils/cropper.py",
    "content": "# coding: utf-8\n\nimport os.path as osp\nimport torch\nimport numpy as np\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)\n\nfrom PIL import Image\nfrom typing import List, Tuple, Union\nfrom dataclasses import dataclass, field\n\nfrom ..config.crop_config import CropConfig\nfrom .crop import (\n    average_bbox_lst,\n    crop_image,\n    crop_image_by_bbox,\n    parse_bbox_from_landmark,\n)\nfrom .io import contiguous\nfrom .rprint import rlog as log\nfrom .face_analysis_diy import FaceAnalysisDIY\nfrom .human_landmark_runner import LandmarkRunner as HumanLandmark\n\ndef make_abs_path(fn):\n    return osp.join(osp.dirname(osp.realpath(__file__)), fn)\n\n\n@dataclass\nclass Trajectory:\n    start: int = -1  # start frame\n    end: int = -1  # end frame\n    lmk_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # lmk list\n    bbox_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # bbox list\n    M_c2o_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # M_c2o list\n\n    frame_rgb_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame list\n    lmk_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # lmk list\n    frame_rgb_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame crop list\n\n\nclass Cropper(object):\n    def __init__(self, **kwargs) -> None:\n        self.crop_cfg: CropConfig = kwargs.get(\"crop_cfg\", None)\n        self.image_type = kwargs.get(\"image_type\", 'human_face')\n        device_id = kwargs.get(\"device_id\", 0)\n        flag_force_cpu = kwargs.get(\"flag_force_cpu\", False)\n        if flag_force_cpu:\n            device = \"cpu\"\n            face_analysis_wrapper_provider = [\"CPUExecutionProvider\"]\n        else:\n            try:\n                if torch.backends.mps.is_available():\n                    # Shape inference currently fails with CoreMLExecutionProvider\n                   
 # for the retinaface model\n                    device = \"mps\"\n                    face_analysis_wrapper_provider = [\"CPUExecutionProvider\"]\n                else:\n                    device = \"cuda\"\n                    face_analysis_wrapper_provider = [\"CUDAExecutionProvider\"]\n            except:\n                    device = \"cuda\"\n                    face_analysis_wrapper_provider = [\"CUDAExecutionProvider\"]\n        self.face_analysis_wrapper = FaceAnalysisDIY(\n                    name=\"buffalo_l\",\n                    root=self.crop_cfg.insightface_root,\n                    providers=face_analysis_wrapper_provider,\n                )\n        self.face_analysis_wrapper.prepare(ctx_id=device_id, det_size=(512, 512), det_thresh=self.crop_cfg.det_thresh)\n        self.face_analysis_wrapper.warmup()\n\n        self.human_landmark_runner = HumanLandmark(\n            ckpt_path=self.crop_cfg.landmark_ckpt_path,\n            onnx_provider=device,\n            device_id=device_id,\n        )\n        self.human_landmark_runner.warmup()\n\n        if self.image_type == \"animal_face\":\n            from .animal_landmark_runner import XPoseRunner as AnimalLandmarkRunner\n            self.animal_landmark_runner = AnimalLandmarkRunner(\n                    model_config_path=self.crop_cfg.xpose_config_file_path,\n                    model_checkpoint_path=self.crop_cfg.xpose_ckpt_path,\n                    embeddings_cache_path=self.crop_cfg.xpose_embedding_cache_path,\n                    flag_use_half_precision=kwargs.get(\"flag_use_half_precision\", True),\n                )\n            self.animal_landmark_runner.warmup()\n\n    def update_config(self, user_args):\n        for k, v in user_args.items():\n            if hasattr(self.crop_cfg, k):\n                setattr(self.crop_cfg, k, v)\n\n    def crop_source_image(self, img_rgb_: np.ndarray, crop_cfg: CropConfig):\n        # crop a source image and get necessary information\n        img_rgb 
= img_rgb_.copy()  # copy it\n        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)\n\n        if self.image_type == \"human_face\":\n            src_face = self.face_analysis_wrapper.get(\n                img_bgr,\n                flag_do_landmark_2d_106=True,\n                direction=crop_cfg.direction,\n                max_face_num=crop_cfg.max_face_num,\n            )\n\n            if len(src_face) == 0:\n                log(\"No face detected in the source image.\")\n                return None\n            elif len(src_face) > 1:\n                log(f\"More than one face detected in the image, only pick one face by rule {crop_cfg.direction}.\")\n\n            # NOTE: temporarily only pick the first face, to support multiple face in the future\n            src_face = src_face[0]\n            lmk = src_face.landmark_2d_106  # this is the 106 landmarks from insightface\n        else:\n            tmp_dct = {\n                'animal_face_9': 'animal_face',\n                'animal_face_68': 'face'\n            }\n\n            img_rgb_pil = Image.fromarray(img_rgb)\n            lmk = self.animal_landmark_runner.run(\n                img_rgb_pil,\n                'face',\n                tmp_dct[crop_cfg.animal_face_type],\n                0,\n                0\n            )\n\n        # crop the face\n        ret_dct = crop_image(\n            img_rgb,  # ndarray\n            lmk,  # 106x2 or Nx2\n            dsize=crop_cfg.dsize,\n            scale=crop_cfg.scale,\n            vx_ratio=crop_cfg.vx_ratio,\n            vy_ratio=crop_cfg.vy_ratio,\n            flag_do_rot=crop_cfg.flag_do_rot,\n        )\n\n        # update a 256x256 version for network input\n        ret_dct[\"img_crop_256x256\"] = cv2.resize(ret_dct[\"img_crop\"], (256, 256), interpolation=cv2.INTER_AREA)\n        if self.image_type == \"human_face\":\n            lmk = self.human_landmark_runner.run(img_rgb, lmk)\n            ret_dct[\"lmk_crop\"] = lmk\n            
ret_dct[\"lmk_crop_256x256\"] = ret_dct[\"lmk_crop\"] * 256 / crop_cfg.dsize\n        else:\n            # 68x2 or 9x2\n            ret_dct[\"lmk_crop\"] = lmk\n\n        return ret_dct\n\n    def calc_lmk_from_cropped_image(self, img_rgb_, **kwargs):\n        direction = kwargs.get(\"direction\", \"large-small\")\n        src_face = self.face_analysis_wrapper.get(\n            contiguous(img_rgb_[..., ::-1]),  # convert to BGR\n            flag_do_landmark_2d_106=True,\n            direction=direction,\n        )\n        if len(src_face) == 0:\n            log(\"No face detected in the source image.\")\n            return None\n        elif len(src_face) > 1:\n            log(f\"More than one face detected in the image, only pick one face by rule {direction}.\")\n        src_face = src_face[0]\n        lmk = src_face.landmark_2d_106\n        lmk = self.human_landmark_runner.run(img_rgb_, lmk)\n\n        return lmk\n\n    # TODO: support skipping frame with NO FACE\n    def crop_source_video(self, source_rgb_lst, crop_cfg: CropConfig, **kwargs):\n        \"\"\"Tracking based landmarks/alignment and cropping\"\"\"\n        trajectory = Trajectory()\n        direction = kwargs.get(\"direction\", \"large-small\")\n        for idx, frame_rgb in enumerate(source_rgb_lst):\n            if idx == 0 or trajectory.start == -1:\n                src_face = self.face_analysis_wrapper.get(\n                    contiguous(frame_rgb[..., ::-1]),\n                    flag_do_landmark_2d_106=True,\n                    direction=crop_cfg.direction,\n                    max_face_num=crop_cfg.max_face_num,\n                )\n                if len(src_face) == 0:\n                    log(f\"No face detected in the frame #{idx}\")\n                    continue\n                elif len(src_face) > 1:\n                    log(f\"More than one face detected in the source frame_{idx}, only pick one face by rule {direction}.\")\n                src_face = src_face[0]\n                lmk 
= src_face.landmark_2d_106\n                lmk = self.human_landmark_runner.run(frame_rgb, lmk)\n                trajectory.start, trajectory.end = idx, idx\n            else:\n                # TODO: add IOU check for tracking\n                lmk = self.human_landmark_runner.run(frame_rgb, trajectory.lmk_lst[-1])\n                trajectory.end = idx\n\n            trajectory.lmk_lst.append(lmk)\n\n            # crop the face\n            ret_dct = crop_image(\n                frame_rgb,  # ndarray\n                lmk,  # 106x2 or Nx2\n                dsize=crop_cfg.dsize,\n                scale=crop_cfg.scale,\n                vx_ratio=crop_cfg.vx_ratio,\n                vy_ratio=crop_cfg.vy_ratio,\n                flag_do_rot=crop_cfg.flag_do_rot,\n            )\n\n            # update a 256x256 version for network input\n            ret_dct[\"img_crop_256x256\"] = cv2.resize(ret_dct[\"img_crop\"], (256, 256), interpolation=cv2.INTER_AREA)\n            ret_dct[\"lmk_crop_256x256\"] = ret_dct[\"pt_crop\"] * 256 / crop_cfg.dsize\n\n            trajectory.frame_rgb_crop_lst.append(ret_dct[\"img_crop_256x256\"])\n            trajectory.lmk_crop_lst.append(ret_dct[\"lmk_crop_256x256\"])\n            trajectory.M_c2o_lst.append(ret_dct['M_c2o'])\n\n        return {\n            \"frame_crop_lst\": trajectory.frame_rgb_crop_lst,\n            \"lmk_crop_lst\": trajectory.lmk_crop_lst,\n            \"M_c2o_lst\": trajectory.M_c2o_lst,\n        }\n\n    def crop_driving_video(self, driving_rgb_lst, **kwargs):\n        \"\"\"Tracking based landmarks/alignment and cropping\"\"\"\n        trajectory = Trajectory()\n        direction = kwargs.get(\"direction\", \"large-small\")\n        for idx, frame_rgb in enumerate(driving_rgb_lst):\n            if idx == 0 or trajectory.start == -1:\n                src_face = self.face_analysis_wrapper.get(\n                    contiguous(frame_rgb[..., ::-1]),\n                    flag_do_landmark_2d_106=True,\n                    
direction=direction,\n                )\n                if len(src_face) == 0:\n                    log(f\"No face detected in the frame #{idx}\")\n                    continue\n                elif len(src_face) > 1:\n                    log(f\"More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.\")\n                src_face = src_face[0]\n                lmk = src_face.landmark_2d_106\n                lmk = self.human_landmark_runner.run(frame_rgb, lmk)\n                trajectory.start, trajectory.end = idx, idx\n            else:\n                lmk = self.human_landmark_runner.run(frame_rgb, trajectory.lmk_lst[-1])\n                trajectory.end = idx\n\n            trajectory.lmk_lst.append(lmk)\n            ret_bbox = parse_bbox_from_landmark(\n                lmk,\n                scale=self.crop_cfg.scale_crop_driving_video,\n                vx_ratio_crop_driving_video=self.crop_cfg.vx_ratio_crop_driving_video,\n                vy_ratio=self.crop_cfg.vy_ratio_crop_driving_video,\n            )[\"bbox\"]\n            bbox = [\n                ret_bbox[0, 0],\n                ret_bbox[0, 1],\n                ret_bbox[2, 0],\n                ret_bbox[2, 1],\n            ]  # 4,\n            trajectory.bbox_lst.append(bbox)  # bbox\n            trajectory.frame_rgb_lst.append(frame_rgb)\n\n        global_bbox = average_bbox_lst(trajectory.bbox_lst)\n\n        for idx, (frame_rgb, lmk) in enumerate(zip(trajectory.frame_rgb_lst, trajectory.lmk_lst)):\n            ret_dct = crop_image_by_bbox(\n                frame_rgb,\n                global_bbox,\n                lmk=lmk,\n                dsize=kwargs.get(\"dsize\", 512),\n                flag_rot=False,\n                borderValue=(0, 0, 0),\n            )\n            trajectory.frame_rgb_crop_lst.append(ret_dct[\"img_crop\"])\n            trajectory.lmk_crop_lst.append(ret_dct[\"lmk_crop\"])\n\n        return {\n            \"frame_crop_lst\": 
trajectory.frame_rgb_crop_lst,\n            \"lmk_crop_lst\": trajectory.lmk_crop_lst,\n        }\n\n\n    def calc_lmks_from_cropped_video(self, driving_rgb_crop_lst, **kwargs):\n        \"\"\"Tracking based landmarks/alignment\"\"\"\n        trajectory = Trajectory()\n        direction = kwargs.get(\"direction\", \"large-small\")\n\n        for idx, frame_rgb_crop in enumerate(driving_rgb_crop_lst):\n            if idx == 0 or trajectory.start == -1:\n                src_face = self.face_analysis_wrapper.get(\n                    contiguous(frame_rgb_crop[..., ::-1]),  # convert to BGR\n                    flag_do_landmark_2d_106=True,\n                    direction=direction,\n                )\n                if len(src_face) == 0:\n                    log(f\"No face detected in the frame #{idx}\")\n                    raise Exception(f\"No face detected in the frame #{idx}\")\n                elif len(src_face) > 1:\n                    log(f\"More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.\")\n                src_face = src_face[0]\n                lmk = src_face.landmark_2d_106\n                lmk = self.human_landmark_runner.run(frame_rgb_crop, lmk)\n                trajectory.start, trajectory.end = idx, idx\n            else:\n                lmk = self.human_landmark_runner.run(frame_rgb_crop, trajectory.lmk_lst[-1])\n                trajectory.end = idx\n\n            trajectory.lmk_lst.append(lmk)\n        return trajectory.lmk_lst\n"
  },
  {
    "path": "src/utils/dependencies/XPose/config_model/UniPose_SwinT.py",
    "content": "_base_ = ['coco_transformer.py']\n\nuse_label_enc = True\n\nnum_classes=2\n\nlr = 0.0001\nparam_dict_type = 'default'\nlr_backbone = 1e-05\nlr_backbone_names = ['backbone.0']\nlr_linear_proj_names = ['reference_points', 'sampling_offsets']\nlr_linear_proj_mult = 0.1\nddetr_lr_param = False\nbatch_size = 2\nweight_decay = 0.0001\nepochs = 12\nlr_drop = 11\nsave_checkpoint_interval = 100\nclip_max_norm = 0.1\nonecyclelr = False\nmulti_step_lr = False\nlr_drop_list = [33, 45]\n\n\nmodelname = 'UniPose'\nfrozen_weights = None\nbackbone = 'swin_T_224_1k'\n\n\ndilation = False\nposition_embedding = 'sine'\npe_temperatureH = 20\npe_temperatureW = 20\nreturn_interm_indices = [1, 2, 3]\nbackbone_freeze_keywords = None\nenc_layers = 6\ndec_layers = 6\nunic_layers = 0\npre_norm = False\ndim_feedforward = 2048\nhidden_dim = 256\ndropout = 0.0\nnheads = 8\nnum_queries = 900\nquery_dim = 4\nnum_patterns = 0\npdetr3_bbox_embed_diff_each_layer = False\npdetr3_refHW = -1\nrandom_refpoints_xy = False\nfix_refpoints_hw = -1\ndabdetr_yolo_like_anchor_update = False\ndabdetr_deformable_encoder = False\ndabdetr_deformable_decoder = False\nuse_deformable_box_attn = False\nbox_attn_type = 'roi_align'\ndec_layer_number = None\nnum_feature_levels = 4\nenc_n_points = 4\ndec_n_points = 4\ndecoder_layer_noise = False\ndln_xy_noise = 0.2\ndln_hw_noise = 0.2\nadd_channel_attention = False\nadd_pos_value = False\ntwo_stage_type = 'standard'\ntwo_stage_pat_embed = 0\ntwo_stage_add_query_num = 0\ntwo_stage_bbox_embed_share = False\ntwo_stage_class_embed_share = False\ntwo_stage_learn_wh = False\ntwo_stage_default_hw = 0.05\ntwo_stage_keep_all_tokens = False\nnum_select = 50\ntransformer_activation = 'relu'\nbatch_norm_type = 'FrozenBatchNorm2d'\nmasks = False\n\ndecoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']\nmatcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher\ndecoder_module_seq = ['sa', 'ca', 'ffn']\nnms_iou_threshold = -1\n\ndec_pred_bbox_embed_share = 
True\ndec_pred_class_embed_share = True\n\n\nuse_dn = True\ndn_number = 100\ndn_box_noise_scale = 1.0\ndn_label_noise_ratio = 0.5\ndn_label_coef=1.0\ndn_bbox_coef=1.0\nembed_init_tgt = True\ndn_labelbook_size = 2000\n\nmatch_unstable_error = True\n\n# for ema\nuse_ema = True\nema_decay = 0.9997\nema_epoch = 0\n\nuse_detached_boxes_dec_out = False\n\nmax_text_len = 256\nshuffle_type = None\n\nuse_text_enhancer = True\nuse_fusion_layer = True\n\nuse_checkpoint = False # True\nuse_transformer_ckpt = True\ntext_encoder_type = 'bert-base-uncased'\n\nuse_text_cross_attention = True\ntext_dropout = 0.0\nfusion_dropout = 0.0\nfusion_droppath = 0.1\n\nnum_body_points=68\nbinary_query_selection = False\nuse_cdn = True\nffn_extra_layernorm = False\n\nfix_size=False\n"
  },
  {
    "path": "src/utils/dependencies/XPose/config_model/coco_transformer.py",
    "content": "data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]\ndata_aug_max_size = 1333\ndata_aug_scales2_resize = [400, 500, 600]\ndata_aug_scales2_crop = [384, 600]\n\n\ndata_aug_scale_overlap = None\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/__init__.py",
    "content": "# ------------------------------------------------------------------------\n# Conditional DETR\n# Copyright (c) 2021 Microsoft. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Copied from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n# ------------------------------------------------------------------------\n\nfrom .unipose import build_unipose\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/attention.py",
    "content": "# ------------------------------------------------------------------------\n# UniPose\n# url: https://github.com/IDEA-Research/UniPose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Conditional DETR\n# Copyright (c) 2021 Microsoft. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Modified from codes in torch.nn\n# ------------------------------------------------------------------------\n\n\"\"\"\nMultiheadAttention that support query, key, and value to have different dimensions.\nQuery, key, and value projections are removed.\n\nMostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873\nand https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837\n\"\"\"\n\nimport warnings\nimport torch\nfrom torch.nn.modules.linear import Linear\nfrom torch.nn.init import constant_\nfrom torch.nn.modules.module import Module\nfrom torch._jit_internal import Optional, Tuple\ntry:\n    from torch.overrides import has_torch_function, handle_torch_function\nexcept:\n    from torch._overrides import has_torch_function, handle_torch_function\nfrom torch.nn.functional import linear, pad, softmax, dropout\nTensor = torch.Tensor\n\nclass MultiheadAttention(Module):\n    r\"\"\"Allows the model to jointly attend to information\n    from different representation subspaces.\n    See reference: Attention Is All You Need\n    .. 
math::\n        \\text{MultiHead}(Q, K, V) = \\text{Concat}(head_1,\\dots,head_h)W^O\n        \\text{where} head_i = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V)\n    Args:\n        embed_dim: total dimension of the model.\n        num_heads: parallel attention heads.\n        dropout: a Dropout layer on attn_output_weights. Default: 0.0.\n        bias: add bias as module parameter. Default: True.\n        add_bias_kv: add bias to the key and value sequences at dim=0.\n        add_zero_attn: add a new batch of zeros to the key and\n                       value sequences at dim=1.\n        kdim: total number of features in key. Default: None.\n        vdim: total number of features in value. Default: None.\n        Note: if kdim and vdim are None, they will be set to embed_dim such that\n        query, key, and value have the same number of features.\n    Examples::\n        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)\n        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)\n    \"\"\"\n    bias_k: Optional[torch.Tensor]\n    bias_v: Optional[torch.Tensor]\n\n    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):\n        super(MultiheadAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim\n\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        vdim = vdim if vdim is not None else embed_dim\n        self.out_proj = Linear(vdim , vdim)\n\n        self.in_proj_bias = None\n        self.in_proj_weight = None\n        self.bias_k = self.bias_v = None\n        
self.q_proj_weight = None\n        self.k_proj_weight = None\n        self.v_proj_weight = None\n\n        self.add_zero_attn = add_zero_attn\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        constant_(self.out_proj.bias, 0.)\n\n    def __setstate__(self, state):\n        # Support loading old MultiheadAttention checkpoints generated by v1.1.0\n        if '_qkv_same_embed_dim' not in state:\n            state['_qkv_same_embed_dim'] = True\n\n        super(MultiheadAttention, self).__setstate__(state)\n\n    def forward(self, query, key, value, key_padding_mask=None,\n                need_weights=True, attn_mask=None):\n        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]\n        r\"\"\"\n    Args:\n        query, key, value: map a query and a set of key-value pairs to an output.\n            See \"Attention Is All You Need\" for more details.\n        key_padding_mask: if provided, specified padding elements in the key will\n            be ignored by the attention. When given a binary mask and a value is True,\n            the corresponding value on the attention layer will be ignored. When given\n            a byte mask and a value is non-zero, the corresponding value on the attention\n            layer will be ignored\n        need_weights: output attn_output_weights.\n        attn_mask: 2D or 3D mask that prevents attention to certain positions. 
A 2D mask will be broadcasted for all\n            the batches while a 3D mask allows to specify a different mask for the entries of each batch.\n    Shape:\n        - Inputs:\n        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is\n          the embedding dimension.\n        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is\n          the embedding dimension.\n        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is\n          the embedding dimension.\n        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.\n          If a ByteTensor is provided, the non-zero positions will be ignored while the zero\n          positions will be unchanged. If a BoolTensor is provided, the positions with the\n          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.\n        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.\n          3D mask :math:`(N*\\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length,\n          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked\n          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend\n          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``\n          are not allowed to attend while ``False`` values will be unchanged. 
If a FloatTensor\n          is provided, it will be added to the attention weight.\n        - Outputs:\n        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,\n          E is the embedding dimension.\n        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,\n          L is the target sequence length, S is the source sequence length.\n        \"\"\"\n        if not self._qkv_same_embed_dim:\n            return multi_head_attention_forward(\n                query, key, value, self.embed_dim, self.num_heads,\n                self.in_proj_weight, self.in_proj_bias,\n                self.bias_k, self.bias_v, self.add_zero_attn,\n                self.dropout, self.out_proj.weight, self.out_proj.bias,\n                training=self.training,\n                key_padding_mask=key_padding_mask, need_weights=need_weights,\n                attn_mask=attn_mask, use_separate_proj_weight=True,\n                q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,\n                v_proj_weight=self.v_proj_weight, out_dim=self.vdim)\n        else:\n            return multi_head_attention_forward(\n                query, key, value, self.embed_dim, self.num_heads,\n                self.in_proj_weight, self.in_proj_bias,\n                self.bias_k, self.bias_v, self.add_zero_attn,\n                self.dropout, self.out_proj.weight, self.out_proj.bias,\n                training=self.training,\n                key_padding_mask=key_padding_mask, need_weights=need_weights,\n                attn_mask=attn_mask, out_dim=self.vdim)\n\n\ndef multi_head_attention_forward(query: Tensor,\n                                 key: Tensor,\n                                 value: Tensor,\n                                 embed_dim_to_check: int,\n                                 num_heads: int,\n                                 in_proj_weight: Tensor,\n                                 in_proj_bias: Tensor,\n           
                      bias_k: Optional[Tensor],\n                                 bias_v: Optional[Tensor],\n                                 add_zero_attn: bool,\n                                 dropout_p: float,\n                                 out_proj_weight: Tensor,\n                                 out_proj_bias: Tensor,\n                                 training: bool = True,\n                                 key_padding_mask: Optional[Tensor] = None,\n                                 need_weights: bool = True,\n                                 attn_mask: Optional[Tensor] = None,\n                                 use_separate_proj_weight: bool = False,\n                                 q_proj_weight: Optional[Tensor] = None,\n                                 k_proj_weight: Optional[Tensor] = None,\n                                 v_proj_weight: Optional[Tensor] = None,\n                                 static_k: Optional[Tensor] = None,\n                                 static_v: Optional[Tensor] = None,\n                                 out_dim: Optional[Tensor] = None\n                                 ) -> Tuple[Tensor, Optional[Tensor]]:\n    r\"\"\"\n    Args:\n        query, key, value: map a query and a set of key-value pairs to an output.\n            See \"Attention Is All You Need\" for more details.\n        embed_dim_to_check: total dimension of the model.\n        num_heads: parallel attention heads.\n        in_proj_weight, in_proj_bias: input projection weight and bias.\n        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.\n        add_zero_attn: add a new batch of zeros to the key and\n                       value sequences at dim=1.\n        dropout_p: probability of an element to be zeroed.\n        out_proj_weight, out_proj_bias: the output projection weight and bias.\n        training: apply dropout if is ``True``.\n        key_padding_mask: if provided, specified padding elements in the key will\n           
 be ignored by the attention. This is a binary mask. When the value is True,\n            the corresponding value on the attention layer will be filled with -inf.\n        need_weights: output attn_output_weights.\n        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all\n            the batches while a 3D mask allows to specify a different mask for the entries of each batch.\n        use_separate_proj_weight: the function accepts the proj. weights for query, key,\n            and value in different forms. If false, in_proj_weight will be used, which is\n            a combination of q_proj_weight, k_proj_weight, v_proj_weight.\n        q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.\n        static_k, static_v: static key and value used for attention operators.\n    Shape:\n        Inputs:\n        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is\n          the embedding dimension.\n        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is\n          the embedding dimension.\n        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is\n          the embedding dimension.\n        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.\n          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions\n          will be unchanged. If a BoolTensor is provided, the positions with the\n          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.\n        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.\n          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,\n          S is the source sequence length. 
attn_mask ensures that position i is allowed to attend the unmasked\n          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend\n          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``\n          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor\n          is provided, it will be added to the attention weight.\n        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,\n          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.\n        - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,\n          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.\n        Outputs:\n        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,\n          E is the embedding dimension.\n        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,\n          L is the target sequence length, S is the source sequence length.\n    \"\"\"\n    if not torch.jit.is_scripting():\n        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,\n                    out_proj_weight, out_proj_bias)\n        if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):\n            return handle_torch_function(\n                multi_head_attention_forward, tens_ops, query, key, value,\n                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,\n                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,\n                out_proj_bias, training=training, key_padding_mask=key_padding_mask,\n                need_weights=need_weights, attn_mask=attn_mask,\n                use_separate_proj_weight=use_separate_proj_weight,\n                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,\n     
           v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)\n    tgt_len, bsz, embed_dim = query.size()\n    assert embed_dim == embed_dim_to_check\n    # allow MHA to have different sizes for the feature dimension\n    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)\n\n    head_dim = embed_dim // num_heads\n    v_head_dim = out_dim // num_heads\n    assert head_dim * num_heads == embed_dim, \"embed_dim must be divisible by num_heads\"\n    scaling = float(head_dim) ** -0.5\n\n    q = query * scaling\n    k = key\n    v = value\n\n    if attn_mask is not None:\n        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \\\n            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \\\n            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)\n        if attn_mask.dtype == torch.uint8:\n            warnings.warn(\"Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.\")\n            attn_mask = attn_mask.to(torch.bool)\n\n        if attn_mask.dim() == 2:\n            attn_mask = attn_mask.unsqueeze(0)\n            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:\n                raise RuntimeError('The size of the 2D attn_mask is not correct.')\n        elif attn_mask.dim() == 3:\n            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:\n                raise RuntimeError('The size of the 3D attn_mask is not correct.')\n        else:\n            raise RuntimeError(\"attn_mask's dimension {} is not supported\".format(attn_mask.dim()))\n        # attn_mask's dim is 3 now.\n\n    # convert ByteTensor key_padding_mask to bool\n    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:\n        warnings.warn(\"Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. 
Use bool tensor instead.\")\n        key_padding_mask = key_padding_mask.to(torch.bool)\n\n    if bias_k is not None and bias_v is not None:\n        if static_k is None and static_v is None:\n            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])\n            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])\n            if attn_mask is not None:\n                attn_mask = pad(attn_mask, (0, 1))\n            if key_padding_mask is not None:\n                key_padding_mask = pad(key_padding_mask, (0, 1))\n        else:\n            assert static_k is None, \"bias cannot be added to static key.\"\n            assert static_v is None, \"bias cannot be added to static value.\"\n    else:\n        assert bias_k is None\n        assert bias_v is None\n\n    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)\n    if k is not None:\n        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)\n    if v is not None:\n        v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)\n\n    if static_k is not None:\n        assert static_k.size(0) == bsz * num_heads\n        assert static_k.size(2) == head_dim\n        k = static_k\n\n    if static_v is not None:\n        assert static_v.size(0) == bsz * num_heads\n        assert static_v.size(2) == v_head_dim\n        v = static_v\n\n    src_len = k.size(1)\n\n    if key_padding_mask is not None:\n        assert key_padding_mask.size(0) == bsz\n        assert key_padding_mask.size(1) == src_len\n\n    if add_zero_attn:\n        src_len += 1\n        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)\n        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)\n        if attn_mask is not None:\n            attn_mask = pad(attn_mask, (0, 1))\n        if key_padding_mask is not None:\n            key_padding_mask = pad(key_padding_mask, (0, 1))\n\n    
attn_output_weights = torch.bmm(q, k.transpose(1, 2))\n    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]\n\n    if attn_mask is not None:\n        if attn_mask.dtype == torch.bool:\n            attn_output_weights.masked_fill_(attn_mask, float('-inf'))\n        else:\n            attn_output_weights += attn_mask\n\n\n    if key_padding_mask is not None:\n        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)\n        attn_output_weights = attn_output_weights.masked_fill(\n            key_padding_mask.unsqueeze(1).unsqueeze(2),\n            float('-inf'),\n        )\n        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)\n\n    # attn_output_weights = softmax(\n    #     attn_output_weights, dim=-1)\n    attn_output_weights = softmax(\n            attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)\n    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)\n\n    attn_output = torch.bmm(attn_output_weights, v)\n    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]\n    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)\n    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)\n\n    if need_weights:\n        # average attention weights over heads\n        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)\n        return attn_output, attn_output_weights.sum(dim=1) / num_heads\n    else:\n        return attn_output, None\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/backbone.py",
    "content": "# ------------------------------------------------------------------------\n# UniPose\n# url: https://github.com/IDEA-Research/UniPose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Conditional DETR\n# Copyright (c) 2021 Microsoft. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Copied from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n# ------------------------------------------------------------------------\n\n\"\"\"\nBackbone modules.\n\"\"\"\n\nimport torch\nimport torch.nn.functional as F\nimport torchvision\nfrom torch import nn\nfrom torchvision.models._utils import IntermediateLayerGetter\nfrom typing import Dict, List\n\nfrom util.misc import NestedTensor, is_main_process\n\nfrom .position_encoding import build_position_encoding\nfrom .swin_transformer import build_swin_transformer\n\nclass FrozenBatchNorm2d(torch.nn.Module):\n    \"\"\"\n    BatchNorm2d where the batch statistics and the affine parameters are fixed.\n\n    Copy-paste from torchvision.misc.ops with added eps before rqsrt,\n    without which any other models than torchvision.models.resnet[18,34,50,101]\n    produce nans.\n    \"\"\"\n\n    def __init__(self, n):\n        super(FrozenBatchNorm2d, self).__init__()\n        self.register_buffer(\"weight\", torch.ones(n))\n        self.register_buffer(\"bias\", torch.zeros(n))\n        self.register_buffer(\"running_mean\", torch.zeros(n))\n        self.register_buffer(\"running_var\", torch.ones(n))\n\n    def _load_from_state_dict(\n        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs\n    ):\n        num_batches_tracked_key = 
prefix + \"num_batches_tracked\"\n        if num_batches_tracked_key in state_dict:\n            del state_dict[num_batches_tracked_key]\n\n        super(FrozenBatchNorm2d, self)._load_from_state_dict(\n            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs\n        )\n\n    def forward(self, x):\n        # move reshapes to the beginning\n        # to make it fuser-friendly\n        w = self.weight.reshape(1, -1, 1, 1)\n        b = self.bias.reshape(1, -1, 1, 1)\n        rv = self.running_var.reshape(1, -1, 1, 1)\n        rm = self.running_mean.reshape(1, -1, 1, 1)\n        eps = 1e-5\n        scale = w * (rv + eps).rsqrt()\n        bias = b - rm * scale\n        return x * scale + bias\n\n\nclass BackboneBase(nn.Module):\n    def __init__(\n        self,\n        backbone: nn.Module,\n        train_backbone: bool,\n        num_channels: int,\n        return_interm_indices: list,\n    ):\n        super().__init__()\n        for name, parameter in backbone.named_parameters():\n            if (\n                not train_backbone\n                or \"layer2\" not in name\n                and \"layer3\" not in name\n                and \"layer4\" not in name\n            ):\n                parameter.requires_grad_(False)\n\n        return_layers = {}\n        for idx, layer_index in enumerate(return_interm_indices):\n            return_layers.update(\n                {\"layer{}\".format(5 - len(return_interm_indices) + idx): \"{}\".format(layer_index)}\n            )\n\n        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)\n        self.num_channels = num_channels\n\n    def forward(self, tensor_list: NestedTensor):\n        xs = self.body(tensor_list.tensors)\n        out: Dict[str, NestedTensor] = {}\n        for name, x in xs.items():\n            m = tensor_list.mask\n            assert m is not None\n            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]\n    
        out[name] = NestedTensor(x, mask)\n        # import ipdb; ipdb.set_trace()\n        return out\n\n\nclass Backbone(BackboneBase):\n    \"\"\"ResNet backbone with frozen BatchNorm.\"\"\"\n\n    def __init__(\n        self,\n        name: str,\n        train_backbone: bool,\n        dilation: bool,\n        return_interm_indices: list,\n        batch_norm=FrozenBatchNorm2d,\n    ):\n        if name in [\"resnet18\", \"resnet34\", \"resnet50\", \"resnet101\"]:\n            backbone = getattr(torchvision.models, name)(\n                replace_stride_with_dilation=[False, False, dilation],\n                pretrained=is_main_process(),\n                norm_layer=batch_norm,\n            )\n        else:\n            raise NotImplementedError(\"Why you can get here with name {}\".format(name))\n        # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048\n        assert name not in (\"resnet18\", \"resnet34\"), \"Only resnet50 and resnet101 are available.\"\n        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]\n        num_channels_all = [256, 512, 1024, 2048]\n        num_channels = num_channels_all[4 - len(return_interm_indices) :]\n        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)\n\n\nclass Joiner(nn.Sequential):\n    def __init__(self, backbone, position_embedding):\n        super().__init__(backbone, position_embedding)\n\n    def forward(self, tensor_list: NestedTensor):\n        xs = self[0](tensor_list)\n        out: List[NestedTensor] = []\n        pos = []\n        for name, x in xs.items():\n            out.append(x)\n            # position encoding\n            pos.append(self[1](x).to(x.tensors.dtype))\n\n        return out, pos\n\n\ndef build_backbone(args):\n    \"\"\"\n    Useful args:\n        - backbone: backbone name\n        - lr_backbone:\n        - dilation\n        - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]\n        - backbone_freeze_keywords:\n        
- use_checkpoint: for swin only for now\n\n    \"\"\"\n    position_embedding = build_position_encoding(args)\n    train_backbone = True\n    if not train_backbone:\n        raise ValueError(\"Please set lr_backbone > 0\")\n    return_interm_indices = args.return_interm_indices\n    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]\n    args.backbone_freeze_keywords\n    use_checkpoint = getattr(args, \"use_checkpoint\", False)\n\n    if args.backbone in [\"resnet50\", \"resnet101\"]:\n        backbone = Backbone(\n            args.backbone,\n            train_backbone,\n            args.dilation,\n            return_interm_indices,\n            batch_norm=FrozenBatchNorm2d,\n        )\n        bb_num_channels = backbone.num_channels\n    elif args.backbone in [\n        \"swin_T_224_1k\",\n        \"swin_B_224_22k\",\n        \"swin_B_384_22k\",\n        \"swin_L_224_22k\",\n        \"swin_L_384_22k\",\n    ]:\n        pretrain_img_size = int(args.backbone.split(\"_\")[-2])\n        backbone = build_swin_transformer(\n            args.backbone,\n            pretrain_img_size=pretrain_img_size,\n            out_indices=tuple(return_interm_indices),\n            dilation=False,\n            use_checkpoint=use_checkpoint,\n        )\n\n        bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]\n    else:\n        raise NotImplementedError(\"Unknown backbone {}\".format(args.backbone))\n\n    assert len(bb_num_channels) == len(\n        return_interm_indices\n    ), f\"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}\"\n\n    model = Joiner(backbone, position_embedding)\n    model.num_channels = bb_num_channels\n    assert isinstance(\n        bb_num_channels, List\n    ), \"bb_num_channels is expected to be a List but {}\".format(type(bb_num_channels))\n    return model\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/deformable_transformer.py",
    "content": "# ------------------------------------------------------------------------\n# UniPose\n# url: https://github.com/IDEA-Research/UniPose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2022 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n# ------------------------------------------------------------------------\n\nimport math\nimport copy\nimport torch\nimport torch.utils.checkpoint as checkpoint\nfrom torch import nn, Tensor\nfrom typing import Optional\nfrom util.misc import inverse_sigmoid\n\nfrom .transformer_vanilla import TransformerEncoderLayer\nfrom .fuse_modules import BiAttentionBlock\nfrom .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed\nfrom .ops.modules import MSDeformAttn\n\n\nclass DeformableTransformer(nn.Module):\n\n    def __init__(self, d_model=256, nhead=8,\n                 num_queries=300,\n                 num_encoder_layers=6,\n                 num_unicoder_layers=0,\n                 num_decoder_layers=6,\n                 dim_feedforward=2048, dropout=0.0,\n                 activation=\"relu\", normalize_before=False,\n                 return_intermediate_dec=False, query_dim=4,\n                 num_patterns=0,\n                 modulate_hw_attn=False,\n                 # for deformable encoder\n                
 deformable_encoder=False,\n                 deformable_decoder=False,\n                 num_feature_levels=1,\n                 enc_n_points=4,\n                 dec_n_points=4,\n                 use_deformable_box_attn=False,\n                 box_attn_type='roi_align',\n                 # init query\n                 learnable_tgt_init=False,\n                 decoder_query_perturber=None,\n                 add_channel_attention=False,\n                 add_pos_value=False,\n                 random_refpoints_xy=False,\n                 # two stage\n                 two_stage_type='no',\n                 two_stage_pat_embed=0,\n                 two_stage_add_query_num=0,\n                 two_stage_learn_wh=False,\n                 two_stage_keep_all_tokens=False,\n                 # evo of #anchors\n                 dec_layer_number=None,\n                 rm_enc_query_scale=True,\n                 rm_dec_query_scale=True,\n                 rm_self_attn_layers=None,\n                 key_aware_type=None,\n                 # layer share\n                 layer_share_type=None,\n                 # for detach\n                 rm_detach=None,\n                 decoder_sa_type='ca',\n                 module_seq=['sa', 'ca', 'ffn'],\n                 # for dn\n                 embed_init_tgt=False,\n\n                 use_detached_boxes_dec_out=False,\n                 use_text_enhancer=False,\n                 use_fusion_layer=False,\n                 use_checkpoint=False,\n                 use_transformer_ckpt=False,\n                 use_text_cross_attention=False,\n                 text_dropout=0.1,\n                 fusion_dropout=0.1,\n                 fusion_droppath=0.0,\n\n                 binary_query_selection=False,\n                 ffn_extra_layernorm=False,\n                 ):\n        super().__init__()\n        self.num_feature_levels = num_feature_levels\n        self.num_encoder_layers = num_encoder_layers\n        self.num_unicoder_layers = 
num_unicoder_layers\n        self.num_decoder_layers = num_decoder_layers\n        self.deformable_encoder = deformable_encoder\n        self.deformable_decoder = deformable_decoder\n        self.two_stage_keep_all_tokens = two_stage_keep_all_tokens\n        self.num_queries = num_queries\n        self.random_refpoints_xy = random_refpoints_xy\n        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out\n        self.ffn_extra_layernorm = ffn_extra_layernorm\n        assert query_dim == 4\n\n        self.binary_query_selection = binary_query_selection\n        if self.binary_query_selection:\n            self.binary_query_selection_layer = nn.Linear(d_model, 1)\n        # assert not binary_query_selection, 'binary_query_selection not implemented yet'\n\n        if num_feature_levels > 1:\n            assert deformable_encoder, \"only support deformable_encoder for num_feature_levels > 1\"\n        if use_deformable_box_attn:\n            assert deformable_encoder or deformable_encoder\n\n        assert layer_share_type in [None, 'encoder', 'decoder', 'both']\n        if layer_share_type in ['encoder', 'both']:\n            enc_layer_share = True\n        else:\n            enc_layer_share = False\n        if layer_share_type in ['decoder', 'both']:\n            dec_layer_share = True\n        else:\n            dec_layer_share = False\n        assert layer_share_type is None\n\n        self.decoder_sa_type = decoder_sa_type\n        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']\n\n        # choose encoder layer type\n        if deformable_encoder:\n            encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,\n                                                              dropout, activation,\n                                                              num_feature_levels, nhead, enc_n_points,\n                                                              add_channel_attention=add_channel_attention,\n                  
                                            use_deformable_box_attn=use_deformable_box_attn,\n                                                              box_attn_type=box_attn_type)\n        else:\n            raise NotImplementedError\n\n        if use_text_enhancer:\n            text_enhance_layer = TransformerEncoderLayer(\n                d_model=d_model,\n                nhead=nhead // 2,\n                dim_feedforward=dim_feedforward // 2,\n                dropout=text_dropout\n            )\n        else:\n            text_enhance_layer = None\n\n        if use_fusion_layer:\n            feature_fusion_layer = BiAttentionBlock(\n                v_dim=d_model,\n                l_dim=d_model,\n                embed_dim=dim_feedforward // 2,\n                num_heads=nhead // 2,\n                dropout=fusion_dropout,\n                drop_path=fusion_droppath\n            )\n        else:\n            feature_fusion_layer = None\n\n        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None\n        assert encoder_norm is None\n        self.encoder = TransformerEncoder(\n            encoder_layer, num_encoder_layers, d_model=d_model,\n            num_queries=num_queries,\n            enc_layer_share=enc_layer_share,\n            text_enhance_layer=text_enhance_layer,\n            feature_fusion_layer=feature_fusion_layer,\n            use_checkpoint=use_checkpoint,\n            use_transformer_ckpt=use_transformer_ckpt,\n        )\n\n        # choose decoder layer type\n        if deformable_decoder:\n            decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,\n                                                              dropout, activation,\n                                                              num_feature_levels, nhead, dec_n_points,\n                                                              use_text_cross_attention=use_text_cross_attention,\n                                                       
       ffn_extra_layernorm=ffn_extra_layernorm, )\n\n        else:\n            raise NotImplementedError\n\n        decoder_norm = nn.LayerNorm(d_model)\n        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,\n                                          return_intermediate=return_intermediate_dec,\n                                          d_model=d_model, query_dim=query_dim,\n                                          modulate_hw_attn=modulate_hw_attn,\n                                          num_feature_levels=num_feature_levels,\n                                          deformable_decoder=deformable_decoder,\n                                          decoder_query_perturber=decoder_query_perturber,\n                                          dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,\n                                          dec_layer_share=dec_layer_share,\n                                          use_detached_boxes_dec_out=use_detached_boxes_dec_out\n                                          )\n\n        self.d_model = d_model\n        self.nhead = nhead\n        self.dec_layers = num_decoder_layers\n        self.num_queries = num_queries  # useful for single stage model only\n        self.num_patterns = num_patterns\n        if not isinstance(num_patterns, int):\n            Warning(\"num_patterns should be int but {}\".format(type(num_patterns)))\n            self.num_patterns = 0\n\n        if num_feature_levels > 1:\n            if self.num_encoder_layers > 0:\n                self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))\n            else:\n                self.level_embed = None\n\n        self.learnable_tgt_init = learnable_tgt_init\n        assert learnable_tgt_init, \"why not learnable_tgt_init\"\n        self.embed_init_tgt = embed_init_tgt\n        if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):\n            self.tgt_embed = 
nn.Embedding(self.num_queries, d_model)\n            nn.init.normal_(self.tgt_embed.weight.data)\n        else:\n            self.tgt_embed = None\n\n        # for two stage\n        self.two_stage_type = two_stage_type\n        self.two_stage_pat_embed = two_stage_pat_embed\n        self.two_stage_add_query_num = two_stage_add_query_num\n        self.two_stage_learn_wh = two_stage_learn_wh\n        assert two_stage_type in ['no', 'standard'], \"unknown param {} of two_stage_type\".format(two_stage_type)\n        if two_stage_type == 'standard':\n            # anchor selection at the output of encoder\n            self.enc_output = nn.Linear(d_model, d_model)\n            self.enc_output_norm = nn.LayerNorm(d_model)\n\n            if two_stage_pat_embed > 0:\n                self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))\n                nn.init.normal_(self.pat_embed_for_2stage)\n\n            if two_stage_add_query_num > 0:\n                self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)\n\n            if two_stage_learn_wh:\n                # import ipdb; ipdb.set_trace()\n                self.two_stage_wh_embedding = nn.Embedding(1, 2)\n            else:\n                self.two_stage_wh_embedding = None\n\n        if two_stage_type == 'no':\n            self.init_ref_points(num_queries)  # init self.refpoint_embed\n\n        self.enc_out_class_embed = None\n        self.enc_out_bbox_embed = None\n\n        # evolution of anchors\n        self.dec_layer_number = dec_layer_number\n        if dec_layer_number is not None:\n            if self.two_stage_type != 'no' or num_patterns == 0:\n                assert dec_layer_number[\n                           0] == num_queries, f\"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})\"\n            else:\n                assert dec_layer_number[\n                           0] == num_queries * num_patterns, 
f\"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})\"\n\n        self._reset_parameters()\n\n        self.rm_self_attn_layers = rm_self_attn_layers\n        if rm_self_attn_layers is not None:\n            # assert len(rm_self_attn_layers) == num_decoder_layers\n            print(\"Removing the self-attn in {} decoder layers\".format(rm_self_attn_layers))\n            for lid, dec_layer in enumerate(self.decoder.layers):\n                if lid in rm_self_attn_layers:\n                    dec_layer.rm_self_attn_modules()\n\n        self.rm_detach = rm_detach\n        if self.rm_detach:\n            assert isinstance(rm_detach, list)\n            assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])\n        self.decoder.rm_detach = rm_detach\n\n    def _reset_parameters(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                nn.init.xavier_uniform_(p)\n        for m in self.modules():\n            if isinstance(m, MSDeformAttn):\n                m._reset_parameters()\n        if self.num_feature_levels > 1 and self.level_embed is not None:\n            nn.init.normal_(self.level_embed)\n\n        if self.two_stage_learn_wh:\n            nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))\n\n    def get_valid_ratio(self, mask):\n        _, H, W = mask.shape\n        valid_H = torch.sum(~mask[:, :, 0], 1)\n        valid_W = torch.sum(~mask[:, 0, :], 1)\n        valid_ratio_h = valid_H.float() / H\n        valid_ratio_w = valid_W.float() / W\n        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)\n        return valid_ratio\n\n    def init_ref_points(self, use_num_queries):\n        self.refpoint_embed = nn.Embedding(use_num_queries, 4)\n\n        if self.random_refpoints_xy:\n            # import ipdb; ipdb.set_trace()\n            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)\n            
self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])\n            self.refpoint_embed.weight.data[:, :2].requires_grad = False\n\n    def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,\n                dn_meta=None,targets=None,kpt_embed=None):\n        \"\"\"\n        Input:\n            - srcs: List of multi features [bs, ci, hi, wi]\n            - masks: List of multi masks [bs, hi, wi]\n            - refpoint_embed: [bs, num_dn, 4]. None in infer\n            - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]\n            - tgt: [bs, num_dn, d_model]. None in infer\n\n        \"\"\"\n        # if self.two_stage_type != 'no' and self.two_stage_add_query_num == 0:\n        #     assert refpoint_embed is None\n\n        # prepare input for encoder\n        src_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):\n            bs, c, h, w = src.shape\n            spatial_shape = (h, w)\n            spatial_shapes.append(spatial_shape)\n\n            src = src.flatten(2).transpose(1, 2)  # bs, hw, c\n            mask = mask.flatten(1)  # bs, hw\n            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # bs, hw, c\n            if self.num_feature_levels > 1 and self.level_embed is not None:\n                lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)\n            else:\n                lvl_pos_embed = pos_embed\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            src_flatten.append(src)\n            mask_flatten.append(mask)\n        src_flatten = torch.cat(src_flatten, 1)  # bs, \\sum{hxw}, c\n        mask_flatten = torch.cat(mask_flatten, 1)  # bs, \\sum{hxw}\n        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)  # bs, \\sum{hxw}, c\n        spatial_shapes = 
torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)\n        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))\n        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)\n\n        # two stage\n        enc_topk_proposals = enc_refpoint_embed = None\n\n        #########################################################\n        # Begin Encoder\n        #########################################################\n        memory, memory_text = self.encoder(\n            src_flatten,\n            pos=lvl_pos_embed_flatten,\n            level_start_index=level_start_index,\n            spatial_shapes=spatial_shapes,\n            valid_ratios=valid_ratios,\n            key_padding_mask=mask_flatten,\n            memory_text=text_dict['encoded_text'],\n            text_attention_mask=~text_dict['text_token_mask'],\n            # we ~ the mask . False means use the token; True means pad the token\n            position_ids=text_dict['position_ids'],\n            text_self_attention_masks=text_dict['text_self_attention_masks'],\n        )\n        #########################################################\n        # End Encoder\n        # - memory: bs, \\sum{hw}, c\n        # - mask_flatten: bs, \\sum{hw}\n        # - lvl_pos_embed_flatten: bs, \\sum{hw}, c\n        # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)\n        # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)\n        #########################################################\n        text_dict['encoded_text'] = memory_text\n\n        if self.two_stage_type == 'standard':\n            if self.two_stage_learn_wh:\n                input_hw = self.two_stage_wh_embedding.weight[0]\n            else:\n                input_hw = None\n            output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,\n                     
                                                      input_hw)\n            output_memory = self.enc_output_norm(self.enc_output(output_memory))\n\n            if self.two_stage_pat_embed > 0:\n                bs, nhw, _ = output_memory.shape\n                # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256\n                output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)\n                _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)\n                output_memory = output_memory + _pats\n                output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)\n\n            if self.two_stage_add_query_num > 0:\n                assert refpoint_embed is not None\n                output_memory = torch.cat((output_memory, tgt), dim=1)\n                output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)\n\n            if self.binary_query_selection:\n                topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)\n            else:\n                if text_dict is not None:\n                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)\n                else:\n                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)\n\n                topk_logits = enc_outputs_class_unselected.max(-1)[0]\n            enc_outputs_coord_unselected = self.enc_out_bbox_embed(\n                output_memory) + output_proposals  # (bs, \\sum{hw}, 4) unsigmoid\n            topk = self.num_queries\n\n            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq\n\n            # gather boxes\n            refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,\n                                                   topk_proposals.unsqueeze(-1).repeat(1, 1, 4))  # unsigmoid\n            refpoint_embed_ = refpoint_embed_undetach.detach()\n            init_box_proposal = 
torch.gather(output_proposals, 1,\n                                             topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()  # sigmoid\n\n            # gather tgt\n            tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))\n            if self.embed_init_tgt:\n                tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, d_model\n            else:\n                tgt_ = tgt_undetach.detach()\n\n            if refpoint_embed is not None:\n                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)\n                tgt = torch.cat([tgt, tgt_], dim=1)\n            else:\n                refpoint_embed, tgt = refpoint_embed_, tgt_\n\n        elif self.two_stage_type == 'no':\n            tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, d_model\n            refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, 4\n\n            if refpoint_embed is not None:\n                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)\n                tgt = torch.cat([tgt, tgt_], dim=1)\n            else:\n                refpoint_embed, tgt = refpoint_embed_, tgt_\n\n            if self.num_patterns > 0:\n                tgt_embed = tgt.repeat(1, self.num_patterns, 1)\n                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)\n                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,\n                                                                             1)  # 1, n_q*n_pat, d_model\n                tgt = tgt_embed + tgt_pat\n\n            init_box_proposal = refpoint_embed_.sigmoid()\n\n        else:\n            raise NotImplementedError(\"unknown two_stage_type {}\".format(self.two_stage_type))\n        #########################################################\n        # End preparing 
tgt\n        # - tgt: bs, NQ, d_model\n        # - refpoint_embed(unsigmoid): bs, NQ, d_model\n        #########################################################\n        # if os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n        #     if refpoint_embed.isnan().any() | refpoint_embed.isinf().any():\n        #         import ipdb; ipdb.set_trace()\n        #     if tgt.isnan().any() | tgt.isinf().any():\n        #         import ipdb; ipdb.set_trace()\n\n        #########################################################\n        # Begin Decoder\n        #########################################################\n        hs, references = self.decoder(\n            tgt=tgt.transpose(0, 1),\n            memory=memory.transpose(0, 1),\n            memory_key_padding_mask=mask_flatten,\n            pos=lvl_pos_embed_flatten.transpose(0, 1),\n            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),\n            level_start_index=level_start_index,\n            spatial_shapes=spatial_shapes,\n            valid_ratios=valid_ratios, tgt_mask=attn_mask,\n            tgt_mask2=attn_mask2,\n            memory_text=text_dict['encoded_text'],\n            text_attention_mask=~text_dict['text_token_mask'],\n            text_dict=text_dict,\n            dn_meta=dn_meta,\n            targets=targets,\n            kpt_embed=kpt_embed\n            # we ~ the mask . 
False means use the token; True means pad the token\n        )\n        #########################################################\n        # End Decoder\n        # hs: n_dec, bs, nq, d_model\n        # references: n_dec+1, bs, nq, query_dim\n        #########################################################\n\n        #########################################################\n        # Begin postprocess\n        #########################################################\n        if self.two_stage_type == 'standard':\n            if self.two_stage_keep_all_tokens:\n                hs_enc = output_memory.unsqueeze(0)\n                ref_enc = enc_outputs_coord_unselected.unsqueeze(0)\n                init_box_proposal = output_proposals\n                # import ipdb; ipdb.set_trace()\n            else:\n                hs_enc = tgt_undetach.unsqueeze(0)\n                ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)\n        else:\n            hs_enc = ref_enc = None\n        #########################################################\n        # End postprocess\n        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None\n        # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None\n        #########################################################\n\n        return hs, references, hs_enc, ref_enc, init_box_proposal\n        # hs: (n_dec, bs, nq, d_model)\n        # references: sigmoid coordinates. (n_dec+1, bs, bq, 4)\n        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None\n        # ref_enc: sigmoid coordinates. 
\\\n        #           (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None\n\n\nclass TransformerEncoder(nn.Module):\n\n    def __init__(self,\n                 encoder_layer, num_layers, d_model=256,\n                 num_queries=300,\n                 enc_layer_share=False,\n                 text_enhance_layer=None,\n                 feature_fusion_layer=None,\n                 use_checkpoint=False,\n                 use_transformer_ckpt=False,\n                 ):\n        \"\"\"_summary_\n\n        Args:\n            encoder_layer (_type_): _description_\n            num_layers (_type_): _description_\n            norm (_type_, optional): _description_. Defaults to None.\n            d_model (int, optional): _description_. Defaults to 256.\n            num_queries (int, optional): _description_. Defaults to 300.\n            enc_layer_share (bool, optional): _description_. Defaults to False.\n\n        \"\"\"\n        super().__init__()\n        # prepare layers\n        self.layers = []\n        self.text_layers = []\n        self.fusion_layers = []\n        if num_layers > 0:\n            self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)\n\n            if text_enhance_layer is not None:\n                self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share)\n            if feature_fusion_layer is not None:\n                self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share)\n        else:\n            self.layers = []\n            del encoder_layer\n\n            if text_enhance_layer is not None:\n                self.text_layers = []\n                del text_enhance_layer\n            if feature_fusion_layer is not None:\n                self.fusion_layers = []\n                del feature_fusion_layer\n\n        self.query_scale = None\n        self.num_queries = num_queries\n        self.num_layers = num_layers\n        
self.d_model = d_model\n\n        self.use_checkpoint = use_checkpoint\n        self.use_transformer_ckpt = use_transformer_ckpt\n\n    @staticmethod\n    def get_reference_points(spatial_shapes, valid_ratios, device):\n        reference_points_list = []\n        for lvl, (H_, W_) in enumerate(spatial_shapes):\n            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),\n                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),)\n            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)\n            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)\n            ref = torch.stack((ref_x, ref_y), -1)\n            reference_points_list.append(ref)\n        reference_points = torch.cat(reference_points_list, 1)\n        reference_points = reference_points[:, :, None] * valid_ratios[:, None]\n        return reference_points\n\n    def forward(self,\n                # for images\n                src: Tensor,\n                pos: Tensor,\n                spatial_shapes: Tensor,\n                level_start_index: Tensor,\n                valid_ratios: Tensor,\n                key_padding_mask: Tensor,\n                # for texts\n                memory_text: Tensor = None,\n                text_attention_mask: Tensor = None,\n                pos_text: Tensor = None,\n                text_self_attention_masks: Tensor = None,\n                position_ids: Tensor = None,\n                ):\n        \"\"\"\n        Input:\n            - src: [bs, sum(hi*wi), 256]\n            - pos: pos embed for src. 
[bs, sum(hi*wi), 256]\n            - spatial_shapes: h,w of each level [num_level, 2]\n            - level_start_index: [num_level] start point of level in sum(hi*wi).\n            - valid_ratios: [bs, num_level, 2]\n            - key_padding_mask: [bs, sum(hi*wi)]\n\n            - memory_text: bs, n_text, 256\n            - text_attention_mask: bs, n_text\n                False for no padding; True for padding\n            - pos_text: bs, n_text, 256\n\n            - position_ids: bs, n_text\n        Intermedia:\n            - reference_points: [bs, sum(hi*wi), num_level, 2]\n        Outpus:\n            - output: [bs, sum(hi*wi), 256]\n        \"\"\"\n\n        output = src\n\n        # preparation and reshape\n        if self.num_layers > 0:\n            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)\n\n        if self.text_layers:\n            # generate pos_text\n            bs, n_text, text_dim = memory_text.shape\n            if pos_text is None and position_ids is None:\n                pos_text = torch.arange(n_text, device=memory_text.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs,\n                                                                                                                     1,\n                                                                                                                     1)\n                pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False)\n            if position_ids is not None:\n                pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)\n\n        # main process\n        for layer_id, layer in enumerate(self.layers):\n            # if output.isnan().any() or memory_text.isnan().any():\n            #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':\n            #         import ipdb; ipdb.set_trace()\n            if self.fusion_layers:\n                if 
self.use_checkpoint:\n                    output, memory_text = checkpoint.checkpoint(\n                        self.fusion_layers[layer_id],\n                        output,\n                        memory_text,\n                        key_padding_mask,\n                        text_attention_mask\n                    )\n                else:\n                    output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,\n                                                                       attention_mask_v=key_padding_mask,\n                                                                       attention_mask_l=text_attention_mask)\n\n            if self.text_layers:\n                memory_text = self.text_layers[layer_id](\n                    src=memory_text.transpose(0, 1),\n                    src_mask=~text_self_attention_masks,  # note we use ~ for mask here\n                    src_key_padding_mask=text_attention_mask,\n                    pos=(pos_text.transpose(0, 1) if pos_text is not None else None)\n                ).transpose(0, 1)\n\n            # main process\n            if self.use_transformer_ckpt:\n                output = checkpoint.checkpoint(\n                    layer,\n                    output,\n                    pos,\n                    reference_points,\n                    spatial_shapes,\n                    level_start_index,\n                    key_padding_mask\n                )\n            else:\n                output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,\n                               level_start_index=level_start_index, key_padding_mask=key_padding_mask)\n\n        return output, memory_text\n\n\nclass TransformerDecoder(nn.Module):\n\n    def __init__(self, decoder_layer, num_layers, norm=None,\n                 return_intermediate=False,\n                 d_model=256, query_dim=4,\n                 modulate_hw_attn=False,\n                 
num_feature_levels=1,\n                 deformable_decoder=False,\n                 decoder_query_perturber=None,\n                 dec_layer_number=None,  # number of queries each layer in decoder\n                 rm_dec_query_scale=False,\n                 dec_layer_share=False,\n                 dec_layer_dropout_prob=None,\n                 use_detached_boxes_dec_out=False,\n                 num_box_decoder_layers=2,\n                 num_body_points=68,\n                 ):\n        super().__init__()\n        if num_layers > 0:\n            self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)\n        else:\n            self.layers = []\n        self.num_layers = num_layers\n        self.norm = norm\n        self.return_intermediate = return_intermediate\n        assert return_intermediate, \"support return_intermediate only\"\n        self.query_dim = query_dim\n        assert query_dim in [2, 4], \"query_dim should be 2/4 but {}\".format(query_dim)\n        self.num_feature_levels = num_feature_levels\n        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out\n\n        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)\n        if not deformable_decoder:\n            self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)\n        else:\n            self.query_pos_sine_scale = None\n\n        if rm_dec_query_scale:\n            self.query_scale = None\n        else:\n            raise NotImplementedError\n            self.query_scale = MLP(d_model, d_model, d_model, 2)\n        self.bbox_embed = None\n        self.class_embed = None\n        self.pose_embed = None\n        self.pose_hw_embed = None\n        self.d_model = d_model\n        self.modulate_hw_attn = modulate_hw_attn\n        self.deformable_decoder = deformable_decoder\n\n        if not deformable_decoder and modulate_hw_attn:\n            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)\n        else:\n            
self.ref_anchor_head = None\n\n        self.decoder_query_perturber = decoder_query_perturber\n        self.box_pred_damping = None\n\n        self.dec_layer_number = dec_layer_number\n        if dec_layer_number is not None:\n            assert isinstance(dec_layer_number, list)\n            assert len(dec_layer_number) == num_layers\n            # assert dec_layer_number[0] ==\n\n        self.dec_layer_dropout_prob = dec_layer_dropout_prob\n        if dec_layer_dropout_prob is not None:\n            assert isinstance(dec_layer_dropout_prob, list)\n            assert len(dec_layer_dropout_prob) == num_layers\n            for i in dec_layer_dropout_prob:\n                assert 0.0 <= i <= 1.0\n\n        self.rm_detach = None\n        self.num_body_points = num_body_points\n\n        self.hw = nn.Embedding(17, 2)\n        self.num_box_decoder_layers = num_box_decoder_layers\n        self.kpt_index = [x for x in range(50 * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]\n        self.hw_append = nn.Embedding(self.num_body_points-17, 2)\n\n    def forward(self, tgt, memory,\n                tgt_mask: Optional[Tensor] = None,\n                tgt_mask2: Optional[Tensor] = None,\n                memory_mask: Optional[Tensor] = None,\n                tgt_key_padding_mask: Optional[Tensor] = None,\n                memory_key_padding_mask: Optional[Tensor] = None,\n                pos: Optional[Tensor] = None,\n                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2\n                # for memory\n                level_start_index: Optional[Tensor] = None,  # num_levels\n                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2\n                valid_ratios: Optional[Tensor] = None,\n                # for text\n                memory_text: Optional[Tensor] = None,\n                text_attention_mask: Optional[Tensor] = None,\n                text_dict: Optional[Tensor] = None,\n                dn_meta: 
Optional[Tensor] = None,\n                targets: Optional[Tensor] = None,\n                kpt_embed: Optional[Tensor] = None\n                ):\n        \"\"\"\n        Input:\n            - tgt: nq, bs, d_model\n            - memory: hw, bs, d_model\n            - pos: hw, bs, d_model\n            - refpoints_unsigmoid: nq, bs, 2/4\n            - valid_ratios/spatial_shapes: bs, nlevel, 2\n        \"\"\"\n\n        output = tgt\n        output += self.hw.weight[0, 0] * 0.0\n\n\n        intermediate = []\n        reference_points = refpoints_unsigmoid.sigmoid()\n        ref_points = [reference_points]\n        effect_num_dn = dn_meta['pad_size'] if self.training else 0\n        inter_select_number = 50\n        for layer_id, layer in enumerate(self.layers):\n\n            if reference_points.shape[-1] == 4:\n                reference_points_input = reference_points[:, :, None] \\\n                                         * torch.cat([valid_ratios, valid_ratios], -1)[None, :]  # nq, bs, nlevel, 4\n            else:\n                assert reference_points.shape[-1] == 2\n                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]\n            query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # nq, bs, 256*2\n\n            # conditional query\n            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256\n            pos_scale = self.query_scale(output) if self.query_scale is not None else 1\n            query_pos = pos_scale * raw_query_pos\n            # if os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n            #     if query_pos.isnan().any() | query_pos.isinf().any():\n            #         import ipdb; ipdb.set_trace()\n\n            # main process\n            output = layer(\n                tgt=output,\n                tgt_query_pos=query_pos,\n                tgt_query_sine_embed=query_sine_embed,\n                tgt_key_padding_mask=tgt_key_padding_mask,\n  
              tgt_reference_points=reference_points_input,\n\n                memory_text=memory_text,\n                text_attention_mask=text_attention_mask,\n\n                memory=memory,\n                memory_key_padding_mask=memory_key_padding_mask,\n                memory_level_start_index=level_start_index,\n                memory_spatial_shapes=spatial_shapes,\n                memory_pos=pos,\n\n                self_attn_mask=tgt_mask,\n                cross_attn_mask=memory_mask\n            )\n            if output.isnan().any() | output.isinf().any():\n                print(f\"output layer_id {layer_id} is nan\")\n                try:\n                    num_nan = output.isnan().sum().item()\n                    num_inf = output.isinf().sum().item()\n                    print(f\"num_nan {num_nan}, num_inf {num_inf}\")\n                except Exception as e:\n                    print(e)\n\n\n\n\n            intermediate.append(self.norm(output))\n            # iter update\n            if layer_id < self.num_box_decoder_layers:\n                reference_before_sigmoid = inverse_sigmoid(reference_points)\n                delta_unsig = self.bbox_embed[layer_id](output)\n                outputs_unsig = delta_unsig + reference_before_sigmoid\n                new_reference_points = outputs_unsig.sigmoid()\n\n            # select # ref points as anchors\n            if layer_id == self.num_box_decoder_layers - 1:\n                dn_output = output[:effect_num_dn]\n                dn_new_reference_points = new_reference_points[:effect_num_dn]\n                class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,\n                                   effect_num_dn:].transpose(0, 1)\n                topk_proposals = torch.topk(class_unselected.max(-1)[0], inter_select_number, dim=0)[1]\n                new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,\n                                          
                  topk_proposals.unsqueeze(-1).repeat(1, 1, 4))\n                new_output_for_box = torch.gather(output[effect_num_dn:], 0,\n                                                  topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))\n                keypoint_embed=kpt_embed.transpose(0, 1)\n\n                new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0],1,1,1)\n                delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]\n                keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()\n                num_queries, _, bs, _ = keypoint_xy.shape\n                aa = torch.cat((self.hw.weight,self.hw_append.weight),dim=0)\n                keypoint_wh_weight = aa.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()\n                keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]\n                new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)\n                new_reference_points = torch.cat(\n                    (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)\n                output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)\n                new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)\n                output = torch.cat((dn_output, output), dim=0)\n                tgt_mask = tgt_mask2\n\n            if layer_id >= self.num_box_decoder_layers:\n                reference_before_sigmoid = inverse_sigmoid(reference_points)\n                output_bbox_dn = output[:effect_num_dn]\n                output_bbox_norm = output[effect_num_dn:][0::(self.num_body_points + 1)]\n                reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]\n                reference_before_sigmoid_bbox_norm = 
reference_before_sigmoid[effect_num_dn:][\n                                                     0::(self.num_body_points + 1)]\n                delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)\n                delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)\n                outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn\n                outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm\n                new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()\n                new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()\n                output_kpt = output[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index, device=output.device))\n                delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)\n                outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index,\n                                                                                                      device=output.device)).clone()  ##\n                delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)\n                outputs_unsig[..., :2] += delta_xy_unsig[..., :2]\n                outputs_unsig[..., 2:] += delta_hw_unsig\n                new_reference_points_for_keypoint = outputs_unsig.sigmoid()\n                bs = new_reference_points_for_box_norm.shape[1]\n                new_reference_points_norm = torch.cat((new_reference_points_for_box_norm.unsqueeze(1),\n                                                       new_reference_points_for_keypoint.view(-1, self.num_body_points,\n                                                                                              bs, 4)), dim=1).flatten(0,\n                                                                                                                      1)\n                new_reference_points = 
torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)\n\n            if self.rm_detach and 'dec' in self.rm_detach:\n                reference_points = new_reference_points\n            else:\n                reference_points = new_reference_points.detach()\n\n            # if layer_id != self.num_layers - 1:\n            if self.use_detached_boxes_dec_out:\n                ref_points.append(reference_points)\n            else:\n                ref_points.append(new_reference_points)\n\n        return [\n            [itm_out.transpose(0, 1) for itm_out in intermediate],\n            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]\n        ]\n\n\nclass DeformableTransformerEncoderLayer(nn.Module):\n    def __init__(self,\n                 d_model=256, d_ffn=1024,\n                 dropout=0.1, activation=\"relu\",\n                 n_levels=4, n_heads=8, n_points=4,\n                 add_channel_attention=False,\n                 use_deformable_box_attn=False,\n                 box_attn_type='roi_align',\n                 ):\n        super().__init__()\n\n        # self attention\n        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(d_model)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, d_ffn)\n        self.activation = _get_activation_fn(activation, d_model=d_ffn)\n        self.dropout2 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(d_ffn, d_model)\n        self.dropout3 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(d_model)\n\n        # channel attention\n        self.add_channel_attention = add_channel_attention\n        if add_channel_attention:\n            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)\n            self.norm_channel = nn.LayerNorm(d_model)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos):\n        return tensor if pos is None else tensor + 
pos\n\n    def forward_ffn(self, src):\n        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))\n        src = src + self.dropout3(src2)\n        src = self.norm2(src)\n        return src\n\n    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):\n        # self attention\n        # import ipdb; ipdb.set_trace()\n        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index,\n                              key_padding_mask)\n        src = src + self.dropout1(src2)\n        src = self.norm1(src)\n\n        # ffn\n        src = self.forward_ffn(src)\n\n        # channel attn\n        if self.add_channel_attention:\n            src = self.norm_channel(src + self.activ_channel(src))\n\n        return src\n\n\nclass DeformableTransformerDecoderLayer(nn.Module):\n    def __init__(self, d_model=256, d_ffn=1024,\n                 dropout=0.1, activation=\"relu\",\n                 n_levels=4, n_heads=8, n_points=4,\n                 use_text_feat_guide=False,\n                 use_text_cross_attention=False,\n                 ffn_extra_layernorm=False\n                 ):\n        super().__init__()\n\n        # cross attention\n        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()\n        self.norm1 = nn.LayerNorm(d_model)\n\n        # cross attention text\n        if use_text_cross_attention:\n            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)\n            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()\n            self.catext_norm = nn.LayerNorm(d_model)\n\n        # self attention\n        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)\n        self.dropout2 = 
nn.Dropout(dropout) if dropout > 0 else nn.Identity()\n        self.norm2 = nn.LayerNorm(d_model)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, d_ffn)\n        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)\n        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()\n        self.linear2 = nn.Linear(d_ffn, d_model)\n        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()\n        self.norm3 = nn.LayerNorm(d_model)\n        if ffn_extra_layernorm:\n            raise NotImplementedError('ffn_extra_layernorm not implemented')\n            self.norm_ext = nn.LayerNorm(d_ffn)\n        else:\n            self.norm_ext = None\n\n        self.key_aware_proj = None\n        self.use_text_feat_guide = use_text_feat_guide\n        assert not use_text_feat_guide\n        self.use_text_cross_attention = use_text_cross_attention\n\n    def rm_self_attn_modules(self):\n        self.self_attn = None\n        self.dropout2 = None\n        self.norm2 = None\n\n    @staticmethod\n    def with_pos_embed(tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt, ipdb_flag=False):\n\n        with torch.cuda.amp.autocast(enabled=False):\n            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n        return tgt\n\n    def forward(self,\n                # for tgt\n                tgt: Optional[Tensor],  # nq, bs, d_model\n                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))\n                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. 
Sine(pos)\n                tgt_key_padding_mask: Optional[Tensor] = None,\n                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4\n\n                memory_text: Optional[Tensor] = None,  # bs, num_token, d_model\n                text_attention_mask: Optional[Tensor] = None,  # bs, num_token\n\n                # for memory\n                memory: Optional[Tensor] = None,  # hw, bs, d_model\n                memory_key_padding_mask: Optional[Tensor] = None,\n                memory_level_start_index: Optional[Tensor] = None,  # num_levels\n                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2\n                memory_pos: Optional[Tensor] = None,  # pos for memory\n\n                # sa\n                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention\n                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention\n                ):\n        \"\"\"\n        Input:\n            - tgt/tgt_query_pos: nq, bs, d_model\n            -\n        \"\"\"\n        assert cross_attn_mask is None\n\n        # self attention\n        if self.self_attn is not None:\n            # import ipdb; ipdb.set_trace()\n            q = k = self.with_pos_embed(tgt, tgt_query_pos)\n            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]\n            tgt = tgt + self.dropout2(tgt2)\n            tgt = self.norm2(tgt)\n\n            # if os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n            #     if tgt.isnan().any() | tgt.isinf().any() :\n            #         import ipdb; ipdb.set_trace()\n\n        if self.use_text_cross_attention:\n            tgt2 = self.ca_text(self.with_pos_embed(tgt, tgt_query_pos), memory_text.transpose(0, 1),\n                                memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]\n            tgt = tgt + self.catext_dropout(tgt2)\n            tgt = self.catext_norm(tgt)\n\n            # if 
os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n            #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':\n            #         import ipdb; ipdb.set_trace()\n\n            # if tgt.isnan().any() | tgt.isinf().any() :\n            #     import ipdb; ipdb.set_trace()\n\n        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),\n                               tgt_reference_points.transpose(0, 1).contiguous(),\n                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,\n                               memory_key_padding_mask).transpose(0, 1)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # if os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n        #     tgtk = tgt.clone()\n        #     if tgt.isnan().any() | tgt.isinf().any() :\n        #         import ipdb; ipdb.set_trace()\n\n        # ffn\n        tgt = self.forward_ffn(tgt)\n        # if os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n        #     if tgt.isnan().any() | tgt.isinf().any() :\n        #         tgtk = self.forward_ffn(tgtk, ipdb_flag=True)\n        #         import ipdb; ipdb.set_trace()\n\n        return tgt\n\n\ndef _get_clones(module, N, layer_share=False):\n    # import ipdb; ipdb.set_trace()\n    if layer_share:\n        return nn.ModuleList([module for i in range(N)])\n    else:\n        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])\n\n\ndef build_deformable_transformer(args):\n    decoder_query_perturber = None\n    if args.decoder_layer_noise:\n        from .utils import RandomBoxPerturber\n        decoder_query_perturber = RandomBoxPerturber(\n            x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,\n            w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)\n\n    use_detached_boxes_dec_out = False\n    try:\n        use_detached_boxes_dec_out = args.use_detached_boxes_dec_out\n    
except:\n        use_detached_boxes_dec_out = False\n\n    binary_query_selection = False\n    try:\n        binary_query_selection = args.binary_query_selection\n    except:\n        binary_query_selection = False\n\n    ffn_extra_layernorm = False\n    try:\n        ffn_extra_layernorm = args.ffn_extra_layernorm\n    except:\n        print('ffn_extra_layernorm not found, set to False')\n        ffn_extra_layernorm = False\n\n    return DeformableTransformer(\n        d_model=args.hidden_dim,\n        dropout=args.dropout,\n        nhead=args.nheads,\n        num_queries=args.num_queries,\n        dim_feedforward=args.dim_feedforward,\n        num_encoder_layers=args.enc_layers,\n        num_unicoder_layers=args.unic_layers,\n        num_decoder_layers=args.dec_layers,\n        normalize_before=args.pre_norm,\n        return_intermediate_dec=True,\n        query_dim=args.query_dim,\n        activation=args.transformer_activation,\n        num_patterns=args.num_patterns,\n        modulate_hw_attn=True,\n\n        deformable_encoder=True,\n        deformable_decoder=True,\n        num_feature_levels=args.num_feature_levels,\n        enc_n_points=args.enc_n_points,\n        dec_n_points=args.dec_n_points,\n        use_deformable_box_attn=args.use_deformable_box_attn,\n        box_attn_type=args.box_attn_type,\n\n        learnable_tgt_init=True,\n        decoder_query_perturber=decoder_query_perturber,\n\n        add_channel_attention=args.add_channel_attention,\n        add_pos_value=args.add_pos_value,\n        random_refpoints_xy=args.random_refpoints_xy,\n\n        # two stage\n        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']\n        two_stage_pat_embed=args.two_stage_pat_embed,\n        two_stage_add_query_num=args.two_stage_add_query_num,\n        two_stage_learn_wh=args.two_stage_learn_wh,\n        two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,\n        dec_layer_number=args.dec_layer_number,\n        
rm_self_attn_layers=None,\n        key_aware_type=None,\n        layer_share_type=None,\n\n        rm_detach=None,\n        decoder_sa_type=args.decoder_sa_type,\n        module_seq=args.decoder_module_seq,\n\n        embed_init_tgt=args.embed_init_tgt,\n        use_detached_boxes_dec_out=use_detached_boxes_dec_out,\n        use_text_enhancer=args.use_text_enhancer,\n        use_fusion_layer=args.use_fusion_layer,\n        use_checkpoint=args.use_checkpoint,\n        use_transformer_ckpt=args.use_transformer_ckpt,\n        use_text_cross_attention=args.use_text_cross_attention,\n\n        text_dropout=args.text_dropout,\n        fusion_dropout=args.fusion_dropout,\n        fusion_droppath=args.fusion_droppath,\n\n        binary_query_selection=binary_query_selection,\n        ffn_extra_layernorm=ffn_extra_layernorm,\n    )\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/fuse_modules.py",
    "content": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# from timm.models.layers import DropPath\nfrom src.modules.util import DropPath\n\nclass FeatureResizer(nn.Module):\n    \"\"\"\n    This class takes as input a set of embeddings of dimension C1 and outputs a set of\n    embedding of dimension C2, after a linear transformation, dropout and normalization (LN).\n    \"\"\"\n\n    def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):\n        super().__init__()\n        self.do_ln = do_ln\n        # Object feature encoding\n        self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)\n        self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)\n        self.dropout = nn.Dropout(dropout)\n\n    def forward(self, encoder_features):\n        x = self.fc(encoder_features)\n        if self.do_ln:\n            x = self.layer_norm(x)\n        output = self.dropout(x)\n        return output\n\n\n\n\ndef l1norm(X, dim, eps=1e-8):\n    \"\"\"L1-normalize columns of X\n    \"\"\"\n    norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps\n    X = torch.div(X, norm)\n    return X\n\n\ndef l2norm(X, dim, eps=1e-8):\n    \"\"\"L2-normalize columns of X\n    \"\"\"\n    norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps\n    X = torch.div(X, norm)\n    return X\n\n\ndef func_attention(query, context, smooth=1, raw_feature_norm=\"softmax\", eps=1e-8):\n    \"\"\"\n    query: (n_context, queryL, d)\n    context: (n_context, sourceL, d)\n    \"\"\"\n    batch_size_q, queryL = query.size(0), query.size(1)\n    batch_size, sourceL = context.size(0), context.size(1)\n\n    # Get attention\n    # --> (batch, d, queryL)\n    queryT = torch.transpose(query, 1, 2)\n\n    # (batch, sourceL, d)(batch, d, queryL)\n    # --> (batch, sourceL, queryL)\n    attn = torch.bmm(context, queryT)\n    if raw_feature_norm == \"softmax\":\n        # --> (batch*sourceL, queryL)\n        attn = attn.view(batch_size * 
sourceL, queryL)\n        attn = nn.Softmax()(attn)\n        # --> (batch, sourceL, queryL)\n        attn = attn.view(batch_size, sourceL, queryL)\n    elif raw_feature_norm == \"l2norm\":\n        attn = l2norm(attn, 2)\n    elif raw_feature_norm == \"clipped_l2norm\":\n        attn = nn.LeakyReLU(0.1)(attn)\n        attn = l2norm(attn, 2)\n    else:\n        raise ValueError(\"unknown first norm type:\", raw_feature_norm)\n    # --> (batch, queryL, sourceL)\n    attn = torch.transpose(attn, 1, 2).contiguous()\n    # --> (batch*queryL, sourceL)\n    attn = attn.view(batch_size * queryL, sourceL)\n    attn = nn.Softmax()(attn * smooth)\n    # --> (batch, queryL, sourceL)\n    attn = attn.view(batch_size, queryL, sourceL)\n    # --> (batch, sourceL, queryL)\n    attnT = torch.transpose(attn, 1, 2).contiguous()\n\n    # --> (batch, d, sourceL)\n    contextT = torch.transpose(context, 1, 2)\n    # (batch x d x sourceL)(batch x sourceL x queryL)\n    # --> (batch, d, queryL)\n    weightedContext = torch.bmm(contextT, attnT)\n    # --> (batch, queryL, d)\n    weightedContext = torch.transpose(weightedContext, 1, 2)\n\n    return weightedContext, attnT\n\n\nclass BiMultiHeadAttention(nn.Module):\n    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):\n        super(BiMultiHeadAttention, self).__init__()\n\n        self.embed_dim = embed_dim\n        self.num_heads = num_heads\n        self.head_dim = embed_dim // num_heads\n        self.v_dim = v_dim\n        self.l_dim = l_dim\n\n        assert (\n                self.head_dim * self.num_heads == self.embed_dim\n        ), f\"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads}).\"\n        self.scale = self.head_dim ** (-0.5)\n        self.dropout = dropout\n\n        self.v_proj = nn.Linear(self.v_dim, self.embed_dim)\n        self.l_proj = nn.Linear(self.l_dim, self.embed_dim)\n        self.values_v_proj = nn.Linear(self.v_dim, 
self.embed_dim)\n        self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)\n\n        self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)\n        self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)\n\n        self.stable_softmax_2d = True\n        self.clamp_min_for_underflow = True\n        self.clamp_max_for_overflow = True\n\n        self._reset_parameters()\n\n    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):\n        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()\n\n    def _reset_parameters(self):\n        nn.init.xavier_uniform_(self.v_proj.weight)\n        self.v_proj.bias.data.fill_(0)\n        nn.init.xavier_uniform_(self.l_proj.weight)\n        self.l_proj.bias.data.fill_(0)\n        nn.init.xavier_uniform_(self.values_v_proj.weight)\n        self.values_v_proj.bias.data.fill_(0)\n        nn.init.xavier_uniform_(self.values_l_proj.weight)\n        self.values_l_proj.bias.data.fill_(0)\n        nn.init.xavier_uniform_(self.out_v_proj.weight)\n        self.out_v_proj.bias.data.fill_(0)\n        nn.init.xavier_uniform_(self.out_l_proj.weight)\n        self.out_l_proj.bias.data.fill_(0)\n\n    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):\n        \"\"\"_summary_\n\n        Args:\n            v (_type_): bs, n_img, dim\n            l (_type_): bs, n_text, dim\n            attention_mask_v (_type_, optional): _description_. bs, n_img\n            attention_mask_l (_type_, optional): _description_. 
bs, n_text\n\n        Returns:\n            _type_: _description_\n        \"\"\"\n        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':\n        #     import ipdb; ipdb.set_trace()\n        bsz, tgt_len, _ = v.size()\n\n        query_states = self.v_proj(v) * self.scale\n        key_states = self._shape(self.l_proj(l), -1, bsz)\n        value_v_states = self._shape(self.values_v_proj(v), -1, bsz)\n        value_l_states = self._shape(self.values_l_proj(l), -1, bsz)\n\n        proj_shape = (bsz * self.num_heads, -1, self.head_dim)\n        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)\n        key_states = key_states.view(*proj_shape)\n        value_v_states = value_v_states.view(*proj_shape)\n        value_l_states = value_l_states.view(*proj_shape)\n\n        src_len = key_states.size(1)\n        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt\n\n        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n            raise ValueError(\n                f\"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}\"\n            )\n\n        if self.stable_softmax_2d:\n            attn_weights = attn_weights - attn_weights.max()\n\n        if self.clamp_min_for_underflow:\n            attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range\n        if self.clamp_max_for_overflow:\n            attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range\n\n        attn_weights_T = attn_weights.transpose(1, 2)\n        attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[\n            0])\n        if self.clamp_min_for_underflow:\n            attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range\n        if 
self.clamp_max_for_overflow:\n            attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range\n\n        # mask vision for language\n        if attention_mask_v is not None:\n            attention_mask_v = attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)\n            attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))\n\n        attn_weights_l = attn_weights_l.softmax(dim=-1)\n\n        # mask language for vision\n        if attention_mask_l is not None:\n            attention_mask_l = attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)\n            attn_weights.masked_fill_(attention_mask_l, float('-inf'))\n        attn_weights_v = attn_weights.softmax(dim=-1)\n\n        attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)\n        attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)\n\n        attn_output_v = torch.bmm(attn_probs_v, value_l_states)\n        attn_output_l = torch.bmm(attn_probs_l, value_v_states)\n\n\n        if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n            raise ValueError(\n                f\"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}\"\n            )\n\n        if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):\n            raise ValueError(\n                f\"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}\"\n            )\n\n        attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)\n        attn_output_v = attn_output_v.transpose(1, 2)\n        attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)\n\n        attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)\n        attn_output_l = 
attn_output_l.transpose(1, 2)\n        attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)\n\n        attn_output_v = self.out_v_proj(attn_output_v)\n        attn_output_l = self.out_l_proj(attn_output_l)\n\n        return attn_output_v, attn_output_l\n\n\n# Bi-Direction MHA (text->image, image->text)\nclass BiAttentionBlock(nn.Module):\n    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,\n                 drop_path=.0, init_values=1e-4, cfg=None):\n        \"\"\"\n        Inputs:\n            embed_dim - Dimensionality of input and attention feature vectors\n            hidden_dim - Dimensionality of hidden layer in feed-forward network\n                         (usually 2-4x larger than embed_dim)\n            num_heads - Number of heads to use in the Multi-Head Attention block\n            dropout - Amount of dropout to apply in the feed-forward network\n        \"\"\"\n        super(BiAttentionBlock, self).__init__()\n\n        # pre layer norm\n        self.layer_norm_v = nn.LayerNorm(v_dim)\n        self.layer_norm_l = nn.LayerNorm(l_dim)\n        self.attn = BiMultiHeadAttention(v_dim=v_dim,\n                                         l_dim=l_dim,\n                                         embed_dim=embed_dim,\n                                         num_heads=num_heads,\n                                         dropout=dropout)\n\n        # add layer scale for training stability\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity()\n        self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=False)\n        self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=False)\n\n    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):\n        v = self.layer_norm_v(v)\n        l = self.layer_norm_l(l)\n        delta_v, delta_l = self.attn(v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l)\n        # v, l = v + delta_v, l + delta_l\n        v = v + self.drop_path(self.gamma_v * delta_v)\n        l = l + self.drop_path(self.gamma_l * delta_l)\n        return v, l\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/mask_generate.py",
    "content": "import torch\n\n\ndef prepare_for_mask(kpt_mask):\n\n\n    tgt_size2 = 50 * 69\n    attn_mask2 = torch.ones(kpt_mask.shape[0], 8, tgt_size2, tgt_size2).to('cuda') < 0\n    group_bbox_kpt = 69\n    num_group=50\n    for matchj in range(num_group * group_bbox_kpt):\n        sj = (matchj // group_bbox_kpt) * group_bbox_kpt\n        ej = (matchj // group_bbox_kpt + 1)*group_bbox_kpt\n        if sj > 0:\n            attn_mask2[:,:,matchj, :sj] = True\n        if ej < num_group * group_bbox_kpt:\n            attn_mask2[:,:,matchj, ej:] = True\n\n\n    bs, length = kpt_mask.shape\n    equal_mask = kpt_mask[:, :, None] == kpt_mask[:, None, :]\n    equal_mask= equal_mask.unsqueeze(1).repeat(1,8,1,1)\n    for idx in range(num_group):\n        start_idx = idx * length\n        end_idx = (idx + 1) * length\n        attn_mask2[:, :,start_idx:end_idx, start_idx:end_idx][equal_mask] = False\n        attn_mask2[:, :,start_idx:end_idx, start_idx:end_idx][~equal_mask] = True\n\n\n\n\n    input_query_label = None\n    input_query_bbox = None\n    attn_mask = None\n    dn_meta = None\n\n    return input_query_label, input_query_bbox, attn_mask, attn_mask2.flatten(0,1), dn_meta\n\n\ndef post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):\n\n    if dn_meta and dn_meta['pad_size'] > 0:\n\n        output_known_class = [outputs_class_i[:, :dn_meta['pad_size'], :] for outputs_class_i in outputs_class]\n        output_known_coord = [outputs_coord_i[:, :dn_meta['pad_size'], :] for outputs_coord_i in outputs_coord]\n\n        outputs_class = [outputs_class_i[:, dn_meta['pad_size']:, :] for outputs_class_i in outputs_class]\n        outputs_coord = [outputs_coord_i[:, dn_meta['pad_size']:, :] for outputs_coord_i in outputs_coord]\n\n        out = {'pred_logits': output_known_class[-1], 'pred_boxes': output_known_coord[-1]}\n        if aux_loss:\n            out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_coord)\n        
dn_meta['output_known_lbs_bboxes'] = out\n    return outputs_class, outputs_coord\n\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom .ms_deform_attn_func import MSDeformAttnFunction\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport torch\nimport torch.nn.functional as F\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\n\nimport MultiScaleDeformableAttention as MSDA\n\n\nclass MSDeformAttnFunction(Function):\n    @staticmethod\n    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):\n        ctx.im2col_step = im2col_step\n        output = MSDA.ms_deform_attn_forward(\n            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)\n        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output):\n        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors\n        grad_value, grad_sampling_loc, grad_attn_weight = \\\n            MSDA.ms_deform_attn_backward(\n                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)\n\n        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None\n\n\ndef ms_deform_attn_core_pytorch(value, 
value_spatial_shapes, sampling_locations, attention_weights):\n    # for debug and test only,\n    # need to use cuda version instead\n    N_, S_, M_, D_ = value.shape\n    _, Lq_, M_, L_, P_, _ = sampling_locations.shape\n    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)\n    sampling_grids = 2 * sampling_locations - 1\n    sampling_value_list = []\n    for lid_, (H_, W_) in enumerate(value_spatial_shapes):\n        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_\n        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)\n        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2\n        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)\n        # N_*M_, D_, Lq_, P_\n        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,\n                                          mode='bilinear', padding_mode='zeros', align_corners=False)\n        sampling_value_list.append(sampling_value_l_)\n    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)\n    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)\n    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)\n    return output.transpose(1, 2).contiguous()\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom .ms_deform_attn import MSDeformAttn\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport warnings\nimport math, os\nimport sys\nsys.path.append(os.path.dirname(os.path.abspath(__file__)))\n\nimport torch\nfrom torch import nn\nimport torch.nn.functional as F\nfrom torch.nn.init import xavier_uniform_, constant_\n\nfrom src.utils.dependencies.XPose.models.UniPose.ops.functions.ms_deform_attn_func import MSDeformAttnFunction\n\n\ndef _is_power_of_2(n):\n    if (not isinstance(n, int)) or (n < 0):\n        raise ValueError(\"invalid input for _is_power_of_2: {} (type: {})\".format(n, type(n)))\n    return (n & (n-1) == 0) and n != 0\n\n\nclass MSDeformAttn(nn.Module):\n    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):\n        \"\"\"\n        Multi-Scale Deformable Attention Module\n        :param d_model      hidden dimension\n        :param n_levels     number of feature levels\n        :param n_heads      number of attention heads\n        :param n_points     number of sampling points per attention head per feature level\n        \"\"\"\n        super().__init__()\n        if d_model % n_heads != 0:\n            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))\n        _d_per_head = d_model // n_heads\n        # you'd better set _d_per_head to a power of 2 which is more efficient 
in our CUDA implementation\n        if not _is_power_of_2(_d_per_head):\n            warnings.warn(\"You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 \"\n                          \"which is more efficient in our CUDA implementation.\")\n\n        self.im2col_step = 64\n\n        self.d_model = d_model\n        self.n_levels = n_levels\n        self.n_heads = n_heads\n        self.n_points = n_points\n\n        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)\n        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)\n        self.value_proj = nn.Linear(d_model, d_model)\n        self.output_proj = nn.Linear(d_model, d_model)\n\n        self.use_4D_normalizer = use_4D_normalizer\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        constant_(self.sampling_offsets.weight.data, 0.)\n        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)\n        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)\n        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)\n        for i in range(self.n_points):\n            grid_init[:, :, i, :] *= i + 1\n        with torch.no_grad():\n            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))\n        constant_(self.attention_weights.weight.data, 0.)\n        constant_(self.attention_weights.bias.data, 0.)\n        xavier_uniform_(self.value_proj.weight.data)\n        constant_(self.value_proj.bias.data, 0.)\n        xavier_uniform_(self.output_proj.weight.data)\n        constant_(self.output_proj.bias.data, 0.)\n\n    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):\n        \"\"\"\n        :param query                       (N, Length_{query}, C)\n        :param 
reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area\n                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes\n        :param input_flatten               (N, \\sum_{l=0}^{L-1} H_l \\cdot W_l, C)\n        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]\n        :param input_padding_mask          (N, \\sum_{l=0}^{L-1} H_l \\cdot W_l), True for padding elements, False for non-padding elements\n\n        :return output                     (N, Length_{query}, C)\n        \"\"\"\n        N, Len_q, _ = query.shape\n        N, Len_in, _ = input_flatten.shape\n        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in\n\n        value = self.value_proj(input_flatten)\n        if input_padding_mask is not None:\n            value = value.masked_fill(input_padding_mask[..., None], float(0))\n        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)\n        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)\n        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)\n        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)\n        # N, Len_q, n_heads, n_levels, n_points, 2\n\n        # if os.environ.get('IPDB_DEBUG_SHILONG', False) == 'INFO':\n        #     import ipdb; ipdb.set_trace()\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)\n            sampling_locations = 
reference_points[:, :, None, :, None, :] \\\n                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]\n        elif reference_points.shape[-1] == 4:\n            if self.use_4D_normalizer:\n                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)\n                sampling_locations = reference_points[:, :, None, :, None, :2] \\\n                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5\n            else:\n                sampling_locations = reference_points[:, :, None, :, None, :2] \\\n                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5\n        else:\n            raise ValueError(\n                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))\n\n\n        # if os.environ.get('IPDB_DEBUG_SHILONG', False) == 'INFO':\n        #     import ipdb; ipdb.set_trace()\n\n        # for amp\n        if value.dtype == torch.float16:\n            # for mixed precision\n            output = MSDeformAttnFunction.apply(\n            value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step)\n            output = output.to(torch.float16)\n            output = self.output_proj(output)\n            return output\n\n        output = MSDeformAttnFunction.apply(\n            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)\n        output = self.output_proj(output)\n        return output\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport warnings\nimport math, os\n\nimport torch\nfrom torch import nn\nimport torch.nn.functional as F\nfrom torch.nn.init import xavier_uniform_, constant_\n\ntry:\n    from src.utils.dependencies.XPose.models.UniPose.ops.functions import MSDeformAttnFunction\nexcept:\n    warnings.warn('Failed to import MSDeformAttnFunction.')\n\n\ndef _is_power_of_2(n):\n    if (not isinstance(n, int)) or (n < 0):\n        raise ValueError(\"invalid input for _is_power_of_2: {} (type: {})\".format(n, type(n)))\n    return (n & (n-1) == 0) and n != 0\n\n\nclass MSDeformAttn(nn.Module):\n    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):\n        \"\"\"\n        Multi-Scale Deformable Attention Module\n        :param d_model      hidden dimension\n        :param n_levels     number of feature levels\n        :param n_heads      number of attention heads\n        :param n_points     number of sampling points per attention head per feature level\n        \"\"\"\n        super().__init__()\n        if d_model % n_heads != 0:\n            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))\n        _d_per_head = d_model // n_heads\n        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA 
implementation\n        if not _is_power_of_2(_d_per_head):\n            warnings.warn(\"You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 \"\n                          \"which is more efficient in our CUDA implementation.\")\n\n        self.im2col_step = 64\n\n        self.d_model = d_model\n        self.n_levels = n_levels\n        self.n_heads = n_heads\n        self.n_points = n_points\n\n        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)\n        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)\n        self.value_proj = nn.Linear(d_model, d_model)\n        self.output_proj = nn.Linear(d_model, d_model)\n\n        self.use_4D_normalizer = use_4D_normalizer\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        constant_(self.sampling_offsets.weight.data, 0.)\n        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)\n        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)\n        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)\n        for i in range(self.n_points):\n            grid_init[:, :, i, :] *= i + 1\n        with torch.no_grad():\n            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))\n        constant_(self.attention_weights.weight.data, 0.)\n        constant_(self.attention_weights.bias.data, 0.)\n        xavier_uniform_(self.value_proj.weight.data)\n        constant_(self.value_proj.bias.data, 0.)\n        xavier_uniform_(self.output_proj.weight.data)\n        constant_(self.output_proj.bias.data, 0.)\n\n    def forward(self, query, key, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):\n        \"\"\"\n        :param query                       (N, Length_{query}, C)\n        :param key                
          (N, 1, C)\n        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area\n                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes\n        :param input_flatten               (N, \\sum_{l=0}^{L-1} H_l \\cdot W_l, C)\n        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]\n        :param input_padding_mask          (N, \\sum_{l=0}^{L-1} H_l \\cdot W_l), True for padding elements, False for non-padding elements\n\n        :return output                     (N, Length_{query}, C)\n        \"\"\"\n        N, Len_q, _ = query.shape\n        N, Len_in, _ = input_flatten.shape\n        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in\n\n        value = self.value_proj(input_flatten)\n        if input_padding_mask is not None:\n            value = value.masked_fill(input_padding_mask[..., None], float(0))\n        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)\n        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)\n        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)\n        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)\n        # N, Len_q, n_heads, n_levels, n_points, 2\n\n        # if os.environ.get('IPDB_DEBUG_SHILONG', False) == 'INFO':\n        #     import ipdb; ipdb.set_trace()\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)\n            
sampling_locations = reference_points[:, :, None, :, None, :] \\\n                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]\n        elif reference_points.shape[-1] == 4:\n            if self.use_4D_normalizer:\n                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)\n                sampling_locations = reference_points[:, :, None, :, None, :2] \\\n                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5\n            else:\n                sampling_locations = reference_points[:, :, None, :, None, :2] \\\n                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5\n        else:\n            raise ValueError(\n                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))\n        output = MSDeformAttnFunction.apply(\n            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)\n        output = self.output_proj(output)\n        return output\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/setup.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nimport os\nimport glob\n\nimport torch\n\nfrom torch.utils.cpp_extension import CUDA_HOME\nfrom torch.utils.cpp_extension import CppExtension\nfrom torch.utils.cpp_extension import CUDAExtension\n\nfrom setuptools import find_packages\nfrom setuptools import setup\n\nrequirements = [\"torch\", \"torchvision\"]\n\ndef get_extensions():\n    this_dir = os.path.dirname(os.path.abspath(__file__))\n    extensions_dir = os.path.join(this_dir, \"src\")\n\n    main_file = glob.glob(os.path.join(extensions_dir, \"*.cpp\"))\n    source_cpu = glob.glob(os.path.join(extensions_dir, \"cpu\", \"*.cpp\"))\n    source_cuda = glob.glob(os.path.join(extensions_dir, \"cuda\", \"*.cu\"))\n\n    sources = main_file + source_cpu\n    extension = CppExtension\n    extra_compile_args = {\"cxx\": []}\n    define_macros = []\n\n    # import ipdb; ipdb.set_trace()\n\n    if torch.cuda.is_available() and CUDA_HOME is not None:\n        extension = CUDAExtension\n        sources += source_cuda\n        define_macros += [(\"WITH_CUDA\", None)]\n        extra_compile_args[\"nvcc\"] = [\n            \"-DCUDA_HAS_FP16=1\",\n            \"-D__CUDA_NO_HALF_OPERATORS__\",\n            \"-D__CUDA_NO_HALF_CONVERSIONS__\",\n            \"-D__CUDA_NO_HALF2_OPERATORS__\",\n        ]\n    else:\n        raise NotImplementedError('Cuda is not availabel')\n\n    sources = [os.path.join(extensions_dir, s) for s in sources]\n    include_dirs = 
[extensions_dir]\n    ext_modules = [\n        extension(\n            \"MultiScaleDeformableAttention\",\n            sources,\n            include_dirs=include_dirs,\n            define_macros=define_macros,\n            extra_compile_args=extra_compile_args,\n        )\n    ]\n    return ext_modules\n\nsetup(\n    name=\"MultiScaleDeformableAttention\",\n    version=\"1.0\",\n    author=\"Weijie Su\",\n    url=\"https://github.com/fundamentalvision/Deformable-DETR\",\n    description=\"PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention\",\n    packages=find_packages(exclude=(\"configs\", \"tests\",)),\n    ext_modules=get_extensions(),\n    cmdclass={\"build_ext\": torch.utils.cpp_extension.BuildExtension},\n)\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include <vector>\n\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n\n\nat::Tensor\nms_deform_attn_cpu_forward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const int im2col_step)\n{\n    AT_ERROR(\"Not implement on cpu\");\n}\n\nstd::vector<at::Tensor>\nms_deform_attn_cpu_backward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const at::Tensor &grad_output,\n    const int im2col_step)\n{\n    AT_ERROR(\"Not implement on cpu\");\n}\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#pragma once\n#include <torch/extension.h>\n\nat::Tensor\nms_deform_attn_cpu_forward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const int im2col_step);\n\nstd::vector<at::Tensor>\nms_deform_attn_cpu_backward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const at::Tensor &grad_output,\n    const int im2col_step);\n\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include <vector>\n#include \"cuda/ms_deform_im2col_cuda.cuh\"\n\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n\n\nat::Tensor ms_deform_attn_cuda_forward(\n    const at::Tensor &value,\n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const int im2col_step)\n{\n    AT_ASSERTM(value.is_contiguous(), \"value tensor has to be contiguous\");\n    AT_ASSERTM(spatial_shapes.is_contiguous(), \"spatial_shapes tensor has to be contiguous\");\n    AT_ASSERTM(level_start_index.is_contiguous(), \"level_start_index tensor has to be contiguous\");\n    AT_ASSERTM(sampling_loc.is_contiguous(), \"sampling_loc tensor has to be contiguous\");\n    AT_ASSERTM(attn_weight.is_contiguous(), \"attn_weight tensor has to be contiguous\");\n\n    AT_ASSERTM(value.type().is_cuda(), \"value must be a CUDA tensor\");\n    AT_ASSERTM(spatial_shapes.type().is_cuda(), \"spatial_shapes must be a CUDA tensor\");\n    AT_ASSERTM(level_start_index.type().is_cuda(), \"level_start_index must be a CUDA tensor\");\n    AT_ASSERTM(sampling_loc.type().is_cuda(), \"sampling_loc must be a CUDA tensor\");\n    AT_ASSERTM(attn_weight.type().is_cuda(), \"attn_weight must be a CUDA tensor\");\n\n    const int batch = value.size(0);\n    const int spatial_size = value.size(1);\n    
const int num_heads = value.size(2);\n    const int channels = value.size(3);\n\n    const int num_levels = spatial_shapes.size(0);\n\n    const int num_query = sampling_loc.size(1);\n    const int num_point = sampling_loc.size(4);\n\n    const int im2col_step_ = std::min(batch, im2col_step);\n\n    AT_ASSERTM(batch % im2col_step_ == 0, \"batch(%d) must divide im2col_step(%d)\", batch, im2col_step_);\n\n    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());\n\n    const int batch_n = im2col_step_;\n    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});\n    auto per_value_size = spatial_size * num_heads * channels;\n    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;\n    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;\n    for (int n = 0; n < batch/im2col_step_; ++n)\n    {\n        auto columns = output_n.select(0, n);\n        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), \"ms_deform_attn_forward_cuda\", ([&] {\n            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),\n                value.data<scalar_t>() + n * im2col_step_ * per_value_size,\n                spatial_shapes.data<int64_t>(),\n                level_start_index.data<int64_t>(),\n                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,\n                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,\n                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,\n                columns.data<scalar_t>());\n\n        }));\n    }\n\n    output = output.view({batch, num_query, num_heads*channels});\n\n    return output;\n}\n\n\nstd::vector<at::Tensor> ms_deform_attn_cuda_backward(\n    const at::Tensor &value,\n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    
const at::Tensor &grad_output,\n    const int im2col_step)\n{\n\n    AT_ASSERTM(value.is_contiguous(), \"value tensor has to be contiguous\");\n    AT_ASSERTM(spatial_shapes.is_contiguous(), \"spatial_shapes tensor has to be contiguous\");\n    AT_ASSERTM(level_start_index.is_contiguous(), \"level_start_index tensor has to be contiguous\");\n    AT_ASSERTM(sampling_loc.is_contiguous(), \"sampling_loc tensor has to be contiguous\");\n    AT_ASSERTM(attn_weight.is_contiguous(), \"attn_weight tensor has to be contiguous\");\n    AT_ASSERTM(grad_output.is_contiguous(), \"grad_output tensor has to be contiguous\");\n\n    AT_ASSERTM(value.type().is_cuda(), \"value must be a CUDA tensor\");\n    AT_ASSERTM(spatial_shapes.type().is_cuda(), \"spatial_shapes must be a CUDA tensor\");\n    AT_ASSERTM(level_start_index.type().is_cuda(), \"level_start_index must be a CUDA tensor\");\n    AT_ASSERTM(sampling_loc.type().is_cuda(), \"sampling_loc must be a CUDA tensor\");\n    AT_ASSERTM(attn_weight.type().is_cuda(), \"attn_weight must be a CUDA tensor\");\n    AT_ASSERTM(grad_output.type().is_cuda(), \"grad_output must be a CUDA tensor\");\n\n    const int batch = value.size(0);\n    const int spatial_size = value.size(1);\n    const int num_heads = value.size(2);\n    const int channels = value.size(3);\n\n    const int num_levels = spatial_shapes.size(0);\n\n    const int num_query = sampling_loc.size(1);\n    const int num_point = sampling_loc.size(4);\n\n    const int im2col_step_ = std::min(batch, im2col_step);\n\n    AT_ASSERTM(batch % im2col_step_ == 0, \"batch(%d) must divide im2col_step(%d)\", batch, im2col_step_);\n\n    auto grad_value = at::zeros_like(value);\n    auto grad_sampling_loc = at::zeros_like(sampling_loc);\n    auto grad_attn_weight = at::zeros_like(attn_weight);\n\n    const int batch_n = im2col_step_;\n    auto per_value_size = spatial_size * num_heads * channels;\n    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;\n    
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;\n    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});\n\n    for (int n = 0; n < batch/im2col_step_; ++n)\n    {\n        auto grad_output_g = grad_output_n.select(0, n);\n        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), \"ms_deform_attn_backward_cuda\", ([&] {\n            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),\n                                    grad_output_g.data<scalar_t>(),\n                                    value.data<scalar_t>() + n * im2col_step_ * per_value_size,\n                                    spatial_shapes.data<int64_t>(),\n                                    level_start_index.data<int64_t>(),\n                                    sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,\n                                    attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,\n                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,\n                                    grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,\n                                    grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,\n                                    grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);\n\n        }));\n    }\n\n    return {\n        grad_value, grad_sampling_loc, grad_attn_weight\n    };\n}\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#pragma once\n#include <torch/extension.h>\n\nat::Tensor ms_deform_attn_cuda_forward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const int im2col_step);\n\nstd::vector<at::Tensor> ms_deform_attn_cuda_backward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const at::Tensor &grad_output,\n    const int im2col_step);\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh",
    "content": "/*!\n**************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************\n* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)\n* Copyright (c) 2018 Microsoft\n**************************************************************************\n*/\n\n#include <cstdio>\n#include <algorithm>\n#include <cstring>\n\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n\n#include <THC/THCAtomics.cuh>\n\n#define CUDA_KERNEL_LOOP(i, n)                          \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \\\n      i < (n);                                          \\\n      i += blockDim.x * gridDim.x)\n\nconst int CUDA_NUM_THREADS = 1024;\ninline int GET_BLOCKS(const int N, const int num_threads)\n{\n  return (N + num_threads - 1) / num_threads;\n}\n\n\ntemplate <typename scalar_t>\n__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, \n                                                   const int &height, const int &width, const int &nheads, const int &channels,\n                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)\n{\n  const int h_low = floor(h);\n  const int w_low = floor(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  
scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0)\n  {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n  {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n  {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n  {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n  }\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\n\ntemplate <typename scalar_t>\n__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, \n                                                   const int &height, const int &width, const int &nheads, const int &channels,\n                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,\n                                                   const scalar_t &top_grad,\n                                                   const scalar_t &attn_weight,\n                                                   scalar_t* &grad_value, \n                                                   scalar_t* grad_sampling_loc,\n                                                   scalar_t* grad_attn_weight)\n{\n  const int h_low = floor(h);\n  const int w_low = floor(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int 
h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const scalar_t top_grad_value = top_grad * attn_weight;\n  scalar_t grad_h_weight = 0, grad_w_weight = 0;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0)\n  {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value+ptr1, w1*top_grad_value);\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n  {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value+ptr2, w2*top_grad_value);\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n  {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value+ptr3, w3*top_grad_value); \n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n  {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value+ptr4, w4*top_grad_value);\n  }\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  *grad_attn_weight = top_grad * val;\n  *grad_sampling_loc = width * grad_w_weight * top_grad_value;\n  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;\n}\n\n\ntemplate <typename scalar_t>\n__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, \n                                                   const int &height, const int &width, const int 
&nheads, const int &channels,\n                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,\n                                                   const scalar_t &top_grad,\n                                                   const scalar_t &attn_weight,\n                                                   scalar_t* &grad_value, \n                                                   scalar_t* grad_sampling_loc,\n                                                   scalar_t* grad_attn_weight)\n{\n  const int h_low = floor(h);\n  const int w_low = floor(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const scalar_t top_grad_value = top_grad * attn_weight;\n  scalar_t grad_h_weight = 0, grad_w_weight = 0;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0)\n  {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value+ptr1, w1*top_grad_value);\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n  {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value+ptr2, w2*top_grad_value);\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n  {\n    const int ptr3 = h_high_ptr_offset 
+ w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value+ptr3, w3*top_grad_value); \n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n  {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value+ptr4, w4*top_grad_value);\n  }\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  atomicAdd(grad_attn_weight, top_grad * val); \n  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);\n  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);\n}\n\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_im2col_gpu_kernel(const int n,\n                                                const scalar_t *data_value, \n                                                const int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                                const int num_heads,\n                                                const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *data_col)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    
const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    scalar_t *data_col_ptr = data_col + index;\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n    scalar_t col = 0;\n    \n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;\n        }\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n      }\n    }\n    *data_col_ptr = col;\n  }\n}\n\ntemplate <typename scalar_t, unsigned int blockSize>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,\n                                                const scalar_t *grad_col,\n                                                const scalar_t *data_value,\n                                                const 
int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                                const int num_heads,\n                                                const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *grad_value,\n                                                scalar_t *grad_sampling_loc,\n                                                scalar_t *grad_attn_weight)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];\n    __shared__ scalar_t cache_grad_attn_weight[blockSize];\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * 
spatial_size * qid_stride;\n\n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight+threadIdx.x)=0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          ms_deform_attn_col2im_bilinear(\n            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,\n            top_grad, weight, grad_value_ptr, \n            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);\n        }\n        \n        __syncthreads();\n        if (tid == 0)\n        {\n          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];\n          int sid=2;\n          for (unsigned int tid = 1; tid < blockSize; ++tid)\n          {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[tid];\n            sid += 2;\n          
}\n          \n          \n          *grad_sampling_loc = _grad_w;\n          *(grad_sampling_loc + 1) = _grad_h;\n          *grad_attn_weight = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\n\ntemplate <typename scalar_t, unsigned int blockSize>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,\n                                                const scalar_t *grad_col,\n                                                const scalar_t *data_value,\n                                                const int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                                const int num_heads,\n                                                const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *grad_value,\n                                                scalar_t *grad_sampling_loc,\n                                                scalar_t *grad_attn_weight)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];\n    __shared__ scalar_t cache_grad_attn_weight[blockSize];\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % 
channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight+threadIdx.x)=0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          ms_deform_attn_col2im_bilinear(\n            
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,\n            top_grad, weight, grad_value_ptr, \n            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);\n        }\n        \n        __syncthreads();\n\n        for (unsigned int s=blockSize/2; s>0; s>>=1)\n        {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0)\n        { \n          *grad_sampling_loc = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight = cache_grad_attn_weight[0];\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,\n                                                const scalar_t *grad_col,\n                                                const scalar_t *data_value,\n                                                const int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                               
 const int num_heads,\n                                                const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *grad_value,\n                                                scalar_t *grad_sampling_loc,\n                                                scalar_t *grad_attn_weight)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    extern __shared__ int _s[];\n    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;\n    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;\n      const 
scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight+threadIdx.x)=0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          ms_deform_attn_col2im_bilinear(\n            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,\n            top_grad, weight, grad_value_ptr, \n            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);\n        }\n        \n        __syncthreads();\n        if (tid == 0)\n        {\n          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];\n          int sid=2;\n          for (unsigned int tid = 1; tid < blockDim.x; ++tid)\n          {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[tid];\n            sid += 2;\n          }\n          \n          \n          *grad_sampling_loc = _grad_w;\n          *(grad_sampling_loc + 1) = _grad_h;\n          *grad_attn_weight = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void 
ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,\n                                                const scalar_t *grad_col,\n                                                const scalar_t *data_value,\n                                                const int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                                const int num_heads,\n                                                const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *grad_value,\n                                                scalar_t *grad_sampling_loc,\n                                                scalar_t *grad_attn_weight)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    extern __shared__ int _s[];\n    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;\n    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int 
grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight+threadIdx.x)=0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          ms_deform_attn_col2im_bilinear(\n            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,\n            top_grad, weight, grad_value_ptr, \n            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);\n        }\n        \n        __syncthreads();\n\n        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)\n        {\n          if (tid < s) {\n            const unsigned 
int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre)\n            {\n              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            } \n          }\n          __syncthreads();\n        }\n\n        if (tid == 0)\n        {\n          *grad_sampling_loc = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight = cache_grad_attn_weight[0];\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,\n                                                const scalar_t *grad_col,\n                                                const scalar_t *data_value,\n                                                const int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                                const int num_heads,\n                 
                               const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *grad_value,\n                                                scalar_t *grad_sampling_loc,\n                                                scalar_t *grad_attn_weight)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    extern __shared__ int _s[];\n    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;\n    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + 
value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight+threadIdx.x)=0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          ms_deform_attn_col2im_bilinear(\n            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,\n            top_grad, weight, grad_value_ptr, \n            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);\n        }\n        \n        __syncthreads();\n\n        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)\n        {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre)\n            {\n              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0)\n        {\n          atomicAdd(grad_sampling_loc, 
cache_grad_sampling_loc[0]);\n          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);\n          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,\n                                                const scalar_t *grad_col,\n                                                const scalar_t *data_value,\n                                                const int64_t *data_spatial_shapes,\n                                                const int64_t *data_level_start_index, \n                                                const scalar_t *data_sampling_loc,\n                                                const scalar_t *data_attn_weight,\n                                                const int batch_size, \n                                                const int spatial_size, \n                                                const int num_heads,\n                                                const int channels, \n                                                const int num_levels,\n                                                const int num_query,\n                                                const int num_point,\n                                                scalar_t *grad_value,\n                                                scalar_t *grad_sampling_loc,\n                                                scalar_t *grad_attn_weight)\n{\n  CUDA_KERNEL_LOOP(index, n)\n  {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp; \n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % num_query;\n    _temp /= 
num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col=0; l_col < num_levels; ++l_col)\n    {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col=0; p_col < num_point; ++p_col)\n      {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)\n        {\n          ms_deform_attn_col2im_bilinear_gm(\n            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,\n            top_grad, weight, grad_value_ptr, \n            grad_sampling_loc, grad_attn_weight);\n        }\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  
}\n}\n\n\ntemplate <typename scalar_t>\nvoid ms_deformable_im2col_cuda(cudaStream_t stream,\n                              const scalar_t* data_value,\n                              const int64_t* data_spatial_shapes, \n                              const int64_t* data_level_start_index, \n                              const scalar_t* data_sampling_loc,\n                              const scalar_t* data_attn_weight,\n                              const int batch_size,\n                              const int spatial_size, \n                              const int num_heads, \n                              const int channels, \n                              const int num_levels, \n                              const int num_query,\n                              const int num_point,\n                              scalar_t* data_col)\n{\n  const int num_kernels = batch_size * num_query * num_heads * channels;\n  const int num_actual_kernels = batch_size * num_query * num_heads * channels;\n  const int num_threads = CUDA_NUM_THREADS;\n  ms_deformable_im2col_gpu_kernel<scalar_t>\n      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n          0, stream>>>(\n      num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, \n      batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);\n  \n  cudaError_t err = cudaGetLastError();\n  if (err != cudaSuccess)\n  {\n    printf(\"error in ms_deformable_im2col_cuda: %s\\n\", cudaGetErrorString(err));\n  }\n\n}\n\ntemplate <typename scalar_t>\nvoid ms_deformable_col2im_cuda(cudaStream_t stream,\n                              const scalar_t* grad_col,\n                              const scalar_t* data_value,\n                              const int64_t * data_spatial_shapes,\n                              const int64_t * data_level_start_index,\n                              const scalar_t * data_sampling_loc,\n                     
         const scalar_t * data_attn_weight,\n                              const int batch_size, \n                              const int spatial_size, \n                              const int num_heads,\n                              const int channels, \n                              const int num_levels,\n                              const int num_query,\n                              const int num_point, \n                              scalar_t* grad_value,\n                              scalar_t* grad_sampling_loc,\n                              scalar_t* grad_attn_weight)\n{\n  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;\n  const int num_kernels = batch_size * num_query * num_heads * channels;\n  const int num_actual_kernels = batch_size * num_query * num_heads * channels;\n  if (channels > 1024)\n  {\n    if ((channels & 1023) == 0)\n    {\n      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n              num_threads*3*sizeof(scalar_t), stream>>>(\n                        num_kernels, \n                        grad_col,\n                        data_value,\n                        data_spatial_shapes,\n                        data_level_start_index, \n                        data_sampling_loc,\n                        data_attn_weight,\n                        batch_size, \n                        spatial_size, \n                        num_heads,\n                        channels, \n                        num_levels,\n                        num_query,\n                        num_point,\n                        grad_value,\n                        grad_sampling_loc,\n                        grad_attn_weight);\n    }\n    else\n    {\n      ms_deformable_col2im_gpu_kernel_gm<scalar_t>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n      
                grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n    }\n  }\n  else{\n    switch(channels)\n    {\n      case 1:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 2:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                 
     batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 4:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 8:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      
grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 16:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 32:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 64:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n        
              num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 128:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 256:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n        
              batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 512:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n                      grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      case 1024:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>\n        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n            0, stream>>>(\n                      num_kernels, \n                      grad_col,\n                      data_value,\n                      data_spatial_shapes,\n                      data_level_start_index, \n                      data_sampling_loc,\n                      data_attn_weight,\n                      batch_size, \n                      spatial_size, \n                      num_heads,\n                      channels, \n                      num_levels,\n                      num_query,\n                      num_point,\n                      grad_value,\n      
                grad_sampling_loc,\n                      grad_attn_weight);\n        break;\n      default:\n        if (channels < 64)\n        {\n          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n              num_threads*3*sizeof(scalar_t), stream>>>(\n                        num_kernels, \n                        grad_col,\n                        data_value,\n                        data_spatial_shapes,\n                        data_level_start_index, \n                        data_sampling_loc,\n                        data_attn_weight,\n                        batch_size, \n                        spatial_size, \n                        num_heads,\n                        channels, \n                        num_levels,\n                        num_query,\n                        num_point,\n                        grad_value,\n                        grad_sampling_loc,\n                        grad_attn_weight);\n        }\n        else\n        {\n          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n              num_threads*3*sizeof(scalar_t), stream>>>(\n                        num_kernels, \n                        grad_col,\n                        data_value,\n                        data_spatial_shapes,\n                        data_level_start_index, \n                        data_sampling_loc,\n                        data_attn_weight,\n                        batch_size, \n                        spatial_size, \n                        num_heads,\n                        channels, \n                        num_levels,\n                        num_query,\n                        num_point,\n                        grad_value,\n                        grad_sampling_loc,\n                        grad_attn_weight);\n        }\n    }\n  }\n  cudaError_t err = cudaGetLastError();\n 
 if (err != cudaSuccess)\n  {\n    printf(\"error in ms_deformable_col2im_cuda: %s\\n\", cudaGetErrorString(err));\n  }\n\n}"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#pragma once\n\n#include \"cpu/ms_deform_attn_cpu.h\"\n\n#ifdef WITH_CUDA\n#include \"cuda/ms_deform_attn_cuda.h\"\n#endif\n\n\nat::Tensor\nms_deform_attn_forward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const int im2col_step)\n{\n    if (value.type().is_cuda())\n    {\n#ifdef WITH_CUDA\n        return ms_deform_attn_cuda_forward(\n            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);\n#else\n        AT_ERROR(\"Not compiled with GPU support\");\n#endif\n    }\n    AT_ERROR(\"Not implemented on the CPU\");\n}\n\nstd::vector<at::Tensor>\nms_deform_attn_backward(\n    const at::Tensor &value, \n    const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index,\n    const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight,\n    const at::Tensor &grad_output,\n    const int im2col_step)\n{\n    if (value.type().is_cuda())\n    {\n#ifdef WITH_CUDA\n        return ms_deform_attn_cuda_backward(\n            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);\n#else\n        AT_ERROR(\"Not compiled with GPU support\");\n#endif\n    }\n    AT_ERROR(\"Not implemented on the CPU\");\n}\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include \"ms_deform_attn.h\"\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"ms_deform_attn_forward\", &ms_deform_attn_forward, \"ms_deform_attn_forward\");\n  m.def(\"ms_deform_attn_backward\", &ms_deform_attn_backward, \"ms_deform_attn_backward\");\n}\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/ops/test.py",
    "content": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport time\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import gradcheck\n\nfrom functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch\n\n\nN, M, D = 1, 2, 2\nLq, L, P = 2, 2, 2\nshapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()\nlevel_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))\nS = sum([(H*W).item() for H, W in shapes])\n\n\ntorch.manual_seed(3)\n\n\n@torch.no_grad()\ndef check_forward_equal_with_pytorch_double():\n    value = torch.rand(N, S, M, D).cuda() * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()\n    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5\n    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)\n    im2col_step = 2\n    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()\n    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()\n    fwdok = torch.allclose(output_cuda, output_pytorch)\n    max_abs_err = (output_cuda - output_pytorch).abs().max()\n    max_rel_err = ((output_cuda - output_pytorch).abs() / 
output_pytorch.abs()).max()\n\n    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')\n\n\n@torch.no_grad()\ndef check_forward_equal_with_pytorch_float():\n    value = torch.rand(N, S, M, D).cuda() * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()\n    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5\n    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)\n    im2col_step = 2\n    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()\n    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()\n    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)\n    max_abs_err = (output_cuda - output_pytorch).abs().max()\n    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()\n\n    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')\n\n\ndef check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):\n\n    value = torch.rand(N, S, M, channels).cuda() * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()\n    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5\n    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)\n    im2col_step = 2\n    func = MSDeformAttnFunction.apply\n\n    value.requires_grad = grad_value\n    sampling_locations.requires_grad = grad_sampling_loc\n    attention_weights.requires_grad = grad_attn_weight\n\n    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))\n\n    print(f'* {gradok} check_gradient_numerical(D={channels})')\n\n\nif __name__ == '__main__':\n    
check_forward_equal_with_pytorch_double()\n    check_forward_equal_with_pytorch_float()\n\n    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:\n        check_gradient_numerical(channels, True, True, True)\n\n\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/position_encoding.py",
    "content": "# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Conditional DETR\n# Copyright (c) 2021 Microsoft. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Copied from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n# ------------------------------------------------------------------------\n\n\"\"\"\nVarious positional encodings for the transformer.\n\"\"\"\nimport math\nimport torch\nfrom torch import nn\n\nfrom util.misc import NestedTensor\n\n\nclass PositionEmbeddingSine(nn.Module):\n    \"\"\"\n    This is a more standard version of the position embedding, very similar to the one\n    used by the Attention is all you need paper, generalized to work on images.\n    \"\"\"\n    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):\n        super().__init__()\n        self.num_pos_feats = num_pos_feats\n        self.temperature = temperature\n        self.normalize = normalize\n        if scale is not None and normalize is False:\n            raise ValueError(\"normalize should be True if scale is passed\")\n        if scale is None:\n            scale = 2 * math.pi\n        self.scale = scale\n\n    def forward(self, tensor_list: NestedTensor):\n        x = tensor_list.tensors\n        mask = tensor_list.mask\n        assert mask is not None\n        not_mask = ~mask\n        y_embed = not_mask.cumsum(1, dtype=torch.float32)\n        x_embed = not_mask.cumsum(2, dtype=torch.float32)\n        if self.normalize:\n            eps = 1e-6\n            # if os.environ.get(\"SHILONG_AMP\", None) 
== '1':\n            #     eps = 1e-4\n            # else:\n            #     eps = 1e-6\n            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale\n            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale\n\n        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)\n        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)\n\n        pos_x = x_embed[:, :, :, None] / dim_t\n        pos_y = y_embed[:, :, :, None] / dim_t\n        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)\n        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)\n        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)\n        return pos\n\nclass PositionEmbeddingSineHW(nn.Module):\n    \"\"\"\n    This is a more standard version of the position embedding, very similar to the one\n    used by the Attention is all you need paper, generalized to work on images.\n    \"\"\"\n    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):\n        super().__init__()\n        self.num_pos_feats = num_pos_feats\n        self.temperatureH = temperatureH\n        self.temperatureW = temperatureW\n        self.normalize = normalize\n        if scale is not None and normalize is False:\n            raise ValueError(\"normalize should be True if scale is passed\")\n        if scale is None:\n            scale = 2 * math.pi\n        self.scale = scale\n\n    def forward(self, tensor_list: NestedTensor):\n        x = tensor_list.tensors\n        mask = tensor_list.mask\n        assert mask is not None\n        not_mask = ~mask\n        y_embed = not_mask.cumsum(1, dtype=torch.float32)\n        x_embed = not_mask.cumsum(2, dtype=torch.float32)\n\n        # import ipdb; ipdb.set_trace()\n\n        if self.normalize:\n            eps = 1e-6\n            y_embed = y_embed / (y_embed[:, 
-1:, :] + eps) * self.scale\n            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale\n\n        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)\n        dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)\n        pos_x = x_embed[:, :, :, None] / dim_tx\n\n        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)\n        dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)\n        pos_y = y_embed[:, :, :, None] / dim_ty\n\n        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)\n        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)\n        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)\n\n        # import ipdb; ipdb.set_trace()\n\n        return pos\n\nclass PositionEmbeddingLearned(nn.Module):\n    \"\"\"\n    Absolute pos embedding, learned.\n    \"\"\"\n    def __init__(self, num_pos_feats=256):\n        super().__init__()\n        self.row_embed = nn.Embedding(50, num_pos_feats)\n        self.col_embed = nn.Embedding(50, num_pos_feats)\n        self.reset_parameters()\n\n    def reset_parameters(self):\n        nn.init.uniform_(self.row_embed.weight)\n        nn.init.uniform_(self.col_embed.weight)\n\n    def forward(self, tensor_list: NestedTensor):\n        x = tensor_list.tensors\n        h, w = x.shape[-2:]\n        i = torch.arange(w, device=x.device)\n        j = torch.arange(h, device=x.device)\n        x_emb = self.col_embed(i)\n        y_emb = self.row_embed(j)\n        pos = torch.cat([\n            x_emb.unsqueeze(0).repeat(h, 1, 1),\n            y_emb.unsqueeze(1).repeat(1, w, 1),\n        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)\n        return pos\n\n\ndef build_position_encoding(args):\n    N_steps = args.hidden_dim // 2\n    if args.position_embedding in ('v2', 'sine'):\n        # TODO find a 
better way of exposing other arguments\n        position_embedding = PositionEmbeddingSineHW(\n            N_steps, \n            temperatureH=args.pe_temperatureH,\n            temperatureW=args.pe_temperatureW,\n            normalize=True\n        )\n    elif args.position_embedding in ('v3', 'learned'):\n        position_embedding = PositionEmbeddingLearned(N_steps)\n    else:\n        raise ValueError(f\"not supported {args.position_embedding}\")\n\n    return position_embedding\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/swin_transformer.py",
    "content": "\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.utils.checkpoint as checkpoint\nimport numpy as np\n\nfrom util.misc import NestedTensor\n# from timm.models.layers import DropPath, to_2tuple, trunc_normal_\nfrom src.modules.util import DropPath, to_2tuple, trunc_normal_\n\n\n\nclass Mlp(nn.Module):\n    \"\"\" Multilayer perceptron.\"\"\"\n\n    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):\n        super().__init__()\n        out_features = out_features or in_features\n        hidden_features = hidden_features or in_features\n        self.fc1 = nn.Linear(in_features, hidden_features)\n        self.act = act_layer()\n        self.fc2 = nn.Linear(hidden_features, out_features)\n        self.drop = nn.Dropout(drop)\n\n    def forward(self, x):\n        x = self.fc1(x)\n        x = self.act(x)\n        x = self.drop(x)\n        x = self.fc2(x)\n        x = self.drop(x)\n        return x\n\n\ndef window_partition(x, window_size):\n    \"\"\"\n    Args:\n        x: (B, H, W, C)\n        window_size (int): window size\n    Returns:\n        windows: (num_windows*B, window_size, window_size, C)\n    \"\"\"\n    B, H, W, C = x.shape\n    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)\n    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)\n    return windows\n\n\ndef window_reverse(windows, window_size, H, W):\n    \"\"\"\n    Args:\n        windows: (num_windows*B, window_size, window_size, C)\n        window_size (int): Window size\n        H (int): Height of image\n        W (int): Width of image\n    Returns:\n        x: (B, H, W, C)\n    \"\"\"\n    B = int(windows.shape[0] / (H * W / window_size / window_size))\n    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)\n    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)\n    return x\n\n\nclass 
WindowAttention(nn.Module):\n    \"\"\" Window based multi-head self attention (W-MSA) module with relative position bias.\n    It supports both of shifted and non-shifted window.\n    Args:\n        dim (int): Number of input channels.\n        window_size (tuple[int]): The height and width of the window.\n        num_heads (int): Number of attention heads.\n        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set\n        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0\n        proj_drop (float, optional): Dropout ratio of output. Default: 0.0\n    \"\"\"\n\n    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):\n\n        super().__init__()\n        self.dim = dim\n        self.window_size = window_size  # Wh, Ww\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = qk_scale or head_dim ** -0.5\n\n        # define a parameter table of relative position bias\n        self.relative_position_bias_table = nn.Parameter(\n            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH\n\n        # get pair-wise relative position index for each token inside the window\n        coords_h = torch.arange(self.window_size[0])\n        coords_w = torch.arange(self.window_size[1])\n        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww\n        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww\n        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww\n        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2\n        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0\n        relative_coords[:, :, 1] += self.window_size[1] - 1\n        
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1\n        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww\n        self.register_buffer(\"relative_position_index\", relative_position_index)\n\n        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)\n        self.attn_drop = nn.Dropout(attn_drop)\n        self.proj = nn.Linear(dim, dim)\n        self.proj_drop = nn.Dropout(proj_drop)\n\n        trunc_normal_(self.relative_position_bias_table, std=.02)\n        self.softmax = nn.Softmax(dim=-1)\n\n    def forward(self, x, mask=None):\n        \"\"\" Forward function.\n        Args:\n            x: input features with shape of (num_windows*B, N, C)\n            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None\n        \"\"\"\n        B_, N, C = x.shape\n        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)\n        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)\n\n        q = q * self.scale\n        attn = (q @ k.transpose(-2, -1))\n\n        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(\n            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH\n        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww\n        attn = attn + relative_position_bias.unsqueeze(0)\n\n        if mask is not None:\n            nW = mask.shape[0]\n            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)\n            attn = attn.view(-1, self.num_heads, N, N)\n            attn = self.softmax(attn)\n        else:\n            attn = self.softmax(attn)\n\n        attn = self.attn_drop(attn)\n\n        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)\n        x = self.proj(x)\n        x = self.proj_drop(x)\n        return x\n\n\nclass 
SwinTransformerBlock(nn.Module):\n    \"\"\" Swin Transformer Block.\n    Args:\n        dim (int): Number of input channels.\n        num_heads (int): Number of attention heads.\n        window_size (int): Window size.\n        shift_size (int): Shift size for SW-MSA.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n        drop (float, optional): Dropout rate. Default: 0.0\n        attn_drop (float, optional): Attention dropout rate. Default: 0.0\n        drop_path (float, optional): Stochastic depth rate. Default: 0.0\n        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU\n        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm\n    \"\"\"\n\n    def __init__(self, dim, num_heads, window_size=7, shift_size=0,\n                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,\n                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):\n        super().__init__()\n        self.dim = dim\n        self.num_heads = num_heads\n        self.window_size = window_size\n        self.shift_size = shift_size\n        self.mlp_ratio = mlp_ratio\n        assert 0 <= self.shift_size < self.window_size, \"shift_size must in 0-window_size\"\n\n        self.norm1 = norm_layer(dim)\n        self.attn = WindowAttention(\n            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,\n            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)\n\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity()\n        self.norm2 = norm_layer(dim)\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)\n\n        self.H = None\n        self.W = None\n\n    def forward(self, x, mask_matrix):\n        \"\"\" Forward function.\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n            mask_matrix: Attention mask for cyclic shift.\n        Note:\n            H, W (spatial resolution of the input feature) are not arguments;\n            they are read from self.H and self.W, which must be set before calling forward.\n        \"\"\"\n        B, L, C = x.shape\n        H, W = self.H, self.W\n        assert L == H * W, \"input feature has wrong size\"\n\n        shortcut = x\n        x = self.norm1(x)\n        x = x.view(B, H, W, C)\n\n        # pad feature maps to multiples of window size\n        pad_l = pad_t = 0\n        pad_r = (self.window_size - W % self.window_size) % self.window_size\n        pad_b = (self.window_size - H % self.window_size) % self.window_size\n        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))\n        _, Hp, Wp, _ = x.shape\n\n        # cyclic shift\n        if self.shift_size > 0:\n            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))\n            attn_mask = mask_matrix\n        else:\n            shifted_x = x\n            attn_mask = None\n\n        # partition windows\n        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C\n        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C\n\n        # W-MSA/SW-MSA\n        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C\n\n        # merge windows\n        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)\n        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C\n\n        # reverse cyclic shift\n        if self.shift_size > 0:\n            x = 
torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))\n        else:\n            x = shifted_x\n\n        if pad_r > 0 or pad_b > 0:\n            x = x[:, :H, :W, :].contiguous()\n\n        x = x.view(B, H * W, C)\n\n        # FFN\n        x = shortcut + self.drop_path(x)\n        x = x + self.drop_path(self.mlp(self.norm2(x)))\n\n        return x\n\n\nclass PatchMerging(nn.Module):\n    \"\"\" Patch Merging Layer\n    Args:\n        dim (int): Number of input channels.\n        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm\n    \"\"\"\n    def __init__(self, dim, norm_layer=nn.LayerNorm):\n        super().__init__()\n        self.dim = dim\n        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)\n        self.norm = norm_layer(4 * dim)\n\n    def forward(self, x, H, W):\n        \"\"\" Forward function.\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n            H, W: Spatial resolution of the input feature.\n        \"\"\"\n        B, L, C = x.shape\n        assert L == H * W, \"input feature has wrong size\"\n\n        x = x.view(B, H, W, C)\n\n        # padding\n        pad_input = (H % 2 == 1) or (W % 2 == 1)\n        if pad_input:\n            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))\n\n        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C\n        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C\n        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C\n        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C\n        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C\n        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C\n\n        x = self.norm(x)\n        x = self.reduction(x)\n\n        return x\n\n\nclass BasicLayer(nn.Module):\n    \"\"\" A basic Swin Transformer layer for one stage.\n    Args:\n        dim (int): Number of feature channels\n        depth (int): Depths of this stage.\n        num_heads (int): Number of attention head.\n        window_size (int): Local window size. 
Default: 7.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n        drop (float, optional): Dropout rate. Default: 0.0\n        attn_drop (float, optional): Attention dropout rate. Default: 0.0\n        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0\n        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm\n        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None\n        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 depth,\n                 num_heads,\n                 window_size=7,\n                 mlp_ratio=4.,\n                 qkv_bias=True,\n                 qk_scale=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 norm_layer=nn.LayerNorm,\n                 downsample=None,\n                 use_checkpoint=False):\n        super().__init__()\n        self.window_size = window_size\n        self.shift_size = window_size // 2\n        self.depth = depth\n        self.use_checkpoint = use_checkpoint\n\n        # build blocks\n        self.blocks = nn.ModuleList([\n            SwinTransformerBlock(\n                dim=dim,\n                num_heads=num_heads,\n                window_size=window_size,\n                shift_size=0 if (i % 2 == 0) else window_size // 2,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop,\n                attn_drop=attn_drop,\n                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,\n                
norm_layer=norm_layer)\n            for i in range(depth)])\n\n        # patch merging layer\n        if downsample is not None:\n            self.downsample = downsample(dim=dim, norm_layer=norm_layer)\n        else:\n            self.downsample = None\n\n    def forward(self, x, H, W):\n        \"\"\" Forward function.\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n            H, W: Spatial resolution of the input feature.\n        \"\"\"\n\n        # calculate attention mask for SW-MSA\n        Hp = int(np.ceil(H / self.window_size)) * self.window_size\n        Wp = int(np.ceil(W / self.window_size)) * self.window_size\n        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1\n        h_slices = (slice(0, -self.window_size),\n                    slice(-self.window_size, -self.shift_size),\n                    slice(-self.shift_size, None))\n        w_slices = (slice(0, -self.window_size),\n                    slice(-self.window_size, -self.shift_size),\n                    slice(-self.shift_size, None))\n        cnt = 0\n        for h in h_slices:\n            for w in w_slices:\n                img_mask[:, h, w, :] = cnt\n                cnt += 1\n\n        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1\n        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)\n        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)\n        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))\n\n        for blk in self.blocks:\n            blk.H, blk.W = H, W\n            if self.use_checkpoint:\n                x = checkpoint.checkpoint(blk, x, attn_mask)\n            else:\n                x = blk(x, attn_mask)\n        if self.downsample is not None:\n            x_down = self.downsample(x, H, W)\n            Wh, Ww = (H + 1) // 2, (W + 1) // 2\n            return x, H, W, x_down, Wh, Ww\n 
       else:\n            return x, H, W, x, H, W\n\n\nclass PatchEmbed(nn.Module):\n    \"\"\" Image to Patch Embedding\n    Args:\n        patch_size (int): Patch token size. Default: 4.\n        in_chans (int): Number of input image channels. Default: 3.\n        embed_dim (int): Number of linear projection output channels. Default: 96.\n        norm_layer (nn.Module, optional): Normalization layer. Default: None\n    \"\"\"\n\n    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):\n        super().__init__()\n        patch_size = to_2tuple(patch_size)\n        self.patch_size = patch_size\n\n        self.in_chans = in_chans\n        self.embed_dim = embed_dim\n\n        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)\n        if norm_layer is not None:\n            self.norm = norm_layer(embed_dim)\n        else:\n            self.norm = None\n\n    def forward(self, x):\n        \"\"\"Forward function.\"\"\"\n        # padding\n        _, _, H, W = x.size()\n        if W % self.patch_size[1] != 0:\n            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))\n        if H % self.patch_size[0] != 0:\n            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))\n\n        x = self.proj(x)  # B C Wh Ww\n        if self.norm is not None:\n            Wh, Ww = x.size(2), x.size(3)\n            x = x.flatten(2).transpose(1, 2)\n            x = self.norm(x)\n            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)\n\n        return x\n\n\nclass SwinTransformer(nn.Module):\n    \"\"\" Swin Transformer backbone.\n        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -\n          https://arxiv.org/pdf/2103.14030\n    Args:\n        pretrain_img_size (int): Input image size for training the pretrained model,\n            used in absolute postion embedding. 
Default 224.\n        patch_size (int | tuple(int)): Patch size. Default: 4.\n        in_chans (int): Number of input image channels. Default: 3.\n        embed_dim (int): Number of linear projection output channels. Default: 96.\n        depths (tuple[int]): Depths of each Swin Transformer stage.\n        num_heads (tuple[int]): Number of attention head of each stage.\n        window_size (int): Window size. Default: 7.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.\n        drop_rate (float): Dropout rate.\n        attn_drop_rate (float): Attention dropout rate. Default: 0.\n        drop_path_rate (float): Stochastic depth rate. Default: 0.2.\n        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.\n        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.\n        patch_norm (bool): If True, add normalization after patch embedding. Default: True.\n        out_indices (Sequence[int]): Output from which stages.\n        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).\n            -1 means not freezing any parameters.\n        use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False.\n        dilation (bool): if True, the output size is 16x downsampled, otherwise 32x downsampled.\n    \"\"\"\n\n    def __init__(self,\n                 pretrain_img_size=224,\n                 patch_size=4,\n                 in_chans=3,\n                 embed_dim=96,\n                 depths=[2, 2, 6, 2],\n                 num_heads=[3, 6, 12, 24],\n                 window_size=7,\n                 mlp_ratio=4.,\n                 qkv_bias=True,\n                 qk_scale=None,\n                 drop_rate=0.,\n                 attn_drop_rate=0.,\n                 drop_path_rate=0.2,\n                 norm_layer=nn.LayerNorm,\n                 ape=False,\n                 patch_norm=True,\n                 out_indices=(0, 1, 2, 3),\n                 frozen_stages=-1,\n                 dilation=False,\n                 use_checkpoint=False):\n        super().__init__()\n\n        self.pretrain_img_size = pretrain_img_size\n        self.num_layers = len(depths)\n        self.embed_dim = embed_dim\n        self.ape = ape\n        self.patch_norm = patch_norm\n        self.out_indices = out_indices\n        self.frozen_stages = frozen_stages\n        self.dilation = dilation\n\n        # if use_checkpoint:\n        #     print(\"use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!\")\n\n        # split image into non-overlapping patches\n        self.patch_embed = PatchEmbed(\n            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,\n            norm_layer=norm_layer if self.patch_norm else None)\n\n        # absolute position embedding\n        if self.ape:\n            pretrain_img_size = to_2tuple(pretrain_img_size)\n            patch_size = to_2tuple(patch_size)\n            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]\n\n            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))\n            trunc_normal_(self.absolute_pos_embed, 
std=.02)\n\n        self.pos_drop = nn.Dropout(p=drop_rate)\n\n        # stochastic depth\n        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule\n\n        # build layers\n        self.layers = nn.ModuleList()\n        # prepare downsample list\n        downsamplelist = [PatchMerging for i in range(self.num_layers)]\n        downsamplelist[-1] = None\n        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]\n        if self.dilation:\n            downsamplelist[-2] = None\n            num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2\n        for i_layer in range(self.num_layers):\n            layer = BasicLayer(\n                # dim=int(embed_dim * 2 ** i_layer),\n                dim=num_features[i_layer],\n                depth=depths[i_layer],\n                num_heads=num_heads[i_layer],\n                window_size=window_size,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop_rate,\n                attn_drop=attn_drop_rate,\n                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],\n                norm_layer=norm_layer,\n                # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,\n                downsample=downsamplelist[i_layer],\n                use_checkpoint=use_checkpoint)\n            self.layers.append(layer)\n\n        # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]\n        self.num_features = num_features\n\n        # add a norm layer for each output\n        for i_layer in out_indices:\n            layer = norm_layer(num_features[i_layer])\n            layer_name = f'norm{i_layer}'\n            self.add_module(layer_name, layer)\n\n        self._freeze_stages()\n\n    def _freeze_stages(self):\n        if self.frozen_stages >= 0:\n            self.patch_embed.eval()\n            for 
param in self.patch_embed.parameters():\n                param.requires_grad = False\n\n        if self.frozen_stages >= 1 and self.ape:\n            self.absolute_pos_embed.requires_grad = False\n\n        if self.frozen_stages >= 2:\n            self.pos_drop.eval()\n            for i in range(0, self.frozen_stages - 1):\n                m = self.layers[i]\n                m.eval()\n                for param in m.parameters():\n                    param.requires_grad = False\n\n\n\n    def forward_raw(self, x):\n        \"\"\"Forward function.\"\"\"\n        x = self.patch_embed(x)\n\n        Wh, Ww = x.size(2), x.size(3)\n        if self.ape:\n            # interpolate the position embedding to the corresponding size\n            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')\n            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C\n        else:\n            x = x.flatten(2).transpose(1, 2)\n        x = self.pos_drop(x)\n\n        outs = []\n        for i in range(self.num_layers):\n            layer = self.layers[i]\n            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)\n            # import ipdb; ipdb.set_trace()\n\n            if i in self.out_indices:\n                norm_layer = getattr(self, f'norm{i}')\n                x_out = norm_layer(x_out)\n\n                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()\n                outs.append(out)\n        # in:\n        #   torch.Size([2, 3, 1024, 1024])\n        # outs:\n        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \\\n        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]\n        return tuple(outs)\n\n\n    def forward(self, tensor_list: NestedTensor):\n        x = tensor_list.tensors\n\n        \"\"\"Forward function.\"\"\"\n        x = self.patch_embed(x)\n\n        Wh, Ww = x.size(2), x.size(3)\n        if self.ape:\n            # interpolate the 
position embedding to the corresponding size\n            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')\n            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C\n        else:\n            x = x.flatten(2).transpose(1, 2)\n        x = self.pos_drop(x)\n\n        outs = []\n        for i in range(self.num_layers):\n            layer = self.layers[i]\n            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)\n\n            if i in self.out_indices:\n                norm_layer = getattr(self, f'norm{i}')\n                x_out = norm_layer(x_out)\n\n                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()\n                outs.append(out)\n        # in:\n        #   torch.Size([2, 3, 1024, 1024])\n        # out:\n        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \\\n        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]\n\n        # collect the outputs as NestedTensors\n        outs_dict = {}\n        for idx, out_i in enumerate(outs):\n            m = tensor_list.mask\n            assert m is not None\n            mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]\n            outs_dict[idx] = NestedTensor(out_i, mask)\n\n        return outs_dict\n\n\n    def train(self, mode=True):\n        \"\"\"Convert the model into training mode while keeping the frozen stages frozen.\"\"\"\n        super(SwinTransformer, self).train(mode)\n        self._freeze_stages()\n\n\n\ndef build_swin_transformer(modelname, pretrain_img_size, **kw):\n    assert modelname in ['swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']\n\n    model_para_dict = {\n        'swin_T_224_1k': dict(\n            embed_dim=96,\n            depths=[ 2, 2, 6, 2 ],\n            num_heads=[ 3, 6, 12, 24],\n            window_size=7\n        ),\n        'swin_B_224_22k': dict(\n            embed_dim=128,\n            
depths=[ 2, 2, 18, 2 ],\n            num_heads=[ 4, 8, 16, 32 ],\n            window_size=7\n        ),\n        'swin_B_384_22k': dict(\n            embed_dim=128,\n            depths=[ 2, 2, 18, 2 ],\n            num_heads=[ 4, 8, 16, 32 ],\n            window_size=12\n        ),\n        'swin_L_224_22k': dict(\n            embed_dim=192,\n            depths=[ 2, 2, 18, 2 ],\n            num_heads=[ 6, 12, 24, 48 ],\n            window_size=7\n        ),\n        'swin_L_384_22k': dict(\n            embed_dim=192,\n            depths=[ 2, 2, 18, 2 ],\n            num_heads=[ 6, 12, 24, 48 ],\n            window_size=12\n        ),\n    }\n    kw_cgf = model_para_dict[modelname]\n    kw_cgf.update(kw)\n    model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf)\n    return model\n\nif __name__ == \"__main__\":\n    model = build_swin_transformer('swin_L_384_22k', 384, dilation=True)\n    x = torch.rand(2, 3, 1024, 1024)\n    y = model.forward_raw(x)\n    import ipdb; ipdb.set_trace()\n    x = torch.rand(2, 3, 384, 384)\n    y = model.forward_raw(x)\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/transformer_deformable.py",
    "content": "# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Deformable DETR\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\nimport copy\nimport math\nimport torch\nfrom torch import nn, Tensor\nfrom torch.nn.init import xavier_uniform_, constant_, normal_\nfrom typing import Optional\n\nfrom util.misc import inverse_sigmoid\nfrom .ops.modules import MSDeformAttn\nfrom .utils import MLP, _get_activation_fn, gen_sineembed_for_position\n\nclass DeformableTransformer(nn.Module):\n    def __init__(self, d_model=256, nhead=8,\n                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,\n                 activation=\"relu\", return_intermediate_dec=False,\n                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4,\n                 two_stage=False, two_stage_num_proposals=300,\n                 use_dab=False, high_dim_query_update=False, no_sine_embed=False):\n        super().__init__()\n\n        self.d_model = d_model\n        self.nhead = nhead\n        self.two_stage = two_stage\n        self.two_stage_num_proposals = two_stage_num_proposals\n        self.use_dab = use_dab\n\n        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,\n                                                          dropout, activation,\n                                                          
num_feature_levels, nhead, enc_n_points)\n        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)\n\n        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,\n                                                          dropout, activation,\n                                                          num_feature_levels, nhead, dec_n_points)\n        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec,\n                                                            use_dab=use_dab, d_model=d_model, high_dim_query_update=high_dim_query_update, no_sine_embed=no_sine_embed)\n\n        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))\n\n        if two_stage:\n            self.enc_output = nn.Linear(d_model, d_model)\n            self.enc_output_norm = nn.LayerNorm(d_model)\n            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)\n            self.pos_trans_norm = nn.LayerNorm(d_model * 2)\n        else:\n            if not self.use_dab:\n                self.reference_points = nn.Linear(d_model, 2)\n\n        self.high_dim_query_update = high_dim_query_update\n        if high_dim_query_update:\n            assert not self.use_dab, \"use_dab must be True\"\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                nn.init.xavier_uniform_(p)\n        for m in self.modules():\n            if isinstance(m, MSDeformAttn):\n                m._reset_parameters()\n        if not self.two_stage and not self.use_dab:\n            xavier_uniform_(self.reference_points.weight.data, gain=1.0)\n            constant_(self.reference_points.bias.data, 0.)\n        normal_(self.level_embed)\n\n    def get_proposal_pos_embed(self, proposals):\n        num_pos_feats = 128\n        temperature = 10000\n        scale = 2 * math.pi\n\n        dim_t = 
torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)\n        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)\n        # N, L, 4\n        proposals = proposals.sigmoid() * scale\n        # N, L, 4, 128\n        pos = proposals[:, :, :, None] / dim_t\n        # N, L, 4, 64, 2\n        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)\n        return pos\n\n    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):\n        N_, S_, C_ = memory.shape\n        base_scale = 4.0\n        proposals = []\n        _cur = 0\n        for lvl, (H_, W_) in enumerate(spatial_shapes):\n            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)\n            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)\n            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)\n\n            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),\n                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))\n            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)\n\n            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)\n            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale\n            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)\n            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)\n            proposals.append(proposal)\n            _cur += (H_ * W_)\n        output_proposals = torch.cat(proposals, 1)\n        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)\n        output_proposals = torch.log(output_proposals / (1 - output_proposals))\n        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))\n        output_proposals = 
output_proposals.masked_fill(~output_proposals_valid, float('inf'))\n\n        output_memory = memory\n        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))\n        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))\n        output_memory = self.enc_output_norm(self.enc_output(output_memory))\n        return output_memory, output_proposals\n\n    def get_valid_ratio(self, mask):\n        _, H, W = mask.shape\n        valid_H = torch.sum(~mask[:, :, 0], 1)\n        valid_W = torch.sum(~mask[:, 0, :], 1)\n        valid_ratio_h = valid_H.float() / H\n        valid_ratio_w = valid_W.float() / W\n        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)\n        return valid_ratio\n\n    def forward(self, srcs, masks, pos_embeds, query_embed=None):\n        \"\"\"\n        Input:\n            - srcs: List([bs, c, h, w])\n            - masks: List([bs, h, w])\n        \"\"\"\n        assert self.two_stage or query_embed is not None\n\n        # prepare input for encoder\n        src_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):\n            bs, c, h, w = src.shape\n            spatial_shape = (h, w)\n            spatial_shapes.append(spatial_shape)\n\n            src = src.flatten(2).transpose(1, 2)                # bs, hw, c\n            mask = mask.flatten(1)                              # bs, hw\n            pos_embed = pos_embed.flatten(2).transpose(1, 2)    # bs, hw, c\n            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            src_flatten.append(src)\n            mask_flatten.append(mask)\n        src_flatten = torch.cat(src_flatten, 1)     # bs, \\sum{hxw}, c\n        mask_flatten = torch.cat(mask_flatten, 1)   # bs, \\sum{hxw}\n        lvl_pos_embed_flatten 
= torch.cat(lvl_pos_embed_flatten, 1)\n        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)\n        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))\n        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)\n\n        # encoder\n        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)\n        # import ipdb; ipdb.set_trace()\n\n        # prepare input for decoder\n        bs, _, c = memory.shape\n        if self.two_stage:\n            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)\n\n            # hack implementation for two-stage Deformable DETR\n            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)\n            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals\n\n            topk = self.two_stage_num_proposals\n            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]\n            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))\n            topk_coords_unact = topk_coords_unact.detach()\n            reference_points = topk_coords_unact.sigmoid()\n            init_reference_out = reference_points\n            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))\n            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)\n        elif self.use_dab:\n            reference_points = query_embed[..., self.d_model:].sigmoid()\n            tgt = query_embed[..., :self.d_model]\n            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)\n            init_reference_out = reference_points\n        else:\n            query_embed, tgt = torch.split(query_embed, c, dim=1)\n            
query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)\n            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)\n            reference_points = self.reference_points(query_embed).sigmoid()\n                # bs, num_queries, 2\n            init_reference_out = reference_points\n\n        # decoder\n        # import ipdb; ipdb.set_trace()\n        hs, inter_references = self.decoder(tgt, reference_points, memory,\n                                            spatial_shapes, level_start_index, valid_ratios,\n                                            query_pos=query_embed if not self.use_dab else None,\n                                            src_padding_mask=mask_flatten)\n\n        inter_references_out = inter_references\n        if self.two_stage:\n            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact\n        return hs, init_reference_out, inter_references_out, None, None\n\n\nclass DeformableTransformerEncoderLayer(nn.Module):\n    def __init__(self,\n                 d_model=256, d_ffn=1024,\n                 dropout=0.1, activation=\"relu\",\n                 n_levels=4, n_heads=8, n_points=4,\n                 add_channel_attention=False,\n                 use_deformable_box_attn=False,\n                 box_attn_type='roi_align',\n                 ):\n        super().__init__()\n\n        # self attention\n        if use_deformable_box_attn:\n            self.self_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)\n        else:\n            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(d_model)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, d_ffn)\n        self.activation = _get_activation_fn(activation, d_model=d_ffn)\n        self.dropout2 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(d_ffn, d_model)\n        self.dropout3 = 
nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(d_model)\n\n        # channel attention\n        self.add_channel_attention = add_channel_attention\n        if add_channel_attention:\n            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)\n            self.norm_channel = nn.LayerNorm(d_model)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, src):\n        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))\n        src = src + self.dropout3(src2)\n        src = self.norm2(src)\n        return src\n\n    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):\n        # self attention\n        # import ipdb; ipdb.set_trace()\n        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, key_padding_mask)\n        src = src + self.dropout1(src2)\n        src = self.norm1(src)\n\n        # ffn\n        src = self.forward_ffn(src)\n\n        # channel attn\n        if self.add_channel_attention:\n            src = self.norm_channel(src + self.activ_channel(src))\n\n        return src\n\n\nclass DeformableTransformerEncoder(nn.Module):\n    def __init__(self, encoder_layer, num_layers, norm=None):\n        super().__init__()\n        if num_layers > 0:\n            self.layers = _get_clones(encoder_layer, num_layers)\n        else:\n            self.layers = []\n            del encoder_layer\n        self.num_layers = num_layers\n        self.norm = norm\n\n    @staticmethod\n    def get_reference_points(spatial_shapes, valid_ratios, device):\n        reference_points_list = []\n        for lvl, (H_, W_) in enumerate(spatial_shapes):\n\n            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),\n                                          torch.linspace(0.5, W_ - 0.5, W_, 
dtype=torch.float32, device=device))\n            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)\n            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)\n            ref = torch.stack((ref_x, ref_y), -1)\n            reference_points_list.append(ref)\n        reference_points = torch.cat(reference_points_list, 1)\n        reference_points = reference_points[:, :, None] * valid_ratios[:, None]\n        return reference_points\n\n    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):\n        \"\"\"\n        Input:\n            - src: [bs, sum(hi*wi), 256]\n            - spatial_shapes: h,w of each level [num_level, 2]\n            - level_start_index: [num_level] start point of level in sum(hi*wi).\n            - valid_ratios: [bs, num_level, 2]\n            - pos: pos embed for src. [bs, sum(hi*wi), 256]\n            - padding_mask: [bs, sum(hi*wi)]\n        Intermediate:\n            - reference_points: [bs, sum(hi*wi), num_level, 2]\n        \"\"\"\n        output = src\n        # bs, sum(hi*wi), 256\n        # import ipdb; ipdb.set_trace()\n        if self.num_layers > 0:\n            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)\n        for _, layer in enumerate(self.layers):\n            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n        return output\n\n\nclass DeformableTransformerDecoderLayer(nn.Module):\n    def __init__(self, d_model=256, d_ffn=1024,\n                 dropout=0.1, activation=\"relu\",\n                 n_levels=4, n_heads=8, n_points=4,\n                 use_deformable_box_attn=False,\n                 box_attn_type='roi_align',\n                 key_aware_type=None,\n                 decoder_sa_type='ca',\n                 module_seq=['sa', 'ca', 'ffn'],\n      
           ):\n        super().__init__()\n        self.module_seq = module_seq\n        assert sorted(module_seq) == ['ca', 'ffn', 'sa']\n\n        # cross attention\n        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n        if use_deformable_box_attn:\n            self.cross_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)\n        else:\n            self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(d_model)\n\n        # self attention\n        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(d_model)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, d_ffn)\n        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(d_ffn, d_model)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(d_model)\n\n        self.key_aware_type = key_aware_type\n        self.key_aware_proj = None\n        self.decoder_sa_type = decoder_sa_type\n        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']\n\n        if decoder_sa_type == 'ca_content':\n            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)\n\n\n\n\n    def rm_self_attn_modules(self):\n        self.self_attn = None\n        self.dropout2 = None\n        self.norm2 = None\n\n\n    @staticmethod\n    def with_pos_embed(tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n        return tgt\n\n    def forward_sa(self,\n                # for tgt\n                tgt: 
Optional[Tensor],  # nq, bs, d_model\n                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))\n                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)\n                tgt_key_padding_mask: Optional[Tensor] = None,\n                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4\n\n                # for memory\n                memory: Optional[Tensor] = None, # hw, bs, d_model\n                memory_key_padding_mask: Optional[Tensor] = None,\n                memory_level_start_index: Optional[Tensor] = None, # num_levels\n                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2\n                memory_pos: Optional[Tensor] = None, # pos for memory\n\n                # sa\n                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention\n                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention\n            ):\n        # self attention\n        if self.self_attn is not None:\n            # import ipdb; ipdb.set_trace()\n            if self.decoder_sa_type == 'sa':\n                q = k = self.with_pos_embed(tgt, tgt_query_pos)\n                tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]\n                tgt = tgt + self.dropout2(tgt2)\n                tgt = self.norm2(tgt)\n            elif self.decoder_sa_type == 'ca_label':\n                # import ipdb; ipdb.set_trace()\n                # q = self.with_pos_embed(tgt, tgt_query_pos)\n                bs = tgt.shape[1]\n                k = v = self.label_embedding.weight[:, None, :].repeat(1, bs, 1)\n                tgt2 = self.self_attn(tgt, k, v, attn_mask=self_attn_mask)[0]\n                tgt = tgt + self.dropout2(tgt2)\n                tgt = self.norm2(tgt)\n            elif self.decoder_sa_type == 'ca_content':\n                tgt2 = self.self_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),\n                           
 tgt_reference_points.transpose(0, 1).contiguous(),\n                            memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)\n                tgt = tgt + self.dropout2(tgt2)\n                tgt = self.norm2(tgt)\n            else:\n                raise NotImplementedError(\"Unknown decoder_sa_type {}\".format(self.decoder_sa_type))\n\n        return tgt\n\n    def forward_ca(self,\n                # for tgt\n                tgt: Optional[Tensor],  # nq, bs, d_model\n                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))\n                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)\n                tgt_key_padding_mask: Optional[Tensor] = None,\n                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4\n\n                # for memory\n                memory: Optional[Tensor] = None, # hw, bs, d_model\n                memory_key_padding_mask: Optional[Tensor] = None,\n                memory_level_start_index: Optional[Tensor] = None, # num_levels\n                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2\n                memory_pos: Optional[Tensor] = None, # pos for memory\n\n                # sa\n                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention\n                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention\n            ):\n        # cross attention\n        # import ipdb; ipdb.set_trace()\n        if self.key_aware_type is not None:\n\n            if self.key_aware_type == 'mean':\n                tgt = tgt + memory.mean(0, keepdim=True)\n            elif self.key_aware_type == 'proj_mean':\n                tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)\n            else:\n                raise NotImplementedError(\"Unknown key_aware_type: {}\".format(self.key_aware_type))\n        tgt2 = 
self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),\n                               tgt_reference_points.transpose(0, 1).contiguous(),\n                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        return tgt\n\n    def forward(self,\n                # for tgt\n                tgt: Optional[Tensor],  # nq, bs, d_model\n                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))\n                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)\n                tgt_key_padding_mask: Optional[Tensor] = None,\n                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4\n\n                # for memory\n                memory: Optional[Tensor] = None, # hw, bs, d_model\n                memory_key_padding_mask: Optional[Tensor] = None,\n                memory_level_start_index: Optional[Tensor] = None, # num_levels\n                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2\n                memory_pos: Optional[Tensor] = None, # pos for memory\n\n                # sa\n                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention\n                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention\n            ):\n\n        for funcname in self.module_seq:\n            # if os.environ.get('IPDB_DEBUG_SHILONG') == 'INFO':\n            #     import ipdb; ipdb.set_trace()\n            if funcname == 'ffn':\n                tgt = self.forward_ffn(tgt)\n            elif funcname == 'ca':\n                tgt = self.forward_ca(tgt, tgt_query_pos, tgt_query_sine_embed, \\\n                    tgt_key_padding_mask, tgt_reference_points, \\\n                        memory, memory_key_padding_mask, memory_level_start_index, \\\n                            
memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)\n            elif funcname == 'sa':\n                tgt = self.forward_sa(tgt, tgt_query_pos, tgt_query_sine_embed, \\\n                    tgt_key_padding_mask, tgt_reference_points, \\\n                        memory, memory_key_padding_mask, memory_level_start_index, \\\n                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)\n            else:\n                raise ValueError('unknown funcname {}'.format(funcname))\n\n        return tgt\n\n\n\nclass DeformableTransformerDecoder(nn.Module):\n    def __init__(self, decoder_layer, num_layers, return_intermediate=False, use_dab=False, d_model=256, query_dim=4):\n        super().__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.return_intermediate = return_intermediate\n        assert return_intermediate\n        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR\n        self.bbox_embed = None\n        self.class_embed = None\n        self.use_dab = use_dab\n        self.d_model = d_model\n        self.query_dim = query_dim\n        if use_dab:\n            self.query_scale = MLP(d_model, d_model, d_model, 2)\n            self.ref_point_head = MLP(2 * d_model, d_model, d_model, 2)\n\n\n    def forward(self, tgt, reference_points, src, src_spatial_shapes,\n                src_level_start_index, src_valid_ratios,\n                query_pos=None, src_padding_mask=None):\n        output = tgt\n        if self.use_dab:\n            assert query_pos is None\n\n        intermediate = []\n        intermediate_reference_points = [reference_points]\n        for layer_id, layer in enumerate(self.layers):\n            # import ipdb; ipdb.set_trace()\n            if reference_points.shape[-1] == 4:\n                reference_points_input = reference_points[:, :, None] \\\n                                     
    * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None] # bs, nq, 4, 4\n            else:\n                assert reference_points.shape[-1] == 2\n                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]\n\n            if self.use_dab:\n                # import ipdb; ipdb.set_trace()\n                query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # bs, nq, 256*2\n                raw_query_pos = self.ref_point_head(query_sine_embed) # bs, nq, 256\n                pos_scale = self.query_scale(output) if layer_id != 0 else 1\n                query_pos = pos_scale * raw_query_pos\n\n            output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)\n\n            # hack implementation for iterative bounding box refinement\n            if self.bbox_embed is not None:\n                box_holder = self.bbox_embed(output)\n                box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points)\n                new_reference_points = box_holder[..., :self.query_dim].sigmoid()\n                reference_points = new_reference_points.detach()\n                if layer_id != self.num_layers - 1:\n                    intermediate_reference_points.append(new_reference_points)\n\n            intermediate.append(output)\n\n        return torch.stack(intermediate), torch.stack(intermediate_reference_points)\n\n\ndef _get_clones(module, N):\n    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])\n\n\ndef build_deforamble_transformer(args):\n    return DeformableTransformer(\n        d_model=args.hidden_dim,\n        nhead=args.nheads,\n        num_encoder_layers=args.enc_layers,\n        num_decoder_layers=args.dec_layers,\n        dim_feedforward=args.dim_feedforward,\n        dropout=args.dropout,\n        activation=\"relu\",\n        return_intermediate_dec=True,\n        
num_feature_levels=args.ddetr_num_feature_levels,\n        dec_n_points=args.ddetr_dec_n_points,\n        enc_n_points=args.ddetr_enc_n_points,\n        two_stage=args.ddetr_two_stage,\n        two_stage_num_proposals=args.num_queries,\n        use_dab=args.ddetr_use_dab,\n        high_dim_query_update=args.ddetr_high_dim_query_update,\n        no_sine_embed=args.ddetr_no_sine_embed)\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py",
    "content": "# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\nDETR Transformer class.\n\nCopy-paste from torch.nn.Transformer with modifications:\n    * positional encodings are passed in MHattention\n    * extra LN at the end of encoder is removed\n    * decoder returns a stack of activations from all decoding layers\n\"\"\"\nimport torch\nfrom torch import Tensor, nn\nfrom typing import List, Optional\n\nfrom .utils import  _get_activation_fn, _get_clones\n\n\nclass TextTransformer(nn.Module):\n    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):\n        super().__init__()\n        self.num_layers = num_layers\n        self.d_model = d_model\n        self.nheads = nheads\n        self.dim_feedforward = dim_feedforward\n        self.norm = None\n\n        single_encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout)\n        self.layers = _get_clones(single_encoder_layer, num_layers)\n\n\n    def forward(self, memory_text:torch.Tensor, text_attention_mask:torch.Tensor):\n        \"\"\"        \n\n        Args:\n            text_attention_mask: bs, num_token\n            memory_text: bs, num_token, d_model\n\n        Raises:\n            RuntimeError: _description_\n\n        Returns:\n            output: bs, num_token, d_model\n        \"\"\"\n\n        output = memory_text.transpose(0, 1)\n\n        for layer in self.layers:\n            output = layer(output, src_key_padding_mask=text_attention_mask)\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n        return output.transpose(0, 1)\n\n\n\n\nclass TransformerEncoderLayer(nn.Module):\n    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=\"relu\", normalize_before=False):\n        super().__init__()\n 
       self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout)\n        self.dropout2 = nn.Dropout(dropout)\n\n        self.activation = _get_activation_fn(activation)\n        self.normalize_before = normalize_before\n        self.nhead = nhead\n\n    def with_pos_embed(self, tensor, pos: Optional[Tensor]):\n        return tensor if pos is None else tensor + pos\n\n    def forward(\n        self,\n        src,\n        src_mask: Optional[Tensor] = None,\n        src_key_padding_mask: Optional[Tensor] = None,\n        pos: Optional[Tensor] = None,\n    ):\n        # repeat attn mask\n        if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:\n            # bs, num_q, num_k\n            src_mask = src_mask.repeat(self.nhead, 1, 1)\n\n        q = k = self.with_pos_embed(src, pos)\n\n        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]\n\n        # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]\n        src = src + self.dropout1(src2)\n        src = self.norm1(src)\n        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = src + self.dropout2(src2)\n        src = self.norm2(src)\n        return src\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/unipose.py",
    "content": "# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# ------------------------------------------------------------------------\nimport os\nimport copy\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\nfrom typing import List\n\nfrom util.keypoint_ops import keypoint_xyzxyz_to_xyxyzz\nfrom util.misc import NestedTensor, nested_tensor_from_tensor_list,inverse_sigmoid\n\nfrom .utils import MLP\nfrom .backbone import build_backbone\nfrom ..registry import MODULE_BUILD_FUNCS\nfrom .mask_generate import prepare_for_mask, post_process\nfrom .deformable_transformer import build_deformable_transformer\n\n\nclass UniPose(nn.Module):\n    \"\"\" This is the Cross-Attention Detector module that performs object detection \"\"\"\n\n    def __init__(self, backbone, transformer, num_classes, num_queries,\n                 aux_loss=False, iter_update=False,\n                 query_dim=2,\n                 random_refpoints_xy=False,\n                 fix_refpoints_hw=-1,\n                 num_feature_levels=1,\n                 nheads=8,\n                 # two stage\n                 two_stage_type='no',  # ['no', 'standard']\n                 two_stage_add_query_num=0,\n                 dec_pred_class_embed_share=True,\n                 dec_pred_bbox_embed_share=True,\n                 two_stage_class_embed_share=True,\n                 two_stage_bbox_embed_share=True,\n                 decoder_sa_type='sa',\n                 num_patterns=0,\n                 dn_number=100,\n                 dn_box_noise_scale=0.4,\n                 dn_label_noise_ratio=0.5,\n                
 dn_labelbook_size=100,\n                 use_label_enc=True,\n\n                 text_encoder_type='bert-base-uncased',\n\n                 binary_query_selection=False,\n                 use_cdn=True,\n                 sub_sentence_present=True,\n                 num_body_points=68,\n                 num_box_decoder_layers=2,\n                 ):\n        \"\"\" Initializes the model.\n        Parameters:\n            backbone: torch module of the backbone to be used. See backbone.py\n            transformer: torch module of the transformer architecture. See transformer.py\n            num_classes: number of object classes\n            num_queries: number of object queries, i.e. detection slots. This is the maximal number of objects\n                         Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.\n            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.\n\n            fix_refpoints_hw: -1(default): learn w and h for each box separately\n                                >0 : given fixed number\n                                -2 : learn a shared w and h\n        \"\"\"\n        super().__init__()\n        self.num_queries = num_queries\n        self.transformer = transformer\n        self.num_classes = num_classes\n        self.hidden_dim = hidden_dim = transformer.d_model\n        self.num_feature_levels = num_feature_levels\n        self.nheads = nheads\n        self.use_label_enc = use_label_enc\n        if use_label_enc:\n            self.label_enc = nn.Embedding(dn_labelbook_size + 1, hidden_dim)\n        else:\n            raise NotImplementedError\n            self.label_enc = None\n        self.max_text_len = 256\n        self.binary_query_selection = binary_query_selection\n        self.sub_sentence_present = sub_sentence_present\n\n        # setting query dim\n        self.query_dim = query_dim\n        assert query_dim == 4\n        self.random_refpoints_xy = 
random_refpoints_xy\n        self.fix_refpoints_hw = fix_refpoints_hw\n\n        # for dn training\n        self.num_patterns = num_patterns\n        self.dn_number = dn_number\n        self.dn_box_noise_scale = dn_box_noise_scale\n        self.dn_label_noise_ratio = dn_label_noise_ratio\n        self.dn_labelbook_size = dn_labelbook_size\n        self.use_cdn = use_cdn\n\n\n        self.projection = MLP(512, hidden_dim, hidden_dim, 3)\n\n        self.projection_kpt = MLP(512, hidden_dim, hidden_dim, 3)\n\n\n        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        # model, _ = clip.load(\"ViT-B/32\", device=device)\n        # self.clip_model = model\n        # visual_parameters = list(self.clip_model.visual.parameters())\n        # #\n        # for param in visual_parameters:\n        #     param.requires_grad = False\n\n        self.pos_proj = nn.Linear(hidden_dim, 768)\n        self.padding = nn.Embedding(1, 768)\n\n        # prepare input projection layers\n        if num_feature_levels > 1:\n            num_backbone_outs = len(backbone.num_channels)\n            input_proj_list = []\n            for _ in range(num_backbone_outs):\n                in_channels = backbone.num_channels[_]\n                input_proj_list.append(nn.Sequential(\n                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),\n                    nn.GroupNorm(32, hidden_dim),\n                ))\n            for _ in range(num_feature_levels - num_backbone_outs):\n                input_proj_list.append(nn.Sequential(\n                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),\n                    nn.GroupNorm(32, hidden_dim),\n                ))\n                in_channels = hidden_dim\n            self.input_proj = nn.ModuleList(input_proj_list)\n        else:\n            assert two_stage_type == 'no', \"two_stage_type should be no if num_feature_levels=1 !!!\"\n            self.input_proj = nn.ModuleList([\n               
 nn.Sequential(\n                    nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),\n                    nn.GroupNorm(32, hidden_dim),\n                )])\n\n        self.backbone = backbone\n        self.aux_loss = aux_loss\n        self.box_pred_damping = box_pred_damping = None\n\n        self.iter_update = iter_update\n        assert iter_update, \"Why not iter_update?\"\n\n        # prepare pred layers\n        self.dec_pred_class_embed_share = dec_pred_class_embed_share\n        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share\n        # prepare class & box embed\n        _class_embed = ContrastiveAssign()\n\n\n\n        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)\n        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)\n        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)\n\n        _pose_embed = MLP(hidden_dim, hidden_dim, 2, 3)\n        _pose_hw_embed = MLP(hidden_dim, hidden_dim, 2, 3)\n        nn.init.constant_(_pose_embed.layers[-1].weight.data, 0)\n        nn.init.constant_(_pose_embed.layers[-1].bias.data, 0)\n\n        if dec_pred_bbox_embed_share:\n            box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)]\n        else:\n            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers)]\n        if dec_pred_class_embed_share:\n            class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)]\n        else:\n            class_embed_layerlist = [copy.deepcopy(_class_embed) for i in range(transformer.num_decoder_layers)]\n\n\n        if dec_pred_bbox_embed_share:\n\n            pose_embed_layerlist = [_pose_embed for i in\n                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]\n        else:\n            pose_embed_layerlist = [copy.deepcopy(_pose_embed) for i in\n                                    range(transformer.num_decoder_layers - 
num_box_decoder_layers + 1)]\n\n        pose_hw_embed_layerlist = [_pose_hw_embed for i in\n                                   range(transformer.num_decoder_layers - num_box_decoder_layers)]\n\n\n        self.num_box_decoder_layers = num_box_decoder_layers\n        self.bbox_embed = nn.ModuleList(box_embed_layerlist)\n        self.class_embed = nn.ModuleList(class_embed_layerlist)\n        self.num_body_points = num_body_points\n        self.pose_embed = nn.ModuleList(pose_embed_layerlist)\n        self.pose_hw_embed = nn.ModuleList(pose_hw_embed_layerlist)\n\n        self.transformer.decoder.bbox_embed = self.bbox_embed\n        self.transformer.decoder.class_embed = self.class_embed\n\n        self.transformer.decoder.pose_embed = self.pose_embed\n        self.transformer.decoder.pose_hw_embed = self.pose_hw_embed\n\n        self.transformer.decoder.num_body_points = num_body_points\n\n\n        # two stage\n        self.two_stage_type = two_stage_type\n        self.two_stage_add_query_num = two_stage_add_query_num\n        assert two_stage_type in ['no', 'standard'], \"unknown param {} of two_stage_type\".format(two_stage_type)\n        if two_stage_type != 'no':\n            if two_stage_bbox_embed_share:\n                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share\n                self.transformer.enc_out_bbox_embed = _bbox_embed\n            else:\n                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)\n\n            if two_stage_class_embed_share:\n                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share\n                self.transformer.enc_out_class_embed = _class_embed\n            else:\n                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)\n\n            self.refpoint_embed = None\n            if self.two_stage_add_query_num > 0:\n                self.init_ref_points(two_stage_add_query_num)\n\n        self.decoder_sa_type = decoder_sa_type\n        assert 
decoder_sa_type in ['sa', 'ca_label', 'ca_content']\n        # self.replace_sa_with_double_ca = replace_sa_with_double_ca\n        if decoder_sa_type == 'ca_label':\n            self.label_embedding = nn.Embedding(num_classes, hidden_dim)\n            for layer in self.transformer.decoder.layers:\n                layer.label_embedding = self.label_embedding\n        else:\n            for layer in self.transformer.decoder.layers:\n                layer.label_embedding = None\n            self.label_embedding = None\n\n        self._reset_parameters()\n\n    def open_set_transfer_init(self):\n        for name, param in self.named_parameters():\n            if 'fusion_layers' in name:\n                continue\n            if 'ca_text' in name:\n                continue\n            if 'catext_norm' in name:\n                continue\n            if 'catext_dropout' in name:\n                continue\n            if \"text_layers\" in name:\n                continue\n            if 'bert' in name:\n                continue\n            if 'bbox_embed' in name:\n                continue\n            if 'label_enc.weight' in name:\n                continue\n            if 'feat_map' in name:\n                continue\n            if 'enc_output' in name:\n                continue\n\n            param.requires_grad_(False)\n\n        # import ipdb; ipdb.set_trace()\n\n    def _reset_parameters(self):\n        # init input_proj\n        for proj in self.input_proj:\n            nn.init.xavier_uniform_(proj[0].weight, gain=1)\n            nn.init.constant_(proj[0].bias, 0)\n\n    def init_ref_points(self, use_num_queries):\n        self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)\n\n        if self.random_refpoints_xy:\n            # import ipdb; ipdb.set_trace()\n            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)\n            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])\n            
self.refpoint_embed.weight.data[:, :2].requires_grad = False\n\n        if self.fix_refpoints_hw > 0:\n            print(\"fix_refpoints_hw: {}\".format(self.fix_refpoints_hw))\n            assert self.random_refpoints_xy\n            self.refpoint_embed.weight.data[:, 2:] = self.fix_refpoints_hw\n            self.refpoint_embed.weight.data[:, 2:] = inverse_sigmoid(self.refpoint_embed.weight.data[:, 2:])\n            self.refpoint_embed.weight.data[:, 2:].requires_grad = False\n        elif int(self.fix_refpoints_hw) == -1:\n            pass\n        elif int(self.fix_refpoints_hw) == -2:\n            print('learn a shared h and w')\n            assert self.random_refpoints_xy\n            self.refpoint_embed = nn.Embedding(use_num_queries, 2)\n            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)\n            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])\n            self.refpoint_embed.weight.data[:, :2].requires_grad = False\n            self.hw_embed = nn.Embedding(1, 1)\n        else:\n            raise NotImplementedError('Unknown fix_refpoints_hw {}'.format(self.fix_refpoints_hw))\n\n    def forward(self, samples: NestedTensor, targets: List = None, **kw):\n        \"\"\" The forward expects a NestedTensor, which consists of:\n               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]\n               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels\n\n            It returns a dict with the following elements:\n               - \"pred_logits\": the classification logits (including no-object) for all queries.\n                                Shape= [batch_size x num_queries x num_classes]\n               - \"pred_boxes\": The normalized boxes coordinates for all queries, represented as\n                               (center_x, center_y, width, height). 
These values are normalized in [0, 1],\n                               relative to the size of each individual image (disregarding possible padding).\n                               See PostProcess for information on how to retrieve the unnormalized bounding box.\n               - \"aux_outputs\": Optional, only returned when auxilary losses are activated. It is a list of\n                                dictionnaries containing the two above keys for each decoder layer.\n        \"\"\"\n\n        captions = [t['instance_text_prompt'] for t in targets]\n        bs=len(captions)\n        tensor_list = [tgt[\"object_embeddings_text\"] for tgt in targets]\n        max_size = 350\n        padded_tensors = [torch.cat([tensor, torch.zeros(max_size - tensor.size(0), tensor.size(1),device=tensor.device)]) if tensor.size(0) < max_size else tensor for tensor in tensor_list]\n        object_embeddings_text = torch.stack(padded_tensors)\n\n        kpts_embeddings_text = torch.stack([tgt[\"kpts_embeddings_text\"] for tgt in targets])[:, :self.num_body_points]\n        encoded_text=self.projection(object_embeddings_text) # bs, 81, 101, 256\n        kpt_embeddings_specific=self.projection_kpt(kpts_embeddings_text) # bs, 81, 101, 256\n\n\n        kpt_vis = torch.stack([tgt[\"kpt_vis_text\"] for tgt in targets])[:, :self.num_body_points]\n        kpt_mask = torch.cat((torch.ones_like(kpt_vis, device=kpt_vis.device)[..., 0].unsqueeze(-1), kpt_vis), dim=-1)\n\n\n        num_classes = encoded_text.shape[1] # bs, 81, 101, 256\n        text_self_attention_masks = torch.eye(num_classes).unsqueeze(0).expand(bs, -1, -1).bool().to(samples.device)\n        text_token_mask = torch.zeros(samples.shape[0],num_classes).to(samples.device)>0\n        for i in range(bs):\n            text_token_mask[i,:len(captions[i])]=True\n\n        position_ids = torch.zeros(samples.shape[0], num_classes).to(samples.device)\n\n        for i in range(bs):\n            position_ids[i,:len(captions[i])]= 1\n\n\n   
     text_dict = {\n            'encoded_text': encoded_text, # bs, 195, d_model\n            'text_token_mask': text_token_mask, # bs, 195\n            'position_ids': position_ids, # bs, 195\n            'text_self_attention_masks': text_self_attention_masks # bs, 195,195\n        }\n\n\n        # import ipdb; ipdb.set_trace()\n\n        if isinstance(samples, (list, torch.Tensor)):\n            samples = nested_tensor_from_tensor_list(samples)\n        features, poss = self.backbone(samples)\n        if os.environ.get(\"SHILONG_AMP_INFNAN_DEBUG\") == '1':\n            import ipdb;\n            ipdb.set_trace()\n\n\n        srcs = []\n        masks = []\n        for l, feat in enumerate(features):\n            src, mask = feat.decompose()\n            srcs.append(self.input_proj[l](src))\n            masks.append(mask)\n            assert mask is not None\n\n        if self.num_feature_levels > len(srcs):\n            _len_srcs = len(srcs)\n            for l in range(_len_srcs, self.num_feature_levels):\n                if l == _len_srcs:\n                    src = self.input_proj[l](features[-1].tensors)\n                else:\n                    src = self.input_proj[l](srcs[-1])\n                m = samples.mask\n                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]\n                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)\n                srcs.append(src)\n                masks.append(mask)\n                poss.append(pos_l)\n\n        if self.label_enc is not None:\n            label_enc = self.label_enc\n        else:\n            raise NotImplementedError\n            label_enc = encoded_text\n        if self.dn_number > 0 or targets is not None:\n            input_query_label, input_query_bbox, attn_mask, attn_mask2, dn_meta = \\\n                prepare_for_mask(kpt_mask=kpt_mask)\n        else:\n            assert targets is None\n            input_query_bbox = input_query_label = attn_mask 
= attn_mask2 = dn_meta = None\n\n\n        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(srcs, masks, input_query_bbox, poss,\n                                                                                 input_query_label, attn_mask, attn_mask2,\n                                                                                 text_dict, dn_meta,targets,kpt_embeddings_specific)\n\n        # In case num object=0\n        if self.label_enc is not None:\n            hs[0] += self.label_enc.weight[0, 0] * 0.0\n\n        hs[0] += self.pos_proj.weight[0, 0] * 0.0\n        hs[0] += self.pos_proj.bias[0] * 0.0\n        hs[0] += self.padding.weight[0, 0] * 0.0\n\n        num_group = 50\n        effective_dn_number = dn_meta['pad_size'] if self.training else 0\n        outputs_coord_list = []\n        outputs_class = []\n\n\n        for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_cls_embed, layer_hs) in enumerate(\n                zip(reference[:-1], self.bbox_embed, self.class_embed, hs)):\n\n\n            if dec_lid < self.num_box_decoder_layers:\n                layer_delta_unsig = layer_bbox_embed(layer_hs)\n                layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)\n                layer_outputs_unsig = layer_outputs_unsig.sigmoid()\n                layer_cls = layer_cls_embed(layer_hs, text_dict)\n                outputs_coord_list.append(layer_outputs_unsig)\n                outputs_class.append(layer_cls)\n\n\n            else:\n\n                layer_hs_bbox_dn = layer_hs[:, :effective_dn_number, :]\n                layer_hs_bbox_norm = layer_hs[:, effective_dn_number:, :][:, 0::(self.num_body_points + 1), :]\n                bs = layer_ref_sig.shape[0]\n                reference_before_sigmoid_bbox_dn = layer_ref_sig[:, :effective_dn_number, :]\n                reference_before_sigmoid_bbox_norm = layer_ref_sig[:, effective_dn_number:, :][:,\n                                                     
0::(self.num_body_points + 1), :]\n                layer_delta_unsig_dn = layer_bbox_embed(layer_hs_bbox_dn)\n                layer_delta_unsig_norm = layer_bbox_embed(layer_hs_bbox_norm)\n                layer_outputs_unsig_dn = layer_delta_unsig_dn + inverse_sigmoid(reference_before_sigmoid_bbox_dn)\n                layer_outputs_unsig_dn = layer_outputs_unsig_dn.sigmoid()\n                layer_outputs_unsig_norm = layer_delta_unsig_norm + inverse_sigmoid(reference_before_sigmoid_bbox_norm)\n                layer_outputs_unsig_norm = layer_outputs_unsig_norm.sigmoid()\n                layer_outputs_unsig = torch.cat((layer_outputs_unsig_dn, layer_outputs_unsig_norm), dim=1)\n                layer_cls_dn = layer_cls_embed(layer_hs_bbox_dn, text_dict)\n                layer_cls_norm = layer_cls_embed(layer_hs_bbox_norm, text_dict)\n                layer_cls = torch.cat((layer_cls_dn, layer_cls_norm), dim=1)\n                outputs_class.append(layer_cls)\n                outputs_coord_list.append(layer_outputs_unsig)\n\n        # update keypoints\n        outputs_keypoints_list = []\n        outputs_keypoints_hw = []\n        kpt_index = [x for x in range(num_group * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]\n        for dec_lid, (layer_ref_sig, layer_hs) in enumerate(zip(reference[:-1], hs)):\n            if dec_lid < self.num_box_decoder_layers:\n                assert isinstance(layer_hs, torch.Tensor)\n                bs = layer_hs.shape[0]\n                layer_res = layer_hs.new_zeros((bs, self.num_queries, self.num_body_points * 3))\n                outputs_keypoints_list.append(layer_res)\n            else:\n                bs = layer_ref_sig.shape[0]\n                layer_hs_kpt = layer_hs[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,\n                                                                                                 device=layer_hs.device))\n                delta_xy_unsig = 
self.pose_embed[dec_lid - self.num_box_decoder_layers](layer_hs_kpt)\n                layer_ref_sig_kpt = layer_ref_sig[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,\n                                                                                                           device=layer_hs.device))\n                layer_outputs_unsig_keypoints = delta_xy_unsig + inverse_sigmoid(layer_ref_sig_kpt[..., :2])\n                vis_xy_unsig = torch.ones_like(layer_outputs_unsig_keypoints,\n                                               device=layer_outputs_unsig_keypoints.device)\n                xyv = torch.cat((layer_outputs_unsig_keypoints, vis_xy_unsig[:, :, 0].unsqueeze(-1)), dim=-1)\n                xyv = xyv.sigmoid()\n                layer_res = xyv.reshape((bs, num_group, self.num_body_points, 3)).flatten(2, 3)\n                layer_hw = layer_ref_sig_kpt[..., 2:].reshape(bs, num_group, self.num_body_points, 2).flatten(2, 3)\n                layer_res = keypoint_xyzxyz_to_xyxyzz(layer_res)\n                outputs_keypoints_list.append(layer_res)\n                outputs_keypoints_hw.append(layer_hw)\n\n\n        if self.dn_number > 0 and dn_meta is not None:\n            outputs_class, outputs_coord_list = \\\n                post_process(outputs_class, outputs_coord_list,\n                                dn_meta, self.aux_loss, self._set_aux_loss)\n        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord_list[-1],\n               'pred_keypoints': outputs_keypoints_list[-1]}\n\n        return out\n\n\n@MODULE_BUILD_FUNCS.registe_with_name(module_name='UniPose')\ndef build_unipose(args):\n\n    num_classes = args.num_classes\n    device = torch.device(args.device)\n\n    backbone = build_backbone(args)\n\n    transformer = build_deformable_transformer(args)\n\n    try:\n        match_unstable_error = args.match_unstable_error\n        dn_labelbook_size = args.dn_labelbook_size\n    except:\n        match_unstable_error 
= True\n        dn_labelbook_size = num_classes\n\n    try:\n        dec_pred_class_embed_share = args.dec_pred_class_embed_share\n    except:\n        dec_pred_class_embed_share = True\n    try:\n        dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share\n    except:\n        dec_pred_bbox_embed_share = True\n\n    binary_query_selection = False\n    try:\n        binary_query_selection = args.binary_query_selection\n    except:\n        binary_query_selection = False\n\n    use_cdn = True\n    try:\n        use_cdn = args.use_cdn\n    except:\n        use_cdn = True\n\n    sub_sentence_present = True\n    try:\n        sub_sentence_present = args.sub_sentence_present\n    except:\n        sub_sentence_present = True\n    # print('********* sub_sentence_present', sub_sentence_present)\n\n    model = UniPose(\n        backbone,\n        transformer,\n        num_classes=num_classes,\n        num_queries=args.num_queries,\n        aux_loss=True,\n        iter_update=True,\n        query_dim=4,\n        random_refpoints_xy=args.random_refpoints_xy,\n        fix_refpoints_hw=args.fix_refpoints_hw,\n        num_feature_levels=args.num_feature_levels,\n        nheads=args.nheads,\n        dec_pred_class_embed_share=dec_pred_class_embed_share,\n        dec_pred_bbox_embed_share=dec_pred_bbox_embed_share,\n        # two stage\n        two_stage_type=args.two_stage_type,\n        # box_share\n        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,\n        two_stage_class_embed_share=args.two_stage_class_embed_share,\n        decoder_sa_type=args.decoder_sa_type,\n        num_patterns=args.num_patterns,\n        dn_number=args.dn_number if args.use_dn else 0,\n        dn_box_noise_scale=args.dn_box_noise_scale,\n        dn_label_noise_ratio=args.dn_label_noise_ratio,\n        dn_labelbook_size=dn_labelbook_size,\n        use_label_enc=args.use_label_enc,\n\n        text_encoder_type=args.text_encoder_type,\n\n        
binary_query_selection=binary_query_selection,\n        use_cdn=use_cdn,\n        sub_sentence_present=sub_sentence_present\n    )\n\n    return model\n\n\nclass ContrastiveAssign(nn.Module):\n    def __init__(self, project=False, cal_bias=None, max_text_len=256):\n        \"\"\"\n        :param project: stored on the module; not read by forward\n        :param cal_bias: optional bias module; forward raises NotImplementedError when it is not None\n        :param max_text_len: stored on the module; forward uses the text length of its input instead\n        \"\"\"\n        super().__init__()\n        self.project = project\n        self.cal_bias = cal_bias\n        self.max_text_len = max_text_len\n\n    def forward(self, x, text_dict):\n        \"\"\"Score every query against every text token embedding.\n\n        Args:\n            x (torch.Tensor): query features; the last dimension must match d_model of\n                'encoded_text' because the two are matrix-multiplied.\n            text_dict (dict): text embed and its token mask:\n            {\n                'encoded_text': encoded_text, # bs, 195, d_model\n                'text_token_mask': text_token_mask, # bs, 195\n                        # True for used tokens. False for padding tokens\n            }\n        Returns:\n            torch.Tensor: x @ encoded_text^T, with the columns of padding tokens set to -inf.\n        \"\"\"\n        assert isinstance(text_dict, dict)\n\n        y = text_dict['encoded_text']\n\n\n        max_text_len = y.shape[1]\n\n\n\n        text_token_mask = text_dict['text_token_mask']\n\n        if self.cal_bias is not None:\n            raise NotImplementedError\n            return x @ y.transpose(-1, -2) + self.cal_bias.weight.repeat(x.shape[0], x.shape[1], 1)\n        res = x @ y.transpose(-1, -2)\n        res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))\n\n        # padding to max_text_len\n        new_res = torch.full((*res.shape[:-1], max_text_len), float('-inf'), device=res.device)\n        new_res[..., :res.shape[-1]] = res\n\n        return new_res\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/UniPose/utils.py",
    "content": "# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n\nimport copy\nimport torch\nimport random\nfrom torch import nn, Tensor\nimport os\nimport numpy as np\nimport math\nimport torch.nn.functional as F\nfrom torch import nn\n\n\ndef _get_clones(module, N, layer_share=False):\n    # import ipdb; ipdb.set_trace()\n    if layer_share:\n        return nn.ModuleList([module for i in range(N)])\n    else:\n        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])\n\n\ndef get_sine_pos_embed(\n        pos_tensor: torch.Tensor,\n        num_pos_feats: int = 128,\n        temperature: int = 10000,\n        exchange_xy: bool = True,\n):\n    \"\"\"generate sine position embedding from a position tensor\n    Args:\n        pos_tensor (torch.Tensor): shape: [..., n].\n        num_pos_feats (int): projected shape for each float in the tensor.\n        temperature (int): temperature in the sine/cosine function.\n        exchange_xy (bool, optional): exchange pos x and pos y. \\\n            For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. 
Defaults to True.\n    Returns:\n        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].\n    \"\"\"\n    scale = 2 * math.pi\n    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)\n    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode=\"floor\") / num_pos_feats)\n\n    def sine_func(x: torch.Tensor):\n        sin_x = x * scale / dim_t\n        sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2)\n        return sin_x\n\n    pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)]\n    if exchange_xy:\n        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]\n    pos_res = torch.cat(pos_res, dim=-1)\n    return pos_res\n\n\ndef gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None):\n    \"\"\"\n    Input:\n        - memory: bs, \\sum{hw}, d_model\n        - memory_padding_mask: bs, \\sum{hw}\n        - spatial_shapes: nlevel, 2\n        - learnedwh: 2\n    Output:\n        - output_memory: bs, \\sum{hw}, d_model\n        - output_proposals: bs, \\sum{hw}, 4\n    \"\"\"\n    N_, S_, C_ = memory.shape\n    base_scale = 4.0\n    proposals = []\n    _cur = 0\n    for lvl, (H_, W_) in enumerate(spatial_shapes):\n        mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)\n        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)\n        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)\n\n        # import ipdb; ipdb.set_trace()\n\n        grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),\n                                        torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))\n        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2\n\n        scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)\n        grid = 
(grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale\n\n        if learnedwh is not None:\n            # import ipdb; ipdb.set_trace()\n            wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)\n        else:\n            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)\n\n        # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1)\n        # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale\n        # wh = torch.ones_like(grid) / scale\n        proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)\n        proposals.append(proposal)\n        _cur += (H_ * W_)\n    # import ipdb; ipdb.set_trace()\n    output_proposals = torch.cat(proposals, 1)\n    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)\n    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid\n    output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))\n    output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))\n\n    output_memory = memory\n    output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))\n    output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))\n\n    # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))\n    # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf'))\n\n    return output_memory, output_proposals\n\n\nclass RandomBoxPerturber():\n    def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:\n        self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale])\n\n    def __call__(self, refanchors: Tensor) -> Tensor:\n        nq, bs, query_dim = refanchors.shape\n        device = refanchors.device\n\n        noise_raw = 
torch.rand_like(refanchors)\n        noise_scale = self.noise_scale.to(device)[:query_dim]\n\n        new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale)\n        return new_refanchors.clamp_(0, 1)\n\n\ndef sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False):\n    \"\"\"\n    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.\n    Args:\n        inputs: A float tensor of arbitrary shape.\n                The predictions for each example.\n        targets: A float tensor with the same shape as inputs. Stores the binary\n                 classification label for each element in inputs\n                (0 for the negative class and 1 for the positive class).\n        alpha: (optional) Weighting factor in range (0,1) to balance\n                positive vs negative examples. Default = -1 (no weighting).\n        gamma: Exponent of the modulating factor (1 - p_t) to\n               balance easy vs hard examples.\n    Returns:\n        Loss tensor\n    \"\"\"\n    prob = inputs.sigmoid()\n    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction=\"none\")\n    p_t = prob * targets + (1 - prob) * (1 - targets)\n    loss = ce_loss * ((1 - p_t) ** gamma)\n\n    if alpha >= 0:\n        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)\n        loss = alpha_t * loss\n\n    if no_reduction:\n        return loss\n\n    return loss.mean(1).sum() / num_boxes\n\n\nclass MLP(nn.Module):\n    \"\"\" Very simple multi-layer perceptron (also called FFN)\"\"\"\n\n    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):\n        super().__init__()\n        self.num_layers = num_layers\n        h = [hidden_dim] * (num_layers - 1)\n        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))\n\n    def forward(self, x):\n        for i, layer in enumerate(self.layers):\n            x = F.relu(layer(x)) if i < 
self.num_layers - 1 else layer(x)\n        return x\n\n\ndef _get_activation_fn(activation, d_model=256, batch_dim=0):\n    \"\"\"Return an activation function given a string\"\"\"\n    if activation == \"relu\":\n        return F.relu\n    if activation == \"gelu\":\n        return F.gelu\n    if activation == \"glu\":\n        return F.glu\n    if activation == \"prelu\":\n        return nn.PReLU()\n    if activation == \"selu\":\n        return F.selu\n\n    raise RuntimeError(F\"activation should be relu/gelu, not {activation}.\")\n\n\ndef gen_sineembed_for_position(pos_tensor):\n    # n_query, bs, _ = pos_tensor.size()\n    # sineembed_tensor = torch.zeros(n_query, bs, 256)\n    scale = 2 * math.pi\n    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)\n    dim_t = 10000 ** (2 * (dim_t // 2) / 128)\n    x_embed = pos_tensor[:, :, 0] * scale\n    y_embed = pos_tensor[:, :, 1] * scale\n    pos_x = x_embed[:, :, None] / dim_t\n    pos_y = y_embed[:, :, None] / dim_t\n    pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)\n    pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)\n    if pos_tensor.size(-1) == 2:\n        pos = torch.cat((pos_y, pos_x), dim=2)\n    elif pos_tensor.size(-1) == 4:\n        w_embed = pos_tensor[:, :, 2] * scale\n        pos_w = w_embed[:, :, None] / dim_t\n        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)\n\n        h_embed = pos_tensor[:, :, 3] * scale\n        pos_h = h_embed[:, :, None] / dim_t\n        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)\n\n        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)\n    else:\n        raise ValueError(\"Unknown pos_tensor shape(-1):{}\".format(pos_tensor.size(-1)))\n    return pos\n\n\ndef oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):\n    sigmas = kpt_preds.new_tensor(sigmas)\n  
  variances = (sigmas * 2) ** 2\n\n    assert kpt_preds.size(0) == kpt_gts.size(0)\n    kpt_preds = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2)\n    kpt_gts = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2)\n\n    squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \\\n                       (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2\n    # import pdb\n    # pdb.set_trace()\n    # assert (kpt_valids.sum(-1) > 0).all()\n    squared_distance0 = squared_distance / (kpt_areas[:, None] * variances[None, :] * 2)\n    squared_distance1 = torch.exp(-squared_distance0)\n    squared_distance1 = squared_distance1 * kpt_valids\n    oks = squared_distance1.sum(dim=1) / (kpt_valids.sum(dim=1) + 1e-6)\n\n    return oks\n\n\ndef oks_loss(pred,\n             target,\n             valid=None,\n             area=None,\n             linear=False,\n             sigmas=None,\n             eps=1e-6):\n    \"\"\"Oks loss.\n    Computing the oks loss between a set of predicted poses and target poses.\n    The loss is calculated as negative log of oks.\n    Args:\n        pred (torch.Tensor): Predicted poses of format (x1, y1, x2, y2, ...),\n            shape (n, 2K).\n        target (torch.Tensor): Corresponding gt poses, shape (n, 2K).\n        linear (bool, optional): If True, use linear scale of loss instead of\n            log scale. 
Default: False.\n        eps (float): Eps to avoid log(0).\n    Return:\n        torch.Tensor: Loss tensor.\n    \"\"\"\n    oks = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)\n    if linear:\n        loss = 1 - oks\n    else:\n        loss = -oks.log()\n    return loss\n\n\nclass OKSLoss(nn.Module):\n    \"\"\"IoULoss.\n    Computing the oks loss between a set of predicted poses and target poses.\n    Args:\n        linear (bool): If True, use linear scale of loss instead of log scale.\n            Default: False.\n        eps (float): Eps to avoid log(0).\n        reduction (str): Options are \"none\", \"mean\" and \"sum\".\n        loss_weight (float): Weight of loss.\n    \"\"\"\n\n    def __init__(self,\n                 linear=False,\n                 num_keypoints=17,\n                 eps=1e-6,\n                 reduction='mean',\n                 loss_weight=1.0):\n        super(OKSLoss, self).__init__()\n        self.linear = linear\n        self.eps = eps\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n        if num_keypoints == 68:\n            self.sigmas = np.array([\n                .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,\n                1.07, .87, .87, .89, .89, .25, .25, .25, .25, .25, .25, .25, .25,\n                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,\n                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,\n                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,\n            ], dtype=np.float32) / 10.0\n        else:\n            raise ValueError(f'Unsupported keypoints number {num_keypoints}')\n\n    def forward(self,\n                pred,\n                target,\n                valid,\n                area,\n                weight=None,\n                avg_factor=None,\n                reduction_override=None):\n        \"\"\"Forward function.\n        Args:\n            
pred (torch.Tensor): The prediction.\n            target (torch.Tensor): The learning target of the prediction.\n            valid (torch.Tensor): The visible flag of the target pose.\n            area (torch.Tensor): The area of the target pose.\n            weight (torch.Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. Defaults to None.\n            reduction_override (str, optional): The reduction method used to\n                override the original reduction method of the loss.\n                Defaults to None. Options are \"none\", \"mean\" and \"sum\".\n        Returns:\n            torch.Tensor: loss_weight * oks_loss(...), one value per pose.\n                Note that weight, avg_factor and the reduction mode are not\n                applied to this value; they only decide the early return of\n                (pred * weight).sum() when no weight is positive.\n        \"\"\"\n        assert reduction_override in (None, 'none', 'mean', 'sum')\n        reduction = (\n            reduction_override if reduction_override else self.reduction)\n        if (weight is not None) and (not torch.any(weight > 0)) and (\n                reduction != 'none'):\n            if pred.dim() == weight.dim() + 1:\n                weight = weight.unsqueeze(1)\n            return (pred * weight).sum()  # 0\n        if weight is not None and weight.dim() > 1:\n            # TODO: remove this in the future\n            # reduce the weight of shape (n, 2K) to (n,) to match the\n            # oks_loss of shape (n,)\n            assert weight.shape == pred.shape\n            weight = weight.mean(-1)\n        loss = self.loss_weight * oks_loss(\n            pred,\n            target,\n            valid=valid,\n            area=area,\n            linear=self.linear,\n            sigmas=self.sigmas,\n            eps=self.eps)\n        return loss\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/__init__.py",
    "content": "# ------------------------------------------------------------------------\n# ED-Pose\n# Copyright (c) 2023 IDEA. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\nfrom .UniPose.unipose import build_unipose\n\ndef build_model(args):\n    # we use register to maintain models from catdet6 on.\n    from .registry import MODULE_BUILD_FUNCS\n\n    assert args.modelname in MODULE_BUILD_FUNCS._module_dict\n    build_func = MODULE_BUILD_FUNCS.get(args.modelname)\n    model = build_func(args)\n    return model\n"
  },
  {
    "path": "src/utils/dependencies/XPose/models/registry.py",
    "content": "# -*- coding: utf-8 -*-\n# @Author: Yihao Chen\n# @Date:   2021-08-16 16:03:17\n# @Last Modified by:   Shilong Liu\n# @Last Modified time: 2022-01-23 15:26\n# modified from mmcv\n\nimport inspect\nfrom functools import partial\n\n\nclass Registry(object):\n\n    def __init__(self, name):\n        self._name = name\n        self._module_dict = dict()\n\n    def __repr__(self):\n        format_str = self.__class__.__name__ + '(name={}, items={})'.format(\n            self._name, list(self._module_dict.keys()))\n        return format_str\n\n    def __len__(self):\n        return len(self._module_dict)\n\n    @property\n    def name(self):\n        return self._name\n\n    @property\n    def module_dict(self):\n        return self._module_dict\n\n    def get(self, key):\n        return self._module_dict.get(key, None)\n\n    def registe_with_name(self, module_name=None, force=False):\n        return partial(self.register, module_name=module_name, force=force)\n\n    def register(self, module_build_function, module_name=None, force=False):\n        \"\"\"Register a module build function.\n        Args:\n            module (:obj:`nn.Module`): Module to be registered.\n        \"\"\"\n        if not inspect.isfunction(module_build_function):\n            raise TypeError('module_build_function must be a function, but got {}'.format(\n                type(module_build_function)))\n        if module_name is None:\n            module_name = module_build_function.__name__\n        if not force and module_name in self._module_dict:\n            raise KeyError('{} is already registered in {}'.format(\n                module_name, self.name))\n        self._module_dict[module_name] = module_build_function\n\n        return module_build_function\n\nMODULE_BUILD_FUNCS = Registry('model build functions')\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/predefined_keypoints.py",
    "content": "person = {\"keypoints\":['nose', 'left eye', 'right eye', 'left ear', 'right ear', 'left shoulder', 'right shoulder', 'left elbow', 'right elbow', 'left wrist', 'right wrist', 'left hip', 'right hip', 'left knee', 'right knee', 'left ankle', 'right ankle'],\"skeleton\": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]}\n\nface = {\"keypoints\": ['right cheekbone 1', 'right cheekbone 2', 'right cheek 1', 'right cheek 2', 'right cheek 3', 'right cheek 4', 'right cheek 5', 'right chin', 'chin center', 'left chin', 'left cheek 5', 'left cheek 4', 'left cheek 3', 'left cheek 2', 'left cheek 1', 'left cheekbone 2', 'left cheekbone 1', 'right eyebrow 1', 'right eyebrow 2', 'right eyebrow 3', 'right eyebrow 4', 'right eyebrow 5', 'left eyebrow 1', 'left eyebrow 2', 'left eyebrow 3', 'left eyebrow 4', 'left eyebrow 5', 'nasal bridge 1', 'nasal bridge 2', 'nasal bridge 3', 'nasal bridge 4', 'right nasal wing 1', 'right nasal wing 2', 'nasal wing center', 'left nasal wing 1', 'left nasal wing 2', 'right eye eye corner 1', 'right eye upper eyelid 1', 'right eye upper eyelid 2', 'right eye eye corner 2', 'right eye lower eyelid 2', 'right eye lower eyelid 1', 'left eye eye corner 1', 'left eye upper eyelid 1', 'left eye upper eyelid 2', 'left eye eye corner 2', 'left eye lower eyelid 2', 'left eye lower eyelid 1', 'right mouth corner', 'upper lip outer edge 1', 'upper lip outer edge 2', 'upper lip outer edge 3', 'upper lip outer edge 4', 'upper lip outer edge 5', 'left mouth corner', 'lower lip outer edge 5', 'lower lip outer edge 4', 'lower lip outer edge 3', 'lower lip outer edge 2', 'lower lip outer edge 1', 'upper lip inter edge 1', 'upper lip inter edge 2', 'upper lip inter edge 3', 'upper lip inter edge 4', 'upper lip inter edge 5', 'lower lip inter edge 3', 'lower lip inter edge 2', 'lower lip inter edge 1'], \"skeleton\": []}\n\nhand = {\"keypoints\":['wrist', 'thumb root', 
\"thumb's third knuckle\", \"thumb's second knuckle\", 'thumb’s first knuckle', \"forefinger's root\", \"forefinger's third knuckle\", \"forefinger's second knuckle\", \"forefinger's first knuckle\", \"middle finger's root\", \"middle finger's third knuckle\", \"middle finger's second knuckle\", \"middle finger's first knuckle\", \"ring finger's root\", \"ring finger's third knuckle\", \"ring finger's second knuckle\", \"ring finger's first knuckle\", \"pinky finger's root\", \"pinky finger's third knuckle\", \"pinky finger's second knuckle\", \"pinky finger's first knuckle\"],\"skeleton\": []}\n\nanimal_in_AnimalKindom = {\"keypoints\":['head mid top', 'eye left', 'eye right', 'mouth front top', 'mouth back left', 'mouth back right', 'mouth front bottom', 'shoulder left', 'shoulder right', 'elbow left', 'elbow right', 'wrist left', 'wrist right', 'torso mid back', 'hip left', 'hip right', 'knee left', 'knee right', 'ankle left ', 'ankle right', 'tail top back', 'tail mid back', 'tail end back'],\"skeleton\": [[1, 0], [2, 0], [3, 4], [3, 5], [4, 6], [5, 6], [0, 7], [0, 8], [7, 9], [8, 10], [9, 11], [10, 12], [0, 13], [13, 20], [20, 14], [20, 15], [14, 16], [15, 17], [16, 18], [17, 19], [20, 21], [21, 22]]}\n\nanimal_in_AP10K = {\"keypoints\": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], \"skeleton\": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}\n\nanimal= {\"keypoints\": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], \"skeleton\": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 
5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}\n\nanimal_face = {\"keypoints\": ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip'], \"skeleton\": []}\n\nfly = {\"keypoints\": ['head', 'eye left', 'eye right', 'neck', 'thorax', 'abdomen', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'wing left', 'wing right'], \"skeleton\": [[2, 1], [3, 1], [4, 1], [5, 4], [6, 5], [8, 7], [9, 8], [10, 9], [12, 11], [13, 12], [14, 13], [16, 15], [17, 16], [18, 17], [20, 19], [21, 20], [22, 21], [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], [30, 29], [31, 4], [32, 4]]}\n\nlocust = {\"keypoints\": ['head', 'neck', 'thorax', 'abdomen1', 'abdomen2', 'anttip left', 'antbase left', 'eye left', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'anttip right', 'antbase right', 'eye right', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right 
first segment', 'hindleg right second segment', 'hindleg right tip'],\"skeleton\": [[2, 1], [3, 2], [4, 3], [5, 4], [7, 6], [8, 7], [10, 9], [11, 10], [12, 11], [14, 13], [15, 14],[16, 15], [18, 17], [19, 18], [20, 19], [22, 21], [23, 22], [25, 24], [26, 25], [27, 26],[29, 28], [30, 29], [31, 30], [33, 32], [34, 33], [35, 34]]}\n\ncar ={\"keypoints\": ['right front wheel center', 'left front wheel center', 'right rear wheel center', 'left rear wheel center', 'front right', 'front left', 'back right', 'back left', 'none', 'roof front right', 'roof front left', 'roof back right', 'roof back left', 'none'],\"skeleton\": [[0, 2], [1, 3], [0, 1], [2, 3], [9, 11], [10, 12], [9, 10], [11, 12], [4, 0], [4, 9], [4, 5], [5, 1], [5, 10], [6, 2], [6, 11], [7, 3], [7, 12], [6, 7]]}\n\nshort_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}\n\nlong_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'right sleeve inside 3', 'right sleeve inside 4', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 4', 
'left sleeve inside 3', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}\n\nshort_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}\n\nsling={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}\n\nvest = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}\n\nlong_sleeved_dress={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 
'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'center hem', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}\n\nlong_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}\n\ntrousers = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right side outside 3', 'right cuff outside', 'right cuff inside', 'right side inside 1', 'crotch', 'left side inside 1', 'left cuff inside', 'left cuff outside', 'left side outside 3', 'left side outside 2'], 'skeleton': []}\n\nsling_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 
'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}\n\nvest_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}\n\nskirt = {'keypoints': ['right side 1', 'upper center', 'left side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2'], 'skeleton': []}\n\nshort_sleeved_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'left side 1', 'left side 2', 'left side 3', 'left side 4', 'left side 5', 'center hem', 'right side 5', 'right side 4', 'right side 3', 'right side 2', 'right side 1', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}\n\nshorts = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right cuff outside', 'right cuff inside', 'crotch', 'left cuff inside', 'left cuff outside', 'left side outside 2'], 'skeleton': []}\n\ntable = {'keypoints': ['desktop corner 1', 'desktop corner 2', 'desktop corner 3', 'desktop corner 4', 'table leg 1', 'table leg 2', 'table leg 3', 'table leg 4'], 'skeleton': []}\n\nchair = {'keypoints': ['legs righttopcorner', 'legs lefttopcorner', 'legs leftbottomcorner', 'legs rightbottomcorner', 'base righttop', 'base lefttop', 
'base leftbottom', 'base rightbottom', 'headboard righttop', 'headboard lefttop'], 'skeleton': []}\n\nbed = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'backrest lefttop'], 'skeleton': []}\n\nsofa = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'armrests rightbottomcorner', 'armrests righttopcorner', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'armrests leftbottomcorner', 'armrests lefttopcorner', 'backrest lefttop'], 'skeleton': []}\n\nswivelchair = {'keypoints': ['rotatingbase 1', 'rotatingbase 2', 'rotatingbase 3', 'rotatingbase 4', 'rotatingbase 5', 'rotatingbase center', 'base center', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'backrest righttop', 'backrest lefttop'], 'skeleton': []}\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/transforms.py",
    "content": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\nTransforms and data augmentation for both image + bbox.\n\"\"\"\nimport os\nimport sys\nimport random\n\nimport PIL\nimport torch\nimport torchvision.transforms as T\nimport torchvision.transforms.functional as F\n\nsys.path.append(os.path.dirname(os.path.abspath(__file__)))\nfrom util.box_ops import box_xyxy_to_cxcywh\nfrom util.misc import interpolate\n\n\ndef crop(image, target, region):\n    cropped_image = F.crop(image, *region)\n\n    if target is not None:\n        target = target.copy()\n        i, j, h, w = region\n        id2catname = target[\"id2catname\"]\n        caption_list = target[\"caption_list\"]\n        target[\"size\"] = torch.tensor([h, w])\n\n        fields = [\"labels\", \"area\", \"iscrowd\", \"positive_map\",\"keypoints\"]\n\n        if \"boxes\" in target:\n            boxes = target[\"boxes\"]\n            max_size = torch.as_tensor([w, h], dtype=torch.float32)\n            cropped_boxes = boxes - torch.as_tensor([j, i, j, i])\n            cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)\n            cropped_boxes = cropped_boxes.clamp(min=0)\n            area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)\n            target[\"boxes\"] = cropped_boxes.reshape(-1, 4)\n            target[\"area\"] = area\n            fields.append(\"boxes\")\n\n        if \"masks\" in target:\n            # FIXME should we update the area here if there are no boxes?\n            target['masks'] = target['masks'][:, i:i + h, j:j + w]\n            fields.append(\"masks\")\n\n\n        # remove elements for which the boxes or masks that have zero area\n        if \"boxes\" in target or \"masks\" in target:\n            # favor boxes selection when defining which elements to keep\n            # this is compatible with previous implementation\n            if \"boxes\" in target:\n                cropped_boxes = 
target['boxes'].reshape(-1, 2, 2)\n                keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)\n            else:\n                keep = target['masks'].flatten(1).any(1)\n\n            for field in fields:\n                if field in target:\n                    target[field] = target[field][keep]\n\n        if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':\n            # for debug and visualization only.\n            if 'strings_positive' in target:\n                target['strings_positive'] = [_i for _i, _j in zip(target['strings_positive'], keep) if _j]\n\n\n        if \"keypoints\" in target:\n            max_size = torch.as_tensor([w, h], dtype=torch.float32)\n            keypoints = target[\"keypoints\"]\n            cropped_keypoints = keypoints.view(-1, 3)[:,:2] - torch.as_tensor([j, i])\n            cropped_keypoints = torch.min(cropped_keypoints, max_size)\n            cropped_keypoints = cropped_keypoints.clamp(min=0)\n            cropped_keypoints = torch.cat([cropped_keypoints, keypoints.view(-1, 3)[:,2].unsqueeze(1)], dim=1)\n            target[\"keypoints\"] = cropped_keypoints.view(target[\"keypoints\"].shape[0], target[\"keypoints\"].shape[1], 3)\n\n        target[\"id2catname\"] = id2catname\n        target[\"caption_list\"] = caption_list\n\n    return cropped_image, target\n\n\ndef hflip(image, target):\n    flipped_image = F.hflip(image)\n\n    w, h = image.size\n\n    if target is not None:\n        target = target.copy()\n        if \"boxes\" in target:\n            boxes = target[\"boxes\"]\n            boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])\n            target[\"boxes\"] = boxes\n\n        if \"masks\" in target:\n            target['masks'] = target['masks'].flip(-1)\n\n\n        if \"keypoints\" in target:\n            dataset_name=target[\"dataset_name\"]\n            if dataset_name == \"coco_person\" or dataset_name == \"macaque\":\n       
         flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8],\n                                   [9, 10], [11, 12], [13, 14], [15, 16]]\n\n            elif dataset_name==\"animalkindom_ak_P1_animal\":\n                flip_pairs = [[1, 2], [4, 5],[7,8],[9,10],[11,12],[14,15],[16,17],[18,19]]\n\n            elif dataset_name==\"animalweb_animal\":\n                flip_pairs = [[0, 3], [1, 2], [5, 6]]\n\n            elif dataset_name==\"face\":\n                flip_pairs = [\n                                [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],\n                                [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],\n                                [31, 35], [32, 34],\n                                [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],\n                                [48, 54], [49, 53], [50, 52],\n                                [55, 59], [56, 58],\n                                [60, 64], [61, 63],\n                                [65, 67]\n                            ]\n\n            elif dataset_name==\"hand\":\n                flip_pairs = []\n\n            elif dataset_name==\"foot\":\n                flip_pairs = []\n\n            elif dataset_name==\"locust\":\n                flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]]\n\n            elif dataset_name==\"fly\":\n                flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], [16, 28], [17, 29], [30, 31]]\n\n            elif dataset_name == \"ap_36k_animal\" or dataset_name == \"ap_10k_animal\":\n                flip_pairs = [[0, 1],[5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]]\n\n\n\n            keypoints = target[\"keypoints\"]\n            keypoints[:,:,0] = w - keypoints[:,:, 0]-1\n            for pair in flip_pairs:\n                
keypoints[:,pair[0], :], keypoints[:,pair[1], :] = keypoints[:,pair[1], :], keypoints[:,pair[0], :].clone()\n            target[\"keypoints\"] = keypoints\n    return flipped_image, target\n\n\ndef resize(image, target, size, max_size=None):\n    # size can be min_size (scalar) or (w, h) tuple\n\n    def get_size_with_aspect_ratio(image_size, size, max_size=None):\n        w, h = image_size\n        if max_size is not None:\n            min_original_size = float(min((w, h)))\n            max_original_size = float(max((w, h)))\n            if max_original_size / min_original_size * size > max_size:\n                size = int(round(max_size * min_original_size / max_original_size))\n\n        if (w <= h and w == size) or (h <= w and h == size):\n            return (h, w)\n\n        if w < h:\n            ow = size\n            oh = int(size * h / w)\n        else:\n            oh = size\n            ow = int(size * w / h)\n\n        return (oh, ow)\n\n    def get_size(image_size, size, max_size=None):\n        if isinstance(size, (list, tuple)):\n            return size[::-1]\n        else:\n            return get_size_with_aspect_ratio(image_size, size, max_size)\n\n    size = get_size(image.size, size, max_size)\n    rescaled_image = F.resize(image, size)\n\n    if target is None:\n        return rescaled_image, None\n\n    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))\n    ratio_width, ratio_height = ratios\n\n    target = target.copy()\n    if \"boxes\" in target:\n        boxes = target[\"boxes\"]\n        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])\n        target[\"boxes\"] = scaled_boxes\n\n    if \"area\" in target:\n        area = target[\"area\"]\n        scaled_area = area * (ratio_width * ratio_height)\n        target[\"area\"] = scaled_area\n\n\n    if \"keypoints\" in target:\n        keypoints = target[\"keypoints\"]\n        scaled_keypoints = 
keypoints * torch.as_tensor([ratio_width, ratio_height, 1])\n        target[\"keypoints\"] = scaled_keypoints\n\n    h, w = size\n    target[\"size\"] = torch.tensor([h, w])\n\n    if \"masks\" in target:\n        target['masks'] = interpolate(\n            target['masks'][:, None].float(), size, mode=\"nearest\")[:, 0] > 0.5\n\n    return rescaled_image, target\n\n\ndef pad(image, target, padding):\n    # assumes that we only pad on the bottom right corners\n    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))\n    if target is None:\n        return padded_image, None\n    target = target.copy()\n    # should we do something wrt the original size?\n    target[\"size\"] = torch.tensor(padded_image.size[::-1])\n    if \"masks\" in target:\n        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))\n    return padded_image, target\n\n\nclass ResizeDebug(object):\n    def __init__(self, size):\n        self.size = size\n\n    def __call__(self, img, target):\n        return resize(img, target, self.size)\n\n\nclass RandomCrop(object):\n    def __init__(self, size):\n        self.size = size\n\n    def __call__(self, img, target):\n        region = T.RandomCrop.get_params(img, self.size)\n        return crop(img, target, region)\n\n\nclass RandomSizeCrop(object):\n    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):\n        # respect_boxes:    True to keep all boxes\n        #                   False to tolerence box filter\n        self.min_size = min_size\n        self.max_size = max_size\n        self.respect_boxes = respect_boxes\n\n    def __call__(self, img: PIL.Image.Image, target: dict):\n        init_boxes = len(target[\"boxes\"]) if (target is not None and \"boxes\" in target) else 0\n        max_patience = 10\n        for i in range(max_patience):\n            w = random.randint(self.min_size, min(img.width, self.max_size))\n            h = random.randint(self.min_size, 
min(img.height, self.max_size))\n            region = T.RandomCrop.get_params(img, [h, w])\n            result_img, result_target = crop(img, target, region)\n            if target is not None:\n                if not self.respect_boxes or len(result_target[\"boxes\"]) == init_boxes or i == max_patience - 1:\n                    return result_img, result_target\n        return result_img, result_target\n\n\nclass CenterCrop(object):\n    def __init__(self, size):\n        self.size = size\n\n    def __call__(self, img, target):\n        image_width, image_height = img.size\n        crop_height, crop_width = self.size\n        crop_top = int(round((image_height - crop_height) / 2.))\n        crop_left = int(round((image_width - crop_width) / 2.))\n        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))\n\n\nclass RandomHorizontalFlip(object):\n    def __init__(self, p=0.5):\n        self.p = p\n\n    def __call__(self, img, target):\n        if random.random() < self.p:\n            return hflip(img, target)\n        return img, target\n\n\nclass RandomResize(object):\n    def __init__(self, sizes, max_size=None):\n        assert isinstance(sizes, (list, tuple))\n        self.sizes = sizes\n        self.max_size = max_size\n\n    def __call__(self, img, target=None):\n        size = random.choice(self.sizes)\n        return resize(img, target, size, self.max_size)\n\n\nclass RandomPad(object):\n    def __init__(self, max_pad):\n        self.max_pad = max_pad\n\n    def __call__(self, img, target):\n        pad_x = random.randint(0, self.max_pad)\n        pad_y = random.randint(0, self.max_pad)\n        return pad(img, target, (pad_x, pad_y))\n\n\nclass RandomSelect(object):\n    \"\"\"\n    Randomly selects between transforms1 and transforms2,\n    with probability p for transforms1 and (1 - p) for transforms2\n    \"\"\"\n    def __init__(self, transforms1, transforms2, p=0.5):\n        self.transforms1 = transforms1\n        
self.transforms2 = transforms2\n        self.p = p\n\n    def __call__(self, img, target):\n        if random.random() < self.p:\n            return self.transforms1(img, target)\n        return self.transforms2(img, target)\n\n\nclass ToTensor(object):\n    def __call__(self, img, target):\n        return F.to_tensor(img), target\n\n\nclass RandomErasing(object):\n\n    def __init__(self, *args, **kwargs):\n        self.eraser = T.RandomErasing(*args, **kwargs)\n\n    def __call__(self, img, target):\n        return self.eraser(img), target\n\n\nclass Normalize(object):\n    def __init__(self, mean, std):\n        self.mean = mean\n        self.std = std\n\n    def __call__(self, image, target=None):\n        image = F.normalize(image, mean=self.mean, std=self.std)\n        if target is None:\n            return image, None\n        target = target.copy()\n        h, w = image.shape[-2:]\n        if \"boxes\" in target:\n            boxes = target[\"boxes\"]\n            boxes = box_xyxy_to_cxcywh(boxes)\n            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)\n            target[\"boxes\"] = boxes\n\n        if \"area\" in target:\n            area = target[\"area\"]\n            area = area / (torch.tensor(w, dtype=torch.float32)*torch.tensor(h, dtype=torch.float32))\n            target[\"area\"] = area\n\n        if \"keypoints\" in target:\n            keypoints = target[\"keypoints\"]\n            V = keypoints[:, :, 2]\n            V[V == 2] = 1\n            Z=keypoints[:, :, :2]\n            Z = Z.contiguous().view(-1, 2 * V.shape[-1])\n            Z = Z / torch.tensor([w, h] * V.shape[-1], dtype=torch.float32)\n            target[\"valid_kpt_num\"] = V.shape[1]\n            Z_pad = torch.zeros(Z.shape[0],68 * 2 - Z.shape[1])\n            V_pad = torch.zeros(V.shape[0],68 - V.shape[1])\n            V=torch.cat([V, V_pad], dim=1)\n            Z=torch.cat([Z, Z_pad], dim=1)\n            all_keypoints = torch.cat([Z, V], dim=1)\n            
target[\"keypoints\"] = all_keypoints\n\n\n        return image, target\n\n\nclass Compose(object):\n    def __init__(self, transforms):\n        self.transforms = transforms\n\n    def __call__(self, image, target):\n        for t in self.transforms:\n            image, target = t(image, target)\n        return image, target\n\n    def __repr__(self):\n        format_string = self.__class__.__name__ + \"(\"\n        for t in self.transforms:\n            format_string += \"\\n\"\n            format_string += \"    {0}\".format(t)\n        format_string += \"\\n)\"\n        return format_string\n"
  },
  {
    "path": "src/utils/dependencies/XPose/util/addict.py",
    "content": "import copy\n\n\nclass Dict(dict):\n\n    def __init__(__self, *args, **kwargs):\n        object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))\n        object.__setattr__(__self, '__key', kwargs.pop('__key', None))\n        object.__setattr__(__self, '__frozen', False)\n        for arg in args:\n            if not arg:\n                continue\n            elif isinstance(arg, dict):\n                for key, val in arg.items():\n                    __self[key] = __self._hook(val)\n            elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)):\n                __self[arg[0]] = __self._hook(arg[1])\n            else:\n                for key, val in iter(arg):\n                    __self[key] = __self._hook(val)\n\n        for key, val in kwargs.items():\n            __self[key] = __self._hook(val)\n\n    def __setattr__(self, name, value):\n        if hasattr(self.__class__, name):\n            raise AttributeError(\"'Dict' object attribute \"\n                                 \"'{0}' is read-only\".format(name))\n        else:\n            self[name] = value\n\n    def __setitem__(self, name, value):\n        isFrozen = (hasattr(self, '__frozen') and\n                    object.__getattribute__(self, '__frozen'))\n        if isFrozen and name not in super(Dict, self).keys():\n                raise KeyError(name)\n        super(Dict, self).__setitem__(name, value)\n        try:\n            p = object.__getattribute__(self, '__parent')\n            key = object.__getattribute__(self, '__key')\n        except AttributeError:\n            p = None\n            key = None\n        if p is not None:\n            p[key] = self\n            object.__delattr__(self, '__parent')\n            object.__delattr__(self, '__key')\n\n    def __add__(self, other):\n        if not self.keys():\n            return other\n        else:\n            self_type = type(self).__name__\n            other_type = type(other).__name__\n          
  msg = \"unsupported operand type(s) for +: '{}' and '{}'\"\n            raise TypeError(msg.format(self_type, other_type))\n\n    @classmethod\n    def _hook(cls, item):\n        if isinstance(item, dict):\n            return cls(item)\n        elif isinstance(item, (list, tuple)):\n            return type(item)(cls._hook(elem) for elem in item)\n        return item\n\n    def __getattr__(self, item):\n        return self.__getitem__(item)\n\n    def __missing__(self, name):\n        if object.__getattribute__(self, '__frozen'):\n            raise KeyError(name)\n        return self.__class__(__parent=self, __key=name)\n\n    def __delattr__(self, name):\n        del self[name]\n\n    def to_dict(self):\n        base = {}\n        for key, value in self.items():\n            if isinstance(value, type(self)):\n                base[key] = value.to_dict()\n            elif isinstance(value, (list, tuple)):\n                base[key] = type(value)(\n                    item.to_dict() if isinstance(item, type(self)) else\n                    item for item in value)\n            else:\n                base[key] = value\n        return base\n\n    def copy(self):\n        return copy.copy(self)\n\n    def deepcopy(self):\n        return copy.deepcopy(self)\n\n    def __deepcopy__(self, memo):\n        other = self.__class__()\n        memo[id(self)] = other\n        for key, value in self.items():\n            other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)\n        return other\n\n    def update(self, *args, **kwargs):\n        other = {}\n        if args:\n            if len(args) > 1:\n                raise TypeError()\n            other.update(args[0])\n        other.update(kwargs)\n        for k, v in other.items():\n            if ((k not in self) or\n                (not isinstance(self[k], dict)) or\n                (not isinstance(v, dict))):\n                self[k] = v\n            else:\n                self[k].update(v)\n\n    def 
__getnewargs__(self):\n        return tuple(self.items())\n\n    def __getstate__(self):\n        return self\n\n    def __setstate__(self, state):\n        self.update(state)\n\n    def __or__(self, other):\n        if not isinstance(other, (Dict, dict)):\n            return NotImplemented\n        new = Dict(self)\n        new.update(other)\n        return new\n\n    def __ror__(self, other):\n        if not isinstance(other, (Dict, dict)):\n            return NotImplemented\n        new = Dict(other)\n        new.update(self)\n        return new\n\n    def __ior__(self, other):\n        self.update(other)\n        return self\n\n    def setdefault(self, key, default=None):\n        if key in self:\n            return self[key]\n        else:\n            self[key] = default\n            return default\n\n    def freeze(self, shouldFreeze=True):\n        object.__setattr__(self, '__frozen', shouldFreeze)\n        for key, val in self.items():\n            if isinstance(val, Dict):\n                val.freeze(shouldFreeze)\n\n    def unfreeze(self):\n        self.freeze(False)\n"
  },
  {
    "path": "src/utils/dependencies/XPose/util/box_ops.py",
    "content": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\nUtilities for bounding box manipulation and GIoU.\n\"\"\"\nimport torch, os\nfrom torchvision.ops.boxes import box_area\n\n\ndef box_cxcywh_to_xyxy(x):\n    x_c, y_c, w, h = x.unbind(-1)\n    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),\n         (x_c + 0.5 * w), (y_c + 0.5 * h)]\n    return torch.stack(b, dim=-1)\n\n\ndef box_xyxy_to_cxcywh(x):\n    x0, y0, x1, y1 = x.unbind(-1)\n    b = [(x0 + x1) / 2, (y0 + y1) / 2,\n         (x1 - x0), (y1 - y0)]\n    return torch.stack(b, dim=-1)\n\n\n# modified from torchvision to also return the union\ndef box_iou(boxes1, boxes2):\n    area1 = box_area(boxes1)\n    area2 = box_area(boxes2)\n\n    # import ipdb; ipdb.set_trace()\n    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]\n    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]\n\n    wh = (rb - lt).clamp(min=0)  # [N,M,2]\n    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]\n\n    union = area1[:, None] + area2 - inter\n\n    iou = inter / (union + 1e-6)\n    return iou, union\n\n\ndef generalized_box_iou(boxes1, boxes2):\n    \"\"\"\n    Generalized IoU from https://giou.stanford.edu/\n\n    The boxes should be in [x0, y0, x1, y1] format\n\n    Returns a [N, M] pairwise matrix, where N = len(boxes1)\n    and M = len(boxes2)\n    \"\"\"\n    # degenerate boxes gives inf / nan results\n    # so do an early check\n    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()\n    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()\n    # except:\n    #     import ipdb; ipdb.set_trace()\n    iou, union = box_iou(boxes1, boxes2)\n\n    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])\n    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])\n\n    wh = (rb - lt).clamp(min=0)  # [N,M,2]\n    area = wh[:, :, 0] * wh[:, :, 1]\n\n    return iou - (area - union) / (area + 1e-6)\n\n\n\n# modified from torchvision to also return the union\ndef box_iou_pairwise(boxes1, boxes2):\n    area1 
= box_area(boxes1)\n    area2 = box_area(boxes2)\n\n    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]\n    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]\n\n    wh = (rb - lt).clamp(min=0)  # [N,2]\n    inter = wh[:, 0] * wh[:, 1]  # [N]\n\n    union = area1 + area2 - inter\n\n    iou = inter / union\n    return iou, union\n\n\ndef generalized_box_iou_pairwise(boxes1, boxes2):\n    \"\"\"\n    Generalized IoU from https://giou.stanford.edu/\n\n    Input:\n        - boxes1, boxes2: N,4\n    Output:\n        - giou: N, 4\n    \"\"\"\n    # degenerate boxes gives inf / nan results\n    # so do an early check\n    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()\n    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()\n    assert boxes1.shape == boxes2.shape\n    iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4\n\n    lt = torch.min(boxes1[:, :2], boxes2[:, :2])\n    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])\n\n    wh = (rb - lt).clamp(min=0)  # [N,2]\n    area = wh[:, 0] * wh[:, 1]\n\n    return iou - (area - union) / area\n\ndef masks_to_boxes(masks):\n    \"\"\"Compute the bounding boxes around the provided masks\n\n    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.\n\n    Returns a [N, 4] tensors, with the boxes in xyxy format\n    \"\"\"\n    if masks.numel() == 0:\n        return torch.zeros((0, 4), device=masks.device)\n\n    h, w = masks.shape[-2:]\n\n    y = torch.arange(0, h, dtype=torch.float)\n    x = torch.arange(0, w, dtype=torch.float)\n    y, x = torch.meshgrid(y, x)\n\n    x_mask = (masks * x.unsqueeze(0))\n    x_max = x_mask.flatten(1).max(-1)[0]\n    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]\n\n    y_mask = (masks * y.unsqueeze(0))\n    y_max = y_mask.flatten(1).max(-1)[0]\n    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]\n\n    return torch.stack([x_min, y_min, x_max, y_max], 1)\n\nif __name__ == '__main__':\n    x = 
torch.rand(5, 4)\n    y = torch.rand(3, 4)\n    iou, union = box_iou(x, y)\n    import ipdb; ipdb.set_trace()\n"
  },
  {
    "path": "src/utils/dependencies/XPose/util/config.py",
    "content": "# ==========================================================\n# Modified from mmcv\n# ==========================================================\nimport sys\nimport os.path as osp\nimport ast\nimport tempfile\nimport shutil\nfrom importlib import import_module\nfrom argparse import Action\n\nfrom .addict import Dict\n\nBASE_KEY = '_base_'\nDELETE_KEY = '_delete_'\nRESERVED_KEYS = ['filename', 'text', 'pretty_text', 'get', 'dump', 'merge_from_dict']\n\n\ndef check_file_exist(filename, msg_tmpl='file \"{}\" does not exist'):\n    if not osp.isfile(filename):\n        raise FileNotFoundError(msg_tmpl.format(filename))\n\nclass ConfigDict(Dict):\n\n    def __missing__(self, name):\n        raise KeyError(name)\n\n    def __getattr__(self, name):\n        try:\n            value = super(ConfigDict, self).__getattr__(name)\n        except KeyError:\n            ex = AttributeError(f\"'{self.__class__.__name__}' object has no \"\n                                f\"attribute '{name}'\")\n        except Exception as e:\n            ex = e\n        else:\n            return value\n        raise ex\n\n\nclass Config(object):\n    \"\"\"\n    config files.\n    only support .py file as config now.\n\n    ref: mmcv.utils.config\n\n    Example:\n        >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1])))\n        >>> cfg.a\n        1\n        >>> cfg.b\n        {'b1': [0, 1]}\n        >>> cfg.b.b1\n        [0, 1]\n        >>> cfg = Config.fromfile('tests/data/config/a.py')\n        >>> cfg.filename\n        \"/home/kchen/projects/mmcv/tests/data/config/a.py\"\n        >>> cfg.item4\n        'test'\n        >>> cfg\n        \"Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: \"\n        \"{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}\"\n    \"\"\"\n    @staticmethod\n    def _validate_py_syntax(filename):\n        with open(filename) as f:\n            content = f.read()\n        try:\n            ast.parse(content)\n        except 
SyntaxError:\n            raise SyntaxError('There are syntax errors in config '\n                              f'file {filename}')\n\n    @staticmethod\n    def _file2dict(filename):\n        filename = osp.abspath(osp.expanduser(filename))\n        check_file_exist(filename)\n        if filename.lower().endswith('.py'):\n            with tempfile.TemporaryDirectory() as temp_config_dir:\n                temp_config_file = tempfile.NamedTemporaryFile(\n                    dir=temp_config_dir, suffix='.py')\n                temp_config_name = osp.basename(temp_config_file.name)\n                # close temp file before copy\n                temp_config_file.close()\n                shutil.copyfile(filename,\n                                osp.join(temp_config_dir, temp_config_name))\n                temp_module_name = osp.splitext(temp_config_name)[0]\n                sys.path.insert(0, temp_config_dir)\n                Config._validate_py_syntax(filename)\n                mod = import_module(temp_module_name)\n                sys.path.pop(0)\n                cfg_dict = {\n                    name: value\n                    for name, value in mod.__dict__.items()\n                    if not name.startswith('__')\n                }\n                # delete imported module\n                del sys.modules[temp_module_name]\n                \n\n        elif filename.lower().endswith(('.yml', '.yaml', '.json')):\n            from .slio import slload\n            cfg_dict = slload(filename)\n        else:\n            raise IOError('Only py/yml/yaml/json type are supported now!')\n\n        cfg_text = filename + '\\n'\n        with open(filename, 'r') as f:\n            cfg_text += f.read()\n\n        # parse the base file\n        if BASE_KEY in cfg_dict:\n            cfg_dir = osp.dirname(filename)\n            base_filename = cfg_dict.pop(BASE_KEY)\n            base_filename = base_filename if isinstance(\n                base_filename, list) else 
[base_filename]\n\n            cfg_dict_list = list()\n            cfg_text_list = list()\n            for f in base_filename:\n                _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f))\n                cfg_dict_list.append(_cfg_dict)\n                cfg_text_list.append(_cfg_text)\n\n            base_cfg_dict = dict()\n            for c in cfg_dict_list:\n                if len(base_cfg_dict.keys() & c.keys()) > 0:\n                    raise KeyError('Duplicate key is not allowed among bases')\n                    # TODO Allow the duplicate key while warnning user\n                base_cfg_dict.update(c)\n\n            base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict)\n            cfg_dict = base_cfg_dict\n\n            # merge cfg_text\n            cfg_text_list.append(cfg_text)\n            cfg_text = '\\n'.join(cfg_text_list)\n\n        return cfg_dict, cfg_text\n\n    @staticmethod\n    def _merge_a_into_b(a, b):\n        \"\"\"merge dict `a` into dict `b` (non-inplace).\n            values in `a` will overwrite `b`.\n            copy first to avoid inplace modification\n            \n        Args:\n            a ([type]): [description]\n            b ([type]): [description]\n\n        Returns:\n            [dict]: [description]\n        \"\"\"\n        # import ipdb; ipdb.set_trace()\n        if not isinstance(a, dict):\n            return a\n\n        b = b.copy()\n        for k, v in a.items():\n            if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False):\n            \n                if not isinstance(b[k], dict) and not isinstance(b[k], list):\n                    # if :\n                    # import ipdb; ipdb.set_trace()\n                    raise TypeError(\n                        f'{k}={v} in child config cannot inherit from base '\n                        f'because {k} is a dict in the child config but is of '\n                        f'type {type(b[k])} in base config. 
You may set '\n                        f'`{DELETE_KEY}=True` to ignore the base config')\n                b[k] = Config._merge_a_into_b(v, b[k])\n            elif isinstance(b, list):\n                try:\n                    _ = int(k)\n                except:\n                    raise TypeError(\n                        f'b is a list, '\n                        f'index {k} should be an int when input but {type(k)}'\n                    )\n                b[int(k)] = Config._merge_a_into_b(v, b[int(k)])\n            else:   \n                b[k] = v\n                \n        return b\n\n    @staticmethod\n    def fromfile(filename):\n        cfg_dict, cfg_text = Config._file2dict(filename)\n        return Config(cfg_dict, cfg_text=cfg_text, filename=filename)\n\n\n    def __init__(self, cfg_dict=None, cfg_text=None, filename=None):\n        if cfg_dict is None:\n            cfg_dict = dict()\n        elif not isinstance(cfg_dict, dict):\n            raise TypeError('cfg_dict must be a dict, but '\n                            f'got {type(cfg_dict)}')\n        for key in cfg_dict:\n            if key in RESERVED_KEYS:\n                raise KeyError(f'{key} is reserved for config file')\n\n        super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict))\n        super(Config, self).__setattr__('_filename', filename)\n        if cfg_text:\n            text = cfg_text\n        elif filename:\n            with open(filename, 'r') as f:\n                text = f.read()\n        else:\n            text = ''\n        super(Config, self).__setattr__('_text', text)\n\n\n    @property\n    def filename(self):\n        return self._filename\n\n    @property\n    def text(self):\n        return self._text\n\n    @property\n    def pretty_text(self):\n\n        indent = 4\n\n        def _indent(s_, num_spaces):\n            s = s_.split('\\n')\n            if len(s) == 1:\n                return s_\n            first = s.pop(0)\n            s = [(num_spaces * ' ') 
+ line for line in s]\n            s = '\\n'.join(s)\n            s = first + '\\n' + s\n            return s\n\n        def _format_basic_types(k, v, use_mapping=False):\n            if isinstance(v, str):\n                v_str = f\"'{v}'\"\n            else:\n                v_str = str(v)\n\n            if use_mapping:\n                k_str = f\"'{k}'\" if isinstance(k, str) else str(k)\n                attr_str = f'{k_str}: {v_str}'\n            else:\n                attr_str = f'{str(k)}={v_str}'\n            attr_str = _indent(attr_str, indent)\n\n            return attr_str\n\n        def _format_list(k, v, use_mapping=False):\n            # check if all items in the list are dict\n            if all(isinstance(_, dict) for _ in v):\n                v_str = '[\\n'\n                v_str += '\\n'.join(\n                    f'dict({_indent(_format_dict(v_), indent)}),'\n                    for v_ in v).rstrip(',')\n                if use_mapping:\n                    k_str = f\"'{k}'\" if isinstance(k, str) else str(k)\n                    attr_str = f'{k_str}: {v_str}'\n                else:\n                    attr_str = f'{str(k)}={v_str}'\n                attr_str = _indent(attr_str, indent) + ']'\n            else:\n                attr_str = _format_basic_types(k, v, use_mapping)\n            return attr_str\n\n        def _contain_invalid_identifier(dict_str):\n            contain_invalid_identifier = False\n            for key_name in dict_str:\n                contain_invalid_identifier |= \\\n                    (not str(key_name).isidentifier())\n            return contain_invalid_identifier\n\n        def _format_dict(input_dict, outest_level=False):\n            r = ''\n            s = []\n\n            use_mapping = _contain_invalid_identifier(input_dict)\n            if use_mapping:\n                r += '{'\n            for idx, (k, v) in enumerate(input_dict.items()):\n                is_last = idx >= len(input_dict) - 1\n                
end = '' if outest_level or is_last else ','\n                if isinstance(v, dict):\n                    v_str = '\\n' + _format_dict(v)\n                    if use_mapping:\n                        k_str = f\"'{k}'\" if isinstance(k, str) else str(k)\n                        attr_str = f'{k_str}: dict({v_str}'\n                    else:\n                        attr_str = f'{str(k)}=dict({v_str}'\n                    attr_str = _indent(attr_str, indent) + ')' + end\n                elif isinstance(v, list):\n                    attr_str = _format_list(k, v, use_mapping) + end\n                else:\n                    attr_str = _format_basic_types(k, v, use_mapping) + end\n\n                s.append(attr_str)\n            r += '\\n'.join(s)\n            if use_mapping:\n                r += '}'\n            return r\n\n        cfg_dict = self._cfg_dict.to_dict()\n        text = _format_dict(cfg_dict, outest_level=True)\n        return text\n    \n\n    def __repr__(self):\n        return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}'\n\n    def __len__(self):\n        return len(self._cfg_dict)\n\n    def __getattr__(self, name):\n        # # debug\n        # print('+'*15)\n        # print('name=%s' % name)\n        # print(\"addr:\", id(self))\n        # # print('type(self):', type(self))\n        # print(self.__dict__)\n        # print('+'*15)\n        # if self.__dict__ == {}:\n        #     raise ValueError\n\n        return getattr(self._cfg_dict, name)\n\n    def __getitem__(self, name):\n        return self._cfg_dict.__getitem__(name)\n\n    def __setattr__(self, name, value):\n        if isinstance(value, dict):\n            value = ConfigDict(value)\n        self._cfg_dict.__setattr__(name, value)\n\n    def __setitem__(self, name, value):\n        if isinstance(value, dict):\n            value = ConfigDict(value)\n        self._cfg_dict.__setitem__(name, value)\n\n    def __iter__(self):\n        return iter(self._cfg_dict)\n\n    def 
dump(self, file=None):\n        # import ipdb; ipdb.set_trace()\n        if file is None:\n            return self.pretty_text\n        else:\n            with open(file, 'w') as f:\n                f.write(self.pretty_text)\n\n    def merge_from_dict(self, options):\n        \"\"\"Merge list into cfg_dict\n\n        Merge the dict parsed by MultipleKVAction into this cfg.\n\n        Examples:\n            >>> options = {'model.backbone.depth': 50,\n            ...            'model.backbone.with_cp':True}\n            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))\n            >>> cfg.merge_from_dict(options)\n            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')\n            >>> assert cfg_dict == dict(\n            ...     model=dict(backbone=dict(depth=50, with_cp=True)))\n\n        Args:\n            options (dict): dict of configs to merge from.\n        \"\"\"\n        option_cfg_dict = {}\n        for full_key, v in options.items():\n            d = option_cfg_dict\n            key_list = full_key.split('.')\n            for subkey in key_list[:-1]:\n                d.setdefault(subkey, ConfigDict())\n                d = d[subkey]\n            subkey = key_list[-1]\n            d[subkey] = v\n\n        cfg_dict = super(Config, self).__getattribute__('_cfg_dict')\n        super(Config, self).__setattr__(\n            '_cfg_dict', Config._merge_a_into_b(option_cfg_dict, cfg_dict))\n\n    # for multiprocess\n    def __setstate__(self, state):\n        self.__init__(state)\n\n\n    def copy(self):\n        return Config(self._cfg_dict.copy())\n\n    def deepcopy(self):\n        return Config(self._cfg_dict.deepcopy())\n\n\nclass DictAction(Action):\n    \"\"\"\n    argparse action to split an argument into KEY=VALUE form\n    on the first = and append to a dictionary. 
List options should\n    be passed as comma separated values, i.e KEY=V1,V2,V3\n    \"\"\"\n\n    @staticmethod\n    def _parse_int_float_bool(val):\n        try:\n            return int(val)\n        except ValueError:\n            pass\n        try:\n            return float(val)\n        except ValueError:\n            pass\n        if val.lower() in ['true', 'false']:\n            return True if val.lower() == 'true' else False\n        if val.lower() in ['none', 'null']:\n            return None\n        return val\n\n    def __call__(self, parser, namespace, values, option_string=None):\n        options = {}\n        for kv in values:\n            key, val = kv.split('=', maxsplit=1)\n            val = [self._parse_int_float_bool(v) for v in val.split(',')]\n            if len(val) == 1:\n                val = val[0]\n            options[key] = val\n        setattr(namespace, self.dest, options)\n\n"
  },
  {
    "path": "src/utils/dependencies/XPose/util/keypoint_ops.py",
    "content": "import torch, os\n\ndef keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor):\n    \"\"\"_summary_\n\n    Args:\n        keypoints (torch.Tensor): ..., 51\n    \"\"\"\n    res = torch.zeros_like(keypoints)\n    num_points = keypoints.shape[-1] // 3\n    Z = keypoints[..., :2*num_points]\n    V = keypoints[..., 2*num_points:]\n    res[...,0::3] = Z[..., 0::2]\n    res[...,1::3] = Z[..., 1::2]\n    res[...,2::3] = V[...]\n    return res\n\ndef keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor):\n    \"\"\"_summary_\n\n    Args:\n        keypoints (torch.Tensor): ..., 51\n    \"\"\"\n    res = torch.zeros_like(keypoints)\n    num_points = keypoints.shape[-1] // 3\n    res[...,0:2*num_points:2] = keypoints[..., 0::3]\n    res[...,1:2*num_points:2] = keypoints[..., 1::3]\n    res[...,2*num_points:] = keypoints[..., 2::3]\n    return res"
  },
  {
    "path": "src/utils/dependencies/XPose/util/misc.py",
    "content": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\nMisc functions, including distributed helpers.\n\nMostly copy-paste from torchvision references.\n\"\"\"\nimport functools\nimport io\nimport os\nimport random \nimport subprocess\nimport time\nfrom collections import OrderedDict, defaultdict, deque\nimport datetime\nimport pickle\nfrom typing import Optional, List\n\nimport json, time\nimport numpy as np\nimport torch\nimport torch.distributed as dist\nfrom torch import Tensor\n\nimport colorsys\n\n# needed due to empty tensor bug in pytorch and torchvision 0.5\nimport torchvision\n__torchvision_need_compat_flag = float(torchvision.__version__.split('.')[1]) < 7\nif __torchvision_need_compat_flag:\n    from torchvision.ops import _new_empty_tensor\n    from torchvision.ops.misc import _output_size\n\n\nclass SmoothedValue(object):\n    \"\"\"Track a series of values and provide access to smoothed values over a\n    window or the global series average.\n    \"\"\"\n\n    def __init__(self, window_size=20, fmt=None):\n        if fmt is None:\n            fmt = \"{median:.4f} ({global_avg:.4f})\"\n        self.deque = deque(maxlen=window_size)\n        self.total = 0.0\n        self.count = 0\n        self.fmt = fmt\n\n    def update(self, value, n=1):\n        self.deque.append(value)\n        self.count += n\n        self.total += value * n\n\n    def synchronize_between_processes(self):\n        \"\"\"\n        Warning: does not synchronize the deque!\n        \"\"\"\n        if not is_dist_avail_and_initialized():\n            return\n        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')\n        dist.barrier()\n        dist.all_reduce(t)\n        t = t.tolist()\n        self.count = int(t[0])\n        self.total = t[1]\n\n    @property\n    def median(self):\n        d = torch.tensor(list(self.deque))\n        if d.shape[0] == 0:\n            return 0\n        return 
d.median().item()\n\n    @property\n    def avg(self):\n        d = torch.tensor(list(self.deque), dtype=torch.float32)\n        return d.mean().item()\n\n    @property\n    def global_avg(self):\n        if os.environ.get(\"SHILONG_AMP\", None) == '1':\n            eps = 1e-4\n        else:\n            eps = 1e-6\n        return self.total / (self.count + eps)\n\n    @property\n    def max(self):\n        return max(self.deque)\n\n    @property\n    def value(self):\n        return self.deque[-1]\n\n    def __str__(self):\n        return self.fmt.format(\n            median=self.median,\n            avg=self.avg,\n            global_avg=self.global_avg,\n            max=self.max,\n            value=self.value)\n\n@functools.lru_cache()\ndef _get_global_gloo_group():\n    \"\"\"\n    Return a process group based on gloo backend, containing all the ranks\n    The result is cached.\n    \"\"\"\n\n    if dist.get_backend() == \"nccl\":\n        return dist.new_group(backend=\"gloo\")\n\n    return dist.group.WORLD\n\ndef all_gather_cpu(data):\n    \"\"\"\n    Run all_gather on arbitrary picklable data (not necessarily tensors)\n    Args:\n        data: any picklable object\n    Returns:\n        list[data]: list of data gathered from each rank\n    \"\"\"\n\n    world_size = get_world_size()\n    if world_size == 1:\n        return [data]\n\n    cpu_group = _get_global_gloo_group()\n\n    buffer = io.BytesIO()\n    torch.save(data, buffer)\n    data_view = buffer.getbuffer()\n    device = \"cuda\" if cpu_group is None else \"cpu\"\n    tensor = torch.ByteTensor(data_view).to(device)\n\n    # obtain Tensor size of each rank\n    local_size = torch.tensor([tensor.numel()], device=device, dtype=torch.long)\n    size_list = [torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size)]\n    if cpu_group is None:\n        dist.all_gather(size_list, local_size)\n    else:\n        print(\"gathering on cpu\")\n        dist.all_gather(size_list, local_size, 
group=cpu_group)\n    size_list = [int(size.item()) for size in size_list]\n    max_size = max(size_list)\n    assert isinstance(local_size.item(), int)\n    local_size = int(local_size.item())\n\n    # receiving Tensor from all ranks\n    # we pad the tensor because torch all_gather does not support\n    # gathering tensors of different shapes\n    tensor_list = []\n    for _ in size_list:\n        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device=device))\n    if local_size != max_size:\n        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device=device)\n        tensor = torch.cat((tensor, padding), dim=0)\n    if cpu_group is None:\n        dist.all_gather(tensor_list, tensor)\n    else:\n        dist.all_gather(tensor_list, tensor, group=cpu_group)\n\n    data_list = []\n    for size, tensor in zip(size_list, tensor_list):\n        tensor = torch.split(tensor, [size, max_size - size], dim=0)[0]\n        buffer = io.BytesIO(tensor.cpu().numpy())\n        obj = torch.load(buffer)\n        data_list.append(obj)\n\n    return data_list\n\n\ndef all_gather(data):\n    \"\"\"\n    Run all_gather on arbitrary picklable data (not necessarily tensors)\n    Args:\n        data: any picklable object\n    Returns:\n        list[data]: list of data gathered from each rank\n    \"\"\"\n\n    if os.getenv(\"CPU_REDUCE\") == \"1\":\n        return all_gather_cpu(data)\n\n\n\n    world_size = get_world_size()\n    if world_size == 1:\n        return [data]\n\n    # serialized to a Tensor\n    buffer = pickle.dumps(data)\n    storage = torch.ByteStorage.from_buffer(buffer)\n    tensor = torch.ByteTensor(storage).to(\"cuda\")\n\n    # obtain Tensor size of each rank\n    local_size = torch.tensor([tensor.numel()], device=\"cuda\")\n    size_list = [torch.tensor([0], device=\"cuda\") for _ in range(world_size)]\n    dist.all_gather(size_list, local_size)\n    size_list = [int(size.item()) for size in size_list]\n    max_size = 
max(size_list)\n\n    # receiving Tensor from all ranks\n    # we pad the tensor because torch all_gather does not support\n    # gathering tensors of different shapes\n    tensor_list = []\n    for _ in size_list:\n        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device=\"cuda\"))\n    if local_size != max_size:\n        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device=\"cuda\")\n        tensor = torch.cat((tensor, padding), dim=0)\n    dist.all_gather(tensor_list, tensor)\n\n    data_list = []\n    for size, tensor in zip(size_list, tensor_list):\n        buffer = tensor.cpu().numpy().tobytes()[:size]\n        data_list.append(pickle.loads(buffer))\n\n    return data_list\n\n\ndef reduce_dict(input_dict, average=True):\n    \"\"\"\n    Args:\n        input_dict (dict): all the values will be reduced\n        average (bool): whether to do average or sum\n    Reduce the values in the dictionary from all processes so that all processes\n    have the averaged results. 
Returns a dict with the same fields as\n    input_dict, after reduction.\n    \"\"\"\n    world_size = get_world_size()\n    if world_size < 2:\n        return input_dict\n    with torch.no_grad():\n        names = []\n        values = []\n        # sort the keys so that they are consistent across processes\n        for k in sorted(input_dict.keys()):\n            names.append(k)\n            values.append(input_dict[k])\n        values = torch.stack(values, dim=0)\n        dist.all_reduce(values)\n        if average:\n            values /= world_size\n        reduced_dict = {k: v for k, v in zip(names, values)}\n    return reduced_dict\n\n\nclass MetricLogger(object):\n    def __init__(self, delimiter=\"\\t\"):\n        self.meters = defaultdict(SmoothedValue)\n        self.delimiter = delimiter\n\n    def update(self, **kwargs):\n        for k, v in kwargs.items():\n            if isinstance(v, torch.Tensor):\n                v = v.item()\n            assert isinstance(v, (float, int))\n            self.meters[k].update(v)\n\n    def __getattr__(self, attr):\n        if attr in self.meters:\n            return self.meters[attr]\n        if attr in self.__dict__:\n            return self.__dict__[attr]\n        raise AttributeError(\"'{}' object has no attribute '{}'\".format(\n            type(self).__name__, attr))\n\n    def __str__(self):\n        loss_str = []\n        for name, meter in self.meters.items():\n            # print(name, str(meter))\n            # import ipdb;ipdb.set_trace()\n            if meter.count > 0:\n                loss_str.append(\n                    \"{}: {}\".format(name, str(meter))\n                )\n        return self.delimiter.join(loss_str)\n\n    def synchronize_between_processes(self):\n        for meter in self.meters.values():\n            meter.synchronize_between_processes()\n\n    def add_meter(self, name, meter):\n        self.meters[name] = meter\n\n    def log_every(self, iterable, print_freq, header=None, 
logger=None):\n        if logger is None:\n            print_func = print\n        else:\n            print_func = logger.info\n\n        i = 0\n        if not header:\n            header = ''\n        start_time = time.time()\n        end = time.time()\n        iter_time = SmoothedValue(fmt='{avg:.4f}')\n        data_time = SmoothedValue(fmt='{avg:.4f}')\n        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'\n        if torch.cuda.is_available():\n            log_msg = self.delimiter.join([\n                header,\n                '[{0' + space_fmt + '}/{1}]',\n                'eta: {eta}',\n                '{meters}',\n                'time: {time}',\n                'data: {data}',\n                'max mem: {memory:.0f}'\n            ])\n        else:\n            log_msg = self.delimiter.join([\n                header,\n                '[{0' + space_fmt + '}/{1}]',\n                'eta: {eta}',\n                '{meters}',\n                'time: {time}',\n                'data: {data}'\n            ])\n        MB = 1024.0 * 1024.0\n        for obj in iterable:\n            data_time.update(time.time() - end)\n            yield obj\n            # import ipdb; ipdb.set_trace()\n            iter_time.update(time.time() - end)\n            if i % print_freq == 0 or i == len(iterable) - 1:\n                eta_seconds = iter_time.global_avg * (len(iterable) - i)\n                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))\n                if torch.cuda.is_available():\n                    print_func(log_msg.format(\n                        i, len(iterable), eta=eta_string,\n                        meters=str(self),\n                        time=str(iter_time), data=str(data_time),\n                        memory=torch.cuda.max_memory_allocated() / MB))\n                else:\n                    print_func(log_msg.format(\n                        i, len(iterable), eta=eta_string,\n                        meters=str(self),\n              
          time=str(iter_time), data=str(data_time)))\n            i += 1\n            end = time.time()\n        total_time = time.time() - start_time\n        total_time_str = str(datetime.timedelta(seconds=int(total_time)))\n        print_func('{} Total time: {} ({:.4f} s / it)'.format(\n            header, total_time_str, total_time / len(iterable)))\n\n\ndef get_sha():\n    cwd = os.path.dirname(os.path.abspath(__file__))\n\n    def _run(command):\n        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()\n    sha = 'N/A'\n    diff = \"clean\"\n    branch = 'N/A'\n    try:\n        sha = _run(['git', 'rev-parse', 'HEAD'])\n        subprocess.check_output(['git', 'diff'], cwd=cwd)\n        diff = _run(['git', 'diff-index', 'HEAD'])\n        diff = \"has uncommited changes\" if diff else \"clean\"\n        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])\n    except Exception:\n        pass\n    message = f\"sha: {sha}, status: {diff}, branch: {branch}\"\n    return message\n\n\ndef collate_fn(batch):\n    # import ipdb; ipdb.set_trace()\n    batch = list(zip(*batch))\n    batch[0] = nested_tensor_from_tensor_list(batch[0])\n    return tuple(batch)\n\n\ndef _max_by_axis(the_list):\n    # type: (List[List[int]]) -> List[int]\n    maxes = the_list[0]\n    for sublist in the_list[1:]:\n        for index, item in enumerate(sublist):\n            maxes[index] = max(maxes[index], item)\n    return maxes\n\n\nclass NestedTensor(object):\n    def __init__(self, tensors, mask: Optional[Tensor]):\n        self.tensors = tensors\n        self.mask = mask\n        if mask == 'auto':\n            self.mask = torch.zeros_like(tensors).to(tensors.device)\n            if self.mask.dim() == 3:\n                self.mask = self.mask.sum(0).to(bool)\n            elif self.mask.dim() == 4:\n                self.mask = self.mask.sum(1).to(bool)\n            else:\n                raise ValueError(\"tensors dim must be 3 or 4 but 
{}({})\".format(self.tensors.dim(), self.tensors.shape))\n\n    def imgsize(self):\n        res = []\n        for i in range(self.tensors.shape[0]):\n            mask = self.mask[i]\n            maxH = (~mask).sum(0).max()\n            maxW = (~mask).sum(1).max()\n            res.append(torch.Tensor([maxH, maxW]))\n        return res\n\n    def to(self, device):\n        # type: (Device) -> NestedTensor # noqa\n        cast_tensor = self.tensors.to(device)\n        mask = self.mask\n        if mask is not None:\n            assert mask is not None\n            cast_mask = mask.to(device)\n        else:\n            cast_mask = None\n        return NestedTensor(cast_tensor, cast_mask)\n\n    def to_img_list_single(self, tensor, mask):\n        assert tensor.dim() == 3, \"dim of tensor should be 3 but {}\".format(tensor.dim())\n        maxH = (~mask).sum(0).max()\n        maxW = (~mask).sum(1).max()\n        img = tensor[:, :maxH, :maxW]\n        return img\n\n    def to_img_list(self):\n        \"\"\"remove the padding and convert to img list\n\n        Returns:\n            [type]: [description]\n        \"\"\"\n        if self.tensors.dim() == 3:\n            return self.to_img_list_single(self.tensors, self.mask)\n        else:\n            res = []\n            for i in range(self.tensors.shape[0]):\n                tensor_i = self.tensors[i]\n                mask_i = self.mask[i]\n                res.append(self.to_img_list_single(tensor_i, mask_i))\n            return res\n\n    @property\n    def device(self):\n        return self.tensors.device\n\n    def decompose(self):\n        return self.tensors, self.mask\n\n    def __repr__(self):\n        return str(self.tensors)\n\n    @property\n    def shape(self):\n        return {\n            'tensors.shape': self.tensors.shape,\n            'mask.shape': self.mask.shape\n        }\n\n\ndef nested_tensor_from_tensor_list(tensor_list: List[Tensor]):\n    # TODO make this more general\n    if tensor_list[0].ndim 
== 3:\n        if torchvision._is_tracing():\n            # nested_tensor_from_tensor_list() does not export well to ONNX\n            # call _onnx_nested_tensor_from_tensor_list() instead\n            return _onnx_nested_tensor_from_tensor_list(tensor_list)\n\n        # TODO make it support different-sized images\n        max_size = _max_by_axis([list(img.shape) for img in tensor_list])\n        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))\n        batch_shape = [len(tensor_list)] + max_size\n        b, c, h, w = batch_shape\n        dtype = tensor_list[0].dtype\n        device = tensor_list[0].device\n        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)\n        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)\n        for img, pad_img, m in zip(tensor_list, tensor, mask):\n            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)\n            m[: img.shape[1], :img.shape[2]] = False\n    else:\n        raise ValueError('not supported')\n    return NestedTensor(tensor, mask)\n\n\n# _onnx_nested_tensor_from_tensor_list() is an implementation of\n# nested_tensor_from_tensor_list() that is supported by ONNX tracing.\n@torch.jit.unused\ndef _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:\n    max_size = []\n    for i in range(tensor_list[0].dim()):\n        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)\n        max_size.append(max_size_i)\n    max_size = tuple(max_size)\n\n    # work around for\n    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)\n    # m[: img.shape[1], :img.shape[2]] = False\n    # which is not yet supported in onnx\n    padded_imgs = []\n    padded_masks = []\n    for img in tensor_list:\n        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]\n        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, 
padding[0]))\n        padded_imgs.append(padded_img)\n\n        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)\n        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), \"constant\", 1)\n        padded_masks.append(padded_mask.to(torch.bool))\n\n    tensor = torch.stack(padded_imgs)\n    mask = torch.stack(padded_masks)\n\n    return NestedTensor(tensor, mask=mask)\n\n\ndef setup_for_distributed(is_master):\n    \"\"\"\n    This function disables printing when not in master process\n    \"\"\"\n    import builtins as __builtin__\n    builtin_print = __builtin__.print\n\n    def print(*args, **kwargs):\n        force = kwargs.pop('force', False)\n        if is_master or force:\n            builtin_print(*args, **kwargs)\n\n    __builtin__.print = print\n\n\ndef is_dist_avail_and_initialized():\n    if not dist.is_available():\n        return False\n    if not dist.is_initialized():\n        return False\n    return True\n\n\ndef get_world_size():\n    if not is_dist_avail_and_initialized():\n        return 1\n    return dist.get_world_size()\n\n\ndef get_rank():\n    if not is_dist_avail_and_initialized():\n        return 0\n    return dist.get_rank()\n\n\ndef is_main_process():\n    return get_rank() == 0\n\n\ndef save_on_master(*args, **kwargs):\n    if is_main_process():\n        torch.save(*args, **kwargs)\n\ndef init_distributed_mode(args):\n    if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and \n        args.rank = int(os.environ[\"RANK\"])\n        args.world_size = int(os.environ['WORLD_SIZE'])\n        args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])\n\n        # launch by torch.distributed.launch\n        # Single node\n        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 1 --rank 0 ...\n        # Multi nodes\n        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 0 --dist-url 
'tcp://IP_OF_NODE0:FREEPORT' ...\n        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 1 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' ...\n        # args.rank = int(os.environ.get('OMPI_COMM_WORLD_RANK'))        \n        # local_world_size = int(os.environ['GPU_PER_NODE_COUNT'])\n        # args.world_size = args.world_size * local_world_size\n        # args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])\n        # args.rank = args.rank * local_world_size + args.local_rank\n        print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))\n        print(json.dumps(dict(os.environ), indent=2))\n    elif 'SLURM_PROCID' in os.environ:\n        args.rank = int(os.environ['SLURM_PROCID'])\n        args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID'])\n        args.world_size = int(os.environ['SLURM_NPROCS'])\n\n        if os.environ.get('HAND_DEFINE_DIST_URL', 0) == '1':\n            pass\n        else:\n            import util.hostlist as uh\n            nodenames = uh.parse_nodelist(os.environ['SLURM_JOB_NODELIST'])\n            gpu_ids = [int(node[3:]) for node in nodenames]\n            fixid = int(os.environ.get('FIX_DISTRIBUTED_PORT_NUMBER', 0))\n            # fixid += random.randint(0, 300)\n            port = str(3137 + int(min(gpu_ids)) + fixid)\n            args.dist_url = \"tcp://{ip}:{port}\".format(ip=uh.nodename_to_ip(nodenames[0]), port=port)\n\n        print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))\n\n\n    else:\n        print('Not using distributed mode')\n        args.distributed = False\n        args.world_size = 1\n        args.rank = 0\n        args.local_rank = 0\n        return\n\n    print(\"world_size:{} rank:{} local_rank:{}\".format(args.world_size, args.rank, args.local_rank))\n    args.distributed = True\n    
torch.cuda.set_device(args.local_rank)\n    args.dist_backend = 'nccl'\n    print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)\n\n    torch.distributed.init_process_group(\n        backend=args.dist_backend, \n        world_size=args.world_size, \n        rank=args.rank,\n        init_method=args.dist_url,\n    )\n\n    print(\"Before torch.distributed.barrier()\")\n    torch.distributed.barrier()\n    print(\"End torch.distributed.barrier()\")\n    setup_for_distributed(args.rank == 0)\n\n\n@torch.no_grad()\ndef accuracy(output, target, topk=(1,)):\n    \"\"\"Computes the precision@k for the specified values of k\"\"\"\n    if target.numel() == 0:\n        return [torch.zeros([], device=output.device)]\n    maxk = max(topk)\n    batch_size = target.size(0)\n\n    _, pred = output.topk(maxk, 1, True, True)\n    pred = pred.t()\n    correct = pred.eq(target.view(1, -1).expand_as(pred))\n\n    res = []\n    for k in topk:\n        correct_k = correct[:k].view(-1).float().sum(0)\n        res.append(correct_k.mul_(100.0 / batch_size))\n    return res\n\n@torch.no_grad()\ndef accuracy_onehot(pred, gt):\n    \"\"\"_summary_\n\n    Args:\n        pred (_type_): n, c\n        gt (_type_): n, c\n    \"\"\"\n    tp = ((pred - gt).abs().sum(-1) < 1e-4).float().sum()\n    acc = tp / gt.shape[0] * 100\n    return acc\n\n\n\n\n\ndef interpolate(input, size=None, scale_factor=None, mode=\"nearest\", align_corners=None):\n    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor\n    \"\"\"\n    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.\n    This will eventually be supported natively by PyTorch, and this\n    class can go away.\n    \"\"\"\n    if __torchvision_need_compat_flag < 0.7:\n        if input.numel() > 0:\n            return torch.nn.functional.interpolate(\n                input, size, scale_factor, mode, align_corners\n            )\n\n        output_shape = 
_output_size(2, input, size, scale_factor)\n        output_shape = list(input.shape[:-2]) + list(output_shape)\n        return _new_empty_tensor(input, output_shape)\n    else:\n        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)\n\n\n\nclass color_sys():\n    def __init__(self, num_colors) -> None:\n        self.num_colors = num_colors\n        colors=[]\n        for i in np.arange(0., 360., 360. / num_colors):\n            hue = i/360.\n            lightness = (50 + np.random.rand() * 10)/100.\n            saturation = (90 + np.random.rand() * 10)/100.\n            colors.append(tuple([int(j*255) for j in colorsys.hls_to_rgb(hue, lightness, saturation)]))\n        self.colors = colors\n\n    def __call__(self, idx):\n        return self.colors[idx]\n\ndef inverse_sigmoid(x, eps=1e-3):\n    x = x.clamp(min=0, max=1)\n    x1 = x.clamp(min=eps)\n    x2 = (1 - x).clamp(min=eps)\n    return torch.log(x1/x2)\n\ndef clean_state_dict(state_dict):\n    new_state_dict = OrderedDict()\n    for k, v in state_dict.items():\n        if k[:7] == 'module.':\n            k = k[7:]  # remove `module.`\n        new_state_dict[k] = v\n    return new_state_dict"
  },
  {
    "path": "src/utils/dependencies/insightface/__init__.py",
    "content": "# coding: utf-8\n# pylint: disable=wrong-import-position\n\"\"\"InsightFace: A Face Analysis Toolkit.\"\"\"\nfrom __future__ import absolute_import\n\ntry:\n    #import mxnet as mx\n    import onnxruntime\nexcept ImportError:\n    raise ImportError(\n        \"Unable to import dependency onnxruntime. \"\n    )\n\n__version__ = '0.7.3'\n\nfrom . import model_zoo\nfrom . import utils\nfrom . import app\nfrom . import data\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/app/__init__.py",
    "content": "from .face_analysis import *\n"
  },
  {
    "path": "src/utils/dependencies/insightface/app/common.py",
    "content": "import numpy as np\nfrom numpy.linalg import norm as l2norm\n#from easydict import EasyDict\n\nclass Face(dict):\n\n    def __init__(self, d=None, **kwargs):\n        if d is None:\n            d = {}\n        if kwargs:\n            d.update(**kwargs)\n        for k, v in d.items():\n            setattr(self, k, v)\n        # Class attributes\n        #for k in self.__class__.__dict__.keys():\n        #    if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'):\n        #        setattr(self, k, getattr(self, k))\n\n    def __setattr__(self, name, value):\n        if isinstance(value, (list, tuple)):\n            value = [self.__class__(x)\n                    if isinstance(x, dict) else x for x in value]\n        elif isinstance(value, dict) and not isinstance(value, self.__class__):\n            value = self.__class__(value)\n        super(Face, self).__setattr__(name, value)\n        super(Face, self).__setitem__(name, value)\n\n    __setitem__ = __setattr__\n\n    def __getattr__(self, name):\n        return None\n\n    @property\n    def embedding_norm(self):\n        if self.embedding is None:\n            return None\n        return l2norm(self.embedding)\n\n    @property \n    def normed_embedding(self):\n        if self.embedding is None:\n            return None\n        return self.embedding / self.embedding_norm\n\n    @property \n    def sex(self):\n        if self.gender is None:\n            return None\n        return 'M' if self.gender==1 else 'F'\n"
  },
  {
    "path": "src/utils/dependencies/insightface/app/face_analysis.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-05-04\n# @Function      :\n\n\nfrom __future__ import division\n\nimport glob\nimport os.path as osp\n\nimport numpy as np\nimport onnxruntime\nfrom numpy.linalg import norm\n\nfrom ..model_zoo import model_zoo\nfrom ..utils import ensure_available\nfrom .common import Face\n\n\nDEFAULT_MP_NAME = 'buffalo_l'\n__all__ = ['FaceAnalysis']\n\nclass FaceAnalysis:\n    def __init__(self, name=DEFAULT_MP_NAME, root='~/.insightface', allowed_modules=None, **kwargs):\n        onnxruntime.set_default_logger_severity(3)\n        self.models = {}\n        self.model_dir = ensure_available('models', name, root=root)\n        onnx_files = glob.glob(osp.join(self.model_dir, '*.onnx'))\n        onnx_files = sorted(onnx_files)\n        for onnx_file in onnx_files:\n            model = model_zoo.get_model(onnx_file, **kwargs)\n            if model is None:\n                print('model not recognized:', onnx_file)\n            elif allowed_modules is not None and model.taskname not in allowed_modules:\n                print('model ignore:', onnx_file, model.taskname)\n                del model\n            elif model.taskname not in self.models and (allowed_modules is None or model.taskname in allowed_modules):\n                # print('find model:', onnx_file, model.taskname, model.input_shape, model.input_mean, model.input_std)\n                self.models[model.taskname] = model\n            else:\n                print('duplicated model task type, ignore:', onnx_file, model.taskname)\n                del model\n        assert 'detection' in self.models\n        self.det_model = self.models['detection']\n\n\n    def prepare(self, ctx_id, det_thresh=0.5, det_size=(640, 640)):\n        self.det_thresh = det_thresh\n        assert det_size is not None\n        # print('set det-size:', det_size)\n        self.det_size = det_size\n        for taskname, model 
in self.models.items():\n            if taskname=='detection':\n                model.prepare(ctx_id, input_size=det_size, det_thresh=det_thresh)\n            else:\n                model.prepare(ctx_id)\n\n    def get(self, img, max_num=0):\n        bboxes, kpss = self.det_model.detect(img,\n                                             max_num=max_num,\n                                             metric='default')\n        if bboxes.shape[0] == 0:\n            return []\n        ret = []\n        for i in range(bboxes.shape[0]):\n            bbox = bboxes[i, 0:4]\n            det_score = bboxes[i, 4]\n            kps = None\n            if kpss is not None:\n                kps = kpss[i]\n            face = Face(bbox=bbox, kps=kps, det_score=det_score)\n            for taskname, model in self.models.items():\n                if taskname=='detection':\n                    continue\n                model.get(img, face)\n            ret.append(face)\n        return ret\n\n    def draw_on(self, img, faces):\n        import cv2\n        dimg = img.copy()\n        for i in range(len(faces)):\n            face = faces[i]\n            box = face.bbox.astype(np.int)\n            color = (0, 0, 255)\n            cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), color, 2)\n            if face.kps is not None:\n                kps = face.kps.astype(np.int)\n                #print(landmark.shape)\n                for l in range(kps.shape[0]):\n                    color = (0, 0, 255)\n                    if l == 0 or l == 3:\n                        color = (0, 255, 0)\n                    cv2.circle(dimg, (kps[l][0], kps[l][1]), 1, color,\n                               2)\n            if face.gender is not None and face.age is not None:\n                cv2.putText(dimg,'%s,%d'%(face.sex,face.age), (box[0]-1, box[1]-4),cv2.FONT_HERSHEY_COMPLEX,0.7,(0,255,0),1)\n\n            #for key, value in face.items():\n            #    if key.startswith('landmark_3d'):\n          
  #        print(key, value.shape)\n            #        print(value[0:10,:])\n            #        lmk = np.round(value).astype(np.int)\n            #        for l in range(lmk.shape[0]):\n            #            color = (255, 0, 0)\n            #            cv2.circle(dimg, (lmk[l][0], lmk[l][1]), 1, color,\n            #                       2)\n        return dimg\n"
  },
  {
    "path": "src/utils/dependencies/insightface/data/__init__.py",
    "content": "from .image import get_image\nfrom .pickle_object import get_object\n"
  },
  {
    "path": "src/utils/dependencies/insightface/data/image.py",
    "content": "import cv2\nimport os\nimport os.path as osp\nfrom pathlib import Path\n\nclass ImageCache:\n    data = {}\n\ndef get_image(name, to_rgb=False):\n    key = (name, to_rgb)\n    if key in ImageCache.data:\n        return ImageCache.data[key]\n    images_dir = osp.join(Path(__file__).parent.absolute(), 'images')\n    ext_names = ['.jpg', '.png', '.jpeg']\n    image_file = None\n    for ext_name in ext_names:\n        _image_file = osp.join(images_dir, \"%s%s\"%(name, ext_name))\n        if osp.exists(_image_file):\n            image_file = _image_file\n            break\n    assert image_file is not None, '%s not found'%name\n    img = cv2.imread(image_file)\n    if to_rgb:\n        img = img[:,:,::-1]\n    ImageCache.data[key] = img\n    return img\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/data/pickle_object.py",
    "content": "import cv2\nimport os\nimport os.path as osp\nfrom pathlib import Path\nimport pickle\n\ndef get_object(name):\n    objects_dir = osp.join(Path(__file__).parent.absolute(), 'objects')\n    if not name.endswith('.pkl'):\n        name = name+\".pkl\"\n    filepath = osp.join(objects_dir, name)\n    if not osp.exists(filepath):\n        return None\n    with open(filepath, 'rb') as f:\n        obj = pickle.load(f)\n    return obj\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/data/rec_builder.py",
    "content": "import pickle\nimport numpy as np\nimport os\nimport os.path as osp\nimport sys\nimport mxnet as mx\n\n\nclass RecBuilder():\n    def __init__(self, path, image_size=(112, 112)):\n        self.path = path\n        self.image_size = image_size\n        self.widx = 0\n        self.wlabel = 0\n        self.max_label = -1\n        assert not osp.exists(path), '%s exists' % path\n        os.makedirs(path)\n        self.writer = mx.recordio.MXIndexedRecordIO(os.path.join(path, 'train.idx'), \n                                                    os.path.join(path, 'train.rec'),\n                                                    'w')\n        self.meta = []\n\n    def add(self, imgs):\n        #!!! img should be BGR!!!!\n        #assert label >= 0\n        #assert label > self.last_label\n        assert len(imgs) > 0\n        label = self.wlabel\n        for img in imgs:\n            idx = self.widx\n            image_meta = {'image_index': idx, 'image_classes': [label]}\n            header = mx.recordio.IRHeader(0, label, idx, 0)\n            if isinstance(img, np.ndarray):\n                s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg')\n            else:\n                s = mx.recordio.pack(header, img)\n            self.writer.write_idx(idx, s)\n            self.meta.append(image_meta)\n            self.widx += 1\n        self.max_label = label\n        self.wlabel += 1\n\n\n    def add_image(self, img, label):\n        #!!! 
img should be BGR!!!!\n        #assert label >= 0\n        #assert label > self.last_label\n        idx = self.widx\n        header = mx.recordio.IRHeader(0, label, idx, 0)\n        if isinstance(label, list):\n            idlabel = label[0]\n        else:\n            idlabel = label\n        image_meta = {'image_index': idx, 'image_classes': [idlabel]}\n        if isinstance(img, np.ndarray):\n            s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg')\n        else:\n            s = mx.recordio.pack(header, img)\n        self.writer.write_idx(idx, s)\n        self.meta.append(image_meta)\n        self.widx += 1\n        self.max_label = max(self.max_label, idlabel)\n\n    def close(self):\n        with open(osp.join(self.path, 'train.meta'), 'wb') as pfile:\n            pickle.dump(self.meta, pfile, protocol=pickle.HIGHEST_PROTOCOL)\n        print('stat:', self.widx, self.wlabel)\n        with open(os.path.join(self.path, 'property'), 'w') as f:\n            f.write(\"%d,%d,%d\\n\" % (self.max_label+1, self.image_size[0], self.image_size[1]))\n            f.write(\"%d\\n\" % (self.widx))\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/__init__.py",
    "content": "from .model_zoo import get_model\nfrom .arcface_onnx import ArcFaceONNX\nfrom .retinaface import RetinaFace\nfrom .scrfd import SCRFD\nfrom .landmark import Landmark\nfrom .attribute import Attribute\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/arcface_onnx.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-05-04\n# @Function      : \n\nfrom __future__ import division\nimport numpy as np\nimport cv2\nimport onnx\nimport onnxruntime\nfrom ..utils import face_align\n\n__all__ = [\n    'ArcFaceONNX',\n]\n\n\nclass ArcFaceONNX:\n    def __init__(self, model_file=None, session=None):\n        assert model_file is not None\n        self.model_file = model_file\n        self.session = session\n        self.taskname = 'recognition'\n        find_sub = False\n        find_mul = False\n        model = onnx.load(self.model_file)\n        graph = model.graph\n        for nid, node in enumerate(graph.node[:8]):\n            #print(nid, node.name)\n            if node.name.startswith('Sub') or node.name.startswith('_minus'):\n                find_sub = True\n            if node.name.startswith('Mul') or node.name.startswith('_mul'):\n                find_mul = True\n        if find_sub and find_mul:\n            #mxnet arcface model\n            input_mean = 0.0\n            input_std = 1.0\n        else:\n            input_mean = 127.5\n            input_std = 127.5\n        self.input_mean = input_mean\n        self.input_std = input_std\n        #print('input mean and std:', self.input_mean, self.input_std)\n        if self.session is None:\n            self.session = onnxruntime.InferenceSession(self.model_file, None)\n        input_cfg = self.session.get_inputs()[0]\n        input_shape = input_cfg.shape\n        input_name = input_cfg.name\n        self.input_size = tuple(input_shape[2:4][::-1])\n        self.input_shape = input_shape\n        outputs = self.session.get_outputs()\n        output_names = []\n        for out in outputs:\n            output_names.append(out.name)\n        self.input_name = input_name\n        self.output_names = output_names\n        assert len(self.output_names)==1\n        self.output_shape = outputs[0].shape\n\n    
def prepare(self, ctx_id, **kwargs):\n        if ctx_id<0:\n            self.session.set_providers(['CPUExecutionProvider'])\n\n    def get(self, img, face):\n        aimg = face_align.norm_crop(img, landmark=face.kps, image_size=self.input_size[0])\n        face.embedding = self.get_feat(aimg).flatten()\n        return face.embedding\n\n    def compute_sim(self, feat1, feat2):\n        from numpy.linalg import norm\n        feat1 = feat1.ravel()\n        feat2 = feat2.ravel()\n        sim = np.dot(feat1, feat2) / (norm(feat1) * norm(feat2))\n        return sim\n\n    def get_feat(self, imgs):\n        if not isinstance(imgs, list):\n            imgs = [imgs]\n        input_size = self.input_size\n        \n        blob = cv2.dnn.blobFromImages(imgs, 1.0 / self.input_std, input_size,\n                                      (self.input_mean, self.input_mean, self.input_mean), swapRB=True)\n        net_out = self.session.run(self.output_names, {self.input_name: blob})[0]\n        return net_out\n\n    def forward(self, batch_data):\n        blob = (batch_data - self.input_mean) / self.input_std\n        net_out = self.session.run(self.output_names, {self.input_name: blob})[0]\n        return net_out\n\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/attribute.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-06-19\n# @Function      : \n\nfrom __future__ import division\nimport numpy as np\nimport cv2\nimport onnx\nimport onnxruntime\nfrom ..utils import face_align\n\n__all__ = [\n    'Attribute',\n]\n\n\nclass Attribute:\n    def __init__(self, model_file=None, session=None):\n        assert model_file is not None\n        self.model_file = model_file\n        self.session = session\n        find_sub = False\n        find_mul = False\n        model = onnx.load(self.model_file)\n        graph = model.graph\n        for nid, node in enumerate(graph.node[:8]):\n            #print(nid, node.name)\n            if node.name.startswith('Sub') or node.name.startswith('_minus'):\n                find_sub = True\n            if node.name.startswith('Mul') or node.name.startswith('_mul'):\n                find_mul = True\n            if nid<3 and node.name=='bn_data':\n                find_sub = True\n                find_mul = True\n        if find_sub and find_mul:\n            #mxnet arcface model\n            input_mean = 0.0\n            input_std = 1.0\n        else:\n            input_mean = 127.5\n            input_std = 128.0\n        self.input_mean = input_mean\n        self.input_std = input_std\n        #print('input mean and std:', model_file, self.input_mean, self.input_std)\n        if self.session is None:\n            self.session = onnxruntime.InferenceSession(self.model_file, None)\n        input_cfg = self.session.get_inputs()[0]\n        input_shape = input_cfg.shape\n        input_name = input_cfg.name\n        self.input_size = tuple(input_shape[2:4][::-1])\n        self.input_shape = input_shape\n        outputs = self.session.get_outputs()\n        output_names = []\n        for out in outputs:\n            output_names.append(out.name)\n        self.input_name = input_name\n        self.output_names = output_names\n        
assert len(self.output_names)==1\n        output_shape = outputs[0].shape\n        #print('init output_shape:', output_shape)\n        if output_shape[1]==3:\n            self.taskname = 'genderage'\n        else:\n            self.taskname = 'attribute_%d'%output_shape[1]\n\n    def prepare(self, ctx_id, **kwargs):\n        if ctx_id<0:\n            self.session.set_providers(['CPUExecutionProvider'])\n\n    def get(self, img, face):\n        bbox = face.bbox\n        w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])\n        center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2\n        rotate = 0\n        _scale = self.input_size[0]  / (max(w, h)*1.5)\n        #print('param:', img.shape, bbox, center, self.input_size, _scale, rotate)\n        aimg, M = face_align.transform(img, center, self.input_size[0], _scale, rotate)\n        input_size = tuple(aimg.shape[0:2][::-1])\n        #assert input_size==self.input_size\n        blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)\n        pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]\n        if self.taskname=='genderage':\n            assert len(pred)==3\n            gender = np.argmax(pred[:2])\n            age = int(np.round(pred[2]*100))\n            face['gender'] = gender\n            face['age'] = age\n            return gender, age\n        else:\n            return pred\n\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/inswapper.py",
    "content": "import time\nimport numpy as np\nimport onnxruntime\nimport cv2\nimport onnx\nfrom onnx import numpy_helper\nfrom ..utils import face_align\n\n\n\n\nclass INSwapper():\n    def __init__(self, model_file=None, session=None):\n        self.model_file = model_file\n        self.session = session\n        model = onnx.load(self.model_file)\n        graph = model.graph\n        self.emap = numpy_helper.to_array(graph.initializer[-1])\n        self.input_mean = 0.0\n        self.input_std = 255.0\n        #print('input mean and std:', model_file, self.input_mean, self.input_std)\n        if self.session is None:\n            self.session = onnxruntime.InferenceSession(self.model_file, None)\n        inputs = self.session.get_inputs()\n        self.input_names = []\n        for inp in inputs:\n            self.input_names.append(inp.name)\n        outputs = self.session.get_outputs()\n        output_names = []\n        for out in outputs:\n            output_names.append(out.name)\n        self.output_names = output_names\n        assert len(self.output_names)==1\n        output_shape = outputs[0].shape\n        input_cfg = inputs[0]\n        input_shape = input_cfg.shape\n        self.input_shape = input_shape\n        # print('inswapper-shape:', self.input_shape)\n        self.input_size = tuple(input_shape[2:4][::-1])\n\n    def forward(self, img, latent):\n        img = (img - self.input_mean) / self.input_std\n        pred = self.session.run(self.output_names, {self.input_names[0]: img, self.input_names[1]: latent})[0]\n        return pred\n\n    def get(self, img, target_face, source_face, paste_back=True):\n        face_mask = np.zeros((img.shape[0], img.shape[1]), np.uint8)\n        cv2.fillPoly(face_mask, np.array([target_face.landmark_2d_106[[1,9,10,11,12,13,14,15,16,2,3,4,5,6,7,8,0,24,23,22,21,20,19,18,32,31,30,29,28,27,26,25,17,101,105,104,103,51,49,48,43]].astype('int64')]), 1)\n        aimg, M = face_align.norm_crop2(img, target_face.kps, 
self.input_size[0])\n        blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size,\n                                      (self.input_mean, self.input_mean, self.input_mean), swapRB=True)\n        latent = source_face.normed_embedding.reshape((1,-1))\n        latent = np.dot(latent, self.emap)\n        latent /= np.linalg.norm(latent)\n        pred = self.session.run(self.output_names, {self.input_names[0]: blob, self.input_names[1]: latent})[0]\n        #print(latent.shape, latent.dtype, pred.shape)\n        img_fake = pred.transpose((0,2,3,1))[0]\n        bgr_fake = np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1]\n        if not paste_back:\n            return bgr_fake, M\n        else:\n            target_img = img\n            fake_diff = bgr_fake.astype(np.float32) - aimg.astype(np.float32)\n            fake_diff = np.abs(fake_diff).mean(axis=2)\n            fake_diff[:2,:] = 0\n            fake_diff[-2:,:] = 0\n            fake_diff[:,:2] = 0\n            fake_diff[:,-2:] = 0\n            IM = cv2.invertAffineTransform(M)\n            img_white = np.full((aimg.shape[0],aimg.shape[1]), 255, dtype=np.float32)\n            bgr_fake = cv2.warpAffine(bgr_fake, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)\n            img_white = cv2.warpAffine(img_white, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)\n            fake_diff = cv2.warpAffine(fake_diff, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)\n            img_white[img_white>20] = 255\n            fthresh = 10\n            fake_diff[fake_diff<fthresh] = 0\n            fake_diff[fake_diff>=fthresh] = 255\n            img_mask = img_white\n            mask_h_inds, mask_w_inds = np.where(img_mask==255)\n            mask_h = np.max(mask_h_inds) - np.min(mask_h_inds)\n            mask_w = np.max(mask_w_inds) - np.min(mask_w_inds)\n            mask_size = int(np.sqrt(mask_h*mask_w))\n            k = max(mask_size//10, 10)\n        
    #k = max(mask_size//20, 6)\n            #k = 6\n            kernel = np.ones((k,k),np.uint8)\n            img_mask = cv2.erode(img_mask,kernel,iterations = 1)\n            kernel = np.ones((2,2),np.uint8)\n            fake_diff = cv2.dilate(fake_diff,kernel,iterations = 1)\n\n            face_mask = cv2.erode(face_mask,np.ones((11,11),np.uint8),iterations = 1)\n            fake_diff[face_mask==1] = 255\n\n            k = max(mask_size//20, 5)\n            #k = 3\n            #k = 3\n            kernel_size = (k, k)\n            blur_size = tuple(2*i+1 for i in kernel_size)\n            img_mask = cv2.GaussianBlur(img_mask, blur_size, 0)\n            k = 5\n            kernel_size = (k, k)\n            blur_size = tuple(2*i+1 for i in kernel_size)\n            fake_diff = cv2.blur(fake_diff, (11,11), 0)\n            ##fake_diff = cv2.GaussianBlur(fake_diff, blur_size, 0)\n            # print('blur_size: ', blur_size)\n            # fake_diff = cv2.blur(fake_diff, (21, 21), 0) # blur_size\n            img_mask /= 255\n            fake_diff /= 255\n            # img_mask = fake_diff\n            img_mask = img_mask*fake_diff\n            img_mask = np.reshape(img_mask, [img_mask.shape[0],img_mask.shape[1],1])\n            fake_merged = img_mask * bgr_fake + (1-img_mask) * target_img.astype(np.float32)\n            fake_merged = fake_merged.astype(np.uint8)\n            return fake_merged\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/landmark.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-05-04\n# @Function      : \n\nfrom __future__ import division\nimport numpy as np\nimport cv2\nimport onnx\nimport onnxruntime\nfrom ..utils import face_align\nfrom ..utils import transform\nfrom ..data import get_object\n\n__all__ = [\n    'Landmark',\n]\n\n\nclass Landmark:\n    def __init__(self, model_file=None, session=None):\n        assert model_file is not None\n        self.model_file = model_file\n        self.session = session\n        find_sub = False\n        find_mul = False\n        model = onnx.load(self.model_file)\n        graph = model.graph\n        for nid, node in enumerate(graph.node[:8]):\n            #print(nid, node.name)\n            if node.name.startswith('Sub') or node.name.startswith('_minus'):\n                find_sub = True\n            if node.name.startswith('Mul') or node.name.startswith('_mul'):\n                find_mul = True\n            if nid<3 and node.name=='bn_data':\n                find_sub = True\n                find_mul = True\n        if find_sub and find_mul:\n            #mxnet arcface model\n            input_mean = 0.0\n            input_std = 1.0\n        else:\n            input_mean = 127.5\n            input_std = 128.0\n        self.input_mean = input_mean\n        self.input_std = input_std\n        #print('input mean and std:', model_file, self.input_mean, self.input_std)\n        if self.session is None:\n            self.session = onnxruntime.InferenceSession(self.model_file, None)\n        input_cfg = self.session.get_inputs()[0]\n        input_shape = input_cfg.shape\n        input_name = input_cfg.name\n        self.input_size = tuple(input_shape[2:4][::-1])\n        self.input_shape = input_shape\n        outputs = self.session.get_outputs()\n        output_names = []\n        for out in outputs:\n            output_names.append(out.name)\n        self.input_name = 
input_name\n        self.output_names = output_names\n        assert len(self.output_names)==1\n        output_shape = outputs[0].shape\n        self.require_pose = False\n        #print('init output_shape:', output_shape)\n        if output_shape[1]==3309:\n            self.lmk_dim = 3\n            self.lmk_num = 68\n            self.mean_lmk = get_object('meanshape_68.pkl')\n            self.require_pose = True\n        else:\n            self.lmk_dim = 2\n            self.lmk_num = output_shape[1]//self.lmk_dim\n        self.taskname = 'landmark_%dd_%d'%(self.lmk_dim, self.lmk_num)\n\n    def prepare(self, ctx_id, **kwargs):\n        if ctx_id<0:\n            self.session.set_providers(['CPUExecutionProvider'])\n\n    def get(self, img, face):\n        bbox = face.bbox\n        w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])\n        center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2\n        rotate = 0\n        _scale = self.input_size[0]  / (max(w, h)*1.5)\n        #print('param:', img.shape, bbox, center, self.input_size, _scale, rotate)\n        aimg, M = face_align.transform(img, center, self.input_size[0], _scale, rotate)\n        input_size = tuple(aimg.shape[0:2][::-1])\n        #assert input_size==self.input_size\n        blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)\n        pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]\n        if pred.shape[0] >= 3000:\n            pred = pred.reshape((-1, 3))\n        else:\n            pred = pred.reshape((-1, 2))\n        if self.lmk_num < pred.shape[0]:\n            pred = pred[self.lmk_num*-1:,:]\n        pred[:, 0:2] += 1\n        pred[:, 0:2] *= (self.input_size[0] // 2)\n        if pred.shape[1] == 3:\n            pred[:, 2] *= (self.input_size[0] // 2)\n\n        IM = cv2.invertAffineTransform(M)\n        pred = face_align.trans_points(pred, IM)\n        face[self.taskname] = pred\n      
  if self.require_pose:\n            P = transform.estimate_affine_matrix_3d23d(self.mean_lmk, pred)\n            s, R, t = transform.P2sRt(P)\n            rx, ry, rz = transform.matrix2angle(R)\n            pose = np.array( [rx, ry, rz], dtype=np.float32 )\n            face['pose'] = pose #pitch, yaw, roll\n        return pred\n\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/model_store.py",
    "content": "\"\"\"\nThis code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/model_store.py\n\"\"\"\nfrom __future__ import print_function\n\n__all__ = ['get_model_file']\nimport os\nimport zipfile\nimport glob\n\nfrom ..utils import download, check_sha1\n\n_model_sha1 = {\n    name: checksum\n    for checksum, name in [\n        ('95be21b58e29e9c1237f229dae534bd854009ce0', 'arcface_r100_v1'),\n        ('', 'arcface_mfn_v1'),\n        ('39fd1e087a2a2ed70a154ac01fecaa86c315d01b', 'retinaface_r50_v1'),\n        ('2c9de8116d1f448fd1d4661f90308faae34c990a', 'retinaface_mnet025_v1'),\n        ('0db1d07921d005e6c9a5b38e059452fc5645e5a4', 'retinaface_mnet025_v2'),\n        ('7dd8111652b7aac2490c5dcddeb268e53ac643e6', 'genderage_v1'),\n    ]\n}\n\nbase_repo_url = 'https://insightface.ai/files/'\n_url_format = '{repo_url}models/{file_name}.zip'\n\n\ndef short_hash(name):\n    if name not in _model_sha1:\n        raise ValueError(\n            'Pretrained model for {name} is not available.'.format(name=name))\n    return _model_sha1[name][:8]\n\n\ndef find_params_file(dir_path):\n    if not os.path.exists(dir_path):\n        return None\n    paths = glob.glob(\"%s/*.params\" % dir_path)\n    if len(paths) == 0:\n        return None\n    paths = sorted(paths)\n    return paths[-1]\n\n\ndef get_model_file(name, root=os.path.join('~', '.insightface', 'models')):\n    r\"\"\"Return location for the pretrained on local file system.\n\n    This function will download from online model zoo when model cannot be found or has mismatch.\n    The root directory will be created if it doesn't exist.\n\n    Parameters\n    ----------\n    name : str\n        Name of the model.\n    root : str, default '~/.mxnet/models'\n        Location for keeping the model parameters.\n\n    Returns\n    -------\n    file_path\n        Path to the requested pretrained model file.\n    \"\"\"\n\n    file_name = name\n    root = os.path.expanduser(root)\n    
dir_path = os.path.join(root, name)\n    file_path = find_params_file(dir_path)\n    #file_path = os.path.join(root, file_name + '.params')\n    sha1_hash = _model_sha1[name]\n    if file_path is not None:\n        if check_sha1(file_path, sha1_hash):\n            return file_path\n        else:\n            print(\n                'Mismatch in the content of model file detected. Downloading again.'\n            )\n    else:\n        print('Model file is not found. Downloading.')\n\n    if not os.path.exists(root):\n        os.makedirs(root)\n    if not os.path.exists(dir_path):\n        os.makedirs(dir_path)\n\n    zip_file_path = os.path.join(root, file_name + '.zip')\n    repo_url = base_repo_url\n    if repo_url[-1] != '/':\n        repo_url = repo_url + '/'\n    download(_url_format.format(repo_url=repo_url, file_name=file_name),\n             path=zip_file_path,\n             overwrite=True)\n    with zipfile.ZipFile(zip_file_path) as zf:\n        zf.extractall(dir_path)\n    os.remove(zip_file_path)\n    file_path = find_params_file(dir_path)\n\n    if check_sha1(file_path, sha1_hash):\n        return file_path\n    else:\n        raise ValueError(\n            'Downloaded file has different hash. Please try again.')\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/model_zoo.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-05-04\n# @Function      :\n\nimport os\nimport os.path as osp\nimport glob\nimport onnxruntime\nfrom .arcface_onnx import *\nfrom .retinaface import *\n#from .scrfd import *\nfrom .landmark import *\nfrom .attribute import Attribute\nfrom .inswapper import INSwapper\nfrom ..utils import download_onnx\n\n__all__ = ['get_model']\n\n\nclass PickableInferenceSession(onnxruntime.InferenceSession):\n    # This is a wrapper to make the current InferenceSession class pickable.\n    def __init__(self, model_path, **kwargs):\n        super().__init__(model_path, **kwargs)\n        self.model_path = model_path\n\n    def __getstate__(self):\n        return {'model_path': self.model_path}\n\n    def __setstate__(self, values):\n        model_path = values['model_path']\n        self.__init__(model_path)\n\nclass ModelRouter:\n    def __init__(self, onnx_file):\n        self.onnx_file = onnx_file\n\n    def get_model(self, **kwargs):\n        session = PickableInferenceSession(self.onnx_file, **kwargs)\n        # print(f'Applied providers: {session._providers}, with options: {session._provider_options}')\n        inputs = session.get_inputs()\n        input_cfg = inputs[0]\n        input_shape = input_cfg.shape\n        outputs = session.get_outputs()\n\n        if len(outputs)>=5:\n            return RetinaFace(model_file=self.onnx_file, session=session)\n        elif input_shape[2]==192 and input_shape[3]==192:\n            return Landmark(model_file=self.onnx_file, session=session)\n        elif input_shape[2]==96 and input_shape[3]==96:\n            return Attribute(model_file=self.onnx_file, session=session)\n        elif len(inputs)==2 and input_shape[2]==128 and input_shape[3]==128:\n            return INSwapper(model_file=self.onnx_file, session=session)\n        elif input_shape[2]==input_shape[3] and input_shape[2]>=112 and 
input_shape[2]%16==0:\n            return ArcFaceONNX(model_file=self.onnx_file, session=session)\n        else:\n            #raise RuntimeError('error on model routing')\n            return None\n\ndef find_onnx_file(dir_path):\n    if not os.path.exists(dir_path):\n        return None\n    paths = glob.glob(\"%s/*.onnx\" % dir_path)\n    if len(paths) == 0:\n        return None\n    paths = sorted(paths)\n    return paths[-1]\n\ndef get_default_providers():\n    return ['CUDAExecutionProvider', 'CoreMLExecutionProvider', 'CPUExecutionProvider']\n\ndef get_default_provider_options():\n    return None\n\ndef get_model(name, **kwargs):\n    root = kwargs.get('root', '~/.insightface')\n    root = os.path.expanduser(root)\n    model_root = osp.join(root, 'models')\n    allow_download = kwargs.get('download', False)\n    download_zip = kwargs.get('download_zip', False)\n    if not name.endswith('.onnx'):\n        model_dir = os.path.join(model_root, name)\n        model_file = find_onnx_file(model_dir)\n        if model_file is None:\n            return None\n    else:\n        model_file = name\n    if not osp.exists(model_file) and allow_download:\n        model_file = download_onnx('models', model_file, root=root, download_zip=download_zip)\n    assert osp.exists(model_file), 'model_file %s should exist'%model_file\n    assert osp.isfile(model_file), 'model_file %s should be a file'%model_file\n    router = ModelRouter(model_file)\n    providers = kwargs.get('providers', get_default_providers())\n    provider_options = kwargs.get('provider_options', get_default_provider_options())\n    model = router.get_model(providers=providers, provider_options=provider_options)\n    return model\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/retinaface.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-09-18\n# @Function      : \n\nfrom __future__ import division\nimport datetime\nimport numpy as np\nimport onnx\nimport onnxruntime\nimport os\nimport os.path as osp\nimport cv2\nimport sys\n\ndef softmax(z):\n    assert len(z.shape) == 2\n    s = np.max(z, axis=1)\n    s = s[:, np.newaxis] # necessary step to do broadcasting\n    e_x = np.exp(z - s)\n    div = np.sum(e_x, axis=1)\n    div = div[:, np.newaxis] # dito\n    return e_x / div\n\ndef distance2bbox(points, distance, max_shape=None):\n    \"\"\"Decode distance prediction to bounding box.\n\n    Args:\n        points (Tensor): Shape (n, 2), [x, y].\n        distance (Tensor): Distance from the given point to 4\n            boundaries (left, top, right, bottom).\n        max_shape (tuple): Shape of the image.\n\n    Returns:\n        Tensor: Decoded bboxes.\n    \"\"\"\n    x1 = points[:, 0] - distance[:, 0]\n    y1 = points[:, 1] - distance[:, 1]\n    x2 = points[:, 0] + distance[:, 2]\n    y2 = points[:, 1] + distance[:, 3]\n    if max_shape is not None:\n        x1 = x1.clamp(min=0, max=max_shape[1])\n        y1 = y1.clamp(min=0, max=max_shape[0])\n        x2 = x2.clamp(min=0, max=max_shape[1])\n        y2 = y2.clamp(min=0, max=max_shape[0])\n    return np.stack([x1, y1, x2, y2], axis=-1)\n\ndef distance2kps(points, distance, max_shape=None):\n    \"\"\"Decode distance prediction to bounding box.\n\n    Args:\n        points (Tensor): Shape (n, 2), [x, y].\n        distance (Tensor): Distance from the given point to 4\n            boundaries (left, top, right, bottom).\n        max_shape (tuple): Shape of the image.\n\n    Returns:\n        Tensor: Decoded bboxes.\n    \"\"\"\n    preds = []\n    for i in range(0, distance.shape[1], 2):\n        px = points[:, i%2] + distance[:, i]\n        py = points[:, i%2+1] + distance[:, i+1]\n        if max_shape is not None:\n            
px = px.clamp(min=0, max=max_shape[1])\n            py = py.clamp(min=0, max=max_shape[0])\n        preds.append(px)\n        preds.append(py)\n    return np.stack(preds, axis=-1)\n\nclass RetinaFace:\n    def __init__(self, model_file=None, session=None):\n        import onnxruntime\n        self.model_file = model_file\n        self.session = session\n        self.taskname = 'detection'\n        if self.session is None:\n            assert self.model_file is not None\n            assert osp.exists(self.model_file)\n            self.session = onnxruntime.InferenceSession(self.model_file, None)\n        self.center_cache = {}\n        self.nms_thresh = 0.4\n        self.det_thresh = 0.5\n        self._init_vars()\n\n    def _init_vars(self):\n        input_cfg = self.session.get_inputs()[0]\n        input_shape = input_cfg.shape\n        #print(input_shape)\n        if isinstance(input_shape[2], str):\n            self.input_size = None\n        else:\n            self.input_size = tuple(input_shape[2:4][::-1])\n        #print('image_size:', self.image_size)\n        input_name = input_cfg.name\n        self.input_shape = input_shape\n        outputs = self.session.get_outputs()\n        output_names = []\n        for o in outputs:\n            output_names.append(o.name)\n        self.input_name = input_name\n        self.output_names = output_names\n        self.input_mean = 127.5\n        self.input_std = 128.0\n        #print(self.output_names)\n        #assert len(outputs)==10 or len(outputs)==15\n        self.use_kps = False\n        self._anchor_ratio = 1.0\n        self._num_anchors = 1\n        if len(outputs)==6:\n            self.fmc = 3\n            self._feat_stride_fpn = [8, 16, 32]\n            self._num_anchors = 2\n        elif len(outputs)==9:\n            self.fmc = 3\n            self._feat_stride_fpn = [8, 16, 32]\n            self._num_anchors = 2\n            self.use_kps = True\n        elif len(outputs)==10:\n            self.fmc = 5\n      
      self._feat_stride_fpn = [8, 16, 32, 64, 128]\n            self._num_anchors = 1\n        elif len(outputs)==15:\n            self.fmc = 5\n            self._feat_stride_fpn = [8, 16, 32, 64, 128]\n            self._num_anchors = 1\n            self.use_kps = True\n\n    def prepare(self, ctx_id, **kwargs):\n        if ctx_id<0:\n            self.session.set_providers(['CPUExecutionProvider'])\n        nms_thresh = kwargs.get('nms_thresh', None)\n        if nms_thresh is not None:\n            self.nms_thresh = nms_thresh\n        det_thresh = kwargs.get('det_thresh', None)\n        if det_thresh is not None:\n            self.det_thresh = det_thresh\n        input_size = kwargs.get('input_size', None)\n        if input_size is not None:\n            if self.input_size is not None:\n                print('warning: det_size is already set in detection model, ignore')\n            else:\n                self.input_size = input_size\n\n    def forward(self, img, threshold):\n        scores_list = []\n        bboxes_list = []\n        kpss_list = []\n        input_size = tuple(img.shape[0:2][::-1])\n        blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)\n        net_outs = self.session.run(self.output_names, {self.input_name : blob})\n\n        input_height = blob.shape[2]\n        input_width = blob.shape[3]\n        fmc = self.fmc\n        for idx, stride in enumerate(self._feat_stride_fpn):\n            scores = net_outs[idx]\n            bbox_preds = net_outs[idx+fmc]\n            bbox_preds = bbox_preds * stride\n            if self.use_kps:\n                kps_preds = net_outs[idx+fmc*2] * stride\n            height = input_height // stride\n            width = input_width // stride\n            K = height * width\n            key = (height, width, stride)\n            if key in self.center_cache:\n                anchor_centers = self.center_cache[key]\n            else:\n 
               #solution-1, c style:\n                #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 )\n                #for i in range(height):\n                #    anchor_centers[i, :, 1] = i\n                #for i in range(width):\n                #    anchor_centers[:, i, 0] = i\n\n                #solution-2:\n                #ax = np.arange(width, dtype=np.float32)\n                #ay = np.arange(height, dtype=np.float32)\n                #xv, yv = np.meshgrid(np.arange(width), np.arange(height))\n                #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32)\n\n                #solution-3:\n                anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)\n                #print(anchor_centers.shape)\n\n                anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )\n                if self._num_anchors>1:\n                    anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )\n                if len(self.center_cache)<100:\n                    self.center_cache[key] = anchor_centers\n\n            pos_inds = np.where(scores>=threshold)[0]\n            bboxes = distance2bbox(anchor_centers, bbox_preds)\n            pos_scores = scores[pos_inds]\n            pos_bboxes = bboxes[pos_inds]\n            scores_list.append(pos_scores)\n            bboxes_list.append(pos_bboxes)\n            if self.use_kps:\n                kpss = distance2kps(anchor_centers, kps_preds)\n                #kpss = kps_preds\n                kpss = kpss.reshape( (kpss.shape[0], -1, 2) )\n                pos_kpss = kpss[pos_inds]\n                kpss_list.append(pos_kpss)\n        return scores_list, bboxes_list, kpss_list\n\n    def detect(self, img, input_size = None, max_num=0, metric='default'):\n        assert input_size is not None or self.input_size is not None\n        input_size = self.input_size if input_size is None else input_size\n            \n  
      im_ratio = float(img.shape[0]) / img.shape[1]\n        model_ratio = float(input_size[1]) / input_size[0]\n        if im_ratio>model_ratio:\n            new_height = input_size[1]\n            new_width = int(new_height / im_ratio)\n        else:\n            new_width = input_size[0]\n            new_height = int(new_width * im_ratio)\n        det_scale = float(new_height) / img.shape[0]\n        resized_img = cv2.resize(img, (new_width, new_height))\n        det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )\n        det_img[:new_height, :new_width, :] = resized_img\n\n        scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)\n\n        scores = np.vstack(scores_list)\n        scores_ravel = scores.ravel()\n        order = scores_ravel.argsort()[::-1]\n        bboxes = np.vstack(bboxes_list) / det_scale\n        if self.use_kps:\n            kpss = np.vstack(kpss_list) / det_scale\n        pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)\n        pre_det = pre_det[order, :]\n        keep = self.nms(pre_det)\n        det = pre_det[keep, :]\n        if self.use_kps:\n            kpss = kpss[order,:,:]\n            kpss = kpss[keep,:,:]\n        else:\n            kpss = None\n        if max_num > 0 and det.shape[0] > max_num:\n            area = (det[:, 2] - det[:, 0]) * (det[:, 3] -\n                                                    det[:, 1])\n            img_center = img.shape[0] // 2, img.shape[1] // 2\n            offsets = np.vstack([\n                (det[:, 0] + det[:, 2]) / 2 - img_center[1],\n                (det[:, 1] + det[:, 3]) / 2 - img_center[0]\n            ])\n            offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)\n            if metric=='max':\n                values = area\n            else:\n                values = area - offset_dist_squared * 2.0  # some extra weight on the centering\n            bindex = np.argsort(\n                values)[::-1]  # some 
extra weight on the centering\n            bindex = bindex[0:max_num]\n            det = det[bindex, :]\n            if kpss is not None:\n                kpss = kpss[bindex, :]\n        return det, kpss\n\n    def nms(self, dets):\n        thresh = self.nms_thresh\n        x1 = dets[:, 0]\n        y1 = dets[:, 1]\n        x2 = dets[:, 2]\n        y2 = dets[:, 3]\n        scores = dets[:, 4]\n\n        areas = (x2 - x1 + 1) * (y2 - y1 + 1)\n        order = scores.argsort()[::-1]\n\n        keep = []\n        while order.size > 0:\n            i = order[0]\n            keep.append(i)\n            xx1 = np.maximum(x1[i], x1[order[1:]])\n            yy1 = np.maximum(y1[i], y1[order[1:]])\n            xx2 = np.minimum(x2[i], x2[order[1:]])\n            yy2 = np.minimum(y2[i], y2[order[1:]])\n\n            w = np.maximum(0.0, xx2 - xx1 + 1)\n            h = np.maximum(0.0, yy2 - yy1 + 1)\n            inter = w * h\n            ovr = inter / (areas[i] + areas[order[1:]] - inter)\n\n            inds = np.where(ovr <= thresh)[0]\n            order = order[inds + 1]\n\n        return keep\n\ndef get_retinaface(name, download=False, root='~/.insightface/models', **kwargs):\n    if not download:\n        assert os.path.exists(name)\n        return RetinaFace(name)\n    else:\n        from .model_store import get_model_file\n        _file = get_model_file(\"retinaface_%s\" % name, root=root)\n        return retinaface(_file)\n\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/model_zoo/scrfd.py",
    "content": "# -*- coding: utf-8 -*-\n# @Organization  : insightface.ai\n# @Author        : Jia Guo\n# @Time          : 2021-05-04\n# @Function      : \n\nfrom __future__ import division\nimport datetime\nimport numpy as np\nimport onnx\nimport onnxruntime\nimport os\nimport os.path as osp\nimport cv2\nimport sys\n\ndef softmax(z):\n    assert len(z.shape) == 2\n    s = np.max(z, axis=1)\n    s = s[:, np.newaxis] # necessary step to do broadcasting\n    e_x = np.exp(z - s)\n    div = np.sum(e_x, axis=1)\n    div = div[:, np.newaxis] # dito\n    return e_x / div\n\ndef distance2bbox(points, distance, max_shape=None):\n    \"\"\"Decode distance prediction to bounding box.\n\n    Args:\n        points (Tensor): Shape (n, 2), [x, y].\n        distance (Tensor): Distance from the given point to 4\n            boundaries (left, top, right, bottom).\n        max_shape (tuple): Shape of the image.\n\n    Returns:\n        Tensor: Decoded bboxes.\n    \"\"\"\n    x1 = points[:, 0] - distance[:, 0]\n    y1 = points[:, 1] - distance[:, 1]\n    x2 = points[:, 0] + distance[:, 2]\n    y2 = points[:, 1] + distance[:, 3]\n    if max_shape is not None:\n        x1 = x1.clamp(min=0, max=max_shape[1])\n        y1 = y1.clamp(min=0, max=max_shape[0])\n        x2 = x2.clamp(min=0, max=max_shape[1])\n        y2 = y2.clamp(min=0, max=max_shape[0])\n    return np.stack([x1, y1, x2, y2], axis=-1)\n\ndef distance2kps(points, distance, max_shape=None):\n    \"\"\"Decode distance prediction to bounding box.\n\n    Args:\n        points (Tensor): Shape (n, 2), [x, y].\n        distance (Tensor): Distance from the given point to 4\n            boundaries (left, top, right, bottom).\n        max_shape (tuple): Shape of the image.\n\n    Returns:\n        Tensor: Decoded bboxes.\n    \"\"\"\n    preds = []\n    for i in range(0, distance.shape[1], 2):\n        px = points[:, i%2] + distance[:, i]\n        py = points[:, i%2+1] + distance[:, i+1]\n        if max_shape is not None:\n            
px = px.clamp(min=0, max=max_shape[1])\n            py = py.clamp(min=0, max=max_shape[0])\n        preds.append(px)\n        preds.append(py)\n    return np.stack(preds, axis=-1)\n\nclass SCRFD:\n    def __init__(self, model_file=None, session=None):\n        import onnxruntime\n        self.model_file = model_file\n        self.session = session\n        self.taskname = 'detection'\n        self.batched = False\n        if self.session is None:\n            assert self.model_file is not None\n            assert osp.exists(self.model_file)\n            self.session = onnxruntime.InferenceSession(self.model_file, None)\n        self.center_cache = {}\n        self.nms_thresh = 0.4\n        self.det_thresh = 0.5\n        self._init_vars()\n\n    def _init_vars(self):\n        input_cfg = self.session.get_inputs()[0]\n        input_shape = input_cfg.shape\n        #print(input_shape)\n        if isinstance(input_shape[2], str):\n            self.input_size = None\n        else:\n            self.input_size = tuple(input_shape[2:4][::-1])\n        #print('image_size:', self.image_size)\n        input_name = input_cfg.name\n        self.input_shape = input_shape\n        outputs = self.session.get_outputs()\n        if len(outputs[0].shape) == 3:\n            self.batched = True\n        output_names = []\n        for o in outputs:\n            output_names.append(o.name)\n        self.input_name = input_name\n        self.output_names = output_names\n        self.input_mean = 127.5\n        self.input_std = 128.0\n        #print(self.output_names)\n        #assert len(outputs)==10 or len(outputs)==15\n        self.use_kps = False\n        self._anchor_ratio = 1.0\n        self._num_anchors = 1\n        if len(outputs)==6:\n            self.fmc = 3\n            self._feat_stride_fpn = [8, 16, 32]\n            self._num_anchors = 2\n        elif len(outputs)==9:\n            self.fmc = 3\n            self._feat_stride_fpn = [8, 16, 32]\n            self._num_anchors = 
2\n            self.use_kps = True\n        elif len(outputs)==10:\n            self.fmc = 5\n            self._feat_stride_fpn = [8, 16, 32, 64, 128]\n            self._num_anchors = 1\n        elif len(outputs)==15:\n            self.fmc = 5\n            self._feat_stride_fpn = [8, 16, 32, 64, 128]\n            self._num_anchors = 1\n            self.use_kps = True\n\n    def prepare(self, ctx_id, **kwargs):\n        if ctx_id<0:\n            self.session.set_providers(['CPUExecutionProvider'])\n        nms_thresh = kwargs.get('nms_thresh', None)\n        if nms_thresh is not None:\n            self.nms_thresh = nms_thresh\n        det_thresh = kwargs.get('det_thresh', None)\n        if det_thresh is not None:\n            self.det_thresh = det_thresh\n        input_size = kwargs.get('input_size', None)\n        if input_size is not None:\n            if self.input_size is not None:\n                print('warning: det_size is already set in scrfd model, ignore')\n            else:\n                self.input_size = input_size\n\n    def forward(self, img, threshold):\n        scores_list = []\n        bboxes_list = []\n        kpss_list = []\n        input_size = tuple(img.shape[0:2][::-1])\n        blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)\n        net_outs = self.session.run(self.output_names, {self.input_name : blob})\n\n        input_height = blob.shape[2]\n        input_width = blob.shape[3]\n        fmc = self.fmc\n        for idx, stride in enumerate(self._feat_stride_fpn):\n            # If model support batch dim, take first output\n            if self.batched:\n                scores = net_outs[idx][0]\n                bbox_preds = net_outs[idx + fmc][0]\n                bbox_preds = bbox_preds * stride\n                if self.use_kps:\n                    kps_preds = net_outs[idx + fmc * 2][0] * stride\n            # If model doesn't support batching take output 
as is\n            else:\n                scores = net_outs[idx]\n                bbox_preds = net_outs[idx + fmc]\n                bbox_preds = bbox_preds * stride\n                if self.use_kps:\n                    kps_preds = net_outs[idx + fmc * 2] * stride\n\n            height = input_height // stride\n            width = input_width // stride\n            K = height * width\n            key = (height, width, stride)\n            if key in self.center_cache:\n                anchor_centers = self.center_cache[key]\n            else:\n                #solution-1, c style:\n                #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 )\n                #for i in range(height):\n                #    anchor_centers[i, :, 1] = i\n                #for i in range(width):\n                #    anchor_centers[:, i, 0] = i\n\n                #solution-2:\n                #ax = np.arange(width, dtype=np.float32)\n                #ay = np.arange(height, dtype=np.float32)\n                #xv, yv = np.meshgrid(np.arange(width), np.arange(height))\n                #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32)\n\n                #solution-3:\n                anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)\n                #print(anchor_centers.shape)\n\n                anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )\n                if self._num_anchors>1:\n                    anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )\n                if len(self.center_cache)<100:\n                    self.center_cache[key] = anchor_centers\n\n            pos_inds = np.where(scores>=threshold)[0]\n            bboxes = distance2bbox(anchor_centers, bbox_preds)\n            pos_scores = scores[pos_inds]\n            pos_bboxes = bboxes[pos_inds]\n            scores_list.append(pos_scores)\n            bboxes_list.append(pos_bboxes)\n            if 
self.use_kps:\n                kpss = distance2kps(anchor_centers, kps_preds)\n                #kpss = kps_preds\n                kpss = kpss.reshape( (kpss.shape[0], -1, 2) )\n                pos_kpss = kpss[pos_inds]\n                kpss_list.append(pos_kpss)\n        return scores_list, bboxes_list, kpss_list\n\n    def detect(self, img, input_size = None, max_num=0, metric='default'):\n        assert input_size is not None or self.input_size is not None\n        input_size = self.input_size if input_size is None else input_size\n            \n        im_ratio = float(img.shape[0]) / img.shape[1]\n        model_ratio = float(input_size[1]) / input_size[0]\n        if im_ratio>model_ratio:\n            new_height = input_size[1]\n            new_width = int(new_height / im_ratio)\n        else:\n            new_width = input_size[0]\n            new_height = int(new_width * im_ratio)\n        det_scale = float(new_height) / img.shape[0]\n        resized_img = cv2.resize(img, (new_width, new_height))\n        det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )\n        det_img[:new_height, :new_width, :] = resized_img\n\n        scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)\n\n        scores = np.vstack(scores_list)\n        scores_ravel = scores.ravel()\n        order = scores_ravel.argsort()[::-1]\n        bboxes = np.vstack(bboxes_list) / det_scale\n        if self.use_kps:\n            kpss = np.vstack(kpss_list) / det_scale\n        pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)\n        pre_det = pre_det[order, :]\n        keep = self.nms(pre_det)\n        det = pre_det[keep, :]\n        if self.use_kps:\n            kpss = kpss[order,:,:]\n            kpss = kpss[keep,:,:]\n        else:\n            kpss = None\n        if max_num > 0 and det.shape[0] > max_num:\n            area = (det[:, 2] - det[:, 0]) * (det[:, 3] -\n                                                    det[:, 
1])\n            img_center = img.shape[0] // 2, img.shape[1] // 2\n            offsets = np.vstack([\n                (det[:, 0] + det[:, 2]) / 2 - img_center[1],\n                (det[:, 1] + det[:, 3]) / 2 - img_center[0]\n            ])\n            offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)\n            if metric=='max':\n                values = area\n            else:\n                values = area - offset_dist_squared * 2.0  # some extra weight on the centering\n            bindex = np.argsort(\n                values)[::-1]  # some extra weight on the centering\n            bindex = bindex[0:max_num]\n            det = det[bindex, :]\n            if kpss is not None:\n                kpss = kpss[bindex, :]\n        return det, kpss\n\n    def nms(self, dets):\n        thresh = self.nms_thresh\n        x1 = dets[:, 0]\n        y1 = dets[:, 1]\n        x2 = dets[:, 2]\n        y2 = dets[:, 3]\n        scores = dets[:, 4]\n\n        areas = (x2 - x1 + 1) * (y2 - y1 + 1)\n        order = scores.argsort()[::-1]\n\n        keep = []\n        while order.size > 0:\n            i = order[0]\n            keep.append(i)\n            xx1 = np.maximum(x1[i], x1[order[1:]])\n            yy1 = np.maximum(y1[i], y1[order[1:]])\n            xx2 = np.minimum(x2[i], x2[order[1:]])\n            yy2 = np.minimum(y2[i], y2[order[1:]])\n\n            w = np.maximum(0.0, xx2 - xx1 + 1)\n            h = np.maximum(0.0, yy2 - yy1 + 1)\n            inter = w * h\n            ovr = inter / (areas[i] + areas[order[1:]] - inter)\n\n            inds = np.where(ovr <= thresh)[0]\n            order = order[inds + 1]\n\n        return keep\n\ndef get_scrfd(name, download=False, root='~/.insightface/models', **kwargs):\n    if not download:\n        assert os.path.exists(name)\n        return SCRFD(name)\n    else:\n        from .model_store import get_model_file\n        _file = get_model_file(\"scrfd_%s\" % name, root=root)\n        return SCRFD(_file)\n\n\ndef 
scrfd_2p5gkps(**kwargs):\n    return get_scrfd(\"2p5gkps\", download=True, **kwargs)\n\n\nif __name__ == '__main__':\n    import glob\n    detector = SCRFD(model_file='./det.onnx')\n    detector.prepare(-1)\n    img_paths = ['tests/data/t1.jpg']\n    for img_path in img_paths:\n        img = cv2.imread(img_path)\n\n        for _ in range(1):\n            ta = datetime.datetime.now()\n            #bboxes, kpss = detector.detect(img, 0.5, input_size = (640, 640))\n            bboxes, kpss = detector.detect(img, 0.5)\n            tb = datetime.datetime.now()\n            print('all cost:', (tb-ta).total_seconds()*1000)\n        print(img_path, bboxes.shape)\n        if kpss is not None:\n            print(kpss.shape)\n        for i in range(bboxes.shape[0]):\n            bbox = bboxes[i]\n            x1,y1,x2,y2,score = bbox.astype(np.int)\n            cv2.rectangle(img, (x1,y1)  , (x2,y2) , (255,0,0) , 2)\n            if kpss is not None:\n                kps = kpss[i]\n                for kp in kps:\n                    kp = kp.astype(np.int)\n                    cv2.circle(img, tuple(kp) , 1, (0,0,255) , 2)\n        filename = img_path.split('/')[-1]\n        print('output:', filename)\n        cv2.imwrite('./outputs/%s'%filename, img)\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/__init__.py",
    "content": "from __future__ import absolute_import\n\nfrom .storage import download, ensure_available, download_onnx\nfrom .filesystem import get_model_dir\nfrom .filesystem import makedirs, try_import_dali\nfrom .constant import *\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/constant.py",
    "content": "\nDEFAULT_MP_NAME = 'buffalo_l'\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/download.py",
    "content": "\"\"\"\nThis code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/download.py\n\"\"\"\nimport os\nimport hashlib\nimport requests\nfrom tqdm import tqdm\n\n\ndef check_sha1(filename, sha1_hash):\n    \"\"\"Check whether the sha1 hash of the file content matches the expected hash.\n    Parameters\n    ----------\n    filename : str\n        Path to the file.\n    sha1_hash : str\n        Expected sha1 hash in hexadecimal digits.\n    Returns\n    -------\n    bool\n        Whether the file content matches the expected hash.\n    \"\"\"\n    sha1 = hashlib.sha1()\n    with open(filename, 'rb') as f:\n        while True:\n            data = f.read(1048576)\n            if not data:\n                break\n            sha1.update(data)\n\n    sha1_file = sha1.hexdigest()\n    l = min(len(sha1_file), len(sha1_hash))\n    return sha1.hexdigest()[0:l] == sha1_hash[0:l]\n\n\ndef download_file(url, path=None, overwrite=False, sha1_hash=None):\n    \"\"\"Download an given URL\n    Parameters\n    ----------\n    url : str\n        URL to download\n    path : str, optional\n        Destination path to store downloaded file. By default stores to the\n        current directory with same name as in url.\n    overwrite : bool, optional\n        Whether to overwrite destination file if already exists.\n    sha1_hash : str, optional\n        Expected sha1 hash in hexadecimal digits. 
Will ignore existing file when hash is specified\n        but doesn't match.\n    Returns\n    -------\n    str\n        The file path of the downloaded file.\n    \"\"\"\n    if path is None:\n        fname = url.split('/')[-1]\n    else:\n        path = os.path.expanduser(path)\n        if os.path.isdir(path):\n            fname = os.path.join(path, url.split('/')[-1])\n        else:\n            fname = path\n\n    if overwrite or not os.path.exists(fname) or (\n            sha1_hash and not check_sha1(fname, sha1_hash)):\n        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))\n        if not os.path.exists(dirname):\n            os.makedirs(dirname)\n\n        print('Downloading %s from %s...' % (fname, url))\n        r = requests.get(url, stream=True)\n        if r.status_code != 200:\n            raise RuntimeError(\"Failed downloading url %s\" % url)\n        total_length = r.headers.get('content-length')\n        with open(fname, 'wb') as f:\n            if total_length is None:  # no content length header\n                for chunk in r.iter_content(chunk_size=1024):\n                    if chunk:  # filter out keep-alive new chunks\n                        f.write(chunk)\n            else:\n                total_length = int(total_length)\n                for chunk in tqdm(r.iter_content(chunk_size=1024),\n                                  total=int(total_length / 1024. + 0.5),\n                                  unit='KB',\n                                  unit_scale=False,\n                                  dynamic_ncols=True):\n                    f.write(chunk)\n\n        if sha1_hash and not check_sha1(fname, sha1_hash):\n            raise UserWarning('File {} is downloaded but the content hash does not match. ' \\\n                              'The repo may be outdated or download may be incomplete. 
' \\\n                              'If the \"repo_url\" is overridden, consider switching to ' \\\n                              'the default repo.'.format(fname))\n\n    return fname\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/face_align.py",
    "content": "import cv2\nimport numpy as np\nfrom skimage import transform as trans\n\n\narcface_dst = np.array(\n    [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],\n     [41.5493, 92.3655], [70.7299, 92.2041]],\n    dtype=np.float32)\n\ndef estimate_norm(lmk, image_size=112,mode='arcface'):\n    assert lmk.shape == (5, 2)\n    assert image_size%112==0 or image_size%128==0\n    if image_size%112==0:\n        ratio = float(image_size)/112.0\n        diff_x = 0\n    else:\n        ratio = float(image_size)/128.0\n        diff_x = 8.0*ratio\n    dst = arcface_dst * ratio\n    dst[:,0] += diff_x\n    tform = trans.SimilarityTransform()\n    tform.estimate(lmk, dst)\n    M = tform.params[0:2, :]\n    return M\n\ndef norm_crop(img, landmark, image_size=112, mode='arcface'):\n    M = estimate_norm(landmark, image_size, mode)\n    warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)\n    return warped\n\ndef norm_crop2(img, landmark, image_size=112, mode='arcface'):\n    M = estimate_norm(landmark, image_size, mode)\n    warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)\n    return warped, M\n\ndef square_crop(im, S):\n    if im.shape[0] > im.shape[1]:\n        height = S\n        width = int(float(im.shape[1]) / im.shape[0] * S)\n        scale = float(S) / im.shape[0]\n    else:\n        width = S\n        height = int(float(im.shape[0]) / im.shape[1] * S)\n        scale = float(S) / im.shape[1]\n    resized_im = cv2.resize(im, (width, height))\n    det_im = np.zeros((S, S, 3), dtype=np.uint8)\n    det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im\n    return det_im, scale\n\n\ndef transform(data, center, output_size, scale, rotation):\n    scale_ratio = scale\n    rot = float(rotation) * np.pi / 180.0\n    #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio)\n    t1 = trans.SimilarityTransform(scale=scale_ratio)\n    cx = center[0] * scale_ratio\n    cy = 
center[1] * scale_ratio\n    t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))\n    t3 = trans.SimilarityTransform(rotation=rot)\n    t4 = trans.SimilarityTransform(translation=(output_size / 2,\n                                                output_size / 2))\n    t = t1 + t2 + t3 + t4\n    M = t.params[0:2]\n    cropped = cv2.warpAffine(data,\n                             M, (output_size, output_size),\n                             borderValue=0.0)\n    return cropped, M\n\n\ndef trans_points2d(pts, M):\n    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)\n    for i in range(pts.shape[0]):\n        pt = pts[i]\n        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)\n        new_pt = np.dot(M, new_pt)\n        #print('new_pt', new_pt.shape, new_pt)\n        new_pts[i] = new_pt[0:2]\n\n    return new_pts\n\n\ndef trans_points3d(pts, M):\n    scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])\n    #print(scale)\n    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)\n    for i in range(pts.shape[0]):\n        pt = pts[i]\n        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)\n        new_pt = np.dot(M, new_pt)\n        #print('new_pt', new_pt.shape, new_pt)\n        new_pts[i][0:2] = new_pt[0:2]\n        new_pts[i][2] = pts[i][2] * scale\n\n    return new_pts\n\n\ndef trans_points(pts, M):\n    if pts.shape[1] == 2:\n        return trans_points2d(pts, M)\n    else:\n        return trans_points3d(pts, M)\n\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/filesystem.py",
    "content": "\"\"\"\nThis code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/filesystem.py\n\"\"\"\nimport os\nimport os.path as osp\nimport errno\n\n\ndef get_model_dir(name, root='~/.insightface'):\n    root = os.path.expanduser(root)\n    model_dir = osp.join(root, 'models', name)\n    return model_dir\n\ndef makedirs(path):\n    \"\"\"Create directory recursively if not exists.\n    Similar to `makedir -p`, you can skip checking existence before this function.\n\n    Parameters\n    ----------\n    path : str\n        Path of the desired dir\n    \"\"\"\n    try:\n        os.makedirs(path)\n    except OSError as exc:\n        if exc.errno != errno.EEXIST:\n            raise\n\n\ndef try_import(package, message=None):\n    \"\"\"Try import specified package, with custom message support.\n\n    Parameters\n    ----------\n    package : str\n        The name of the targeting package.\n    message : str, default is None\n        If not None, this function will raise customized error message when import error is found.\n\n\n    Returns\n    -------\n    module if found, raise ImportError otherwise\n\n    \"\"\"\n    try:\n        return __import__(package)\n    except ImportError as e:\n        if not message:\n            raise e\n        raise ImportError(message)\n\n\ndef try_import_cv2():\n    \"\"\"Try import cv2 at runtime.\n\n    Returns\n    -------\n    cv2 module if found. Raise ImportError otherwise\n\n    \"\"\"\n    msg = \"cv2 is required, you can install by package manager, e.g. 'apt-get', \\\n        or `pip install opencv-python --user` (note that this is unofficial PYPI package).\"\n\n    return try_import('cv2', msg)\n\n\ndef try_import_mmcv():\n    \"\"\"Try import mmcv at runtime.\n\n    Returns\n    -------\n    mmcv module if found. 
Raise ImportError otherwise\n\n    \"\"\"\n    msg = \"mmcv is required, you can install by first `pip install Cython --user` \\\n        and then `pip install mmcv --user` (note that this is unofficial PYPI package).\"\n\n    return try_import('mmcv', msg)\n\n\ndef try_import_rarfile():\n    \"\"\"Try import rarfile at runtime.\n\n    Returns\n    -------\n    rarfile module if found. Raise ImportError otherwise\n\n    \"\"\"\n    msg = \"rarfile is required, you can install by first `sudo apt-get install unrar` \\\n        and then `pip install rarfile --user` (note that this is unofficial PYPI package).\"\n\n    return try_import('rarfile', msg)\n\n\ndef import_try_install(package, extern_url=None):\n    \"\"\"Try import the specified package.\n    If the package not installed, try use pip to install and import if success.\n\n    Parameters\n    ----------\n    package : str\n        The name of the package trying to import.\n    extern_url : str or None, optional\n        The external url if package is not hosted on PyPI.\n        For example, you can install a package using:\n         \"pip install git+http://github.com/user/repo/tarball/master/egginfo=xxx\".\n        In this case, you can pass the url to the extern_url.\n\n    Returns\n    -------\n    <class 'Module'>\n        The imported python module.\n\n    \"\"\"\n    try:\n        return __import__(package)\n    except ImportError:\n        try:\n            from pip import main as pipmain\n        except ImportError:\n            from pip._internal import main as pipmain\n\n        # trying to install package\n        url = package if extern_url is None else extern_url\n        pipmain(['install', '--user',\n                 url])  # will raise SystemExit Error if fails\n\n        # trying to load again\n        try:\n            return __import__(package)\n        except ImportError:\n            import sys\n            import site\n            user_site = site.getusersitepackages()\n            if 
user_site not in sys.path:\n                sys.path.append(user_site)\n            return __import__(package)\n    return __import__(package)\n\n\ndef try_import_dali():\n    \"\"\"Try import NVIDIA DALI at runtime.\n    \"\"\"\n    try:\n        dali = __import__('nvidia.dali', fromlist=['pipeline', 'ops', 'types'])\n        dali.Pipeline = dali.pipeline.Pipeline\n    except ImportError:\n\n        class dali:\n            class Pipeline:\n                def __init__(self):\n                    raise NotImplementedError(\n                        \"DALI not found, please check if you installed it correctly.\"\n                    )\n\n    return dali\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/storage.py",
    "content": "\nimport os\nimport os.path as osp\nimport zipfile\nfrom .download import download_file\n\nBASE_REPO_URL = 'https://github.com/deepinsight/insightface/releases/download/v0.7'\n\ndef download(sub_dir, name, force=False, root='~/.insightface'):\n    _root = os.path.expanduser(root)\n    dir_path = os.path.join(_root, sub_dir, name)\n    if osp.exists(dir_path) and not force:\n        return dir_path\n    print('download_path:', dir_path)\n    zip_file_path = os.path.join(_root, sub_dir, name + '.zip')\n    model_url = \"%s/%s.zip\"%(BASE_REPO_URL, name)\n    download_file(model_url,\n             path=zip_file_path,\n             overwrite=True)\n    if not os.path.exists(dir_path):\n        os.makedirs(dir_path)\n    with zipfile.ZipFile(zip_file_path) as zf:\n        zf.extractall(dir_path)\n    #os.remove(zip_file_path)\n    return dir_path\n\ndef ensure_available(sub_dir, name, root='~/.insightface'):\n    return download(sub_dir, name, force=False, root=root)\n\ndef download_onnx(sub_dir, model_file, force=False, root='~/.insightface', download_zip=False):\n    _root = os.path.expanduser(root)\n    model_root = osp.join(_root, sub_dir)\n    new_model_file = osp.join(model_root, model_file)\n    if osp.exists(new_model_file) and not force:\n        return new_model_file\n    if not osp.exists(model_root):\n        os.makedirs(model_root)\n    print('download_path:', new_model_file)\n    if not download_zip:\n        model_url = \"%s/%s\"%(BASE_REPO_URL, model_file)\n        download_file(model_url,\n                 path=new_model_file,\n                 overwrite=True)\n    else:\n        model_url = \"%s/%s.zip\"%(BASE_REPO_URL, model_file)\n        zip_file_path = new_model_file+\".zip\"\n        download_file(model_url,\n                 path=zip_file_path,\n                 overwrite=True)\n        with zipfile.ZipFile(zip_file_path) as zf:\n            zf.extractall(model_root)\n        return new_model_file\n"
  },
  {
    "path": "src/utils/dependencies/insightface/utils/transform.py",
    "content": "import cv2\nimport math\nimport numpy as np\nfrom skimage import transform as trans\n\n\ndef transform(data, center, output_size, scale, rotation):\n    scale_ratio = scale\n    rot = float(rotation) * np.pi / 180.0\n    #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio)\n    t1 = trans.SimilarityTransform(scale=scale_ratio)\n    cx = center[0] * scale_ratio\n    cy = center[1] * scale_ratio\n    t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))\n    t3 = trans.SimilarityTransform(rotation=rot)\n    t4 = trans.SimilarityTransform(translation=(output_size / 2,\n                                                output_size / 2))\n    t = t1 + t2 + t3 + t4\n    M = t.params[0:2]\n    cropped = cv2.warpAffine(data,\n                             M, (output_size, output_size),\n                             borderValue=0.0)\n    return cropped, M\n\n\ndef trans_points2d(pts, M):\n    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)\n    for i in range(pts.shape[0]):\n        pt = pts[i]\n        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)\n        new_pt = np.dot(M, new_pt)\n        #print('new_pt', new_pt.shape, new_pt)\n        new_pts[i] = new_pt[0:2]\n\n    return new_pts\n\n\ndef trans_points3d(pts, M):\n    scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])\n    #print(scale)\n    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)\n    for i in range(pts.shape[0]):\n        pt = pts[i]\n        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)\n        new_pt = np.dot(M, new_pt)\n        #print('new_pt', new_pt.shape, new_pt)\n        new_pts[i][0:2] = new_pt[0:2]\n        new_pts[i][2] = pts[i][2] * scale\n\n    return new_pts\n\n\ndef trans_points(pts, M):\n    if pts.shape[1] == 2:\n        return trans_points2d(pts, M)\n    else:\n        return trans_points3d(pts, M)\n\ndef estimate_affine_matrix_3d23d(X, Y):\n    ''' Using least-squares solution \n    Args:\n        
X: [n, 3]. 3d points(fixed)\n        Y: [n, 3]. corresponding 3d points(moving). Y = PX\n    Returns:\n        P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]).\n    '''\n    X_homo = np.hstack((X, np.ones([X.shape[0],1]))) #n x 4\n    P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4\n    return P\n\ndef P2sRt(P):\n    ''' decomposing camera matrix P\n    Args: \n        P: (3, 4). Affine Camera Matrix.\n    Returns:\n        s: scale factor.\n        R: (3, 3). rotation matrix.\n        t: (3,). translation. \n    '''\n    t = P[:, 3]\n    R1 = P[0:1, :3]\n    R2 = P[1:2, :3]\n    s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2.0\n    r1 = R1/np.linalg.norm(R1)\n    r2 = R2/np.linalg.norm(R2)\n    r3 = np.cross(r1, r2)\n\n    R = np.concatenate((r1, r2, r3), 0)\n    return s, R, t\n\ndef matrix2angle(R):\n    ''' get three Euler angles from Rotation Matrix\n    Args:\n        R: (3,3). rotation matrix\n    Returns:\n        x: pitch\n        y: yaw\n        z: roll\n    '''\n    sy = math.sqrt(R[0,0] * R[0,0] +  R[1,0] * R[1,0])\n     \n    singular = sy < 1e-6\n \n    if  not singular :\n        x = math.atan2(R[2,1] , R[2,2])\n        y = math.atan2(-R[2,0], sy)\n        z = math.atan2(R[1,0], R[0,0])\n    else :\n        x = math.atan2(-R[1,2], R[1,1])\n        y = math.atan2(-R[2,0], sy)\n        z = 0\n\n    # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z)\n    rx, ry, rz = x*180/np.pi, y*180/np.pi, z*180/np.pi\n    return rx, ry, rz\n\n"
  },
  {
    "path": "src/utils/face_analysis_diy.py",
    "content": "# coding: utf-8\n\n\"\"\"\nface detection and alignment using InsightFace\n\"\"\"\n\nimport numpy as np\nfrom .rprint import rlog as log\nfrom .dependencies.insightface.app import FaceAnalysis\nfrom .dependencies.insightface.app.common import Face\nfrom .timer import Timer\n\n\ndef sort_by_direction(faces, direction: str = 'large-small', face_center=None):\n    if len(faces) <= 0:\n        return faces\n\n    if direction == 'left-right':\n        return sorted(faces, key=lambda face: face['bbox'][0])\n    if direction == 'right-left':\n        return sorted(faces, key=lambda face: face['bbox'][0], reverse=True)\n    if direction == 'top-bottom':\n        return sorted(faces, key=lambda face: face['bbox'][1])\n    if direction == 'bottom-top':\n        return sorted(faces, key=lambda face: face['bbox'][1], reverse=True)\n    if direction == 'small-large':\n        return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]))\n    if direction == 'large-small':\n        return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]), reverse=True)\n    if direction == 'distance-from-retarget-face':\n        return sorted(faces, key=lambda face: (((face['bbox'][2]+face['bbox'][0])/2-face_center[0])**2+((face['bbox'][3]+face['bbox'][1])/2-face_center[1])**2)**0.5)\n    return faces\n\n\nclass FaceAnalysisDIY(FaceAnalysis):\n    def __init__(self, name='buffalo_l', root='~/.insightface', allowed_modules=None, **kwargs):\n        super().__init__(name=name, root=root, allowed_modules=allowed_modules, **kwargs)\n\n        self.timer = Timer()\n\n    def get(self, img_bgr, **kwargs):\n        max_num = kwargs.get('max_face_num', 0)  # the number of the detected faces, 0 means no limit\n        flag_do_landmark_2d_106 = kwargs.get('flag_do_landmark_2d_106', True)  # whether to do 106-point detection\n        direction = kwargs.get('direction', 'large-small')  
# sorting direction\n        face_center = None\n\n        bboxes, kpss = self.det_model.detect(img_bgr, max_num=max_num, metric='default')\n        if bboxes.shape[0] == 0:\n            return []\n        ret = []\n        for i in range(bboxes.shape[0]):\n            bbox = bboxes[i, 0:4]\n            det_score = bboxes[i, 4]\n            kps = None\n            if kpss is not None:\n                kps = kpss[i]\n            face = Face(bbox=bbox, kps=kps, det_score=det_score)\n            for taskname, model in self.models.items():\n                if taskname == 'detection':\n                    continue\n\n                if (not flag_do_landmark_2d_106) and taskname == 'landmark_2d_106':\n                    continue\n\n                # print(f'taskname: {taskname}')\n                model.get(img_bgr, face)\n            ret.append(face)\n\n        ret = sort_by_direction(ret, direction, face_center)\n        return ret\n\n    def warmup(self):\n        self.timer.tic()\n\n        img_bgr = np.zeros((512, 512, 3), dtype=np.uint8)\n        self.get(img_bgr)\n\n        elapse = self.timer.toc()\n        log(f'FaceAnalysisDIY warmup time: {elapse:.3f}s')\n"
  },
  {
    "path": "src/utils/filter.py",
    "content": "# coding: utf-8\n\nimport torch\nimport numpy as np\nfrom pykalman import KalmanFilter\n\n\ndef smooth(x_d_lst, shape, device, observation_variance=3e-7, process_variance=1e-5):\n    x_d_lst_reshape = [x.reshape(-1) for x in x_d_lst]\n    x_d_stacked = np.vstack(x_d_lst_reshape)\n    kf = KalmanFilter(\n        initial_state_mean=x_d_stacked[0],\n        n_dim_obs=x_d_stacked.shape[1],\n        transition_covariance=process_variance * np.eye(x_d_stacked.shape[1]),\n        observation_covariance=observation_variance * np.eye(x_d_stacked.shape[1])\n    )\n    smoothed_state_means, _ = kf.smooth(x_d_stacked)\n    x_d_lst_smooth = [torch.tensor(state_mean.reshape(shape[-2:]), dtype=torch.float32, device=device) for state_mean in smoothed_state_means]\n    return x_d_lst_smooth\n"
  },
  {
    "path": "src/utils/helper.py",
    "content": "# coding: utf-8\n\n\"\"\"\nutility functions and classes to handle feature extraction and model loading\n\"\"\"\n\nimport os\nimport os.path as osp\nimport torch\nfrom collections import OrderedDict\nimport numpy as np\nfrom scipy.spatial import ConvexHull # pylint: disable=E0401,E0611\nfrom typing import Union\nimport cv2\n\nfrom ..modules.spade_generator import SPADEDecoder\nfrom ..modules.warping_network import WarpingNetwork\nfrom ..modules.motion_extractor import MotionExtractor\nfrom ..modules.appearance_feature_extractor import AppearanceFeatureExtractor\nfrom ..modules.stitching_retargeting_network import StitchingRetargetingNetwork\n\n\ndef tensor_to_numpy(data: Union[np.ndarray, torch.Tensor]) -> np.ndarray:\n    \"\"\"transform torch.Tensor into numpy.ndarray\"\"\"\n    if isinstance(data, torch.Tensor):\n        return data.data.cpu().numpy()\n    return data\n\ndef calc_motion_multiplier(\n    kp_source: Union[np.ndarray, torch.Tensor],\n    kp_driving_initial: Union[np.ndarray, torch.Tensor]\n) -> float:\n    \"\"\"calculate motion_multiplier based on the source image and the first driving frame\"\"\"\n    kp_source_np = tensor_to_numpy(kp_source)\n    kp_driving_initial_np = tensor_to_numpy(kp_driving_initial)\n\n    source_area = ConvexHull(kp_source_np.squeeze(0)).volume\n    driving_area = ConvexHull(kp_driving_initial_np.squeeze(0)).volume\n    motion_multiplier = np.sqrt(source_area) / np.sqrt(driving_area)\n    # motion_multiplier = np.cbrt(source_area) / np.cbrt(driving_area)\n\n    return motion_multiplier\n\ndef suffix(filename):\n    \"\"\"a.jpg -> jpg\"\"\"\n    pos = filename.rfind(\".\")\n    if pos == -1:\n        return \"\"\n    return filename[pos + 1:]\n\n\ndef prefix(filename):\n    \"\"\"a.jpg -> a\"\"\"\n    pos = filename.rfind(\".\")\n    if pos == -1:\n        return filename\n    return filename[:pos]\n\n\ndef basename(filename):\n    \"\"\"a/b/c.jpg -> c\"\"\"\n    return 
prefix(osp.basename(filename))\n\n\ndef remove_suffix(filepath):\n    \"\"\"a/b/c.jpg -> a/b/c\"\"\"\n    return osp.join(osp.dirname(filepath), basename(filepath))\n\n\ndef is_image(file_path):\n    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')\n    return file_path.lower().endswith(image_extensions)\n\n\ndef is_video(file_path):\n    if file_path.lower().endswith((\".mp4\", \".mov\", \".avi\", \".webm\")) or osp.isdir(file_path):\n        return True\n    return False\n\n\ndef is_template(file_path):\n    if file_path.endswith(\".pkl\"):\n        return True\n    return False\n\n\ndef mkdir(d, log=False):\n    # return self-assined `d`, for one line code\n    if not osp.exists(d):\n        os.makedirs(d, exist_ok=True)\n        if log:\n            print(f\"Make dir: {d}\")\n    return d\n\n\ndef squeeze_tensor_to_numpy(tensor):\n    out = tensor.data.squeeze(0).cpu().numpy()\n    return out\n\n\ndef dct2device(dct: dict, device):\n    for key in dct:\n        if isinstance(dct[key], torch.Tensor):\n            dct[key] = dct[key].to(device)\n        else:\n            dct[key] = torch.tensor(dct[key]).to(device)\n    return dct\n\n\ndef concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:\n    \"\"\"\n    kp_source: (bs, k, 3)\n    kp_driving: (bs, k, 3)\n    Return: (bs, 2k*3)\n    \"\"\"\n    bs_src = kp_source.shape[0]\n    bs_dri = kp_driving.shape[0]\n    assert bs_src == bs_dri, 'batch size must be equal'\n\n    feat = torch.cat([kp_source.view(bs_src, -1), kp_driving.view(bs_dri, -1)], dim=1)\n    return feat\n\n\ndef remove_ddp_dumplicate_key(state_dict):\n    state_dict_new = OrderedDict()\n    for key in state_dict.keys():\n        state_dict_new[key.replace('module.', '')] = state_dict[key]\n    return state_dict_new\n\n\ndef load_model(ckpt_path, model_config, device, model_type):\n    model_params = model_config['model_params'][f'{model_type}_params']\n\n    if model_type == 
'appearance_feature_extractor':\n        model = AppearanceFeatureExtractor(**model_params).to(device)\n    elif model_type == 'motion_extractor':\n        model = MotionExtractor(**model_params).to(device)\n    elif model_type == 'warping_module':\n        model = WarpingNetwork(**model_params).to(device)\n    elif model_type == 'spade_generator':\n        model = SPADEDecoder(**model_params).to(device)\n    elif model_type == 'stitching_retargeting_module':\n        # Special handling for stitching and retargeting module\n        config = model_config['model_params']['stitching_retargeting_module_params']\n        checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)\n\n        stitcher = StitchingRetargetingNetwork(**config.get('stitching'))\n        stitcher.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_shoulder']))\n        stitcher = stitcher.to(device)\n        stitcher.eval()\n\n        retargetor_lip = StitchingRetargetingNetwork(**config.get('lip'))\n        retargetor_lip.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_mouth']))\n        retargetor_lip = retargetor_lip.to(device)\n        retargetor_lip.eval()\n\n        retargetor_eye = StitchingRetargetingNetwork(**config.get('eye'))\n        retargetor_eye.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_eye']))\n        retargetor_eye = retargetor_eye.to(device)\n        retargetor_eye.eval()\n\n        return {\n            'stitching': stitcher,\n            'lip': retargetor_lip,\n            'eye': retargetor_eye\n        }\n    else:\n        raise ValueError(f\"Unknown model type: {model_type}\")\n\n    model.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))\n    model.eval()\n    return model\n\n\ndef load_description(fp):\n    with open(fp, 'r', encoding='utf-8') as f:\n        content = f.read()\n    return content\n\n\ndef is_square_video(video_path):\n    video = 
cv2.VideoCapture(video_path)\n\n    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))\n    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))\n\n    video.release()\n    # if width != height:\n        # gr.Info(f\"Uploaded video is not square, force do crop (driving) to be True\")\n\n    return width == height\n\ndef clean_state_dict(state_dict):\n    new_state_dict = OrderedDict()\n    for k, v in state_dict.items():\n        if k[:7] == 'module.':\n            k = k[7:]  # remove `module.`\n        new_state_dict[k] = v\n    return new_state_dict\n"
  },
  {
    "path": "src/utils/human_landmark_runner.py",
    "content": "# coding: utf-8\n\nimport os.path as osp\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)\nimport torch\nimport numpy as np\nimport onnxruntime\nfrom .timer import Timer\nfrom .rprint import rlog\nfrom .crop import crop_image, _transform_pts\n\n\ndef make_abs_path(fn):\n    return osp.join(osp.dirname(osp.realpath(__file__)), fn)\n\n\ndef to_ndarray(obj):\n    if isinstance(obj, torch.Tensor):\n        return obj.cpu().numpy()\n    elif isinstance(obj, np.ndarray):\n        return obj\n    else:\n        return np.array(obj)\n\n\nclass LandmarkRunner(object):\n    \"\"\"landmark runner\"\"\"\n\n    def __init__(self, **kwargs):\n        ckpt_path = kwargs.get('ckpt_path')\n        onnx_provider = kwargs.get('onnx_provider', 'cuda')  # 默认用cuda\n        device_id = kwargs.get('device_id', 0)\n        self.dsize = kwargs.get('dsize', 224)\n        self.timer = Timer()\n\n        if onnx_provider.lower() == 'cuda':\n            self.session = onnxruntime.InferenceSession(\n                ckpt_path, providers=[\n                    ('CUDAExecutionProvider', {'device_id': device_id})\n                ]\n            )\n        elif onnx_provider.lower() == 'mps':\n            self.session = onnxruntime.InferenceSession(\n                ckpt_path, providers=[\n                    'CoreMLExecutionProvider'\n                ]\n            )\n        else:\n            opts = onnxruntime.SessionOptions()\n            opts.intra_op_num_threads = 4  # 默认线程数为 4\n            self.session = onnxruntime.InferenceSession(\n                ckpt_path, providers=['CPUExecutionProvider'],\n                sess_options=opts\n            )\n\n    def _run(self, inp):\n        out = self.session.run(None, {'input': inp})\n        return out\n\n    def run(self, img_rgb: np.ndarray, lmk=None):\n        if lmk is not None:\n            crop_dct = crop_image(img_rgb, lmk, dsize=self.dsize, scale=1.5, vy_ratio=-0.1)\n            img_crop_rgb = 
crop_dct['img_crop']\n        else:\n            # NOTE: force resize to 224x224, NOT RECOMMEND!\n            img_crop_rgb = cv2.resize(img_rgb, (self.dsize, self.dsize))\n            scale = max(img_rgb.shape[:2]) / self.dsize\n            crop_dct = {\n                'M_c2o': np.array([\n                    [scale, 0., 0.],\n                    [0., scale, 0.],\n                    [0., 0., 1.],\n                ], dtype=np.float32),\n            }\n\n        inp = (img_crop_rgb.astype(np.float32) / 255.).transpose(2, 0, 1)[None, ...]  # HxWx3 (BGR) -> 1x3xHxW (RGB!)\n\n        out_lst = self._run(inp)\n        out_pts = out_lst[2]\n\n        # 2d landmarks 203 points\n        lmk = to_ndarray(out_pts[0]).reshape(-1, 2) * self.dsize  # scale to 0-224\n        lmk = _transform_pts(lmk, M=crop_dct['M_c2o'])\n\n        return lmk\n\n    def warmup(self):\n        self.timer.tic()\n\n        dummy_image = np.zeros((1, 3, self.dsize, self.dsize), dtype=np.float32)\n\n        _ = self._run(dummy_image)\n\n        elapse = self.timer.toc()\n        rlog(f'LandmarkRunner warmup time: {elapse:.3f}s')\n"
  },
  {
    "path": "src/utils/io.py",
    "content": "# coding: utf-8\n\nimport os.path as osp\nimport imageio\nimport numpy as np\nimport pickle\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)\n\nfrom .helper import mkdir, suffix\n\n\ndef load_image_rgb(image_path: str):\n    if not osp.exists(image_path):\n        raise FileNotFoundError(f\"Image not found: {image_path}\")\n    img = cv2.imread(image_path, cv2.IMREAD_COLOR)\n    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n\n\ndef load_video(video_info, n_frames=-1):\n    reader = imageio.get_reader(video_info, \"ffmpeg\")\n\n    ret = []\n    for idx, frame_rgb in enumerate(reader):\n        if n_frames > 0 and idx >= n_frames:\n            break\n        ret.append(frame_rgb)\n\n    reader.close()\n    return ret\n\n\ndef contiguous(obj):\n    if not obj.flags.c_contiguous:\n        obj = obj.copy(order=\"C\")\n    return obj\n\n\ndef resize_to_limit(img: np.ndarray, max_dim=1920, division=2):\n    \"\"\"\n    ajust the size of the image so that the maximum dimension does not exceed max_dim, and the width and the height of the image are multiples of n.\n    :param img: the image to be processed.\n    :param max_dim: the maximum dimension constraint.\n    :param n: the number that needs to be multiples of.\n    :return: the adjusted image.\n    \"\"\"\n    h, w = img.shape[:2]\n\n    # ajust the size of the image according to the maximum dimension\n    if max_dim > 0 and max(h, w) > max_dim:\n        if h > w:\n            new_h = max_dim\n            new_w = int(w * (max_dim / h))\n        else:\n            new_w = max_dim\n            new_h = int(h * (max_dim / w))\n        img = cv2.resize(img, (new_w, new_h))\n\n    # ensure that the image dimensions are multiples of n\n    division = max(division, 1)\n    new_h = img.shape[0] - (img.shape[0] % division)\n    new_w = img.shape[1] - (img.shape[1] % division)\n\n    if new_h == 0 or new_w == 0:\n        # when the width or height is less than n, no need to process\n        return 
img\n\n    if new_h != img.shape[0] or new_w != img.shape[1]:\n        img = img[:new_h, :new_w]\n\n    return img\n\n\ndef load_img_online(obj, mode=\"bgr\", **kwargs):\n    max_dim = kwargs.get(\"max_dim\", 1920)\n    n = kwargs.get(\"n\", 2)\n    if isinstance(obj, str):\n        if mode.lower() == \"gray\":\n            img = cv2.imread(obj, cv2.IMREAD_GRAYSCALE)\n        else:\n            img = cv2.imread(obj, cv2.IMREAD_COLOR)\n    else:\n        img = obj\n\n    # Resize image to satisfy constraints\n    img = resize_to_limit(img, max_dim=max_dim, division=n)\n\n    if mode.lower() == \"bgr\":\n        return contiguous(img)\n    elif mode.lower() == \"rgb\":\n        return contiguous(img[..., ::-1])\n    else:\n        raise Exception(f\"Unknown mode {mode}\")\n\n\ndef load(fp):\n    suffix_ = suffix(fp)\n\n    if suffix_ == \"npy\":\n        return np.load(fp)\n    elif suffix_ == \"pkl\":\n        return pickle.load(open(fp, \"rb\"))\n    else:\n        raise Exception(f\"Unknown type: {suffix}\")\n\n\ndef dump(wfp, obj):\n    wd = osp.split(wfp)[0]\n    if wd != \"\" and not osp.exists(wd):\n        mkdir(wd)\n\n    _suffix = suffix(wfp)\n    if _suffix == \"npy\":\n        np.save(wfp, obj)\n    elif _suffix == \"pkl\":\n        pickle.dump(obj, open(wfp, \"wb\"))\n    else:\n        raise Exception(\"Unknown type: {}\".format(_suffix))\n"
  },
  {
    "path": "src/utils/retargeting_utils.py",
    "content": "\n\"\"\"\nFunctions to compute distance ratios between specific pairs of facial landmarks\n\"\"\"\n\nimport numpy as np\n\n\ndef calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray:\n    return (np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True) /\n            (np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps))\n\n\ndef calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:\n    lefteye_close_ratio = calculate_distance_ratio(lmk, 6, 18, 0, 12)\n    righteye_close_ratio = calculate_distance_ratio(lmk, 30, 42, 24, 36)\n    if target_eye_ratio is not None:\n        return np.concatenate([lefteye_close_ratio, righteye_close_ratio, target_eye_ratio], axis=1)\n    else:\n        return np.concatenate([lefteye_close_ratio, righteye_close_ratio], axis=1)\n\n\ndef calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:\n    return calculate_distance_ratio(lmk, 90, 102, 48, 66)\n"
  },
  {
    "path": "src/utils/rprint.py",
    "content": "# coding: utf-8\n\n\"\"\"\ncustom print and log functions \n\"\"\"\n\n__all__ = ['rprint', 'rlog']\n\ntry:\n    from rich.console import Console\n    console = Console()\n    rprint = console.print\n    rlog = console.log\nexcept:\n    rprint = print\n    rlog = print\n"
  },
  {
    "path": "src/utils/timer.py",
    "content": "# coding: utf-8\n\n\"\"\"\ntools to measure elapsed time\n\"\"\"\n\nimport time\n\nclass Timer(object):\n    \"\"\"A simple timer.\"\"\"\n\n    def __init__(self):\n        self.total_time = 0.\n        self.calls = 0\n        self.start_time = 0.\n        self.diff = 0.\n\n    def tic(self):\n        # using time.time instead of time.clock because time time.clock\n        # does not normalize for multithreading\n        self.start_time = time.time()\n\n    def toc(self, average=True):\n        self.diff = time.time() - self.start_time\n        return self.diff\n\n    def clear(self):\n        self.start_time = 0.\n        self.diff = 0.\n"
  },
  {
    "path": "src/utils/video.py",
    "content": "# coding: utf-8\n\n\"\"\"\nFunctions for processing video\n\nATTENTION: you need to install ffmpeg and ffprobe in your env!\n\"\"\"\n\nimport os.path as osp\nimport numpy as np\nimport subprocess\nimport imageio\nimport cv2\nfrom rich.progress import track\n\nfrom .rprint import rlog as log\nfrom .rprint import rprint as print\nfrom .helper import prefix\n\n\ndef exec_cmd(cmd):\n    return subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n\n\ndef images2video(images, wfp, **kwargs):\n    fps = kwargs.get('fps', 30)\n    video_format = kwargs.get('format', 'mp4')  # default is mp4 format\n    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding\n    quality = kwargs.get('quality')  # video quality\n    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format\n    image_mode = kwargs.get('image_mode', 'rgb')\n    macro_block_size = kwargs.get('macro_block_size', 2)\n    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]\n\n    writer = imageio.get_writer(\n        wfp, fps=fps, format=video_format,\n        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size\n    )\n\n    n = len(images)\n    for i in track(range(n), description='Writing', transient=True):\n        if image_mode.lower() == 'bgr':\n            writer.append_data(images[i][..., ::-1])\n        else:\n            writer.append_data(images[i])\n\n    writer.close()\n\n\ndef video2gif(video_fp, fps=30, size=256):\n    if osp.exists(video_fp):\n        d = osp.split(video_fp)[0]\n        fn = prefix(osp.basename(video_fp))\n        palette_wfp = osp.join(d, 'palette.png')\n        gif_wfp = osp.join(d, f'{fn}.gif')\n        # generate the palette\n        cmd = f'ffmpeg -i \"{video_fp}\" -vf \"fps={fps},scale={size}:-1:flags=lanczos,palettegen\" \"{palette_wfp}\" -y'\n        exec_cmd(cmd)\n        # use the palette to generate the gif\n        
cmd = f'ffmpeg -i \"{video_fp}\" -i \"{palette_wfp}\" -filter_complex \"fps={fps},scale={size}:-1:flags=lanczos[x];[x][1:v]paletteuse\" \"{gif_wfp}\" -y'\n        exec_cmd(cmd)\n        return gif_wfp\n    else:\n        raise FileNotFoundError(f\"video_fp: {video_fp} not exists!\")\n\n\ndef merge_audio_video(video_fp, audio_fp, wfp):\n    if osp.exists(video_fp) and osp.exists(audio_fp):\n        cmd = f'ffmpeg -i \"{video_fp}\" -i \"{audio_fp}\" -c:v copy -c:a aac \"{wfp}\" -y'\n        exec_cmd(cmd)\n        print(f'merge {video_fp} and {audio_fp} to {wfp}')\n    else:\n        print(f'video_fp: {video_fp} or audio_fp: {audio_fp} not exists!')\n\n\ndef blend(img: np.ndarray, mask: np.ndarray, background_color=(255, 255, 255)):\n    mask_float = mask.astype(np.float32) / 255.\n    background_color = np.array(background_color).reshape([1, 1, 3])\n    bg = np.ones_like(img) * background_color\n    img = np.clip(mask_float * img + (1 - mask_float) * bg, 0, 255).astype(np.uint8)\n    return img\n\n\ndef concat_frames(driving_image_lst, source_image_lst, I_p_lst):\n    # TODO: add more concat style, e.g., left-down corner driving\n    out_lst = []\n    h, w, _ = I_p_lst[0].shape\n    source_image_resized_lst = [cv2.resize(img, (w, h)) for img in source_image_lst]\n\n    for idx, _ in track(enumerate(I_p_lst), total=len(I_p_lst), description='Concatenating result...'):\n        I_p = I_p_lst[idx]\n        source_image_resized = source_image_resized_lst[idx] if len(source_image_lst) > 1 else source_image_resized_lst[0]\n\n        if driving_image_lst is None:\n            out = np.hstack((source_image_resized, I_p))\n        else:\n            driving_image = driving_image_lst[idx]\n            driving_image_resized = cv2.resize(driving_image, (w, h))\n            out = np.hstack((driving_image_resized, source_image_resized, I_p))\n\n        out_lst.append(out)\n    return out_lst\n\n\nclass VideoWriter:\n    def __init__(self, **kwargs):\n        self.fps = 
kwargs.get('fps', 30)\n        self.wfp = kwargs.get('wfp', 'video.mp4')\n        self.video_format = kwargs.get('format', 'mp4')\n        self.codec = kwargs.get('codec', 'libx264')\n        self.quality = kwargs.get('quality')\n        self.pixelformat = kwargs.get('pixelformat', 'yuv420p')\n        self.image_mode = kwargs.get('image_mode', 'rgb')\n        self.ffmpeg_params = kwargs.get('ffmpeg_params')\n\n        self.writer = imageio.get_writer(\n            self.wfp, fps=self.fps, format=self.video_format,\n            codec=self.codec, quality=self.quality,\n            ffmpeg_params=self.ffmpeg_params, pixelformat=self.pixelformat\n        )\n\n    def write(self, image):\n        if self.image_mode.lower() == 'bgr':\n            self.writer.append_data(image[..., ::-1])\n        else:\n            self.writer.append_data(image)\n\n    def close(self):\n        if self.writer is not None:\n            self.writer.close()\n\n\ndef change_video_fps(input_file, output_file, fps=20, codec='libx264', crf=12):\n    cmd = f'ffmpeg -i \"{input_file}\" -c:v {codec} -crf {crf} -r {fps} \"{output_file}\" -y'\n    exec_cmd(cmd)\n\n\ndef get_fps(filepath, default_fps=25):\n    try:\n        fps = cv2.VideoCapture(filepath).get(cv2.CAP_PROP_FPS)\n\n        if fps in (0, None):\n            fps = default_fps\n    except Exception as e:\n        log(e)\n        fps = default_fps\n\n    return fps\n\n\ndef has_audio_stream(video_path: str) -> bool:\n    \"\"\"\n    Check if the video file contains an audio stream.\n\n    :param video_path: Path to the video file\n    :return: True if the video contains an audio stream, False otherwise\n    \"\"\"\n    if osp.isdir(video_path):\n        return False\n\n    cmd = [\n        'ffprobe',\n        '-v', 'error',\n        '-select_streams', 'a',\n        '-show_entries', 'stream=codec_type',\n        '-of', 'default=noprint_wrappers=1:nokey=1',\n        f'\"{video_path}\"'\n    ]\n\n    try:\n        # result = 
subprocess.run(cmd, capture_output=True, text=True)\n        result = exec_cmd(' '.join(cmd))\n        if result.returncode != 0:\n            log(f\"Error occurred while probing video: {result.stderr}\")\n            return False\n\n        # Check if there is any output from ffprobe command\n        return bool(result.stdout.strip())\n    except Exception as e:\n        log(\n            f\"Error occurred while probing video: {video_path}, \"\n            \"you may need to install ffprobe! (https://ffmpeg.org/download.html) \"\n            \"Now set audio to false!\",\n            style=\"bold red\"\n        )\n    return False\n\n\ndef add_audio_to_video(silent_video_path: str, audio_video_path: str, output_video_path: str):\n    cmd = [\n        'ffmpeg',\n        '-y',\n        '-i', f'\"{silent_video_path}\"',\n        '-i', f'\"{audio_video_path}\"',\n        '-map', '0:v',\n        '-map', '1:a',\n        '-c:v', 'copy',\n        '-shortest',\n        f'\"{output_video_path}\"'\n    ]\n\n    try:\n        exec_cmd(' '.join(cmd))\n        log(f\"Video with audio generated successfully: {output_video_path}\")\n    except subprocess.CalledProcessError as e:\n        log(f\"Error occurred: {e}\")\n\n\ndef bb_intersection_over_union(boxA, boxB):\n    xA = max(boxA[0], boxB[0])\n    yA = max(boxA[1], boxB[1])\n    xB = min(boxA[2], boxB[2])\n    yB = min(boxA[3], boxB[3])\n    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)\n    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)\n    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)\n    iou = interArea / float(boxAArea + boxBArea - interArea)\n    return iou\n"
  },
  {
    "path": "src/utils/viz.py",
    "content": "# coding: utf-8\n\nimport cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)\n\n\ndef viz_lmk(img_, vps, **kwargs):\n    \"\"\"可视化点\"\"\"\n    lineType = kwargs.get(\"lineType\", cv2.LINE_8)  # cv2.LINE_AA\n    img_for_viz = img_.copy()\n    for pt in vps:\n        cv2.circle(\n            img_for_viz,\n            (int(pt[0]), int(pt[1])),\n            radius=kwargs.get(\"radius\", 1),\n            color=(0, 255, 0),\n            thickness=kwargs.get(\"thickness\", 1),\n            lineType=lineType,\n        )\n    return img_for_viz\n"
  }
]