[
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-hf/run_dpsk_ocr.py",
    "content": "from transformers import AutoModel, AutoTokenizer\nimport torch\nimport os\n\n\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n\n\nmodel_name = 'deepseek-ai/DeepSeek-OCR'\n\n\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\nmodel = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)\nmodel = model.eval().cuda().to(torch.bfloat16)\n\n\n\n# prompt = \"<image>\\nFree OCR. \"\nprompt = \"<image>\\n<|grounding|>Convert the document to markdown. \"\nimage_file = 'your_image.jpg'\noutput_path = 'your/output/dir'\n\n\n\n# infer(self, tokenizer, prompt='', image_file='', output_path = ' ', base_size = 1024, image_size = 640, crop_mode = True, test_compress = False, save_results = False):\n\n# Tiny: base_size = 512, image_size = 512, crop_mode = False\n# Small: base_size = 640, image_size = 640, crop_mode = False\n# Base: base_size = 1024, image_size = 1024, crop_mode = False\n# Large: base_size = 1280, image_size = 1280, crop_mode = False\n\n# Gundam: base_size = 1024, image_size = 640, crop_mode = True\n\nres = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = True)\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py",
    "content": "# TODO: change modes\n# Tiny: base_size = 512, image_size = 512, crop_mode = False\n# Small: base_size = 640, image_size = 640, crop_mode = False\n# Base: base_size = 1024, image_size = 1024, crop_mode = False\n# Large: base_size = 1280, image_size = 1280, crop_mode = False\n# Gundam: base_size = 1024, image_size = 640, crop_mode = True\n\nBASE_SIZE = 1024\nIMAGE_SIZE = 640\nCROP_MODE = True\nMIN_CROPS= 2\nMAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.\nMAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.\nNUM_WORKERS = 64 # image pre-process (resize/padding) workers \nPRINT_NUM_VIS_TOKENS = False\nSKIP_REPEAT = True\nMODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path\n\n# TODO: change INPUT_PATH\n# .pdf: run_dpsk_ocr_pdf.py; \n# .jpg, .png, .jpeg: run_dpsk_ocr_image.py; \n# Omnidocbench images path: run_dpsk_ocr_eval_batch.py\n\nINPUT_PATH = '' \nOUTPUT_PATH = ''\n\nPROMPT = '<image>\\n<|grounding|>Convert the document to markdown.'\n# PROMPT = '<image>\\nFree OCR.'\n# TODO commonly used prompts\n# document: <image>\\n<|grounding|>Convert the document to markdown.\n# other image: <image>\\n<|grounding|>OCR this image.\n# without layouts: <image>\\nFree OCR.\n# figures in document: <image>\\nParse the figure.\n# general: <image>\\nDescribe this image in detail.\n# rec: <image>\\nLocate <|ref|>xxxx<|/ref|> in the image.\n# '先天下之忧而忧'\n# .......\n\n\nfrom transformers import AutoTokenizer\n\nTOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/__init__.py",
    "content": ""
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/build_linear.py",
    "content": "import torch.nn as nn\nimport torch\nimport torch.nn.functional as F\nimport copy\n\n\nclass MlpProjector(nn.Module):\n\n    def __init__(self, cfg):\n\n        super().__init__()\n\n        self.cfg = cfg\n\n        if cfg.projector_type == \"identity\":\n            modules = nn.Identity()\n\n        elif cfg.projector_type == \"linear\":\n            modules = nn.Linear(cfg.input_dim, cfg.n_embed)\n\n        elif cfg.projector_type == \"mlp_gelu\":\n            mlp_depth = cfg.get(\"depth\", 1)\n            modules = [nn.Linear(cfg.input_dim, cfg.n_embed)]\n            for _ in range(1, mlp_depth):\n                modules.append(nn.GELU())\n                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))\n            modules = nn.Sequential(*modules)\n        \n        elif cfg.projector_type == \"normlayer_downsample_mlp_gelu\":\n            mlp_depth = cfg.get(\"depth\", 1)\n            mlp_ratio = cfg.get(\"mlp_ratio\", 1)\n            modules = [\n                nn.LayerNorm(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio),\n                nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)\n            ]\n            for _ in range(1, mlp_depth - 1):\n                modules.append(nn.GELU())\n                modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))\n            modules.append(nn.GELU())\n            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))\n            modules = nn.Sequential(*modules)\n        \n        elif cfg.projector_type == \"downsample_mlp_gelu\":\n            mlp_depth = cfg.get(\"depth\", 1)\n            mlp_ratio = cfg.get(\"mlp_ratio\", 1)\n            modules = [nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)]\n            for _ in range(1, mlp_depth - 1):\n                modules.append(nn.GELU())\n                modules.append(nn.Linear(cfg.n_embed * 
mlp_ratio, cfg.n_embed * mlp_ratio))\n            modules.append(nn.GELU())\n            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))\n            modules = nn.Sequential(*modules)\n\n        elif cfg.projector_type == \"low_high_hybrid_split_mlp_gelu\":\n            mlp_depth = cfg.get(\"depth\", 1)\n            self.high_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)\n            self.low_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)\n\n            modules = []\n            for _ in range(1, mlp_depth):\n                modules.append(nn.GELU())\n                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))\n            modules = nn.Sequential(*modules)\n\n        elif cfg.projector_type == \"hybrid_split_feature_mlp_gelu\":\n            mlp_depth = cfg.get(\"depth\", 1)\n            channel_div = cfg.get(\"channel_div\", 0.5)\n            self.high_up_proj = nn.Linear(cfg.input_dim[0], int(cfg.n_embed * channel_div))\n            self.low_up_proj = nn.Linear(cfg.input_dim[1], cfg.n_embed - int(cfg.n_embed * channel_div))\n\n            modules = []\n            for _ in range(1, mlp_depth):\n                modules.append(nn.GELU())\n                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))\n            modules = nn.Sequential(*modules)\n\n        elif cfg.projector_type == \"low_high_split_mlp_gelu\":\n            mlp_depth = cfg.get(\"depth\", 1)\n            modules = []\n            for _ in range(1, mlp_depth):\n                modules.append(nn.GELU())\n                modules.append(nn.Linear(cfg.n_embed // 2, cfg.n_embed // 2))\n            modules = nn.Sequential(*modules)\n            self.high_layers = nn.Sequential(*modules)\n            self.low_layers = copy.deepcopy(modules)\n\n        else:\n            raise ValueError(f\"Unknown projector type: {cfg.projector_type}\")\n\n        if cfg.get(\"token_pooling\", False):\n            self.token_pooling_layer = nn.Linear(cfg.input_dim * 4, 
cfg.input_dim)\n\n        if cfg.get(\"conv_fusion_high_low_features\", False):\n            self.fusion_layer = nn.Linear(cfg.input_dim, cfg.input_dim)\n        self.layers = modules\n\n    def forward(self, x):\n        if self.cfg.get(\"token_pooling\", False):\n            batch_size, wxh, channels = x.shape\n            w = h = int(wxh**0.5)\n            x = x.view(batch_size, w, h, channels)\n            x = x.permute(0, 3, 1, 2)\n            # import ipdb; ipdb.set_trace()\n            patches = x.unfold(2, 2, 2).unfold(3, 2, 2)\n            batch_size, channels, h_patches, w_patches, _, _ = patches.size()\n            # 在通道维度上拼接\n            patches = patches.contiguous().view(batch_size, channels, h_patches * w_patches, -1)\n\n            # 通过线性层\n            patches = patches.permute(0, 2, 1, 3).contiguous()\n            patches = patches.view(batch_size, h_patches * w_patches, channels * 4)\n\n            x = self.token_pooling_layer(patches)\n        \n        if self.cfg.get(\"conv_fusion_high_low_features\", False):\n            x = self.fusion_layer(x[:, 0]) + x[:, 1]\n\n        if self.cfg.projector_type == 'low_high_hybrid_split_mlp_gelu':\n            high_x, low_x = x[0], x[1]\n            high_x = self.high_up_proj(high_x)\n            low_x = self.low_up_proj(low_x)\n            x = torch.concat([high_x, low_x], dim=-1)\n        \n        if self.cfg.projector_type == 'hybrid_split_feature_mlp_gelu':\n            high_x = x[...,:self.cfg.input_dim[0]]\n            low_x = x[...,self.cfg.input_dim[0]:]\n            high_x = self.high_up_proj(high_x)\n            low_x = self.low_up_proj(low_x)\n            x = torch.concat([high_x, low_x], dim=-1)\n        \n        if self.cfg.projector_type == 'low_high_split_mlp_gelu':\n            high_x, low_x = x[0], x[1]\n            high_x = self.high_layers(high_x)\n            low_x = self.low_layers(low_x)\n            x = torch.concat([high_x, low_x], dim=-1)\n            return x\n        \n        
if self.cfg.projector_type == 'downsample_mlp_gelu' or self.cfg.projector_type == 'normlayer_downsample_mlp_gelu':\n            bs, hw, input_dim = x.shape\n            h = w = int((hw) ** 0.5)\n\n            \"\"\"compute padding\"\"\"\n            if h % self.cfg.downsample_ratio:\n                pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio\n            else:\n                pad = 0\n            x = x.reshape(bs, h, w, input_dim)\n            if pad > 0:\n                x = F.pad(x, (0, 0, 0, pad, 0, pad), \"constant\", 0)\n\n            \"\"\"4 to 1 concat\"\"\"\n            x = x.permute(0, 3, 1, 2)  # B, C, H, W\n            x = F.unfold(x, kernel_size=self.cfg.downsample_ratio, stride=self.cfg.downsample_ratio, padding=0) # B, C*4, HW // 4\n            x = x.permute(0, 2, 1)\n            \n        return self.layers(x)\n\n    @staticmethod\n    def get_flops_per_sample(cfg):\n        if cfg.projector_type == \"linear\":\n            fwd = 2 * cfg.input_dim * cfg.n_embed\n\n        elif \"mlp_gelu\" in cfg.projector_type :\n            mlp_depth = cfg.get(\"depth\", 1)\n            downsample_ratio = cfg.get(\"downsample_ratio\", 1)\n            input_dim = sum(cfg.input_dim) if isinstance(cfg.input_dim, list) else cfg.input_dim\n            input_dim = input_dim * downsample_ratio * downsample_ratio\n            fwd = 2 * input_dim * cfg.n_embed + (mlp_depth - 1) * 2 * cfg.n_embed * cfg.n_embed\n        else:\n            fwd = 0\n\n        return fwd * 3\n\n\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/clip_sdpa.py",
    "content": "from contextlib import nullcontext\nimport math\nfrom typing import Optional, Tuple\n# from megatron.model import LayerNorm\nfrom easydict import EasyDict as adict\nimport torch\nfrom torch.nn import functional as F\nfrom torch import nn\nfrom flash_attn import flash_attn_qkvpacked_func, flash_attn_func\n# from optimus import flash_attn_func\n# from megatron.core import tensor_parallel\n# from megatron.core import parallel_state as mpu\n# from megatron.core.utils import make_viewless_tensor, divide\n# from megatron.model.fused_rms_norm import RMSNorm\n# from megatron.model.transformer import (\n#     FlashSelfAttention,\n#     NoopTransformerLayer,\n#     _cfg_to_kwargs,\n# )\n# from megatron.model.enums import AttnMaskType, AttnType\n# from megatron.model.fused_softmax import FusedScaleMaskSoftmax\n# from megatron.model.utils import attention_mask_func\n\n# from megatron.model.module import MegatronModule\n\n# try:\n#     from einops import rearrange\n# except ImportError:\n#     rearrange = None\n\n# from flash_attn import flash_attn_varlen_func as flash_attn_unpadded_func\n\n# try:\n#     # flash attention 2.x\n#     from flash_attn import flash_attn_varlen_func as flash_attn_unpadded_func\n# except ImportError:\n#     try:\n#         # flash attention 1.x\n#         from flash_attn.flash_attn_interface import flash_attn_unpadded_func\n#     except ImportError:\n#         flash_attn_unpadded_func = None\n\n# try:\n#     from flash_attn.flash_attn_interface import flash_attn_unpadded_relative_attention_bias_func\n# except ImportError:\n#     flash_attn_unpadded_relative_attention_bias_func = None\n\n# try:\n#     from flash_attn.flash_attn_interface import mask_flash_attn_unpadded_func\n# except ImportError:\n#     mask_flash_attn_unpadded_func = None\n\n\nclass LayerNormfp32(torch.nn.LayerNorm):\n    \"\"\"Subclass torch's LayerNorm to handle fp16.\"\"\"\n\n    def forward(self, x: torch.Tensor):\n        orig_type = x.dtype\n        ret = 
super().forward(x.type(torch.float32))\n        return ret.type(orig_type)\n\n\ndef get_abs_pos(abs_pos, tgt_size):\n    # abs_pos: L, C\n    # tgt_size: M\n    # return: M, C\n\n    # print(tgt_size)\n    # print(abs_pos.shape)\n    # exit()\n    dim = abs_pos.size(-1)\n    # print(dim)\n    abs_pos_new = abs_pos.squeeze(0)\n    cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:]\n\n\n\n    src_size = int(math.sqrt(abs_pos_new.shape[0] - 1))\n    tgt_size = int(math.sqrt(tgt_size))\n    dtype = abs_pos.dtype\n\n    if src_size != tgt_size:\n        old_pos_embed = old_pos_embed.view(1, src_size, src_size, dim).permute(0, 3, 1,\n                                                                                    2).contiguous()\n        old_pos_embed = old_pos_embed.to(torch.float32)\n        new_pos_embed = F.interpolate(\n            old_pos_embed,\n            size=(tgt_size, tgt_size),\n            mode='bicubic',\n            antialias=True,\n            align_corners=False,\n        ).to(dtype)\n        new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)\n        new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)\n        vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)\n        vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)\n        return vision_pos_embed\n    else:\n        return abs_pos\n\n@torch.jit.script\ndef quick_gelu(x):\n    return x * torch.sigmoid(1.702 * x)\n\n\n\nclass CLIPVisionEmbeddings(nn.Module):\n    def __init__(self, hidden_size=1024, image_size=224, patch_size=14, num_channels=3):\n        super().__init__()\n        self.embed_dim = hidden_size\n        self.image_size = image_size\n        self.patch_size = patch_size\n\n        self.class_embedding = torch.nn.Parameter(torch.randn(self.embed_dim))\n\n        self.patch_embedding = torch.nn.Conv2d(\n            in_channels=num_channels,\n            out_channels=self.embed_dim,\n            kernel_size=self.patch_size,\n       
     stride=self.patch_size,\n            bias=False,\n        )\n\n        self.num_patches = (self.image_size // self.patch_size) ** 2\n        self.num_positions = self.num_patches + 1\n        self.position_embedding = torch.nn.Embedding(self.num_positions, self.embed_dim)\n        self.register_buffer(\n            \"position_ids\", torch.arange(self.num_positions).expand((1, -1))\n        )\n\n    def forward(self, pixel_values, patch_embeds):\n        batch_size = pixel_values.shape[0]\n        # patch_embeds = self.patch_embedding(\n        #     pixel_values\n        # )  # shape = [*, width, grid, grid]\n\n\n        if patch_embeds is not None:\n            patch_embeds = patch_embeds\n            # print(patch_embeds.shape)\n        else:\n            patch_embeds = self.patch_embedding(pixel_values)  \n            # print(111111)\n        # shape = [*, width, grid, grid]\n        # patch_embeds = patch_embeds.flatten(2).transpose(1, 2)\n\n        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)\n\n\n        class_embeds = self.class_embedding.expand(batch_size, 1, -1)\n        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)\n\n        # x = torch.cat([cls_token, x], dim=1)\n        embeddings = embeddings + get_abs_pos(self.position_embedding(self.position_ids), embeddings.size(1))\n        # embeddings = embeddings + self.position_embedding(self.position_ids)\n        return embeddings\n\n\nclass NoTPFeedForward(nn.Module):\n    def __init__(\n            self,\n            cfg,\n            dim: int,\n            hidden_dim: int,\n    ):\n        super().__init__()\n\n        self.fc1 = torch.nn.Linear(dim, hidden_dim, bias=True)\n        self.fc2 = torch.nn.Linear(hidden_dim, dim, bias=True)\n\n    def forward(self, x):\n        output = self.fc2(quick_gelu(self.fc1(x)))\n        return output\n\n\n# from optimus.flash_attn_interface import flash_attn_qkvpacked_func\n\n\n# class NoTPAttention(nn.Module):\n#     def __init__(self, 
cfg):\n#         super().__init__()\n#         self.num_heads = cfg.num_attention_heads\n#         self.n_local_heads = cfg.num_attention_heads\n#         self.head_dim = cfg.hidden_size // cfg.num_attention_heads\n#         self.max_seq_len = cfg.seq_length\n#         self.use_flash_attention = cfg.use_flash_attn\n\n#         self.qkv_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size * 3, bias=True)\n#         self.out_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=True)\n\n#         # self.core_attention = CoreAttention(cfg, AttnType.self_attn)\n\n#         self.attn_drop = cfg.attention_dropout\n\n#     def forward(\n#             self,\n#             x: torch.Tensor,\n#     ):\n#         bsz, seqlen, _ = x.shape\n#         xqkv = self.qkv_proj(x)\n#         xqkv = xqkv.view(bsz, seqlen, 3, self.num_heads, self.head_dim)\n\n#         if self.use_flash_attention:\n#             output = flash_attn_qkvpacked_func(xqkv)\n#             output = output.view(bsz, seqlen, -1)\n#         else:\n#             xq, xk, xv = torch.split(xqkv, 1, dim=2)\n#             xq = xq.squeeze(2)\n#             xk = xk.squeeze(2)\n#             xv = xv.squeeze(2)\n#             # xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]\n\n#             # （B, num_head, S, head_size)\n#             xq = xq.permute(0, 2, 1, 3)\n#             xk = xk.permute(0, 2, 1, 3)\n#             xv = xv.permute(0, 2, 1, 3)\n\n#             output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)\n#             utput = output.permute(0, 2, 1, 3).view(bsz, seqlen, -1)\n#         output = self.out_proj(output)\n#         return output\n\n\n# from optimus.flash_attn_interface import flash_attn_qkvpacked_func\n\n\nclass NoTPAttention(torch.nn.Module):\n    def __init__(self, cfg):\n        super().__init__()\n        self.num_heads = cfg.num_attention_heads\n        self.n_local_heads = cfg.num_attention_heads\n        self.head_dim = 
cfg.hidden_size // cfg.num_attention_heads\n        self.max_seq_len = cfg.seq_length\n        self.use_flash_attention = cfg.use_flash_attn\n\n        self.qkv_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size * 3, bias=True)\n        self.out_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=True)\n\n        # self.core_attention = CoreAttention(cfg, AttnType.self_attn)\n\n        self.attn_drop = cfg.attention_dropout\n\n    def forward(\n            self,\n            x: torch.Tensor,\n    ):\n        bsz, seqlen, _ = x.shape\n        xqkv = self.qkv_proj(x)\n        xqkv = xqkv.view(bsz, seqlen, 3, self.num_heads, self.head_dim)\n\n        if self.use_flash_attention:\n            output = flash_attn_qkvpacked_func(xqkv)\n            output = output.view(bsz, seqlen, -1)\n            # xq, xk, xv = torch.split(xqkv, 1, dim=2)\n            # xq = xq.squeeze(2)\n            # xk = xk.squeeze(2)\n            # xv = xv.squeeze(2)\n            # # xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]\n\n            # # （B, num_head, S, head_size)\n            # xq = xq.permute(0, 2, 1, 3)\n            # xk = xk.permute(0, 2, 1, 3)\n            # xv = xv.permute(0, 2, 1, 3)\n            # # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):\n            # output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)\n            # output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)\n                # output = output.permute(0, 2, 1, 3).contiguous().view(bsz, seqlen, -1)\n        else:\n            # output = flash_attn_qkvpacked_func(xqkv)\n            xq, xk, xv = torch.split(xqkv, 1, dim=2)\n            xq = xq.squeeze(2)\n            xk = xk.squeeze(2)\n            xv = xv.squeeze(2)\n            # xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]\n\n            # （B, num_head, S, head_size)\n            xq = xq.permute(0, 
2, 1, 3)\n            xk = xk.permute(0, 2, 1, 3)\n            xv = xv.permute(0, 2, 1, 3)\n            # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):\n            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)\n            output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)\n        output = self.out_proj(output)\n        return output\n\nclass NoTPTransformerBlock(nn.Module):\n    def __init__(self, cfg, layer_id: int, multiple_of=256):\n        super().__init__()\n\n        self.n_heads = cfg.num_attention_heads\n        self.dim = cfg.hidden_size\n        self.head_dim = cfg.hidden_size // cfg.num_attention_heads\n        self.self_attn = NoTPAttention(cfg)\n        self.mlp = NoTPFeedForward(\n            cfg, dim=cfg.hidden_size, hidden_dim=cfg.ffn_hidden_size\n        )\n        self.layer_id = layer_id\n        self.layer_norm1 = torch.nn.LayerNorm(\n            cfg.hidden_size, eps=cfg.layernorm_epsilon\n        )\n        self.layer_norm2 = torch.nn.LayerNorm(\n            cfg.hidden_size, eps=cfg.layernorm_epsilon\n        )\n\n    def forward(self, x: torch.Tensor):\n        residual = self.self_attn.forward(self.layer_norm1(x))\n        h = x + residual\n        out = h + self.mlp.forward(self.layer_norm2(h))\n        return out\n\n\nclass NoTPTransformer(nn.Module):\n    def __init__(self, cfg):\n        super().__init__()\n\n        self.cfg = cfg\n        # self.recompute_list = self.cfg.get(\"recompute_list\", [])\n        self.num_layers = cfg.num_layers  # _get_num_layers(cfg)\n\n        self.layers = torch.nn.ModuleList()\n        for layer_id in range(self.num_layers):\n            self.layers.append(\n                NoTPTransformerBlock(\n                    cfg,\n                    layer_id + 1,\n                )\n            )\n\n    def forward(\n            self,\n            hidden_states,\n    ):\n\n        for lid, layer in 
enumerate(self.layers):\n            # if lid in self.recompute_list:\n            #     def custom(layer_id):\n            #         def custom_forward(*args, **kwargs):\n            #             x_ = self.layers[layer_id](*args, **kwargs)\n            #             return x_\n\n            #         return custom_forward\n\n            #     assert hidden_states.requires_grad == True, logger.warning(\n            #         \"When using recalculation, the input must have grad fn\"\n            #     )\n            #     hidden_states = tensor_parallel.checkpoint(\n            #         custom(lid),\n            #         False,\n            #         hidden_states.contiguous()\n            #     )\n            # else:\n            hidden_states = layer(hidden_states)\n\n        return hidden_states\n\n\n# from megatron.core.tensor_parallel.layers import non_tensor_paralleled, local_dp_reduce, local_dp_scatter\n\nclass VitModel(nn.Module):\n    def __init__(\n            self,\n            cfg,\n            freeze_embed=False,\n            freeze_pre_norm=False\n    ) -> None:\n        super().__init__()\n\n        self.embeddings = CLIPVisionEmbeddings(hidden_size=cfg.hidden_size, image_size=cfg.image_size, patch_size=cfg.patch_size)\n\n        if freeze_embed:\n            for name, param in self.embeddings.named_parameters():\n                param.requires_grad = False\n\n        self.transformer = NoTPTransformer(cfg=cfg)\n\n        if cfg.get(\"fp32norm\", False):\n            logger.info(\"Load fp32 layernorm for ViT.\")\n            self.pre_layrnorm = LayerNormfp32(\n                cfg.hidden_size,\n                eps=cfg.get(\"pre_layernorm_epsilon\", 1e-5),\n            )\n        else:\n            self.pre_layrnorm = torch.nn.LayerNorm(\n                cfg.hidden_size,\n                eps=cfg.get(\"pre_layernorm_epsilon\", 1e-5),\n            )\n\n        # self.pre_layrnorm = RMSNorm(\n        #     cfg.hidden_size,\n        #     
eps=cfg.get(\"pre_layernorm_epsilon\", 1e-5),\n        #     sequence_parallel=False,\n        #     use_fp32=True,\n        #     use_optimus=True,\n        # )\n\n        if freeze_pre_norm:\n            for name, param in self.pre_layrnorm.named_parameters():\n                param.requires_grad = False\n\n        for p in self.parameters():\n            p.micro_dp = True\n\n    def set_input_tensor(self, input_tensor):\n        if not isinstance(input_tensor, list):\n            input_tensor = [input_tensor]\n        self.transformer.set_input_tensor(input_tensor[0])\n\n    def __str__(self) -> str:\n        return \"open_clip\"\n\n    def forward(\n            self,\n            x,\n            patch_embeds\n    ):\n        x = self.embeddings(x, patch_embeds)\n        hidden_states = self.pre_layrnorm(x)\n\n        # hidden_states, dis = local_dp_scatter(hidden_states)\n        output = self.transformer(hidden_states)\n\n        # output = local_dp_reduce(output, dis)\n\n        return output\n\n\nvit_model_cfg = adict(\n    num_layers=24,\n    hidden_size=1024,\n    num_heads = 16,\n    num_attention_heads=16,\n    ffn_hidden_size=4096,\n    seq_length=256,\n    max_position_embeddings=256,\n    use_flash_attn=False,\n    understand_projector_stride=2,\n    hidden_dropout = 0.0,\n    attention_dropout = 0.0,\n    no_persist_layer_norm = False,\n    layernorm_epsilon = 1e-5,\n    pre_layernorm_epsilon = 1e-5,\n    image_size = 224,\n    patch_size = 14,\n    recompute_list = []\n)\n\ndef build_clip_l():\n    return VitModel(\n        cfg=vit_model_cfg,\n        freeze_embed=False,\n        freeze_pre_norm=False,\n    )\n\n\nif __name__ == '__main__':\n\n    \n    from mmgpt.model.vision_encoder.sam_b import build_sam_vit_b\n\n\n\n    vit_model_cfg = adict(\n        num_layers=24,\n        hidden_size=1024,\n        num_attention_heads=16,\n        ffn_hidden_size=4096,\n        seq_length=256,\n        max_position_embeddings=256,\n        
use_flash_attn=False,\n        understand_projector_stride=2,\n        hidden_dropout = 0.0,\n        attention_dropout = 0.0,\n        no_persist_layer_norm = False,\n        layernorm_epsilon = 1e-5,\n        pre_layernorm_epsilon = 1e-5,\n        image_size = 224,\n        patch_size = 14,\n        recompute_list = []\n    )\n\n    sam_model = build_sam_vit_b()\n\n\n    vision_model = VitModel(\n        cfg=vit_model_cfg,\n        freeze_embed=False,\n        freeze_pre_norm=False,\n    )\n\n    # model = VitModel(1344)\n    # x = torch.zeros(2, 3, 224, 224)\n    x = torch.zeros(2, 3, 1024, 1024)\n\n    \n    with torch.no_grad():\n        # y = vision_model(x)\n        patch_embed = sam_model(x)\n        print(patch_embed.shape)\n        y = vision_model(x, patch_embed)\n        print(y.shape)\n\n        image_feature = torch.add(y[:, 1:], patch_embed.flatten(2).permute(0, 2, 1))\n\n        print(image_feature.shape)\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py",
    "content": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# All rights reserved.\n\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom typing import Optional, Tuple, Type\nfrom functools import partial\nfrom flash_attn import flash_attn_qkvpacked_func\n# from .common import LayerNorm2d, MLPBlock\n\n# from mmgpt.model.vision_encoder.flash_4 import _attention_rel_h_rel_w\n\n\ndef get_abs_pos(abs_pos, tgt_size):\n\n    dtype = abs_pos.dtype\n\n    src_size = abs_pos.size(1)\n\n    if src_size != tgt_size:\n        old_pos_embed = abs_pos.permute(0, 3, 1, 2)\n        old_pos_embed = old_pos_embed.to(torch.float32)\n        new_pos_embed = F.interpolate(\n            old_pos_embed,\n            size=(tgt_size, tgt_size),\n            mode='bicubic',\n            antialias=True,\n            align_corners=False,\n        ).to(dtype)\n        new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)\n        return new_pos_embed\n    else:\n        return abs_pos\n\n\n\n\nclass MLPBlock(nn.Module):\n    def __init__(\n        self,\n        embedding_dim: int,\n        mlp_dim: int,\n        act: Type[nn.Module] = nn.GELU,\n    ) -> None:\n        super().__init__()\n        self.lin1 = nn.Linear(embedding_dim, mlp_dim)\n        self.lin2 = nn.Linear(mlp_dim, embedding_dim)\n        self.act = act()\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        return self.lin2(self.act(self.lin1(x)))\n\n\n# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa\n# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa\nclass LayerNorm2d(nn.Module):\n    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:\n        super().__init__()\n        self.weight = 
nn.Parameter(torch.ones(num_channels))\n        self.bias = nn.Parameter(torch.zeros(num_channels))\n        self.eps = eps\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        u = x.mean(1, keepdim=True)\n        s = (x - u).pow(2).mean(1, keepdim=True)\n        x = (x - u) / torch.sqrt(s + self.eps)\n        x = self.weight[:, None, None] * x + self.bias[:, None, None]\n        return x\n\n\n# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa\nclass ImageEncoderViT(nn.Module):\n    def __init__(\n        self,\n        img_size: int = 1024,\n        patch_size: int = 16,\n        in_chans: int = 3,\n        embed_dim: int = 768,\n        depth: int = 12,\n        num_heads: int = 12,\n        mlp_ratio: float = 4.0,\n        out_chans: int = 256,\n        qkv_bias: bool = True,\n        norm_layer: Type[nn.Module] = nn.LayerNorm,\n        act_layer: Type[nn.Module] = nn.GELU,\n        use_abs_pos: bool = True,\n        use_rel_pos: bool = False,\n        rel_pos_zero_init: bool = True,\n        window_size: int = 0,\n        global_attn_indexes: Tuple[int, ...] 
= (),\n    ) -> None:\n        \"\"\"\n        Args:\n            img_size (int): Input image size.\n            patch_size (int): Patch size.\n            in_chans (int): Number of input image channels.\n            embed_dim (int): Patch embedding dimension.\n            depth (int): Depth of ViT.\n            num_heads (int): Number of attention heads in each ViT block.\n            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n            qkv_bias (bool): If True, add a learnable bias to query, key, value.\n            norm_layer (nn.Module): Normalization layer.\n            act_layer (nn.Module): Activation layer.\n            use_abs_pos (bool): If True, use absolute positional embeddings.\n            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.\n            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.\n            window_size (int): Window size for window attention blocks.\n            global_attn_indexes (list): Indexes for blocks using global attention.\n        \"\"\"\n        super().__init__()\n        self.img_size = img_size\n\n        self.patch_embed = PatchEmbed(\n            kernel_size=(patch_size, patch_size),\n            stride=(patch_size, patch_size),\n            in_chans=in_chans,\n            embed_dim=embed_dim,\n        )\n\n        self.pos_embed: Optional[nn.Parameter] = None\n        if use_abs_pos:\n            # Initialize absolute positional embedding with pretrain image size.\n            self.pos_embed = nn.Parameter(\n                torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)\n            )\n\n        self.blocks = nn.ModuleList()\n        for i in range(depth):\n            block = Block(\n                dim=embed_dim,\n                num_heads=num_heads,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                norm_layer=norm_layer,\n                
act_layer=act_layer,\n                use_rel_pos=use_rel_pos,\n                rel_pos_zero_init=rel_pos_zero_init,\n                window_size=window_size if i not in global_attn_indexes else 0,\n                input_size=(img_size // patch_size, img_size // patch_size),\n            )\n            self.blocks.append(block)\n\n        self.neck = nn.Sequential(\n            nn.Conv2d(\n                embed_dim,\n                out_chans,\n                kernel_size=1,\n                bias=False,\n            ),\n            LayerNorm2d(out_chans),\n            nn.Conv2d(\n                out_chans,\n                out_chans,\n                kernel_size=3,\n                padding=1,\n                bias=False,\n            ),\n            LayerNorm2d(out_chans),\n        )\n\n        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)\n        self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        x = self.patch_embed(x)\n        if self.pos_embed is not None:\n            # x = x + self.pos_embed\n            x = x + get_abs_pos(self.pos_embed, x.size(1))\n\n        for blk in self.blocks:\n            x = blk(x)\n\n        neck_output  = self.neck(x.permute(0, 3, 1, 2))\n        conv2_output  = self.net_2(neck_output)\n        # print(f\"conv2_output shape: {conv2_output.shape}\")\n        conv3_output  = self.net_3(conv2_output)\n\n        return conv3_output \n\n\nclass Block(nn.Module):\n    \"\"\"Transformer blocks with support of window attention and residual propagation blocks\"\"\"\n\n    def __init__(\n        self,\n        dim: int,\n        num_heads: int,\n        mlp_ratio: float = 4.0,\n        qkv_bias: bool = True,\n        norm_layer: Type[nn.Module] = nn.LayerNorm,\n        act_layer: Type[nn.Module] = nn.GELU,\n        use_rel_pos: bool = False,\n        rel_pos_zero_init: bool = True,\n        window_size: int 
= 0,\n        input_size: Optional[Tuple[int, int]] = None,\n    ) -> None:\n        \"\"\"\n        Args:\n            dim (int): Number of input channels.\n            num_heads (int): Number of attention heads in each ViT block.\n            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n            qkv_bias (bool): If True, add a learnable bias to query, key, value.\n            norm_layer (nn.Module): Normalization layer.\n            act_layer (nn.Module): Activation layer.\n            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.\n            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.\n            window_size (int): Window size for window attention blocks. If it equals 0, then\n                use global attention.\n            input_size (tuple(int, int) or None): Input resolution for calculating the relative\n                positional parameter size.\n        \"\"\"\n        super().__init__()\n        self.norm1 = norm_layer(dim)\n        self.attn = Attention(\n            dim,\n            num_heads=num_heads,\n            qkv_bias=qkv_bias,\n            use_rel_pos=use_rel_pos,\n            rel_pos_zero_init=rel_pos_zero_init,\n            input_size=input_size if window_size == 0 else (window_size, window_size),\n        )\n\n        self.norm2 = norm_layer(dim)\n        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)\n\n        self.window_size = window_size\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        shortcut = x\n        x = self.norm1(x)\n        # Window partition\n        if self.window_size > 0:\n            H, W = x.shape[1], x.shape[2]\n            x, pad_hw = window_partition(x, self.window_size)\n\n        x = self.attn(x)\n        # Reverse window partition\n        if self.window_size > 0:\n            x = window_unpartition(x, self.window_size, pad_hw, (H, W))\n\n        x = shortcut 
+ x\n        x = x + self.mlp(self.norm2(x))\n\n        return x\n\n\nclass Attention(nn.Module):\n    \"\"\"Multi-head Attention block with relative position embeddings.\"\"\"\n\n    def __init__(\n        self,\n        dim: int,\n        num_heads: int = 8,\n        qkv_bias: bool = True,\n        use_rel_pos: bool = False,\n        rel_pos_zero_init: bool = True,\n        input_size: Optional[Tuple[int, int]] = None,\n    ) -> None:\n        \"\"\"\n        Args:\n            dim (int): Number of input channels.\n            num_heads (int): Number of attention heads.\n            qkv_bias (bool):  If True, add a learnable bias to query, key, value.\n            rel_pos (bool): If True, add relative positional embeddings to the attention map.\n            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.\n            input_size (tuple(int, int) or None): Input resolution for calculating the relative\n                positional parameter size.\n        \"\"\"\n        super().__init__()\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = head_dim**-0.5\n\n        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)\n        self.proj = nn.Linear(dim, dim)\n\n        self.use_rel_pos = use_rel_pos\n        if self.use_rel_pos:\n            assert (\n                input_size is not None\n            ), \"Input size must be provided if using relative positional encoding.\"\n            # initialize relative positional embeddings\n            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))\n            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        B, H, W, _ = x.shape\n        # qkv with shape (3, B, nHead, H * W, C)\n        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)\n        # q, k, v with shape (B * nHead, H * W, C)\n        q, 
k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)\n\n        rel_h, rel_w = None, None\n        if self.use_rel_pos:\n            rel_h, rel_w = add_decomposed_rel_pos(q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))\n\n        q = q.view(B, self.num_heads, H * W, -1)\n        k = k.view(B, self.num_heads, H * W, -1)\n        v = v.view(B, self.num_heads, H * W, -1)\n\n        if self.use_rel_pos:\n            rel_h = rel_h.view(B, self.num_heads, rel_h.size(1), rel_h.size(2), rel_h.size(3))\n            rel_w = rel_w.view(B, self.num_heads, rel_w.size(1), rel_w.size(2), rel_w.size(3))\n            attn_bias = (rel_h + rel_w).view(B, self.num_heads, rel_h.size(2), rel_h.size(3) * rel_w.size(4))\n            x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)\n            # x = _attention_rel_h_rel_w(q, k, v, rel_h, rel_w)\n        else:\n            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)\n            # qkv = torch.stack([q, k, v], dim=1).transpose(1, 3).reshape(B, H * W, 3, self.num_heads, -1)\n            # x = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=False).transpose(1, 2)\n\n        \n\n        x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)\n\n        x = self.proj(x)\n\n        return x\n\n\ndef window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:\n    \"\"\"\n    Partition into non-overlapping windows with padding if needed.\n    Args:\n        x (tensor): input tokens with [B, H, W, C].\n        window_size (int): window size.\n\n    Returns:\n        windows: windows after partition with [B * num_windows, window_size, window_size, C].\n        (Hp, Wp): padded height and width before partition\n    \"\"\"\n    B, H, W, C = x.shape\n\n    pad_h = (window_size - H % window_size) % window_size\n    pad_w = (window_size - W % window_size) % window_size\n    if pad_h > 0 or pad_w > 0:\n        x = F.pad(x, 
(0, 0, 0, pad_w, 0, pad_h))\n    Hp, Wp = H + pad_h, W + pad_w\n\n    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)\n    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)\n    return windows, (Hp, Wp)\n\n\ndef window_unpartition(\n    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]\n) -> torch.Tensor:\n    \"\"\"\n    Window unpartition into original sequences and removing padding.\n    Args:\n        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].\n        window_size (int): window size.\n        pad_hw (Tuple): padded height and width (Hp, Wp).\n        hw (Tuple): original height and width (H, W) before padding.\n\n    Returns:\n        x: unpartitioned sequences with [B, H, W, C].\n    \"\"\"\n    Hp, Wp = pad_hw\n    H, W = hw\n    B = windows.shape[0] // (Hp * Wp // window_size // window_size)\n    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)\n    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)\n\n    if Hp > H or Wp > W:\n        x = x[:, :H, :W, :].contiguous()\n    return x\n\n\ndef get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:\n    \"\"\"\n    Get relative positional embeddings according to the relative positions of\n        query and key sizes.\n    Args:\n        q_size (int): size of query q.\n        k_size (int): size of key k.\n        rel_pos (Tensor): relative position embeddings (L, C).\n\n    Returns:\n        Extracted positional embeddings according to relative positions.\n    \"\"\"\n    max_rel_dist = int(2 * max(q_size, k_size) - 1)\n    # Interpolate rel pos if needed.\n    if rel_pos.shape[0] != max_rel_dist:\n        # Interpolate rel pos.\n        dtype = rel_pos.dtype\n        rel_pos = rel_pos.to(torch.float32)\n        rel_pos_resized = F.interpolate(\n            rel_pos.reshape(1, 
rel_pos.shape[0], -1).permute(0, 2, 1),\n            size=max_rel_dist,\n            mode=\"linear\",\n        ).to(dtype)\n        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)\n    else:\n        rel_pos_resized = rel_pos\n\n    # Scale the coords with short length if shapes for q and k are different.\n    q_coords = torch.arange(q_size, device=rel_pos.device)[:, None] * max(k_size / q_size, 1.0)\n    k_coords = torch.arange(k_size, device=rel_pos.device)[None, :] * max(q_size / k_size, 1.0)\n    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)\n\n    return rel_pos_resized[relative_coords.long()]\n\n\ndef add_decomposed_rel_pos(\n    q: torch.Tensor,\n    rel_pos_h: torch.Tensor,\n    rel_pos_w: torch.Tensor,\n    q_size: Tuple[int, int],\n    k_size: Tuple[int, int],\n) -> torch.Tensor:\n    \"\"\"\n    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.\n    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950\n    Args:\n        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).\n        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.\n        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.\n        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).\n        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).\n\n    Returns:\n        attn (Tensor): attention map with added relative positional embeddings.\n    \"\"\"\n    q_h, q_w = q_size\n    k_h, k_w = k_size\n    Rh = get_rel_pos(q_h, k_h, rel_pos_h)\n    Rw = get_rel_pos(q_w, k_w, rel_pos_w)\n\n    B, _, dim = q.shape\n    r_q = q.reshape(B, q_h, q_w, dim)\n    rel_h = torch.einsum(\"bhwc,hkc->bhwk\", r_q, Rh)\n    rel_w = torch.einsum(\"bhwc,wkc->bhwk\", r_q, Rw)\n    rel_h = rel_h.unsqueeze(-1)\n    rel_w = rel_w.unsqueeze(-2)\n    rel_h = 
rel_h.reshape(B, q_h * q_w, k_h, 1)\n    rel_w = rel_w.reshape(B, q_h * q_w, 1, k_w)\n\n    return rel_h, rel_w\n\n\nclass PatchEmbed(nn.Module):\n    \"\"\"\n    Image to Patch Embedding.\n    \"\"\"\n\n    def __init__(\n        self,\n        kernel_size: Tuple[int, int] = (16, 16),\n        stride: Tuple[int, int] = (16, 16),\n        padding: Tuple[int, int] = (0, 0),\n        in_chans: int = 3,\n        embed_dim: int = 768,\n    ) -> None:\n        \"\"\"\n        Args:\n            kernel_size (Tuple): kernel size of the projection layer.\n            stride (Tuple): stride of the projection layer.\n            padding (Tuple): padding size of the projection layer.\n            in_chans (int): Number of input image channels.\n            embed_dim (int): Patch embedding dimension.\n        \"\"\"\n        super().__init__()\n\n        self.proj = nn.Conv2d(\n            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding\n        )\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        x = self.proj(x)\n        # B C H W -> B H W C\n        x = x.permute(0, 2, 3, 1)\n        return x\n\n\ndef build_sam_vit_b(checkpoint=None):\n    return _build_sam(\n        encoder_embed_dim=768,\n        encoder_depth=12,\n        encoder_num_heads=12,\n        encoder_global_attn_indexes=[2, 5, 8, 11],\n        checkpoint=checkpoint,\n    )\n\n\ndef _build_sam(\n    encoder_embed_dim,\n    encoder_depth,\n    encoder_num_heads,\n    encoder_global_attn_indexes,\n    checkpoint=None,\n):\n    prompt_embed_dim = 256\n    image_size = 1024\n    vit_patch_size = 16\n    image_embedding_size = image_size // vit_patch_size\n    image_encoder=ImageEncoderViT(\n            depth=encoder_depth,\n            embed_dim=encoder_embed_dim,\n            img_size=image_size,\n            mlp_ratio=4,\n            norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),\n            num_heads=encoder_num_heads,\n            patch_size=vit_patch_size,\n     
       qkv_bias=True,\n            use_rel_pos=True,\n            global_attn_indexes=encoder_global_attn_indexes,\n            window_size=14,\n            out_chans=prompt_embed_dim,\n        )\n    \n    if checkpoint is not None:\n        # with open(checkpoint, \"rb\") as f:\n        state_dict = torch.load(checkpoint)\n        # print(state_dict.keys())\n        # for key in state_dict:\n        # image_encoder.load_state_dict({k[14:]: v for k, v in state_dict.items() if 'image_encoder' in k}, strict=False)\n        # ocr-anyting\n        # image_encoder.load_state_dict(state_dict, strict=True)\n        # tob\n        image_encoder.load_state_dict({k[30:]: v for k, v in state_dict.items() if 'vision_tower_high' in k}, strict=True)\n        print(checkpoint)\n    return image_encoder"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepseek_ocr.py",
    "content": "\n\"\"\"Inference-only Deepseek-OCR model compatible with HuggingFace weights.\"\"\"\nimport math\nfrom collections.abc import Iterable, Mapping, Sequence\nfrom typing import List, Literal, Optional, Set, Tuple, TypedDict, Union\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom einops import rearrange, repeat\nfrom transformers import BatchFeature\n\nfrom vllm.config import VllmConfig\nfrom vllm.model_executor import SamplingMetadata\nfrom vllm.model_executor.layers.quantization import QuantizationConfig\nfrom vllm.model_executor.model_loader.utils import set_default_torch_dtype\nfrom vllm.multimodal import MULTIMODAL_REGISTRY\nfrom vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,\n                                    MultiModalKwargs, NestedTensors)\nfrom vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,\n                                   ImageSize, MultiModalDataItems)\nfrom vllm.multimodal.processing import (BaseMultiModalProcessor,\n                                        BaseProcessingInfo, PromptReplacement,\n                                        PromptUpdate)\nfrom vllm.multimodal.profiling import BaseDummyInputsBuilder\nfrom vllm.sequence import IntermediateTensors\nfrom vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,\n                                                          MlpProjectorConfig,\n                                                          VisionEncoderConfig)\nfrom process.image_process import (\n    DeepseekOCRProcessor, count_tiles)\nfrom vllm.transformers_utils.tokenizer import cached_tokenizer_from_config\n# from vllm.utils import is_list_of\n\nfrom vllm.model_executor.models.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP\nfrom vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,\n                    init_vllm_registered_model, maybe_prefix,\n                    
merge_multimodal_embeddings)\n\nfrom deepencoder.sam_vary_sdpa import build_sam_vit_b\nfrom deepencoder.clip_sdpa import build_clip_l\nfrom deepencoder.build_linear import MlpProjector\nfrom addict import Dict\n# import time\nfrom config import IMAGE_SIZE, BASE_SIZE, CROP_MODE, PRINT_NUM_VIS_TOKENS, PROMPT\n# The image token id may be various\n_IMAGE_TOKEN = \"<image>\"\n\n\nclass DeepseekOCRProcessingInfo(BaseProcessingInfo):\n\n    def get_hf_config(self):\n        return self.ctx.get_hf_config(DeepseekVLV2Config)\n\n    def get_hf_processor(self, **kwargs: object):\n        return self.ctx.get_hf_processor(DeepseekOCRProcessor, **kwargs)\n\n    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:\n        return {\"image\": None}\n\n    def get_num_image_tokens(self,\n                             *,\n                             image_width: int,\n                             image_height: int,\n                             cropping: bool = True) -> int:\n        hf_processor = self.get_hf_processor()\n\n\n        # image_size = hf_processor.image_size\n        # patch_size = hf_processor.patch_size\n        # downsample_ratio = hf_processor.downsample_ratio\n\n        image_size = IMAGE_SIZE\n        base_size = BASE_SIZE\n        patch_size = 16\n        downsample_ratio = 4\n\n        if CROP_MODE:\n            if image_width <= 640 and image_height <= 640:\n                crop_ratio = [1, 1]\n            else:\n                # images_crop_raw, crop_ratio = hf_processor.dynamic_preprocess(image)\n\n                # find the closest aspect ratio to the target\n                crop_ratio = count_tiles(image_width, image_height, image_size=IMAGE_SIZE)\n\n                # print('===========')\n                # print('crop_ratio ', crop_ratio)\n                # print('============')\n                \n            num_width_tiles, num_height_tiles = crop_ratio\n        else:\n            num_width_tiles = num_height_tiles = 1\n\n        h = w = 
math.ceil((base_size // patch_size) / downsample_ratio)\n\n        h2 = w2 = math.ceil((image_size // patch_size) / downsample_ratio)\n\n        global_views_tokens = h * (w + 1)\n        if num_width_tiles >1 or num_height_tiles>1:\n            local_views_tokens = (num_height_tiles * h2) * (num_width_tiles * w2 + 1)\n        else:\n            local_views_tokens = 0\n\n\n        return global_views_tokens + local_views_tokens + 1\n\n    def get_image_size_with_most_features(self) -> ImageSize:\n\n        if IMAGE_SIZE == 1024 and BASE_SIZE == 1280:\n            return ImageSize(width=1024*2, height=1024*2)\n        return ImageSize(width=640*2, height=640*2)\n\n\nclass DeepseekOCRDummyInputsBuilder(\n        BaseDummyInputsBuilder[DeepseekOCRProcessingInfo]):\n\n    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:\n        num_images = mm_counts.get(\"image\", 0)\n\n        processor = self.info.get_hf_processor()\n        image_token = processor.image_token\n\n        return image_token * num_images\n\n    def get_dummy_mm_data(\n        self,\n        seq_len: int,\n        mm_counts: Mapping[str, int],\n    ) -> MultiModalDataDict:\n        num_images = mm_counts.get(\"image\", 0)\n\n        max_image_size = self.info.get_image_size_with_most_features()\n\n        if '<image>' in PROMPT:\n            return {\n                \"image\":\n                DeepseekOCRProcessor().tokenize_with_images(images = self._get_dummy_images(width=max_image_size.width,\n                                    height=max_image_size.height,\n                                    num_images=num_images), bos=True, eos=True, cropping=CROP_MODE)\n            }\n        else:\n            return {\n                \"image\": []\n            }\n\n\n\n\nclass DeepseekOCRMultiModalProcessor(\n        BaseMultiModalProcessor[DeepseekOCRProcessingInfo]):\n    \n\n    def _call_hf_processor(\n        self,\n        prompt: str,\n        mm_data: Mapping[str, object],\n        
mm_kwargs: Mapping[str, object],\n    ) -> BatchFeature:\n        \n        \n        # print(mm_data)\n        if mm_data:\n            processed_outputs = self.info.ctx.call_hf_processor(\n                self.info.get_hf_processor(**mm_kwargs),\n                dict(prompt=prompt, **mm_data),\n                mm_kwargs,\n            )\n\n        else:\n            tokenizer = self.info.get_tokenizer()\n            processed_outputs = tokenizer(prompt,\n                                          add_special_tokens=True,\n                                          return_tensors=\"pt\")\n\n        return processed_outputs\n\n    def _get_mm_fields_config(\n        self,\n        hf_inputs: BatchFeature,\n        hf_processor_mm_kwargs: Mapping[str, object],\n    ) -> Mapping[str, MultiModalFieldConfig]:\n        return dict(\n            pixel_values=MultiModalFieldConfig.batched(\"image\"),\n            images_spatial_crop=MultiModalFieldConfig.batched(\"image\"),\n            # image_embeds=MultiModalFieldConfig.batched(\"image2\"),\n            images_crop=MultiModalFieldConfig.batched(\"image\"),\n        )\n\n    def _get_prompt_updates(\n        self,\n        mm_items: MultiModalDataItems,\n        hf_processor_mm_kwargs: Mapping[str, object],\n        out_mm_kwargs: MultiModalKwargs,\n    ) -> Sequence[PromptUpdate]:\n        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)\n\n        image_token_id = hf_processor.image_token_id\n        assert isinstance(image_token_id, int)\n\n        def get_replacement_deepseek_vl2(item_idx: int):\n            images = mm_items.get_items(\n                \"image\", (ImageEmbeddingItems, ImageProcessorItems))\n\n\n\n            if isinstance(images, ImageEmbeddingItems):\n                num_image_tokens = images.get_feature_size(item_idx)\n            else:\n\n                \n                width = images[0][-1][0][0]\n                height = images[0][-1][0][1]\n\n                num_image_tokens 
= self.info.get_num_image_tokens(\n                    image_width=width,\n                    image_height=height,\n                    # flag = True,\n                    cropping=CROP_MODE,\n                )\n            return [image_token_id] * num_image_tokens\n\n        return [\n            PromptReplacement(\n                modality=\"image\",\n                target=[image_token_id],\n                replacement=get_replacement_deepseek_vl2,\n            )\n        ]\n\n    def _cached_apply_hf_processor(\n        self,\n        prompt: Union[str, list[int]],\n        mm_data_items: MultiModalDataItems,\n        hf_processor_mm_kwargs: Mapping[str, object],\n    ) -> tuple[list[int], MultiModalKwargs, bool]:\n        # The processor logic is different for len(images) <= 2 vs > 2\n        # Since the processing cache assumes that the processor output is\n        # invariant of how many images are passed per prompt, we only\n        # perform caching for the most common case\n        if mm_data_items.get_count(\"image\", strict=False) > 2:\n            # This code path corresponds to the cache being disabled\n            return self._apply_hf_processor_main(\n                prompt=prompt,\n                mm_items=mm_data_items,\n                hf_processor_mm_kwargs=hf_processor_mm_kwargs,\n                enable_hf_prompt_update=True,\n            )\n\n        return super()._cached_apply_hf_processor(\n            prompt=prompt,\n            mm_data_items=mm_data_items,\n            hf_processor_mm_kwargs=hf_processor_mm_kwargs,\n        )\n\n\n@MULTIMODAL_REGISTRY.register_processor(\n    DeepseekOCRMultiModalProcessor,\n    info=DeepseekOCRProcessingInfo,\n    dummy_inputs=DeepseekOCRDummyInputsBuilder)\nclass DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):\n\n    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={\n        \"language.\": \"language_model.\",\n    })\n\n    def __init__(self, *, vllm_config: VllmConfig, 
prefix: str = \"\"):\n        super().__init__()\n\n        config: DeepseekVLV2Config = vllm_config.model_config.hf_config\n        quant_config = vllm_config.quant_config\n        multimodal_config = vllm_config.model_config.multimodal_config\n\n        # config.model_type ='deepseek_vl_v2'\n\n        self.config = config\n        self.multimodal_config = multimodal_config\n\n\n        self.vision_config = config.vision_config\n        self.projector_config = config.projector_config\n        self.text_config = config.text_config\n\n        model_config = vllm_config.model_config\n        tokenizer = cached_tokenizer_from_config(model_config)\n        self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]\n\n        self.sam_model = build_sam_vit_b()\n        self.vision_model = build_clip_l()\n\n        n_embed = 1280\n        self.projector =  MlpProjector(Dict(projector_type=\"linear\", input_dim=2048, n_embed=n_embed))\n        self.tile_tag = config.tile_tag\n        self.global_view_pos = config.global_view_pos\n    \n        # self.sam_model = torch.compile(self.sam_model, mode=\"reduce-overhead\")\n        # self.vision_model = torch.compile(self.vision_model, mode=\"reduce-overhead\")\n        # self.projector = torch.compile(self.projector, mode=\"max-autotune\")\n\n\n\n\n        # special token for image token sequence format\n        embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))\n        if self.tile_tag == \"2D\":\n            # <|view_separator|>, <|\\n|>\n            self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)\n            self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)\n        else:\n            raise ValueError(\n                f\"Only 2D tile_tag is supported currently, got: {self.tile_tag}\"\n            )\n\n        if self.text_config.topk_method == \"noaux_tc\":\n            architectures = [\"DeepseekV3ForCausalLM\"]\n        elif not self.text_config.use_mla:\n            
architectures = [\"DeepseekForCausalLM\"]\n        else:\n            architectures = [\"DeepseekV2ForCausalLM\"]\n\n        self.language_model = init_vllm_registered_model(\n            vllm_config=vllm_config,\n            hf_config=self.text_config,\n            prefix=maybe_prefix(prefix, \"language\"),\n            architectures=architectures,\n        )\n\n        self.make_empty_intermediate_tensors = (\n            self.language_model.make_empty_intermediate_tensors)\n\n\n\n    def _parse_and_validate_image_input(\n            self, **kwargs: object):\n        \n        pixel_values = kwargs.pop(\"pixel_values\", None)\n        images_spatial_crop = kwargs.pop(\"images_spatial_crop\", None)\n        images_crop = kwargs.pop(\"images_crop\", None)\n\n\n        if pixel_values is None or torch.sum(pixel_values).item() == 0:\n            return None\n\n        if pixel_values is not None:\n            if not isinstance(pixel_values, (torch.Tensor, list)):\n                raise ValueError(\"Incorrect type of pixel values. \"\n                                 f\"Got type: {type(pixel_values)}\")\n\n            if not isinstance(images_spatial_crop, (torch.Tensor, list)):\n                raise ValueError(\"Incorrect type of image sizes. \"\n                                 f\"Got type: {type(images_spatial_crop)}\")\n            \n            if not isinstance(images_crop, (torch.Tensor, list)):\n                raise ValueError(\"Incorrect type of image crop. 
\"\n                                 f\"Got type: {type(images_crop)}\")\n\n            return [pixel_values, images_crop, images_spatial_crop]\n\n\n        raise AssertionError(\"This line should be unreachable.\")\n    \n\n\n    def _pixel_values_to_embedding(\n        self,\n        pixel_values: torch.Tensor,\n        images_crop: torch.Tensor,\n        images_spatial_crop: torch.Tensor,\n    ) -> NestedTensors:\n\n        # Pixel_values (global view): [n_image, batch_size, 3, height, width]\n        # images_spatial_crop: [n_image, batch_size, [num_tiles_w, num_tiles_h]]\n        # images_crop (local view): [n_image, batch_size, num_pathes, 3, h, w]\n        # split the pixel and image_crop, all batch_size = 1\n\n        images_in_this_batch = []\n\n\n        # print(type(images_crop))\n\n        # print(pixel_values.shape)\n\n\n        with torch.no_grad():\n            for jdx in range(images_spatial_crop.size(0)):\n                # with torch.set_grad_enabled(False):\n                patches = images_crop[jdx][0].to(torch.bfloat16) # batch_size = 1\n                image_ori = pixel_values[jdx]\n                crop_shape = images_spatial_crop[jdx][0]\n\n                if torch.sum(patches).item() != 0:  # if all values = 0, no crop\n                    # P, C, H, W = patches.shape\n                    # crop_flag = 1\n                    local_features_1 = self.sam_model(patches)\n                    #TODO del patches \n                    # torch.compiler.cudagraph_mark_step_begin()\n                    local_features_2 = self.vision_model(patches, local_features_1)  \n\n\n                    local_features = torch.cat((local_features_2[:, 1:], local_features_1.flatten(2).permute(0, 2, 1)), dim=-1) \n                    local_features = self.projector(local_features)\n\n\n                    global_features_1 = self.sam_model(image_ori)\n                    global_features_2 = self.vision_model(image_ori, global_features_1) \n                    
global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1) \n                    global_features = self.projector(global_features)\n\n                    if PRINT_NUM_VIS_TOKENS:\n                        print('=====================')\n                        print('BASE: ', global_features.shape)\n                        print('PATCHES: ', local_features.shape)\n                        print('=====================')\n\n                    _, hw, n_dim = global_features.shape\n                    h = w = int(hw ** 0.5)\n\n                    _2, hw2, n_dim2 = local_features.shape\n                    h2 = w2 = int(hw2 ** 0.5)\n\n                    width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]\n\n                    global_features = global_features.view(h, w, n_dim)\n\n                    global_features = torch.cat(\n                        [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1\n                    )\n\n                    global_features = global_features.view(-1, n_dim)\n\n\n                    local_features = local_features.view(height_crop_num, width_crop_num, h2, w2, n_dim2).permute(0, 2, 1, 3, 4).reshape(height_crop_num*h2, width_crop_num*w2, n_dim2)\n                    local_features = torch.cat(\n                        [local_features, self.image_newline[None, None, :].expand(height_crop_num * h2, 1, n_dim2)], dim=1\n                    )\n                    local_features = local_features.view(-1, n_dim2)\n\n                    global_local_features = torch.cat([local_features, global_features, self.view_seperator[None, :]], dim=0)\n                \n                else:\n                    global_features_1 = self.sam_model(image_ori)\n                    global_features_2 = self.vision_model(image_ori, global_features_1) \n                    global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 
2, 1)), dim=-1) \n                    global_features = self.projector(global_features)\n\n                    if PRINT_NUM_VIS_TOKENS:\n                        print('=====================')\n                        print('BASE: ', global_features.shape)\n                        print('NO PATCHES')\n                        print('=====================')\n\n                    _, hw, n_dim = global_features.shape\n                    h = w = int(hw ** 0.5)\n\n                    global_features = global_features.view(h, w, n_dim)\n\n                    global_features = torch.cat(\n                        [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1\n                    )\n\n                    global_features = global_features.view(-1, n_dim)\n\n                    global_local_features = torch.cat([global_features, self.view_seperator[None, :]], dim=0)\n\n                images_in_this_batch.append(global_local_features)\n\n        return images_in_this_batch\n\n    def _process_image_input(\n            self, image_input) -> torch.Tensor:\n        \n\n        # image_input: [pixel_values, images_crop, images_spatial_crop]\n    \n        pixel_values = image_input[0].to(torch.bfloat16)\n        # print(image_input[1][0].shape)\n        # print(type(image_input[1]))\n        # exit()\n\n        # images_crop = image_input[1].to(torch.bfloat16)\n        images_crop = image_input[1]\n        # images_crop = image_input[1]\n        images_spatial_crop = image_input[2].to(dtype=torch.long)\n\n        # local_start = time.time()\n        vision_features = self._pixel_values_to_embedding(\n            pixel_values=pixel_values, images_crop = images_crop,  images_spatial_crop=images_spatial_crop)\n\n        # local_total_time = time.time() - local_start\n\n        # print('encoder_time: ', local_total_time)\n        # exit()\n        return vision_features\n\n    def get_language_model(self) -> torch.nn.Module:\n        return 
self.language_model\n\n    def get_multimodal_embeddings(\n            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:\n        image_input = self._parse_and_validate_image_input(**kwargs)\n        if image_input is None:\n            return None\n        vision_embeddings = self._process_image_input(image_input)\n        return vision_embeddings\n    \n\n\n    def get_input_embeddings(\n        self,\n        input_ids: torch.Tensor,\n        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,\n    ) -> torch.Tensor:\n        \n\n\n        inputs_embeds = self.language_model.get_input_embeddings(input_ids)\n\n\n        if multimodal_embeddings is not None:\n            inputs_embeds = merge_multimodal_embeddings(\n                input_ids, inputs_embeds, multimodal_embeddings,\n                self.image_token_id)\n            # print(len(multimodal_embeddings))\n            # print(input_ids.shape)\n            # print(type(inputs_embeds))\n            # print(inputs_embeds.shape)\n            \n        return inputs_embeds\n\n    def forward(self,\n                input_ids: torch.Tensor,\n                positions: torch.Tensor,\n                intermediate_tensors: Optional[IntermediateTensors] = None,\n                inputs_embeds: Optional[torch.Tensor] = None,\n                **kwargs: object):\n\n        if intermediate_tensors is not None:\n            inputs_embeds = None\n\n        # NOTE: In v1, inputs_embeds is always generated at model runner, this\n        # condition is for v0 compatibility\n        elif inputs_embeds is None:\n            vision_embeddings = self.get_multimodal_embeddings(**kwargs)\n            inputs_embeds = self.get_input_embeddings(input_ids,\n                                                      vision_embeddings)\n            input_ids = None\n\n        hidden_states = self.language_model(input_ids,\n                                            positions,\n                                            
intermediate_tensors,\n                                            inputs_embeds=inputs_embeds)\n\n        return hidden_states\n\n    def compute_logits(\n        self,\n        hidden_states: torch.Tensor,\n        sampling_metadata: SamplingMetadata,\n    ) -> Optional[torch.Tensor]:\n        return self.language_model.compute_logits(hidden_states,\n                                                  sampling_metadata)\n\n\n    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:\n        processed_weights = []\n        \n        for name, tensor in weights:\n            if 'sam_model' in name or 'vision_model' in name or 'projector' in name or 'image_newline' in name or 'view_seperator' in name:\n                new_name = name.replace('model.', '', 1)\n            else:\n                new_name = 'language.' + name\n\n            processed_weights.append((new_name, tensor))\n        \n        loader = AutoWeightsLoader(self)\n        autoloaded_weights = loader.load_weights(processed_weights, mapper=self.hf_to_vllm_mapper)\n\n\n\n\n\n        return autoloaded_weights\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/__init__.py",
    "content": ""
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/image_process.py",
    "content": "import math\nfrom typing import List, Tuple\n\nimport torch\nimport torchvision.transforms as T\nfrom PIL import Image, ImageOps\nfrom transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast\nfrom transformers.processing_utils import ProcessorMixin\nfrom config import IMAGE_SIZE, BASE_SIZE, CROP_MODE, MIN_CROPS, MAX_CROPS, PROMPT, TOKENIZER\n\ndef find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):\n    best_ratio_diff = float('inf')\n    best_ratio = (1, 1)\n    area = width * height\n    for ratio in target_ratios:\n        target_aspect_ratio = ratio[0] / ratio[1]\n        ratio_diff = abs(aspect_ratio - target_aspect_ratio)\n        if ratio_diff < best_ratio_diff:\n            best_ratio_diff = ratio_diff\n            best_ratio = ratio\n        elif ratio_diff == best_ratio_diff:\n            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:\n                best_ratio = ratio\n    # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')\n    return best_ratio\n\n\ndef count_tiles(orig_width, orig_height, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False):\n    aspect_ratio = orig_width / orig_height\n\n    # calculate the existing image aspect ratio\n    target_ratios = set(\n        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if\n        i * j <= max_num and i * j >= min_num)\n    # print(target_ratios)\n    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])\n\n    # find the closest aspect ratio to the target\n    target_aspect_ratio = find_closest_aspect_ratio(\n        aspect_ratio, target_ratios, orig_width, orig_height, image_size)\n\n    return target_aspect_ratio\n\n\ndef dynamic_preprocess(image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False):\n    orig_width, orig_height = image.size\n    aspect_ratio = orig_width / orig_height\n\n    # calculate the 
existing image aspect ratio\n    target_ratios = set(\n        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if\n        i * j <= max_num and i * j >= min_num)\n    # print(target_ratios)\n    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])\n\n    # find the closest aspect ratio to the target\n    target_aspect_ratio = find_closest_aspect_ratio(\n        aspect_ratio, target_ratios, orig_width, orig_height, image_size)\n\n    # print(target_aspect_ratio)\n    # calculate the target width and height\n    target_width = image_size * target_aspect_ratio[0]\n    target_height = image_size * target_aspect_ratio[1]\n    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]\n\n    # resize the image\n    resized_img = image.resize((target_width, target_height))\n    processed_images = []\n    for i in range(blocks):\n        box = (\n            (i % (target_width // image_size)) * image_size,\n            (i // (target_width // image_size)) * image_size,\n            ((i % (target_width // image_size)) + 1) * image_size,\n            ((i // (target_width // image_size)) + 1) * image_size\n        )\n        # split the image\n        split_img = resized_img.crop(box)\n        processed_images.append(split_img)\n    assert len(processed_images) == blocks\n    if use_thumbnail and len(processed_images) != 1:\n        thumbnail_img = image.resize((image_size, image_size))\n        processed_images.append(thumbnail_img)\n    return processed_images, target_aspect_ratio\n\n\n\n\n\nclass ImageTransform:\n\n    def __init__(self,\n                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),\n                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),\n                 normalize: bool = True):\n        self.mean = mean\n        self.std = std\n        self.normalize = normalize\n\n        transform_pipelines = [T.ToTensor()]\n\n        if normalize:\n            
transform_pipelines.append(T.Normalize(mean, std))\n\n        self.transform = T.Compose(transform_pipelines)\n\n    def __call__(self, pil_img: Image.Image):\n        x = self.transform(pil_img)\n        return x\n\n\nclass DeepseekOCRProcessor(ProcessorMixin):\n    tokenizer_class = (\"LlamaTokenizer\", \"LlamaTokenizerFast\")\n    attributes = [\"tokenizer\"]\n\n    def __init__(\n        self,\n        tokenizer: LlamaTokenizerFast = TOKENIZER,\n        candidate_resolutions: Tuple[Tuple[int, int]] = [[1024, 1024]],\n        patch_size: int = 16,\n        downsample_ratio: int = 4,\n        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),\n        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),\n        normalize: bool = True,\n        image_token: str = \"<image>\",\n        pad_token: str = \"<｜▁pad▁｜>\",\n        add_special_token: bool = False,\n        sft_format: str = \"deepseek\",\n        mask_prompt: bool = True,\n        ignore_id: int = -100,\n        **kwargs,\n    ):\n\n        # self.candidate_resolutions = candidate_resolutions # placeholder no use\n        self.image_size = IMAGE_SIZE\n        self.base_size = BASE_SIZE\n        # self.patch_size = patch_size\n        self.patch_size = 16 \n        self.image_mean = image_mean\n        self.image_std = image_std\n        self.normalize = normalize\n        # self.downsample_ratio = downsample_ratio\n        self.downsample_ratio = 4\n\n        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)\n\n\n        self.tokenizer = tokenizer\n        # self.tokenizer = add_special_token(tokenizer)\n        self.tokenizer.padding_side = 'left'  # must set this，padding side with make a difference in batch inference\n\n        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'\n        if self.tokenizer.pad_token is None:\n            self.tokenizer.add_special_tokens({'pad_token': pad_token})\n\n        # 
add image token\n        # image_token_id = self.tokenizer.vocab.get(image_token)\n        # if image_token_id is None:\n        #     special_tokens = [image_token]\n        #     special_tokens_dict = {\"additional_special_tokens\": special_tokens}\n        #     self.tokenizer.add_special_tokens(special_tokens_dict)\n        self.image_token_id = self.tokenizer.vocab.get(image_token)\n\n        # add five special tokens for grounding-related tasks\n        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>\n        # special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']\n        # special_tokens_dict = {\"additional_special_tokens\": special_tokens}\n\n        # special_tokens = ['<image>','<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>', '<td>', '</td>', '<tr>', '</tr>']\n        # special_tokens_dict = {\"additional_special_tokens\": special_tokens}\n        # self.tokenizer.add_special_tokens(special_tokens_dict)\n\n        # # add special tokens for SFT data\n        # special_tokens = [\"<|User|>\", \"<|Assistant|>\"]\n        # special_tokens_dict = {\"additional_special_tokens\": special_tokens}\n        # self.tokenizer.add_special_tokens(special_tokens_dict)\n\n        self.image_token = image_token\n        self.pad_token = pad_token\n        self.add_special_token = add_special_token\n        self.sft_format = sft_format\n        self.mask_prompt = mask_prompt\n        self.ignore_id = ignore_id\n\n        super().__init__(\n            tokenizer,\n            **kwargs,\n        )\n\n\n    \n\n    # def select_best_resolution(self, image_size):\n    #     # used for cropping\n    #     original_width, original_height = image_size\n    #     best_fit = None\n    #     max_effective_resolution = 0\n    #     min_wasted_resolution = float(\"inf\")\n\n    #     for width, height in self.candidate_resolutions:\n    #         scale = min(width / original_width, height / original_height)\n    #         
downscaled_width, downscaled_height = int(\n    #             original_width * scale), int(original_height * scale)\n    #         effective_resolution = min(downscaled_width * downscaled_height,\n    #                                    original_width * original_height)\n    #         wasted_resolution = (width * height) - effective_resolution\n\n    #         if effective_resolution > max_effective_resolution or (\n    #                 effective_resolution == max_effective_resolution\n    #                 and wasted_resolution < min_wasted_resolution):\n    #             max_effective_resolution = effective_resolution\n    #             min_wasted_resolution = wasted_resolution\n    #             best_fit = (width, height)\n\n    #     return best_fit\n\n    @property\n    def bos_id(self):\n        return self.tokenizer.bos_token_id\n\n    @property\n    def eos_id(self):\n        return self.tokenizer.eos_token_id\n\n    @property\n    def pad_id(self):\n        return self.tokenizer.pad_token_id\n\n    def encode(self, text: str, bos: bool = True, eos: bool = False):\n        t = self.tokenizer.encode(text, add_special_tokens=False)\n\n        if bos:\n            t = [self.bos_id] + t\n        if eos:\n            t = t + [self.eos_id]\n\n        return t\n\n    def decode(self, t: List[int], **kwargs) -> str:\n        return self.tokenizer.decode(t, **kwargs)\n\n    def process_one(\n        self,\n        prompt: str,\n        images: List,\n        inference_mode: bool = True,\n        **kwargs,\n    ):\n        \"\"\"\n\n        Args:\n            prompt (str): the formatted prompt;\n            conversations (List[Dict]): conversations with a list of messages;\n            images (List[ImageType]): the list of images;\n            inference_mode (bool): if True, then remove the last eos token;\n            system_prompt (str): the system prompt;\n            **kwargs:\n\n        Returns:\n            outputs (BaseProcessorOutput): the output of the 
processor,\n                - input_ids (torch.LongTensor): [N + image tokens]\n                - target_ids (torch.LongTensor): [N + image tokens]\n                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]\n                - image_id (int): the id of the image token\n                - num_image_tokens (List[int]): the number of image tokens\n        \"\"\"\n\n        assert (prompt is not None and images is not None\n                ), \"prompt and images must be used at the same time.\"\n\n        sft_format = prompt\n\n        input_ids, pixel_values, images_crop, images_seq_mask, images_spatial_crop, num_image_tokens, _ = images[0]\n\n\n        return {\n            \"input_ids\": input_ids,\n            \"pixel_values\": pixel_values,\n            \"images_crop\": images_crop,\n            \"images_seq_mask\": images_seq_mask,\n            \"images_spatial_crop\": images_spatial_crop,\n            \"num_image_tokens\": num_image_tokens,\n        }\n\n\n        # prepare = BatchFeature(\n        #     data=dict(\n        #         input_ids=input_ids,\n        #         pixel_values=pixel_values,\n        #         images_crop = images_crop,\n        #         images_seq_mask=images_seq_mask,\n        #         images_spatial_crop=images_spatial_crop,\n        #         num_image_tokens=num_image_tokens,\n        #     ),\n        #     tensor_type=\"pt\",\n        # )\n        # return prepare\n\n    def __call__(\n        self,\n        *,\n        prompt: str,\n        images: List,\n        inference_mode: bool = True,\n        **kwargs,\n    ):\n        \"\"\"\n\n        Args:\n            prompt (str): the formatted prompt;\n            images (List[ImageType]): the list of images;\n            inference_mode (bool): if True, then remove the last eos token;\n            **kwargs:\n\n        Returns:\n            outputs (BaseProcessorOutput): the output of the processor,\n                - input_ids (torch.LongTensor): [N + image tokens]\n  
              - images (torch.FloatTensor): [n_images, 3, H, W]\n                - image_id (int): the id of the image token\n                - num_image_tokens (List[int]): the number of image tokens\n        \"\"\"\n\n        prepare = self.process_one(\n            prompt=prompt,\n            images=images,\n            inference_mode=inference_mode,\n        )\n\n        return prepare\n\n    def tokenize_with_images(\n        self,\n        # conversation: str,\n        images: List[Image.Image],\n        bos: bool = True,\n        eos: bool = True,\n        cropping: bool = True,\n    ):\n        \"\"\"Tokenize text with <image> tags.\"\"\"\n\n        # print(conversation)\n        conversation = PROMPT\n        assert conversation.count(self.image_token) == len(images)\n        text_splits = conversation.split(self.image_token)\n        images_list, images_crop_list, images_seq_mask, images_spatial_crop = [], [], [], []\n        image_shapes = []\n        num_image_tokens = []\n        tokenized_str = []\n        # print('image: ', len(images))\n        for text_sep, image in zip(text_splits, images):\n            \"\"\"encode text_sep\"\"\"\n            tokenized_sep = self.encode(text_sep, bos=False, eos=False)\n            tokenized_str += tokenized_sep\n            images_seq_mask += [False] * len(tokenized_sep)\n\n            \"\"\"select best resolution for anyres\"\"\"\n            # if cropping:\n            #     best_width, best_height = self.select_best_resolution(image.size)\n            # else:\n            #     best_width, best_height = self.image_size, self.image_size\n\n            image_shapes.append(image.size)\n\n            if image.size[0] <= 640 and image.size[1] <= 640:\n                crop_ratio = [1, 1]\n            else:\n                if cropping:\n                    # print('image-size: ', image.size)\n                    # best_width, best_height = select_best_resolution(image.size, self.candidate_resolutions)\n              
      # print('image ', image.size)\n                    # print('open_size:', image.size)\n                    images_crop_raw, crop_ratio = dynamic_preprocess(image, image_size=IMAGE_SIZE)\n                    # print('crop_ratio: ', crop_ratio)\n                else:\n                    # best_width, best_height = self.image_size, self.image_size\n                    crop_ratio = [1, 1]\n            # print(image.size, (best_width, best_height)) # check the select_best_resolutions func\n\n            # print(crop_ratio)\n            \"\"\"process the global view\"\"\"\n\n            # if cropping\n            if self.image_size <= 640 and not cropping:\n                # print('directly resize')\n                image = image.resize((self.image_size, self.image_size))\n\n            global_view = ImageOps.pad(image, (self.base_size, self.base_size),\n                                    color=tuple(int(x * 255) for x in self.image_transform.mean))\n            images_list.append(self.image_transform(global_view))\n\n            \"\"\"record height / width crop num\"\"\"\n            # width_crop_num, height_crop_num = best_width // self.image_size, best_height // self.image_size\n            num_width_tiles, num_height_tiles = crop_ratio\n            images_spatial_crop.append([num_width_tiles, num_height_tiles])\n\n\n\n\n            if num_width_tiles > 1 or num_height_tiles > 1:\n                \"\"\"process the local views\"\"\"\n                # local_view = ImageOps.pad(image, (best_width, best_height),\n                #                         color=tuple(int(x * 255) for x in self.image_transform.mean))\n                # for i in range(0, best_height, self.image_size):\n                #     for j in range(0, best_width, self.image_size):\n                #         images_crop_list.append(\n                #             self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))\n                for i in 
range(len(images_crop_raw)):\n                    images_crop_list.append(self.image_transform(images_crop_raw[i]))\n\n            # \"\"\"process the global view\"\"\"\n            # global_view = ImageOps.pad(image, (self.image_size, self.image_size),\n            #                            color=tuple(int(x * 255) for x in self.image_transform.mean))\n            # images_list.append(self.image_transform(global_view))\n\n            # \"\"\"process the local views\"\"\"\n            # local_view = ImageOps.pad(image, (best_width, best_height),\n            #                           color=tuple(int(x * 255) for x in self.image_transform.mean))\n            # for i in range(0, best_height, self.image_size):\n            #     for j in range(0, best_width, self.image_size):\n            #         images_list.append(\n            #             self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))\n\n            # \"\"\"add image tokens\"\"\"\n            \"\"\"add image tokens\"\"\"\n            num_queries = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)\n            num_queries_base = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)\n\n\n            tokenized_image = ([self.image_token_id] * num_queries_base + [self.image_token_id]) * num_queries_base\n            tokenized_image += [self.image_token_id]\n            if num_width_tiles > 1 or num_height_tiles > 1:\n                tokenized_image += ([self.image_token_id] * (num_queries * num_width_tiles) + [self.image_token_id]) * (\n                            num_queries * num_height_tiles)\n            tokenized_str += tokenized_image\n            images_seq_mask += [True] * len(tokenized_image)\n            num_image_tokens.append(len(tokenized_image))\n\n        \"\"\"process the last text split\"\"\"\n        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)\n        tokenized_str += tokenized_sep\n        
images_seq_mask += [False] * len(tokenized_sep)\n\n        \"\"\"add the bos and eos tokens\"\"\"\n        if bos:\n            tokenized_str = [self.bos_id] + tokenized_str\n            images_seq_mask = [False] + images_seq_mask\n        if eos:\n            tokenized_str = tokenized_str + [self.eos_id]\n            images_seq_mask = images_seq_mask + [False]\n\n        assert len(tokenized_str) == len(\n            images_seq_mask), f\"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}\"\n        \n\n\n        masked_tokenized_str = []\n        for token_index in tokenized_str:\n            if token_index != self.image_token_id:\n                masked_tokenized_str.append(token_index)\n            else:\n                masked_tokenized_str.append(self.ignore_id)\n\n        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \\\n            (f\"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, \"\n             f\"imags_seq_mask's length {len(images_seq_mask)}, are not equal\")\n\n        input_ids = torch.LongTensor(tokenized_str)\n        target_ids = torch.LongTensor(masked_tokenized_str)\n        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)\n\n        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id\n        target_ids[(input_ids < 0) |\n                   (input_ids == self.image_token_id)] = self.ignore_id\n        input_ids[input_ids < 0] = self.pad_id\n\n        inference_mode = True\n\n        if inference_mode:\n            # Remove the ending eos token\n            assert input_ids[-1] == self.eos_id\n            input_ids = input_ids[:-1]\n            target_ids = target_ids[:-1]\n            images_seq_mask = images_seq_mask[:-1]\n\n        if len(images_list) == 0:\n            pixel_values = torch.zeros((1, 3, self.base_size, self.base_size))\n            
images_spatial_crop = torch.zeros((1, 1), dtype=torch.long)\n            images_crop = torch.zeros((1, 3, self.image_size, self.image_size)).unsqueeze(0)\n        else:\n            pixel_values = torch.stack(images_list, dim=0)\n            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)\n            if images_crop_list:\n                images_crop = torch.stack(images_crop_list, dim=0).unsqueeze(0)\n            else:\n                images_crop = torch.zeros((1, 3, self.image_size, self.image_size)).unsqueeze(0)\n\n        input_ids = input_ids.unsqueeze(0)\n\n        \n        return [[input_ids, pixel_values, images_crop, images_seq_mask, images_spatial_crop, num_image_tokens, image_shapes]]\n\n\nAutoProcessor.register(\"DeepseekVLV2Processor\", DeepseekOCRProcessor)\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/ngram_norepeat.py",
    "content": "import torch\nfrom transformers import LogitsProcessor\nfrom transformers.generation.logits_process import _calc_banned_ngram_tokens\nfrom typing import List, Set\n\n\nclass NoRepeatNGramLogitsProcessor(LogitsProcessor):\n\n    def __init__(self, ngram_size: int, window_size: int = 100, whitelist_token_ids: set = None):\n        if not isinstance(ngram_size, int) or ngram_size <= 0:\n            raise ValueError(f\"`ngram_size` has to be a strictly positive integer, but is {ngram_size}\")\n        if not isinstance(window_size, int) or window_size <= 0:\n            raise ValueError(f\"`window_size` has to be a strictly positive integer, but is {window_size}\")\n        self.ngram_size = ngram_size\n        self.window_size = window_size\n        self.whitelist_token_ids = whitelist_token_ids or set()\n    \n    def __call__(self, input_ids: List[int], scores: torch.FloatTensor) -> torch.FloatTensor:\n        if len(input_ids) < self.ngram_size:\n            return scores\n        \n        current_prefix = tuple(input_ids[-(self.ngram_size - 1):])\n        \n        search_start = max(0, len(input_ids) - self.window_size)\n        search_end = len(input_ids) - self.ngram_size + 1\n        \n        banned_tokens = set()\n        for i in range(search_start, search_end):\n            ngram = tuple(input_ids[i:i + self.ngram_size])\n            if ngram[:-1] == current_prefix:\n                banned_tokens.add(ngram[-1])\n        \n        banned_tokens = banned_tokens - self.whitelist_token_ids\n        \n        if banned_tokens:\n            scores = scores.clone()\n            for token in banned_tokens:\n                scores[token] = -float(\"inf\")\n        \n        return scores"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/run_dpsk_ocr_eval_batch.py",
    "content": "import os\nimport re\nfrom tqdm import tqdm\nimport torch\nif torch.version.cuda == '11.8':\n    os.environ[\"TRITON_PTXAS_PATH\"] = \"/usr/local/cuda-11.8/bin/ptxas\"\nos.environ['VLLM_USE_V1'] = '0'\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n\nfrom config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, MAX_CONCURRENCY, CROP_MODE, NUM_WORKERS\nfrom concurrent.futures import ThreadPoolExecutor\nimport glob\nfrom PIL import Image\nfrom deepseek_ocr import DeepseekOCRForCausalLM\n\nfrom vllm.model_executor.models.registry import ModelRegistry\n\nfrom vllm import LLM, SamplingParams\nfrom process.ngram_norepeat import NoRepeatNGramLogitsProcessor\nfrom process.image_process import DeepseekOCRProcessor\nModelRegistry.register_model(\"DeepseekOCRForCausalLM\", DeepseekOCRForCausalLM)\n\n\nllm = LLM(\n    model=MODEL_PATH,\n    hf_overrides={\"architectures\": [\"DeepseekOCRForCausalLM\"]},\n    block_size=256,\n    enforce_eager=False,\n    trust_remote_code=True, \n    max_model_len=8192,\n    swap_space=0,\n    max_num_seqs = MAX_CONCURRENCY,\n    tensor_parallel_size=1,\n    gpu_memory_utilization=0.9,\n)\n\nlogits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=40, window_size=90, whitelist_token_ids= {128821, 128822})] #window for fast；whitelist_token_ids: <td>,</td>\n\nsampling_params = SamplingParams(\n    temperature=0.0,\n    max_tokens=8192,\n    logits_processors=logits_processors,\n    skip_special_tokens=False,\n)\n\nclass Colors:\n    RED = '\\033[31m'\n    GREEN = '\\033[32m'\n    YELLOW = '\\033[33m'\n    BLUE = '\\033[34m'\n    RESET = '\\033[0m' \n\ndef clean_formula(text):\n\n    formula_pattern = r'\\\\\\[(.*?)\\\\\\]'\n    \n    def process_formula(match):\n        formula = match.group(1)\n\n        formula = re.sub(r'\\\\quad\\s*\\([^)]*\\)', '', formula)\n        \n        formula = formula.strip()\n        \n        return r'\\[' + formula + r'\\]'\n\n    cleaned_text = re.sub(formula_pattern, process_formula, text)\n 
   \n    return cleaned_text\n\ndef re_match(text):\n    pattern = r'(<\\|ref\\|>(.*?)<\\|/ref\\|><\\|det\\|>(.*?)<\\|/det\\|>)'\n    matches = re.findall(pattern, text, re.DOTALL)\n\n\n    # mathes_image = []\n    mathes_other = []\n    for a_match in matches:\n        mathes_other.append(a_match[0])\n    return matches, mathes_other\n\ndef process_single_image(image):\n    \"\"\"single image\"\"\"\n    prompt_in = prompt\n    cache_item = {\n        \"prompt\": prompt_in,\n        \"multi_modal_data\": {\"image\": DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},\n    }\n    return cache_item\n\n\nif __name__ == \"__main__\":\n\n    # INPUT_PATH = OmniDocBench images path\n\n    os.makedirs(OUTPUT_PATH, exist_ok=True)\n\n    # print('image processing until processing prompts.....')\n\n    print(f'{Colors.RED}glob images.....{Colors.RESET}')\n\n    images_path = glob.glob(f'{INPUT_PATH}/*')\n\n    images = []\n\n    for image_path in images_path:\n        image = Image.open(image_path).convert('RGB')\n        images.append(image)\n\n    prompt = PROMPT\n\n    # batch_inputs = []\n\n\n    # for image in tqdm(images):\n\n    #     prompt_in = prompt\n    #     cache_list = [\n    #         {\n    #             \"prompt\": prompt_in,\n    #             \"multi_modal_data\": {\"image\": Image.open(image).convert('RGB')},\n    #         }\n    #     ]\n    #     batch_inputs.extend(cache_list)\n\n    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:  \n        batch_inputs = list(tqdm(\n            executor.map(process_single_image, images),\n            total=len(images),\n            desc=\"Pre-processed images\"\n        ))\n\n\n    \n\n    outputs_list = llm.generate(\n        batch_inputs,\n        sampling_params=sampling_params\n    )\n\n\n    output_path = OUTPUT_PATH\n\n    os.makedirs(output_path, exist_ok=True)\n\n    for output, image in zip(outputs_list, images_path):\n\n        content = 
output.outputs[0].text\n        mmd_det_path = output_path + image.split('/')[-1].replace('.jpg', '_det.md')\n\n        with open(mmd_det_path, 'w', encoding='utf-8') as afile:\n            afile.write(content)\n\n        content = clean_formula(content)\n        matches_ref, mathes_other = re_match(content)\n        for idx, a_match_other in enumerate(tqdm(mathes_other, desc=\"other\")):\n            content = content.replace(a_match_other, '').replace('\\n\\n\\n\\n', '\\n\\n').replace('\\n\\n\\n', '\\n\\n').replace('<center>', '').replace('</center>', '')\n        \n        mmd_path = output_path + image.split('/')[-1].replace('.jpg', '.md')\n\n        with open(mmd_path, 'w', encoding='utf-8') as afile:\n            afile.write(content)\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/run_dpsk_ocr_image.py",
    "content": "import asyncio\nimport re\nimport os\n\nimport torch\nif torch.version.cuda == '11.8':\n    os.environ[\"TRITON_PTXAS_PATH\"] = \"/usr/local/cuda-11.8/bin/ptxas\"\n\nos.environ['VLLM_USE_V1'] = '0'\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n\nfrom vllm import AsyncLLMEngine, SamplingParams\nfrom vllm.engine.arg_utils import AsyncEngineArgs\nfrom vllm.model_executor.models.registry import ModelRegistry\nimport time\nfrom deepseek_ocr import DeepseekOCRForCausalLM\nfrom PIL import Image, ImageDraw, ImageFont, ImageOps\nimport numpy as np\nfrom tqdm import tqdm\nfrom process.ngram_norepeat import NoRepeatNGramLogitsProcessor\nfrom process.image_process import DeepseekOCRProcessor\nfrom config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE\n\n\n\nModelRegistry.register_model(\"DeepseekOCRForCausalLM\", DeepseekOCRForCausalLM)\n\ndef load_image(image_path):\n\n    try:\n        image = Image.open(image_path)\n        \n        corrected_image = ImageOps.exif_transpose(image)\n\n        return corrected_image\n        \n    except Exception as e:\n        print(f\"error: {e}\")\n        try:\n            return Image.open(image_path)\n        except:\n            return None\n\n\ndef re_match(text):\n    pattern = r'(<\\|ref\\|>(.*?)<\\|/ref\\|><\\|det\\|>(.*?)<\\|/det\\|>)'\n    matches = re.findall(pattern, text, re.DOTALL)\n\n\n    mathes_image = []\n    mathes_other = []\n    for a_match in matches:\n        if '<|ref|>image<|/ref|>' in a_match[0]:\n            mathes_image.append(a_match[0])\n        else:\n            mathes_other.append(a_match[0])\n    return matches, mathes_image, mathes_other\n\n\ndef extract_coordinates_and_label(ref_text, image_width, image_height):\n\n\n    try:\n        label_type = ref_text[1]\n        cor_list = eval(ref_text[2])\n    except Exception as e:\n        print(e)\n        return None\n\n    return (label_type, cor_list)\n\n\ndef draw_bounding_boxes(image, refs):\n\n    image_width, image_height 
= image.size\n    img_draw = image.copy()\n    draw = ImageDraw.Draw(img_draw)\n\n    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))\n    draw2 = ImageDraw.Draw(overlay)\n    \n    #     except IOError:\n    font = ImageFont.load_default()\n\n    img_idx = 0\n    \n    for i, ref in enumerate(refs):\n        try:\n            result = extract_coordinates_and_label(ref, image_width, image_height)\n            if result:\n                label_type, points_list = result\n                \n                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))\n\n                color_a = color + (20, )\n                for points in points_list:\n                    x1, y1, x2, y2 = points\n\n                    x1 = int(x1 / 999 * image_width)\n                    y1 = int(y1 / 999 * image_height)\n\n                    x2 = int(x2 / 999 * image_width)\n                    y2 = int(y2 / 999 * image_height)\n\n                    if label_type == 'image':\n                        try:\n                            cropped = image.crop((x1, y1, x2, y2))\n                            cropped.save(f\"{OUTPUT_PATH}/images/{img_idx}.jpg\")\n                        except Exception as e:\n                            print(e)\n                            pass\n                        img_idx += 1\n                        \n                    try:\n                        if label_type == 'title':\n                            draw.rectangle([x1, y1, x2, y2], outline=color, width=4)\n                            draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)\n                        else:\n                            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)\n                            draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)\n\n                        text_x = x1\n                        text_y = max(0, y1 - 15)\n                            \n       
                 text_bbox = draw.textbbox((0, 0), label_type, font=font)\n                        text_width = text_bbox[2] - text_bbox[0]\n                        text_height = text_bbox[3] - text_bbox[1]\n                        draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height], \n                                    fill=(255, 255, 255, 30))\n                        \n                        draw.text((text_x, text_y), label_type, font=font, fill=color)\n                    except:\n                        pass\n        except:\n            continue\n    img_draw.paste(overlay, (0, 0), overlay)\n    return img_draw\n\n\ndef process_image_with_refs(image, ref_texts):\n    result_image = draw_bounding_boxes(image, ref_texts)\n    return result_image\n\n\n\n\nasync def stream_generate(image=None, prompt=''):\n\n\n    engine_args = AsyncEngineArgs(\n        model=MODEL_PATH,\n        hf_overrides={\"architectures\": [\"DeepseekOCRForCausalLM\"]},\n        block_size=256,\n        max_model_len=8192,\n        enforce_eager=False,\n        trust_remote_code=True,  \n        tensor_parallel_size=1,\n        gpu_memory_utilization=0.75,\n    )\n    engine = AsyncLLMEngine.from_engine_args(engine_args)\n    \n    logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=30, window_size=90, whitelist_token_ids= {128821, 128822})] #whitelist: <td>, </td> \n\n    sampling_params = SamplingParams(\n        temperature=0.0,\n        max_tokens=8192,\n        logits_processors=logits_processors,\n        skip_special_tokens=False,\n        # ignore_eos=False,\n        \n    )\n    \n    request_id = f\"request-{int(time.time())}\"\n\n    printed_length = 0  \n\n    if image and '<image>' in prompt:\n        request = {\n            \"prompt\": prompt,\n            \"multi_modal_data\": {\"image\": image}\n        }\n    elif prompt:\n        request = {\n            \"prompt\": prompt\n        }\n    else:\n        assert False, f'prompt is 
none!!!'\n    async for request_output in engine.generate(\n        request, sampling_params, request_id\n    ):\n        if request_output.outputs:\n            full_text = request_output.outputs[0].text\n            new_text = full_text[printed_length:]\n            print(new_text, end='', flush=True)\n            printed_length = len(full_text)\n            final_output = full_text\n    print('\\n') \n\n    return final_output\n\n\n\n\nif __name__ == \"__main__\":\n\n    os.makedirs(OUTPUT_PATH, exist_ok=True)\n    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)\n\n    image = load_image(INPUT_PATH).convert('RGB')\n\n    \n    if '<image>' in PROMPT:\n\n        image_features = DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)\n    else:\n        image_features = ''\n\n    prompt = PROMPT\n\n    result_out = asyncio.run(stream_generate(image_features, prompt))\n\n\n    save_results = 1\n\n    if save_results and '<image>' in prompt:\n        print('='*15 + 'save results:' + '='*15)\n\n        image_draw = image.copy()\n\n        outputs = result_out\n\n        with open(f'{OUTPUT_PATH}/result_ori.mmd', 'w', encoding = 'utf-8') as afile:\n            afile.write(outputs)\n\n        matches_ref, matches_images, mathes_other = re_match(outputs)\n        # print(matches_ref)\n        result = process_image_with_refs(image_draw, matches_ref)\n\n\n        for idx, a_match_image in enumerate(tqdm(matches_images, desc=\"image\")):\n            outputs = outputs.replace(a_match_image, f'![](images/' + str(idx) + '.jpg)\\n')\n\n        for idx, a_match_other in enumerate(tqdm(mathes_other, desc=\"other\")):\n            outputs = outputs.replace(a_match_other, '').replace('\\\\coloneqq', ':=').replace('\\\\eqqcolon', '=:')\n\n        # if 'structural formula' in conversation[0]['content']:\n        #     outputs = '<smiles>' + outputs + '</smiles>'\n        with open(f'{OUTPUT_PATH}/result.mmd', 'w', encoding = 
'utf-8') as afile:\n            afile.write(outputs)\n\n        if 'line_type' in outputs:\n            import matplotlib.pyplot as plt\n            from matplotlib.patches import Circle\n            lines = eval(outputs)['Line']['line']\n\n            line_type = eval(outputs)['Line']['line_type']\n            # print(lines)\n\n            endpoints = eval(outputs)['Line']['line_endpoint']\n\n            fig, ax = plt.subplots(figsize=(3,3), dpi=200)\n            ax.set_xlim(-15, 15)\n            ax.set_ylim(-15, 15)\n\n            for idx, line in enumerate(lines):\n                try:\n                    p0 = eval(line.split(' -- ')[0])\n                    p1 = eval(line.split(' -- ')[-1])\n\n                    if line_type[idx] == '--':\n                        ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color='k')\n                    else:\n                        ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth = 0.8, color = 'k')\n\n                    ax.scatter(p0[0], p0[1], s=5, color = 'k')\n                    ax.scatter(p1[0], p1[1], s=5, color = 'k')\n                except:\n                    pass\n\n            for endpoint in endpoints:\n\n                label = endpoint.split(': ')[0]\n                (x, y) = eval(endpoint.split(': ')[1])\n                ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points', \n                            fontsize=5, fontweight='light')\n            \n            try:\n                if 'Circle' in eval(outputs).keys():\n                    circle_centers = eval(outputs)['Circle']['circle_center']\n                    radius = eval(outputs)['Circle']['radius']\n\n                    for center, r in zip(circle_centers, radius):\n                        center = eval(center.split(': ')[1])\n                        circle = Circle(center, radius=r, fill=False, edgecolor='black', linewidth=0.8)\n                        ax.add_patch(circle)\n            except:\n                
pass\n\n\n            plt.savefig(f'{OUTPUT_PATH}/geo.jpg')\n            plt.close()\n\n        result.save(f'{OUTPUT_PATH}/result_with_boxes.jpg')\n"
  },
  {
    "path": "DeepSeek-OCR-master/DeepSeek-OCR-vllm/run_dpsk_ocr_pdf.py",
    "content": "import os\nimport fitz\nimport img2pdf\nimport io\nimport re\nfrom tqdm import tqdm\nimport torch\nfrom concurrent.futures import ThreadPoolExecutor\n \n\nif torch.version.cuda == '11.8':\n    os.environ[\"TRITON_PTXAS_PATH\"] = \"/usr/local/cuda-11.8/bin/ptxas\"\nos.environ['VLLM_USE_V1'] = '0'\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n\n\nfrom config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE\n\nfrom PIL import Image, ImageDraw, ImageFont\nimport numpy as np\nfrom deepseek_ocr import DeepseekOCRForCausalLM\n\nfrom vllm.model_executor.models.registry import ModelRegistry\n\nfrom vllm import LLM, SamplingParams\nfrom process.ngram_norepeat import NoRepeatNGramLogitsProcessor\nfrom process.image_process import DeepseekOCRProcessor\n\nModelRegistry.register_model(\"DeepseekOCRForCausalLM\", DeepseekOCRForCausalLM)\n\n\nllm = LLM(\n    model=MODEL_PATH,\n    hf_overrides={\"architectures\": [\"DeepseekOCRForCausalLM\"]},\n    block_size=256,\n    enforce_eager=False,\n    trust_remote_code=True, \n    max_model_len=8192,\n    swap_space=0,\n    max_num_seqs=MAX_CONCURRENCY,\n    tensor_parallel_size=1,\n    gpu_memory_utilization=0.9,\n    disable_mm_preprocessor_cache=True\n)\n\nlogits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=50, whitelist_token_ids= {128821, 128822})] #window for fast；whitelist_token_ids: <td>,</td>\n\nsampling_params = SamplingParams(\n    temperature=0.0,\n    max_tokens=8192,\n    logits_processors=logits_processors,\n    skip_special_tokens=False,\n    include_stop_str_in_output=True,\n)\n\n\nclass Colors:\n    RED = '\\033[31m'\n    GREEN = '\\033[32m'\n    YELLOW = '\\033[33m'\n    BLUE = '\\033[34m'\n    RESET = '\\033[0m' \n\ndef pdf_to_images_high_quality(pdf_path, dpi=144, image_format=\"PNG\"):\n    \"\"\"\n    pdf2images\n    \"\"\"\n    images = []\n    \n    pdf_document = fitz.open(pdf_path)\n    \n    zoom = dpi / 72.0\n    
matrix = fitz.Matrix(zoom, zoom)\n    \n    for page_num in range(pdf_document.page_count):\n        page = pdf_document[page_num]\n\n        pixmap = page.get_pixmap(matrix=matrix, alpha=False)\n        Image.MAX_IMAGE_PIXELS = None\n\n        if image_format.upper() == \"PNG\":\n            img_data = pixmap.tobytes(\"png\")\n            img = Image.open(io.BytesIO(img_data))\n        else:\n            img_data = pixmap.tobytes(\"png\")\n            img = Image.open(io.BytesIO(img_data))\n            if img.mode in ('RGBA', 'LA'):\n                background = Image.new('RGB', img.size, (255, 255, 255))\n                background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)\n                img = background\n        \n        images.append(img)\n    \n    pdf_document.close()\n    return images\n\ndef pil_to_pdf_img2pdf(pil_images, output_path):\n\n    if not pil_images:\n        return\n    \n    image_bytes_list = []\n    \n    for img in pil_images:\n        if img.mode != 'RGB':\n            img = img.convert('RGB')\n        \n        img_buffer = io.BytesIO()\n        img.save(img_buffer, format='JPEG', quality=95)\n        img_bytes = img_buffer.getvalue()\n        image_bytes_list.append(img_bytes)\n    \n    try:\n        pdf_bytes = img2pdf.convert(image_bytes_list)\n        with open(output_path, \"wb\") as f:\n            f.write(pdf_bytes)\n\n    except Exception as e:\n        print(f\"error: {e}\")\n\n\n\ndef re_match(text):\n    pattern = r'(<\\|ref\\|>(.*?)<\\|/ref\\|><\\|det\\|>(.*?)<\\|/det\\|>)'\n    matches = re.findall(pattern, text, re.DOTALL)\n\n\n    mathes_image = []\n    mathes_other = []\n    for a_match in matches:\n        if '<|ref|>image<|/ref|>' in a_match[0]:\n            mathes_image.append(a_match[0])\n        else:\n            mathes_other.append(a_match[0])\n    return matches, mathes_image, mathes_other\n\n\ndef extract_coordinates_and_label(ref_text, image_width, image_height):\n\n\n    try:\n        
label_type = ref_text[1]\n        cor_list = eval(ref_text[2])\n    except Exception as e:\n        print(e)\n        return None\n\n    return (label_type, cor_list)\n\n\ndef draw_bounding_boxes(image, refs, jdx):\n\n    image_width, image_height = image.size\n    img_draw = image.copy()\n    draw = ImageDraw.Draw(img_draw)\n\n    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))\n    draw2 = ImageDraw.Draw(overlay)\n    \n    #     except IOError:\n    font = ImageFont.load_default()\n\n    img_idx = 0\n    \n    for i, ref in enumerate(refs):\n        try:\n            result = extract_coordinates_and_label(ref, image_width, image_height)\n            if result:\n                label_type, points_list = result\n                \n                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))\n\n                color_a = color + (20, )\n                for points in points_list:\n                    x1, y1, x2, y2 = points\n\n                    x1 = int(x1 / 999 * image_width)\n                    y1 = int(y1 / 999 * image_height)\n\n                    x2 = int(x2 / 999 * image_width)\n                    y2 = int(y2 / 999 * image_height)\n\n                    if label_type == 'image':\n                        try:\n                            cropped = image.crop((x1, y1, x2, y2))\n                            cropped.save(f\"{OUTPUT_PATH}/images/{jdx}_{img_idx}.jpg\")\n                        except Exception as e:\n                            print(e)\n                            pass\n                        img_idx += 1\n                        \n                    try:\n                        if label_type == 'title':\n                            draw.rectangle([x1, y1, x2, y2], outline=color, width=4)\n                            draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)\n                        else:\n                            draw.rectangle([x1, y1, x2, y2], 
outline=color, width=2)\n                            draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)\n\n                        text_x = x1\n                        text_y = max(0, y1 - 15)\n                            \n                        text_bbox = draw.textbbox((0, 0), label_type, font=font)\n                        text_width = text_bbox[2] - text_bbox[0]\n                        text_height = text_bbox[3] - text_bbox[1]\n                        draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height], \n                                    fill=(255, 255, 255, 30))\n                        \n                        draw.text((text_x, text_y), label_type, font=font, fill=color)\n                    except:\n                        pass\n        except:\n            continue\n    img_draw.paste(overlay, (0, 0), overlay)\n    return img_draw\n\n\ndef process_image_with_refs(image, ref_texts, jdx):\n    result_image = draw_bounding_boxes(image, ref_texts, jdx)\n    return result_image\n\n\ndef process_single_image(image):\n    \"\"\"single image\"\"\"\n    prompt_in = prompt\n    cache_item = {\n        \"prompt\": prompt_in,\n        \"multi_modal_data\": {\"image\": DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},\n    }\n    return cache_item\n\n\nif __name__ == \"__main__\":\n\n    os.makedirs(OUTPUT_PATH, exist_ok=True)\n    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)\n    \n    print(f'{Colors.RED}PDF loading .....{Colors.RESET}')\n\n\n    images = pdf_to_images_high_quality(INPUT_PATH)\n\n\n    prompt = PROMPT\n\n    # batch_inputs = []\n\n    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:  \n        batch_inputs = list(tqdm(\n            executor.map(process_single_image, images),\n            total=len(images),\n            desc=\"Pre-processed images\"\n        ))\n\n\n    # for image in tqdm(images):\n\n    #     prompt_in 
= prompt\n    #     cache_list = [\n    #         {\n    #             \"prompt\": prompt_in,\n    #             \"multi_modal_data\": {\"image\": DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},\n    #         }\n    #     ]\n    #     batch_inputs.extend(cache_list)\n\n\n    outputs_list = llm.generate(\n        batch_inputs,\n        sampling_params=sampling_params\n    )\n\n\n    output_path = OUTPUT_PATH\n\n    os.makedirs(output_path, exist_ok=True)\n\n\n    mmd_det_path = output_path + '/' + INPUT_PATH.split('/')[-1].replace('.pdf', '_det.mmd')\n    mmd_path = output_path + '/' + INPUT_PATH.split('/')[-1].replace('pdf', 'mmd')\n    pdf_out_path = output_path + '/' + INPUT_PATH.split('/')[-1].replace('.pdf', '_layouts.pdf')\n    contents_det = ''\n    contents = ''\n    draw_images = []\n    jdx = 0\n    for output, img in zip(outputs_list, images):\n        content = output.outputs[0].text\n\n        if '<｜end▁of▁sentence｜>' in content: # repeat no eos\n            content = content.replace('<｜end▁of▁sentence｜>', '')\n        else:\n            if SKIP_REPEAT:\n                continue\n\n        \n        page_num = f'\\n<--- Page Split --->'\n\n        contents_det += content + f'\\n{page_num}\\n'\n\n        image_draw = img.copy()\n\n        matches_ref, matches_images, mathes_other = re_match(content)\n        # print(matches_ref)\n        result_image = process_image_with_refs(image_draw, matches_ref, jdx)\n\n\n        draw_images.append(result_image)\n\n\n        for idx, a_match_image in enumerate(matches_images):\n            content = content.replace(a_match_image, f'![](images/' + str(jdx) + '_' + str(idx) + '.jpg)\\n')\n\n        for idx, a_match_other in enumerate(mathes_other):\n            content = content.replace(a_match_other, '').replace('\\\\coloneqq', ':=').replace('\\\\eqqcolon', '=:').replace('\\n\\n\\n\\n', '\\n\\n').replace('\\n\\n\\n', '\\n\\n')\n\n\n        contents += content + 
f'\\n{page_num}\\n'\n\n\n        jdx += 1\n\n    with open(mmd_det_path, 'w', encoding='utf-8') as afile:\n        afile.write(contents_det)\n\n    with open(mmd_path, 'w', encoding='utf-8') as afile:\n        afile.write(contents)\n\n\n    pil_to_pdf_img2pdf(draw_images, pdf_out_path)\n\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2025 DeepSeek\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "<!-- markdownlint-disable first-line-h1 -->\n<!-- markdownlint-disable html -->\n<!-- markdownlint-disable no-duplicate-header -->\n\n\n<div align=\"center\">\n  <img src=\"assets/logo.svg\" width=\"60%\" alt=\"DeepSeek AI\" />\n</div>\n\n\n<hr>\n<div align=\"center\">\n  <a href=\"https://www.deepseek.com/\" target=\"_blank\">\n    <img alt=\"Homepage\" src=\"assets/badge.svg\" />\n  </a>\n  <a href=\"https://huggingface.co/deepseek-ai/DeepSeek-OCR\" target=\"_blank\">\n    <img alt=\"Hugging Face\" src=\"https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white\" />\n  </a>\n\n</div>\n\n<div align=\"center\">\n\n  <a href=\"https://discord.gg/Tc7c45Zzu5\" target=\"_blank\">\n    <img alt=\"Discord\" src=\"https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da\" />\n  </a>\n  <a href=\"https://twitter.com/deepseek_ai\" target=\"_blank\">\n    <img alt=\"Twitter Follow\" src=\"https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white\" />\n  </a>\n\n</div>\n\n\n\n<p align=\"center\">\n  <a href=\"https://huggingface.co/deepseek-ai/DeepSeek-OCR\"><b>📥 Model Download</b></a> |\n  <a href=\"https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek_OCR_paper.pdf\"><b>📄 Paper Link</b></a> |\n  <a href=\"https://arxiv.org/abs/2510.18234\"><b>📄 Arxiv Paper Link</b></a> |\n</p>\n\n<h2>\n<p align=\"center\">\n  <a href=\"\">DeepSeek-OCR: Contexts Optical Compression</a>\n</p>\n</h2>\n\n<p align=\"center\">\n<img src=\"assets/fig1.png\" style=\"width: 1000px\" align=center>\n</p>\n<p align=\"center\">\n<a href=\"\">Explore the boundaries of visual-text compression.</a>       \n</p>\n\n## Release\n- [2026/01/27]🚀🚀🚀🚀🚀🚀 We present [DeepSeek-OCR2](https://github.com/deepseek-ai/DeepSeek-OCR-2)\n- [2025/10/23]🚀🚀🚀 DeepSeek-OCR is now officially supported in upstream 
[vLLM](https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR.html#installing-vllm). Thanks to the [vLLM](https://github.com/vllm-project/vllm) team for their help.\n- [2025/10/20]🚀🚀🚀 We release DeepSeek-OCR, a model to investigate the role of vision encoders from an LLM-centric viewpoint.\n\n## Contents\n- [Install](#install)\n- [vLLM Inference](#vllm-inference)\n- [Transformers Inference](#transformers-inference)\n  \n\n\n\n\n## Install\n>Our environment is cuda11.8+torch2.6.0.\n1. Clone this repository and navigate to the DeepSeek-OCR folder\n```bash\ngit clone https://github.com/deepseek-ai/DeepSeek-OCR.git\n```\n2. Conda\n```Shell\nconda create -n deepseek-ocr python=3.12.9 -y\nconda activate deepseek-ocr\n```\n3. Packages\n\n- download the vllm-0.8.5 [whl](https://github.com/vllm-project/vllm/releases/tag/v0.8.5) \n```Shell\npip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118\npip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl\npip install -r requirements.txt\npip install flash-attn==2.7.3 --no-build-isolation\n```\n**Note:** if you want vLLM and transformers codes to run in the same environment, you don't need to worry about this installation error like: vllm 0.8.5+cu118 requires transformers>=4.51.1\n\n## vLLM-Inference\n- VLLM:\n>**Note:** change the INPUT_PATH/OUTPUT_PATH and other settings in the DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py\n```Shell\ncd DeepSeek-OCR-master/DeepSeek-OCR-vllm\n```\n1. image: streaming output\n```Shell\npython run_dpsk_ocr_image.py\n```\n2. pdf: concurrency ~2500tokens/s(an A100-40G)\n```Shell\npython run_dpsk_ocr_pdf.py\n```\n3. 
batch eval for benchmarks\n```Shell\npython run_dpsk_ocr_eval_batch.py\n```\n\n**[2025/10/23] The version of upstream [vLLM](https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR.html#installing-vllm):**\n\n```shell\nuv venv\nsource .venv/bin/activate\n# Until v0.11.1 release, you need to install vLLM from nightly build\nuv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly\n```\n\n```python\nfrom vllm import LLM, SamplingParams\nfrom vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor\nfrom PIL import Image\n\n# Create model instance\nllm = LLM(\n    model=\"deepseek-ai/DeepSeek-OCR\",\n    enable_prefix_caching=False,\n    mm_processor_cache_gb=0,\n    logits_processors=[NGramPerReqLogitsProcessor]\n)\n\n# Prepare batched input with your image file\nimage_1 = Image.open(\"path/to/your/image_1.png\").convert(\"RGB\")\nimage_2 = Image.open(\"path/to/your/image_2.png\").convert(\"RGB\")\nprompt = \"<image>\\nFree OCR.\"\n\nmodel_input = [\n    {\n        \"prompt\": prompt,\n        \"multi_modal_data\": {\"image\": image_1}\n    },\n    {\n        \"prompt\": prompt,\n        \"multi_modal_data\": {\"image\": image_2}\n    }\n]\n\nsampling_param = SamplingParams(\n            temperature=0.0,\n            max_tokens=8192,\n            # ngram logit processor args\n            extra_args=dict(\n                ngram_size=30,\n                window_size=90,\n                whitelist_token_ids={128821, 128822},  # whitelist: <td>, </td>\n            ),\n            skip_special_tokens=False,\n        )\n# Generate output\nmodel_outputs = llm.generate(model_input, sampling_param)\n\n# Print output\nfor output in model_outputs:\n    print(output.outputs[0].text)\n```\n## Transformers-Inference\n- Transformers\n```python\nfrom transformers import AutoModel, AutoTokenizer\nimport torch\nimport os\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\nmodel_name = 'deepseek-ai/DeepSeek-OCR'\n\ntokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\nmodel = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)\nmodel = model.eval().cuda().to(torch.bfloat16)\n\n# prompt = \"<image>\\nFree OCR. \"\nprompt = \"<image>\\n<|grounding|>Convert the document to markdown. \"\nimage_file = 'your_image.jpg'\noutput_path = 'your/output/dir'\n\nres = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = True)\n```\nor you can\n```Shell\ncd DeepSeek-OCR-master/DeepSeek-OCR-hf\npython run_dpsk_ocr.py\n```\n## Support-Modes\nThe current open-source model supports the following modes:\n- Native resolution:\n  - Tiny: 512×512 （64 vision tokens）✅\n  - Small: 640×640 （100 vision tokens）✅\n  - Base: 1024×1024 （256 vision tokens）✅\n  - Large: 1280×1280 （400 vision tokens）✅\n- Dynamic resolution\n  - Gundam: n×640×640 + 1×1024×1024 ✅\n\n## Prompts examples\n```python\n# document: <image>\\n<|grounding|>Convert the document to markdown.\n# other image: <image>\\n<|grounding|>OCR this image.\n# without layouts: <image>\\nFree OCR.\n# figures in document: <image>\\nParse the figure.\n# general: <image>\\nDescribe this image in detail.\n# rec: <image>\\nLocate <|ref|>xxxx<|/ref|> in the image.\n# '先天下之忧而忧'\n```\n\n\n## Visualizations\n<table>\n<tr>\n<td><img src=\"assets/show1.jpg\" style=\"width: 500px\"></td>\n<td><img src=\"assets/show2.jpg\" style=\"width: 500px\"></td>\n</tr>\n<tr>\n<td><img src=\"assets/show3.jpg\" style=\"width: 500px\"></td>\n<td><img src=\"assets/show4.jpg\" style=\"width: 500px\"></td>\n</tr>\n</table>\n\n\n## Acknowledgement\n\nWe would like to thank [Vary](https://github.com/Ucas-HaoranWei/Vary/), [GOT-OCR2.0](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/), [MinerU](https://github.com/opendatalab/MinerU), 
[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [OneChart](https://github.com/LingyvKong/OneChart), [Slow Perception](https://github.com/Ucas-HaoranWei/Slow-Perception) for their valuable models and ideas.\n\nWe also appreciate the benchmarks: [Fox](https://github.com/ucaslcl/Fox), [OmniDocBench](https://github.com/opendatalab/OmniDocBench).\n\n## Citation\n\n```bibtex\n@article{wei2025deepseek,\n  title={DeepSeek-OCR: Contexts Optical Compression},\n  author={Wei, Haoran and Sun, Yaofeng and Li, Yukun},\n  journal={arXiv preprint arXiv:2510.18234},\n  year={2025}\n}\n```\n"
  },
  {
    "path": "requirements.txt",
    "content": "transformers==4.46.3\ntokenizers==0.20.3\nPyMuPDF\nimg2pdf\neinops\neasydict\naddict \nPillow\nnumpy\n"
  }
]