Showing preview only (7,413K chars total). Download the full file or copy to clipboard to get everything.
Repository: Stardust-y/XTVQA
Branch: master
Commit: f617c5231301
Files: 117
Total size: 4.0 MB
Directory structure:
gitextract_8dq1qumw/
├── .gitignore
├── dataset/
│ ├── __init__.py
│ ├── ch_qa_gen.py
│ ├── data_construct.py
│ ├── dataloader.py
│ ├── glue.py
│ ├── paper_ocr.py
│ ├── pz_loader.py
│ ├── test_gemini.py
│ ├── test_pymu.py
│ └── text_rendering.py
├── eval/
│ ├── all_in_one.sh
│ ├── cogvlm/
│ │ └── inference.py
│ ├── eval_all.py
│ ├── eval_cogagent.py
│ ├── eval_lunwen.sh
│ ├── eval_minicpm.py
│ ├── eval_monkey.py
│ ├── eval_pageqa.py
│ ├── eval_paper_llm.py
│ ├── eval_qasper.py
│ ├── eval_qasper.sh
│ ├── eval_xtreme.py
│ ├── gemini/
│ │ ├── eval_en_ocr.py
│ │ └── eval_qasper.py
│ ├── intern/
│ │ └── inference.py
│ ├── llava/
│ │ ├── acc_chartvqa.py
│ │ ├── eval_chartvqa.py
│ │ ├── eval_en_ocr.py
│ │ ├── eval_mathvqa.py
│ │ ├── eval_qasper.py
│ │ ├── eval_snli.py
│ │ ├── eval_snli_2.py
│ │ ├── inference.py
│ │ ├── run_lunwen.sh
│ │ ├── run_math.sh
│ │ └── run_qasper.sh
│ ├── log/
│ │ ├── llava/
│ │ │ ├── test-185577.err
│ │ │ ├── test-185577.out
│ │ │ ├── test-185579.err
│ │ │ └── test-185579.out
│ │ ├── test-136013.err
│ │ ├── test-136013.out
│ │ ├── test-136014.err
│ │ ├── test-136014.out
│ │ ├── test-136017.err
│ │ ├── test-136017.out
│ │ ├── test-184040.err
│ │ ├── test-184040.out
│ │ ├── test-184041.err
│ │ ├── test-184041.out
│ │ ├── test-184077.err
│ │ ├── test-184077.out
│ │ ├── test-184078.err
│ │ ├── test-184078.out
│ │ ├── test-184087.err
│ │ ├── test-184087.out
│ │ ├── test-184154.err
│ │ ├── test-184154.out
│ │ ├── test-184155.err
│ │ ├── test-184155.out
│ │ ├── test-184219.err
│ │ ├── test-184219.out
│ │ ├── test-184220.err
│ │ ├── test-184220.out
│ │ ├── test-185258.err
│ │ ├── test-185258.out
│ │ ├── test-185259.err
│ │ ├── test-185259.out
│ │ ├── test-185279.err
│ │ ├── test-185279.out
│ │ ├── test-185281.err
│ │ ├── test-185281.out
│ │ ├── test-185283.err
│ │ ├── test-185283.out
│ │ ├── test-185289.err
│ │ ├── test-185289.out
│ │ ├── test-185292.err
│ │ ├── test-185292.out
│ │ ├── test-185300.err
│ │ ├── test-185300.out
│ │ ├── test-185345.err
│ │ ├── test-185345.out
│ │ ├── test-185351.err
│ │ ├── test-185351.out
│ │ ├── test-185410.err
│ │ ├── test-185410.out
│ │ ├── test-185483.err
│ │ ├── test-185483.out
│ │ ├── test-185484.err
│ │ ├── test-185484.out
│ │ ├── test-185573.err
│ │ ├── test-185573.out
│ │ ├── test-185574.err
│ │ └── test-185574.out
│ ├── logits_llava.py
│ ├── make_score.py
│ ├── mark_score.sh
│ ├── monkey/
│ │ └── eval_chart.py
│ ├── run_eval.sh
│ ├── run_eval_monkey.sh
│ ├── run_eval_monkey_ch.sh
│ ├── run_eval_pageqa.sh
│ ├── run_eval_paper.sh
│ ├── run_eval_paper_llm.sh
│ ├── run_eval_paper_llm_zh.sh
│ ├── run_llava_chart.sh
│ └── run_llava_snli.sh
├── src/
│ ├── dtw/
│ │ └── dtw.py
│ └── generate_gemini.py
└── utils/
├── __init__.py
├── calculate.py
├── gemini.py
├── gemini_api.py
├── gpt4o.py
├── render_text.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
results/*
result/*
checkpoints/*
close_result/*
data/*
log/*
models/*
poe-api-wrapper/*
visualize/*
================================================
FILE: dataset/__init__.py
================================================
================================================
FILE: dataset/ch_qa_gen.py
================================================
import pdb
import sys
sys.path.append("..")
sys.path.append("../..")
import fitz
import os
import argparse
import uuid
#from utils.utils import new_ip
import glob
#from paddleocr import PaddleOCR, draw_ocr
import json
# from utils.gemini import Gemini_Model, prompt_gen_ch_qa_extractive, prompt_gen_ch_qa_vis, prompt_gen_qa, prompt_gen_ch_qa_abstractive, prompt_gen_ch_qa_yes_no
# API_KEY = "AIzaSyB2rGDZzkVKxgkV8y_uJf4LvK9E9WKfWoE"
def paddle_pdfs(data_dir, save_path="../data"):
    """Render every PDF page under ``data_dir/pdf`` to PNG and OCR it.

    Writes one JSON line per page to ``save_path/metadata.jsonl`` with the
    paper index, original filename, image path, and raw PaddleOCR detection.
    """
    from paddleocr import PaddleOCR, draw_ocr
    pdf_dir = os.path.join(data_dir, "pdf")
    papers = os.listdir(pdf_dir)
    ocr = PaddleOCR(lang="ch")
    # 2x zoom renders roughly 1584x1224 for a letter-sized page
    # (1.3333 would give 1056x816).
    render_matrix = fitz.Matrix(2, 2)
    with open(os.path.join(save_path, "metadata.jsonl"), "w") as meta_f:
        for paper_id, paper_name in enumerate(papers):
            doc = fitz.open(os.path.join(pdf_dir, paper_name))
            img_folder = os.path.join(data_dir, f"png/{paper_id}")
            print("img folder", img_folder)
            if not os.path.exists(img_folder):
                os.makedirs(img_folder)
            for page_no, page in enumerate(doc):
                pix = page.get_pixmap(matrix=render_matrix, alpha=False)
                img_dir = os.path.join(img_folder, f"{page_no}.png")
                pix.save(img_dir)
                print("saved to", img_dir)
                detection = ocr.ocr(img_dir)
                print("lid", paper_id)
                meta_f.write(json.dumps({
                    "lid": paper_id,
                    "origin_name": paper_name,
                    "image_path": img_dir,
                    "detection": detection,
                }, ensure_ascii=False) + "\n")
def paddle_png(img_path, lang="en"):
    """OCR a single image with PaddleOCR and return the raw detection result.

    *lang* selects the recognition model (default "en", matching the original
    behaviour).
    """
    # Import locally: the module-level paddleocr import is commented out, so
    # calling this as a library function previously raised NameError unless
    # the __main__ block had imported PaddleOCR first.
    from paddleocr import PaddleOCR
    ocr = PaddleOCR(lang=lang)
    result = ocr.ocr(img_path)
    return result
def gemini_gen_qa_text(metadata, prompt, savedir="../data/ch_paper/qa"):
    """Generate QA pairs from OCR'd page text with a text-only Gemini model.

    Reads one OCR record per line from *metadata* (JSONL produced by
    ``paddle_pdfs``), prompts the model with ``prompt`` + page text, and
    appends one JSON line per parsed QA pair to
    ``savedir/zh-zh-gemini-txt-yes-no.jsonl``.

    NOTE(review): the resume logic counts QA lines already written but skips
    that many *metadata* lines — the units differ (one page can yield several
    QA lines), so a resumed run may redo or skip pages.  ``index`` also
    restarts at 0 on resume, so question_ids can collide across runs.
    """
    # TODO(security): move this API key out of source control.
    model = Gemini_Model(key="AIzaSyCYo6MWJKX4nrV8i36GKVVEVeuYfD3co-s", vision=False)
    print("model initialized")
    savename = os.path.join(savedir, "zh-zh-gemini-txt-yes-no.jsonl")
    if os.path.exists(savename):
        with open(savename, "r") as fr:
            line_num = len(fr.readlines())
    else:
        line_num = 0
    index = 0
    with open(metadata, "r") as f:
        for i, line in enumerate(f):
            if i < line_num:
                continue
            # Metadata lines are written with json.dumps, so parse them as
            # JSON (eval breaks on null/true/false and executes arbitrary code).
            ocr_res = json.loads(line)
            detection = ocr_res['detection'][0]
            img_path = ocr_res['image_path']
            if len(detection) > 10:  # enough detected text to ask about
                text = "".join([det[1][0] for det in detection])
                query = prompt + text
                try:
                    response = model.get_response_text(query)
                except Exception as exc:
                    # API / rate-limit failure: skip this page instead of
                    # dropping into the leftover pdb.set_trace() debugger.
                    print("skip", exc)
                    continue
                print(response.text, "-" * 20)
                with open(savename, "a") as fr:
                    # Renamed from `line` to avoid shadowing the outer loop var.
                    for qa_line in response.text.split("\n"):
                        try:
                            # NOTE(security): eval on model output; the model
                            # is expected to emit python-dict-like lines.
                            qa = eval(qa_line.strip())
                            fr.write(json.dumps({
                                "question_id": index,
                                "img_path": img_path,
                                "question": qa['question'],
                                "answer": qa['answer']
                            }, ensure_ascii=False) + "\n")
                            index += 1
                        except Exception:
                            pass  # line was not a well-formed QA dict
def gemini_gen_qa_vision(metadata, prompt, savedir):
    """Generate QA pairs directly from page images with a vision Gemini model.

    Appends each OCR record, augmented with a ``qas`` list parsed from the
    model response, to ``savedir/zh-zh-gemini-vis.jsonl``.  Resume is
    one-output-line-per-metadata-line, so skipping the first ``line_num``
    input lines is correct here.
    """
    model = Gemini_Model(key=API_KEY)
    savename = os.path.join(savedir, "zh-zh-gemini-vis.jsonl")
    if os.path.exists(savename):
        with open(savename, "r") as fr:
            line_num = len(fr.readlines())
    else:
        line_num = 0
    with open(metadata, "r") as f:
        for i, line in enumerate(f):
            if i < line_num:
                continue
            # Metadata was written with json.dumps — parse as JSON, not eval.
            ocr_res = json.loads(line)
            # detection = ocr_res['detection'][0]
            image_path = ocr_res['image_path']
            query = prompt
            response = model.get_response_vision(image_path, query)
            ocr_res['qas'] = []
            # Renamed from `line` to avoid shadowing the metadata line.
            for resp_line in response.split("\n"):
                try:
                    # NOTE(security): eval on model output (dict-like lines).
                    qa = eval(resp_line.strip())
                    print(qa)
                    ocr_res['qas'].append(qa)
                except Exception:
                    ocr_res['qas'].append(resp_line)  # keep raw line as fallback
            # Write through `savename` instead of re-joining the path.
            with open(savename, "a") as fr:
                fr.write(json.dumps(ocr_res, ensure_ascii=False) + "\n")
def gemini_ocr(args):
    """OCR every PNG under ``args.image_path`` with Gemini, resuming from a
    partial ``zh-gemini-ocr.jsonl`` if one exists.

    One JSON line per image with ``img_path`` and ``predicted_answer``.
    """
    model = Gemini_Model(key=API_KEY)
    savename = os.path.join(args.savedir, "zh-gemini-ocr.jsonl")
    # Sort so the image order is deterministic across runs — glob order is
    # filesystem-dependent, which would break the line-count resume below.
    imgs = sorted(glob.glob(os.path.join(args.image_path, os.path.join("*", "*.png"))))
    if os.path.exists(savename):
        with open(savename, "r") as fr:
            line_num = len(fr.readlines())
    else:
        line_num = 0
    # Bug fix: the original opened the file with "w", truncating previous
    # results and never using line_num.  Append and skip completed images.
    with open(savename, "a") as f:
        for img in imgs[line_num:]:
            response = model.get_response_vision(img, "识别图中所有文字")
            print(response)
            f.write(json.dumps({
                "img_path": img,
                "predicted_answer": response,
            }, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    # NOTE(review): description is copy-pasted from an unrelated point-cloud
    # script — update it.
    parser = argparse.ArgumentParser(description="Visualize a series of point clouds as an animation.")
    # NOTE: type=bool treats ANY non-empty string (even "False") as True.
    parser.add_argument("--vis", type=bool, default=False)
    parser.add_argument("--savedir", type=str, default="../result/ocr/")
    parser.add_argument("--datadir", type=str, default="../data/ch_paper")
    parser.add_argument("--image_path", type=str, default="../data/ch_paper/png")
    parser.add_argument("--form", type=str, default="yes_no")
    parser.add_argument("--mode", type=str, default="enpaper")
    args = parser.parse_args()
    from paddleocr import PaddleOCR, draw_ocr
    res = {}
    if args.mode == "enpaper":
        # OCR every dev-set page image and dump all detections into one JSON.
        dev_data = json.load(open("../data/pageqa/dev.json"))
        with open("../data/pageqa/dev_metadata_en.json", "w") as f:
            for data in dev_data:
                imgpath = f"../data/pageqa/png/{data['imgname']}.png"
                result = paddle_png(imgpath)
                res[data['imgname']] = result
            json.dump(res, f, indent=4)
    else:
        gemini_ocr(args)
    exit()
    # NOTE(review): everything below is unreachable because of the exit()
    # above — presumably an older pipeline kept for reference; confirm.
    metadata = os.path.join(args.savedir, "metadata.jsonl")
    if not os.path.exists(metadata):
        paddle_pdfs(args.datadir, args.savedir)
    if args.vis:
        print("using vis---")
        prompt = prompt_gen_ch_qa_vis
        gemini_gen_qa_vision(metadata, prompt, args.savedir)
    else:
        # Select the prompt variant by name, e.g. prompt_gen_ch_qa_yes_no.
        prompt = eval(f'prompt_gen_ch_qa_{args.form}')
        gemini_gen_qa_text(metadata, prompt, args.savedir)
================================================
FILE: dataset/data_construct.py
================================================
import pdb
import fitz
import json
import os
### READ IN PDF
from collections import Counter
# NOTE(review): LAST_INDEX is not referenced anywhere in this script — confirm
# before removing.
LAST_INDEX = 18
# Qasper training split: mapping of arXiv id -> paper record (qas + full_text).
with open("../data/qasper-train-dev-v0.3/qasper-train-v0.3.json", "r") as f:
    data = json.load(f)
# Resume from a previous partial run if train.json already exists.
save_dir = "../data/pageqa/train.json"
if os.path.exists(save_dir):
    with open(save_dir, "r") as f:
        qa_list = json.load(f)
else:
    qa_list = []
print([qa['question_id'] for qa in qa_list])
import uuid
# Papers whose PDFs are known to be broken / unfetchable.
black_list = ["1802.00396", "1911.03343", "1908.07822", "1612.04675", "1702.03274"]
def hl_section(hl_evidence, full_text):
    """Return the leaf section name whose paragraphs contain *hl_evidence*,
    or ``None`` when no section matches."""
    needle = hl_evidence.strip()
    for section in full_text:
        body = "\n".join(section['paragraphs'])
        if needle not in body:
            continue
        name = section['section_name']
        print("sn", name)
        # Section names look like "Top ::: Sub ::: Leaf"; keep the leaf part.
        return name.split(" ::: ")[-1]
    return None
# `counter` is only re-bound by the disabled majority-vote code below; the
# integer here is otherwise unused.
counter = 0
skip_num = 0
for num, d in enumerate(data.items()):
    # if num < 247:
    #     print("computed", num)
    #     continue
    k, v = d  # k: arXiv id, v: qasper record with 'qas' and 'full_text'
    if k in black_list:
        print(k, "in black list")
        continue
    print("num", num, "doc", k)
    # Download the paper PDF once; reuse the cached copy on later runs.
    pdf_url = f"https://arxiv.org/pdf/{k}"
    output_path = f"../data/pageqa/pdfs/{k}.pdf"
    if not os.path.exists(output_path):
        print(f"wget {pdf_url} -O {output_path}")
        os.system(f"wget {pdf_url} -O {output_path}")
    doc = fitz.open(output_path)
    qas, full_text = v['qas'], v['full_text']
    for qa in qas:
        question_id = qa['question_id']
        # print(question_id, [qa['question_id'] for qa in qa_list])
        # Resume support: skip questions already written to train.json.
        if question_id in [qa['question_id'] for qa in qa_list]:
            print(question_id, "has been calculated, skip")
            continue
        answers = qa['answers']
        sections = []  # majority vote to pick a section (currently disabled)
        answer_dict = {"extractive_spans": [],
                       "yes_no": [],
                       "free_form_answer": []}
        # Collect all answerable annotations for this question.
        for answer in answers:
            answer = answer['answer']
            if answer['unanswerable']:
                continue
            hl_evidence = answer['highlighted_evidence']
            if len(hl_evidence) == 0:
                print("answer", answer)
                continue
            # for evidence in hl_evidence:
            # section = hl_section(hl_evidence[0], full_text)
            # if section is not None:
            #     sections.append(section)
            answer_dict['extractive_spans'].append(answer['extractive_spans'])
            answer_dict['yes_no'].append(answer['yes_no'])
            answer_dict['free_form_answer'].append(answer['free_form_answer'])
        # (Disabled) majority-vote section selection via Counter.
        # print("sections", sections)
        # counter = Counter(sections)
        # if len(sections) == 0:
        #     continue  ## no matching section found
        # top_section = counter.most_common(1)[0][0]
        # NOTE(review): `hl_evidence` below is whatever the LAST iteration of
        # the answers loop left behind (or a previous question's value when
        # every answer was skipped) — confirm this is intended.
        candidates = []
        for i, page in enumerate(doc):
            text_instances = []
            # Search this page for the first sentence of each highlighted
            # evidence snippet.
            for he in hl_evidence:
                he = he.split(".")[0]
                if he == "":
                    continue
                text_instance = page.search_for(he)
                if text_instance is None:
                    continue
                else:
                    text_instances += page.search_for(he)
            # print(text_instances)
            if len(text_instances):
                candidates.append(page)
        print("top", len(candidates))
        if len(candidates) > 0:
            page = candidates[0]
        else:
            # Fallback: retry with a shorter needle, truncating at the first
            # comma as well as the first period.
            for i, page in enumerate(doc):
                text_instances = []
                # pdb.set_trace()
                for he in hl_evidence:
                    he = he.split(".")[0].split(",")[0]
                    text_instance = page.search_for(he)
                    if text_instance is None:
                        continue
                    else:
                        text_instances += page.search_for(he)
                # print(text_instances)
                if len(text_instances):
                    candidates.append(page)
            if len(candidates) > 0:
                page = candidates[0]
            else:
                # No page contains the evidence; give up on this question.
                skip_num += 1
                print("unsuccessful so far", skip_num)
                continue
        # Render the selected page at 2x zoom.
        zoom_x = 2 # (1.33333333-->1056x816) (2-->1584x1224)
        zoom_y = 2
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        unique_id = str(uuid.uuid4())[:12]  # short UUID prefix as the image id
        pix.save("../data/pageqa/png/" + unique_id + ".png")
        print(question_id, "save to", "../data/pageqa/png/" + unique_id + ".png")
        qa_list.append({
            "question": qa['question'],
            "imgname": unique_id,
            "question_id": qa['question_id'],
            "answers": answer_dict,
        })
        # Checkpoint the whole list after every question so an interrupted
        # run can resume via the question_id skip above.
        with open(save_dir, "w") as f:
            f.write(json.dumps(qa_list, indent=4))
================================================
FILE: dataset/dataloader.py
================================================
import json
import glob
import os
class MathLoader:
    """Loads math-VQA samples from a JSONL file.

    Each item is a dict with question_id, image_path (joined with *img_path*)
    and the question text, with any options appended to the prompt.
    """

    def __init__(self, img_path, data_file):
        with open(data_file, "r") as fh:
            self.data = [json.loads(row) for row in fh]
        self.img_path = img_path

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        query = record["question"]
        opts = record['options']
        if opts:
            query = f"{query}options from the following options: {str(opts)}"
        return {
            "question_id": record['id'],
            "image_path": os.path.join(self.img_path, record['image']),
            "question": query,
        }
class PaperLoader:
    """Yields (question, imgname, question_id, answers) tuples from a QA file.

    Accepts either a ``.jsonl`` file (one record per line) or a JSON list.
    """

    def __init__(self, data_file):
        if "jsonl" in data_file:
            with open(data_file, "r") as fh:
                self.data = [json.loads(row) for row in fh]
        else:
            self.data = json.load(open(data_file, "r"))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return (record["question"], record["imgname"],
                record["question_id"], record["answers"])
class PaperTextLoader:
    """Like ``PaperLoader`` but also returns OCR/text content for the item's
    page image, looked up in a metadata JSON mapping image name -> content."""

    def __init__(self, data_file, metadata_file):
        if "jsonl" in data_file:
            with open(data_file, "r") as f:
                self.data = [json.loads(line) for line in f]
        else:
            self.data = json.load(open(data_file, "r"))
        with open(metadata_file, "r") as f:
            self.metadata = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item["question"]
        imgname = item["imgname"]
        question_id = item["question_id"]
        answers = item["answers"]
        # Bug fix: look up the item's image name, not the literal string
        # 'imgname' (which raised KeyError / returned the wrong entry).
        content = self.metadata[imgname]
        return question, imgname, question_id, answers, content
class OcrvqaLoader:
    """Yields (question_id, question, imgname, answer) tuples from a
    pre-flattened OCR-VQA JSONL file."""

    def __init__(self, data_file):
        with open(data_file, "r") as fh:
            self.data = [json.loads(row) for row in fh]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return (record['question_id'], record['question'],
                record['imgname'], record['answer'])
class TextvqaLoader:
    """Yields (question, image_id, question_id, first_answer, ocr_tokens).

    Loads the QA file (JSONL, or JSON with a top-level "data" list) plus the
    Rosetta OCR companion file from the same directory, keyed by image_id.
    """

    def __init__(self, data_file):
        if "jsonl" in data_file:
            with open(data_file, "r") as fh:
                self.data = [json.loads(row) for row in fh]
        else:
            self.data = json.load(open(data_file, "r"))["data"]
        ocr_file = os.path.join(os.path.dirname(data_file), "TextVQA_Rosetta_OCR_v0.2_val.json")
        with open(ocr_file, "r") as fh:
            records = json.load(fh)['data']
        self.ocr_data = {rec['image_id']: rec['ocr_tokens'] for rec in records}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        image_id = record["image_id"]
        return (record["question"], image_id, record["question_id"],
                record["answers"][0], self.ocr_data[image_id])
class ChartvqaLoader:
    """Yields (query, imgname, label) tuples from a ChartQA file
    (JSONL or a plain JSON list)."""

    def __init__(self, data_file):
        if "jsonl" in data_file:
            with open(data_file, "r") as fh:
                self.data = [json.loads(row) for row in fh]
        else:
            self.data = json.load(open(data_file, "r"))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return record["query"], record["imgname"], record["label"]
class DocvqaLoader:
    """Yields (questionId, question, image, first_answer) tuples from a
    DocVQA file (JSONL, or JSON with a top-level "data" list)."""

    def __init__(self, data_file):
        if "jsonl" in data_file:
            with open(data_file, "r") as fh:
                self.data = [json.loads(row) for row in fh]
        else:
            self.data = json.load(open(data_file, "r"))["data"]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return (record['questionId'], record["question"],
                record["image"], record["answers"][0])
class LunwenLoader:
    """Loads Chinese paper-page QA records from a JSONL file.

    Each item is a dict with img_path, question, and answer.
    """

    def __init__(self, data_file):
        self.data = []
        with open(data_file, "r") as f:
            for line in f:
                # The records are emitted by json.dumps, so parse with
                # json.loads: eval() breaks on null/true/false and executes
                # arbitrary code from the data file.
                self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        img_path = item['img_path']
        question = item['question']
        answer = item['answer']
        return {
            "img_path": img_path,
            "question": question,
            "answer": answer
        }
class LunwenTextLoader:
    """Joins QA records with per-image OCR detections from a metadata JSONL.

    Yields (question, img_path, question_id, answers, detection) tuples.
    """

    def __init__(self, data_file, metadata_file):
        self.data, self.metadata = [], {}
        with open(data_file, "r") as f:
            for line in f:
                # Bug fix: records are JSON lines — use json.loads instead of
                # eval (which fails on null/true/false and is unsafe).
                self.data.append(json.loads(line))
        with open(metadata_file, "r") as f:
            for line in f:
                line_data = json.loads(line)
                self.metadata[line_data['image_path']] = line_data['detection']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        img_path = item['img_path']
        question = item['question']
        answers = item['answer']
        question_id = item['question_id']
        content = self.metadata[img_path]
        return question, img_path, question_id, answers, content
if __name__ == "__main__":
    # Batch-OCR every page image and dump the raw detections to ocr.jsonl.
    from paddleocr import PaddleOCR, draw_ocr
    ocr = PaddleOCR(lang="en")
    # Bug fix: removed `dataloader = DataLoader(...)` — no DataLoader class is
    # defined in this module (NameError at runtime) and the object was unused.
    imgs = glob.glob(os.path.join("./data/pageqa/png", "*.png"))
    with open("./data/pageqa/ocr.jsonl", "w") as f:
        for img in imgs:
            result = ocr.ocr(img)
            f.write(json.dumps({
                "imgname": img.split('/')[-1].replace(".png", ""),
                "ocr": result
            }) + "\n")
================================================
FILE: dataset/glue.py
================================================
# Bug fix: load_dataset comes from the HuggingFace `datasets` package; the
# local `dataset` package (empty __init__) does not define it, so the
# original import raised ImportError.
from datasets import load_dataset

dataset = load_dataset("nyu-mll/glue", "ax")
# GLUE configurations selectable via load_dataset("nyu-mll/glue", <name>).
sub_cat = ["ax", "cola", "mnli", "mnli_matched", "mnli_mismatched", "mrpc", "qnli", "qqp",
           "rte", "sst2", "stsb", "wnli"]
================================================
FILE: dataset/paper_ocr.py
================================================
import json
import os, pdb
import random
def get_index_by_category(category="cs", metadata_path="./data/arxiv-metadata-oai-snapshot.json"):
    """Return ids of arXiv papers whose category string starts with *category*.

    *metadata_path* points at the arXiv OAI metadata snapshot: one JSON record
    per line with at least "categories" and "id" fields.  The default matches
    the previously hard-coded path, so existing callers are unaffected.
    """
    index_list = []
    with open(metadata_path, "r") as f:
        # Iterate the file directly instead of a manual readline() loop.
        for line in f:
            data = json.loads(line)
            # Bug fix: match the requested *category* prefix — the parameter
            # was previously ignored in favour of a hard-coded "cs" check.
            if data['categories'].startswith(category):
                index_list.append(data['id'])
    return index_list
if __name__ == "__main__":
    category = "cs"
    # index_list = get_index_by_category("cs")
    # The CS ids were pre-extracted into cs.txt (one arXiv id per line).
    with open(f"../data/paperocr/cs.txt", "r") as f:
        index_list = f.readlines()
    random.shuffle(index_list)
    # Take the first 1000 ids from the shuffled list.
    index_list = index_list[:1000]
    for id in index_list:
        # Fetch both the ar5iv HTML rendering and the original PDF.
        cmd = f"wget https://ar5iv.labs.arxiv.org/html/{id.strip()}#/ -O ../data/paperocr/html/{id.strip()}.html"
        cmd_pdf = f"wget https://arxiv.org/pdf/{id.strip()}#/ -O ../data/paperocr/html/{id.strip()}.pdf"
        print(cmd)
        # NOTE(review): os.system returns an exit code rather than raising, so
        # this try/except never triggers on a failed download — confirm intent.
        try:
            os.system(cmd)
            os.system(cmd_pdf)
        except:
            continue
================================================
FILE: dataset/pz_loader.py
================================================
import json, os
class MathLoader:
    """Yields (question, full_image_path, id) triples from a JSONL math-QA file."""

    def __init__(self, data_file, img_path):
        with open(data_file, "r") as fh:
            self.data = [json.loads(row) for row in fh]
        self.img_path = img_path

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        full_img = os.path.join(self.img_path, record['image'])
        return record["question"], full_img, record['id']
================================================
FILE: dataset/test_gemini.py
================================================
# Smoke test: one text-only generation round-trip against gemini-pro.
import google.generativeai as genai
import os
# NOTE(security): hard-coded API keys committed to the repo — rotate them and
# read from an environment variable instead.
# API_KEY = "AIzaSyB2rGDZzkVKxgkV8y_uJf4LvK9E9WKfWoE"
API_KEY = "AIzaSyCYo6MWJKX4nrV8i36GKVVEVeuYfD3co-s"
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content('Please summarise this document: ...')
print(response.text)
================================================
FILE: dataset/test_pymu.py
================================================
# Smoke test for PyMuPDF text search: open one known paper and look for a
# fixed sentence on every page; search_for returns the hit rectangles.
import fitz
output_path = "../data/pageqa/pdfs/1912.01214.pdf"
doc = fitz.open(output_path)
for i, page in enumerate(doc):
    print(page)
    text_instances = page.search_for("We compare our approaches with related approaches of pivoting")
    print(text_instances)
================================================
FILE: dataset/text_rendering.py
================================================
================================================
FILE: eval/all_in_one.sh
================================================
#!/bin/bash
# Run eval_all.py for one model ($1) on one dataset ($2) under SLURM.
# Usage: sbatch all_in_one.sh <monkey|qwen|llava|blip|cog|mplug|gemini|gpt> <dataset>
# NOTE(review): positional parameters are NOT expanded inside #SBATCH
# directives — the job name below is the literal string "$1-$2".
#SBATCH -J $1-$2 # job name
#SBATCH -o ./log/eval/test-%j.out # redirect stdout to test-<jobid>.out
#SBATCH -e ./log/eval/test-%j.err # redirect stderr to test-<jobid>.err
#SBATCH -p compute # submit to the "compute" partition
#SBATCH -N 1 # request 1 node
#SBATCH -t 10:00:00 # wall-clock limit
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:a100-sxm4-80gb:1
#python ./eval/eval_all.py --data_path "./data/ChartQA/test" \
#    --model_path "./checkpoints/monkey" \
#    --lang "en" \
#    --conv-mode "llava_v1" \
#    --mode chart \
#    --save_path "./result/ChartQA/en/monkey.jsonl"
export DATASET=$2
if [ $1 = "monkey" ]
then
    # Monkey with mutual-information probing (--mi) in three languages.
    python ./eval/eval_all.py --data_path "./data/ocrvqa/" \
        --model_path "./checkpoints/monkey" \
        --lang "en" \
        --conv-mode "llava_v1" \
        --mode $DATASET \
        --save_path "./result/ocrvqa/en/monkey_mi.jsonl" \
        --mi True
    python ./eval/eval_all.py --data_path "./data/ocrvqa/" \
        --model_path "./checkpoints/monkey" \
        --lang "fr" \
        --conv-mode "llava_v1" \
        --mode $DATASET \
        --save_path "./result/ocrvqa/fr/monkey_mi.jsonl" \
        --mi True
    python ./eval/eval_all.py --data_path "./data/ocrvqa/" \
        --model_path "./checkpoints/monkey" \
        --lang "zh" \
        --conv-mode "llava_v1" \
        --mode $DATASET \
        --save_path "./result/ocrvqa/zh/monkey_mi.jsonl" \
        --mi True
elif [ $1 = "qwen" ]
then
    python ./eval/eval_all.py --data_path "./data/textvqa/" \
        --model_path "./checkpoints/Qwen/Qwen-VL-Chat" \
        --lang "en" \
        --conv-mode "llava_v1" \
        --mode $DATASET \
        --save_path "./result/textvqa/en/qwen.jsonl"
    python ./eval/eval_all.py --data_path "./data/textvqa/" \
        --model_path "./checkpoints/Qwen/Qwen-VL-Chat" \
        --lang "zh" \
        --conv-mode "llava_v1" \
        --mode $DATASET \
        --save_path "./result/textvqa/zh/qwen.jsonl"
elif [ $1 = "llava" ]
then
    for lang in "zh"
    do
        # NOTE(review): save_path says en/llava-v1.5-13b while the loop runs
        # "zh" and the model is v1.6-34b — results may overwrite each other.
        python ./eval/eval_all.py --data_path "./data/textvqa/" \
            --model_path "./checkpoints/llava-v1.6-34b" \
            --lang ${lang} \
            --conv-mode "llava_v1" \
            --mode $DATASET \
            --save_path "./result/textvqa/en/llava-v1.5-13b.jsonl"
    done
elif [ $1 = "blip" ]
then
    for lang in "zh"
    do
        python ./eval/eval_all.py --data_path "./data/textvqa/" \
            --model_path "./checkpoints/instructblip" \
            --lang ${lang} \
            --mode $DATASET \
            --ocr True
    done
elif [ $1 = "cog" ]
then
    for lang in "en" "zh"
    do
        python ./eval/eval_all.py \
            --model_path "./checkpoints/cogvlm-chat-hf" \
            --lang ${lang} \
            --mode $DATASET
    done
elif [ $1 = "mplug" ]
then
    for lang in "zh"
    do
        python ./eval/eval_all.py \
            --model_path "./checkpoints/mplug-owl2" \
            --lang ${lang} \
            --mode $DATASET
    done
elif [ $1 = "gemini" ]
then
    for lang in "en" "zh"
    do
        python ./eval/eval_all.py --data_path "./data/textvqa" --mode ${DATASET} --lang ${lang} \
            --model_path gemini-1.5-flash
    done
elif [ $1 = "gpt" ]
then
    for lang in "en"
    do
        python ./eval/eval_all.py --data_path "./data/textvqa" --mode ${DATASET} --lang ${lang} \
            --model_path gpt-4o-v3
    done
fi
================================================
FILE: eval/cogvlm/inference.py
================================================
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
import sys, json
from tqdm import tqdm
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
from dataset.dataloader import MathLoader
# Model / device configuration for CogVLM2 inference on the Pazhou math set.
MODEL_PATH = "./checkpoints/cogvlm2"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# bfloat16 needs compute capability >= 8 (Ampere+); otherwise fall back to fp16.
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=TORCH_TYPE,
    trust_remote_code=True,
).to(DEVICE).eval()
# Text-only prompt template (not used in the loop below — kept for reference).
text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
dataloader = MathLoader("./data/pazhou", "./data/pazhou/test.jsonl")
# set the max number of tiles in `max_num`
gen_kwargs = {
    "max_new_tokens": 2048,
    "pad_token_id": 128002,
}
# For each sample: (1) ask the question, (2) in a second turn ask the model to
# extract a concise final answer from its own response, and log turn 2 as JSONL.
with open("./results/cogvlm2_exact.txt", "w") as f:
    for data in tqdm(dataloader):
        img_path, id, question = data['image_path'], data['question_id'], data['question']
        image = Image.open(img_path).convert('RGB')
        history = []
        query = "USER: {} ASSISTANT:".format(question)
        print("-" * 10, "\nquery 1: ", query)
        input_by_model = model.build_conversation_input_ids(
            tokenizer,
            query=query,
            history=history,
            images=[image],
            template_version='chat'
        )
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if image is not None else None,
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            # Keep only the newly generated tokens (strip the prompt).
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = tokenizer.decode(outputs[0])
            response = response.split("<|end_of_text|>")[0]
        print("\nCogVLM2:", response)
        history.append((query, response))
        # Second turn: ask for a short, directly-comparable answer.
        if "options" in question:
            options = question.split("options: ")[1]
            question = f'Extract the direct answer from model response with a single option, options from the following options: {options}" Answer:'
        else:
            question = 'Extract the direct answer from model response, which is a single number value, formula or short sentence. Answer:'
        query = "USER: {} ASSISTANT:".format(question)
        print("-"*10, "\nquery 2: ", query)
        input_by_model = model.build_conversation_input_ids(
            tokenizer,
            query=query,
            history=history,
            images=[image],
            template_version='chat'
        )
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if image is not None else None,
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = tokenizer.decode(outputs[0])
            response = response.split("<|end_of_text|>")[0]
        print("\nCogVLM2:", response)
        f.write(json.dumps({"id": id, "model_answer": response}) + "\n")
================================================
FILE: eval/eval_all.py
================================================
import json
import pdb
import sys
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
sys.path.append("../../..")
import argparse
#from datasets import dataloader
import base64
import os, glob
from tqdm import tqdm
import cv2, random
import numpy as np
import traceback
import re
# from utils.calculate import compute_pmi
def load_image(image_file):
    """Open *image_file* (local path or http(s) URL) as an RGB PIL image."""
    # "https://..." also startswith "http", so a single prefix test covers both.
    if image_file.startswith("http"):
        image = Image.open(BytesIO(requests.get(image_file).content))
    else:
        image = Image.open(image_file)
    return image.convert("RGB")
def load_images(image_files):
    """Load every path/URL in *image_files* via ``load_image``."""
    return [load_image(path) for path in image_files]
def add_noise(img, mode="gauss"):
    """Return a noise-corrupted copy of the first image in *img*.

    *img* is indexed with ``[0]`` — presumably a one-element list of PIL
    images (this is how llava_inference calls it); TODO confirm.  Used to
    destroy visual information for the mutual-information probes.  Also
    writes the result to ``noise.png`` as a debugging side effect.
    """
    img_array = np.array(img[0])
    if mode == "gauss":
        # Additive Gaussian noise; sigma controls the noise level
        # (larger values give more visible noise).
        mean = 0
        sigma = 80
        gauss = np.random.normal(mean, sigma, img_array.shape)
        gauss = gauss.astype(np.int32)
        # Overlay the noise onto the image.
        # NOTE(review): the sum can leave [0, 255]; the uint8 cast below wraps
        # rather than clips — presumably acceptable since the goal is to
        # corrupt the image, but confirm.
        img_array = img_array + gauss
    elif mode == "pepper":
        # Salt-and-pepper noise.
        # NOTE(review): with prob=0.8 and thres=0.2, EVERY pixel is replaced:
        # ~80% black, ~20% white — no original pixels survive.  Confirm this
        # total corruption is intended.
        prob = 0.8
        thres = 1 - prob
        for i in range(img_array.shape[0]):
            for j in range(img_array.shape[1]):
                rdn = random.random()
                if rdn < prob:
                    img_array[i][j] = 0
                elif rdn > thres:
                    img_array[i][j] = 255
    elif mode == "reli":
        # Additive Rayleigh-distributed noise.
        rayleigh_noise = np.random.rayleigh(0.5, size=img_array.shape)
        img_array = img_array + (rayleigh_noise * 255).astype(np.uint8)
    # Convert the array back into a PIL image.
    noisy_img = Image.fromarray(img_array.astype('uint8'))
    noisy_img.save("noise.png")
    return noisy_img
import torch
def instructblip_inference(model, img, qs, lang, processor, ocr_tokens=None):
    """Run InstructBLIP on one image/question pair; returns (answer, None).

    When *ocr_tokens* is given they are prepended to the prompt as a hint.
    The second tuple element is a placeholder so the signature matches the
    other ``*_inference`` helpers that also return MI statistics.
    """
    device = "cuda"
    rgb = Image.open(img).convert("RGB")
    if ocr_tokens is None:
        prompt = f"Question: {qs} Short answer:"
    else:
        prompt = f"OCR tokens: {' '.join(ocr_tokens)}. Question: {qs} Short answer:"
    model_inputs = processor(images=rgb, text=prompt, return_tensors="pt").to(device)
    generated = model.generate(
        **model_inputs,
        do_sample=False,
        num_beams=5,
        max_length=256,
        min_length=1,
        top_p=0.9,
        repetition_penalty=1.5,
        length_penalty=1.0,
        temperature=1,
    )
    answer = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()
    return answer, None
def llava_inference(model, img, qs, lang, tokenizer, conv, gt=None, mi=False):
    """Run LLaVA on one image/question; optionally compute PMI-style stats.

    When *mi* is truthy, the model is additionally run on a noise-corrupted
    copy of the image (see ``add_noise``) and ``compute_pmi`` compares the two
    score sequences against the tokenized ground truth *gt*.

    Relies on module-level LLaVA globals imported elsewhere in this file
    (conv_templates, image_processor, process_images, tokenizer_image_token,
    DEFAULT_* tokens, IMAGE_PLACEHOLDER, IMAGE_TOKEN_INDEX).

    Returns (response_text, mi_stats_dict_or_None).
    """
    # Make sure the prompt carries exactly one image token, in the format the
    # checkpoint expects (with or without the im_start/im_end wrappers).
    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        if model.config.mm_use_im_start_end:
            qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
        else:
            qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
    else:
        if model.config.mm_use_im_start_end:
            qs = image_token_se + "\n" + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
    # `conv` arrives as a template NAME and is re-bound to a conversation obj.
    conv = conv_templates[conv].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    # pdb.set_trace()
    prompt = conv.get_prompt()
    images = load_images([img])
    image_sizes = [x.size for x in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)
    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .cuda()
    )
    if mi:
        # Noise-corrupted image for the "without visual information" pass.
        images_wo = [add_noise(images)]
        image_sizes_wo = [x.size for x in images_wo]
        images_tensor_wo = process_images(
            images_wo,
            image_processor,
            model.config
        ).to(model.device, dtype=torch.float16)
        target_ids = (
            tokenizer_image_token(gt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            .unsqueeze(0)
            .cuda()
        )
    with torch.inference_mode():
        if mi:
            # Greedy decode on the corrupted image, keeping per-step scores.
            outputs_wo = model.generate(
                input_ids,
                # images=images_tensor_wo.unsqueeze(0).half().cuda() if "1.5" in version else images_tensor_wo,
                images=images_tensor_wo,
                image_sizes=image_sizes_wo,
                do_sample=False,
                temperature=0,
                top_p=None,
                num_beams=1,
                # no_repeat_ngram_size=3,
                max_new_tokens=1024,
                use_cache=True,
                return_dict_in_generate=True,
                output_scores=True,
            )
            logits_wo = outputs_wo.scores
            logits_wo = torch.concatenate(logits_wo, dim=0)
            output_ids = outputs_wo.sequences
            response_wo = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            print("response wo", response_wo)
        # Greedy decode on the real image, keeping per-step scores.
        output = model.generate(
            input_ids,
            # images=images_tensor.unsqueeze(0).half().cuda() if "1.5" in version else images_tensor,
            images=images_tensor,
            image_sizes=image_sizes,
            do_sample=False,
            temperature=0,
            top_p=None,
            num_beams=1,
            # no_repeat_ngram_size=3,
            max_new_tokens=1024,
            use_cache=True,
            return_dict_in_generate=True,
            output_scores=True,
        )
        logits = output.scores
        logits = torch.concatenate(logits, dim=0)
        output_ids = output.sequences
    if mi:
        # NOTE(review): `mi` is re-bound from the boolean flag to the result
        # dict here, and the dict keys pair ppl values with "e"-named fields —
        # confirm compute_pmi's return order matches this unpacking.
        mi, ppl_wo, ppl, e_wo, e = compute_pmi(logits, logits_wo, target_ids)
        mi = {
            "mi": mi,
            "e_wo": ppl_wo,
            "e": ppl,
            "entropy": e,
            "entropy_wo": e_wo,
            "response_wo": response_wo
        }
        ## mi = e - e_wo
    else:
        mi = None
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    print("response", outputs)
    return outputs, mi
def monkey_inference(model, img, query, lang, tokenizer, gt=None, mi=False):
    """Monkey / Qwen-VL generation for one image+question pair.

    Returns ``(response, mi)``: ``response`` is the decoded answer and
    ``mi`` is a dict of mutual-information statistics when ``mi=True``
    (requires ``gt``), otherwise ``None``.  When ``mi`` is set, a first
    text-only pass (the ``<img>`` tag is not yet in ``query``) is run so
    the with-image and without-image token distributions can be compared
    via ``compute_pmi``.
    """
    # if lang == "zh":
    #     query = f'<img>{img}</img> 使用一个数字,单词或者句子回答问题 {query} '
    # elif lang == "fr":
    #     query = f'<img>{img}</img> Répondez à la question en utilisant un seul numéro, mot ou phrase {query} '
    # elif lang == "en":
    #     query = f'<img>{img}</img> Answer the question using a single number, word or sentence {query} '
    if mi:
        # Ground-truth answer tokens: the target sequence for PMI scoring.
        target_ids = tokenizer(gt, return_tensors="pt", padding="longest").input_ids.cuda()
        # Text-only pass (no image reference in the prompt yet).
        input_ids = tokenizer(query, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids
        pred_wo = model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=64,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=tokenizer.eod_id,
            eos_token_id=tokenizer.eod_id,
            return_dict_in_generate=True,
            output_scores=True,
            # output_logits=True
        )
        # Per-step logits of the text-only pass, stacked into one tensor.
        logits_wo = pred_wo.scores
        logits_wo = torch.concatenate(logits_wo, dim=0)
        output_ids = pred_wo.sequences
        # Decode only the generated continuation (skip the prompt tokens).
        response_wo = tokenizer.decode(output_ids[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
        print("response wo", response_wo)
    # Main pass WITH the image embedded in the prompt.
    query = f'<img>{img}</img> {query} Answer: '
    input_ids = tokenizer(query, return_tensors='pt', padding='longest')
    attention_mask = input_ids.attention_mask
    input_ids = input_ids.input_ids
    pred = model.generate(
        input_ids=input_ids.cuda(),
        attention_mask=attention_mask.cuda(),
        do_sample=False,
        num_beams=1,
        max_new_tokens=100,
        min_new_tokens=1,
        length_penalty=1,
        num_return_sequences=1,
        output_hidden_states=True,
        use_cache=True,
        pad_token_id=tokenizer.eod_id,
        eos_token_id=tokenizer.eod_id,
        return_dict_in_generate=True,
        output_scores=True
        # output_logits=True
    )
    logits = pred.scores
    logits = torch.concatenate(logits, dim=0)
    output_ids = pred.sequences
    if mi:
        # NOTE(review): the llava path in this file unpacks FIVE values from
        # compute_pmi (mi, ppl_wo, ppl, e_wo, e); here only three are
        # expected — confirm compute_pmi's signature, one of the two call
        # sites looks out of date.
        mi, e_wo, e = compute_pmi(logits, logits_wo, target_ids)
        mi = {
            "mi": mi,
            "e_wo": e_wo,
            "e": e
        }
    else:
        mi = None
    # Decode only the newly generated tokens (skip the prompt).
    response = tokenizer.decode(output_ids[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
    return response, mi
def gpt_inference(img_path, question, lang):
    """Query GPT-4o with one image and one question; return the answer text.

    The system prompt carries the "answer briefly" instruction in the
    requested language; the English suffix is appended to the question for
    every language (preserving the original behavior).

    Args:
        img_path: path to the image file sent as a base64 data URL.
        question: the question to ask about the image.
        lang: "zh", "fr" or "en"; any other value falls back to English.

    Returns:
        The model's answer string.
    """
    base64_image = encode_image(img_path)
    # Default to the English instruction so an unexpected ``lang`` value no
    # longer leaves ``prompt`` undefined (previously a NameError).
    prompt = "Answer the question using a single number, word or sentence "
    if lang == "zh":
        prompt = "使用一个数字,单词或者句子回答问题"
    elif lang == "fr":
        prompt = "Répondez à la question en utilisant un seul numéro, mot ou phrase"
    question += "Answer the question using a single number, word or sentence "
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {
                    "url": f"data:image/png;base64,{base64_image}"}
                 }
            ]}
        ],
        temperature=0.0,  # deterministic decoding for evaluation
    )
    return response.choices[0].message.content
def gemini_inference(model, img_path, question):
    """Ask Gemini about one image, retrying with a fresh API key on failure.

    An empty response is treated as a dead/rate-limited key: a new
    ``Gemini_Model`` is built from a random key in ``API_LIST`` and the
    request is retried until a non-empty answer comes back.
    """
    suffix = "Answer the question using a single number, word or sentence in English "
    while True:
        answer = model.get_response_vision(img_path, question + suffix)
        if len(answer) > 0:
            return answer
        # Rotate to a different API key and try again.
        model = Gemini_Model(key=random.choice(API_LIST))
def minicpm_inference(chat_model, img_path, question, lang):
    """Single-turn MiniCPM-V chat over one image.

    Args:
        chat_model: a MiniCPMVChat-style wrapper exposing ``chat(inputs)``.
        img_path: path to the image, encoded to base64 via ``img2base64``.
        question: the question text.
        lang: "zh", "fr" or "en"; any other value falls back to English.

    Returns:
        The model's answer string.
    """
    im_64 = img2base64(img_path)
    # First round chat
    if lang == "zh":
        msgs = [{"role": "user", "content": question + " 使用一个数字,单词或者句子回答问题"}]
    elif lang == "fr":
        msgs = [{"role": "user", "content": question + " Répondez à la question en utilisant un seul numéro, mot ou phrase"}]
    else:
        # English prompt; also the fallback for unrecognized language codes,
        # which previously left ``msgs`` undefined (NameError).
        msgs = [{"role": "user", "content": question + " Answer the question using a single number, word or sentence"}]
    inputs = {"image": im_64, "question": json.dumps(msgs)}
    answer = chat_model.chat(inputs)
    return answer
def cogvlm_inference(model, tokenizer, img_path, question):
    """Greedy CogVLM VQA on a single image; returns the decoded answer text.

    Args:
        model: a CogVLM checkpoint exposing ``build_conversation_input_ids``.
        tokenizer: the matching (Llama) tokenizer.
        img_path: path to the image file.
        question: the question text (wrapped in a short-answer template).

    Returns:
        The generated answer with the ``</s>`` terminator stripped.
    """
    question = f"Question: {question} Answer in short:"
    image = Image.open(img_path).convert("RGB")
    input_by_model = model.build_conversation_input_ids(tokenizer, query=question, history=[], images=[image])
    inputs = {
        'input_ids': input_by_model['input_ids'].unsqueeze(0).cuda(),
        'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).cuda(),
        'attention_mask': input_by_model['attention_mask'].unsqueeze(0).cuda(),
        # Image.open() either returns an image or raises, so the previous
        # "... if image is not None else None" guard was dead code.
        'images': [[input_by_model['images'][0].cuda().to(torch.bfloat16)]],
    }
    gen_kwargs = {"max_length": 2048,
                  "do_sample": False}  # greedy decoding for evaluation
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
        # Keep only the generated continuation, drop the prompt tokens.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        outputs = tokenizer.decode(outputs[0])
    outputs = outputs.replace("</s>", "")
    print("\nCog:", outputs)
    return outputs
def mplug_inference(model, img_path, question, tokenizer,image_processor):
    """mPLUG-Owl2 VQA on one image; returns the decoded answer string.

    The image is padded to a square (by resizing to its longest edge) before
    preprocessing, following the model's recommendation.
    """
    conv = conv_templates["mplug_owl2"].copy()
    roles = conv.roles
    image = Image.open(img_path).convert('RGB')
    max_edge = max(image.size) # We recommand you to resize to squared image for BEST performance.
    image = image.resize((max_edge, max_edge))
    image_tensor = process_images([image], image_processor)
    image_tensor = image_tensor.to(model.device, dtype=torch.float16)
    # Prepend the image placeholder token and ask for a short answer.
    inp = DEFAULT_IMAGE_TOKEN + question + "Answer in a word, number or a short sentence: "
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(
        model.device)
    # Stop generation at the conversation separator token.
    stop_str = conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # NOTE(review): do_sample=True with temperature=0.2 makes this path
    # non-deterministic, unlike the greedy decoding used by the other
    # *_inference helpers — confirm this is intended.
    temperature = 0.2
    max_new_tokens = 512
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
            stopping_criteria=[stopping_criteria])
    # Decode only the generated tokens; drop the trailing </s> if present.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    outputs = outputs.replace("</s>", "")
    print("outputs", outputs)
    return outputs
def qwenvl_inference(model, img_path, question, lang, tokenizer):
    """Single-turn Qwen-VL chat over one image; returns the model's reply.

    ``lang`` is accepted for signature parity with the other helpers but is
    not used: the prompt is just the image plus the raw question.
    """
    prompt_items = [
        {'image': img_path},
        {'text': question + "Answer: "},
    ]
    query = tokenizer.from_list_format(prompt_items)
    response, history = model.chat(tokenizer, query=query, history=None)
    print("query", query, "response", response)
    return response
if __name__=="__main__":
    # Command-line configuration for the multi-model VQA evaluation driver.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="./checkpoints/llava-v1.6-34b") #echo840/Monkey-Chat echo840/Monkey
    parser.add_argument("--data_path", type=str, default="./data/pageqa")
    # NOTE(review): type=bool on argparse options is misleading — any
    # non-empty string (including "False") parses as True; action="store_true"
    # is the usual fix.  Applies to --human, --mi and --ocr.
    parser.add_argument("--human", type=bool, default=False)
    # --save_path is overridden later from --mode/--lang/--model_path.
    parser.add_argument("--save_path", type=str, default="./result/qasper/ocr/llava-v1.6-34b-2.jsonl")
    parser.add_argument("--conv-mode", type=str, default="chatml_ocr")
    parser.add_argument("--mode", type=str, default="chatml_ocr")
    parser.add_argument("--lang", type=str, default="fr")
    parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--mi", type=bool, default=False)
    parser.add_argument("--ocr", type=bool, default=False)
    parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    args = parser.parse_args()
    # Lazily import and construct only the backend selected via --model_path;
    # each branch binds the globals used by the matching *_inference helper.
    if "llava" in args.model_path:
        from llava.constants import (
            IMAGE_TOKEN_INDEX,
            DEFAULT_IMAGE_TOKEN,
            DEFAULT_IM_START_TOKEN,
            DEFAULT_IM_END_TOKEN,
            IMAGE_PLACEHOLDER,
        )
        from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
        from llava.model.builder import load_pretrained_model
        import torch
        from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
        from llava.conversation import conv_templates, SeparatorStyle
        checkpoint = args.model_path
        model_name = get_model_name_from_path(args.model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, None, model_name)
    elif "monkey" in args.model_path:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        checkpoint = args.model_path
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda', trust_remote_code=True, fp16=True,
                                                     bf16=False).eval()
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
        # Left padding + eod as pad token: required by Monkey's generate().
        tokenizer.padding_side = 'left'
        tokenizer.pad_token_id = tokenizer.eod_id
    elif "cog" in args.model_path.lower():
        from transformers import AutoModelForCausalLM, LlamaTokenizer
        model = AutoModelForCausalLM.from_pretrained(
            args.model_path,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to('cuda').eval()
        # NOTE(review): tokenizer path is hard-coded; --local_tokenizer is
        # parsed above but not used here — confirm which one should win.
        tokenizer = LlamaTokenizer.from_pretrained("/home/share/models/vicuna-7b-v1.5")
    elif "mplug" in args.model_path:
        from transformers import TextStreamer
        from mplug_owl2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
        from mplug_owl2.conversation import conv_templates, SeparatorStyle
        from mplug_owl2.model.builder import load_pretrained_model
        from mplug_owl2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, \
            KeywordsStoppingCriteria
        model_name = get_model_name_from_path(args.model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, None, model_name,
                                                                               load_8bit=False, load_4bit=False,
                                                                               device="cuda")
    elif "Qwen" in args.model_path:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation import GenerationConfig
        import torch
        torch.manual_seed(1234)
        # Note: The default behavior now has injection attack prevention off.
        tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(args.model_path, device_map="cuda",
                                                     trust_remote_code=True).eval()
        tokenizer.padding_side = 'left'
        tokenizer.pad_token_id = tokenizer.eod_id
    elif "gpt" in args.model_path:
        import openai
        # NOTE(security): hard-coded API endpoint and key committed to the
        # repository; the key should be rotated and loaded from the
        # environment instead.
        openai.api_base = "https://openkey.cloud/v1"
        openai.api_key = "sk-7MEBBkzTBFQLHSfc4f4fC44e4c054c298a416f81138bA6D6"
        # openai.api_base = "http://127.0.0.1:8000/v1"
        # openai.api_key = "anything"
        # openai.default_headers = {"Authorization": "Bearer anything"}
        def encode_image(image_path):
            # Read the image and return its base64-encoded contents.
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        # client = OpenAI(api_key="sk-i9mcaWsHOpu7MmyF60F238Ff2cBe4f66BfF79e9d9b4c4550")
    elif "gemini" in args.model_path:
        from utils.gemini import Gemini_Model
        import itertools
        # NOTE(security): hard-coded Gemini API keys committed to the
        # repository; rotate them and load from the environment instead.
        API_KEY = "AIzaSyCYo6MWJKX4nrV8i36GKVVEVeuYfD3co-s"
        API_KEY4 = 'AIzaSyBFu_DKEssYwlc7MLB_vqQgNOwSJosDAvo' # yli
        API_KEY5 = 'AIzaSyBbXMPmA8fjG1WN0bPMdMhTLflW_ufJGvY' # RH
        # API_LIST = [
        #     "AIzaSyC71sLRFmcqw4dWk4G6u4JcfU7oyIdwle0",
        #     "AIzaSyDvgDp0P90FkPrhRu-dH4ckZBgvU9xZA0o",
        #     "AIzaSyDW1zBm_yT6m22_6LiE73iFysZXNyRuSkM",
        #     "AIzaSyBD3PJJ1pGDplOhTjVK3ESNFiC5umisaRk",
        #     "AIzaSyCujTyiBQKNq6M2Z2DeEY5WE7KKMdJTpFw",
        #     "AIzaSyD-LI7LrdqtmUCC4Uh8SLupmSOs-BXvzVM",
        #     "AIzaSyBc1wCA-INgCw-BzcEEw3wNYwG9qvyTP0I",
        #     "AIzaSyAKAFr4BluYrhbgvYjVIdIKQIHPhi49Ugc",
        #     "AIzaSyBDp6XTgF_5oNWxB79eLeg9UaC0xqn23J8",
        #     "AIzaSyD-Kwllyf5KY0VbSlHz22Yu_1qTOH6XU1g",
        #     "AIzaSyCocBiQHc0AWmnK4WvVb3CzjKQ_w_y1kmc",
        #     "AIzaSyDKJMM6OgbH9sVV_nTevWW4SaQe3KSyyv4",
        #     "AIzaSyCi98bty7j9n299249JmN_ZdJWiw-ENs3M",
        #     "AIzaSyCJDJpV_iuvbDuzn0AdVCZ0EDxHm3Z3xJw",
        #     "AIzaSyAI8xqLA9bEH9sVnxAi6oJGcGuRB3YqqMM",
        # ]
        API_LIST = [
            "AIzaSyD-Kwllyf5KY0VbSlHz22Yu_1qTOH6XU1g",
            "AIzaSyCocBiQHc0AWmnK4WvVb3CzjKQ_w_y1kmc",
            "AIzaSyDKJMM6OgbH9sVV_nTevWW4SaQe3KSyyv4",
            "AIzaSyAI8xqLA9bEH9sVnxAi6oJGcGuRB3YqqMM"
        ]
        # API_LIST = [
        #     "AIzaSyCYo6MWJKX4nrV8i36GKVVEVeuYfD3co-s",
        #     'AIzaSyBFu_DKEssYwlc7MLB_vqQgNOwSJosDAvo',
        #     'AIzaSyBbXMPmA8fjG1WN0bPMdMhTLflW_ufJGvY'
        # ]
        # One client per key; cycle through them to spread the request load.
        models = [Gemini_Model(key=key) for key in API_LIST]
        model_iterator = itertools.cycle(models)
    elif "blip" in args.model_path:
        from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
        import torch
        from PIL import Image
        import requests
        model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b").cuda()
        processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
    # elif "minicpm" in args.model_path:
    #     from transformers import AutoModelForCausalLM, AutoTokenizer
    #     from models.MiniCPM-V.chat import MiniCPMVChat, img2base64
    #     chat_model = MiniCPMVChat(args.model_path)
    # Map --mode values to the result sub-directory names.
    mode_dict = {
        "ocrvqa": "ocrvqa",
        "text": "textvqa",
        "doc": "docvqa",
        "chart": "chartvqa",
        "paper": "paper",
        "lunwen": "lunwen"
    }
    # The --save_path argument is always overridden here; the filename suffix
    # encodes which ablation produced the file (noise / OCR / human split).
    if args.mi:
        args.save_path = f'./close_result/{mode_dict[args.mode]}/{args.lang}/{args.model_path.split("/")[-1]}_gauss80.jsonl'
    elif args.ocr:
        args.save_path = f'./close_result/{mode_dict[args.mode]}/{args.lang}/{args.model_path.split("/")[-1]}_ocr.jsonl'
    elif args.human:
        args.save_path = f'./close_result/{mode_dict[args.mode]}/{args.lang}/{args.model_path.split("/")[-1]}_human.jsonl'
    else:
        args.save_path = f'./close_result/{mode_dict[args.mode]}/{args.lang}/{args.model_path.split("/")[-1]}.jsonl'
    print("save path", args.save_path)
    # Gemini runs append so an interrupted run can be resumed; everything
    # else overwrites its output file.
    file_mode = "a" if "gemini" in args.model_path else "w"
    if os.path.exists(args.save_path):
        with open(args.save_path, "r") as f:
            gen_length = len(f.readlines())
    else:
        gen_length = 0
    # NOTE(review): gen_length is printed but never used below to skip the
    # already-generated rows — confirm the resume logic is complete.
    print("gen length", gen_length)
    if not os.path.exists(os.path.dirname(args.save_path)):
        os.makedirs(os.path.dirname(args.save_path))
    with open(args.save_path, file_mode) as f:
        if args.mode == "ocr":
            # Transcribe every page image with LLaVA.
            imgs = glob.glob(os.path.join("./data/pageqa/png", "*.png"))
            for img in imgs:
                # NOTE(review): llava_inference is called with only
                # (model, "") — the loop's ``img`` is never passed, so this
                # branch looks broken; confirm the intended arguments.
                outputs = llava_inference(model, "")
                f.write(json.dumps({
                    "imgname": img.split('/')[-1].replace(".png", ""),
                    "ocr": outputs,
                }) + "\n")
                print(f"Question: {img} Answer: {outputs}")
        elif args.mode == "chart":
            # ChartQA evaluation; the "human" split vs automatically
            # augmented questions is chosen via --human.
            from dataset.dataloader import ChartvqaLoader
            args.data_path = "./data/ChartQA/test/"
            data_split = "human" if args.human else "augmented"
            if args.lang == "en":
                dataloader = ChartvqaLoader(os.path.join(args.data_path, f"test_{data_split}.json"))
            else:
                dataloader = ChartvqaLoader(os.path.join(args.data_path, f"test_{data_split}_{args.lang}.jsonl"))
            for i, (question, imgname, answers) in enumerate(tqdm(dataloader)):
                img_path = os.path.join(args.data_path, f"png/{imgname}")
                if "llava" in args.model_path:
                    outputs, mi = llava_inference(model, img_path, question, args.lang, tokenizer, args.conv_mode, answers, mi=True)
                elif "monkey" in args.model_path:
                    outputs, mi = monkey_inference(model, img_path, question, args.lang, tokenizer, answers, mi=True)
                elif "Qwen" in args.model_path:
                    outputs, mi = monkey_inference(model, img_path, question, args.lang, tokenizer, answers, mi=True)
                elif "gemini" in args.model_path:
                    # API-billed models are only evaluated on every 30th sample.
                    if i % 30 != 0:
                        continue
                    model = next(model_iterator)
                    outputs = gemini_inference(model, img_path, question)
                elif "gpt" in args.model_path:
                    if i % 30 != 0:
                        continue
                    outputs = gpt_inference(img_path, question, args.lang)
                elif "blip" in args.model_path:
                    outputs, _ = instructblip_inference(model, img_path, question, args.lang, processor)
                elif "cog" in args.model_path:
                    outputs = cogvlm_inference(model, tokenizer, img_path, question)
                elif "mplug" in args.model_path:
                    outputs = mplug_inference(model, img_path, question, tokenizer, image_processor)
                print(f"Question: {question} Answer: {outputs}")
                # NOTE(review): ``mi`` is only assigned on the
                # llava/monkey/Qwen paths; with --mi set and any other
                # backend the next line raises NameError — confirm.
                f.write(json.dumps({"img_id": imgname,
                                    "prompt": question,
                                    "text": outputs,
                                    "gt": answers,
                                    "mi": mi if args.mi else 0,
                                    "metadata": {}}, ensure_ascii=False) + "\n")
        elif args.mode == "ocrvqa":
            # OCR-VQA evaluation; only every 20th sample is scored.
            from dataset.dataloader import OcrvqaLoader
            args.data_path = "./data/ocrvqa"
            dataloader = OcrvqaLoader(os.path.join(args.data_path, f"test_{args.lang}.jsonl"))
            for i, (question_id, question, imgname, answer) in enumerate(tqdm(dataloader)):
                if i % 20 != 0:
                    continue
                img_path = os.path.join(args.data_path, f"images/{imgname}.jpg")
                # NOTE(review): the bare except below swallows every failure
                # (including KeyboardInterrupt); it logs the traceback and
                # skips the sample.
                try:
                    if "llava" in args.model_path:
                        outputs, _ = llava_inference(model, img_path, question, args.lang, tokenizer, args.conv_mode)
                    elif "monkey" in args.model_path:
                        outputs, mi = monkey_inference(model, img_path, question, args.lang, tokenizer, answer, mi=True)
                    elif "Qwen" in args.model_path:
                        outputs = qwenvl_inference(model, img_path, question, args.lang, tokenizer)
                    elif "gemini" in args.model_path:
                        model = next(model_iterator)
                        outputs = gemini_inference(model, img_path, question)
                    elif "gpt" in args.model_path:
                        outputs = gpt_inference(img_path, question, args.lang)
                    elif "blip" in args.model_path:
                        outputs, _ = instructblip_inference(model, img_path, question, args.lang, processor)
                    elif "cog" in args.model_path:
                        outputs = cogvlm_inference(model, tokenizer, img_path, question)
                    elif "mplug" in args.model_path:
                        outputs = mplug_inference(model, img_path, question, tokenizer, image_processor)
                except:
                    print(traceback.format_exc())
                    continue
                # NOTE(review): ``mi`` is only assigned on the monkey path;
                # with --mi set and another backend this write raises
                # NameError — confirm.
                f.write(json.dumps({
                    "question_id": question_id,
                    "imgname": imgname,
                    "question": question,
                    "gt": answer,
                    "answer": outputs,
                    "mi": mi if args.mi else 0,
                }, ensure_ascii=False) + "\n")
                print(f"Question: {question} Answer: {outputs}")
        elif args.mode == "doc":
            # DocVQA evaluation over the validation split.
            args.data_path = "./data/docvqa/"
            from dataset.dataloader import DocvqaLoader
            if args.lang == "en":
                dataloader = DocvqaLoader(os.path.join(args.data_path, f"qas/val_v1.0_withQT.json"))
            else:
                dataloader = DocvqaLoader(os.path.join(args.data_path, f"qas/val_v1.0_withQT_{args.lang}.jsonl"))
            for i, (question_id, question, image, answers) in enumerate(tqdm(dataloader)):
                img_path = os.path.join(args.data_path, image)
                try:
                    # NOTE(review): llava_inference/monkey_inference return a
                    # (response, mi) tuple here assigned to the single name
                    # ``outputs``, so the tuple is written to the result file;
                    # other branches unpack it — confirm which is intended.
                    if "llava" in args.model_path:
                        outputs = llava_inference(model, img_path, question, args.lang, tokenizer, args.conv_mode)
                    elif "monkey" in args.model_path:
                        outputs = monkey_inference(model, img_path, question, args.lang, tokenizer)
                    elif "Qwen" in args.model_path:
                        outputs = monkey_inference(model, img_path, question, args.lang, tokenizer)
                    elif "gemini" in args.model_path:
                        if i % 20 != 1:
                            continue
                        model = next(model_iterator)
                        outputs = gemini_inference(model, img_path, question)
                    elif "gpt" in args.model_path:
                        # NOTE(review): ``client`` is undefined and
                        # gpt_inference takes 3 arguments; this call always
                        # raises and is silently swallowed by the bare except,
                        # so the gpt path never produces DocVQA results.
                        outputs = gpt_inference(client, img_path, question, args.lang)
                    elif "blip" in args.model_path:
                        outputs, _ = instructblip_inference(model, img_path, question, args.lang, processor)
                    elif "cog" in args.model_path:
                        outputs = cogvlm_inference(model, tokenizer, img_path, question)
                    elif "mplug" in args.model_path:
                        outputs = mplug_inference(model, img_path, question, tokenizer, image_processor)
                except:
                    continue
                f.write(json.dumps({
                    "questionId": question_id,
                    "imgname": image,
                    "question": question,
                    "gt": answers,
                    "answer": outputs,
                }, ensure_ascii=False) + "\n")
                print(f"Question: {question} Answer: {outputs}")
        elif args.mode == "text":
            # TextVQA evaluation; only samples with i % 20 == 1 are scored.
            args.data_path = "./data/textvqa"
            from dataset.dataloader import TextvqaLoader
            if args.lang == "en":
                dataloader = TextvqaLoader(os.path.join(args.data_path, f"TextVQA_0.5.1_val.json"))
            else:
                dataloader = TextvqaLoader(os.path.join(args.data_path, f"TextVQA_0.5.1_val_{args.lang}.jsonl"))
            print("data loaded", os.path.join(args.data_path, f"TextVQA_0.5.1_val_{args.lang}.jsonl"))
            for i, (question, imgname, question_id, answers, ocr_tokens) in enumerate(tqdm(dataloader)):
                img_path = os.path.join(args.data_path, f"train_images/{imgname}.jpg")
                if i % 20 != 1:
                    continue
                # NOTE(review): llava_inference/monkey_inference return a
                # (response, mi) tuple but ``outputs`` is not unpacked here,
                # so the tuple itself is written to the result file — confirm.
                if "llava" in args.model_path:
                    outputs = llava_inference(model, img_path, question, args.lang, tokenizer, args.conv_mode)
                elif "monkey" in args.model_path:
                    outputs = monkey_inference(model, img_path, question, args.lang, tokenizer)
                elif "Qwen" in args.model_path:
                    outputs = qwenvl_inference(model, img_path, question, args.lang, tokenizer)
                elif "gemini" in args.model_path:
                    model = next(model_iterator)
                    outputs = gemini_inference(model, img_path, question)
                elif "gpt" in args.model_path:
                    outputs = gpt_inference(img_path, question, args.lang)
                elif "blip" in args.model_path:
                    # InstructBLIP additionally receives the dataset's OCR tokens.
                    outputs, _ = instructblip_inference(model, img_path, question, args.lang, processor, ocr_tokens)
                elif "cog" in args.model_path:
                    outputs = cogvlm_inference(model, tokenizer, img_path, question)
                elif "mplug" in args.model_path:
                    outputs = mplug_inference(model, img_path, question, tokenizer, image_processor)
                f.write(json.dumps({
                    "question_id": question_id,
                    "imgname": imgname,
                    "question": question,
                    "gt": answers,
                    "answer": outputs,
                }, ensure_ascii=False) + "\n")
                print(f"Question: {question} Answer: {outputs}")
        elif args.mode == "paper":
            # Paper page-QA evaluation over rendered PDF pages.
            from dataset.dataloader import PaperLoader
            if args.lang == "en":
                args.data_path = "./data/pageqa/dev.json"
            else:
                args.data_path = "./data/pageqa/dev_zh.jsonl"
            dataloader = PaperLoader(args.data_path)
            for i, (question, imgname, question_id, answers) in enumerate(tqdm(dataloader)):
                img_path = os.path.join("./data/pageqa/png", f"{imgname}.png")
                # NOTE(review): if --model_path matches none of these
                # branches, ``outputs`` is unbound and the write below raises
                # NameError.  The llava branch also writes the (response, mi)
                # tuple unmodified — confirm.
                if "cog" in args.model_path:
                    outputs = cogvlm_inference(model, tokenizer, img_path, question)
                elif "mplug" in args.model_path:
                    outputs = mplug_inference(model, img_path, question, tokenizer, image_processor)
                elif "Qwen" in args.model_path:
                    outputs = qwenvl_inference(model, img_path, question, args.lang, tokenizer)
                elif "blip" in args.model_path:
                    outputs, _ = instructblip_inference(model, img_path, question, args.lang, processor)
                elif "gpt" in args.model_path:
                    # API-billed models only score samples with i % 20 == 3.
                    if i % 20 != 3:
                        continue
                    outputs = gpt_inference(img_path, question, args.lang)
                elif "gemini" in args.model_path:
                    if i % 20 != 3:
                        continue
                    model = next(model_iterator)
                    outputs = gemini_inference(model, img_path, question)
                elif "llava" in args.model_path:
                    outputs = llava_inference(model, img_path, question, args.lang, tokenizer, args.conv_mode)
                f.write(json.dumps({
                    "question_id": question_id,
                    "question": question,
                    "answer": outputs,
                    "gt": answers}
                    ,ensure_ascii = False
                ) + "\n")
        elif args.mode == "lunwen":
            # Chinese-paper QA: three question types evaluated back to back.
            from dataset.dataloader import LunwenLoader
            data_paths = [f"./data/ch_paper/qas/final-abstractive-{args.lang}-zh.jsonl",
                          f"./data/ch_paper/qas/final-extractive-{args.lang}-zh.jsonl",
                          f"./data/ch_paper/qas/latest-yes-no-{args.lang}-zh.jsonl"]
            types = ['abstractive', 'extractive', 'yesno']
            # NOTE(review): the loop variable ``type`` shadows the builtin.
            for data_path, type in zip(data_paths, types):
                dataloader = LunwenLoader(data_path)
                for i, data in enumerate(tqdm(dataloader)):
                    img_path, question, answer = data['img_path'], data['question'], data['answer']
                    # Dataset paths are relative to the repo root, not eval/.
                    img_path = img_path.replace("../", "./")
                    if "cog" in args.model_path:
                        outputs = cogvlm_inference(model, tokenizer, img_path, question)
                    elif "mplug" in args.model_path:
                        outputs = mplug_inference(model, img_path, question, tokenizer, image_processor)
                    elif "Qwen" in args.model_path:
                        outputs = qwenvl_inference(model, img_path, question, args.lang, tokenizer)
                    elif "gpt" in args.model_path:
                        try:
                            # API-billed models only score samples with i % 50 == 3.
                            if i % 50 != 3:
                                continue
                            outputs = gpt_inference(img_path, question, args.lang)
                        except Exception as e:
                            print(e)
                            continue
                    elif "gemini" in args.model_path:
                        try:
                            if i % 50 != 3:
                                continue
                            model = next(model_iterator)
                            outputs = gemini_inference(model, img_path, question)
                        except:
                            continue
                    elif "blip" in args.model_path:
                        outputs, _ = instructblip_inference(model, img_path, question, args.lang, processor)
                    elif "llava" in args.model_path:
                        # NOTE(review): writes the (response, mi) tuple unmodified — confirm.
                        outputs = llava_inference(model, img_path, question, args.lang, tokenizer, args.conv_mode)
                    f.write(json.dumps({
                        "img_path": img_path,
                        "question": question,
                        "answer": outputs,
                        "gt": answer,
                        "type": type}
                        , ensure_ascii=False
                    ) + "\n")
================================================
FILE: eval/eval_cogagent.py
================================================
import torch
import sys
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer
import argparse
from dataset.dataloader import DataLoader
import os
import json
# Command-line options for the CogAgent page-QA evaluation script.
parser = argparse.ArgumentParser()
parser.add_argument("--quant", choices=[4], type=int, default=None, help='quantization bits')
parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
# NOTE(review): --fp16 is parsed but never read below; float16 is the
# fallback whenever --bf16 is absent — confirm the flag can be dropped.
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--bf16", action="store_true")
parser.add_argument("--model_path", type=str, default="./checkpoints/cog-agent-hf") #echo840/Monkey-Chat echo840/Monkey
parser.add_argument("--data_path", type=str, default="./data/pageqa")
parser.add_argument("--save_path", type=str, default="./result/cogagent/qasper_zh.jsonl")
args = parser.parse_args()
dataloader = DataLoader(os.path.join(args.data_path, "dev_back.jsonl"))
MODEL_PATH = args.model_path
TOKENIZER_PATH = args.local_tokenizer
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH)
# Choose the compute dtype: bfloat16 only when explicitly requested.
if args.bf16:
    torch_type = torch.bfloat16
else:
    torch_type = torch.float16
print("========Use torch type as:{} with device:{}========\n\n".format(torch_type, DEVICE))
if args.quant:
    # 4-bit quantized load; device placement is handled by the quantized
    # loader, hence no explicit .to(DEVICE) on this branch.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch_type,
        low_cpu_mem_usage=True,
        load_in_4bit=True,
        trust_remote_code=True
    ).eval()
else:
    # Full-precision (fp16/bf16) load moved to the target device.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch_type,
        low_cpu_mem_usage=True,
        load_in_4bit=args.quant is not None,
        trust_remote_code=True
    ).to(DEVICE).eval()
with open(args.save_path, "w") as f:
    # One greedy-decoded answer per (question, page image) pair.
    for question, imgname, question_id, answers in dataloader:
        img_path = f"{args.data_path}/png/{imgname}.png"
        image = Image.open(img_path).convert('RGB')
        history = []
        query = question + "Answer in one short sentence: "
        input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history,
                                                            images=[image])
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]],
        }
        # CogAgent additionally consumes high-resolution cross-attention images.
        if 'cross_images' in input_by_model and input_by_model['cross_images']:
            inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
        # add any transformers params here.
        # NOTE(review): temperature is ignored because do_sample=False (greedy).
        gen_kwargs = {"max_length": 2048,
                      "temperature": 0.9,
                      "do_sample": False}
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            # Keep only the generated continuation; drop the prompt tokens
            # and truncate at the end-of-sequence marker.
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = tokenizer.decode(outputs[0])
            response = response.split("</s>")[0]
        print("\nCog:", response)
        f.write(json.dumps({
            "question_id": question_id,
            "predicted_answer": response,
        }) + "\n")
        print(f"Question: {question} Answer: {response}")
================================================
FILE: eval/eval_lunwen.sh
================================================
# Score previously generated prediction files with eval_pageqa.py.
# MODEL/NAME are only consumed by the commented-out variants below.
export MODEL=qwen
export NAME=llava-v1.6-34b
python ./eval/eval_pageqa.py --predictions result/lunwen/en/llava-v1.5-13b-trans.jsonl \
    --language mix
python ./eval/eval_pageqa.py --predictions result/lunwen/en/llava-v1.6-34b-trans.jsonl \
    --language mix
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/full/en/minicpm/20240814170301/lunwen-trans.jsonl \
#    --language yesno
#python ./eval/eval_pageqa.py --predictions ./result/lunwen/zh/${NAME}.jsonl \
#    --language mix
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/en/${MODEL}-abstractive.jsonl \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/en/${MODEL}-extractive.jsonl \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/en/${MODEL}-yes-no.jsonl \
#    --language yesno
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/zh/minicpm/20240803022751/lunwen.json \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/wokl/en/minicpm/20240813161800/lunwen.json \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/wokl/en/minicpm/20240813181343/lunwen.json \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/en/minicpm/20240813195649/lunwen.json \
#    --language yesno
#
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/wokl/zh/minicpm/20240813202906/lunwen.json \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/wokl/zh/minicpm/20240813225802/lunwen.json \
#    --language zh
#python ./eval/eval_pageqa.py --predictions results/paper/chpaper/zh/minicpm/20240814011900/lunwen.json \
#    --language yesno
================================================
FILE: eval/eval_minicpm.py
================================================
from transformers import AutoModelForCausalLM, AutoTokenizer
from dataset.dataloader import DataLoader
import torch
# Smoke test of the text-only MiniCPM-2B chat model (no image input here).
torch.manual_seed(0)
path = 'openbmb/MiniCPM-2B-dpo-bf16'
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True)
# Single-turn sanity question (Chinese): "Which is Shandong's highest
# mountain; is it taller or shorter than Huangshan, and by how much?"
responds, history = model.chat(tokenizer, "山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?", temperature=0.5, top_p=0.8, repetition_penalty=1.02)
print(responds)
================================================
FILE: eval/eval_monkey.py
================================================
import json
import sys
sys.path.append("..")
sys.path.append("../..")
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
from dataset.dataloader import PaperLoader, LunwenLoader
import os, glob
if __name__=="__main__":
    # Stand-alone Monkey evaluation over the paper / lunwen QA sets.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="../checkpoints/monkey") #echo840/Monkey-Chat echo840/Monkey
    parser.add_argument("--data_path", type=str, default="../data/pageqa")
    parser.add_argument("--save_path", type=str, default="../results/paper/enpaper/en/monkey.jsonl")
    parser.add_argument("--mode", type=str, default="paper")
    parser.add_argument("--lang", type=str, default="en")
    args = parser.parse_args()
    checkpoint = args.model_path
    # Ensure the output directory exists before writing results.
    if not os.path.exists(os.path.dirname(args.save_path)):
        os.makedirs(os.path.dirname(args.save_path))
    # Monkey ships custom modeling code; fp16 weights, greedy decoding below.
    model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda', trust_remote_code=True, fp16=True, bf16=False).eval()
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    # Left padding + eod as pad token: required by Monkey's generate().
    tokenizer.padding_side = 'left'
    tokenizer.pad_token_id = tokenizer.eod_id
    # NOTE(review): the two branches below duplicate the tokenize/generate/
    # decode sequence verbatim; a shared helper would remove the duplication.
    if args.mode == "paper":
        if args.lang == "en":
            dataloader = PaperLoader(os.path.join(args.data_path, "dev.json"))
        else:
            dataloader = PaperLoader(os.path.join(args.data_path, "dev_zh_en.jsonl"))
        with open(args.save_path, "w") as f:
            for question, imgname, question_id, answers in dataloader:
                img_path = os.path.join(args.data_path, f"png/{imgname}.png")
                query = f'<img>{img_path}</img> {question} Answer: ' # VQA
                input_ids = tokenizer(query, return_tensors='pt', padding='longest')
                attention_mask = input_ids.attention_mask
                input_ids = input_ids.input_ids
                pred = model.generate(
                    input_ids=input_ids.cuda(),
                    attention_mask=attention_mask.cuda(),
                    do_sample=False,
                    num_beams=1,
                    max_new_tokens=512,
                    min_new_tokens=1,
                    length_penalty=1,
                    num_return_sequences=1,
                    output_hidden_states=True,
                    use_cache=True,
                    pad_token_id=tokenizer.eod_id,
                    eos_token_id=tokenizer.eod_id,
                )
                # Decode only the generated continuation (skip the prompt).
                response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
                f.write(json.dumps({
                    "question_id": question_id,
                    "question": question,
                    "answer": response
                }) + '\n')
                print(f"Question: {question} Answer: {response}")
    elif args.mode == "lunwen":
        # Here --data_path points directly at a QA jsonl file.
        dataloader = LunwenLoader(args.data_path)
        with open(args.save_path, "w") as f:
            for data in dataloader:
                img_path, question, answer = data['img_path'], data['question'], data['answer']
                query = f'<img>{img_path}</img> {question} Answer: ' # VQA
                input_ids = tokenizer(query, return_tensors='pt', padding='longest')
                attention_mask = input_ids.attention_mask
                input_ids = input_ids.input_ids
                pred = model.generate(
                    input_ids=input_ids.cuda(),
                    attention_mask=attention_mask.cuda(),
                    do_sample=False,
                    num_beams=1,
                    max_new_tokens=512,
                    min_new_tokens=1,
                    length_penalty=1,
                    num_return_sequences=1,
                    output_hidden_states=True,
                    use_cache=True,
                    pad_token_id=tokenizer.eod_id,
                    eos_token_id=tokenizer.eod_id,
                )
                response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
                f.write(json.dumps({
                    "gt_answer": answer,
                    "question": question,
                    "answer": response
                }, ensure_ascii=False) + '\n')
                print(f"Question: {question} Answer: {response}")
================================================
FILE: eval/eval_pageqa.py
================================================
import argparse
import pdb
import jieba
import json
from collections import Counter
import string, re
def normalize_answer(s):
    """Normalize *s* for answer comparison (SQuAD v1.1 convention).

    Lower-cases, strips all punctuation, drops the articles a/an/the, and
    collapses runs of whitespace to single spaces.
    """
    lowered = s.lower()
    no_punct = "".join(ch for ch in lowered if ch not in string.punctuation)
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct)
    return " ".join(no_articles.split())
def single_f1_zh(gold, answer):
    """Token-level F1 between a gold answer and a prediction in Chinese.

    Both strings are segmented with jieba (precise mode) and scored with the
    standard bag-of-tokens F1 used by SQuAD-style evaluations.

    Args:
        gold: reference answer string.
        answer: predicted answer; if a list, only the first item is scored.

    Returns:
        float F1 in [0, 1]; 0.0 when the segmentations share no token.
    """
    if isinstance(answer, list):
        answer = answer[0]
    # Materialize the jieba generators directly instead of the previous
    # join("/ ")/split("/ ") round-trip, which corrupted the token counts
    # whenever a segment happened to contain the separator.
    gold_seg = list(jieba.cut(gold, cut_all=False))
    answer_seg = list(jieba.cut(answer, cut_all=False))
    common = Counter(answer_seg) & Counter(gold_seg)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(answer_seg)
    recall = num_same / len(gold_seg)
    return (2 * precision * recall) / (precision + recall)
def single_f1_en(prediction, ground_truth):
    """Bag-of-tokens F1 (SQuAD v1.1) between *prediction* and *ground_truth*."""
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)
def score_yes_no(prediction):
    """F1 over yes/no items, treating "yes"/"是" as the positive class.

    Each item needs a gold answer under 'gt_answers' (or 'gt') and a model
    answer under 'answer' (a string, or a list whose first item is used).

    Returns:
        float F1 of the positive class; 0.0 when it is undefined (no
        positive gold answers, no positive predictions, or no true
        positives) instead of raising ZeroDivisionError as before.
    """
    total_gold_true, total_predict_true, total_right = 0, 0, 0
    for p in prediction:
        gt_answer = p['gt_answers'] if "gt_answers" in p.keys() else p['gt']
        if isinstance(p['answer'], list):
            p['answer'] = p['answer'][0]
        p['answer'] = p['answer'].lower()
        # Bug fix: the second clause used to re-test "是", so English "yes"
        # gold answers were never counted as positives.
        gold_is_yes = gt_answer.startswith("是") or gt_answer.lower().startswith("yes")
        pred_is_yes = p['answer'].startswith("是") or p['answer'].startswith("yes")
        if gold_is_yes:
            total_gold_true += 1
        if pred_is_yes:
            total_predict_true += 1
        if pred_is_yes and gold_is_yes:
            total_right += 1
    print("amount", total_right, total_predict_true, total_gold_true)
    if total_predict_true == 0 or total_gold_true == 0 or total_right == 0:
        # F1 undefined or trivially zero; avoid division by zero.
        return 0.0
    precision = 1.0 * total_right / total_predict_true
    recall = 1.0 * total_right / total_gold_true
    print(precision, recall)
    return (2 * precision * recall) / (precision + recall)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predictions",
        type=str,
        required=True,
        help="""JSON lines file with each line in format:
        {'question_id': str, 'predicted_answer': str, 'predicted_evidence': List[str]}"""
    )
    parser.add_argument(
        "--language",
        type=str,
        default="yesno"
    )
    parser.add_argument(
        "--text_evidence_only",
        action="store_true",
        help="If set, the evaluator will ignore evidence in figures and tables while reporting evidence f1"
    )
    args = parser.parse_args()
    # Security/robustness fix: explicit dispatch table instead of eval() on a
    # CLI-supplied string, which would execute arbitrary expressions passed
    # via --language.
    f1_by_language = {"zh": single_f1_zh, "en": single_f1_en}
    extract_scores, abstract_scores, yesno_scores = [], [], []
    # Predictions may be JSON lines ("jsonl" in the filename) or a JSON array.
    if "jsonl" in args.predictions:
        with open(args.predictions, "r") as f:
            predictions = [json.loads(line) for line in f]
    else:
        predictions = json.load(open(args.predictions))
    if args.language == "yesno":
        f1 = score_yes_no(predictions)
        print("Avg F1", f1)
    elif args.language == "mix":
        # Mixed Chinese set: bucket per question type; yes/no items are
        # scored jointly by score_yes_no, so keep the raw records.
        for p in predictions:
            gold = p['gt_answers'] if "gt_answers" in p.keys() else p['gt']
            predict = p['answer']
            score = single_f1_zh(gold, predict)
            if p['type'] == "extractive":
                extract_scores.append(score)
            elif p['type'] == "abstractive":
                abstract_scores.append(score)
            elif p['type'] == "yesno":
                yesno_scores.append(p)
        print("extract F1", sum(extract_scores) / len(extract_scores))
        print("abs F1", sum(abstract_scores) / len(abstract_scores))
        print("yesno F1", score_yes_no(yesno_scores))
    else:
        scores = []
        for p in predictions:
            gold = p['gt_answers'] if "gt_answers" in p.keys() else p['gt']
            predict = p['answer']
            scores.append(f1_by_language[args.language](gold, predict))
        print("Avg F1", sum(scores) / len(scores))
================================================
FILE: eval/eval_paper_llm.py
================================================
import sys, os, json
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
sys.path.append("../../..")
import transformers, torch
from tqdm import tqdm
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM
if __name__=="__main__":
    # Text-only (no image) paper-QA evaluation with a causal LM.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="./checkpoints/llava-v1.6-34b") #echo840/Monkey-Chat echo840/Monkey
    parser.add_argument("--data_path", type=str, default="./data/pageqa")
    parser.add_argument("--save_path", type=str, default="./result/qasper/ocr/llava-v1.6-34b-2.jsonl")
    parser.add_argument("--metadata", type=str, default="./data/ch_paper/metadata.jsonl")
    parser.add_argument("--lang", type=str, default="zh")
    parser.add_argument("--mode", type=str, default="chpaper")
    args = parser.parse_args()
    # Imported late so the `dataset` package is resolved via the sys.path
    # tweaks at the top of this file.
    from dataset.dataloader import PaperTextLoader, LunwenTextLoader
    if not os.path.exists(os.path.dirname(args.save_path)):
        os.makedirs(os.path.dirname(args.save_path))
    # Loader selection: English Qasper pages ("enpaper") vs Chinese papers.
    if args.mode == "enpaper":
        if args.lang == "en":
            dataloader = PaperTextLoader(
                os.path.join(args.data_path, "dev.json"),
                os.path.join(args.data_path, "dev_metadata.json")
            )
        elif args.lang == "zh":
            dataloader = PaperTextLoader(
                os.path.join(args.data_path, "dev_zh_en.jsonl"),
                os.path.join(args.data_path, "dev_metadata.json")
            )
    elif args.mode == "chpaper":
        dataloader = LunwenTextLoader(args.data_path, args.metadata)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    model = AutoModelForCausalLM.from_pretrained(args.model_path)
    # NOTE(review): args.save_path is opened but nothing is ever written to
    # it -- results are only printed below. Likely an oversight; confirm.
    with open(args.save_path, "w") as f:
        for question, imgname, question_id, answers, content in tqdm(dataloader):
            # NOTE(review): `gold` assumes OCR-style nested detections in
            # content[0]; it is computed but never used afterwards.
            gold = "".join([det[1][0] for det in content[0]])
            if args.mode == "enpaper":
                query = f"Answer the question in a short sentence based on the content. Content:{content}" \
                        f"Question: {question}" \
                        f"Answer in a short sentence: "
            elif args.mode == "chpaper":
                query = f"根据所给内容用一句简短的话回答问题。内容:{content}" \
                        f"问题: {question}" \
                        f"用一句简短的话回答:"
            inputs = tokenizer(query, return_tensors="pt")
            # Generate
            generate_ids = model.generate(inputs.input_ids, max_length=4096)
            output_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            # input_ids = tokenizer.encode(query, return_tensors='pt')
            #
            #
            # # Generate the text reply
            # output = model.generate(input_ids, max_length=4096, num_return_sequences=1)
            #
            # # Decode the generated output into the final reply text
            # output_text = tokenizer.decode(output[0], skip_special_tokens=True)
            # Print the reply text
            print(f"Question {query} Answer: {output_text}")
================================================
FILE: eval/eval_qasper.py
================================================
"""
Official script for evaluating models built for the Qasper dataset. The script
outputs Answer F1 and Evidence F1 reported in the paper.
"""
from collections import Counter
import argparse
import string
import re
import json
def normalize_answer(s):
    """Normalize *s* for answer comparison (SQuAD v1.1 convention).

    Lower-cases, strips all punctuation, drops the articles a/an/the, and
    collapses runs of whitespace to single spaces.
    """
    lowered = s.lower()
    without_punct = "".join(ch for ch in lowered if ch not in string.punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct)
    return " ".join(without_articles.split())
def token_f1_score(prediction, ground_truth):
    """Bag-of-tokens F1 (SQuAD v1.1) between *prediction* and *ground_truth*."""
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)
def paragraph_f1_score(prediction, ground_truth):
    """Set-overlap F1 between predicted and gold evidence paragraphs.

    Both sides empty counts as a perfect match (an unanswerable question
    answered with no evidence).
    """
    if not ground_truth and not prediction:
        return 1.0
    overlap = len(set(ground_truth) & set(prediction))
    if overlap == 0:
        return 0.0
    precision = overlap / len(prediction)
    recall = overlap / len(ground_truth)
    return (2 * precision * recall) / (precision + recall)
def get_answers_and_evidence(data, text_evidence_only):
    """Collect reference answers and evidence for every question in *data*.

    Args:
        data: Qasper-style dict mapping paper id -> {"qas": [...]}.
        text_evidence_only: when True, drop evidence snippets extracted from
            figures/tables (marked with "FLOAT SELECTED").

    Returns:
        dict mapping question_id -> list of reference dicts with keys
        "answer", "evidence" and "type" (extractive/abstractive/boolean/none).

    Raises:
        RuntimeError: when an answerable annotation has no answer at all.
    """
    answers_and_evidence = {}
    for paper in data.values():
        for qa in paper["qas"]:
            refs = []
            for annotation in qa["answers"]:
                info = annotation["answer"]
                if info["unanswerable"]:
                    refs.append({"answer": "Unanswerable", "evidence": [], "type": "none"})
                    continue
                # Priority order matters: extractive spans win over the
                # free-form answer, which wins over the yes/no flag
                # (truthy -> "Yes", explicit False -> "No").
                if info["extractive_spans"]:
                    answer, answer_type = ", ".join(info["extractive_spans"]), "extractive"
                elif info["free_form_answer"]:
                    answer, answer_type = info["free_form_answer"], "abstractive"
                elif info["yes_no"]:
                    answer, answer_type = "Yes", "boolean"
                elif info["yes_no"] is not None:
                    answer, answer_type = "No", "boolean"
                else:
                    raise RuntimeError(f"Annotation {info['annotation_id']} does not contain an answer")
                evidence = info["evidence"]
                if text_evidence_only:
                    evidence = [text for text in evidence if "FLOAT SELECTED" not in text]
                refs.append({"answer": answer, "evidence": evidence, "type": answer_type})
            answers_and_evidence[qa["question_id"]] = refs
    return answers_and_evidence
def evaluate(gold, predicted):
    """Score *predicted* against *gold* references (Qasper metrics).

    For each answered question the best (max) answer F1 over all references
    is taken and bucketed under the reference type that achieved it;
    evidence F1 is likewise the max over references. Questions without a
    prediction are only counted under "Missing predictions".
    """
    best_answer_f1s = []
    best_evidence_f1s = []
    by_type = {
        "extractive": [],
        "abstractive": [],
        "boolean": [],
        "none": [],
    }
    missing = 0
    for question_id, references in gold.items():
        if question_id not in predicted:
            missing += 1
            continue
        scored = sorted(
            ((token_f1_score(predicted[question_id]["answer"], ref["answer"]), ref["type"])
             for ref in references),
            key=lambda pair: pair[0],
            reverse=True,
        )
        best_f1, best_type = scored[0]
        best_answer_f1s.append(best_f1)
        by_type[best_type].append(best_f1)
        best_evidence_f1s.append(max(
            paragraph_f1_score(predicted[question_id]["evidence"], ref["evidence"])
            for ref in references
        ))
    print(len(best_answer_f1s))
    mean = lambda xs: sum(xs) / len(xs) if xs else 0.0
    return {
        "Answer F1": mean(best_answer_f1s),
        "Answer F1 by type": {key: mean(vals) for key, vals in by_type.items()},
        "Evidence F1": mean(best_evidence_f1s),
        "Missing predictions": missing,
    }
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predictions",
        type=str,
        required=True,
        help="""JSON lines file with each line in format:
        {'question_id': str, 'predicted_answer': str, 'predicted_evidence': List[str]}"""
    )
    parser.add_argument(
        "--gold",
        type=str,
        required=True,
        help="Test or dev set from the released dataset"
    )
    parser.add_argument(
        "--text_evidence_only",
        action="store_true",
        help="If set, the evaluator will ignore evidence in figures and tables while reporting evidence f1"
    )
    args = parser.parse_args()

    def normalize_yes_no(answer):
        """Map free-text yes/no answers (English or Chinese) to 'Yes'/'No'."""
        if answer.startswith("Yes") or answer.startswith("是"):
            return "Yes"
        if answer.startswith("No") or answer.startswith("否") or answer.startswith("不"):
            return "No"
        return answer

    gold_data = json.load(open(args.gold))
    gold_answers_and_evidence = get_answers_and_evidence(gold_data, args.text_evidence_only)
    # Predictions may be JSON lines ("jsonl" in the filename) or a JSON array;
    # the two branches previously duplicated the yes/no normalization, now
    # shared via normalize_yes_no above.
    if "jsonl" in args.predictions:
        prediction_records = [json.loads(line) for line in open(args.predictions)]
    else:
        prediction_records = json.load(open(args.predictions, "r"))
    predicted_answers_and_evidence = {}
    for d in prediction_records:
        predicted_answers_and_evidence[d["question_id"]] = {
            "answer": normalize_yes_no(d["answer"]),
            # The evaluated models do not predict evidence paragraphs.
            "evidence": [],
        }
    evaluation_output = evaluate(gold_answers_and_evidence, predicted_answers_and_evidence)
    print(json.dumps(evaluation_output, indent=2))
================================================
FILE: eval/eval_qasper.sh
================================================
# Evaluate paper-QA prediction files against the Qasper dev gold annotations.
# Usage: eval_qasper.sh <NAME>   (NAME is only used by the commented-out runs below)
export NAME=$1
# English-prompt Gemini results
python ./eval/eval_qasper.py --predictions close_result/paper/en/gemini-1.5-flash.jsonl \
--gold ./data/qasper-train-dev-v0.3/qasper-dev-v0.3.json
# Chinese-prompt Gemini results (scored against the same English gold set)
python ./eval/eval_qasper.py --predictions close_result/paper/zh/gemini-1.5-flash.jsonl \
--gold ./data/qasper-train-dev-v0.3/qasper-dev-v0.3.json
#python ./eval/eval_qasper.py --predictions results/paper/chpaper/wokl/zh/minicpm/20240814180236/lunwen.json \
#    --gold ./data/qasper-train-dev-v0.3/qasper-dev-v0.3.json
#python ./eval/eval_qasper.py --predictions ./result/paper/zh/${NAME}.jsonl \
#    --gold ./data/qasper-train-dev-v0.3/qasper-dev-v0.3.json
================================================
FILE: eval/eval_xtreme.py
================================================
import os.path
import pdb
import sys
sys.path.append("..")
sys.path.append("../..")
from dataset import load_dataset
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from torch.utils.data import DataLoader
import json
from tqdm import tqdm
from utils.render_text import render_text
import openai
import string, collections
import re
# SECURITY: hard-coded OpenAI credentials are committed to the repository.
# This key must be rotated and loaded from the environment instead, e.g.
# openai.api_key = os.environ["OPENAI_API_KEY"].
openai.api_base = "https://openkey.cloud/v1"
openai.api_key = "sk-iXsrRL5fn1ZM15F54a2aEf41Ff674620BbAdB2A395Bc4a91"
# Maps MLQA language codes to the full names used when prompting the model.
language_dict = {"zh": "Chinese", "en": "English", "fr": "French", "de": "German"}
def get_tokens(s):
    """Tokenize *s* after SQuAD-style normalization; empty/None input -> []."""
    return normalize_answer(s).split() if s else []
def normalize_answer(s):
    """Lower-case *s*, strip punctuation and the articles a/an/the, and
    collapse whitespace (SQuAD answer normalization)."""
    text = s.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())
def compute_f1(a_gold, a_pred):
    """Token F1 between a gold and a predicted answer.

    When either side normalizes to no tokens (a no-answer), the score is 1
    iff both sides are empty, 0 otherwise.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    overlap = sum((collections.Counter(gold_toks) & collections.Counter(pred_toks)).values())
    if not gold_toks or not pred_toks:
        # If either is no-answer, F1 is 1 when they agree, 0 otherwise.
        return int(gold_toks == pred_toks)
    if overlap == 0:
        return 0
    precision = overlap / len(pred_toks)
    recall = overlap / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
def evaluate(args):
    """Run text-only MLQA evaluation with a Qwen-style chat model.

    Streams the XTREME MLQA test split for ``args.dataset`` (e.g. "zh.en"),
    prompts the model to answer each question from its context, and writes
    one JSON record per item (with the model 'response' added) to
    ../result/qwen/xtreme/<dataset>.json.
    """
    # First component of the dataset code is the context language.
    source_lang = args.dataset.split(".")[0]
    dataset = load_dataset("xtreme", f"MLQA.{args.dataset}")
    test_loader = DataLoader(dataset['test'], shuffle=False)
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
                                              trust_remote_code=True)
    tokenizer.padding_side = 'left'
    tokenizer.pad_token_id = tokenizer.eod_id
    with open(f"../result/qwen/xtreme/{args.dataset}.json", "w") as f:
        for item in tqdm(test_loader):
            query = f"Answer the question based on the given context in {language_dict[source_lang]} \n" \
                    f"Context: {item['context'][0]} \n" \
                    f"Question: {item['question'][0]} \n" \
                    f"Answer: "
            # Qwen chat API; history=None starts a fresh conversation per item.
            response, history = model.chat(tokenizer, query, history=None)
            item['response'] = response
            # Tensors are not JSON-serializable; convert the answer offsets.
            item['answers']['answer_start'] = item['answers']['answer_start'][0].numpy().tolist()
            print(item)
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
def evaluate_vis(args):
    """Visual variant of :func:`evaluate`: renders each context paragraph to
    a PNG and asks a Qwen-VL model to answer from the image.

    Rendered images are cached under ../data/xtreme/<dataset>/<id>.png and
    one JSON record per item is written to
    ../result/qwen/xtreme/<dataset>-vis.json.
    """
    dataset = load_dataset("xtreme", f"MLQA.{args.dataset}")
    test_loader = DataLoader(dataset['test'], shuffle=False)
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
                                              trust_remote_code=True)
    tokenizer.padding_side = 'left'
    tokenizer.pad_token_id = tokenizer.eod_id
    image_dir = f"../data/xtreme/{args.dataset}"
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)
    with open(f"../result/qwen/xtreme/{args.dataset}-vis.json", "w") as f:
        for item in tqdm(test_loader):
            id = item['id'][0]
            context = item['context'][0]
            image_path = os.path.join(image_dir, f"{id}.png")
            # Render (and cache) the context image only once per item.
            if not os.path.exists(image_path):
                image = render_text(context)
                image.save(image_path)
            prompt = f'<img>{image_path}</img> Answer the question in {args.dataset.split(".")[0]} Question:{item["question"]} '
            response = model.chat(tokenizer, query=prompt, history=None)
            item['response'] = response
            # Tensors are not JSON-serializable; convert the answer offsets.
            item['answers']['answer_start'] = item['answers']['answer_start'][0].numpy().tolist()
            print(item)
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
def evaluate_gpt(args):
    """Evaluate MLQA with the OpenAI chat API, resuming from prior output.

    Counts the lines already written to the result file, skips that many
    items, then appends one JSON record per remaining item. Failed API
    calls are retried until they succeed.
    """
    dataset = load_dataset("xtreme", f"MLQA.{args.dataset}")
    test_loader = DataLoader(dataset['test'], shuffle=False)
    save_dir = f"../result/gpt3.5/xtreme/{args.dataset}.jsonl"
    # Bug fix: the unconditional read crashed with FileNotFoundError on the
    # very first run; start from 0 when there is no previous output file.
    if os.path.exists(save_dir):
        with open(save_dir, "r") as f:
            length = len(f.readlines())
    else:
        length = 0
    with open(save_dir, "a") as f:
        for i, item in enumerate(tqdm(test_loader)):
            if i < length:
                continue  # already processed in a previous run
            query = f"Answer the question based on the given context in {language_dict[args.dataset.split('.')[0]]} \n" \
                    f"Context: {item['context'][0]} \n" \
                    f"Question: {item['question'][0]} \n" \
                    f"Answer: "
            while True:
                try:
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[{'role': 'system', 'content': query}],
                        temperature=0.2,
                        stop='\n\n',
                    )
                    item['response'] = response.choices[0].message.content
                    item['answers']['answer_start'] = item['answers']['answer_start'][0].numpy().tolist()
                    print(item)
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
                    break
                except Exception:
                    # Narrowed from a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit and made the retry loop
                    # impossible to stop on persistent failures.
                    continue
def calc_f1(file_path):
    """Print the average token F1 over a JSON-lines result file.

    Each line must be a JSON object with 'response' (model answer) and
    'answers'.text[0][0] (first gold answer), as written by the evaluate_*
    functions above.
    """
    scores = []
    with open(file_path, "r") as f:
        for line in f:
            # Security/correctness fix: lines were parsed with eval(), which
            # executes arbitrary code from the result file; json.loads parses
            # the json.dumps output safely.
            result = json.loads(line)
            f1 = compute_f1(result['response'], result['answers']['text'][0][0])
            scores.append(f1)
    print("Accuracy: ", sum(scores) / len(scores))
if __name__ == "__main__":
    # NOTE(review): the argparse description looks copy-pasted from another
    # project (point-cloud animation) and does not describe this script.
    parser = argparse.ArgumentParser(description="Visualize a series of point clouds as an animation.")
    parser.add_argument("--dataset", type=str, default='zh.en')
    parser.add_argument("--savedir", type=str, default="../data/lunwen/qa")
    parser.add_argument("--checkpoint", type=str, default="../checkpoints/Qwen-VL")
    parser.add_argument("--eval", type=str)
    args = parser.parse_args()
    # Each branch only scores existing result files; the generation calls are
    # commented out and must be re-enabled to produce new predictions.
    if "VL" in args.checkpoint:
        #evaluate_vis(args)
        calc_f1(f"../result/qwen/xtreme/{args.dataset}-vis.json")
    elif "gpt" in args.checkpoint:
        # evaluate_gpt(args)
        calc_f1(f"../result/gpt3.5/xtreme/{args.dataset}.jsonl")
    else:
        #evaluate(args)
        calc_f1(f"../result/qwen/xtreme/{args.dataset}.json")
================================================
FILE: eval/gemini/eval_en_ocr.py
================================================
================================================
FILE: eval/gemini/eval_qasper.py
================================================
import pdb
import sys
sys.path.append("..")
sys.path.append("../..")
import fitz
import os
import argparse
import uuid
#from utils.utils import new_ip
import glob
#from paddleocr import PaddleOCR, draw_ocr
import json
from utils.gemini import Gemini_Model, prompt_gen_ch_qa_extractive, prompt_gen_ch_qa_vis, prompt_gen_qa, prompt_gen_ch_qa_abstractive, prompt_gen_ch_qa_yes_no
# SECURITY: hard-coded Google API key committed to the repository. Rotate
# this key and read it from an environment variable instead.
API_KEY = "AIzaSyCYo6MWJKX4nrV8i36GKVVEVeuYfD3co-s"
from dataset.dataloader import DataLoader
def gemini_gen_qa_vision(metadata, prompt, savedir):
    """Answer Qasper page-QA questions with Gemini vision, writing JSONL.

    NOTE(review): the parameters are currently unused -- the data paths and
    output location below are hard-coded and the sole call site passes
    placeholder zeros. They are kept for interface compatibility.
    """
    model = Gemini_Model(key=API_KEY)
    data_path = "../../data/pageqa/dev.json"
    img_path = "../../data/pageqa/png"
    savedir = "../../result/gemini/qasper"
    savename = os.path.join(savedir, "en-en.jsonl")
    # Resume support: count previously written lines so a restarted run
    # skips them. (Bug fix: the output used to be reopened in "w" mode,
    # which truncated the file and made this counter useless.)
    if os.path.exists(savename):
        with open(savename, "r") as fr:
            line_num = len(fr.readlines())
    else:
        line_num = 0
    dataloader = DataLoader(data_path)
    with open(savename, "a") as f:
        for idx, (question, imgname, question_id, answers) in enumerate(dataloader):
            if idx < line_num:
                continue  # already answered in a previous run
            image_path = os.path.join(img_path, imgname + ".png")
            response = model.get_response_vision(image_path, question + "Answer in one sentence:")
            f.write(json.dumps({
                "question_id": question_id,
                "predicted_answer": response,
            }) + "\n")
            print(f"Question: {question} Answer: {response}")
gemini_gen_qa_vision(0, 0, 0)
================================================
FILE: eval/intern/inference.py
================================================
import numpy as np
import torch, json
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import sys
from tqdm import tqdm
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
from dataset.dataloader import MathLoader
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Compose the ImageNet-normalized preprocessing pipeline producing a
    square ``input_size`` x ``input_size`` RGB tensor."""
    return T.Compose([
        T.Lambda(lambda img: img if img.mode == 'RGB' else img.convert('RGB')),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (cols, rows) tiling from *target_ratios* whose aspect ratio
    is closest to *aspect_ratio*.

    Ties go to the later candidate only when the source image area exceeds
    half the area that candidate's tiling would cover.
    """
    best = (1, 1)
    smallest_diff = float('inf')
    area = width * height
    for cols, rows in target_ratios:
        diff = abs(aspect_ratio - cols / rows)
        if diff < smallest_diff:
            smallest_diff = diff
            best = (cols, rows)
        elif diff == smallest_diff and area > 0.5 * image_size * image_size * cols * rows:
            best = (cols, rows)
    return best
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split *image* into between ``min_num`` and ``max_num`` square tiles of
    side ``image_size``, choosing the (cols, rows) grid whose aspect ratio
    best matches the image. Optionally append a whole-image thumbnail when
    more than one tile was produced.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    # Enumerate every (cols, rows) grid whose tile count is within bounds,
    # ordered by total tile count.
    candidate_grids = sorted(
        {(c, r)
         for total in range(min_num, max_num + 1)
         for c in range(1, total + 1)
         for r in range(1, total + 1)
         if min_num <= c * r <= max_num},
        key=lambda grid: grid[0] * grid[1],
    )
    cols, rows = find_closest_aspect_ratio(
        aspect_ratio, candidate_grids, orig_width, orig_height, image_size)
    target_width = image_size * cols
    target_height = image_size * rows
    blocks = cols * rows
    resized = image.resize((target_width, target_height))
    per_row = target_width // image_size
    tiles = []
    for idx in range(blocks):
        left = (idx % per_row) * image_size
        top = (idx // per_row) * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def load_image(image_file, input_size=448, max_num=12):
    """Load *image_file*, tile it with :func:`dynamic_preprocess` (thumbnail
    included) and return the stacked per-tile tensors."""
    img = Image.open(image_file).convert('RGB')
    preprocess = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([preprocess(tile) for tile in tiles])
# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = './checkpoints/InternVL2-8B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
dataloader = MathLoader("./data/pazhou", "./data/pazhou/test.jsonl")
# set the max number of tiles in `max_num`
with open("./results/intern8b_exact.txt", "w") as f:
    for data in tqdm(dataloader):
        img_path, id, question = data['image_path'], data['question_id'], data['question']
        pixel_values = load_image(img_path, max_num=12).to(torch.bfloat16).cuda()
        # NOTE(review): do_sample=True with temperature 0.1 makes runs
        # slightly non-deterministic; use do_sample=False for exact replays.
        generation_config = dict(max_new_tokens=1024, do_sample=True, temperature=0.1)
        # single-image single-round conversation
        question = f'<image>\n{question}'
        response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
        print(f'User: {question} \nAssistant: {response}')
        # Second turn: ask the model to distill its own answer down to a
        # bare option letter / number for exact-match scoring.
        question = 'Extract the direct answer from model response, which is a single option letter or number value, eg. 1,2,3,A,B,C. Answer:'
        response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history,
                                       return_history=True)
        print(f'User: {question}\nAssistant: {response}')
        f.write(json.dumps({"id": id, "model_answer": response}) + "\n")
================================================
FILE: eval/llava/acc_chartvqa.py
================================================
import os
import argparse
import json
import re
from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
def get_args():
    """Parse the command-line options for the ChartVQA accuracy script."""
    parser = argparse.ArgumentParser()
    for flag in ('--annotation-file', '--result-file', '--result-dir'):
        parser.add_argument(flag, type=str)
    return parser.parse_args()
def prompt_processor(prompt):
    """Recover the bare question text from a formatted VQA prompt.

    Handles the "OCR tokens: ... Question: ... Short answer:" template, the
    three-line "Reference OCR token" templates, and plain two-line prompts;
    anything else is returned unchanged. The result is lower-cased.
    """
    lines = prompt.split('\n')
    if prompt.startswith('OCR tokens: '):
        question = re.search(r"Question: (.*?) Short answer:", prompt, re.DOTALL).group(1)
    elif 'Reference OCR token: ' in prompt and len(lines) == 3:
        # The question is on line 2 when the prompt leads with the OCR
        # tokens, otherwise on line 1.
        question = lines[1] if prompt.startswith('Reference OCR token:') else lines[0]
    elif len(lines) == 2:
        question = lines[0]
    else:
        question = prompt
    return question.lower()
def eval_single(annotation_file, result_file):
    """Score one result file against the ChartVQA annotations, printing the
    experiment name, sample count and exact-match accuracy."""
    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
    print(experiment_name)
    raw_annotations = json.load(open(annotation_file))
    # Index annotations by (image name, lower-cased query) for O(1) lookup.
    annotations = {(a['imgname'], a['query'].lower()): a for a in raw_annotations}
    results = [json.loads(line) for line in open(result_file)]
    pred_list = []
    acc = []
    for result in results:
        annotation = annotations[(result['img_id'], prompt_processor(result['prompt']))]
        pred_list.append({
            "pred_answer": result['text'],
            "gt_answers": annotation['label'],
        })
        acc.append(1 if result['text'] == annotation['label'] else 0)
    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * (sum(acc) / len(acc))))
if __name__ == "__main__":
    args = get_args()
    # --result-file and --result-dir are not exclusive: a single file and a
    # directory of .jsonl files may both be scored in one invocation.
    if args.result_file is not None:
        eval_single(args.annotation_file, args.result_file)
    if args.result_dir is not None:
        for result_file in sorted(os.listdir(args.result_dir)):
            if not result_file.endswith('.jsonl'):
                print(f'Skipping {result_file}')
                continue
            eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))
================================================
FILE: eval/llava/eval_chartvqa.py
================================================
import argparse
import pdb
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import math
import pandas as pd
def split_list(lst, n):
    """Split *lst* into at most *n* contiguous chunks of (roughly) equal
    size; only the last chunk may be shorter."""
    size = math.ceil(len(lst) / n)  # ceiling, so no more than n chunks
    return [lst[start:start + size] for start in range(0, len(lst), size)]
def get_chunk(lst, n, k):
    """Return the k-th (0-based) of *n* roughly equal chunks of *lst*."""
    return split_list(lst, n)[k]
# Custom dataset class
class CustomDataset(Dataset):
    """Wraps ChartVQA-style question records for LLaVA evaluation.

    Each item yields the tokenized conversation prompt (with the image token
    inserted), the processed image tensor, the raw prompt string, and the
    original image size.

    NOTE(review): relies on the module-level ``args`` (lang, conv_mode)
    being set before iteration -- confirm eval_model() parsed them first.
    """
    def __init__(self, questions, image_folder, tables_folder, tokenizer, image_processor, model_config):
        self.questions = questions          # list of dicts with "imgname" and "query"
        self.image_folder = image_folder    # directory holding the chart PNGs
        self.tables_folder = tables_folder  # directory holding matching CSV tables
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config
    def __getitem__(self, index):
        line = self.questions[index]
        image_file = line["imgname"]
        qs = line["query"]
        # str.replace swaps every occurrence of "png", not just the suffix;
        # fine as long as image names contain "png" only in the extension.
        table_file = image_file.replace("png", "csv")
        # Insert the image placeholder token(s) per the model configuration.
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
        # NOTE(review): the table CSV is loaded but its use in the prompt is
        # commented out below -- the read is currently dead work.
        csv = pd.read_csv(os.path.join(self.tables_folder, table_file))
        csv = csv.to_string()
        # qs = qs + "\n" + "Reference table content in csv format: " + csv
        # Language-specific "answer briefly" instruction.
        if args.lang == "zh":
            qs = qs + "\n" + "使用一个数字,单词或者句子回答问题"
        elif args.lang == "fr":
            qs = qs + "\n" + "Répondez à la question en utilisant un seul numéro, mot ou phrase"
        elif args.lang == "en":
            qs = qs + "\n" + "Answer the question using a single number, word or sentence"
        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        # if args.lang == "zh-CN":
        #     conv.append_message(conv.roles[0], "表图片的csv形式为: " + csv)
        # elif args.lang == "fr":
        #     conv.append_message(conv.roles[0], "Le format CSV de la table img:" + csv)
        # else:
        #     conv.append_message(conv.roles[0], "The csv format of the table img: " + csv)
        prompt = conv.get_prompt()
        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        img_size = image.size
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        return input_ids, image_tensor, prompt, img_size
    def __len__(self):
        return len(self.questions)
class SimpleDataset(Dataset):
    """Minimal dataset: yields each record's image as an RGB tensor."""
    def __init__(self, questions, image_folder):
        self.image_folder = image_folder
        self.questions = questions
    def __getitem__(self, item):
        record = self.questions[item]
        path = os.path.join(self.image_folder, record["image"])
        rgb = Image.open(path).convert('RGB')
        return transforms.ToTensor()(rgb)
    def __len__(self):
        return len(self.questions)
# DataLoader
def create_data_loader(questions, image_folder, table_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    """Build a sequential DataLoader over CustomDataset.

    Only batch_size=1 is supported because prompts vary in length.
    """
    assert batch_size == 1, "batch_size must be 1"
    return DataLoader(
        CustomDataset(questions, image_folder, table_folder, tokenizer, image_processor, model_config),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
def eval_model(args):
    """Run LLaVA generation over every ChartQA question and dump one JSON line per answer.

    Side effects: loads the checkpoint onto GPU, creates the answers file's
    directory, and (over)writes ``args.answers_file`` with jsonl records of
    {"img_id", "prompt", "text", "model_id", "metadata"}.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)

    # English questions ship as a single JSON array; other languages as jsonl.
    if args.lang == "en":
        with open(os.path.expanduser(args.question_file), "r") as f:
            questions = json.load(f)
        questions = [q for q in questions]
    else:
        with open(args.question_file, "r") as f:
            questions = [json.loads(line) for line in f]
    # Shard the questions so several jobs can split the workload.
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    data_loader = create_data_loader(questions, args.image_folder, args.table_folder, tokenizer, image_processor, model.config)

    # Fix: use a context manager so the answers file is closed (and flushed)
    # even if generation raises part-way through.
    with open(answers_file, "w") as ans_file:
        for (input_ids, image_tensor, prompt, img_size), line in tqdm(zip(data_loader, questions), total=len(questions)):
            cur_prompt = line["query"]
            img_id = line["imgname"]
            input_ids = input_ids.to(device='cuda', non_blocking=True)
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=img_size,
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=1024,
                    use_cache=False
                )
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            print(f"Question: {prompt} Answer: {outputs}")
            ans_file.write(json.dumps({"img_id": img_id,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "model_id": model_name,
                                       "metadata": {}}, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # (flag, type, default) for every CLI option of the ChartQA evaluator.
    _options = [
        ("--model-path", str, "facebook/opt-350m"),
        ("--vision-encoder", str, "clip"),
        ("--model-base", str, None),
        ("--image-folder", str, ""),
        ("--table-folder", str, ""),
        ("--question-file", str, ""),
        ("--answers-file", str, "answer.jsonl"),
        ("--conv-mode", str, "llava_v1"),
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--temperature", float, 0.2),
        ("--top_p", float, None),
        ("--num_beams", int, 1),
        ("--max_new_tokens", int, 128),
        ("--lang", str, "en"),
        ("--projector", str, "./checkpoints/llava-v1.5-13b-pretrain/checkpoint-1000"),
    ]
    for flag, opt_type, default in _options:
        parser.add_argument(flag, type=opt_type, default=default)
    args = parser.parse_args()
    eval_model(args)
================================================
FILE: eval/llava/eval_en_ocr.py
================================================
import json
import pdb
import sys
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
sys.path.append("../../..")
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
#from datasets import dataloader
from dataset.dataloader import DataLoader
import os, glob
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.model.builder import load_pretrained_model
import torch
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from transformers import AutoModelForCausalLM, LlamaTokenizer
from PIL import Image
if __name__=="__main__":
    # OCR extraction: run LLaVA over every page image and save the raw text per page.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="./checkpoints/llava-v1.6-34b") #echo840/Monkey-Chat echo840/Monkey
    parser.add_argument("--data_path", type=str, default="./data/pageqa")
    parser.add_argument("--save_path", type=str, default="./result/qasper/ocr/llava-v1.6-34b-2.jsonl")
    parser.add_argument("--conv-mode", type=str, default="chatml_ocr")
    parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    args = parser.parse_args()
    # Project-local DataLoader (dataset.dataloader), not a torch DataLoader.
    # NOTE(review): `dataloader` is never used below — the loop globs the png folder directly.
    dataloader = DataLoader(os.path.join(args.data_path, "dev.json"))
    if "llava" in args.model_path:
        checkpoint = args.model_path
        model_name = get_model_name_from_path(args.model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, None, model_name)
    with open(args.save_path, "w") as f:
        imgs = glob.glob(os.path.join("./data/pageqa/png", "*.png"))
        for img in imgs:
            # No question text is appended — presumably the "chatml_ocr" conv
            # template itself carries the OCR instruction; verify in llava.conversation.
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n'
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n'
            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            print("conv", conv)
            # pdb.set_trace()
            prompt = conv.get_prompt()
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
                0).cuda()
            image = Image.open(img).convert('RGB')
            image_tensor = process_images([image], image_processor, model.config)[0]
            print("img tensor", image_tensor.shape)
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=[image.size],
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            # One jsonl record per page: image stem -> recognized text.
            f.write(json.dumps({
                "imgname": img.split('/')[-1].replace(".png", ""),
                "ocr": outputs,
            }) + "\n")
            print(f"Question: {img} Answer: {outputs}")
================================================
FILE: eval/llava/eval_mathvqa.py
================================================
import argparse
import pdb
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import math
import pandas as pd
def split_list(lst, n):
    """Split *lst* into at most n consecutive, (roughly) equal-sized chunks.

    Fixes two defects: the old comment claimed "integer division" while the
    code uses ceiling division, and an empty list made ``chunk_size`` 0 so
    ``range(0, 0, 0)`` raised ValueError. An empty list now yields n empty
    chunks, keeping ``get_chunk`` valid for every chunk index.
    """
    if not lst:
        return [[] for _ in range(n)]
    chunk_size = math.ceil(len(lst) / n)  # ceiling division -> at most n chunks
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
    """Return the k-th of n roughly equal, consecutive chunks of lst."""
    step = math.ceil(len(lst) / n)
    return [lst[start:start + step] for start in range(0, len(lst), step)][k]
# Custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset turning a MathVQA question record into model-ready tensors.

    Each item is (input_ids, image_tensor, prompt, img_size). Relies on the
    module-level ``args`` for the conversation template name.
    """

    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
        self.questions = questions          # list of dicts with "image" and "question" keys
        self.image_folder = image_folder    # root the relative "image" paths resolve against
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config

    def __getitem__(self, index):
        line = self.questions[index]
        image_file = line["image"]
        qs = line["question"]
        # Prepend the image placeholder token(s) this checkpoint expects.
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
        qs += "Answer in a short sentence. Do not output explanations."
        conv = conv_templates[args.conv_mode].copy()  # reads module-global args
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        img_size = image.size
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
        # NOTE(review): unsqueeze(0) here plus DataLoader collation yields a
        # (1, 1, seq_len) input_ids batch — confirm model.generate expects that shape.
        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)
        return input_ids, image_tensor, prompt, img_size

    def __len__(self):
        return len(self.questions)
class SimpleDataset(Dataset):
    """Image-only dataset: each item is the question's image converted to a tensor."""

    def __init__(self, questions, image_folder):
        self.image_folder = image_folder
        self.questions = questions

    def __getitem__(self, item):
        entry = self.questions[item]
        img_path = os.path.join(self.image_folder, entry["image"])
        rgb = Image.open(img_path).convert('RGB')
        tensor = transforms.ToTensor()(rgb)
        return tensor

    def __len__(self):
        return len(self.questions)
# DataLoader
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    """Build a sequential, single-item-batch DataLoader over CustomDataset."""
    assert batch_size == 1, "batch_size must be 1"
    return DataLoader(
        CustomDataset(questions, image_folder, tokenizer, image_processor, model_config),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
def eval_model(args):
    """Generate MathVQA answers with LLaVA and write one jsonl record per question.

    Loads the checkpoint, shards the question list across jobs, and writes
    {"id", "model_answer"} lines to ``args.answers_file``.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)

    # English data is a single JSON array; other languages are jsonl.
    if args.lang == "en":
        with open(os.path.expanduser(args.question_file), "r") as f:
            questions = json.load(f)
        questions = [q for q in questions]
    else:
        with open(args.question_file, "r") as f:
            questions = [json.loads(line) for line in f]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)

    # Fixes: context manager closes the output file even on error; the local
    # previously named `id` (shadowing the builtin) is renamed; unused
    # cur_prompt/img_id locals dropped.
    with open(answers_file, "w") as ans_file:
        for (input_ids, image_tensor, prompt, img_size), line in tqdm(zip(data_loader, questions), total=len(questions)):
            question_id = line['question']
            input_ids = input_ids.to(device='cuda', non_blocking=True)
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    # NOTE(review): only "1.5" checkpoints get .unsqueeze(0).cuda()
                    # here — confirm the non-1.5 path lands on the right device.
                    images=image_tensor.unsqueeze(0).half().cuda() if "1.5" in args.model_path else image_tensor.half(),
                    image_sizes=img_size,
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=1024,
                    use_cache=False
                )
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            print(f"Question: {prompt} Answer: {outputs}")
            ans_file.write(json.dumps({"id": question_id,
                                       "model_answer": outputs,
                                       }, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # (flag, type, default) for every CLI option of the MathVQA evaluator.
    _cli_spec = [
        ("--model-path", str, "facebook/opt-350m"),
        ("--vision-encoder", str, "clip"),
        ("--model-base", str, None),
        ("--image-folder", str, ""),
        ("--table-folder", str, ""),
        ("--question-file", str, ""),
        ("--answers-file", str, "answer.jsonl"),
        ("--conv-mode", str, "llava_v1"),
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--temperature", float, 0.2),
        ("--top_p", float, None),
        ("--num_beams", int, 1),
        ("--max_new_tokens", int, 128),
        ("--lang", str, "en"),
        ("--projector", str, "./checkpoints/llava-v1.5-13b-pretrain/checkpoint-1000"),
    ]
    for flag, flag_type, flag_default in _cli_spec:
        parser.add_argument(flag, type=flag_type, default=flag_default)
    args = parser.parse_args()
    eval_model(args)
================================================
FILE: eval/llava/eval_qasper.py
================================================
import json
import pdb
import sys
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
sys.path.append("../../..")
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
#from datasets import dataloader
from dataset.dataloader import PaperLoader, LunwenLoader
import os
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.model.builder import load_pretrained_model
import torch
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from transformers import AutoModelForCausalLM, LlamaTokenizer
from PIL import Image
if __name__=="__main__":
    # Evaluate LLaVA on paper-page QA: either the Chinese-paper ("lunwen") jsonl
    # sets or the QASPER-style dev splits, writing one jsonl record per question.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="./checkpoints/llava-v1.6-34b") #echo840/Monkey-Chat echo840/Monkey
    parser.add_argument("--data_path", type=str, default="./data/pageqa")
    parser.add_argument("--save_path", type=str, default="./result/qasper/llava-v1.5-13b.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    parser.add_argument("--lang", type=str, default="en")
    parser.add_argument("--mode", type=str, default="lunwen")
    args = parser.parse_args()
    # "lunwen" mode reads a QA jsonl directly; otherwise load the dev split
    # for the requested language.
    if args.mode == "lunwen":
        dataloader = LunwenLoader(args.data_path)
    else:
        if args.lang == "en":
            dataloader = PaperLoader(os.path.join(args.data_path, "dev.json"))
        else:
            dataloader = PaperLoader(os.path.join(args.data_path, "dev_zh_en.jsonl"))
    if "llava" in args.model_path:
        checkpoint = args.model_path
        model_name = get_model_name_from_path(args.model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, None, model_name)
    if not os.path.exists(os.path.dirname(args.save_path)):
        os.makedirs(os.path.dirname(args.save_path))
    with open(args.save_path, "w") as f:
        if args.mode == "lunwen":
            # Lunwen records carry their own image path, question and gold answer.
            for data in dataloader:
                img_path, question, answer = data['img_path'], data['question'], data['answer']
                img_path = img_path.replace("../", "./")  # presumably rewrites repo-parent-relative paths; verify against the jsonl
                if model.config.mm_use_im_start_end:
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + question
                conv = conv_templates[args.conv_mode].copy()
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                print("conv", conv)
                # pdb.set_trace()
                prompt = conv.get_prompt()
                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX,
                                                  return_tensors='pt').unsqueeze(
                    0).cuda()
                image = Image.open(img_path).convert('RGB')
                image_tensor = process_images([image], image_processor, model.config)[0]
                print("img tensor", image_tensor.shape)
                # print("input_ids", input_ids)
                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=image_tensor.unsqueeze(0).half().cuda(),
                        image_sizes=[image.size],
                        do_sample=True if args.temperature > 0 else False,
                        temperature=args.temperature,
                        top_p=args.top_p,
                        num_beams=args.num_beams,
                        # no_repeat_ngram_size=3,
                        max_new_tokens=1024,
                        use_cache=True)
                outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
                # NOTE(review): the model output is stored under "gt_answer" while the
                # reference answer sits under "answer" — the key names look swapped;
                # confirm against the scoring script before renaming.
                f.write(json.dumps({
                    "question": question,
                    "answer": answer,
                    "gt_answer": outputs,
                }, ensure_ascii=False) + "\n")
                print(f"Question: {question} Answer: {outputs}")
        else:
            # QASPER-style loader yields (question, image stem, question_id, gold answers).
            for question, imgname, question_id, answers in dataloader:
                img_path = f"{args.data_path}/png/{imgname}.png"
                if model.config.mm_use_im_start_end:
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + question
                conv = conv_templates[args.conv_mode].copy()
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                print("conv", conv)
                # pdb.set_trace()
                prompt = conv.get_prompt()
                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
                    0).cuda()
                image = Image.open(img_path).convert('RGB')
                image_tensor = process_images([image], image_processor, model.config)[0]
                print("img tensor", image_tensor.shape)
                # print("input_ids", input_ids)
                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=image_tensor.unsqueeze(0).half().cuda(),
                        image_sizes=[image.size],
                        do_sample=True if args.temperature > 0 else False,
                        temperature=args.temperature,
                        top_p=args.top_p,
                        num_beams=args.num_beams,
                        # no_repeat_ngram_size=3,
                        max_new_tokens=1024,
                        use_cache=True)
                outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
                f.write(json.dumps({
                    "question_id": question_id,
                    "predicted_answer": outputs,
                }, ensure_ascii=False) + "\n")
                print(f"Question: {question} Answer: {outputs}")
================================================
FILE: eval/llava/eval_snli.py
================================================
import argparse
import pdb
import torch
import os
import json
from tqdm import tqdm
import shortuuid
import cv2
import numpy as np
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers.generation.configuration_utils import GenerationConfig
from PIL import Image
import math
import pandas as pd
import matplotlib.pyplot as plt
from torch.nn import functional as F
# SNLI-VE instruction template; "<hypothesis>" is substituted per example.
PROMPT = "Given a picture and a hypothesis, you need to judge the relationship between the picture and the text, " \
         "There're three options: Entailment, Neutral or Contradiction." \
         "Here's the hypothesis: <hypothesis> " \
         "Answer with only one word: "
class CustomDataset(Dataset):
    """SNLI-VE dataset: pairs a Flickr30K image with a hypothesis prompt.

    Each item is (input_ids, image_tensor, prompt, img_size, pair_id, img_dir);
    img_dir is the image path, reused later for the attention overlay.
    """

    def __init__(self, datafile, image_folder, tokenizer, image_processor, model_config):
        with open(datafile, "r") as f:
            self.data = [json.loads(line) for line in f]  # jsonl: one example per line
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config

    def __getitem__(self, index):
        data = self.data[index]
        qs = data['sentence2']
        qs = PROMPT.replace("<hypothesis>", qs)  # slot hypothesis into the fixed prompt
        image_file = data['Flickr30K_ID'] + ".jpg"
        print(image_file)
        pair_id = data['pairID']
        # Prepend the image placeholder token(s) this checkpoint expects.
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
        conv = conv_templates[args.conv_mode].copy()  # reads module-global args
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        img_dir = os.path.join(self.image_folder, image_file)
        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        img_size = image.size
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        return input_ids, image_tensor, prompt, img_size, pair_id, img_dir

    def __len__(self):
        return len(self.data)
def create_data_loader(data_file, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    """Return a sequential DataLoader over the SNLI-VE CustomDataset (batch size fixed at 1)."""
    assert batch_size == 1, "batch_size must be 1"
    snli_dataset = CustomDataset(data_file, image_folder, tokenizer, image_processor, model_config)
    return DataLoader(snli_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
def eval_model(args):
    """Visualize attention heatmaps for SNLI-VE examples.

    Generation is commented out below; the active code runs one forward pass
    per example with output_attentions=True and writes a heatmap-overlaid
    image to ./result/SNLI/answers/.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    # NOTE(review): nothing is written to ans_file while generation is commented out.
    ans_file = open(answers_file, "w")
    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
    data_loader = create_data_loader(args.data_file, args.image_folder, tokenizer, image_processor, model.config)
    for i, (input_ids, image_tensor, prompt, img_size, pair_id, image_file) in enumerate(tqdm(data_loader)):
        if "1.6-34b" in args.model_path:
            input_ids = input_ids.unsqueeze(0)
        input_ids = input_ids.to(device='cuda', non_blocking=True)
        # generation_config = {
        #     "output_attentions": True,
        # }
        with torch.inference_mode():
            outputs = model(
                input_ids=input_ids,
                images=image_tensor.unsqueeze(0).half().cuda(),
                image_sizes=[img_size],
                output_attentions=True,
                return_dict=True
            )
        attn = outputs['attentions'][0]
        print(attn.shape)  # q = query sequence length (16 here); k = key length (192 = 24*8 image feature patches)
        attn_map = attn[0][15, :, :].detach().cpu().numpy()
        print(attn_map)
        img = cv2.imread(image_file[0])
        # Min-max normalize the attention map to [0, 1].
        attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map))
        # Build the heatmap color mapping.
        heat_map = cv2.applyColorMap(np.uint8(255 * attn_map), cv2.COLORMAP_JET)
        heat_map = cv2.resize(heat_map, (img.shape[1], img.shape[0]))
        # Blend the heatmap with the original image.
        img_with_heatmap = cv2.addWeighted(img, 0.5, heat_map, 0.5, 0)
        # Save the result image.
        cv2.imwrite(f'./result/SNLI/answers/output_image{str(i)}.jpg', img_with_heatmap)
        # print(im.size)
        # plt.imshow(im)  # use the original image as the plt base layer
        # attn_map.squeeze()
        # # attn_map = attn_map.resize(im.size)
        # plt.imshow(attn_map.cpu().numpy() * 255, alpha=0.4, cmap='rainbow')  # overlay the attention map, alpha 0.4
        # plt.axis('off')  # hide the axes
        # plt.savefig(f'./result/SNLI/answers/output_image{str(i)}.jpg')
        # plt.clf()
        # output_ids = model.generate(
        #     input_ids,
        #     images=image_tensor.unsqueeze(0).half().cuda(),
        #     image_sizes=[img_size],
        #     do_sample=True if args.temperature > 0 else False,
        #     temperature=args.temperature,
        #     top_p=args.top_p,
        #     num_beams=args.num_beams,
        #     # no_repeat_ngram_size=3,
        #     max_new_tokens=1024,
        #     use_cache=False,
        #     output_attentions=True,
        #
        # )
        # outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        # print(f"Question: {prompt} Answer: {outputs}")
        # ans_file.write(json.dumps({"img_id": pair_id,
        #                            "prompt": prompt,
        #                            "text": outputs,
        #                            "model_id": model_name,
        #                            "metadata": {}}, ensure_ascii=False) + "\n")
        # ans_file.flush()
    ans_file.close()
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # (flag, type, default) for every CLI option of the SNLI-VE script.
    _spec = [
        ("--model-path", str, "facebook/opt-350m"),
        ("--vision-encoder", str, "clip"),
        ("--model-base", str, None),
        ("--image-folder", str, "../../data/SNLI-VE/data/images"),
        ("--data-file", str, "../../data/SNLI-VE/data/snli_ve_dev.jsonl"),
        ("--question-file", str, ""),
        ("--answers-file", str, "answer.jsonl"),
        ("--conv-mode", str, "llava_v1"),
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--temperature", float, 0.2),
        ("--top_p", float, None),
        ("--num_beams", int, 1),
        ("--max_new_tokens", int, 128),
        ("--lang", str, "en"),
        ("--projector", str, "./checkpoints/llava-v1.5-13b-pretrain/checkpoint-1000"),
    ]
    for flag, flag_type, flag_default in _spec:
        parser.add_argument(flag, type=flag_type, default=flag_default)
    args = parser.parse_args()
    eval_model(args)
================================================
FILE: eval/llava/eval_snli_2.py
================================================
import json
import pdb
import sys
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
sys.path.append("../../..")
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
#from datasets import dataloader
from torch.utils.data import Dataset, DataLoader
import os
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.model.builder import load_pretrained_model
import torch
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from transformers import AutoModelForCausalLM, LlamaTokenizer
from PIL import Image
# SNLI-VE instruction template; "<hypothesis>" is substituted per example.
PROMPT = "Given a picture and a hypothesis, you need to judge the relationship between the picture and the text, " \
         "There're three options: Entailment, Neutral or Contradiction." \
         "Here's the hypothesis: <hypothesis> " \
         "Answer with only one word: "
class CustomDataset(Dataset):
    """SNLI-VE dataset yielding (input_ids, image_tensor, prompt, img_size, pair_id).

    Fix: removed leftover debug ``print(image_tensor.shape)`` / ``exit()`` in
    __getitem__, which terminated the whole process on the first item and made
    the return statement unreachable.
    """

    def __init__(self, datafile, image_folder, tokenizer, image_processor, model_config):
        with open(datafile, "r") as f:
            self.data = [json.loads(line) for line in f]  # jsonl: one example per line
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config

    def __getitem__(self, index):
        data = self.data[index]
        qs = data['sentence2']
        qs = PROMPT.replace("<hypothesis>", qs)  # slot hypothesis into the fixed prompt
        image_file = data['Flickr30K_ID'] + ".jpg"
        print(image_file)
        pair_id = data['pairID']
        # Prepend the image placeholder token(s) this checkpoint expects.
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
        conv = conv_templates[args.conv_mode].copy()  # reads module-global args
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        img_size = image.size
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        return input_ids, image_tensor, prompt, img_size, pair_id

    def __len__(self):
        return len(self.data)
def create_data_loader(data_file, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    """Wrap the SNLI-VE jsonl in CustomDataset and return a sequential DataLoader."""
    assert batch_size == 1, "batch_size must be 1"
    return DataLoader(
        CustomDataset(data_file, image_folder, tokenizer, image_processor, model_config),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
if __name__=="__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="./checkpoints/llava-v1.5-13b") #echo840/Monkey-Chat echo840/Monkey
    parser.add_argument("--data_path", type=str, default="./data/pageqa")
    parser.add_argument("--save_path", type=str, default="./result/qasper/llava-v1.5-13b.jsonl")
    parser.add_argument("--conv-mode", type=str, default="chatml_direct")
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    args = parser.parse_args()
    if "llava" in args.model_path:
        checkpoint = args.model_path
        model_name = get_model_name_from_path(args.model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, None, model_name)
        # NOTE(review): args.data_file and args.image_folder are never defined by
        # this parser (only --data_path exists) — this raises AttributeError at runtime.
        data_loader = create_data_loader(args.data_file, args.image_folder, tokenizer, image_processor, model.config)
    with open(args.save_path, "w") as f:
        # NOTE(review): CustomDataset items are 5-tuples (input_ids, image_tensor,
        # prompt, img_size, pair_id) but this loop unpacks 4 QASPER-style values —
        # a copy-paste leftover from eval_qasper.py; the loop would fail as written.
        for question, imgname, question_id, answers in data_loader:
            img_path = f"{args.data_path}/png/{imgname}.png"
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + question
            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            print("conv", conv)
            # pdb.set_trace()
            prompt = conv.get_prompt()
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
                0).cuda()
            image = Image.open(img_path).convert('RGB')
            image_tensor = process_images([image], image_processor, model.config)[0]
            # print("img tensor", image_tensor)
            # print("input_ids", input_ids)
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=[image.size],
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            f.write(json.dumps({
                "question_id": question_id,
                "predicted_answer": outputs,
            }) + "\n")
            print(f"Question: {question} Answer: {outputs}")
================================================
FILE: eval/llava/inference.py
================================================
from PIL import Image
import requests
from transformers import AutoProcessor, LlavaForConditionalGeneration
# Smoke-test script: load LLaVA-1.5-13B and caption one web image.
model = LlavaForConditionalGeneration.from_pretrained("./checkpoints/llava-v1.5-13b")
processor = AutoProcessor.from_pretrained("./checkpoints/llava-v1.5-13b")

prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=prompt, images=image, return_tensors="pt")
print(inputs)

# Generate. Fixes: the previous extra forward pass (model(**inputs, ...)) whose
# result was immediately overwritten is removed, and the decoded text — which
# was computed and then discarded — is now printed.
generate_ids = model.generate(**inputs, max_new_tokens=15)
print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
================================================
FILE: eval/llava/run_lunwen.sh
================================================
#!/bin/bash
# Slurm batch script: evaluate one LLaVA checkpoint ($1) on the Chinese-paper
# ("lunwen") QA sets for both languages and both answer styles, then run the
# yes/no set once per language.
#SBATCH -J test # 作业名为 test
#SBATCH -o ./log/llava/test-%j.out # stdout 重定向到 test.out
#SBATCH -e ./log/llava/test-%j.err # stderr 重定向到 test.err
#SBATCH -p compute # 作业提交的分区为 compute
#SBATCH -N 1 # 作业申请 1 个节点
#SBATCH -t 10:00:00 # 任务运行的最长时间为 1 小时
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:a100-sxm4-80gb:1
export MODEL=$1
for lang in "en" "zh"
do
    for mode in "extractive" "abstractive"
    do
        python ./eval/llava/eval_qasper.py --conv-mode llava_v1 \
        --model_path ./checkpoints/${MODEL} \
        --data_path ./data/ch_paper/qas/final-${mode}-${lang}-zh.jsonl \
        --lang ${lang} \
        --save_path ./results/paper/chpaper/dev/${lang}/${MODEL}-${mode}.jsonl \
        --mode lunwen
    done
    # NOTE(review): ${mode} below leaks out of the finished inner loop (it is
    # always "abstractive" here), so this "yes-no" run re-reads the abstractive
    # file — it likely should point at a dedicated yes-no jsonl. The save_path
    # is also hard-coded to .../dev/en/ regardless of ${lang}.
    python ./eval/llava/eval_qasper.py --conv-mode llava_v1 \
    --model_path ./checkpoints/${MODEL} \
    --data_path ./data/ch_paper/qas/final-${mode}-${lang}-zh.jsonl \
    --lang ${lang} \
    --save_path ./results/paper/chpaper/dev/en/${MODEL}-yes-no.jsonl \
    --mode lunwen
done
================================================
FILE: eval/llava/run_math.sh
================================================
# Evaluate LLaVA-1.6-34B on the Pazhou math VQA test split.
# --lang ch takes eval_mathvqa.py's non-"en" branch, which reads the question
# file as jsonl (matching test.jsonl).
python ./eval/llava/eval_mathvqa.py --conv-mode llava_v1 \
  --model-path ./checkpoints/llava-v1.6-34b \
  --question-file ./data/pazhou/test.jsonl \
  --answers-file ./results/pazhou/llava-v1.6-34b.jsonl \
  --image-folder ./data/pazhou \
  --lang ch
================================================
FILE: eval/llava/run_qasper.sh
================================================
#!/bin/bash
# Slurm batch script: evaluate one LLaVA checkpoint ($1) on the English-paper
# QASPER-style dev split, once for each question language (en, zh).
#SBATCH -J test # 作业名为 test
#SBATCH -o ./log/llava/test-%j.out # stdout 重定向到 test.out
#SBATCH -e ./log/llava/test-%j.err # stderr 重定向到 test.err
#SBATCH -p compute # 作业提交的分区为 compute
#SBATCH -N 1 # 作业申请 1 个节点
#SBATCH -t 10:00:00 # 任务运行的最长时间为 1 小时
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:a100-sxm4-80gb:1
export MODEL=$1
# English questions (reads dev.json via PaperLoader).
python ./eval/llava/eval_qasper.py --conv-mode llava_v1 \
--model_path ./checkpoints/${MODEL} \
--lang en \
--save_path ./results/paper/enpaper/dev/en/${MODEL}.jsonl
# Chinese questions (reads dev_zh_en.jsonl via PaperLoader).
python ./eval/llava/eval_qasper.py --conv-mode llava_v1 \
--model_path ./checkpoints/${MODEL} \
--lang zh \
--save_path ./results/paper/enpaper/dev/zh/${MODEL}.jsonl
================================================
FILE: eval/log/llava/test-185577.err
================================================
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-extractive-en-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-en-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-en-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-extractive-zh-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-zh-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-zh-zh.jsonl'
================================================
FILE: eval/log/llava/test-185577.out
================================================
================================================
FILE: eval/log/llava/test-185579.err
================================================
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-extractive-en-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-en-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-en-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-extractive-zh-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-zh-zh.jsonl'
Traceback (most recent call last):
File "/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py", line 38, in <module>
dataloader = LunwenLoader(args.data_path)
File "/home/xmyu/mmm-eval/eval/../dataset/dataloader.py", line 137, in __init__
with open(data_file, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: './data/ch_paper/qas/final-abstractive-zh-zh.jsonl'
================================================
FILE: eval/log/llava/test-185579.out
================================================
================================================
FILE: eval/log/test-136013.err
================================================
usage: eval_xtreme.py [-h] [--dataset DATASET] [--savedir SAVEDIR]
[--checkpoint CHECKPOINT] [--eval EVAL]
eval_xtreme.py: error: unrecognized arguments: --checkpoints ../checkpoints/Qwen-VL
================================================
FILE: eval/log/test-136013.out
================================================
================================================
FILE: eval/log/test-136014.err
================================================
/home/xmyu/anaconda3/envs/eval/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/home/xmyu/anaconda3/envs/eval/lib/python3.10/site-packages/torchvision/image.so: undefined symbol: _ZN3c1017RegisterOperatorsD1Ev'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
warn(
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 0%| | 0/10 [00:00<?, ?it/s]
Loading checkpoint shards: 10%|█ | 1/10 [00:01<00:14, 1.64s/it]
Loading checkpoint shards: 20%|██ | 2/10 [00:03<00:15, 1.88s/it]
Loading checkpoint shards: 30%|███ | 3/10 [00:05<00:13, 1.92s/it]
Loading checkpoint shards: 40%|████ | 4/10 [00:07<00:11, 1.91s/it]
Loading checkpoint shards: 50%|█████ | 5/10 [00:09<00:09, 1.89s/it]
Loading checkpoint shards: 60%|██████ | 6/10 [00:11<00:07, 1.86s/it]
Loading checkpoint shards: 70%|███████ | 7/10 [00:13<00:05, 1.89s/it]
Loading checkpoint shards: 80%|████████ | 8/10 [00:16<00:04, 2.21s/it]
Loading checkpoint shards: 90%|█████████ | 9/10 [00:19<00:02, 2.59s/it]
Loading checkpoint shards: 100%|██████████| 10/10 [00:20<00:00, 2.22s/it]
Loading checkpoint shards: 100%|██████████| 10/10 [00:20<00:00, 2.09s/it]
================================================
FILE: eval/log/test-136014.out
================================================
================================================
FILE: eval/log/test-136017.err
================================================
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...
Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary
Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm
Warning: import flash_attn fail, please install FlashAttention to get higher efficiency https://github.com/Dao-AILab/flash-attention
Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
Loading checkpoint shards: 12%|█▎ | 1/8 [00:17<02:03, 17.70s/it]
Loading checkpoint shards: 25%|██▌ | 2/8 [00:36<01:50, 18.34s/it]
Loading checkpoint shards: 38%|███▊ | 3/8 [00:54<01:31, 18.31s/it]
Loading checkpoint shards: 50%|█████ | 4/8 [01:13<01:14, 18.52s/it]
Loading checkpoint shards: 62%|██████▎ | 5/8 [01:35<00:58, 19.56s/it]
Loading checkpoint shards: 75%|███████▌ | 6/8 [01:56<00:40, 20.05s/it]
Loading checkpoint shards: 88%|████████▊ | 7/8 [02:16<00:20, 20.06s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [02:28<00:00, 17.47s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [02:28<00:00, 18.50s/it]
0%| | 0/5137 [00:00<?, ?it/s]
0%| | 1/5137 [00:08<12:20:33, 8.65s/it]
0%| | 2/5137 [00:09<5:24:19, 3.79s/it]
0%| | 3/5137 [00:09<3:13:06, 2.26s/it]
0%| | 4/5137 [00:10<2:25:02, 1.70s/it]
0%| | 5/5137 [00:12<2:35:45, 1.82s/it]
0%| | 6/5137 [00:12<1:47:05, 1.25s/it]
0%| | 7/5137 [00:14<1:56:43, 1.37s/it]
0%| | 8/5137 [00:14<1:41:14, 1.18s/it]
0%| | 9/5137 [00:15<1:21:04, 1.05it/s]
0%| | 10/5137 [00:16<1:16:53, 1.11it/s]
0%| | 11/5137 [00:18<1:45:50, 1.24s/it]
0%| | 12/5137 [00:18<1:22:29, 1.04it/s]
0%| | 13/5137 [00:19<1:11:26, 1.20it/s]
0%| | 14/5137 [00:19<1:01:35, 1.39it/s]
0%| | 15/5137 [00:20<1:03:24, 1.35it/s]
0%| | 16/5137 [00:23<2:00:06, 1.41s/it]
0%| | 17/5137 [00:23<1:34:23, 1.11s/it]
0%| | 18/5137 [00:24<1:28:11, 1.03s/it]
0%| | 19/5137 [00:25<1:29:48, 1.05s/it]
0%| | 20/5137 [00:26<1:15:55, 1.12it/s]
0%| | 21/5137 [00:29<2:21:57, 1.66s/it]
0%| | 22/5137 [00:30<2:01:52, 1.43s/it]
0%| | 23/5137 [00:31<1:43:36, 1.22s/it]
0%| | 24/5137 [00:31<1:25:30, 1.00s/it]
0%| | 25/5137 [00:34<2:02:31, 1.44s/it]
1%| | 26/5137 [00:34<1:29:28, 1.05s/it]
1%| | 27/5137 [00:35<1:33:31, 1.10s/it]
1%| | 28/5137 [00:36<1:25:39, 1.01s/it]
1%| | 29/5137 [00:36<1:11:32, 1.19it/s]
1%| | 30/5137 [00:38<1:30:53, 1.07s/it]
1%| | 31/5137 [00:40<1:54:33, 1.35s/it]
1%| | 32/5137 [00:40<1:29:37, 1.05s/it]
1%| | 33/5137 [00:42<1:49:18, 1.28s/it]
1%| | 34/5137 [00:42<1:25:30, 1.01s/it]
1%| | 35/5137 [00:43<1:17:11, 1.10it/s]
1%| | 36/5137 [00:44<1:11:40, 1.19it/s]
1%| | 37/5137 [00:46<1:54:06, 1.34s/it]
1%| | 38/5137 [00:48<1:53:49, 1.34s/it]
1%| | 39/5137 [00:48<1:42:56, 1.21s/it]
1%| | 40/5137 [00:49<1:25:00, 1.00s/it]
1%| | 41/5137 [00:50<1:15:20, 1.13it/s]
1%| | 42/5137 [00:51<1:18:20, 1.08it/s]
1%| | 43/5137 [00:52<1:20:36, 1.05it/s]
1%| | 44/5137 [00:52<1:06:31, 1.28it/s]
1%| | 45/5137 [00:53<1:06:36, 1.27it/s]
1%| | 46/5137 [00:55<1:39:30, 1.17s/it]
1%| | 47/5137 [00:56<1:36:20, 1.14s/it]
1%| | 48/5137 [00:58<1:52:06, 1.32s/it]
1%| | 49/5137 [00:58<1:33:13, 1.10s/it]
1%| | 50/5137 [00:59<1:21:03, 1.05it/s]
1%| | 51/5137 [00:59<1:02:34, 1.35it/s]
1%| | 52/5137 [01:02<2:03:47, 1.46s/it]
1%| | 53/5137 [01:04<2:10:59, 1.55s/it]
1%| | 54/5137 [01:04<1:41:24, 1.20s/it]
1%| | 55/5137 [01:06<1:55:52, 1.37s/it]
1%| | 56/5137 [01:07<1:31:53, 1.09s/it]
1%| | 57/5137 [01:08<1:29:19, 1.06s/it]
1%| | 58/5137 [01:08<1:19:09, 1.07it/s]
1%| | 59/5137 [01:09<1:04:45, 1.31it/s]
1%| | 60/5137 [01:09<54:37, 1.55it/s]
1%| | 61/5137 [01:11<1:32:20, 1.09s/it]
1%| | 62/5137 [01:12<1:17:35, 1.09it/s]
1%| | 63/5137 [01:12<1:05:00, 1.30it/s]
1%| | 64/5137 [01:12<51:18, 1.65it/s]
1%|▏ | 65/5137 [01:13<43:10, 1.96it/s]
1%|▏ | 66/5137 [01:13<38:05, 2.22it/s]
1%|▏ | 67/5137 [01:13<35:18, 2.39it/s]
1%|▏ | 68/5137 [01:14<43:29, 1.94it/s]
1%|▏ | 69/5137 [01:14<42:37, 1.98it/s]
1%|▏ | 70/5137 [01:15<41:17, 2.05it/s]
1%|▏ | 71/5137 [01:16<59:23, 1.42it/s]
1%|▏ | 72/5137 [01:17<58:42, 1.44it/s]
1%|▏ | 73/5137 [01:17<53:56, 1.56it/s]
1%|▏ | 74/5137 [01:19<1:18:57, 1.07it/s]
1%|▏ | 75/5137 [01:19<1:10:16, 1.20it/s]
1%|▏ | 76/5137 [01:20<1:10:30, 1.20it/s]
1%|▏ | 77/5137 [01:21<1:10:00, 1.20it/s]
2%|▏ | 78/5137 [01:21<54:05, 1.56it/s]
2%|▏ | 79/5137 [01:22<52:07, 1.62it/s]
2%|▏ | 80/5137 [01:23<57:50, 1.46it/s]
2%|▏ | 81/5137 [01:25<1:36:29, 1.14s/it]
2%|▏ | 82/5137 [01:25<1:16:07, 1.11it/s]
2%|▏ | 83/5137 [01:26<1:01:12, 1.38it/s]
2%|▏ | 84/5137 [01:27<1:14:06, 1.14it/s]
2%|▏ | 85/5137 [01:27<1:05:25, 1.29it/s]
2%|▏ | 86/5137 [01:28<1:07:07, 1.25it/s]
2%|▏ | 87/5137 [01:29<1:04:06, 1.31it/s]
2%|▏ | 88/5137 [01:29<48:31, 1.73it/s]
2%|▏ | 89/5137 [01:30<46:34, 1.81it/s]
2%|▏ | 90/5137 [01:30<51:34, 1.63it/s]
2%|▏ | 91/5137 [01:31<41:39, 2.02it/s]
2%|▏ | 92/5137 [01:31<37:40, 2.23it/s]
2%|▏ | 93/5137 [01:33<1:21:03, 1.04it/s]
2%|▏ | 94/5137 [01:34<1:21:53, 1.03it/s]
2%|▏ | 95/5137 [01:34<1:07:19, 1.25it/s]
2%|▏ | 96/5137 [01:35<1:10:32, 1.19it/s]
2%|▏ | 97/5137 [01:36<1:09:54, 1.20it/s]
2%|▏ | 98/5137 [01:37<59:39, 1.41it/s]
2%|▏ | 99/5137 [01:37<56:13, 1.49it/s]
2%|▏ | 100/5137 [01:38<58:54, 1.43it/s]
2%|▏ | 101/5137 [01:39<55:26, 1.51it/s]
2%|▏ | 102/5137 [01:39<49:30, 1.69it/s]
2%|▏ | 103/5137 [01:39<46:06, 1.82it/s]
2%|▏ | 104/5137 [01:40<47:55, 1.75it/s]
2%|▏ | 105/5137 [01:41<52:16, 1.60it/s]
2%|▏ | 106/5137 [01:42<57:24, 1.46it/s]
2%|▏ | 107/5137 [01:42<1:02:46, 1.34it/s]
2%|▏ | 108/5137 [01:43<1:03:48, 1.31it/s]
2%|▏ | 109/5137 [01:44<59:50, 1.40it/s]
2%|▏ | 110/5137 [01:44<56:05, 1.49it/s]
2%|▏ | 111/5137 [01:45<50:39, 1.65it/s]
2%|▏ | 112/5137 [01:46<1:00:25, 1.39it/s]
2%|▏ | 113/5137 [01:46<50:51, 1.65it/s]
2%|▏ | 114/5137 [01:47<53:34, 1.56it/s]
2%|▏ | 115/5137 [01:47<47:27, 1.76it/s]
2%|▏ | 116/5137 [01:49<1:04:09, 1.30it/s]
2%|▏ | 117/5137 [01:50<1:16:35, 1.09it/s]
2%|▏ | 118/5137 [01:50<1:02:52, 1.33it/s]
2%|▏ | 119/5137 [01:51<58:10, 1.44it/s]
2%|▏ | 120/5137 [01:52<1:00:08, 1.39it/s]
2%|▏ | 121/5137 [01:53<1:07:27, 1.24it/s]
2%|▏ | 122/5137 [01:54<1:10:55, 1.18it/s]
2%|▏ | 123/5137 [01:54<1:01:41, 1.35it/s]
2%|▏ | 124/5137 [01:54<53:06, 1.57it/s]
2%|▏ | 125/5137 [01:56<1:16:33, 1.09it/s]
2%|▏ | 126/5137 [01:56<1:03:52, 1.31it/s]
2%|▏ | 127/5137 [01:59<1:58:49, 1.42s/it]
2%|▏ | 128/5137 [02:00<1:29:36, 1.07s/it]
3%|▎ | 129/5137 [02:02<1:53:17, 1.36s/it]
3%|▎ | 130/5137 [02:02<1:28:33, 1.06s/it]
3%|▎ | 131/5137 [02:02<1:11:12, 1.17it/s]
3%|▎ | 132/5137 [02:03<1:09:04, 1.21it/s]
3%|▎ | 133/5137 [02:03<57:44, 1.44it/s]
3%|▎ | 134/5137 [02:04<55:59, 1.49it/s]
3%|▎ | 135/5137 [02:05<1:01:47, 1.35it/s]
3%|▎ | 136/5137 [02:07<1:21:40, 1.02it/s]
3%|▎ | 137/5137 [02:08<1:45:33, 1.27s/it]
3%|▎ | 138/5137 [02:10<1:53:44, 1.37s/it]
3%|▎ | 139/5137 [02:11<1:30:12, 1.08s/it]
3%|▎ | 140/5137 [02:11<1:16:34, 1.09it/s]
3%|▎ | 141/5137 [02:12<1:08:24, 1.22it/s]
3%|▎ | 142/5137 [02:12<1:04:47, 1.28it/s]
3%|▎ | 143/5137 [02:14<1:20:25, 1.03it/s]
3%|▎ | 144/5137 [02:16<1:43:38, 1.25s/it]
3%|▎ | 145/5137 [02:16<1:30:23, 1.09s/it]
3%|▎ | 146/5137 [02:17<1:17:14, 1.08it/s]
3%|▎ | 147/5137 [02:17<1:04:45, 1.28it/s]
3%|▎ | 148/5137 [02:18<51:44, 1.61it/s]
3%|▎ | 149/5137 [02:19<1:12:53, 1.14it/s]
3%|▎ | 150/5137 [02:23<2:29:09, 1.79s/it]
3%|▎ | 151/5137 [02:24<2:01:16, 1.46s/it]
3%|▎ | 152/5137 [02:24<1:44:39, 1.26s/it]
3%|▎ | 153/5137 [02:25<1:20:37, 1.03it/s]
3%|▎ | 154/5137 [02:25<1:03:30, 1.31it/s]
3%|▎ | 155/5137 [02:25<52:10, 1.59it/s]
3%|▎ | 156/5137 [02:26<49:08, 1.69it/s]
3%|▎ | 157/5137 [02:26<47:36, 1.74it/s]
3%|▎ | 158/5137 [02:27<44:36, 1.86it/s]
3%|▎ | 159/5137 [02:27<37:15, 2.23it/s]
3%|▎ | 160/5137 [02:28<36:14, 2.29it/s]
3%|▎ | 161/5137 [02:28<36:38, 2.26it/s]
3%|▎ | 162/5137 [02:28<38:17, 2.17it/s]
3%|▎ | 163/5137 [02:32<1:51:04, 1.34s/it]
3%|▎ | 164/5137 [02:32<1:29:41, 1.08s/it]
3%|▎ | 165/5137 [02:33<1:14:43, 1.11it/s]
3%|▎ | 166/5137 [02:33<57:58, 1.43it/s]
3%|▎ | 167/5137 [02:33<49:02, 1.69it/s]
3%|▎ | 168/5137 [02:34<52:32, 1.58it/s]
3%|▎ | 169/5137 [02:34<45:51, 1.81it/s]
3%|▎ | 170/5137 [02:35<43:15, 1.91it/s]
3%|▎ | 171/5137 [02:37<1:30:19, 1.09s/it]
3%|▎ | 172/5137 [02:38<1:26:34, 1.05s/it]
3%|▎ | 173/5137 [02:39<1:15:38, 1.09it/s]
3%|▎ | 174/5137 [02:41<1:37:23, 1.18s/it]
3%|▎ | 175/5137 [02:42<1:30:28, 1.09s/it]
3%|▎ | 176/5137 [02:42<1:08:16, 1.21it/s]
3%|▎ | 177/5137 [02:42<1:03:10, 1.31it/s]
3%|▎ | 178/5137 [02:43<1:01:01, 1.35it/s]
3%|▎ | 179/5137 [02:46<2:06:44, 1.53s/it]
4%|▎ | 180/5137 [02:47<1:35:42, 1.16s/it]
4%|▎ | 181/5137 [02:48<1:25:48, 1.04s/it]
4%|▎ | 182/5137 [02:48<1:04:19, 1.28it/s]
4%|▎ | 183/5137 [02:48<52:01, 1.59it/s]
4%|▎ | 184/5137 [02:48<48:48, 1.69it/s]
4%|▎ | 185/5137 [02:49<50:00, 1.65it/s]
4%|▎ | 186/5137 [02:51<1:11:40, 1.15it/s]
4%|▎ | 187/5137 [02:51<1:07:34, 1.22it/s]
4%|▎ | 188/5137 [02:52<1:15:44, 1.09it/s]
4%|▎ | 189/5137 [02:53<1:08:21, 1.21it/s]
4%|▎ | 190/5137 [02:54<1:13:51, 1.12it/s]
4%|▎ | 191/5137 [02:55<1:03:33, 1.30it/s]
4%|▎ | 192/5137 [02:55<53:36, 1.54it/s]
4%|▍ | 193/5137 [02:55<48:19, 1.71it/s]
4%|▍ | 194/5137 [02:56<46:23, 1.78it/s]
4%|▍ | 195/5137 [02:56<38:07, 2.16it/s]
4%|▍ | 196/5137 [02:57<44:46, 1.84it/s]
4%|▍ | 197/5137 [02:58<52:10, 1.58it/s]
4%|▍ | 198/5137 [02:58<40:46, 2.02it/s]
4%|▍ | 199/5137 [02:58<42:27, 1.94it/s]
4%|▍ | 200/5137 [03:01<1:22:30, 1.00s/it]
4%|▍ | 201/5137 [03:01<1:05:45, 1.25it/s]
4%|▍ | 202/5137 [03:01<59:55, 1.37it/s]
4%|▍ | 203/5137 [03:02<53:47, 1.53it/s]
4%|▍ | 204/5137 [03:03<1:04:56, 1.27it/s]
4%|▍ | 205/5137 [03:03<51:04, 1.61it/s]
4%|▍ | 206/5137 [03:04<50:53, 1.61it/s]
4%|▍ | 207/5137 [03:05<57:08, 1.44it/s]
4%|▍ | 208/5137 [03:06<1:09:31, 1.18it/s]
4%|▍ | 209/5137 [03:06<1:01:13, 1.34it/s]
4%|▍ | 210/5137 [03:07<58:50, 1.40it/s]
4%|▍ | 211/5137 [03:08<59:59, 1.37it/s]
4%|▍ | 212/5137 [03:09<1:00:37, 1.35it/s]
4%|▍ | 213/5137 [03:09<1:01:07, 1.34it/s]
4%|▍ | 214/5137 [03:10<1:00:48, 1.35it/s]
4%|▍ | 215/5137 [03:11<1:03:20, 1.30it/s]
4%|▍ | 216/5137 [03:11<56:24, 1.45it/s]
4%|▍ | 217/5137 [03:15<2:02:05, 1.49s/it]
4%|▍ | 218/5137 [03:16<1:42:06, 1.25s/it]
4%|▍ | 219/5137 [03:16<1:23:18, 1.02s/it]
4%|▍ | 220/5137 [03:17<1:18:38, 1.04it/s]
4%|▍ | 221/5137 [03:17<1:07:34, 1.21it/s]
4%|▍ | 222/5137 [03:19<1:31:26, 1.12s/it]
4%|▍ | 223/5137 [03:20<1:24:44, 1.03s/it]
4%|▍ | 224/5137 [03:20<1:09:03, 1.19it/s]
4%|▍ | 225/5137 [03:21<59:25, 1.38it/s]
4%|▍ | 226/5137 [03:22<1:18:03, 1.05it/s]
4%|▍ | 227/5137 [03:23<1:05:00, 1.26it/s]
4%|▍ | 228/5137 [03:23<53:06, 1.54it/s]
4%|▍ | 229/5137 [03:24<54:25, 1.50it/s]
4%|▍ | 230/5137 [03:24<51:14, 1.60it/s]
4%|▍ | 231/5137 [03:26<1:08:52, 1.19it/s]
5%|▍ | 232/5137 [03:26<55:16, 1.48it/s]
5%|▍ | 233/5137 [03:26<43:44, 1.87it/s]
5%|▍ | 234/5137 [03:27<46:01, 1.78it/s]
5%|▍ | 235/5137 [03:30<1:41:31, 1.24s/it]
5%|▍ | 236/5137 [03:30<1:22:48, 1.01s/it]
5%|▍ | 237/5137 [03:30<1:07:03, 1.22it/s]
5%|▍ | 238/5137 [03:31<58:11, 1.40it/s]
5%|▍ | 239/5137 [03:31<50:14, 1.63it/s]
5%|▍ | 240/5137 [03:32<48:43, 1.68it/s]
5%|▍ | 241/5137 [03:32<36:59, 2.21it/s]
5%|▍ | 242/5137 [03:32<34:15, 2.38it/s]
5%|▍ | 243/5137 [03:33<35:57, 2.27it/s]
5%|▍ | 244/5137 [03:33<37:12, 2.19it/s]
5%|▍ | 245/5137 [03:34<42:44, 1.91it/s]
5%|▍ | 246/5137 [03:35<57:24, 1.42it/s]
5%|▍ | 247/5137 [03:36<57:38, 1.41it/s]
5%|▍ | 248/5137 [03:37<59:17, 1.37it/s]
5%|▍ | 249/5137 [03:38<1:19:12, 1.03it/s]
5%|▍ | 250/5137 [03:39<1:10:49, 1.15it/s]
5%|▍ | 251/5137 [03:39<56:27, 1.44it/s]
5%|▍ | 252/5137 [03:39<50:43, 1.60it/s]
5%|▍ | 253/5137 [03:40<52:45, 1.54it/s]
5%|▍ | 254/5137 [03:41<45:18, 1.80it/s]
5%|▍ | 255/5137 [03:41<50:34, 1.61it/s]
5%|▍ | 256/5137 [03:42<50:25, 1.61it/s]
5%|▌ | 257/5137 [03:42<45:08, 1.80it/s]
5%|▌ | 258/5137 [03:43<46:33, 1.75it/s]
5%|▌ | 259/5137 [03:44<48:25, 1.68it/s]
5%|▌ | 260/5137 [03:46<1:30:41, 1.12s/it]
5%|▌ | 261/5137 [03:47<1:31:21, 1.12s/it]
5%|▌ | 262/5137 [03:48<1:29:24, 1.10s/it]
5%|▌ | 263/5137 [03:50<1:40:33, 1.24s/it]
5%|▌ | 264/5137 [03:51<1:30:52, 1.12s/it]
5%|▌ | 265/5137 [03:52<1:35:25, 1.18s/it]
5%|▌ | 266/5137 [03:54<2:08:57, 1.59s/it]
5%|▌ | 267/5137 [03:55<1:35:56, 1.18s/it]
5%|▌ | 268/5137 [03:56<1:29:43, 1.11s/it]
5%|▌ | 269/5137 [03:56<1:17:34, 1.05it/s]
5%|▌ | 270/5137 [03:57<1:08:59, 1.18it/s]
5%|▌ | 271/5137 [03:59<1:37:06, 1.20s/it]
5%|▌ | 272/5137 [03:59<1:16:16, 1.06it/s]
5%|▌ | 273/5137 [04:00<1:23:30, 1.03s/it]
5%|▌ | 274/5137 [04:01<1:10:23, 1.15it/s]
5%|▌ | 275/5137 [04:02<1:06:11, 1.22it/s]
5%|▌ | 276/5137 [04:02<1:02:58, 1.29it/s]
5%|▌ | 277/5137 [04:03<55:37, 1.46it/s]
5%|▌ | 278/5137 [04:03<49:03, 1.65it/s]
5%|▌ | 279/5137 [04:04<47:13, 1.71it/s]
5%|▌ | 280/5137 [04:05<56:36, 1.43it/s]
5%|▌ | 281/5137 [04:05<56:14, 1.44it/s]
5%|▌ | 282/5137 [04:06<1:04:43, 1.25it/s]
6%|▌ | 283/5137 [04:08<1:16:41, 1.05it/s]
6%|▌ | 284/5137 [04:10<1:42:10, 1.26s/it]
6%|▌ | 285/5137 [04:10<1:23:37, 1.03s/it]
6%|▌ | 286/5137 [04:11<1:25:41, 1.06s/it]
6%|▌ | 287/5137 [04:13<1:36:26, 1.19s/it]
6%|▌ | 288/5137 [04:13<1:20:48, 1.00it/s]
6%|▌ | 289/5137 [04:14<1:17:31, 1.04it/s]
6%|▌ | 290/5137 [04:16<1:41:23, 1.26s/it]
6%|▌ | 291/5137 [04:17<1:30:55, 1.13s/it]
6%|▌ | 292/5137 [04:17<1:13:19, 1.10it/s]
6%|▌ | 293/5137 [04:18<1:07:40, 1.19it/s]
6%|▌ | 294/5137 [04:20<1:25:26, 1.06s/it]
6%|▌ | 295/5137 [04:21<1:37:54, 1.21s/it]
6%|▌ | 296/5137 [04:23<1:42:50, 1.27s/it]
6%|▌ | 297/5137 [04:23<1:33:22, 1.16s/it]
6%|▌ | 298/5137 [04:24<1:18:13, 1.03it/s]
6%|▌ | 299/5137 [04:24<1:06:37, 1.21it/s]
6%|▌ | 300/5137 [04:25<54:47, 1.47it/s]
6%|▌ | 301/5137 [04:26<1:00:54, 1.32it/s]
6%|▌ | 302/5137 [04:26<58:47, 1.37it/s]
6%|▌ | 303/5137 [04:27<52:13, 1.54it/s]
6%|▌ | 304/5137 [04:27<50:37, 1.59it/s]
6%|▌ | 305/5137 [04:28<58:59, 1.37it/s]
6%|▌ | 306/5137 [04:29<58:59, 1.36it/s]
6%|▌ | 307/5137 [04:29<48:40, 1.65it/s]
6%|▌ | 308/5137 [04:31<1:01:17, 1.31it/s]
6%|▌ | 309/5137 [04:31<59:51, 1.34it/s]
6%|▌ | 310/5137 [04:33<1:13:24, 1.10it/s]
6%|▌ | 311/5137 [04:33<56:38, 1.42it/s]
6%|▌ | 312/5137 [04:34<1:02:27, 1.29it/s]
6%|▌ | 313/5137 [04:35<1:12:17, 1.11it/s]
6%|▌ | 314/5137 [04:36<1:22:25, 1.03s/it]
6%|▌ | 315/5137 [04:37<1:10:57, 1.13it/s]
6%|▌ | 316/5137 [04:37<59:23, 1.35it/s]
6%|▌ | 317/5137 [04:38<55:28, 1.45it/s]
6%|▌ | 318/5137 [04:38<42:37, 1.88it/s]
6%|▌ | 319/5137 [04:38<39:43, 2.02it/s]
6%|▌ | 320/5137 [04:39<42:06, 1.91it/s]
6%|▌ | 321/5137 [04:40<1:02:26, 1.29it/s]
6%|▋ | 322/5137 [04:41<51:50, 1.55it/s]
6%|▋ | 323/5137 [04:42<59:00, 1.36it/s]
6%|▋ | 324/5137 [04:43<1:25:54, 1.07s/it]
6%|▋ | 325/5137 [04:44<1:11:49, 1.12it/s]
6%|▋ | 326/5137 [04:45<1:07:03, 1.20it/s]
6%|▋ | 327/5137 [04:45<55:01, 1.46it/s]
6%|▋ | 328/5137 [04:46<57:07, 1.40it/s]
6%|▋ | 329/5137 [04:47<1:10:55, 1.13it/s]
6%|▋ | 330/5137 [04:49<1:32:48, 1.16s/it]
6%|▋ | 331/5137 [04:51<2:04:02, 1.55s/it]
6%|▋ | 332/5137 [04:54<2:19:30, 1.74s/it]
6%|▋ | 333/5137 [04:54<1:56:45, 1.46s/it]
7%|▋ | 334/5137 [05:00<3:39:36, 2.74s/it]
7%|▋ | 335/5137 [05:01<2:47:41, 2.10s/it]
7%|▋ | 336/5137 [05:01<2:12:21, 1.65s/it]
7%|▋ | 337/5137 [05:02<1:50:27, 1.38s/it]
7%|▋ | 338/5137 [05:03<1:40:12, 1.25s/it]
7%|▋ | 339/5137 [05:03<1:17:32, 1.03it/s]
7%|▋ | 340/5137 [05:04<1:23:28, 1.04s/it]
7%|▋ | 341/5137 [05:05<1:09:25, 1.15it/s]
7%|▋ | 342/5137 [05:06<1:03:13, 1.26it/s]
7%|▋ | 343/5137 [05:06<1:04:42, 1.23it/s]
7%|▋ | 344/5137 [05:08<1:18:07, 1.02it/s]
7%|▋ | 345/5137 [05:12<2:31:34, 1.90s/it]
7%|▋ | 346/5137 [05:12<2:02:10, 1.53s/it]
7%|▋ | 347/5137 [05:13<1:44:31, 1.31s/it]
7%|▋ | 348/5137 [05:14<1:36:30, 1.21s/it]
7%|▋ | 349/5137 [05:16<1:44:36, 1.31s/it]
7%|▋ | 350/5137 [05:17<1:44:33, 1.31s/it]
7%|▋ | 351/5137 [05:18<1:25:36, 1.07s/it]
7%|▋ | 352/5137 [05:19<1:34:07, 1.18s/it]
7%|▋ | 353/5137 [05:22<2:07:30, 1.60s/it]
7%|▋ | 354/5137 [05:22<1:43:08, 1.29s/it]
7%|▋ | 355/5137 [05:28<3:39:29, 2.75s/it]
7%|▋ | 356/5137 [05:30<3:08:08, 2.36s/it]
7%|▋ | 357/5137 [05:33<3:26:00, 2.59s/it]
7%|▋ | 358/5137 [05:34<2:38:05, 1.98s/it]
7%|▋ | 359/5137 [05:34<2:00:10, 1.51s/it]
7%|▋ | 360/5137 [05:35<1:37:54, 1.23s/it]
7%|▋ | 361/5137 [05:36<1:32:26, 1.16s/it]
7%|▋ | 362/5137 [05:36<1:12:41, 1.09it/s]
7%|▋ | 363/5137 [05:36<1:05:04, 1.22it/s]
7%|▋ | 364/5137 [05:37<1:00:42, 1.31it/s]
7%|▋ | 365/5137 [05:38<1:13:16, 1.09it/s]
7%|▋ | 366/5137 [05:40<1:37:39, 1.23s/it]
7%|▋ | 367/5137 [05:41<1:29:05, 1.12s/it]
7%|▋ | 368/5137 [05:43<1:53:50, 1.43s/it]
7%|▋ | 369/5137 [05:44<1:41:17, 1.27s/it]
7%|▋ | 370/5137 [05:45<1:31:14, 1.15s/it]
7%|▋ | 371/5137 [05:47<1:49:23, 1.38s/it]
7%|▋ | 372/5137 [05:48<1:50:50, 1.40s/it]
7%|▋ | 373/5137 [05:52<2:30:51, 1.90s/it]
7%|▋ | 374/5137 [05:52<2:06:53, 1.60s/it]
7%|▋ | 375/5137 [05:55<2:28:18, 1.87s/it]
7%|▋ | 376/5137 [05:57<2:27:09, 1.85s/it]
7%|▋ | 377/5137 [05:58<2:01:29, 1.53s/it]
7%|▋ | 378/5137 [05:59<1:57:03, 1.48s/it]
7%|▋ | 379/5137 [06:00<1:37:27, 1.23s/it]
7%|▋ | 380/5137 [06:01<1:41:12, 1.28s/it]
7%|▋ | 381/5137 [06:02<1:34:30, 1.19s/it]
7%|▋ | 382/5137 [06:04<1:55:58, 1.46s/it]
7%|▋ | 383/5137 [06:04<1:30:21, 1.14s/it]
7%|▋ | 384/5137 [06:11<3:37:03, 2.74s/it]
7%|▋ | 385/5137 [06:13<3:11:55, 2.42s/it]
8%|▊ | 386/5137 [06:13<2:27:39, 1.86s/it]
8%|▊ | 387/5137 [06:13<1:47:48, 1.36s/it]
8%|▊ | 388/5137 [06:16<2:19:36, 1.76s/it]
8%|▊ | 389/5137 [06:16<1:45:22, 1.33s/it]
8%|▊ | 390/5137 [06:17<1:27:51, 1.11s/it]
8%|▊ | 391/5137 [06:18<1:24:14, 1.07s/it]
8%|▊ | 392/5137 [06:20<1:49:01, 1.38s/it]
8%|▊ | 393/5137 [06:21<1:29:25, 1.13s/it]
8%|▊ | 394/5137 [06:21<1:16:14, 1.04it/s]
8%|▊ | 395/5137 [06:22<1:05:00, 1.22it/s]
8%|▊ | 396/5137 [06:23<1:15:01, 1.05it/s]
8%|▊ | 397/5137 [06:24<1:15:34, 1.05it/s]
8%|▊ | 398/5137 [06:27<1:59:54, 1.52s/it]
8%|▊ | 399/5137 [06:31<3:09:32, 2.40s/it]
8%|▊ | 400/5137 [06:32<2:27:03, 1.86s/it]
8%|▊ | 401/5137 [06:34<2:38:44, 2.01s/it]
8%|▊ | 402/5137 [06:35<2:07:18, 1.61s/it]
8%|▊ | 403/5137 [06:35<1:38:03, 1.24s/it]
8%|▊ | 404/5137 [06:36<1:22:19, 1.04s/it]
8%|▊ | 405/5137 [06:36<1:07:39, 1.17it/s]
8%|▊ | 406/5137 [06:39<1:57:40, 1.49s/it]
8%|▊ | 407/5137 [06:39<1:27:31, 1.11s/it]
8%|▊ | 408/5137 [06:40<1:11:23, 1.10it/s]
8%|▊ | 409/5137 [06:40<55:06, 1.43it/s]
8%|▊ | 410/5137 [06:40<47:56, 1.64it/s]
8%|▊ | 411/5137 [06:41<41:31, 1.90it/s]
8%|▊ | 412/5137 [06:41<44:58, 1.75it/s]
8%|▊ | 413/5137 [06:42<46:03, 1.71it/s]
8%|▊ | 414/5137 [06:43<46:58, 1.68it/s]
8%|▊ | 415/5137 [06:43<48:45, 1.61it/s]
8%|▊ | 416/5137 [06:44<43:34, 1.81it/s]
8%|▊ | 417/5137 [06:44<43:33, 1.81it/s]
8%|▊ | 418/5137 [06:45<39:51, 1.97it/s]
8%|▊ | 419/5137 [06:45<37:58, 2.07it/s]
8%|▊ | 420/5137 [06:45<33:07, 2.37it/s]
8%|▊ | 421/5137 [06:46<43:20, 1.81it/s]
8%|▊ | 422/5137 [06:47<40:41, 1.93it/s]
8%|▊ | 423/5137 [06:48<1:08:58, 1.14it/s]
8%|▊ | 424/5137 [06:50<1:27:05, 1.11s/it]
8%|▊ | 425/5137 [06:51<1:15:40, 1.04it/s]
8%|▊ | 426/5137 [06:51<1:08:19, 1.15it/s]
8%|▊ | 427/5137 [06:52<1:15:07, 1.04it/s]
8%|▊ | 428/5137 [06:53<1:01:11, 1.28it/s]
8%|▊ | 429/5137 [06:53<55:11, 1.42it/s]
8%|▊ | 430/5137 [06:54<1:04:27, 1.22it/s]
8%|▊ | 431/5137 [06:55<54:32, 1.44it/s]
8%|▊ | 432/5137 [06:57<1:31:10, 1.16s/it]
8%|▊ | 433/5137 [07:00<2:08:44, 1.64s/it]
8%|▊ | 434/5137 [07:01<2:07:13, 1.62s/it]
8%|▊ | 435/5137 [07:02<1:44:26, 1.33s/it]
8%|▊ | 436/5137 [07:03<1:26:59, 1.11s/it]
9%|▊ | 437/5137 [07:03<1:11:01, 1.10it/s]
9%|▊ | 438/5137 [07:04<1:19:38, 1.02s/it]
9%|▊ | 439/5137 [07:05<1:10:06, 1.12it/s]
9%|▊ | 440/5137 [07:05<59:47, 1.31it/s]
9%|▊ | 441/5137 [07:06<49:53, 1.57it/s]
9%|▊ | 442/5137 [07:08<1:19:54, 1.02s/it]
9%|▊ | 443/5137 [07:09<1:15:27, 1.04it/s]
9%|▊ | 444/5137 [07:09<1:14:10, 1.05it/s]
9%|▊ | 445/5137 [07:11<1:27:50, 1.12s/it]
9%|▊ | 446/5137 [07:12<1:14:21, 1.05it/s]
9%|▊ | 447/5137 [07:12<1:07:43, 1.15it/s]
9%|▊ | 448/5137 [07:14<1:28:47, 1.14s/it]
9%|▊ | 449/5137 [07:15<1:26:36, 1.11s/it]
9%|▉ | 450/5137 [07:16<1:12:46, 1.07it/s]
9%|▉ | 451/5137 [07:17<1:36:13, 1.23s/it]
9%|▉ | 452/5137 [07:18<1:21:44, 1.05s/it]
9%|▉ | 453/5137 [07:19<1:07:57, 1.15it/s]
9%|▉ | 454/5137 [07:19<58:26, 1.34it/s]
9%|▉ | 455/5137 [07:20<53:09, 1.47it/s]
9%|▉ | 456/5137 [07:20<50:06, 1.56it/s]
9%|▉ | 457/5137 [07:21<1:00:04, 1.30it/s]
9%|▉ | 458/5137 [07:22<52:45, 1.48it/s]
9%|▉ | 459/5137 [07:22<51:59, 1.50it/s]
9%|▉ | 460/5137 [07:22<42:12, 1.85it/s]
9%|▉ | 461/5137 [07:23<38:55, 2.00it/s]
9%|▉ | 462/5137 [07:23<36:32, 2.13it/s]
9%|▉ | 463/5137 [07:24<45:50, 1.70it/s]
9%|▉ | 464/5137 [07:25<57:09, 1.36it/s]
9%|▉ | 465/5137 [07:26<55:02, 1.41it/s]
9%|▉ | 466/5137 [07:26<51:18, 1.52it/s]
9%|▉ | 467/5137 [07:28<1:18:50, 1.01s/it]
9%|▉ | 468/5137 [07:29<1:13:45, 1.06it/s]
9%|▉ | 469/5137 [07:29<1:01:40, 1.26it/s]
9%|▉ | 470/5137 [07:30<53:09, 1.46it/s]
9%|▉ | 471/5137 [07:31<53:43, 1.45it/s]
9%|▉ | 472/5137 [07:31<56:51, 1.37it/s]
9%|▉ | 473/5137 [07:32<53:46, 1.45it/s]
9%|▉ | 474/5137 [07:32<45:49, 1.70it/s]
9%|▉ | 475/5137 [07:33<46:20, 1.68it/s]
9%|▉ | 476/5137 [07:34<44:52, 1.73it/s]
9%|▉ | 477/5137 [07:34<43:51, 1.77it/s]
9%|▉ | 478/5137 [07:35<43:47, 1.77it/s]
9%|▉ | 479/5137 [07:35<39:20, 1.97it/s]
9%|▉ | 480/5137 [07:35<38:17, 2.03it/s]
9%|▉ | 481/5137 [07:36<38:15, 2.03it/s]
9%|▉ | 482/5137 [07:40<1:49:51, 1.42s/it]
9%|▉ | 483/5137 [07:41<1:44:37, 1.35s/it]
9%|▉ | 484/5137 [07:41<1:20:22, 1.04s/it]
9%|▉ | 485/5137 [07:42<1:11:52, 1.08it/s]
9%|▉ | 486/5137 [07:43<1:11:34, 1.08it/s]
9%|▉ | 487/5137 [07:46<2:07:12, 1.64s/it]
9%|▉ | 488/5137 [07:46<1:39:45, 1.29s/it]
10%|▉ | 489/5137 [07:47<1:26:08, 1.11s/it]
10%|▉ | 490/5137 [07:48<1:10:57, 1.09it/s]
10%|▉ | 491/5137 [07:48<55:26, 1.40it/s]
10%|▉ | 492/5137 [07:49<1:15:29, 1.03it/s]
10%|▉ | 493/5137 [07:52<1:49:25, 1.41s/it]
10%|▉ | 494/5137 [07:52<1:28:42, 1.15s/it]
10%|▉ | 495/5137 [07:53<1:14:52, 1.03it/s]
10%|▉ | 496/5137 [07:54<1:09:24, 1.11it/s]
10%|▉ | 497/5137 [07:54<58:04, 1.33it/s]
10%|▉ | 498/5137 [07:55<53:25, 1.45it/s]
10%|▉ | 499/5137 [07:55<41:42, 1.85it/s]
10%|▉ | 500/5137 [07:56<1:05:45, 1.18it/s]
10%|▉ | 501/5137 [07:57<51:46, 1.49it/s]
10%|▉ | 502/5137 [07:58<1:06:18, 1.16it/s]
10%|▉ | 503/5137 [07:58<57:07, 1.35it/s]
10%|▉ | 504/5137 [08:00<1:06:40, 1.16it/s]
10%|▉ | 505/5137 [08:00<1:00:01, 1.29it/s]
10%|▉ | 506/5137 [08:01<57:32, 1.34it/s]
10%|▉ | 507/5137 [08:02<1:11:54, 1.07it/s]
10%|▉ | 508/5137 [08:03<58:51, 1.31it/s]
10%|▉ | 509/5137 [08:04<1:07:18, 1.15it/s]
10%|▉ | 510/5137 [08:04<1:06:54, 1.15it/s]
10%|▉ | 511/5137 [08:05<56:06, 1.37it/s]
10%|▉ | 512/5137 [08:06<55:06, 1.40it/s]
10%|▉ | 513/5137 [08:06<43:33, 1.77it/s]
10%|█ | 514/5137 [08:06<43:47, 1.76it/s]
10%|█ | 515/5137 [08:07<52:26, 1.47it/s]
10%|█ | 516/5137 [08:08<50:50, 1.51it/s]
10%|█ | 517/5137 [08:09<56:58, 1.35it/s]
10%|█ | 518/5137 [08:10<57:49, 1.33it/s]
10%|█ | 519/5137 [08:11<1:13:03, 1.05it/s]
10%|█ | 520/5137 [08:12<1:06:36, 1.16it/s]
10%|█ | 521/5137 [08:12<1:00:14, 1.28it/s]
10%|█ | 523/5137 [08:14<1:11:18, 1.08it/s]
10%|█ | 524/5137 [08:16<1:20:53, 1.05s/it]
10%|█ | 525/5137 [08:18<1:32:42, 1.21s/it]
10%|█ | 526/5137 [08:18<1:24:41, 1.10s/it]
10%|█ | 527/5137 [08:20<1:25:35, 1.11s/it]
10%|█ | 528/5137 [08:20<1:06:35, 1.15it/s]
10%|█ | 529/5137 [08:20<58:10, 1.32it/s]
10%|█ | 530/5137 [08:21<51:27, 1.49it/s]
10%|█ | 531/5137 [08:21<50:47, 1.51it/s]
10%|█ | 532/5137 [08:22<48:16, 1.59it/s]
10%|█ | 533/5137 [08:22<44:23, 1.73it/s]
10%|█ | 534/5137 [08:23<45:08, 1.70it/s]
10%|█ | 535/5137 [08:24<45:40, 1.68it/s]
10%|█ | 536/5137 [08:25<56:26, 1.36it/s]
10%|█ | 537/5137 [08:26<59:08, 1.30it/s]
10%|█ | 538/5137 [08:26<51:19, 1.49it/s]
10%|█ | 539/5137 [08:26<40:58, 1.87it/s]
11%|█ | 540/5137 [08:27<52:34, 1.46it/s]
11%|█ | 541/5137 [08:28<48:42, 1.57it/s]
11%|█ | 542/5137 [08:29<59:04, 1.30it/s]
11%|█ | 543/5137 [08:29<55:56, 1.37it/s]
11%|█ | 544/5137 [08:30<53:58, 1.42it/s]
11%|█ | 545/5137 [08:31<52:48, 1.45it/s]
11%|█ | 546/5137 [08:31<44:17, 1.73it/s]
11%|█ | 547/5137 [08:32<40:49, 1.87it/s]
11%|█ | 548/5137 [08:33<1:04:13, 1.19it/s]
11%|█ | 549/5137 [08:34<1:00:24, 1.27it/s]
11%|█ | 550/5137 [08:34<58:38, 1.30it/s]
11%|█ | 551/5137 [08:35<56:10, 1.36it/s]
11%|█ | 552/5137 [08:36<50:35, 1.51it/s]
11%|█ | 553/5137 [08:36<46:34, 1.64it/s]
11%|█ | 554/5137 [08:37<55:53, 1.37it/s]
11%|█ | 555/5137 [08:38<1:05:59, 1.16it/s]
11%|█ | 556/5137 [08:40<1:24:30, 1.11s/it]
11%|█ | 557/5137 [08:41<1:18:46, 1.03s/it]
11%|█ | 558/5137 [08:42<1:13:07, 1.04it/s]
11%|█ | 559/5137 [08:42<1:00:58, 1.25it/s]
11%|█ | 560/5137 [08:43<1:06:07, 1.15it/s]
11%|█ | 561/5137 [08:45<1:28:04, 1.15s/it]
11%|█ | 562/5137 [08:45<1:10:44, 1.08it/s]
11%|█ | 563/5137 [08:46<57:58, 1.31it/s]
11%|█ | 564/5137 [08:47<1:18:49, 1.03s/it]
11%|█ | 565/5137 [08:48<1:10:47, 1.08it/s]
11%|█ | 566/5137 [08:49<1:04:57, 1.17it/s]
11%|█ | 567/5137 [08:52<2:01:23, 1.59s/it]
11%|█ | 568/5137 [08:57<3:19:23, 2.62s/it]
11%|█ | 569/5137 [08:58<2:33:05, 2.01s/it]
11%|█ | 570/5137 [08:59<2:16:21, 1.79s/it]
11%|█ | 571/5137 [08:59<1:47:14, 1.41s/it]
11%|█ | 572/5137 [09:00<1:25:49, 1.13s/it]
11%|█ | 573/5137 [09:02<1:49:22, 1.44s/it]
11%|█ | 574/5137 [09:02<1:24:55, 1.12s/it]
11%|█ | 575/5137 [09:03<1:07:13, 1.13it/s]
11%|█ | 576/5137 [09:04<1:05:47, 1.16it/s]
11%|█ | 577/5137 [09:04<55:44, 1.36it/s]
11%|█▏ | 578/5137 [09:05<1:01:13, 1.24it/s]
11%|█▏ | 579/5137 [09:06<55:24, 1.37it/s]
11%|█▏ | 580/5137 [09:06<51:21, 1.48it/s]
11%|█▏ | 581/5137 [09:07<56:34, 1.34it/s]
11%|█▏ | 582/5137 [09:07<43:12, 1.76it/s]
11%|█▏ | 583/5137 [09:08<58:25, 1.30it/s]
11%|█▏ | 584/5137 [09:09<53:41, 1.41it/s]
11%|█▏ | 585/5137 [09:09<42:31, 1.78it/s]
11%|█▏ | 586/5137 [09:11<1:07:03, 1.13it/s]
11%|█▏ | 587/5137 [09:11<51:11, 1.48it/s]
11%|█▏ | 588/5137 [09:13<1:18:00, 1.03s/it]
11%|█▏ | 589/5137 [09:13<59:32, 1.27it/s]
11%|█▏ | 590/5137 [09:14<55:35, 1.36it/s]
12%|█▏ | 591/5137 [09:14<47:17, 1.60it/s]
12%|█▏ | 592/5137 [09:14<42:07, 1.80it/s]
12%|█▏ | 593/5137 [09:15<43:22, 1.75it/s]
12%|█▏ | 594/5137 [09:15<38:46, 1.95it/s]
12%|█▏ | 595/5137 [09:21<2:41:52, 2.14s/it]
12%|█▏ | 596/5137 [09:22<2:18:12, 1.83s/it]
12%|█▏ | 597/5137 [09:23<1:59:32, 1.58s/it]
12%|█▏ | 598/5137 [09:24<1:34:47, 1.25s/it]
12%|█▏ | 599/5137 [09:25<1:31:12, 1.21s/it]
12%|█▏ | 600/5137 [09:26<1:29:35, 1.18s/it]
12%|█▏ | 601/5137 [09:28<1:46:30, 1.41s/it]
12%|█▏ | 602/5137 [09:29<1:34:10, 1.25s/it]
12%|█▏ | 603/5137 [09:29<1:10:35, 1.07it/s]
12%|█▏ | 604/5137 [09:31<1:31:31, 1.21s/it]
12%|█▏ | 605/5137 [09:32<1:26:27, 1.14s/it]
12%|█▏ | 606/5137 [09:33<1:15:59, 1.01s/it]
12%|█▏ | 607/5137 [09:33<1:09:59, 1.08it/s]
12%|█▏ | 608/5137 [09:34<57:52, 1.30it/s]
12%|█▏ | 609/5137 [09:35<56:45, 1.33it/s]
12%|█▏ | 610/5137 [09:35<48:22, 1.56it/s]
12%|█▏ | 611/5137 [09:36<53:08, 1.42it/s]
12%|█▏ | 612/5137 [09:36<53:04, 1.42it/s]
12%|█▏ | 613/5137 [09:37<49:36, 1.52it/s]
12%|█▏ | 614/5137 [09:38<52:44, 1.43it/s]
12%|█▏ | 615/5137 [09:43<2:22:40, 1.89s/it]
12%|█▏ | 616/5137 [09:43<1:55:50, 1.54s/it]
12%|█▏ | 617/5137 [09:44<1:37:19, 1.29s/it]
12%|█▏ | 618/5137 [09:44<1:17:24, 1.03s/it]
12%|█▏ | 619/5137 [09:45<1:07:13, 1.12it/s]
12%|█▏ | 620/5137 [09:45<56:42, 1.33it/s]
12%|█▏ | 621/5137 [09:46<54:10, 1.39it/s]
12%|█▏ | 622/5137 [09:47<52:25, 1.44it/s]
12%|█▏ | 623/5137 [09:48<1:11:05, 1.06it/s]
12%|█▏ | 624/5137 [09:50<1:23:12, 1.11s/it]
12%|█▏ | 625/5137 [09:50<1:17:31, 1.03s/it]
12%|█▏ | 626/5137 [09:51<1:12:13, 1.04it/s]
12%|█▏ | 627/5137 [09:53<1:22:05, 1.09s/it]
12%|█▏ | 628/5137 [09:55<1:49:21, 1.46s/it]
12%|█▏ | 629/5137 [09:57<1:50:37, 1.47s/it]
12%|█▏ | 630/5137 [09:57<1:31:14, 1.21s/it]
12%|█▏ | 631/5137 [09:58<1:16:19, 1.02s/it]
12%|█▏ | 632/5137 [09:58<1:02:26, 1.20it/s]
12%|█▏ | 633/5137 [09:59<58:53, 1.27it/s]
12%|█▏ | 634/5137 [09:59<55:58, 1.34it/s]
12%|█▏ | 635/5137 [10:01<1:05:59, 1.14it/s]
12%|█▏ | 636/5137 [10:01<1:01:24, 1.22it/s]
12%|█▏ | 637/5137 [10:02<59:17, 1.26it/s]
12%|█▏ | 638/5137 [10:03<53:26, 1.40it/s]
12%|█▏ | 639/5137 [10:03<57:40, 1.30it/s]
12%|█▏ | 640/5137 [10:04<49:30, 1.51it/s]
12%|█▏ | 641/5137 [10:04<44:21, 1.69it/s]
12%|█▏ | 642/5137 [10:05<42:44, 1.75it/s]
13%|█▎ | 643/5137 [10:05<43:01, 1.74it/s]
13%|█▎ | 644/5137 [10:06<37:44, 1.98it/s]
13%|█▎ | 645/5137 [10:07<57:48, 1.29it/s]
13%|█▎ | 646/5137 [10:08<57:12, 1.31it/s]
13%|█▎ | 647/5137 [10:09<1:00:10, 1.24it/s]
13%|█▎ | 648/5137 [10:10<1:13:17, 1.02it/s]
13%|█▎ | 649/5137 [10:11<1:05:04, 1.15it/s]
13%|█▎ | 650/5137 [10:13<1:31:11, 1.22s/it]
13%|█▎ | 651/5137 [10:14<1:21:38, 1.09s/it]
13%|█▎ | 652/5137 [10:14<1:09:37, 1.07it/s]
13%|█▎ | 653/5137 [10:15<1:07:48, 1.10it/s]
13%|█▎ | 654/5137 [10:16<1:15:15, 1.01s/it]
13%|█▎ | 655/5137 [10:17<1:16:36, 1.03s/it]
13%|█▎ | 656/5137 [10:19<1:26:16, 1.16s/it]
13%|█▎ | 657/5137 [10:19<1:14:38, 1.00it/s]
13%|█▎ | 658/5137 [10:20<1:06:30, 1.12it/s]
13%|█▎ | 659/5137 [10:21<1:14:52, 1.00s/it]
13%|█▎ | 660/5137 [10:23<1:21:53, 1.10s/it]
13%|█▎ | 661/5137 [10:23<1:06:57, 1.11it/s]
13%|█▎ | 662/5137 [10:24<59:16, 1.26it/s]
13%|█▎ | 663/5137 [10:24<53:08, 1.40it/s]
13%|█▎ | 664/5137 [10:25<48:52, 1.53it/s]
13%|█▎ | 665/5137 [10:25<45:25, 1.64it/s]
13%|█▎ | 666/5137 [10:26<43:37, 1.71it/s]
13%|█▎ | 667/5137 [10:26<45:09, 1.65it/s]
13%|█▎ | 668/5137 [10:28<59:35, 1.25it/s]
13%|█▎ | 669/5137 [10:28<57:23, 1.30it/s]
13%|█▎ | 670/5137 [10:29<54:30, 1.37it/s]
13%|█▎ | 671/5137 [10:36<3:17:00, 2.65s/it]
13%|█▎ | 672/5137 [10:37<2:32:53, 2.05s/it]
13%|█▎ | 673/5137 [10:37<1:58:42, 1.60s/it]
13%|█▎ | 674/5137 [10:38<1:41:50, 1.37s/it]
13%|█▎ | 675/5137 [10:39<1:32:24, 1.24s/it]
13%|█▎ | 676/5137 [10:40<1:19:27, 1.07s/it]
13%|█▎ | 677/5137 [10:41<1:22:50, 1.11s/it]
13%|█▎ | 678/5137 [10:43<1:46:22, 1.43s/it]
13%|█▎ | 679/5137 [10:44<1:29:38, 1.21s/it]
13%|█▎ | 680/5137 [10:44<1:17:44, 1.05s/it]
13%|█▎ | 681/5137 [10:46<1:26:21, 1.16s/it]
13%|█▎ | 682/5137 [10:47<1:16:59, 1.04s/it]
13%|█▎ | 683/5137 [10:48<1:22:24, 1.11s/it]
13%|█▎ | 684/5137 [10:49<1:18:09, 1.05s/it]
13%|█▎ | 685/5137 [10:50<1:23:30, 1.13s/it]
13%|█▎ | 686/5137 [10:52<1:39:48, 1.35s/it]
13%|█▎ | 687/5137 [10:53<1:33:38, 1.26s/it]
13%|█▎ | 688/5137 [10:54<1:26:31, 1.17s/it]
13%|█▎ | 689/5137 [10:54<1:11:45, 1.03it/s]
13%|█▎ | 690/5137 [10:55<1:03:47, 1.16it/s]
13%|█▎ | 691/5137 [10:56<58:21, 1.27it/s]
13%|█▎ | 692/5137 [10:57<1:06:41, 1.11it/s]
13%|█▎ | 693/5137 [10:58<1:11:09, 1.04it/s]
14%|█▎ | 694/5137 [10:59<1:17:10, 1.04s/it]
14%|█▎ | 695/5137 [10:59<58:27, 1.27it/s]
14%|█▎ | 696/5137 [11:00<56:34, 1.31it/s]
14%|█▎ | 697/5137 [11:01<57:11, 1.29it/s]
14%|█▎ | 698/5137 [11:02<1:01:01, 1.21it/s]
14%|█▎ | 699/5137 [11:02<50:16, 1.47it/s]
14%|█▎ | 700/5137 [11:03<47:25, 1.56it/s]
14%|█▎ | 701/5137 [11:04<1:04:31, 1.15it/s]
14%|█▎
gitextract_8dq1qumw/
├── .gitignore
├── dataset/
│ ├── __init__.py
│ ├── ch_qa_gen.py
│ ├── data_construct.py
│ ├── dataloader.py
│ ├── glue.py
│ ├── paper_ocr.py
│ ├── pz_loader.py
│ ├── test_gemini.py
│ ├── test_pymu.py
│ └── text_rendering.py
├── eval/
│ ├── all_in_one.sh
│ ├── cogvlm/
│ │ └── inference.py
│ ├── eval_all.py
│ ├── eval_cogagent.py
│ ├── eval_lunwen.sh
│ ├── eval_minicpm.py
│ ├── eval_monkey.py
│ ├── eval_pageqa.py
│ ├── eval_paper_llm.py
│ ├── eval_qasper.py
│ ├── eval_qasper.sh
│ ├── eval_xtreme.py
│ ├── gemini/
│ │ ├── eval_en_ocr.py
│ │ └── eval_qasper.py
│ ├── intern/
│ │ └── inference.py
│ ├── llava/
│ │ ├── acc_chartvqa.py
│ │ ├── eval_chartvqa.py
│ │ ├── eval_en_ocr.py
│ │ ├── eval_mathvqa.py
│ │ ├── eval_qasper.py
│ │ ├── eval_snli.py
│ │ ├── eval_snli_2.py
│ │ ├── inference.py
│ │ ├── run_lunwen.sh
│ │ ├── run_math.sh
│ │ └── run_qasper.sh
│ ├── log/
│ │ ├── llava/
│ │ │ ├── test-185577.err
│ │ │ ├── test-185577.out
│ │ │ ├── test-185579.err
│ │ │ └── test-185579.out
│ │ ├── test-136013.err
│ │ ├── test-136013.out
│ │ ├── test-136014.err
│ │ ├── test-136014.out
│ │ ├── test-136017.err
│ │ ├── test-136017.out
│ │ ├── test-184040.err
│ │ ├── test-184040.out
│ │ ├── test-184041.err
│ │ ├── test-184041.out
│ │ ├── test-184077.err
│ │ ├── test-184077.out
│ │ ├── test-184078.err
│ │ ├── test-184078.out
│ │ ├── test-184087.err
│ │ ├── test-184087.out
│ │ ├── test-184154.err
│ │ ├── test-184154.out
│ │ ├── test-184155.err
│ │ ├── test-184155.out
│ │ ├── test-184219.err
│ │ ├── test-184219.out
│ │ ├── test-184220.err
│ │ ├── test-184220.out
│ │ ├── test-185258.err
│ │ ├── test-185258.out
│ │ ├── test-185259.err
│ │ ├── test-185259.out
│ │ ├── test-185279.err
│ │ ├── test-185279.out
│ │ ├── test-185281.err
│ │ ├── test-185281.out
│ │ ├── test-185283.err
│ │ ├── test-185283.out
│ │ ├── test-185289.err
│ │ ├── test-185289.out
│ │ ├── test-185292.err
│ │ ├── test-185292.out
│ │ ├── test-185300.err
│ │ ├── test-185300.out
│ │ ├── test-185345.err
│ │ ├── test-185345.out
│ │ ├── test-185351.err
│ │ ├── test-185351.out
│ │ ├── test-185410.err
│ │ ├── test-185410.out
│ │ ├── test-185483.err
│ │ ├── test-185483.out
│ │ ├── test-185484.err
│ │ ├── test-185484.out
│ │ ├── test-185573.err
│ │ ├── test-185573.out
│ │ ├── test-185574.err
│ │ └── test-185574.out
│ ├── logits_llava.py
│ ├── make_score.py
│ ├── mark_score.sh
│ ├── monkey/
│ │ └── eval_chart.py
│ ├── run_eval.sh
│ ├── run_eval_monkey.sh
│ ├── run_eval_monkey_ch.sh
│ ├── run_eval_pageqa.sh
│ ├── run_eval_paper.sh
│ ├── run_eval_paper_llm.sh
│ ├── run_eval_paper_llm_zh.sh
│ ├── run_llava_chart.sh
│ └── run_llava_snli.sh
├── src/
│ ├── dtw/
│ │ └── dtw.py
│ └── generate_gemini.py
└── utils/
├── __init__.py
├── calculate.py
├── gemini.py
├── gemini_api.py
├── gpt4o.py
├── render_text.py
└── utils.py
SYMBOL INDEX (157 symbols across 24 files)
FILE: dataset/ch_qa_gen.py
function paddle_pdfs (line 17) | def paddle_pdfs(data_dir, save_path="../data"):
function paddle_png (line 40) | def paddle_png(img_path):
function gemini_gen_qa_text (line 46) | def gemini_gen_qa_text(metadata, prompt, savedir="../data/ch_paper/qa"):
function gemini_gen_qa_vision (line 88) | def gemini_gen_qa_vision(metadata, prompt, savedir):
function gemini_ocr (line 118) | def gemini_ocr(args):
FILE: dataset/data_construct.py
function hl_section (line 24) | def hl_section(hl_evidence, full_text):
FILE: dataset/dataloader.py
class MathLoader (line 6) | class MathLoader:
method __init__ (line 7) | def __init__(self, img_path, data_file):
method __len__ (line 12) | def __len__(self):
method __getitem__ (line 15) | def __getitem__(self, idx):
class PaperLoader (line 31) | class PaperLoader:
method __init__ (line 32) | def __init__(self, data_file):
method __len__ (line 39) | def __len__(self):
method __getitem__ (line 42) | def __getitem__(self, idx):
class PaperTextLoader (line 50) | class PaperTextLoader:
method __init__ (line 51) | def __init__(self, data_file, metadata_file):
method __len__ (line 60) | def __len__(self):
method __getitem__ (line 63) | def __getitem__(self, idx):
class OcrvqaLoader (line 72) | class OcrvqaLoader:
method __init__ (line 73) | def __init__(self, data_file):
method __len__ (line 85) | def __len__(self):
method __getitem__ (line 88) | def __getitem__(self, idx):
class TextvqaLoader (line 96) | class TextvqaLoader:
method __init__ (line 97) | def __init__(self, data_file):
method __len__ (line 108) | def __len__(self):
method __getitem__ (line 111) | def __getitem__(self, idx):
class ChartvqaLoader (line 120) | class ChartvqaLoader:
method __init__ (line 121) | def __init__(self, data_file):
method __len__ (line 129) | def __len__(self):
method __getitem__ (line 132) | def __getitem__(self, idx):
class DocvqaLoader (line 139) | class DocvqaLoader:
method __init__ (line 140) | def __init__(self, data_file):
method __len__ (line 148) | def __len__(self):
method __getitem__ (line 151) | def __getitem__(self, idx):
class LunwenLoader (line 159) | class LunwenLoader:
method __init__ (line 160) | def __init__(self, data_file):
method __len__ (line 166) | def __len__(self):
method __getitem__ (line 169) | def __getitem__(self, idx):
class LunwenTextLoader (line 180) | class LunwenTextLoader:
method __init__ (line 181) | def __init__(self, data_file, metadata_file):
method __len__ (line 191) | def __len__(self):
method __getitem__ (line 194) | def __getitem__(self, idx):
FILE: dataset/paper_ocr.py
function get_index_by_category (line 5) | def get_index_by_category(category="cs"):
FILE: dataset/pz_loader.py
class MathLoader (line 3) | class MathLoader:
method __init__ (line 4) | def __init__(self, data_file, img_path):
method __len__ (line 9) | def __len__(self):
method __getitem__ (line 12) | def __getitem__(self, idx):
FILE: eval/eval_all.py
function load_image (line 18) | def load_image(image_file):
function load_images (line 27) | def load_images(image_files):
function add_noise (line 34) | def add_noise(img, mode="gauss"):
function instructblip_inference (line 70) | def instructblip_inference(model, img, qs, lang, processor, ocr_tokens=N...
function llava_inference (line 94) | def llava_inference(model, img, qs, lang, tokenizer, conv, gt=None, mi=F...
function monkey_inference (line 199) | def monkey_inference(model, img, query, lang, tokenizer, gt=None, mi=Fal...
function gpt_inference (line 269) | def gpt_inference(img_path, question, lang):
function gemini_inference (line 294) | def gemini_inference(model, img_path, question):
function minicpm_inference (line 304) | def minicpm_inference(chat_model, img_path, question, lang):
function cogvlm_inference (line 317) | def cogvlm_inference(model, tokenizer, img_path, question):
function mplug_inference (line 337) | def mplug_inference(model, img_path, question, tokenizer,image_processor):
function qwenvl_inference (line 379) | def qwenvl_inference(model, img_path, question, lang, tokenizer):
function encode_image (line 485) | def encode_image(image_path):
FILE: eval/eval_pageqa.py
function normalize_answer (line 9) | def normalize_answer(s):
function single_f1_zh (line 31) | def single_f1_zh(gold, answer):
function single_f1_en (line 49) | def single_f1_en(prediction, ground_truth):
function score_yes_no (line 64) | def score_yes_no(prediction):
FILE: eval/eval_qasper.py
function normalize_answer (line 12) | def normalize_answer(s):
function token_f1_score (line 34) | def token_f1_score(prediction, ground_truth):
function paragraph_f1_score (line 51) | def paragraph_f1_score(prediction, ground_truth):
function get_answers_and_evidence (line 64) | def get_answers_and_evidence(data, text_evidence_only):
function evaluate (line 99) | def evaluate(gold, predicted):
FILE: eval/eval_xtreme.py
function get_tokens (line 22) | def get_tokens(s):
function normalize_answer (line 26) | def normalize_answer(s):
function compute_f1 (line 45) | def compute_f1(a_gold, a_pred):
function evaluate (line 60) | def evaluate(args):
function evaluate_vis (line 84) | def evaluate_vis(args):
function evaluate_gpt (line 114) | def evaluate_gpt(args):
function calc_f1 (line 144) | def calc_f1(file_path):
FILE: eval/gemini/eval_qasper.py
function gemini_gen_qa_vision (line 17) | def gemini_gen_qa_vision(metadata, prompt, savedir):
FILE: eval/intern/inference.py
function build_transform (line 18) | def build_transform(input_size):
function find_closest_aspect_ratio (line 28) | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height...
function dynamic_preprocess (line 43) | def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use...
function load_image (line 81) | def load_image(image_file, input_size=448, max_num=12):
FILE: eval/llava/acc_chartvqa.py
function get_args (line 9) | def get_args():
function prompt_processor (line 17) | def prompt_processor(prompt):
function eval_single (line 35) | def eval_single(annotation_file, result_file):
FILE: eval/llava/eval_chartvqa.py
function split_list (line 23) | def split_list(lst, n):
function get_chunk (line 29) | def get_chunk(lst, n, k):
class CustomDataset (line 35) | class CustomDataset(Dataset):
method __init__ (line 36) | def __init__(self, questions, image_folder, tables_folder, tokenizer, ...
method __getitem__ (line 44) | def __getitem__(self, index):
method __len__ (line 85) | def __len__(self):
class SimpleDataset (line 88) | class SimpleDataset(Dataset):
method __init__ (line 89) | def __init__(self, questions, image_folder):
method __getitem__ (line 93) | def __getitem__(self, item):
method __len__ (line 100) | def __len__(self):
function create_data_loader (line 105) | def create_data_loader(questions, image_folder, table_folder, tokenizer,...
function eval_model (line 112) | def eval_model(args):
FILE: eval/llava/eval_mathvqa.py
function split_list (line 23) | def split_list(lst, n):
function get_chunk (line 29) | def get_chunk(lst, n, k):
class CustomDataset (line 35) | class CustomDataset(Dataset):
method __init__ (line 36) | def __init__(self, questions, image_folder, tokenizer, image_processor...
method __getitem__ (line 43) | def __getitem__(self, index):
method __len__ (line 69) | def __len__(self):
class SimpleDataset (line 72) | class SimpleDataset(Dataset):
method __init__ (line 73) | def __init__(self, questions, image_folder):
method __getitem__ (line 77) | def __getitem__(self, item):
method __len__ (line 84) | def __len__(self):
function create_data_loader (line 89) | def create_data_loader(questions, image_folder, tokenizer, image_process...
function eval_model (line 96) | def eval_model(args):
FILE: eval/llava/eval_snli.py
class CustomDataset (line 30) | class CustomDataset(Dataset):
method __init__ (line 31) | def __init__(self, datafile, image_folder, tokenizer, image_processor,...
method __getitem__ (line 39) | def __getitem__(self, index):
method __len__ (line 68) | def __len__(self):
function create_data_loader (line 71) | def create_data_loader(data_file, image_folder, tokenizer, image_process...
function eval_model (line 77) | def eval_model(args):
FILE: eval/llava/eval_snli_2.py
class CustomDataset (line 26) | class CustomDataset(Dataset):
method __init__ (line 27) | def __init__(self, datafile, image_folder, tokenizer, image_processor,...
method __getitem__ (line 35) | def __getitem__(self, index):
method __len__ (line 64) | def __len__(self):
function create_data_loader (line 66) | def create_data_loader(data_file, image_folder, tokenizer, image_process...
FILE: eval/make_score.py
function prompt_processor (line 14) | def prompt_processor(prompt):
function evaluate_exact_match_accuracy (line 31) | def evaluate_exact_match_accuracy(entries):
function relaxed_correctness (line 44) | def relaxed_correctness(target: str,
function evaluate_relaxed_accuracy (line 86) | def evaluate_relaxed_accuracy(entries):
function eval_single_chart (line 98) | def eval_single_chart(annotation_file, result_file):
function relaxed_correctness (line 117) | def relaxed_correctness(target: str,
function evaluate_relaxed_accuracy (line 159) | def evaluate_relaxed_accuracy(entries):
function levenshtein_distance (line 171) | def levenshtein_distance(s1, s2):
FILE: src/generate_gemini.py
function generate_docvqa (line 17) | def generate_docvqa(model: genai.GenerativeModel, image: Image.Image, QA...
function generate_chartqa (line 53) | def generate_chartqa(model: genai.GenerativeModel, image: Image.Image, Q...
function generate_infovqa (line 92) | def generate_infovqa(model: genai.GenerativeModel, image: Image.Image, Q...
function get_structured_extension (line 148) | def get_structured_extension(text):
function generate_conversation (line 181) | def generate_conversation(model: genai.GenerativeModel, image: Image.Ima...
function get_structured_conversation (line 218) | def get_structured_conversation(text):
function generate_and_save (line 236) | def generate_and_save(model, dataset, mode, annotation):
FILE: utils/calculate.py
function compute_entropy (line 7) | def compute_entropy(logits):
function compute_entropy_v2 (line 26) | def compute_entropy_v2(logits):
function compute_entropy_v3 (line 68) | def compute_entropy_v3(logits):
function compute_mi (line 80) | def compute_mi(logits, logits_wo):
function compute_pmi (line 110) | def compute_pmi(logits, logits_wo, target_ids):
FILE: utils/gemini.py
function verify_response (line 66) | def verify_response(response):
class Gemini_Model (line 77) | class Gemini_Model():
method __init__ (line 78) | def __init__(self, key, vision=True, patience=1, sleep_time=1):
method get_response_text (line 93) | def get_response_text(self, prompt):
method get_response_vision (line 96) | def get_response_vision(self, image_path, input_text):
FILE: utils/gemini_api.py
function verify_response (line 36) | def verify_response(response):
class Gemini_Model (line 47) | class Gemini_Model():
method __init__ (line 48) | def __init__(self, key, vision=True, patience=1, sleep_time=1):
method get_response_text (line 63) | def get_response_text(self, prompt):
method get_response_vision (line 67) | def get_response_vision(self, image_path, input_text):
FILE: utils/gpt4o.py
function encode_image (line 9) | def encode_image(image_path):
FILE: utils/render_text.py
function load_jsonl (line 17) | def load_jsonl(file_path):
function render_header (line 25) | def render_header(image: Image.Image, header: str) -> Image.Image:
function render_text (line 45) | def render_text(text: str,
FILE: utils/utils.py
function calc_rouge (line 7) | def calc_rouge(hypotheses, references):
function new_ip (line 40) | def new_ip(api):
function compute_mi (line 62) | def compute_mi(p, q):
function compute_conditional_mi (line 90) | def compute_conditional_mi(x, q, p_a_given_x_q):
Condensed preview — 117 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (7,430K chars).
[
{
"path": ".gitignore",
"chars": 101,
"preview": "results/*\nresult/*\ncheckpoints/*\nclose_result/*\ndata/*\nlog/*\nmodels/*\npoe-api-wrapper/*\nvisualize/*\n\n"
},
{
"path": "dataset/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "dataset/ch_qa_gen.py",
"chars": 6743,
"preview": "import pdb\nimport sys\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nimport fitz\nimport os\nimport argparse\nimport uuid\n#"
},
{
"path": "dataset/data_construct.py",
"chars": 4980,
"preview": "\n\nimport pdb\n\nimport fitz\nimport json\nimport os\n### READ IN PDF\nfrom collections import Counter\nLAST_INDEX = 18\nwith ope"
},
{
"path": "dataset/dataloader.py",
"chars": 6815,
"preview": "import json\n\nimport glob\nimport os\n\nclass MathLoader:\n def __init__(self, img_path, data_file):\n with open(dat"
},
{
"path": "dataset/glue.py",
"chars": 201,
"preview": "from dataset import load_dataset\n\ndataset = load_dataset(\"nyu-mll/glue\", \"ax\")\n\nsub_cat = [\"ax\", \"cola\", \"mnli\", \"mnli_m"
},
{
"path": "dataset/paper_ocr.py",
"chars": 1088,
"preview": "import json\nimport os, pdb\nimport random\n\ndef get_index_by_category(category=\"cs\"):\n index_list = []\n with open(\"."
},
{
"path": "dataset/pz_loader.py",
"chars": 511,
"preview": "import json, os\n\nclass MathLoader:\n def __init__(self, data_file, img_path):\n with open(data_file, \"r\") as f:\n"
},
{
"path": "dataset/test_gemini.py",
"chars": 324,
"preview": "import google.generativeai as genai\nimport os\n# API_KEY = \"AIzaSyB2rGDZzkVKxgkV8y_uJf4LvK9E9WKfWoE\"\nAPI_KEY = \"AIzaSyCYo"
},
{
"path": "dataset/test_pymu.py",
"chars": 267,
"preview": "import fitz\noutput_path = \"../data/pageqa/pdfs/1912.01214.pdf\"\ndoc = fitz.open(output_path)\n\nfor i, page in enumerate(do"
},
{
"path": "dataset/text_rendering.py",
"chars": 0,
"preview": ""
},
{
"path": "eval/all_in_one.sh",
"chars": 3253,
"preview": "#!/bin/bash\n#SBATCH -J $1-$2 # 作业名为 test\n#SBATCH -o ./log/eval/test-%j.out "
},
{
"path": "eval/cogvlm/inference.py",
"chars": 3871,
"preview": "import torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nimport sys, json\nfrom tq"
},
{
"path": "eval/eval_all.py",
"chars": 37034,
"preview": "import json\nimport pdb\nimport sys\nsys.path.append(\".\")\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nsys.path.append(\"."
},
{
"path": "eval/eval_cogagent.py",
"chars": 3464,
"preview": "import torch\nimport sys\nsys.path.append(\".\")\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nfrom PIL import Image\nfrom t"
},
{
"path": "eval/eval_lunwen.sh",
"chars": 1720,
"preview": "export MODEL=qwen\nexport NAME=llava-v1.6-34b\npython ./eval/eval_pageqa.py --predictions result/lunwen/en/llava-v1.5-13b-"
},
{
"path": "eval/eval_minicpm.py",
"chars": 487,
"preview": "from transformers import AutoModelForCausalLM, AutoTokenizer\nfrom dataset.dataloader import DataLoader\nimport torch\ntorc"
},
{
"path": "eval/eval_monkey.py",
"chars": 4319,
"preview": "import json\nimport sys\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nfrom transformers import AutoModelForCausalLM, Aut"
},
{
"path": "eval/eval_pageqa.py",
"chars": 5005,
"preview": "import argparse\nimport pdb\n\nimport jieba\nimport json\nfrom collections import Counter\nimport string, re\n\ndef normalize_an"
},
{
"path": "eval/eval_paper_llm.py",
"chars": 3068,
"preview": "import sys, os, json\nsys.path.append(\".\")\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nsys.path.append(\"../../..\")\n\nim"
},
{
"path": "eval/eval_qasper.py",
"chars": 7377,
"preview": "\"\"\"\nOfficial script for evaluating models built for the Qasper dataset. The script\noutputs Answer F1 and Evidence F1 rep"
},
{
"path": "eval/eval_qasper.sh",
"chars": 622,
"preview": "export NAME=$1\npython ./eval/eval_qasper.py --predictions close_result/paper/en/gemini-1.5-flash.jsonl \\\n --gold ./data"
},
{
"path": "eval/eval_xtreme.py",
"chars": 6920,
"preview": "import os.path\nimport pdb\nimport sys\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\n\nfrom dataset import load_dataset\nim"
},
{
"path": "eval/gemini/eval_en_ocr.py",
"chars": 0,
"preview": ""
},
{
"path": "eval/gemini/eval_qasper.py",
"chars": 1433,
"preview": "import pdb\nimport sys\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nimport fitz\nimport os\nimport argparse\nimport uuid\n#"
},
{
"path": "eval/intern/inference.py",
"chars": 5016,
"preview": "import numpy as np\nimport torch, json\nimport torchvision.transforms as T\nfrom decord import VideoReader, cpu\nfrom PIL im"
},
{
"path": "eval/llava/acc_chartvqa.py",
"chars": 2238,
"preview": "import os\nimport argparse\nimport json\nimport re\n\nfrom llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator\n\n\ndef get"
},
{
"path": "eval/llava/eval_chartvqa.py",
"chars": 7812,
"preview": "import argparse\nimport pdb\n\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.consta"
},
{
"path": "eval/llava/eval_en_ocr.py",
"chars": 3723,
"preview": "import json\nimport pdb\nimport sys\nsys.path.append(\".\")\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nsys.path.append(\"."
},
{
"path": "eval/llava/eval_mathvqa.py",
"chars": 6849,
"preview": "import argparse\nimport pdb\n\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.consta"
},
{
"path": "eval/llava/eval_qasper.py",
"chars": 6973,
"preview": "import json\nimport pdb\nimport sys\nsys.path.append(\".\")\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nsys.path.append(\"."
},
{
"path": "eval/llava/eval_snli.py",
"chars": 8011,
"preview": "import argparse\nimport pdb\n\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\nimport cv2\nimport "
},
{
"path": "eval/llava/eval_snli_2.py",
"chars": 6206,
"preview": "import json\nimport pdb\nimport sys\nsys.path.append(\".\")\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nsys.path.append(\"."
},
{
"path": "eval/llava/inference.py",
"chars": 763,
"preview": "from PIL import Image\nimport requests\nfrom transformers import AutoProcessor, LlavaForConditionalGeneration\n\nmodel = Lla"
},
{
"path": "eval/llava/run_lunwen.sh",
"chars": 1181,
"preview": "#!/bin/bash\n#SBATCH -J test # 作业名为 test\n#SBATCH -o ./log/llava/test-%j.out "
},
{
"path": "eval/llava/run_math.sh",
"chars": 251,
"preview": "python ./eval/llava/eval_mathvqa.py --conv-mode llava_v1 \\\n --model-path ./checkpoints/llava-v1.6-34b \\\n --question-fi"
},
{
"path": "eval/llava/run_qasper.sh",
"chars": 849,
"preview": "#!/bin/bash\n#SBATCH -J test # 作业名为 test\n#SBATCH -o ./log/llava/test-%j.out "
},
{
"path": "eval/log/llava/test-185577.err",
"chars": 2314,
"preview": "Traceback (most recent call last):\n File \"/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py\", line 38, in <module>\n da"
},
{
"path": "eval/log/llava/test-185577.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/llava/test-185579.err",
"chars": 2314,
"preview": "Traceback (most recent call last):\n File \"/home/xmyu/mmm-eval/eval/./llava/eval_qasper.py\", line 38, in <module>\n da"
},
{
"path": "eval/log/llava/test-185579.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-136013.err",
"chars": 213,
"preview": "usage: eval_xtreme.py [-h] [--dataset DATASET] [--savedir SAVEDIR]\n [--checkpoint CHECKPOINT] [--ev"
},
{
"path": "eval/log/test-136013.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-136014.err",
"chars": 1614,
"preview": "/home/xmyu/anaconda3/envs/eval/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load imag"
},
{
"path": "eval/log/test-136014.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-136017.err",
"chars": 272761,
"preview": "The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, plea"
},
{
"path": "eval/log/test-136017.out",
"chars": 2848313,
"preview": "{'id': ['465f3fb044b5c50a78a2e2f9bc94c424d1f7d039'], 'title': ['電動勢'], 'context': ['在电路学里,电动势(英语:electromotive force,缩写为"
},
{
"path": "eval/log/test-184040.err",
"chars": 2528,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-184040.out",
"chars": 119,
"preview": "Question: Did the authors try stacking multiple convolutional layers? Answer: Yes\nQuestion: 作者是否尝试堆叠多个卷积层? Answer: Yes\n"
},
{
"path": "eval/log/test-184041.err",
"chars": 4852,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-184041.out",
"chars": 1091,
"preview": "Question: Did the authors try stacking multiple convolutional layers? Answer: 't know\nQuestion: 作者是否尝试堆叠多个卷积层? Answer: 未"
},
{
"path": "eval/log/test-184077.err",
"chars": 1264,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-184077.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-184078.err",
"chars": 9792,
"preview": "Try importing flash-attention for faster inference...\nWarning: import flash_attn rms_norm fail, please install FlashAtte"
},
{
"path": "eval/log/test-184078.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-184087.err",
"chars": 540,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-184087.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-184154.err",
"chars": 4852,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-184154.out",
"chars": 1091,
"preview": "Question: Did the authors try stacking multiple convolutional layers? Answer: 't know\nQuestion: 作者是否尝试堆叠多个卷积层? Answer: 未"
},
{
"path": "eval/log/test-184155.err",
"chars": 2528,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-184155.out",
"chars": 119,
"preview": "Question: Did the authors try stacking multiple convolutional layers? Answer: Yes\nQuestion: 作者是否尝试堆叠多个卷积层? Answer: Yes\n"
},
{
"path": "eval/log/test-184219.err",
"chars": 2426,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-184219.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-184220.err",
"chars": 1264,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-184220.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185258.err",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185258.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185259.err",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185259.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185279.err",
"chars": 4852,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185279.out",
"chars": 276,
"preview": "Question: What kind of issues (that are not on the forefront of computational text analysis) do they tackle? Answer: 't "
},
{
"path": "eval/log/test-185281.err",
"chars": 552,
"preview": "Traceback (most recent call last):\n File \"/home/xmyu/mmm-eval/eval/./eval_monkey.py\", line 7, in <module>\n from data"
},
{
"path": "eval/log/test-185281.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185283.err",
"chars": 4852,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185283.out",
"chars": 354,
"preview": "Question: What kind of issues (that are not on the forefront of computational text analysis) do they tackle? Answer: 't "
},
{
"path": "eval/log/test-185289.err",
"chars": 1264,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-185289.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185292.err",
"chars": 2528,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-185292.out",
"chars": 324,
"preview": "Question: What kind of issues (that are not on the forefront of computational text analysis) do they tackle? Answer: The"
},
{
"path": "eval/log/test-185300.err",
"chars": 1264,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-185300.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185345.err",
"chars": 2137,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185345.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185351.err",
"chars": 4852,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185351.out",
"chars": 230974,
"preview": "Question: which multilingual approaches do they compare with? Answer: 't be able to answer this question without more co"
},
{
"path": "eval/log/test-185410.err",
"chars": 4852,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185410.out",
"chars": 230974,
"preview": "Question: which multilingual approaches do they compare with? Answer: 't be able to answer this question without more co"
},
{
"path": "eval/log/test-185483.err",
"chars": 626,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185483.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185484.err",
"chars": 4102,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185484.out",
"chars": 306529,
"preview": "Question: which multilingual approaches do they compare with? Answer: 406 word-level multilingual models and 1140 senten"
},
{
"path": "eval/log/test-185573.err",
"chars": 2051,
"preview": "/home/xmyu/anaconda3/envs/qwen/lib/python3.9/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._"
},
{
"path": "eval/log/test-185573.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/log/test-185574.err",
"chars": 1264,
"preview": "/home/xmyu/anaconda3/envs/monkey/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:367: UserWar"
},
{
"path": "eval/log/test-185574.out",
"chars": 0,
"preview": ""
},
{
"path": "eval/logits_llava.py",
"chars": 787,
"preview": "from PIL import Image\nimport requests, pdb\nfrom transformers import AutoProcessor, LlavaForConditionalGeneration\n\nmodel "
},
{
"path": "eval/make_score.py",
"chars": 8941,
"preview": "\nimport os\nimport argparse\nimport json\nimport pdb\nimport re\n\nfrom llava.eval.m4c_evaluator import TextVQAAccuracyEvaluat"
},
{
"path": "eval/mark_score.sh",
"chars": 2897,
"preview": "export NAME=instructblip\nexport MODE=$1\n\nif [ $MODE = \"ocr\" ]\nthen\n python ./eval/make_score.py --result-file ./close_r"
},
{
"path": "eval/monkey/eval_chart.py",
"chars": 6179,
"preview": "import json\nimport sys\nsys.path.append(\"..\")\nsys.path.append(\"../..\")\nsys.path.append(\"../../..\")\nsys.path.append(\"../.."
},
{
"path": "eval/run_eval.sh",
"chars": 601,
"preview": "#!/bin/bash\n#SBATCH -J test # 作业名为 test\n#SBATCH -o ./log/test-%j.out "
},
{
"path": "eval/run_eval_monkey.sh",
"chars": 1270,
"preview": "#!/bin/bash\n#SBATCH -J test # 作业名为 test\n#SBATCH -o ./log/test-%j.out "
},
{
"path": "eval/run_eval_monkey_ch.sh",
"chars": 1729,
"preview": "#!/bin/bash\n#SBATCH -J test # 作业名为 test\n#SBATCH -o ./log/test-%j.out "
},
{
"path": "eval/run_eval_pageqa.sh",
"chars": 434,
"preview": "export MODE=$1\npython eval_pageqa.py \\\n --predictions ../result/minicpm/ch_paper/origin/en-zh-$MODE.jsonl \\\n --lang zh\n\n"
},
{
"path": "eval/run_eval_paper.sh",
"chars": 811,
"preview": "#!/bin/bash\n#SBATCH -J test # 作业名为 test\n#SBATCH -o ./log/crhtest-%j.out "
},
{
"path": "eval/run_eval_paper_llm.sh",
"chars": 781,
"preview": "export MODEL=$1\nif [ $MODEL = \"llama\" ]\nthen\n for lang in \"zh\" \"en\"\n do\n python ./eval/eval_paper_llm.py --model_pa"
},
{
"path": "eval/run_eval_paper_llm_zh.sh",
"chars": 1710,
"preview": "export MODEL=$1\nif [ $MODEL = \"llama\" ]\nthen\n for lang in \"zh\" \"en\"\n do\n for mode in \"extractive\" \"abstractive\"\n "
},
{
"path": "eval/run_llava_chart.sh",
"chars": 1124,
"preview": "#!/bin/bash\n#SBATCH -J train_story-dalle\n#SBATCH -o ./log/chart/h5-%j.out\n#SBATCH -e ./log/chart/h5-%j.err\n#SBATCH -p co"
},
{
"path": "eval/run_llava_snli.sh",
"chars": 664,
"preview": "#!/bin/bash\n#SBATCH -J train_story-dalle\n#SBATCH -o ./log/chart/h5-%j.out\n#SBATCH -e ./log/chart/h5-%j.err\n#SBATCH -p co"
},
{
"path": "src/dtw/dtw.py",
"chars": 279,
"preview": "from fastdtw import fastdtw\nfrom scipy.spatial.distance import euclidean\nimport numpy as np\n\nx = np.array([1, 3, 4, 4])\n"
},
{
"path": "src/generate_gemini.py",
"chars": 10075,
"preview": "import os\nimport re\nimport threading\nimport time\nfrom typing import List, Tuple\nfrom PIL import Image\n\nimport google.gen"
},
{
"path": "utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils/calculate.py",
"chars": 4621,
"preview": "from scipy.stats import entropy\n\nimport numpy as np\nimport torch.nn.functional as F\nfrom scipy.stats import entropy\nimpo"
},
{
"path": "utils/gemini.py",
"chars": 6063,
"preview": "import os\nimport pdb\nimport time\n\nimport google.generativeai as genai\nfrom pathlib import Path\n\ngeneration_config = {\n "
},
{
"path": "utils/gemini_api.py",
"chars": 3143,
"preview": "import os\nimport pdb\nimport time\n\nimport google.generativeai as genai\nfrom pathlib import Path\n\ngeneration_config = {\n "
},
{
"path": "utils/gpt4o.py",
"chars": 736,
"preview": "import base64\n# from openai import OpenAI\nimport openai\nopenai.api_base = \"https://openkey.cloud/v1\"\nopenai.api_key = \"s"
},
{
"path": "utils/render_text.py",
"chars": 3002,
"preview": "\nimport hashlib\nimport io\nimport os\nimport pdb\nimport random\nimport textwrap\nfrom typing import Any, Callable, Iterable,"
},
{
"path": "utils/utils.py",
"chars": 2974,
"preview": "import requests\nimport re, logging\n#import urlib\nfrom rouge import Rouge\nimport numpy as np\nfrom sklearn.neighbors impor"
}
]
About this extraction
This page contains the full source code of the Stardust-y/XTVQA GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 117 files (4.0 MB), approximately 1.0M tokens, and a symbol index with 157 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.