Full Code of wuomzfx/pdfGPT for AI

main 09a53a0a2e3b cached

17 files

25.1 KB

7.7k tokens

50 symbols

1 requests

Download .txt

Repository: wuomzfx/pdfGPT
Branch: main
Commit: 09a53a0a2e3b
Files: 17
Total size: 25.1 KB

Directory structure:
gitextract_vsirs4ek/

├── .gitignore
├── cache/
│   └── index.js
├── config.js
├── index.js
├── package.json
├── readme.md
├── scripts/
│   ├── ask.js
│   └── load.js
├── userdict.utf8
└── utils/
    ├── ai.js
    ├── ask.js
    ├── content.js
    ├── embedding.js
    ├── fs.js
    ├── openai.js
    ├── pdf.js
    └── tree.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
node_modules/
.DS_Store
.vscode/
cache/files/
pdfs/
knowledgeFiles/
answerFiles/

================================================
FILE: cache/index.js
================================================
const { existsSync, mkdirSync, readFileSync, writeFileSync } = require('fs');
const { dirname, join } = require('path');

/** Resolve the JSON file backing the cache called `name` (inside ./files next to this module). */
const getPath = name => {
  const relativeFile = `./files/${name}.json`;
  return join(__dirname, relativeFile);
};

/**
 * Load a cache file as a parsed JSON value.
 *
 * @param {string} path - Absolute path of the cache file.
 * @returns {*} The parsed content, or an empty object when the file is
 *   missing or cannot be parsed.
 */
const getJson = path => {
  // A missing file simply means nothing has been cached yet.
  if (!existsSync(path)) {
    return {};
  }

  const raw = readFileSync(path).toString();
  try {
    return JSON.parse(raw);
  } catch {
    // Best-effort cache: unreadable content is treated as empty.
    return {};
  }
};

/**
 * Read one entry from the cache file called `name`.
 *
 * @param {string} name - Cache file name (without extension).
 * @param {string} key - Entry key.
 * @returns {*} The stored value, or undefined when absent.
 */
function get(name, key) {
  const entries = getJson(getPath(name));
  return entries[key];
}

/**
 * Store one entry in the cache file called `name`, rewriting the whole file.
 *
 * @param {string} name - Cache file name (without extension).
 * @param {string} key - Entry key.
 * @param {*} value - JSON-serialisable value to store.
 */
function set(name, key, value) {
  const path = getPath(name);
  // cache/files/ is git-ignored, so it does not exist on a fresh checkout;
  // create it on demand instead of failing the first write with ENOENT.
  mkdirSync(dirname(path), { recursive: true });
  const json = getJson(path);
  json[key] = value;
  writeFileSync(path, JSON.stringify(json));
}

// Public API of the file-backed cache: read and write single entries.
module.exports = { get, set };


================================================
FILE: config.js
================================================
// Runtime configuration read by scripts/load.js, scripts/ask.js and utils/openai.js.
module.exports = {
  // OpenAI API key handed to the OpenAI client configuration.
  apiKey: 'your api key',
  // Base name of the PDF to process, without the .pdf extension.
  pdfName: 'your pdf name,不需要 .pdf 结尾',
  // pdfName: 'hyb',
  // pdfName: 'e享护',
  // pdfName: '达尔文',
  // pdfName: '微保终身重疾',
  // Sample questions for medical-insurance policies (scripts/ask.js asks each active entry in order).
  questions: [
    // '投保年龄限制',
    // '能无条件续保20年吗',
    // '每年的保费会变化吗',
    // '都能报销什么费用',
    // '门诊看病能报销吗',
    // '什么情况下不能理赔报销',
    // '要去哪些医院才能理赔报销',
    // '什么是等待期',
    // '什么是犹豫期',
    // '医保已经报销了，还能继续报销吗',
    // '最多能报销多少钱',
    '如果投保年龄填写错误，理赔时会怎么样',
  ],
  // Sample questions for critical-illness policies (uncomment to use instead).
  // questions: [
  //   '投保年龄限制',
  //   '每年的保费会变化吗',
  //   '得什么病能获得赔偿',
  //   '能赔的重大疾病有哪些',
  //   '最多能理赔多少钱',
  //   '什么情况下不能理赔',
  //   '人死了保费能退回吗',
  //   '要去哪些医院才能理赔',
  //   '我总共要投多少钱',
  //   '退保能退钱吗',
  // ],
};


================================================
FILE: index.js
================================================
const { getPdfName } = require('./utils/fs');
const { buildDocTreeFromPdf } = require('./utils/pdf');
const { buildKnowledgeFromDocTree } = require('./utils/tree');
const { buildKnowledgeEmbeddings } = require('./utils/embedding');
const ask = require('./utils/ask');

/**
 * Ingest a PDF: parse it into a section tree, turn the tree into knowledge
 * paragraphs and compute an embedding for every paragraph.
 *
 * @param {string} pdfPath - Path of the PDF file to load.
 * @returns {Promise<void>}
 */
async function loadingPdf(pdfPath) {
  const name = getPdfName(pdfPath);

  // Step 1: section tree extracted from the PDF text.
  const sectionTree = await buildDocTreeFromPdf(pdfPath);

  // Step 2: knowledge paragraphs derived from the tree.
  const paragraphs = await buildKnowledgeFromDocTree(sectionTree, name);

  // Step 3: embeddings for the knowledge paragraphs.
  await buildKnowledgeEmbeddings(paragraphs, name);
}

/**
 * Ask one question about a previously loaded PDF, logging progress and the
 * final answer to the console.
 *
 * @param {string} question - The question text.
 * @param {string} pdfName - Name of the loaded PDF (without extension).
 * @returns {Promise<string>} The answer text.
 */
async function askQuestion(question, pdfName) {
  const waitingNotice = `AI 正在努力回答您的问题『${question}』，请稍作等待...\n`;
  console.log(waitingNotice);

  const answer = await ask(question, pdfName);

  const report = `您的问题『${question}』回答如下：\n==========\n${answer}\n==========\n`;
  console.log(report);
  return answer;
}

// Entry points used by scripts/load.js and scripts/ask.js.
module.exports = {
  loadingPdf,
  askQuestion,
};


================================================
FILE: package.json
================================================
{
  "name": "pdf-gpt",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "load": "node ./scripts/load",
    "ask": "node ./scripts/ask",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@stdlib/blas": "^0.0.12",
    "gpt-3-encoder": "^1.1.4",
    "nodejieba": "^2.6.0",
    "openai": "^3.1.0",
    "pdfjs-dist": "^3.3.122"
  },
  "repository": "https://github.com/wuomzfx/pdfGPT.git"
}

================================================
FILE: readme.md
================================================
## 如何使用
1. 执行 `npm install` 或 `tnpm install`;
2. 下载一个保险条款 PDF，放在 `pdfs/` 这个目录下;
3. 在 `config.js` 中，配置你的 `apiKey` 以及你的 PDF 文档名;
4. 针对你的 PDF 文档，修改 `config.js` 中问题 `questions`;
5. 先执行 `npm run load`，如果异常报错了，可以继续重试;
6. 再执行 `npm run ask`;
7. 最终可以在 answerFiles 文件目录下看到答案记录

================================================
FILE: scripts/ask.js
================================================
const config = require('../config');
const { askQuestion } = require('../index');

const { pdfName, questions } = config;

// Ask the configured questions one after another. The promise is terminated
// with an explicit handler so a failed request is reported and reflected in
// the exit code instead of being left as a floating rejection.
(async () => {
  for (const question of questions) {
    await askQuestion(question, pdfName);
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});


================================================
FILE: scripts/load.js
================================================
const config = require('../config');
const { loadingPdf } = require('../index');
const { getPdfPath } = require('../utils/fs');

const { pdfName } = config;

const pdfPath = getPdfPath(pdfName);

// Report a failed load explicitly (and via the exit code) rather than leaving
// the promise floating; the load can simply be re-run afterwards.
loadingPdf(pdfPath).catch(error => {
  console.error(error);
  process.exitCode = 1;
});


================================================
FILE: userdict.utf8
================================================
我们
重度
轻度
轻症
中症
可选责任
身故或全残
身故或全残保险金
疾病关爱保险金
恶性肿瘤
被保险人
本合同
疾病扩展保险金
基本保险金额
若被保险人
被保险人
经我们认可的医院专科医生
重大疾病
一种或者多种
18周岁
相学长

================================================
FILE: utils/ai.js
================================================
const crypto = require('crypto');
const { encode } = require('gpt-3-encoder');

const openai = require('./openai');
const cache = require('../cache');

/**
 * Hex-encoded MD5 digest of `content`, used as a compact cache key.
 *
 * @param {string} content - Text to hash.
 * @returns {string} 32-character hexadecimal digest.
 */
function buildHash(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}

/**
 * Request a text completion from the `text-davinci-003` model.
 *
 * @param {object} options
 * @param {string} options.prompt - Prompt text.
 * @param {number} [options.max_tokens=1024] - Completion token budget.
 * @param {number} [options.temperature=0] - Sampling temperature.
 * @returns {Promise<string>} The first choice's text with surrounding
 *   newlines and whitespace removed; an empty string when the response
 *   carries no choice.
 */
async function createCompletion({
  prompt,
  max_tokens = 1024,
  temperature = 0,
}) {
  const completion = await openai.createCompletion({
    model: 'text-davinci-003',
    prompt,
    max_tokens,
    temperature,
  });

  // Guard every hop: an empty `choices` array or a missing `text` used to
  // raise a TypeError here instead of yielding an empty answer.
  const text = completion?.data?.choices?.[0]?.text ?? '';
  return strip(text, ['\n']).trim();
}

/**
 * Remove leading and trailing runs of the given characters from a string.
 * The characters are processed one after another, in the order given.
 *
 * @param {string|null|undefined} str - Text to clean; null/undefined yields ''.
 * @param {string[]} chars - Characters (or literal sequences) to strip.
 * @returns {string} The stripped text.
 */
const strip = (str, chars) => {
  if (str == null) {
    return '';
  }
  return chars.reduce((current, char) => {
    // Escape regex metacharacters so '.', '*', '|' ... are matched literally.
    const literal = char.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const edges = new RegExp(`^(?:${literal})+|(?:${literal})+$`, 'g');
    return current.replace(edges, '');
  }, str);
};

/**
 * Wrap an async function with the file-backed cache.
 *
 * @param {Function} wrappedFn - Async function taking a single argument.
 * @param {string} suffix - Appended to the cache file name to separate result kinds.
 * @param {Function} getContent - Extracts from the argument the text whose hash is the cache key.
 * @returns {Function} `(arg, cacheFileName) => Promise<result>`; a truthy
 *   cached value is returned directly, otherwise the wrapped function is
 *   called and its result stored.
 */
const withCache = (wrappedFn, suffix, getContent) => {
  const cachedCall = async (arg, cacheFileName) => {
    // The content can be long, so its hash is used as the key.
    const key = buildHash(getContent(arg));
    const cacheName = `${cacheFileName}_${suffix}`;

    const hit = cache.get(cacheName, key);
    if (hit) {
      return hit;
    }

    const fresh = await wrappedFn(arg);
    cache.set(cacheName, key, fresh);
    return fresh;
  };
  return cachedCall;
};

/**
 * Ask the completion model for a summary of `content`.
 *
 * @param {object} options
 * @param {string} options.content - Text to summarise.
 * @param {number} [options.tokenLength] - Pre-computed token count of
 *   `content`; computed with `encode` when omitted or 0.
 * @returns {Promise<string>} The summary with surrounding newlines removed;
 *   an empty string when the response carries no choice.
 */
async function getSummary({ content, tokenLength }) {
  // Dictionary-compressed content carries a '|上文中，a:word,...' legend and
  // needs the "translate via dictionary" prompt. The previous test
  // (`indexOf(...) >= -1`) was always true and looked for a marker without
  // the '，' that the legend actually contains.
  const isDictionaryCompressed = content.includes('|上文中，');
  const promptContext = isDictionaryCompressed
    ? `'''{{content}}'''基于字典翻译并返回内容摘要：`
    : `'''{{content}}'''基于命名实体识别构建内容摘要：`;
  const contentTokenLength = tokenLength || encode(content).length;
  const promptContextTokenLength = encode(promptContext).length;

  const completion = await openai.createCompletion({
    model: 'text-davinci-003',
    // A replacer function keeps '$&', '$1' ... inside the content literal.
    prompt: promptContext.replace('{{content}}', () => content),
    // Whatever is left of the 4096-token window, capped at 1000 tokens.
    max_tokens: Math.min(
      4096 - contentTokenLength - promptContextTokenLength,
      1000,
    ),
    temperature: 0,
  });

  return strip(completion?.data?.choices?.[0]?.text ?? '', ['\n']);
}

/**
 * Create an embedding vector for `input` with `text-embedding-ada-002`.
 * Every call takes at least three seconds to stay under the per-minute
 * request limit.
 *
 * @param {string} input - Text to embed.
 * @returns {Promise<number[]>} The embedding vector of the first result.
 */
async function createEmbedding(input) {
  // Both promises are handed to Promise.all un-awaited. Awaiting the sleep
  // inside the array literal left the request promise without a handler for
  // three seconds, so a fast API failure surfaced as an unhandled rejection,
  // and it delayed handling of the response instead of overlapping the wait.
  const [response] = await Promise.all([
    openai.createEmbedding({
      model: 'text-embedding-ada-002',
      input,
    }),
    // Embedding calls are fast; pause to avoid exceeding the rate limit
    // (60 requests per minute by default).
    sleep(3000),
  ]);

  return response.data.data[0].embedding;
}

/**
 * Ask the completion model an insurance question, grounded in the given
 * policy excerpts.
 *
 * @param {object} options
 * @param {string} options.question - Question text.
 * @param {string} options.knowledge - Policy excerpts placed in the prompt.
 * @returns {Promise<string>} The model's answer.
 */
async function askInsQuestion({ question, knowledge }) {
  const prompt = `
    以下是某保险产品条款的部分
    '''${knowledge}'''
    请基于对保险的理解与该部分条款内容，回答如下问题：
    ${question}。
    答案：
    `;

  // The completion gets whatever remains of the 4096-token window.
  const max_tokens = 4096 - encode(prompt).length;
  return createCompletion({ prompt, max_tokens });
}

/**
 * Resolve after `time` milliseconds; used to stay under the per-minute
 * request limit.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = time => new Promise(wake => setTimeout(wake, time));

// Public helpers. The *WithCache variants take `(arg, cacheFileName)` and
// key the cache on the MD5 hash of the text selected by the third argument.
module.exports = {
  sleep,
  getSummary,
  // Summaries are keyed on the content being summarised.
  getSummaryWithCache: withCache(
    getSummary,
    'summary',
    ({ content }) => content,
  ),
  // Embeddings are keyed on the embedded text itself.
  createEmbeddingWithCache: withCache(
    createEmbedding,
    'embedding',
    input => input,
  ),
  askInsQuestion,
  createCompletion,
};


================================================
FILE: utils/ask.js
================================================
const { encode } = require('gpt-3-encoder');
const ddot = require('@stdlib/blas/base/ddot');

const { buildQuestionEmbedding } = require('./embedding');
const { readKnowledgeEmbeddings, readKnowledge, writeAnswer } = require('./fs');
const { askInsQuestion } = require('./ai');

/**
 * Select the knowledge paragraphs most similar to the question.
 *
 * Paragraphs are scored by the dot product between their embedding and the
 * question embedding; only scores above 0.8 are kept, best first, and the
 * list is cut once the running token count reaches 3000.
 *
 * @param {object} options
 * @param {number[]} options.questionEmbedding - Embedding of the question.
 * @param {number[][]} options.knowledgeEmbeddings - One embedding per paragraph.
 * @param {string[]} options.knowledgeList - Paragraph texts, index-aligned with the embeddings.
 * @returns {string} The selected paragraphs joined with newlines.
 */
function getKnowledge({
  questionEmbedding,
  knowledgeEmbeddings,
  knowledgeList,
}) {
  // The question vector is the same for every paragraph: convert it once
  // instead of once per paragraph.
  const questionVector = new Float64Array(questionEmbedding);

  const kList = knowledgeEmbeddings
    .map((embedding, index) => ({
      index,
      ddot: ddot(
        questionVector.length,
        questionVector,
        1,
        new Float64Array(embedding),
        1,
      ),
      knowledge: knowledgeList[index],
    }))
    // Drop weak matches before sorting; the stable sort yields the same order.
    .filter(k => k.ddot > 0.8)
    .sort((a, b) => b.ddot - a.ddot);

  let tokens = 0;
  const enoughTokenList = kList.filter(k => {
    tokens += encode(k.knowledge).length;
    return tokens < 3000;
  });

  return enoughTokenList.map(({ knowledge }) => knowledge).join('\n');
}

/**
 * Answer a question about a loaded PDF and record the answer on disk.
 *
 * @param {string} question - Question text.
 * @param {string} pdfName - Name of the loaded PDF (without extension).
 * @returns {Promise<string>} The answer text.
 */
async function ask(question, pdfName) {
  const questionEmbedding = await buildQuestionEmbedding(question, pdfName);

  // Pick the stored paragraphs that best match the question.
  const knowledge = getKnowledge({
    questionEmbedding,
    knowledgeEmbeddings: readKnowledgeEmbeddings(pdfName),
    knowledgeList: readKnowledge(pdfName),
  });

  const answer = await askInsQuestion({ question, knowledge });
  writeAnswer(pdfName, question, answer);
  return answer;
}

// The module exports the `ask` function itself.
module.exports = ask;


================================================
FILE: utils/content.js
================================================
const path = require('path');
const nodejieba = require('nodejieba');

// Single-character placeholders (Latin and Greek, lower and upper case) that
// shortenByDictionary substitutes for repeated words.
const LETTERS =
  'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZαβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'.split('');

// Load the project's user dictionary into nodejieba before any segmentation.
nodejieba.load({
  userDict: path.join(__dirname, '../userdict.utf8'),
});

/**
 * Decide whether a section is a (long) disease-definition list.
 *
 * @param {number} tokenLength - Token count of the section.
 * @param {string} joinedContent - Section text.
 * @returns {boolean} True when the section has at least 2000 tokens and
 *   starts with one of the known disease-list headings.
 */
function isDiseaseIntro(tokenLength, joinedContent) {
  // Short sections are left alone.
  if (tokenLength < 2000) {
    return false;
  }
  // Deliberately crude check: the text must begin with a known heading.
  const headings = ['重大疾病', '中症疾病', '轻症疾病', '特定心脑血管疾病'];
  return headings.some(heading => joinedContent.startsWith(heading));
}

/**
 * Shorten a disease-definition list by keeping only each entry's title.
 *
 * The text is cut in front of every full-width numbered marker such as
 * '（12）'; for each numbered entry only the text up to its first space is
 * kept, and the pieces are concatenated again.
 *
 * @param {string} content - Disease-list text.
 * @returns {string} The shortened text.
 */
function shortenDiseaseIntro(content) {
  const entryBoundary = /(?=（[0-9]+）)/;
  // Non-global and anchored: every piece produced by the split can only
  // carry a marker at its very start, and a fresh non-`g` regex avoids the
  // `lastIndex` state a shared `/g` regex keeps between `.test` calls.
  const startsWithMarker = /^（[0-9]+）/;

  return content
    .split(entryBoundary)
    .map(section =>
      startsWithMarker.test(section) ? section.split(' ')[0] : section,
    )
    .join('');
}

/**
 * Compress text by replacing frequently repeated words with single-letter
 * placeholders and appending a legend ('|上文中，a:word,b:word,...').
 *
 * @param {string} originContent - Text to compress.
 * @param {string[]} words - Tokens of the text; their frequencies decide what is replaced.
 * @param {function(number, number): boolean} should - Called with
 *   (occurrence count, word length); returns true when the word is to be replaced.
 * @returns {string} The compressed text followed by the legend.
 */
function shortenByDictionary(originContent, words, should) {
  let shortContent = originContent;
  const dictionary = [];
  // Prototype-less map so tokens such as 'constructor' count like any other.
  const wordsCounts = words.reduce((acc, cur) => {
    acc[cur] = (acc[cur] || 0) + 1;
    return acc;
  }, Object.create(null));

  for (const word of Object.keys(wordsCounts)) {
    // Only LETTERS.length placeholders exist; past that point the lookup
    // yields undefined and the text would be replaced by 'undefined'.
    if (dictionary.length >= LETTERS.length) {
      break;
    }
    if (should(wordsCounts[word], word.length)) {
      dictionary.push(word);
      shortContent = shortContent.replaceAll(
        word,
        `${LETTERS[dictionary.length - 1]}`,
      );
    }
  }

  // NOTE(review): placeholders can collide with the same letters already
  // present in the text; left as-is.
  shortContent = `${shortContent}|上文中，${dictionary.map(
    (word, index) => `${LETTERS[index]}:${word}`,
  )}`;
  return shortContent;
}

/**
 * Dictionary-compress table-like text: space-separated cells longer than 3
 * characters that occur more than 3 times are replaced by placeholders.
 *
 * @param {string} tableContent - Table text with space-separated cells.
 * @returns {string} The compressed text with its legend.
 */
function shortenTableContent(tableContent) {
  const cells = tableContent.split(' ');
  const isWorthReplacing = (counts, length) => counts > 3 && length > 3;
  return shortenByDictionary(tableContent, cells, isWorthReplacing);
}

/**
 * Dictionary-compress ordinary section text: normalise punctuation, segment
 * the result with nodejieba and replace words longer than 1 character that
 * occur more than 4 times.
 *
 * @param {string} sectionContent - Section text.
 * @returns {string} The compressed text with its legend.
 */
function shortenSectionContent(sectionContent) {
  // Applied strictly in this order.
  const replacements = [
    // Drop boilerplate that carries no information.
    ['（见释义）', ''],
    // Fewer characters.
    ['——', '—'],
    // Full-width punctuation to half-width.
    ['（', '('],
    ['）', ')'],
    ['：', ':'],
    ['；', ';'],
    ['、', '|'],
    ['，', ','],
    ['。', '.'],
    ['“', "'"],
    ['”', "'"],
    // Drop spaces that carry no meaning.
    ['. ', '.'],
    [" '", "'"],
    ['; ', ';'],
  ];

  const longContent = replacements.reduce(
    (text, [from, to]) => text.replaceAll(from, to),
    sectionContent,
  );

  const isWorthReplacing = (counts, length) => counts > 4 && length > 1;
  return shortenByDictionary(
    longContent,
    nodejieba.cut(longContent),
    isWorthReplacing,
  );
}

/**
 * Compress long text, choosing the strategy by shape: text with more than
 * 100 space-separated pieces is treated as a table, anything else as prose.
 *
 * @param {string} longContent - Text to compress.
 * @returns {string} The compressed text with its legend.
 */
function shortenContent(longContent) {
  const pieceCount = longContent.split(' ').length;
  return pieceCount > 100
    ? shortenTableContent(longContent)
    : shortenSectionContent(longContent);
}

// Content-shortening helpers used when building the document tree.
module.exports = {
  isDiseaseIntro,
  shortenDiseaseIntro,
  shortenContent,
  shortenTableContent,
  shortenSectionContent,
};


================================================
FILE: utils/embedding.js
================================================
const { createEmbeddingWithCache } = require('./ai');
const { writeKnowledgeEmbeddings } = require('./fs');

/**
 * Create (or fetch from cache) an embedding for every knowledge paragraph,
 * one at a time, and persist the full list.
 *
 * @param {string[]} knowledge - Knowledge paragraphs.
 * @param {string} pdfName - Name of the PDF; selects the cache and output files.
 * @returns {Promise<number[][]>} Embeddings, index-aligned with `knowledge`.
 */
async function buildKnowledgeEmbeddings(knowledge, pdfName) {
  const embeddings = [];
  // The former `if (!embeddings[index])` guard was always true because the
  // array starts empty on every call; resuming after a failure is handled
  // by the cache inside createEmbeddingWithCache.
  for (const [index, paragraph] of knowledge.entries()) {
    embeddings.push(await createEmbeddingWithCache(paragraph, pdfName));
    console.log('createEmbedding success', index);
  }
  writeKnowledgeEmbeddings(pdfName, embeddings);
  return embeddings;
}

/**
 * Create (or fetch from cache) the embedding of a question.
 *
 * @param {string} question - Question text.
 * @param {string} pdfName - Name of the PDF; selects the cache file.
 * @returns {Promise<number[]>} The question embedding.
 */
async function buildQuestionEmbedding(question, pdfName) {
  const questionVector = await createEmbeddingWithCache(question, pdfName);
  return questionVector;
}

// Embedding builders for knowledge paragraphs and for questions.
module.exports = { buildKnowledgeEmbeddings, buildQuestionEmbedding };


================================================
FILE: utils/fs.js
================================================
const { writeFileSync, readFileSync, existsSync, mkdirSync } = require('fs');
const { join } = require('path');

/**
 * Read and parse a JSON file.
 *
 * @param {string} path - File to read.
 * @returns {*} The parsed content, or an empty object when the file cannot
 *   be read or parsed.
 */
function readJsonFile(path) {
  let parsed;
  try {
    const text = readFileSync(path).toString();
    parsed = JSON.parse(text);
  } catch {
    // Any read or parse failure falls back to an empty object.
    parsed = {};
  }
  return parsed;
}

/**
 * Derive the PDF name (file name up to the first '.pdf') from a path.
 *
 * @param {string} pdfPath - Path of the PDF file.
 * @returns {string} The base name without the '.pdf' suffix.
 */
function getPdfName(pdfPath) {
  // Accept both separators: getPdfPath builds the path with path.join, which
  // yields backslashes on Windows, and splitting on '/' alone then returned
  // the whole path.
  const fileName = pdfPath.split(/[\\/]/).pop();
  return fileName.split('.pdf')[0];
}

/**
 * Path of a per-PDF JSON artefact, creating its directory when needed.
 *
 * @param {string} pdfName - Name of the PDF (directory name under knowledgeFiles/).
 * @param {string} fileName - Artefact name without the .json extension.
 * @returns {string} Absolute path of `<knowledgeFiles>/<pdfName>/<fileName>.json`.
 */
function getPath(pdfName, fileName) {
  const dirPath = join(__dirname, '../knowledgeFiles', pdfName);
  // knowledgeFiles/ itself is git-ignored and absent on a fresh checkout, so
  // create the whole chain; `recursive` also makes an existing directory a no-op.
  mkdirSync(dirPath, { recursive: true });
  return join(dirPath, `${fileName}.json`);
}

/** Persist the nested content tree of a PDF as `contentTree.json`. */
function writeContentTree(pdfName, docTree) {
  const target = getPath(pdfName, 'contentTree');
  writeFileSync(target, JSON.stringify(docTree));
}

/** Persist the knowledge paragraphs of a PDF as `knowledge.json`. */
function writeKnowledge(pdfName, knowledge) {
  const target = getPath(pdfName, 'knowledge');
  writeFileSync(target, JSON.stringify(knowledge));
}

/** Load the stored knowledge paragraphs of a PDF ({} when unavailable). */
function readKnowledge(pdfName) {
  const source = getPath(pdfName, 'knowledge');
  return readJsonFile(source);
}

/** Persist the paragraph embeddings of a PDF as `knowledgeEmbeddings.json`. */
function writeKnowledgeEmbeddings(pdfName, embeddings) {
  const target = getPath(pdfName, 'knowledgeEmbeddings');
  const serialized = JSON.stringify(embeddings);
  writeFileSync(target, serialized);
}

/** Load the stored paragraph embeddings of a PDF ({} when unavailable). */
function readKnowledgeEmbeddings(pdfName) {
  const source = getPath(pdfName, 'knowledgeEmbeddings');
  return readJsonFile(source);
}

/** Absolute path of `<pdfName>.pdf` inside the project's pdfs/ directory. */
function getPdfPath(pdfName) {
  const relativeFile = `../pdfs/${pdfName}.pdf`;
  return join(__dirname, relativeFile);
}

/**
 * Record an answer in `answerFiles/<pdfName>_answers.json`, keyed by the
 * question; an existing entry for the same question is overwritten.
 *
 * @param {string} pdfName - Name of the PDF.
 * @param {string} question - Question text (used as the key).
 * @param {string} answer - Answer text.
 */
function writeAnswer(pdfName, question, answer) {
  const answerDir = join(__dirname, '../answerFiles');
  // answerFiles/ is git-ignored and never created elsewhere; without this
  // the first write on a fresh checkout fails with ENOENT.
  mkdirSync(answerDir, { recursive: true });

  const answerPath = join(answerDir, `${pdfName}_answers.json`);
  const answerJson = existsSync(answerPath) ? readJsonFile(answerPath) : {};
  answerJson[question] = answer;
  writeFileSync(answerPath, JSON.stringify(answerJson));
}

// File-system helpers for PDFs, knowledge artefacts and answers.
module.exports = {
  getPdfPath,
  getPdfName,
  writeAnswer,
  readKnowledge,
  writeKnowledge,
  writeContentTree,
  writeKnowledgeEmbeddings,
  readKnowledgeEmbeddings,
};


================================================
FILE: utils/openai.js
================================================
const { Configuration, OpenAIApi } = require('openai');
const { apiKey } = require('../config');

// Build one OpenAI client from the API key in config.js and export it so all
// modules share the same instance.
const configuration = new Configuration({
  apiKey,
});

const openai = new OpenAIApi(configuration);

module.exports = openai;


================================================
FILE: utils/pdf.js
================================================
const pdfjs = require('pdfjs-dist');
const { encode } = require('gpt-3-encoder');
const {
  isDiseaseIntro,
  shortenDiseaseIntro,
  shortenContent,
} = require('./content');

// Page kinds tracked while walking the PDF.
// Cover page
const PAGE_TYPE_COVER = 0;
// Table-of-contents page
const PAGE_TYPE_CATALOG = 1;
// Main-body page
const PAGE_TYPE_MAIN = 2;

// Markers inserted into item text: TITLE_SPLIT precedes a section title,
// REF_SPLIT precedes a footnote title. QUOTE_SPLIT appears only in
// commented-out code within this file.
const TITLE_SPLIT = '__TITLE__';
const QUOTE_SPLIT = '__QUOTE__';
const REF_SPLIT = '__REF__';

/**
 * Turn the marker-annotated document text into a flat list of section nodes.
 *
 * @param {string} longStr - Whole document text with TITLE_SPLIT in front of every section title.
 * @returns {Array<object>} One node per section:
 *   `{ titleNo, content, children: [], refs: [], tokenLength }`.
 */
function buildDocTree(longStr) {
  // Split into sections; the text before the first title marker is discarded.
  const [, ...sections] = longStr.split(TITLE_SPLIT);

  const treeNodes = sections
    .map(section => {
      // The first space-separated token is the title number.
      let [titleNo, ...content] = section.split(' ');
      if (titleNo.endsWith('.')) {
        titleNo = titleNo.slice(0, -1);
      }

      const matchedTitleNo = titleNo.match(/^\d+(\.\d*)*\.?/)?.[0];

      let joinedContent = content.join(' ');

      // The title token holds more than the bare number: move the extra text
      // to the front of the section content.
      if (matchedTitleNo !== titleNo) {
        const titleContent = titleNo.replace(/^\d+(\.\d*)*\.?/, '');
        joinedContent = titleContent + ' ' + joinedContent;
      }

      let tokenLength = encode(joinedContent).length;

      // Disease lists are very long; drop the per-disease details.
      if (isDiseaseIntro(tokenLength, joinedContent)) {
        joinedContent = shortenDiseaseIntro(joinedContent);
      } else if (tokenLength > 4000) {
        // Other very long sections are compressed with the dictionary method.
        joinedContent = shortenContent(joinedContent);
      }

      // Re-measure, since the content may have been shortened above.
      tokenLength = encode(joinedContent).length;

      return {
        titleNo: matchedTitleNo || titleNo,
        content: joinedContent,
        children: [],
        refs: [],
        tokenLength,
      };
    })
    // Disabled: extraction of __QUOTE__ references into node.refs.
    // .map(node => {
    //   const { content } = node;

    //   if (content.indexOf(QUOTE_SPLIT)) {
    //     const regex = /__QUOTE__([0-9.]+)/g;
    //     let match;
    //     while ((match = regex.exec(content)) !== null) {
    //       node.refs.push(match[1]);
    //     }
    //     node.content = node.content
    //       .replace(regex, '')
    //       .replace(/第\s*\d+\s*页\s*共\d+页/g, '');
    //     return node;
    //   }
    // });
  return treeNodes;
}

/**
 * Heuristically decide whether a page is a table-of-contents page.
 *
 * @param {object} pageData - Text content of a page.
 * @param {Array<{str: string}>} pageData.items - Text items of the page.
 * @returns {boolean} True when the page text contains '条款目录' or is cut
 *   into more than 10 pieces by section numbers of the form `N.M`.
 */
function isCatalogPage({ items }) {
  const pageContent = items.map(i => i.str).join('');
  if (pageContent.includes('条款目录')) {
    return true;
  }
  // The dot is escaped so only real section numbers ('1.2') count; unescaped
  // it matched any character, so every run of three or more digits (amounts,
  // policy numbers) inflated the count. Always returns a boolean now instead
  // of falling through to undefined.
  return pageContent.split(/(?=\d+\.\d+)/).length > 10;
}

/**
 * Inline a page's footnotes into its main text.
 *
 * Everything from the first footnote title onwards is treated as footnotes.
 * Each footnote body replaces the text of the main-text item whose trimmed
 * text equals the footnote number, wrapped in square brackets. The item
 * objects are modified in place.
 *
 * @param {Array<object>} items - Text items of one page (`str`, `height`, `transform`).
 * @returns {Array<object>} The main-text items (all items when the page has no footnotes).
 */
function moveNoteToMain(items) {
  // pageNumberPositionY is destructured but not used in this function.
  const { mainFontHeight, titlePositionX, pageNumberPositionY } =
    getPageMetaData(items);

  // A footnote title sits at the title x position (within 2 units) and is
  // rendered at less than 70% of the main font height.
  const isRefTitle = item =>
    Math.abs(item.transform[4] - titlePositionX) < 2 &&
    item.height / mainFontHeight < 0.7;

  const refSplitIndex = items.findIndex(isRefTitle);

  if (refSplitIndex < 0) {
    return items;
  }

  // Main text
  const mainItems = items.slice(0, refSplitIndex);
  // Footnotes: mark every footnote title, join the text, then split it back
  // into one string per footnote ("<number> <body>").
  items
    .slice(refSplitIndex)
    .map(refItem => {
      if (isRefTitle(refItem)) {
        refItem.str = `${REF_SPLIT}${refItem.str.trim()} `;
      }
      return refItem.str;
    })
    .join('')
    .split(REF_SPLIT)
    .forEach(refContent => {
      const [refNo, ...content] = refContent.split(' ');
      if (refNo && content.length) {
        // The reference in the main text is the item whose text is exactly the number.
        const mainItem = mainItems.find(i => i.str.trim() === refNo);

        if (!mainItem) {
          return;
        }
        mainItem.str = `[${content.join('')}]`;
      }
    });
  return mainItems;
}

/**
 * Collect the text items of a PDF's main body, page by page in order.
 *
 * Pages are skipped until the first page that is not a table-of-contents
 * page; from then on every page's items (tagged with `pageNum`, footnotes
 * inlined) are appended.
 *
 * @param {string} pdfPath - Path of the PDF file.
 * @returns {Promise<Array<object>>} Text items of the main body.
 */
async function getPdfItems(pdfPath) {
  const collected = [];
  let pageType = PAGE_TYPE_CATALOG;

  const doc = await pdfjs.getDocument(pdfPath).promise;
  // Page loading starts only once the metadata request has settled.
  await doc.getMetadata();

  const pageCount = doc.numPages;
  // Pages are processed strictly one after another.
  for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
    const page = await doc.getPage(pageNum);
    const pageData = await page.getTextContent({
      disableCombineTextItems: true,
    });

    // Was on the cover and this page is a catalog page: switch to catalog.
    if (pageType === PAGE_TYPE_COVER && isCatalogPage(pageData)) {
      pageType = PAGE_TYPE_CATALOG;
    }
    // Was in the catalog and this page is not one: the main body starts.
    if (pageType === PAGE_TYPE_CATALOG && !isCatalogPage(pageData)) {
      pageType = PAGE_TYPE_MAIN;
    }
    // From the main body on, keep the page's content.
    if (pageType === PAGE_TYPE_MAIN) {
      const taggedItems = pageData.items.map(i => ({ ...i, pageNum }));
      collected.push(...moveNoteToMain(taggedItems));
    }
    page.cleanup();
  }

  return collected;
}

/**
 * Decide whether the item at `itemIndex` looks like a title number.
 *
 * @param {Array<{str: string}>} items - Text items.
 * @param {number} itemIndex - Index of the item to inspect.
 * @returns {boolean|object} false for text longer than 20 characters; the
 *   item itself (truthy) when the next item's trimmed text is '页';
 *   otherwise whether the trimmed text starts with a dotted number.
 */
const isTitleNo = (items, itemIndex) => {
  const current = items[itemIndex];
  const text = current.str;

  // Long text is never a title; this also skips the regex work below.
  if (text.length > 20) {
    return false;
  }

  const following = items[itemIndex + 1];
  // NOTE(review): this returns the item rather than a boolean — confirm
  // whether an item followed by '页' is really meant to count as a title.
  if (following && following.str.trim() === '页') {
    return current;
  }

  const startsWithDottedNumber = /^\d+(\.\d*)*\.?/;
  return startsWithDottedNumber.test(text.trim());
};

/**
 * Derive layout statistics from a list of text items.
 *
 * @param {Array<object>} items - Text items with `str`, `height` and
 *   `transform` (x at index 4, y at index 5).
 * @returns {{mainFontHeight: (number|undefined), titlePositionX: (number|undefined), pageNumberPositionY: (number|undefined)}}
 *   - mainFontHeight: the most frequent item height (undefined for no items);
 *   - titlePositionX: the most frequent x position below 100 among title-number items;
 *   - pageNumberPositionY: the lowest y position when it is below 60.
 */
function getPageMetaData(items) {
  const fontHeightCountMap = {};
  const numberPositionXCountMap = {};
  let minPositionY = Infinity;

  items.forEach((cur, index) => {
    const { height, transform } = cur;
    // Log the malformed item before `transform` is dereferenced; the log
    // used to sit after the access that throws and could never show it.
    if (!height || !transform) {
      console.log(cur);
    }
    const positionX = transform[4];
    const positionY = transform[5];

    fontHeightCountMap[height] = (fontHeightCountMap[height] || 0) + 1;
    minPositionY = Math.min(minPositionY, positionY);

    if (isTitleNo(items, index)) {
      numberPositionXCountMap[positionX] =
        (numberPositionXCountMap[positionX] || 0) + 1;
    }
  });

  const sortedHeights = Object.entries(fontHeightCountMap)
    .map(([height, counts]) => ({ height: Number(height), counts }))
    .sort((a, b) => b.counts - a.counts);

  const sortedPositionXs = Object.entries(numberPositionXCountMap)
    .map(([positionX, counts]) => ({ positionX: Number(positionX), counts }))
    // Keep only positions near the left edge; otherwise numbers inside
    // list items pollute the statistic.
    .filter(i => i.positionX < 100)
    .sort((a, b) => b.counts - a.counts);

  return {
    // The most used font height is taken to be the body-text height. A page
    // without any text item has none; `?.` keeps that from crashing.
    mainFontHeight: sortedHeights[0]?.height,
    // Numbers that keep appearing at the same x are taken to be title numbers.
    titlePositionX: sortedPositionXs[0]?.positionX,
    // The lowest item is taken to be the page number, but only below y = 60;
    // otherwise the PDF is considered to have no page numbers.
    pageNumberPositionY: minPositionY < 60 ? minPositionY : undefined,
  };
}

/**
 * Drop page-number items and prefix section titles with TITLE_SPLIT.
 * Title items are modified in place.
 *
 * @param {Array<object>} items - Text items of the main body.
 * @returns {Array<object>} The remaining items, titles marked.
 */
function rebuildPdfItems(items) {
  const { titlePositionX, pageNumberPositionY } = getPageMetaData(items);

  const markAsTitle = item => {
    item.str = `${TITLE_SPLIT}${item.str.trim()}`;
    return item;
  };

  const rebuilt = [];
  items.forEach((item, index) => {
    const positionX = item.transform[4];
    const positionY = item.transform[5];

    // Items on the page-number line are not needed.
    if (pageNumberPositionY === positionY) {
      return;
    }

    // Appendix tables always start a new section.
    if (item.str.startsWith('附表')) {
      rebuilt.push(markAsTitle(item));
      return;
    }

    // A title number at the title x position (small tolerance) is a section title.
    const isSectionTitle =
      isTitleNo(items, index) && Math.abs(positionX - titlePositionX) < 2;
    rebuilt.push(isSectionTitle ? markAsTitle(item) : item);
  });

  return rebuilt;
}

/**
 * Parse a PDF into the flat list of section nodes.
 *
 * @param {string} pdfPath - Path of the PDF file.
 * @returns {Promise<Array<object>>} Section nodes built by buildDocTree.
 */
async function buildDocTreeFromPdf(pdfPath) {
  const rawItems = await getPdfItems(pdfPath);
  const markedItems = rebuildPdfItems(rawItems);
  // The tree builder works on one long, marker-annotated string.
  const documentText = markedItems.map(i => i.str).join('');
  return buildDocTree(documentText);
}

// PDF parsing entry points.
module.exports = {
  buildDocTreeFromPdf,
  getPdfItems,
  rebuildPdfItems,
};


================================================
FILE: utils/tree.js
================================================
const { getSummaryWithCache } = require('./ai');
const { writeKnowledge, writeContentTree } = require('./fs');
const { shortenContent } = require('./content');
const { encode } = require('gpt-3-encoder');

/**
 * Title number of the parent section: everything before the last dot.
 *
 * @param {string} titleNo - Dotted title number such as '1.2.3'.
 * @returns {string} The parent number ('1.2'), or '' when there is no dot.
 */
function getParentNo(titleNo) {
  const lastDot = titleNo.lastIndexOf('.');
  return lastDot === -1 ? '' : titleNo.slice(0, lastDot);
}

/**
 * Nest a flat list of section nodes by title number and compute, for every
 * node, `allTokenLength`: its own token length (summary length when one
 * exists) plus that of all its descendants. Nodes are modified in place.
 *
 * @param {Array<object>} flattenTree - Section nodes with `titleNo`,
 *   `tokenLength`, optional `summaryTokenLength` and a `children` array.
 * @returns {Array<object>} The nodes that have no parent in the list.
 */
function toNestTree(flattenTree) {
  const roots = [];

  // Lookup table: title number -> node.
  const nodesMap = {};
  for (const node of flattenTree) {
    nodesMap[node.titleNo] = node;
  }

  // Parent of a node, when its parent number exists in the list.
  const findParent = node => {
    const parentNo = getParentNo(node.titleNo);
    return parentNo ? nodesMap[parentNo] : undefined;
  };

  for (const node of flattenTree) {
    const ownLength = node.summaryTokenLength || node.tokenLength;

    // Accumulate rather than assign: children handled earlier may already
    // have contributed to this node.
    node.allTokenLength = (node.allTokenLength || 0) + ownLength;

    // Add the node's own length to every ancestor up the chain.
    for (
      let ancestor = findParent(node);
      ancestor;
      ancestor = findParent(ancestor)
    ) {
      ancestor.allTokenLength = (ancestor.allTokenLength || 0) + ownLength;
    }

    // Attach the node to its parent, or keep it as a root.
    const parent = findParent(node);
    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}

/**
 * Give every section longer than 1000 tokens an AI-generated summary and
 * record each summary's token length. Nodes are modified in place and
 * processed one at a time.
 *
 * @param {Array<object>} docTree - Flat list of section nodes.
 * @param {string} pdfName - Name of the PDF; selects the summary cache.
 * @returns {Promise<Array<object>>} The same list.
 */
async function rebuildTreeWithAISummary(docTree, pdfName) {
  for (const node of docTree) {
    const needsSummary = node.tokenLength > 1000 && !node.summary;
    if (needsSummary) {
      node.summary = await getSummaryWithCache(
        { content: node.content, tokenLength: node.tokenLength },
        pdfName,
      );
      console.log('build summary success', node.titleNo);
    }

    const needsSummaryLength = node.summary && !node.summaryTokenLength;
    if (needsSummaryLength) {
      node.summaryTokenLength = encode(node.summary).length;
    }
  }
  return docTree;
}

/**
 * Summarise over-long sections, nest the sections into a tree and persist
 * the tree.
 *
 * @param {Array<object>} docTree - Flat list of section nodes.
 * @param {string} pdfName - Name of the PDF.
 * @returns {Promise<Array<object>>} The nested tree.
 */
async function buildNestTreeWithAISummary(docTree, pdfName) {
  const summarized = await rebuildTreeWithAISummary(docTree, pdfName);
  const nested = toNestTree(summarized);

  // Persist the tree.
  writeContentTree(pdfName, nested);
  return nested;
}

/**
 * Merge a node and all of its descendants into one paragraph. Every section
 * contributes '第<titleNo>节内容:' followed by its summary (or its content
 * when it has no summary); sections are joined with '|' in depth-first order.
 *
 * @param {object} node - Section node with `titleNo`, `content`, optional `summary` and `children`.
 * @returns {string} The merged paragraph.
 */
function unionContent(node) {
  const ownPart = `第${node.titleNo}节内容:` + (node.summary || node.content);
  const childParts = node.children.map(child => unionContent(child));
  return [ownPart, ...childParts].join('|');
}

/**
 * Flatten the nested tree into paragraphs. A subtree whose total token
 * length is at most 3000 becomes one merged paragraph; a larger one is
 * descended into, child by child.
 *
 * @param {Array<object>} nodes - Nodes of one tree level.
 * @param {string[]} [contents] - Accumulator shared across the recursion.
 * @returns {string[]} The collected paragraphs.
 */
function buildContents(nodes, contents) {
  const collected = contents || [];
  nodes.forEach(node => {
    if (node.allTokenLength > 3000) {
      // Too large as a whole: descend into the children.
      buildContents(node.children, collected);
      return;
    }
    collected.push(unionContent(node));
  });
  return collected;
}

/**
 * Build the knowledge paragraphs of a PDF from its section list and persist them.
 *
 * @param {Array<object>} docTree - Flat list of section nodes.
 * @param {string} pdfName - Name of the PDF.
 * @returns {Promise<string[]>} The knowledge paragraphs.
 */
async function buildKnowledgeFromDocTree(docTree, pdfName) {
  const nested = await buildNestTreeWithAISummary(docTree, pdfName);
  const paragraphs = buildContents(nested);

  // Persist the paragraphs.
  writeKnowledge(pdfName, paragraphs);
  return paragraphs;
}

// Only the knowledge-building entry point is exported from this module.
module.exports = { buildKnowledgeFromDocTree };

Download .txt

gitextract_vsirs4ek/

├── .gitignore
├── cache/
│   └── index.js
├── config.js
├── index.js
├── package.json
├── readme.md
├── scripts/
│   ├── ask.js
│   └── load.js
├── userdict.utf8
└── utils/
    ├── ai.js
    ├── ask.js
    ├── content.js
    ├── embedding.js
    ├── fs.js
    ├── openai.js
    ├── pdf.js
    └── tree.js

Download .txt

SYMBOL INDEX (50 symbols across 9 files)

FILE: cache/index.js
  function get (line 23) | function get(name, key) {
  function set (line 29) | function set(name, key, value) {

FILE: index.js
  function loadingPdf (line 7) | async function loadingPdf(pdfPath) {
  function askQuestion (line 19) | async function askQuestion(question, pdfName) {

FILE: utils/ai.js
  function buildHash (line 7) | function buildHash(content) {
  function createCompletion (line 11) | async function createCompletion({
  function getSummary (line 52) | async function getSummary({ content, tokenLength }) {
  function createEmbedding (line 74) | async function createEmbedding(input) {
  function askInsQuestion (line 87) | async function askInsQuestion({ question, knowledge }) {

FILE: utils/ask.js
  function getKnowledge (line 8) | function getKnowledge({
  function ask (line 35) | async function ask(question, pdfName) {

FILE: utils/content.js
  constant LETTERS (line 4) | const LETTERS =
  function isDiseaseIntro (line 12) | function isDiseaseIntro(tokenLength, joinedContent) {
  function shortenDiseaseIntro (line 24) | function shortenDiseaseIntro(content) {
  function shortenByDictionary (line 36) | function shortenByDictionary(originContent, words, should) {
  function shortenTableContent (line 59) | function shortenTableContent(tableContent) {
  function shortenSectionContent (line 68) | function shortenSectionContent(sectionContent) {
  function shortenContent (line 96) | function shortenContent(longContent) {

FILE: utils/embedding.js
  function buildKnowledgeEmbeddings (line 4) | async function buildKnowledgeEmbeddings(knowledge, pdfName) {
  function buildQuestionEmbedding (line 17) | async function buildQuestionEmbedding(question, pdfName) {

FILE: utils/fs.js
  function readJsonFile (line 4) | function readJsonFile(path) {
  function getPdfName (line 13) | function getPdfName(pdfPath) {
  function getPath (line 17) | function getPath(pdfName, fileName) {
  function writeContentTree (line 27) | function writeContentTree(pdfName, docTree) {
  function writeKnowledge (line 31) | function writeKnowledge(pdfName, knowledge) {
  function readKnowledge (line 35) | function readKnowledge(pdfName) {
  function writeKnowledgeEmbeddings (line 39) | function writeKnowledgeEmbeddings(pdfName, embeddings) {
  function readKnowledgeEmbeddings (line 46) | function readKnowledgeEmbeddings(pdfName) {
  function getPdfPath (line 50) | function getPdfPath(pdfName) {
  function writeAnswer (line 54) | function writeAnswer(pdfName, question, answer) {

FILE: utils/pdf.js
  constant PAGE_TYPE_COVER (line 10) | const PAGE_TYPE_COVER = 0;
  constant PAGE_TYPE_CATALOG (line 12) | const PAGE_TYPE_CATALOG = 1;
  constant PAGE_TYPE_MAIN (line 14) | const PAGE_TYPE_MAIN = 2;
  constant TITLE_SPLIT (line 16) | const TITLE_SPLIT = '__TITLE__';
  constant QUOTE_SPLIT (line 17) | const QUOTE_SPLIT = '__QUOTE__';
  constant REF_SPLIT (line 18) | const REF_SPLIT = '__REF__';
  function buildDocTree (line 20) | function buildDocTree(longStr) {
  function isCatalogPage (line 78) | function isCatalogPage({ items }) {
  function moveNoteToMain (line 89) | function moveNoteToMain(items) {
  function getPdfItems (line 130) | async function getPdfItems(pdfPath) {
  function getPageMetaData (line 190) | function getPageMetaData(items) {
  function rebuildPdfItems (line 244) | function rebuildPdfItems(items) {
  function buildDocTreeFromPdf (line 293) | async function buildDocTreeFromPdf(pdfPath) {

FILE: utils/tree.js
  function getParentNo (line 6) | function getParentNo(titleNo) {
  function toNestTree (line 12) | function toNestTree(flattenTree) {
  function rebuildTreeWithAISummary (line 56) | async function rebuildTreeWithAISummary(docTree, pdfName) {
  function buildNestTreeWithAISummary (line 85) | async function buildNestTreeWithAISummary(docTree, pdfName) {
  function unionContent (line 95) | function unionContent(node) {
  function buildContents (line 106) | function buildContents(nodes, contents) {
  function buildKnowledgeFromDocTree (line 121) | async function buildKnowledgeFromDocTree(docTree, pdfName) {

Download .json

Condensed preview — 17 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (30K chars).

[
  {
    "path": ".gitignore",
    "chars": 80,
    "preview": "node_modules/\n.DS_Store\n.vscode/\ncache/files/\npdfs/\nknowledgeFiles/\nanswerFiles/"
  },
  {
    "path": "cache/index.js",
    "chars": 729,
    "preview": "const { existsSync, writeFileSync, readFileSync } = require('fs');\nconst { join } = require('path');\n\nconst getPath = na"
  },
  {
    "path": "config.js",
    "chars": 683,
    "preview": "module.exports = {\n  apiKey: 'your api key',\n  pdfName: 'your pdf name,不需要 .pdf 结尾',\n  // pdfName: 'hyb',\n  // pdfName: "
  },
  {
    "path": "index.js",
    "chars": 958,
    "preview": "const { getPdfName } = require('./utils/fs');\nconst { buildDocTreeFromPdf } = require('./utils/pdf');\nconst { buildKnowl"
  },
  {
    "path": "package.json",
    "chars": 494,
    "preview": "{\n  \"name\": \"pdf-gpt\",\n  \"version\": \"1.0.0\",\n  \"description\": \"\",\n  \"main\": \"index.js\",\n  \"scripts\": {\n    \"load\": \"node"
  },
  {
    "path": "readme.md",
    "chars": 270,
    "preview": "## 如何使用\n1. 执行 `npm install` 或 `tnpm install`;\n2. 下载一个保险条款 PDF，放在 `pdfs/` 这个目录下;\n3. 在 `config.json` 中，配置你的 `apiKey` 以及你的 "
  },
  {
    "path": "scripts/ask.js",
    "chars": 288,
    "preview": "const config = require('../config');\nconst { askQuestion } = require('../index');\n\nconst { pdfName, questions } = config"
  },
  {
    "path": "scripts/load.js",
    "chars": 217,
    "preview": "const config = require('../config');\nconst { loadingPdf } = require('../index');\nconst { getPdfPath } = require('../util"
  },
  {
    "path": "userdict.utf8",
    "chars": 116,
    "preview": "我们\n重度\n轻度\n轻症\n中症\n可选责任\n身故或全残\n身故或全残保险金\n疾病关爱保险金\n恶性肿瘤\n被保险人\n本合同\n疾病扩展保险金\n基本保险金额\n若被保险人\n被保险人\n经我们认可的医院专科医生\n重大疾病\n一种或者多种\n18周岁\n相学长"
  },
  {
    "path": "utils/ai.js",
    "chars": 2859,
    "preview": "const crypto = require('crypto');\nconst { encode } = require('gpt-3-encoder');\n\nconst openai = require('./openai');\ncons"
  },
  {
    "path": "utils/ask.js",
    "chars": 1434,
    "preview": "const { encode } = require('gpt-3-encoder');\nconst ddot = require('@stdlib/blas/base/ddot');\n\nconst { buildQuestionEmbed"
  },
  {
    "path": "utils/content.js",
    "chars": 2689,
    "preview": "const path = require('path');\nconst nodejieba = require('nodejieba');\n\nconst LETTERS =\n  'abcdefghijklmnopqrstuvwxyzABCD"
  },
  {
    "path": "utils/embedding.js",
    "chars": 830,
    "preview": "const { createEmbeddingWithCache } = require('./ai');\nconst { writeKnowledgeEmbeddings } = require('./fs');\n\nasync funct"
  },
  {
    "path": "utils/fs.js",
    "chars": 1899,
    "preview": "const { writeFileSync, readFileSync, existsSync, mkdirSync } = require('fs');\nconst { join } = require('path');\n\nfunctio"
  },
  {
    "path": "utils/openai.js",
    "chars": 226,
    "preview": "const { Configuration, OpenAIApi } = require('openai');\nconst { apiKey } = require('../config');\n\nconst configuration = "
  },
  {
    "path": "utils/pdf.js",
    "chars": 8280,
    "preview": "const pdfjs = require('pdfjs-dist');\nconst { encode } = require('gpt-3-encoder');\nconst {\n  isDiseaseIntro,\n  shortenDis"
  },
  {
    "path": "utils/tree.js",
    "chars": 3651,
    "preview": "const { getSummaryWithCache } = require('./ai');\nconst { writeKnowledge, writeContentTree } = require('./fs');\nconst { s"
  }
]

About this extraction

This page contains the full source code of the wuomzfx/pdfGPT GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 17 files (25.1 KB), approximately 7.7k tokens, and a symbol index with 50 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo