[
  {
    "path": ".gitignore",
    "content": "node_modules/\n.DS_Store\n.vscode/\ncache/files/\npdfs/\nknowledgeFiles/\nanswerFiles/"
  },
  {
    "path": "cache/index.js",
    "content": "const { existsSync, writeFileSync, readFileSync } = require('fs');\nconst { join } = require('path');\n\nconst getPath = name => join(__dirname, `./files/${name}.json`);\n\nconst getJson = path => {\n  // 不存在，返回空对象\n  if (!existsSync(path)) {\n    return {};\n  }\n  // 读文件\n  let string = readFileSync(path).toString();\n  let cacheJson = {};\n\n  try {\n    // 反序列化\n    cacheJson = JSON.parse(string);\n  } catch {}\n\n  return cacheJson;\n};\n\nfunction get(name, key) {\n  const path = getPath(name);\n  const json = getJson(path);\n  return json[key];\n}\n\nfunction set(name, key, value) {\n  const path = getPath(name);\n  const json = getJson(path);\n  json[key] = value;\n  writeFileSync(path, JSON.stringify(json));\n}\n\nmodule.exports = { get, set };\n"
  },
  {
    "path": "config.js",
    "content": "module.exports = {\n  apiKey: 'your api key',\n  pdfName: 'your pdf name,不需要 .pdf 结尾',\n  // pdfName: 'hyb',\n  // pdfName: 'e享护',\n  // pdfName: '达尔文',\n  // pdfName: '微保终身重疾',\n  // 医疗险问题\n  questions: [\n    // '投保年龄限制',\n    // '能无条件续保20年吗',\n    // '每年的保费会变化吗',\n    // '都能报销什么费用',\n    // '门诊看病能报销吗',\n    // '什么情况下不能理赔报销',\n    // '要去哪些医院才能理赔报销',\n    // '什么是等待期',\n    // '什么是犹豫期',\n    // '医保已经报销了，还能继续报销吗',\n    // '最多能报销多少钱',\n    '如果投保年龄填写错误，理赔时会怎么样',\n  ],\n  // 重疾险问题\n  // questions: [\n  //   '投保年龄限制',\n  //   '每年的保费会变化吗',\n  //   '得什么病能获得赔偿',\n  //   '能赔的重大疾病有哪些',\n  //   '最多能理赔多少钱',\n  //   '什么情况下不能理赔',\n  //   '人死了保费能退回吗',\n  //   '要去哪些医院才能理赔',\n  //   '我总共要投多少钱',\n  //   '退保能退钱吗',\n  // ],\n};\n"
  },
  {
    "path": "index.js",
    "content": "const { getPdfName } = require('./utils/fs');\nconst { buildDocTreeFromPdf } = require('./utils/pdf');\nconst { buildKnowledgeFromDocTree } = require('./utils/tree');\nconst { buildKnowledgeEmbeddings } = require('./utils/embedding');\nconst ask = require('./utils/ask');\n\nasync function loadingPdf(pdfPath) {\n  const pdfName = getPdfName(pdfPath);\n  // 构建内容树\n  const docTree = await buildDocTreeFromPdf(pdfPath);\n  // const fs = require('fs');\n  // fs.writeFileSync('./temp.json', JSON.stringify(docTree))\n  // 构建知识库\n  const knowledge = await buildKnowledgeFromDocTree(docTree, pdfName);\n  // // 构建知识库向量\n  await buildKnowledgeEmbeddings(knowledge, pdfName);\n}\n\nasync function askQuestion(question, pdfName) {\n  console.log(`AI 正在努力回答您的问题『${question}』，请稍作等待...\\n`);\n  const answer = await ask(question, pdfName);\n  console.log(`您的问题『${question}』回答如下：\\n==========\\n${answer}\\n==========\\n`);\n  return answer;\n}\n\nmodule.exports = {\n  loadingPdf,\n  askQuestion,\n};\n"
  },
  {
    "path": "package.json",
    "content": "{\n  \"name\": \"pdf-gpt\",\n  \"version\": \"1.0.0\",\n  \"description\": \"\",\n  \"main\": \"index.js\",\n  \"scripts\": {\n    \"load\": \"node ./scripts/load\",\n    \"ask\": \"node ./scripts/ask\",\n    \"test\": \"echo \\\"Error: no test specified\\\" && exit 1\"\n  },\n  \"author\": \"\",\n  \"license\": \"ISC\",\n  \"dependencies\": {\n    \"@stdlib/blas\": \"^0.0.12\",\n    \"gpt-3-encoder\": \"^1.1.4\",\n    \"nodejieba\": \"^2.6.0\",\n    \"openai\": \"^3.1.0\",\n    \"pdfjs-dist\": \"^3.3.122\"\n  },\n  \"repository\": \"https://github.com/wuomzfx/pdfGPT.git\"\n}"
  },
  {
    "path": "readme.md",
    "content": "## 如何使用\n1. 执行 `npm install` 或 `tnpm install`;\n2. 下载一个保险条款 PDF，放在 `pdfs/` 这个目录下;\n3. 在 `config.js` 中，配置你的 `apiKey` 以及你的 PDF 文档名 `pdfName`（不需要 `.pdf` 后缀）;\n4. 针对你的 PDF 文档，修改 `config.js` 中的问题列表 `questions`;\n5. 先执行 `npm run load`，如果异常报错了，可以继续重试（已生成的摘要和向量会缓存在 `cache/files/` 下，重试时不会重复请求）;\n6. 再执行 `npm run ask`;\n7. 最终可以在 `answerFiles/` 目录下看到答案记录"
  },
  {
    "path": "scripts/ask.js",
    "content": "const config = require('../config');\nconst { askQuestion } = require('../index');\n\nconst { pdfName, questions } = config;\n\n(async () => {\n  for (let index = 0; index < questions.length; index++) {\n    const question = questions[index];\n    await askQuestion(question, pdfName);\n  }\n})();\n"
  },
  {
    "path": "scripts/load.js",
    "content": "const config = require('../config');\nconst { loadingPdf } = require('../index');\nconst { getPdfPath } = require('../utils/fs');\n\nconst { pdfName } = config;\n\nconst pdfPath = getPdfPath(pdfName);\n\nloadingPdf(pdfPath);\n"
  },
  {
    "path": "userdict.utf8",
    "content": "我们\n重度\n轻度\n轻症\n中症\n可选责任\n身故或全残\n身故或全残保险金\n疾病关爱保险金\n恶性肿瘤\n被保险人\n本合同\n疾病扩展保险金\n基本保险金额\n若被保险人\n被保险人\n经我们认可的医院专科医生\n重大疾病\n一种或者多种\n18周岁\n相学长"
  },
  {
    "path": "utils/ai.js",
    "content": "const crypto = require('crypto');\nconst { encode } = require('gpt-3-encoder');\n\nconst openai = require('./openai');\nconst cache = require('../cache');\n\nfunction buildHash(content) {\n  return crypto.createHash('md5').update(content).digest('hex');\n}\n\nasync function createCompletion({\n  prompt,\n  max_tokens = 1024,\n  temperature = 0,\n}) {\n  const completion = await openai.createCompletion({\n    model: 'text-davinci-003',\n    prompt,\n    max_tokens,\n    temperature,\n  });\n\n  return strip(completion?.data?.choices?.[0].text, ['\\n']).trim();\n}\n\n// 去头尾指定字符\nconst strip = (str, chars) => {\n  let newStr = str;\n  chars.forEach(char => {\n    newStr = newStr.replace(new RegExp(`^${char}+|${char}+$`, 'g'), '');\n  });\n  return newStr;\n};\n\nconst withCache =\n  (wrappedFn, suffix, getContent) => async (arg, cacheFileName) => {\n    const content = getContent(arg);\n    const cacheName = `${cacheFileName}_${suffix}`;\n    // 文本太长，hash一下\n    const hash = buildHash(content);\n    const cacheValue = cache.get(cacheName, hash);\n    if (cacheValue) {\n      return cacheValue;\n    }\n\n    const rs = await wrappedFn(arg);\n\n    cache.set(cacheName, hash, rs);\n    return rs;\n  };\n\nasync function getSummary({ content, tokenLength }) {\n  const promptContext =\n    content.indexOf('|上文中a:') >= -1\n      ? 
`'''{{content}}'''基于字典翻译并返回内容摘要：`\n      : `'''{{content}}'''基于命名实体识别构建内容摘要：`;\n  const contentTokenLength = tokenLength || encode(content).length;\n  const promptContextTokenLength = encode(promptContext).length;\n\n  const completion = await openai.createCompletion({\n    model: 'text-davinci-003',\n    prompt: promptContext.replace('{{content}}', content),\n    // 1000 ~ 4096，最大也不能超过1000\n    max_tokens: Math.min(\n      4096 - contentTokenLength - promptContextTokenLength,\n      1000,\n    ),\n    temperature: 0,\n  });\n\n  return strip(completion?.data?.choices?.[0].text, ['\\n']);\n}\n\nasync function createEmbedding(input) {\n  const [response] = await Promise.all([\n    openai.createEmbedding({\n      model: 'text-embedding-ada-002',\n      input: input,\n    }),\n    // 向量化很快，休息一下，防止调用超限(默认最多每分钟60次)\n    await sleep(3000),\n  ]);\n\n  return response.data.data[0].embedding;\n}\n\nasync function askInsQuestion({ question, knowledge }) {\n  const prompt = `\n    以下是某保险产品条款的部分\n    '''${knowledge}'''\n    请基于对保险的理解与该部分条款内容，回答如下问题：\n    ${question}。\n    答案：\n    `;\n\n  const promptTokenLength = encode(prompt).length;\n\n  return createCompletion({ prompt, max_tokens: 4096 - promptTokenLength });\n}\n\n// 防止超过每分钟调用限制\nconst sleep = time =>\n  new Promise(resolve => {\n    setTimeout(resolve, time);\n  });\n\nmodule.exports = {\n  sleep,\n  getSummary,\n  getSummaryWithCache: withCache(\n    getSummary,\n    'summary',\n    ({ content }) => content,\n  ),\n  createEmbeddingWithCache: withCache(\n    createEmbedding,\n    'embedding',\n    input => input,\n  ),\n  askInsQuestion,\n  createCompletion,\n};\n"
  },
  {
    "path": "utils/ask.js",
    "content": "const { encode } = require('gpt-3-encoder');\nconst ddot = require('@stdlib/blas/base/ddot');\n\nconst { buildQuestionEmbedding } = require('./embedding');\nconst { readKnowledgeEmbeddings, readKnowledge, writeAnswer } = require('./fs');\nconst { askInsQuestion } = require('./ai');\n\nfunction getKnowledge({\n  questionEmbedding,\n  knowledgeEmbeddings,\n  knowledgeList,\n}) {\n  const kList = knowledgeEmbeddings\n    .map((knowledge, index) => {\n      const x = new Float64Array(questionEmbedding);\n      const y = new Float64Array(knowledge);\n      return {\n        index,\n        ddot: ddot(x.length, x, 1, y, 1),\n        knowledge: knowledgeList[index],\n      };\n    })\n    .sort((a, b) => b.ddot - a.ddot)\n    .filter(k => k.ddot > 0.8);\n\n  let tokens = 0;\n  const enoughTokenList = kList.filter(k => {\n    tokens += encode(k.knowledge).length;\n    return tokens < 3000;\n  });\n\n  return enoughTokenList.map(({ knowledge }) => knowledge).join('\\n');\n}\n\nasync function ask(question, pdfName) {\n  const questionEmbedding = await buildQuestionEmbedding(question, pdfName);\n  const knowledgeEmbeddings = readKnowledgeEmbeddings(pdfName);\n  const knowledgeList = readKnowledge(pdfName);\n\n  const knowledge = getKnowledge({\n    questionEmbedding,\n    knowledgeEmbeddings,\n    knowledgeList,\n  });\n  const answer = await askInsQuestion({ question, knowledge });\n  writeAnswer(pdfName, question, answer);\n  return answer;\n}\n\nmodule.exports = ask;\n"
  },
  {
    "path": "utils/content.js",
    "content": "const path = require('path');\nconst nodejieba = require('nodejieba');\n\nconst LETTERS =\n  'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZαβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'.split('');\n\nnodejieba.load({\n  userDict: path.join(__dirname, '../userdict.utf8'),\n});\n\n// 判断是否是疾病介绍\nfunction isDiseaseIntro(tokenLength, joinedContent) {\n  // 比较短就不处理了\n  if (tokenLength < 2000) {\n    return false;\n  }\n  // 粗暴的简单判断\n  return !!['重大疾病', '中症疾病', '轻症疾病', '特定心脑血管疾病'].find(\n    disease => joinedContent.indexOf(disease) === 0,\n  );\n}\n\n// 疾病介绍的信息太长了，需要阉割一下，舍弃疾病介绍详情\nfunction shortenDiseaseIntro(content) {\n  const titleRegExp = /(?=（[0-9]+）)/g;\n  const sections = content.split(titleRegExp).map(section => {\n    if (titleRegExp.test(section)) {\n      const [title, ..._] = section.split(' ');\n      return title;\n    }\n    return section;\n  });\n  return sections.join('');\n}\n\nfunction shortenByDictionary(originContent, words, should) {\n  let shortContent = originContent;\n  const dictionary = [];\n  const wordsCounts = words.reduce((acc, cur) => {\n    acc[cur] = (acc[cur] || 0) + 1;\n    return acc;\n  }, {});\n\n  Object.keys(wordsCounts).forEach(word => {\n    if (should(wordsCounts[word], word.length)) {\n      dictionary.push(word);\n      shortContent = shortContent.replaceAll(\n        word,\n        `${LETTERS[dictionary.length - 1]}`,\n      );\n    }\n  });\n  shortContent = `${shortContent}|上文中，${dictionary.map(\n    (word, index) => `${LETTERS[index]}:${word}`,\n  )}`;\n  return shortContent;\n}\n\nfunction shortenTableContent(tableContent) {\n  const words = tableContent.split(' ');\n  return shortenByDictionary(\n    tableContent,\n    words,\n    (counts, length) => counts > 3 && length > 3,\n  );\n}\n\nfunction shortenSectionContent(sectionContent) {\n  const longContent = sectionContent\n    // 去无不需要文案\n    .replaceAll('（见释义）', '')\n    // 减少字符\n    .replaceAll('——', '—')\n    // 全角半角化\n    .replaceAll('（', 
'(')\n    .replaceAll('）', ')')\n    .replaceAll('：', ':')\n    .replaceAll('；', ';')\n    .replaceAll('、', '|')\n    .replaceAll('，', ',')\n    .replaceAll('。', '.')\n    .replaceAll('“', `'`)\n    .replaceAll('”', `'`)\n    // 去无意义空格\n    .replaceAll('. ', '.')\n    .replaceAll(` '`, `'`)\n    .replaceAll('; ', ';');\n  const words = nodejieba.cut(longContent);\n  return shortenByDictionary(\n    longContent,\n    words,\n    (counts, length) => counts > 4 && length > 1,\n  );\n}\n\nfunction shortenContent(longContent) {\n  if (longContent.split(' ').length > 100) {\n    return shortenTableContent(longContent);\n  }\n  return shortenSectionContent(longContent);\n}\n\nmodule.exports = {\n  isDiseaseIntro,\n  shortenDiseaseIntro,\n  shortenContent,\n  shortenTableContent,\n  shortenSectionContent,\n};\n"
  },
  {
    "path": "utils/embedding.js",
    "content": "const { createEmbeddingWithCache } = require('./ai');\nconst { writeKnowledgeEmbeddings } = require('./fs');\n\nasync function buildKnowledgeEmbeddings(knowledge, pdfName) {\n  const embeddings = [];\n  for (let index = 0; index < knowledge.length; index++) {\n    if (!embeddings[index]) {\n      const embedding = await createEmbeddingWithCache(knowledge[index], pdfName);\n      embeddings[index] = embedding;\n      console.log('createEmbedding success', index);\n    }\n  }\n  writeKnowledgeEmbeddings(pdfName, embeddings);\n  return embeddings;\n}\n\nasync function buildQuestionEmbedding(question, pdfName) {\n  const embedding = await createEmbeddingWithCache(question, pdfName);\n  // console.log('createQuestionEmbedding success:', question);\n\n  return embedding;\n}\n\nmodule.exports = { buildKnowledgeEmbeddings, buildQuestionEmbedding };\n"
  },
  {
    "path": "utils/fs.js",
    "content": "const { writeFileSync, readFileSync, existsSync, mkdirSync } = require('fs');\nconst { join } = require('path');\n\nfunction readJsonFile(path) {\n  try {\n    const string = readFileSync(path).toString();\n    return JSON.parse(string);\n  } catch {\n    return {};\n  }\n}\n\nfunction getPdfName(pdfPath) {\n  return pdfPath.split('/').pop().split('.pdf')[0];\n}\n\nfunction getPath(pdfName, fileName) {\n  const relativeDirPath = `../knowledgeFiles/${pdfName}`;\n  const dirPath = join(__dirname, relativeDirPath);\n  // 文件夹初始化\n  if (!existsSync(dirPath)) {\n    mkdirSync(dirPath);\n  }\n  return join(__dirname, `${relativeDirPath}/${fileName}.json`);\n}\n\nfunction writeContentTree(pdfName, docTree) {\n  writeFileSync(getPath(pdfName, 'contentTree'), JSON.stringify(docTree));\n}\n\nfunction writeKnowledge(pdfName, knowledge) {\n  writeFileSync(getPath(pdfName, 'knowledge'), JSON.stringify(knowledge));\n}\n\nfunction readKnowledge(pdfName) {\n  return readJsonFile(getPath(pdfName, 'knowledge'));\n}\n\nfunction writeKnowledgeEmbeddings(pdfName, embeddings) {\n  writeFileSync(\n    getPath(pdfName, 'knowledgeEmbeddings'),\n    JSON.stringify(embeddings),\n  );\n}\n\nfunction readKnowledgeEmbeddings(pdfName) {\n  return readJsonFile(getPath(pdfName, 'knowledgeEmbeddings'));\n}\n\nfunction getPdfPath(pdfName) {\n  return join(__dirname, `../pdfs/${pdfName}.pdf`);\n}\n\nfunction writeAnswer(pdfName, question, answer) {\n  const answerPath = join(__dirname, `../answerFiles/${pdfName}_answers.json`);\n  if (!existsSync(answerPath)) {\n    writeFileSync(answerPath, JSON.stringify({ [question]: answer }));\n    return;\n  }\n\n  const answerJson = readJsonFile(answerPath);\n  answerJson[question] = answer;\n  writeFileSync(answerPath, JSON.stringify(answerJson));\n}\n\nmodule.exports = {\n  getPdfPath,\n  getPdfName,\n  writeAnswer,\n  readKnowledge,\n  writeKnowledge,\n  writeContentTree,\n  writeKnowledgeEmbeddings,\n  readKnowledgeEmbeddings,\n};\n"
  },
  {
    "path": "utils/openai.js",
    "content": "const { Configuration, OpenAIApi } = require('openai');\nconst { apiKey } = require('../config');\n\nconst configuration = new Configuration({\n  apiKey,\n});\n\nconst openai = new OpenAIApi(configuration);\n\nmodule.exports = openai;\n"
  },
  {
    "path": "utils/pdf.js",
    "content": "const pdfjs = require('pdfjs-dist');\nconst { encode } = require('gpt-3-encoder');\nconst {\n  isDiseaseIntro,\n  shortenDiseaseIntro,\n  shortenContent,\n} = require('./content');\n\n// 封面\nconst PAGE_TYPE_COVER = 0;\n// 目录\nconst PAGE_TYPE_CATALOG = 1;\n// 正文\nconst PAGE_TYPE_MAIN = 2;\n\nconst TITLE_SPLIT = '__TITLE__';\nconst QUOTE_SPLIT = '__QUOTE__';\nconst REF_SPLIT = '__REF__';\n\nfunction buildDocTree(longStr) {\n  const [, ...sections] = longStr.split(TITLE_SPLIT); // 将字符串划分成 section 数组\n\n  const treeNodes = sections\n    .map(section => {\n      let [titleNo, ...content] = section.split(' ');\n      if (titleNo.endsWith('.')) {\n        titleNo = titleNo.slice(0, -1);\n      }\n\n      const matchedTitleNo = titleNo.match(/^\\d+(\\.\\d*)*\\.?/)?.[0];\n\n      let joinedContent = content.join(' ');\n\n      // 说明标题中有非纯数字标题的内容，把这部分内容拼接到正文中\n      if (matchedTitleNo !== titleNo) {\n        const titleContent = titleNo.replace(/^\\d+(\\.\\d*)*\\.?/, '');\n        joinedContent = titleContent + ' ' + joinedContent;\n      }\n\n      let tokenLength = encode(joinedContent).length;\n\n      // 疾病介绍内容特别长，可以阉割掉具体疾病的详细信息\n      if (isDiseaseIntro(tokenLength, joinedContent)) {\n        joinedContent = shortenDiseaseIntro(joinedContent);\n      } else if (tokenLength > 4000) {\n        // 不是疾病介绍也特别长的，采用字典压缩法压缩\n        joinedContent = shortenContent(joinedContent);\n      }\n\n      tokenLength = encode(joinedContent).length;\n\n      return {\n        titleNo: matchedTitleNo || titleNo,\n        content: joinedContent,\n        children: [],\n        refs: [],\n        tokenLength,\n      };\n    })\n    // .map(node => {\n    //   const { content } = node;\n\n    //   if (content.indexOf(QUOTE_SPLIT)) {\n    //     const regex = /__QUOTE__([0-9.]+)/g;\n    //     let match;\n    //     while ((match = regex.exec(content)) !== null) {\n    //       node.refs.push(match[1]);\n    //     }\n    //     node.content = node.content\n    //       
.replace(regex, '')\n    //       .replace(/第\\s*\\d+\\s*页\\s*共\\d+页/g, '');\n    //     return node;\n    //   }\n    // });\n  return treeNodes;\n}\n\nfunction isCatalogPage({ items }) {\n  const pageContent = items.map(i => i.str).join('');\n  if (pageContent.indexOf('条款目录') > -1) {\n    return true;\n  }\n  if (pageContent.split(/(?=\\d+.\\d+)/).length > 10) {\n    return true;\n  }\n}\n\n// 将注释内容拼接到正文中\nfunction moveNoteToMain(items) {\n  const { mainFontHeight, titlePositionX, pageNumberPositionY } =\n    getPageMetaData(items);\n\n  const isRefTitle = item =>\n    Math.abs(item.transform[4] - titlePositionX) < 2 &&\n    item.height / mainFontHeight < 0.7;\n\n  const refSplitIndex = items.findIndex(isRefTitle);\n\n  if (refSplitIndex < 0) {\n    return items;\n  }\n\n  // 正文\n  const mainItems = items.slice(0, refSplitIndex);\n  // 注释\n  items\n    .slice(refSplitIndex)\n    .map(refItem => {\n      if (isRefTitle(refItem)) {\n        refItem.str = `${REF_SPLIT}${refItem.str.trim()} `;\n      }\n      return refItem.str;\n    })\n    .join('')\n    .split(REF_SPLIT)\n    .forEach(refContent => {\n      const [refNo, ...content] = refContent.split(' ');\n      if (refNo && content.length) {\n        const mainItem = mainItems.find(i => i.str.trim() === refNo);\n\n        if (!mainItem) {\n          return;\n        }\n        mainItem.str = `[${content.join('')}]`;\n      }\n    });\n  return mainItems;\n}\n\nasync function getPdfItems(pdfPath) {\n  const pdfItems = [];\n  let pageType = PAGE_TYPE_CATALOG;\n  await pdfjs.getDocument(pdfPath).promise.then(doc => {\n    const numPages = doc.numPages;\n    let lastPromise = doc.getMetadata();\n\n    const loadPage = function (pageNum) {\n      return doc.getPage(pageNum).then(page => {\n        return page\n          .getTextContent({\n            disableCombineTextItems: true,\n            // includeMarkedContent: true,\n          })\n          .then(pageData => {\n            // 如果之前是封面，当前页已经是目录页了，状态改为目录页\n     
       if (pageType === PAGE_TYPE_COVER && isCatalogPage(pageData)) {\n              pageType = PAGE_TYPE_CATALOG;\n            }\n            // 如果之前是目录页，当前页已经不是目录页，状态改为正文页\n            if (pageType === PAGE_TYPE_CATALOG && !isCatalogPage(pageData)) {\n              pageType = PAGE_TYPE_MAIN;\n            }\n            // 从正文开始，push内容\n            if (pageType === PAGE_TYPE_MAIN) {\n              const contentItems = pageData.items.map(i => ({ ...i, pageNum }));\n              pdfItems.push(...moveNoteToMain(contentItems));\n            }\n            page.cleanup();\n          });\n      });\n    };\n    // Loading of the first page will wait on metadata and subsequent loadings\n    // will wait on the previous pages.\n    for (let i = 1; i <= numPages; i++) {\n      lastPromise = lastPromise.then(() => loadPage(i));\n    }\n    return lastPromise;\n  });\n  return pdfItems;\n}\n\nconst isTitleNo = (items, itemIndex) => {\n  const item = items[itemIndex];\n  const nextItem = items[itemIndex + 1];\n\n  const { str: itemContent } = item;\n  // 一般来说，太长字符的肯定不是标题，减少后续的正则校验开销\n  if (itemContent.length > 20) {\n    return false;\n  }\n\n  if (nextItem && nextItem.str.trim() === '页') {\n    return item;\n  }\n\n  return /^\\d+(\\.\\d*)*\\.?/.test(itemContent.trim());\n  // return /^\\d+(\\.\\d*)*\\.?$/.test(itemContent.trim());\n};\n\nfunction getPageMetaData(items) {\n  const fontHeightCountMap = {};\n  const numberPositionXCountMap = {};\n  let minPositionY = Infinity;\n\n  items.forEach((cur, index) => {\n    const { height, transform } = cur;\n    const positionX = transform[4];\n    const positionY = transform[5];\n    if (!height || !transform) {\n      console.log(cur);\n    }\n    const isTitle = isTitleNo(items, index);\n\n    fontHeightCountMap[height] = (fontHeightCountMap[height] || 0) + 1;\n\n    minPositionY = Math.min(minPositionY, positionY);\n\n    if (isTitle) {\n      numberPositionXCountMap[positionX] =\n        (numberPositionXCountMap[positionX] || 
0) + 1;\n    }\n  }, {});\n\n  const sortedHeights = Object.keys(fontHeightCountMap)\n    .map(height => {\n      return {\n        height: Number(height),\n        counts: fontHeightCountMap[height],\n      };\n    })\n    .sort((a, b) => b.counts - a.counts);\n\n  const sortedPositionXs = Object.keys(numberPositionXCountMap)\n    .map(positionX => {\n      return {\n        positionX: Number(positionX),\n        counts: numberPositionXCountMap[positionX],\n      };\n    })\n    .filter(i => i.positionX < 100)\n    .sort((a, b) => b.counts - a.counts);\n  // 处于较左端的，否则会被有些列表项的数字污染\n\n  return {\n    // 使用最多的字体大小，有理由相信，它就是正文字体大小\n    mainFontHeight: sortedHeights[0].height,\n    // 即是数字，又是持续在一个x坐标体现的，有理由相信，它就是标题数字\n    titlePositionX: sortedPositionXs?.[0]?.positionX,\n    // 最靠底的部分，有理想相信，它是页码的位置。但要小于60，否则是无页码的PDF\n    pageNumberPositionY: minPositionY < 60 ? minPositionY : undefined,\n  };\n}\n\nfunction rebuildPdfItems(items) {\n  const { titlePositionX, pageNumberPositionY, mainFontHeight } =\n    getPageMetaData(items);\n  return items\n    .map((item, index) => {\n      const { height: currentHeight, str: itemContent, transform } = item;\n      const nextItem = items[index + 1];\n      const prevItem = items[index - 1];\n      const positionX = transform[4];\n      const positionY = transform[5];\n\n      // 页码数据不需要\n      if (pageNumberPositionY === positionY) {\n        return null;\n      }\n\n      if (itemContent.startsWith('附表')) {\n        item.str = `${TITLE_SPLIT}${itemContent.trim()}`;\n        return item;\n      }\n\n      if (!isTitleNo(items, index)) {\n        return item;\n      }\n\n      // 大标题，允许一定误差\n      if (Math.abs(positionX - titlePositionX) < 2) {\n        item.str = `${TITLE_SPLIT}${itemContent.trim()}`;\n        return item;\n      }\n\n      // const prevHeight = prevItem?.height;\n      // const nextHeight = nextItem.height;\n\n      // 引用注释\n      // if (\n      //   prevItem &&\n      //   currentHeight < prevHeight &&\n      //  
 currentHeight < nextHeight\n      // ) {\n      //   item.str = `${QUOTE_SPLIT}${itemContent}`;\n      //   return item;\n      // }\n\n      return item;\n    })\n    .filter(Boolean);\n}\n\nasync function buildDocTreeFromPdf(pdfPath) {\n  const items = await getPdfItems(pdfPath);\n  const itemsWithTreeInfo = rebuildPdfItems(items);\n  // const fs = require('fs');\n  // console.log('===');\n  // fs.writeFileSync('./tempItems.json', JSON.stringify(itemsWithTreeInfo));\n  return buildDocTree(itemsWithTreeInfo.map(i => i.str).join(''));\n}\n\nmodule.exports = {\n  buildDocTreeFromPdf,\n  getPdfItems,\n  rebuildPdfItems,\n};\n"
  },
  {
    "path": "utils/tree.js",
    "content": "const { getSummaryWithCache } = require('./ai');\nconst { writeKnowledge, writeContentTree } = require('./fs');\nconst { shortenContent } = require('./content');\nconst { encode } = require('gpt-3-encoder');\n\nfunction getParentNo(titleNo) {\n  const parentNo = titleNo.split('.').slice(0, -1).join('.');\n  return parentNo;\n}\n\n// 构建嵌套树\nfunction toNestTree(flattenTree) {\n  const tree = [];\n  // 构建一个节点 map\n  const nodesMap = flattenTree.reduce((acc, cur) => {\n    acc[cur.titleNo] = cur;\n    return acc;\n  }, {});\n\n  function updateParentTokenLength(node, tokenLength) {\n    const parentNo = getParentNo(node.titleNo);\n    if (parentNo && nodesMap[parentNo]) {\n      const parentNode = nodesMap[parentNo];\n      // 增加父节点的内容长度\n      parentNode.allTokenLength =\n        (parentNode.allTokenLength || 0) + tokenLength;\n      // 递归累加\n      updateParentTokenLength(parentNode, tokenLength);\n    }\n  }\n\n  // 构建嵌套节点树，并计算每个节点涵盖的内容字符串总长度\n  flattenTree.forEach(node => {\n    // 更新相关节点的token长度\n    const { tokenLength, summaryTokenLength } = node;\n    const currentTokenLength = summaryTokenLength || tokenLength;\n    // 用自己节点的内容初始化自身内容长度\n    // 初始时可能已经被自己的子节点初始化过了，因此是累加\n    node.allTokenLength = (node.allTokenLength || 0) + currentTokenLength;\n    updateParentTokenLength(node, currentTokenLength);\n\n    const parentNo = getParentNo(node.titleNo);\n    // 把节点插入到父节点中\n    if (parentNo && nodesMap[parentNo]) {\n      const parentNode = nodesMap[parentNo];\n      parentNode.children.push(node);\n    } else {\n      tree.push(node);\n    }\n  });\n\n  return tree;\n}\n\n// 文本节点tokens大于1000的，重构为摘要\nasync function rebuildTreeWithAISummary(docTree, pdfName) {\n  for (let index = 0; index < docTree.length; index++) {\n    const node = docTree[index];\n\n    if (node.tokenLength > 1000 && !node.summary) {\n      // 实在特别长的，再压缩一下\n      // const { content, tokenLength } =\n      //   node.tokenLength < 3600\n      //     ? 
node\n      //     : {\n      //         content: shortenContent(node.content),\n      //       };\n\n      const { content, tokenLength } = node;\n      node.summary = await getSummaryWithCache(\n        { content, tokenLength },\n        pdfName,\n      );\n      console.log('build summary success', node.titleNo);\n    }\n\n    if (node.summary && !node.summaryTokenLength) {\n      node.summaryTokenLength = encode(node.summary).length;\n    }\n  }\n  return docTree;\n}\n\n// 构建嵌套内容树，并将过长子节点做摘要优化，减少节点内容\nasync function buildNestTreeWithAISummary(docTree, pdfName) {\n  const tree = await rebuildTreeWithAISummary(docTree, pdfName);\n  const nestTree = toNestTree(tree);\n\n  // 写入文件\n  writeContentTree(pdfName, nestTree);\n  return nestTree;\n}\n\n// 将多段内容合并为一段\nfunction unionContent(node) {\n  let content = `第${node.titleNo}节内容:` + (node.summary || node.content);\n\n  node.children.forEach(child => {\n    content = content + '|' + unionContent(child);\n  });\n\n  return content;\n}\n\n// 将嵌套树递归构建为打平的内容段落\nfunction buildContents(nodes, contents) {\n  const newContents = contents || [];\n  for (let index = 0; index < nodes.length; index++) {\n    const node = nodes[index];\n    if (node.allTokenLength > 3000) {\n      buildContents(node.children, newContents);\n    } else {\n      const content = unionContent(node);\n      newContents.push(content);\n    }\n  }\n  return newContents;\n}\n\n// 构建知识库\nasync function buildKnowledgeFromDocTree(docTree, pdfName) {\n  const nestTree = await buildNestTreeWithAISummary(docTree, pdfName);\n  // const fs = require('fs');\n  // fs.writeFileSync('./tempNestTree.json', JSON.stringify(nestTree));\n  const knowledge = buildContents(nestTree);\n  // 写入文件\n  writeKnowledge(pdfName, knowledge);\n  return knowledge;\n}\n\nmodule.exports = { buildKnowledgeFromDocTree };\n"
  }
]