Repository: wuomzfx/pdfGPT
Branch: main
Commit: 09a53a0a2e3b
Files: 17
Total size: 25.1 KB

Directory structure:
gitextract_vsirs4ek/

├── .gitignore
├── cache/
│   └── index.js
├── config.js
├── index.js
├── package.json
├── readme.md
├── scripts/
│   ├── ask.js
│   └── load.js
├── userdict.utf8
└── utils/
    ├── ai.js
    ├── ask.js
    ├── content.js
    ├── embedding.js
    ├── fs.js
    ├── openai.js
    ├── pdf.js
    └── tree.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
node_modules/
.DS_Store
.vscode/
cache/files/
pdfs/
knowledgeFiles/
answerFiles/

================================================
FILE: cache/index.js
================================================
const { existsSync, mkdirSync, readFileSync, writeFileSync } = require('fs');
const { join } = require('path');

// Resolve the JSON file that backs the cache bucket called `name`.
const getPath = name => {
  const fileName = `${name}.json`;
  return join(__dirname, `./files/${fileName}`);
};

/**
 * Load the cache object stored at `path`.
 *
 * Best-effort by design: a missing file or a file that does not contain
 * valid JSON both yield an empty object instead of an error.
 *
 * @param {string} path - Absolute path of the cache file.
 * @returns {object} The parsed cache content, or `{}`.
 */
const getJson = path => {
  if (existsSync(path)) {
    const raw = readFileSync(path).toString();
    try {
      return JSON.parse(raw);
    } catch {
      // Corrupt cache file: fall through and behave as if it were empty.
    }
  }
  return {};
};

/**
 * Look up `key` in the cache bucket `name`.
 *
 * @param {string} name - Cache bucket (file) name.
 * @param {string} key - Entry key inside the bucket.
 * @returns {*} The stored value, or `undefined` when there is none.
 */
function get(name, key) {
  return getJson(getPath(name))[key];
}

/**
 * Store `value` under `key` in the cache bucket `name` and persist the
 * whole bucket to disk.
 *
 * @param {string} name - Cache bucket (file) name.
 * @param {string} key - Entry key inside the bucket.
 * @param {*} value - JSON-serialisable value to store.
 */
function set(name, key, value) {
  const path = getPath(name);
  // The cache directory is git-ignored, so it does not exist on a fresh
  // checkout; create it on demand instead of failing with ENOENT.
  mkdirSync(join(path, '..'), { recursive: true });
  const json = getJson(path);
  json[key] = value;
  writeFileSync(path, JSON.stringify(json));
}

module.exports = { get, set };


================================================
FILE: config.js
================================================
// Project configuration: OpenAI credentials, the PDF to process and the
// questions to ask about it.
module.exports = {
  // API key handed to the OpenAI client.
  apiKey: 'your api key',
  // Name of the PDF to process, without the `.pdf` extension (the extension
  // is appended when the path is built).
  pdfName: 'your pdf name,不需要 .pdf 结尾',
  // pdfName: 'hyb',
  // pdfName: 'e享护',
  // pdfName: '达尔文',
  // pdfName: '微保终身重疾',
  // Question set for a medical-insurance policy; entries that are commented
  // out are skipped.
  questions: [
    // '投保年龄限制',
    // '能无条件续保20年吗',
    // '每年的保费会变化吗',
    // '都能报销什么费用',
    // '门诊看病能报销吗',
    // '什么情况下不能理赔报销',
    // '要去哪些医院才能理赔报销',
    // '什么是等待期',
    // '什么是犹豫期',
    // '医保已经报销了，还能继续报销吗',
    // '最多能报销多少钱',
    '如果投保年龄填写错误，理赔时会怎么样',
  ],
  // Alternative question set for a critical-illness policy; swap it with the
  // list above to use it.
  // questions: [
  //   '投保年龄限制',
  //   '每年的保费会变化吗',
  //   '得什么病能获得赔偿',
  //   '能赔的重大疾病有哪些',
  //   '最多能理赔多少钱',
  //   '什么情况下不能理赔',
  //   '人死了保费能退回吗',
  //   '要去哪些医院才能理赔',
  //   '我总共要投多少钱',
  //   '退保能退钱吗',
  // ],
};


================================================
FILE: index.js
================================================
const { getPdfName } = require('./utils/fs');
const { buildDocTreeFromPdf } = require('./utils/pdf');
const { buildKnowledgeFromDocTree } = require('./utils/tree');
const { buildKnowledgeEmbeddings } = require('./utils/embedding');
const ask = require('./utils/ask');

/**
 * Turn a PDF into a searchable knowledge base: parse it into a section tree,
 * flatten the tree into knowledge paragraphs, then embed every paragraph.
 *
 * @param {string} pdfPath - Path of the PDF file to load.
 * @returns {Promise<void>}
 */
async function loadingPdf(pdfPath) {
  const name = getPdfName(pdfPath);
  // 1. section tree extracted from the PDF text
  const tree = await buildDocTreeFromPdf(pdfPath);
  // 2. knowledge paragraphs derived from the tree
  const paragraphs = await buildKnowledgeFromDocTree(tree, name);
  // 3. one embedding vector per paragraph
  await buildKnowledgeEmbeddings(paragraphs, name);
}

/**
 * Answer one question about an already loaded PDF, reporting progress and
 * the final answer on the console.
 *
 * @param {string} question - The question to answer.
 * @param {string} pdfName - Name of the loaded PDF (without extension).
 * @returns {Promise<string>} The generated answer.
 */
async function askQuestion(question, pdfName) {
  const waitingNotice = `AI 正在努力回答您的问题『${question}』，请稍作等待...\n`;
  console.log(waitingNotice);

  const answer = await ask(question, pdfName);

  const report = `您的问题『${question}』回答如下：\n==========\n${answer}\n==========\n`;
  console.log(report);
  return answer;
}

module.exports = {
  loadingPdf,
  askQuestion,
};


================================================
FILE: package.json
================================================
{
  "name": "pdf-gpt",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "load": "node ./scripts/load",
    "ask": "node ./scripts/ask",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@stdlib/blas": "^0.0.12",
    "gpt-3-encoder": "^1.1.4",
    "nodejieba": "^2.6.0",
    "openai": "^3.1.0",
    "pdfjs-dist": "^3.3.122"
  },
  "repository": "https://github.com/wuomzfx/pdfGPT.git"
}

================================================
FILE: readme.md
================================================
## 如何使用
1. 执行 `npm install` 或 `tnpm install`;
2. 下载一个保险条款 PDF，放在 `pdfs/` 这个目录下;
3. 在 `config.js` 中，配置你的 `apiKey` 以及你的 PDF 文档名;
4. 针对你的 PDF 文档，修改 `config.js` 中问题 `questions`;
5. 先执行 `npm run load`，如果异常报错了，可以继续重试;
6. 再执行 `npm run ask`;
7. 最终可以在 answerFiles 文件目录下看到答案记录

================================================
FILE: scripts/ask.js
================================================
const config = require('../config');
const { askQuestion } = require('../index');

const { pdfName, questions } = config;

// Ask the configured questions strictly one after another so the console
// output of consecutive questions does not interleave.
(async () => {
  for (const question of questions) {
    await askQuestion(question, pdfName);
  }
})();


================================================
FILE: scripts/load.js
================================================
const config = require('../config');
const { loadingPdf } = require('../index');
const { getPdfPath } = require('../utils/fs');

// Locate the configured PDF and build its knowledge base.
const pdfPath = getPdfPath(config.pdfName);

loadingPdf(pdfPath);


================================================
FILE: userdict.utf8
================================================
我们
重度
轻度
轻症
中症
可选责任
身故或全残
身故或全残保险金
疾病关爱保险金
恶性肿瘤
被保险人
本合同
疾病扩展保险金
基本保险金额
若被保险人
被保险人
经我们认可的医院专科医生
重大疾病
一种或者多种
18周岁
相学长

================================================
FILE: utils/ai.js
================================================
const crypto = require('crypto');
const { encode } = require('gpt-3-encoder');

const openai = require('./openai');
const cache = require('../cache');

/**
 * Compute the hex-encoded MD5 digest of `content`.
 *
 * @param {string} content - Text to hash.
 * @returns {string} 32-character lowercase hex digest.
 */
function buildHash(content) {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}

/**
 * Run a `text-davinci-003` completion and return its trimmed text.
 *
 * @param {object} params
 * @param {string} params.prompt - Prompt sent to the model.
 * @param {number} [params.max_tokens=1024] - Completion token limit.
 * @param {number} [params.temperature=0] - Sampling temperature.
 * @returns {Promise<string>} The first choice's text without surrounding
 *   newlines/whitespace, or `''` when the response carries no text.
 */
async function createCompletion({
  prompt,
  max_tokens = 1024,
  temperature = 0,
}) {
  const completion = await openai.createCompletion({
    model: 'text-davinci-003',
    prompt,
    max_tokens,
    temperature,
  });

  // `?.` is needed after `[0]` as well: an empty `choices` array would
  // otherwise throw, and a missing text must not reach `strip` as undefined.
  const text = completion?.data?.choices?.[0]?.text ?? '';
  return strip(text, ['\n']).trim();
}

/**
 * Remove every leading and trailing run of the given characters.
 *
 * @param {string} [str] - Text to clean; `null`/`undefined` is treated as ''.
 * @param {string[]} chars - Characters (or short strings) to strip. They are
 *   processed one after another, in order.
 * @returns {string} The stripped text.
 */
const strip = (str, chars) => {
  // Escape regex metacharacters so that e.g. '.' or '*' are matched literally
  // instead of being interpreted as part of the pattern.
  const escapeRegExp = text => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  return chars.reduce((current, char) => {
    if (!char) {
      return current;
    }
    const unit = `(?:${escapeRegExp(char)})`;
    return current.replace(new RegExp(`^${unit}+|${unit}+$`, 'g'), '');
  }, str ?? '');
};

/**
 * Wrap an async function with a file-backed cache.
 *
 * @param {Function} wrappedFn - Async function taking a single argument.
 * @param {string} suffix - Appended to the cache file name to separate
 *   buckets of different wrapped functions.
 * @param {Function} getContent - Extracts from the argument the text the
 *   cache key is derived from.
 * @returns {Function} `(arg, cacheFileName) => Promise<result>`.
 */
const withCache = (wrappedFn, suffix, getContent) => {
  const cachedCall = async (arg, cacheFileName) => {
    const content = getContent(arg);
    const bucket = `${cacheFileName}_${suffix}`;
    // The content can be very long, so its hash is used as the key.
    const key = buildHash(content);

    const hit = cache.get(bucket, key);
    if (!hit) {
      const fresh = await wrappedFn(arg);
      cache.set(bucket, key, fresh);
      return fresh;
    }
    return hit;
  };
  return cachedCall;
};

/**
 * Ask the completion model for a summary of one content section.
 *
 * @param {object} params
 * @param {string} params.content - Section text, optionally compressed with
 *   a letter dictionary.
 * @param {number} [params.tokenLength] - Token count of `content`; it is
 *   computed with `encode` when omitted (or 0).
 * @returns {Promise<string>} The summary without surrounding newlines, or ''
 *   when the response carries no text.
 */
async function getSummary({ content, tokenLength }) {
  // Dictionary-compressed content ends with a legend introduced by the
  // `|上文中，` marker followed by `a:<word>`. The former test
  // (`indexOf(...) >= -1`) was always true, so the second prompt was never
  // used; the comma is optional here to accept both spellings of the marker.
  const isDictionaryCompressed = /\|上文中，?a:/.test(content);
  const promptContext = isDictionaryCompressed
    ? `'''{{content}}'''基于字典翻译并返回内容摘要：`
    : `'''{{content}}'''基于命名实体识别构建内容摘要：`;
  const contentTokenLength = tokenLength || encode(content).length;
  const promptContextTokenLength = encode(promptContext).length;

  const completion = await openai.createCompletion({
    model: 'text-davinci-003',
    // A replacer function keeps `$&`, `$1`, ... inside the content literal.
    prompt: promptContext.replace('{{content}}', () => content),
    // Whatever is left of the 4096-token budget, but never more than 1000.
    max_tokens: Math.min(
      4096 - contentTokenLength - promptContextTokenLength,
      1000,
    ),
    temperature: 0,
  });

  return strip(completion?.data?.choices?.[0]?.text ?? '', ['\n']);
}

/**
 * Create an `text-embedding-ada-002` embedding for `input`.
 *
 * A successful call takes at least three seconds: the request runs in
 * parallel with a pause that keeps the caller below the API rate limit.
 *
 * @param {string} input - Text to embed.
 * @returns {Promise<number[]>} The embedding vector of the first result.
 */
async function createEmbedding(input) {
  // The pause must be passed to Promise.all as a promise. Awaiting it inside
  // the array literal would delay Promise.all itself, leaving the request
  // promise without a rejection handler while the pause is running.
  const [response] = await Promise.all([
    openai.createEmbedding({
      model: 'text-embedding-ada-002',
      input,
    }),
    sleep(3000),
  ]);

  return response.data.data[0].embedding;
}

/**
 * Answer an insurance question from the given policy excerpt.
 *
 * @param {object} params
 * @param {string} params.question - The user's question.
 * @param {string} params.knowledge - Policy text used as context.
 * @returns {Promise<string>} The model's answer.
 */
async function askInsQuestion({ question, knowledge }) {
  const prompt = `
    以下是某保险产品条款的部分
    '''${knowledge}'''
    请基于对保险的理解与该部分条款内容，回答如下问题：
    ${question}。
    答案：
    `;

  // Give the completion whatever the prompt leaves of the 4096-token budget.
  const remainingTokens = 4096 - encode(prompt).length;
  return createCompletion({ prompt, max_tokens: remainingTokens });
}

/**
 * Resolve after `time` milliseconds; used to stay below the per-minute API
 * call limit.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = time => new Promise(done => setTimeout(done, time));

// Public API of this module. The `*WithCache` entries are built once, at
// module load, by wrapping the plain functions with `withCache`.
module.exports = {
  sleep,
  getSummary,
  // Cached getSummary: stored in the `<cacheFileName>_summary` bucket, keyed
  // by the hash of the section content.
  getSummaryWithCache: withCache(
    getSummary,
    'summary',
    ({ content }) => content,
  ),
  // Cached createEmbedding: stored in the `<cacheFileName>_embedding` bucket,
  // keyed by the hash of the input text.
  createEmbeddingWithCache: withCache(
    createEmbedding,
    'embedding',
    input => input,
  ),
  askInsQuestion,
  createCompletion,
};


================================================
FILE: utils/ask.js
================================================
const { encode } = require('gpt-3-encoder');
const ddot = require('@stdlib/blas/base/ddot');

const { buildQuestionEmbedding } = require('./embedding');
const { readKnowledgeEmbeddings, readKnowledge, writeAnswer } = require('./fs');
const { askInsQuestion } = require('./ai');

/**
 * Select the knowledge paragraphs most relevant to a question.
 *
 * Paragraphs are scored with the dot product of their embedding and the
 * question embedding; only scores above 0.8 are kept. The best ones are
 * taken, in descending score order, for as long as their accumulated token
 * count stays below 3000.
 *
 * @param {object} params
 * @param {number[]} params.questionEmbedding - Embedding of the question.
 * @param {number[][]} params.knowledgeEmbeddings - One embedding per paragraph.
 * @param {string[]} params.knowledgeList - Paragraph texts, same order.
 * @returns {string} The selected paragraphs joined with newlines ('' when
 *   nothing qualifies).
 */
function getKnowledge({
  questionEmbedding,
  knowledgeEmbeddings,
  knowledgeList,
}) {
  // The question vector is the same for every paragraph: build it once.
  const x = new Float64Array(questionEmbedding);

  const ranked = knowledgeEmbeddings
    .map((embedding, index) => ({
      index,
      ddot: ddot(x.length, x, 1, new Float64Array(embedding), 1),
      knowledge: knowledgeList[index],
    }))
    // Filtering first leaves fewer items to sort; the result is the same.
    .filter(k => k.ddot > 0.8)
    .sort((a, b) => b.ddot - a.ddot);

  let tokens = 0;
  const selected = [];
  for (const { knowledge } of ranked) {
    tokens += encode(knowledge).length;
    // The running total never decreases, so nothing after this point can
    // fit: stop instead of tokenising the remaining paragraphs.
    if (tokens >= 3000) {
      break;
    }
    selected.push(knowledge);
  }

  return selected.join('\n');
}

/**
 * Answer `question` from the stored knowledge base of `pdfName` and record
 * the answer on disk.
 *
 * @param {string} question - The question to answer.
 * @param {string} pdfName - Name of the loaded PDF (without extension).
 * @returns {Promise<string>} The generated answer.
 */
async function ask(question, pdfName) {
  const questionEmbedding = await buildQuestionEmbedding(question, pdfName);

  // Pick the paragraphs closest to the question as context for the model.
  const knowledge = getKnowledge({
    questionEmbedding,
    knowledgeEmbeddings: readKnowledgeEmbeddings(pdfName),
    knowledgeList: readKnowledge(pdfName),
  });

  const answer = await askInsQuestion({ question, knowledge });
  writeAnswer(pdfName, question, answer);
  return answer;
}

module.exports = ask;


================================================
FILE: utils/content.js
================================================
const path = require('path');
const nodejieba = require('nodejieba');

// Single-character aliases (Latin letters followed by Greek letters) that
// stand in for frequently repeated words in dictionary-compressed content.
const LETTERS =
  'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZαβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'.split('');

// Module-level side effect: load the segmenter with the project's user
// dictionary file.
nodejieba.load({
  userDict: path.join(__dirname, '../userdict.utf8'),
});

/**
 * Decide whether a section is a (long) disease introduction.
 *
 * @param {number} tokenLength - Token count of the section.
 * @param {string} joinedContent - Section text.
 * @returns {boolean} `true` when the section has at least 2000 tokens and
 *   starts with one of the known disease-list headings.
 */
function isDiseaseIntro(tokenLength, joinedContent) {
  // Short sections are not worth any special treatment.
  if (tokenLength < 2000) {
    return false;
  }
  // Deliberately crude: only the opening words are inspected.
  const diseaseHeadings = ['重大疾病', '中症疾病', '轻症疾病', '特定心脑血管疾病'];
  return diseaseHeadings.some(heading => joinedContent.startsWith(heading));
}

/**
 * Shrink a disease introduction by dropping the detailed description of each
 * numbered disease and keeping only its title.
 *
 * The text is cut in front of every `（<digits>）` marker; of each numbered
 * piece only the part before the first space survives. Text in front of the
 * first marker is kept unchanged.
 *
 * @param {string} content - The disease introduction text.
 * @returns {string} The shortened text.
 */
function shortenDiseaseIntro(content) {
  const itemBoundary = /(?=（[0-9]+）)/g;
  const numberedItem = /^（[0-9]+）/;

  return content
    .split(itemBoundary)
    .map(section =>
      numberedItem.test(section) ? section.split(' ')[0] : section,
    )
    .join('');
}

/**
 * Compress text by replacing frequently repeated words with single-letter
 * aliases and appending a legend that maps the aliases back to the words.
 *
 * @param {string} originContent - Text to compress.
 * @param {string[]} words - The words of `originContent` (with repetitions).
 * @param {Function} should - `(count, wordLength) => boolean`; decides
 *   whether a word occurring `count` times deserves an alias.
 * @returns {string} The compressed text followed by `|上文中，` and the
 *   comma-separated `alias:word` legend.
 */
function shortenByDictionary(originContent, words, should) {
  let shortContent = originContent;
  const dictionary = [];
  // A prototype-less object keeps words such as "constructor" from colliding
  // with inherited properties while counting.
  const wordsCounts = words.reduce((acc, cur) => {
    acc[cur] = (acc[cur] || 0) + 1;
    return acc;
  }, Object.create(null));

  Object.keys(wordsCounts).forEach(word => {
    // Once every alias is taken the remaining words stay as they are;
    // otherwise they would be replaced with the text "undefined".
    const hasFreeAlias = dictionary.length < LETTERS.length;
    if (!hasFreeAlias || word.length === 0) {
      return;
    }
    if (!should(wordsCounts[word], word.length)) {
      return;
    }
    const alias = LETTERS[dictionary.length];
    dictionary.push(word);
    shortContent = shortContent.replaceAll(word, alias);
  });

  const legend = dictionary
    .map((word, index) => `${LETTERS[index]}:${word}`)
    .join(',');
  return `${shortContent}|上文中，${legend}`;
}

/**
 * Dictionary-compress table-like content, whose cells are separated by
 * spaces. A cell gets an alias when it is longer than 3 characters and
 * occurs more than 3 times.
 *
 * @param {string} tableContent - Space-separated table text.
 * @returns {string} The compressed text including its legend.
 */
function shortenTableContent(tableContent) {
  const cells = tableContent.split(' ');
  const deservesAlias = (counts, length) => counts > 3 && length > 3;
  return shortenByDictionary(tableContent, cells, deservesAlias);
}

/**
 * Dictionary-compress running text.
 *
 * The text is first normalised (boilerplate removed, full-width punctuation
 * converted to half-width, needless spaces dropped), then segmented into
 * words. A word gets an alias when it is longer than 1 character and occurs
 * more than 4 times.
 *
 * @param {string} sectionContent - Section text.
 * @returns {string} The compressed text including its legend.
 */
function shortenSectionContent(sectionContent) {
  // Applied strictly in this order: later rules rely on earlier ones.
  const replacements = [
    // boilerplate that carries no information
    ['（见释义）', ''],
    // fewer characters
    ['——', '—'],
    // full-width punctuation to half-width
    ['（', '('],
    ['）', ')'],
    ['：', ':'],
    ['；', ';'],
    ['、', '|'],
    ['，', ','],
    ['。', '.'],
    ['“', `'`],
    ['”', `'`],
    // meaningless spaces
    ['. ', '.'],
    [` '`, `'`],
    ['; ', ';'],
  ];
  const longContent = replacements.reduce(
    (text, [from, to]) => text.replaceAll(from, to),
    sectionContent,
  );

  const words = nodejieba.cut(longContent);
  const deservesAlias = (counts, length) => counts > 4 && length > 1;
  return shortenByDictionary(longContent, words, deservesAlias);
}

/**
 * Dictionary-compress long content, choosing the strategy by its shape:
 * text with more than 100 space-separated pieces is treated as a table,
 * everything else as running text.
 *
 * @param {string} longContent - Text to compress.
 * @returns {string} The compressed text including its legend.
 */
function shortenContent(longContent) {
  const looksLikeTable = longContent.split(' ').length > 100;
  return looksLikeTable
    ? shortenTableContent(longContent)
    : shortenSectionContent(longContent);
}

module.exports = {
  isDiseaseIntro,
  shortenDiseaseIntro,
  shortenContent,
  shortenTableContent,
  shortenSectionContent,
};


================================================
FILE: utils/embedding.js
================================================
const { createEmbeddingWithCache } = require('./ai');
const { writeKnowledgeEmbeddings } = require('./fs');

/**
 * Create (or fetch from the cache) one embedding per knowledge paragraph and
 * persist the resulting list.
 *
 * @param {string[]} knowledge - Knowledge paragraphs.
 * @param {string} pdfName - Name of the PDF; selects cache and output files.
 * @returns {Promise<number[][]>} The embeddings, in paragraph order.
 */
async function buildKnowledgeEmbeddings(knowledge, pdfName) {
  const embeddings = [];
  // One request at a time, in order. The former `!embeddings[index]` guard
  // was always true for a freshly created array and has been dropped.
  for (const [index, paragraph] of knowledge.entries()) {
    embeddings.push(await createEmbeddingWithCache(paragraph, pdfName));
    console.log('createEmbedding success', index);
  }
  writeKnowledgeEmbeddings(pdfName, embeddings);
  return embeddings;
}

/**
 * Create (or fetch from the cache) the embedding of a question.
 *
 * @param {string} question - The question text.
 * @param {string} pdfName - Name of the PDF; selects the cache file.
 * @returns {Promise<number[]>} The question embedding.
 */
async function buildQuestionEmbedding(question, pdfName) {
  return createEmbeddingWithCache(question, pdfName);
}

module.exports = { buildKnowledgeEmbeddings, buildQuestionEmbedding };


================================================
FILE: utils/fs.js
================================================
const { writeFileSync, readFileSync, existsSync, mkdirSync } = require('fs');
const { join } = require('path');

/**
 * Read and parse a JSON file.
 *
 * @param {string} path - File to read.
 * @returns {*} The parsed content, or `{}` when the file cannot be read or
 *   does not contain valid JSON.
 */
function readJsonFile(path) {
  let parsed = {};
  try {
    parsed = JSON.parse(readFileSync(path).toString());
  } catch {
    // Missing or malformed file: keep the empty-object fallback.
  }
  return parsed;
}

/**
 * Derive the PDF name (file name without the `.pdf` extension) from a path.
 *
 * @param {string} pdfPath - Path of the PDF file.
 * @returns {string} The bare file name.
 */
function getPdfName(pdfPath) {
  // Accept both separators: `path.join` yields backslashes on Windows, where
  // splitting on '/' alone would return the whole path.
  const fileName = pdfPath.split(/[\\/]/).pop();
  // Remove only the trailing extension so that names which merely contain
  // ".pdf" are not truncated.
  return fileName.replace(/\.pdf$/, '');
}

/**
 * Build the path of a knowledge file of a PDF, creating the PDF's knowledge
 * directory when it does not exist yet.
 *
 * @param {string} pdfName - Name of the PDF (without extension).
 * @param {string} fileName - File name without the `.json` extension.
 * @returns {string} Absolute path of `<knowledgeFiles>/<pdfName>/<fileName>.json`.
 */
function getPath(pdfName, fileName) {
  const dirPath = join(__dirname, `../knowledgeFiles/${pdfName}`);
  // `recursive` also creates the git-ignored `knowledgeFiles/` parent, which
  // is missing on a fresh checkout, and is a no-op for an existing directory.
  mkdirSync(dirPath, { recursive: true });
  return join(dirPath, `${fileName}.json`);
}

/** Persist the nested content tree of a PDF as `contentTree.json`. */
function writeContentTree(pdfName, docTree) {
  const target = getPath(pdfName, 'contentTree');
  writeFileSync(target, JSON.stringify(docTree));
}

/** Persist the knowledge paragraphs of a PDF as `knowledge.json`. */
function writeKnowledge(pdfName, knowledge) {
  const target = getPath(pdfName, 'knowledge');
  writeFileSync(target, JSON.stringify(knowledge));
}

/** Read the knowledge paragraphs of a PDF (`{}` when unavailable). */
function readKnowledge(pdfName) {
  const source = getPath(pdfName, 'knowledge');
  return readJsonFile(source);
}

/** Persist the paragraph embeddings of a PDF as `knowledgeEmbeddings.json`. */
function writeKnowledgeEmbeddings(pdfName, embeddings) {
  const target = getPath(pdfName, 'knowledgeEmbeddings');
  writeFileSync(target, JSON.stringify(embeddings));
}

/** Read the paragraph embeddings of a PDF (`{}` when unavailable). */
function readKnowledgeEmbeddings(pdfName) {
  const source = getPath(pdfName, 'knowledgeEmbeddings');
  return readJsonFile(source);
}

/**
 * Build the path of a PDF inside the project's `pdfs` directory.
 *
 * @param {string} pdfName - Name of the PDF without the `.pdf` extension.
 * @returns {string} Absolute path of the PDF file.
 */
function getPdfPath(pdfName) {
  const relativePath = `../pdfs/${pdfName}.pdf`;
  return join(__dirname, relativePath);
}

/**
 * Record the answer to a question in `<answerFiles>/<pdfName>_answers.json`,
 * keeping the answers recorded earlier.
 *
 * @param {string} pdfName - Name of the PDF (without extension).
 * @param {string} question - The question; used as the key.
 * @param {string} answer - The answer to store.
 */
function writeAnswer(pdfName, question, answer) {
  const answerDir = join(__dirname, '../answerFiles');
  // The directory is git-ignored and therefore missing on a fresh checkout.
  mkdirSync(answerDir, { recursive: true });

  const answerPath = join(answerDir, `${pdfName}_answers.json`);
  // readJsonFile yields {} for a missing file, so the first answer needs no
  // separate code path.
  const answerJson = readJsonFile(answerPath);
  answerJson[question] = answer;
  writeFileSync(answerPath, JSON.stringify(answerJson));
}

module.exports = {
  getPdfPath,
  getPdfName,
  writeAnswer,
  readKnowledge,
  writeKnowledge,
  writeContentTree,
  writeKnowledgeEmbeddings,
  readKnowledgeEmbeddings,
};


================================================
FILE: utils/openai.js
================================================
const { Configuration, OpenAIApi } = require('openai');
const { apiKey } = require('../config');

// Client configuration carrying the API key from the project config.
const configuration = new Configuration({
  apiKey,
});

// Single OpenAI client instance shared by every module that requires this file.
const openai = new OpenAIApi(configuration);

module.exports = openai;


================================================
FILE: utils/pdf.js
================================================
const pdfjs = require('pdfjs-dist');
const { encode } = require('gpt-3-encoder');
const {
  isDiseaseIntro,
  shortenDiseaseIntro,
  shortenContent,
} = require('./content');

// Page kind: cover
const PAGE_TYPE_COVER = 0;
// Page kind: table of contents (catalog)
const PAGE_TYPE_CATALOG = 1;
// Page kind: main body
const PAGE_TYPE_MAIN = 2;

// Marker put in front of text recognised as a section title; the document
// string is later split into sections on it.
const TITLE_SPLIT = '__TITLE__';
// Marker for inline quote references; only referenced by disabled code.
const QUOTE_SPLIT = '__QUOTE__';
// Marker put in front of each footnote found at the bottom of a page.
const REF_SPLIT = '__REF__';

/**
 * Split the marked-up document text into a flat list of section nodes.
 *
 * @param {string} longStr - Document text in which every section title is
 *   preceded by TITLE_SPLIT.
 * @returns {object[]} One node per section:
 *   `{ titleNo, content, children: [], refs: [], tokenLength }`.
 */
function buildDocTree(longStr) {
  const [, ...sections] = longStr.split(TITLE_SPLIT); // split into sections; the text before the first title marker is dropped

  const treeNodes = sections
    .map(section => {
      // The first space-separated token is the title, the rest the content.
      let [titleNo, ...content] = section.split(' ');
      if (titleNo.endsWith('.')) {
        titleNo = titleNo.slice(0, -1);
      }

      // Leading numeric part of the title token, e.g. "2.3"; undefined when
      // the token does not start with a digit.
      const matchedTitleNo = titleNo.match(/^\d+(\.\d*)*\.?/)?.[0];

      let joinedContent = content.join(' ');

      // The title token holds more than the numeric title: move the
      // non-numeric remainder to the front of the content.
      if (matchedTitleNo !== titleNo) {
        const titleContent = titleNo.replace(/^\d+(\.\d*)*\.?/, '');
        joinedContent = titleContent + ' ' + joinedContent;
      }

      let tokenLength = encode(joinedContent).length;

      // Disease introductions are very long: drop the detailed description of
      // the individual diseases.
      if (isDiseaseIntro(tokenLength, joinedContent)) {
        joinedContent = shortenDiseaseIntro(joinedContent);
      } else if (tokenLength > 4000) {
        // Other very long sections are compressed with the dictionary method.
        joinedContent = shortenContent(joinedContent);
      }

      // Recount, because the content may have been shortened above.
      tokenLength = encode(joinedContent).length;

      return {
        titleNo: matchedTitleNo || titleNo,
        content: joinedContent,
        children: [],
        refs: [],
        tokenLength,
      };
    })
    // Disabled stage: collect __QUOTE__ references into node.refs and remove
    // them (and page footers) from the content.
    // .map(node => {
    //   const { content } = node;

    //   if (content.indexOf(QUOTE_SPLIT)) {
    //     const regex = /__QUOTE__([0-9.]+)/g;
    //     let match;
    //     while ((match = regex.exec(content)) !== null) {
    //       node.refs.push(match[1]);
    //     }
    //     node.content = node.content
    //       .replace(regex, '')
    //       .replace(/第\s*\d+\s*页\s*共\d+页/g, '');
    //     return node;
    //   }
    // });
  return treeNodes;
}

/**
 * Decide whether a page is a table-of-contents (catalog) page.
 *
 * A page qualifies when it contains the heading "条款目录" or when cutting
 * its text in front of every dotted section number (such as "1.2") yields
 * more than 10 pieces.
 *
 * @param {object} page
 * @param {object[]} page.items - Text items of the page; each has a `str`.
 * @returns {boolean} `true` for a catalog page, `false` otherwise.
 */
function isCatalogPage({ items }) {
  const pageContent = items.map(i => i.str).join('');
  if (pageContent.includes('条款目录')) {
    return true;
  }
  // The dot is escaped on purpose: unescaped it matches any character, so
  // every run of three or more digits (amounts, years, ...) was counted as a
  // section number.
  return pageContent.split(/(?=\d+\.\d+)/).length > 10;
}

/**
 * Merge the footnotes of a page into its main text.
 *
 * The first item that sits at the title x-position and is clearly smaller
 * than the main font starts the footnote area. Every footnote is then
 * written, in square brackets, over the main-text item whose text equals the
 * footnote number.
 *
 * Note: items are modified in place.
 *
 * @param {object[]} items - Text items of one page (`str`, `height`,
 *   `transform`).
 * @returns {object[]} The main-text items (the input itself when the page has
 *   no footnote area).
 */
function moveNoteToMain(items) {
  // pageNumberPositionY is destructured but not used in this function.
  const { mainFontHeight, titlePositionX, pageNumberPositionY } =
    getPageMetaData(items);

  // A footnote number: within 2 units of the title column and less than 70%
  // of the main font height.
  const isRefTitle = item =>
    Math.abs(item.transform[4] - titlePositionX) < 2 &&
    item.height / mainFontHeight < 0.7;

  const refSplitIndex = items.findIndex(isRefTitle);

  if (refSplitIndex < 0) {
    return items;
  }

  // Main text: everything in front of the first footnote number.
  const mainItems = items.slice(0, refSplitIndex);
  // Footnotes: mark every footnote number with REF_SPLIT, join the area into
  // one string and split it again into "<number> <text>" chunks.
  items
    .slice(refSplitIndex)
    .map(refItem => {
      if (isRefTitle(refItem)) {
        refItem.str = `${REF_SPLIT}${refItem.str.trim()} `;
      }
      return refItem.str;
    })
    .join('')
    .split(REF_SPLIT)
    .forEach(refContent => {
      const [refNo, ...content] = refContent.split(' ');
      if (refNo && content.length) {
        // The reference mark in the main text is an item whose whole text is
        // the footnote number.
        const mainItem = mainItems.find(i => i.str.trim() === refNo);

        if (!mainItem) {
          return;
        }
        // Replace the reference mark with the footnote text.
        mainItem.str = `[${content.join('')}]`;
      }
    });
  return mainItems;
}

/**
 * Collect the text items of the main-body pages of a PDF.
 *
 * Pages are read one after another. Items are only collected once the page
 * type has switched to "main"; the footnotes of each collected page are
 * merged into its text first.
 *
 * @param {string} pdfPath - Path of the PDF file.
 * @returns {Promise<object[]>} The collected items, each extended with its
 *   `pageNum`.
 */
async function getPdfItems(pdfPath) {
  const pdfItems = [];
  // NOTE(review): the state starts at CATALOG and nothing ever sets it to
  // COVER, so the cover branch below cannot fire - confirm this is intended.
  let pageType = PAGE_TYPE_CATALOG;
  await pdfjs.getDocument(pdfPath).promise.then(doc => {
    const numPages = doc.numPages;
    let lastPromise = doc.getMetadata();

    const loadPage = function (pageNum) {
      return doc.getPage(pageNum).then(page => {
        return page
          .getTextContent({
            disableCombineTextItems: true,
            // includeMarkedContent: true,
          })
          .then(pageData => {
            // Coming from the cover and this page is a catalog page:
            // switch to the catalog state.
            if (pageType === PAGE_TYPE_COVER && isCatalogPage(pageData)) {
              pageType = PAGE_TYPE_CATALOG;
            }
            // Coming from the catalog and this page is no catalog page any
            // more: switch to the main-body state.
            if (pageType === PAGE_TYPE_CATALOG && !isCatalogPage(pageData)) {
              pageType = PAGE_TYPE_MAIN;
            }
            // Collect content from the first main-body page onwards.
            if (pageType === PAGE_TYPE_MAIN) {
              const contentItems = pageData.items.map(i => ({ ...i, pageNum }));
              pdfItems.push(...moveNoteToMain(contentItems));
            }
            page.cleanup();
          });
      });
    };
    // Loading of the first page will wait on metadata and subsequent loadings
    // will wait on the previous pages.
    for (let i = 1; i <= numPages; i++) {
      lastPromise = lastPromise.then(() => loadPage(i));
    }
    return lastPromise;
  });
  return pdfItems;
}

/**
 * Decide whether the item at `itemIndex` looks like a title number.
 *
 * @param {object[]} items - Text items; each has a `str`.
 * @param {number} itemIndex - Index of the item to inspect.
 * @returns {boolean|object} `false` for text longer than 20 characters; the
 *   item itself (truthy) when the next item's trimmed text is "页"; otherwise
 *   whether the trimmed text starts with a number such as "1", "1.2" or "3.".
 */
const isTitleNo = (items, itemIndex) => {
  const current = items[itemIndex];
  const following = items[itemIndex + 1];
  const text = current.str;

  // Long text is never a title; this also spares the regex test below.
  if (text.length > 20) {
    return false;
  }

  const isFollowedByPageWord = Boolean(following) && following.str.trim() === '页';
  if (isFollowedByPageWord) {
    return current;
  }

  const startsWithNumber = /^\d+(\.\d*)*\.?/;
  return startsWithNumber.test(text.trim());
};

/**
 * Derive layout facts from a list of text items.
 *
 * @param {object[]} items - Text items with `str`, `height` and `transform`
 *   (`transform[4]` is read as x, `transform[5]` as y).
 * @returns {{mainFontHeight: (number|undefined),
 *   titlePositionX: (number|undefined),
 *   pageNumberPositionY: (number|undefined)}}
 *   - mainFontHeight: the most frequent item height;
 *   - titlePositionX: the most frequent x (below 100) of title-number items;
 *   - pageNumberPositionY: the lowest y, when it is below 60.
 *   Every field is `undefined` when it cannot be determined, in particular
 *   for an empty item list.
 */
function getPageMetaData(items) {
  const fontHeightCountMap = {};
  const numberPositionXCountMap = {};
  let minPositionY = Infinity;

  items.forEach((cur, index) => {
    const { height, transform } = cur;
    const positionX = transform[4];
    const positionY = transform[5];
    if (!height) {
      // Diagnostic output for items without a usable height.
      console.log(cur);
    }

    fontHeightCountMap[height] = (fontHeightCountMap[height] || 0) + 1;

    minPositionY = Math.min(minPositionY, positionY);

    if (isTitleNo(items, index)) {
      numberPositionXCountMap[positionX] =
        (numberPositionXCountMap[positionX] || 0) + 1;
    }
  });

  const sortedHeights = Object.keys(fontHeightCountMap)
    .map(height => ({
      height: Number(height),
      counts: fontHeightCountMap[height],
    }))
    .sort((a, b) => b.counts - a.counts);

  const sortedPositionXs = Object.keys(numberPositionXCountMap)
    .map(positionX => ({
      positionX: Number(positionX),
      counts: numberPositionXCountMap[positionX],
    }))
    // Only the left edge counts; numbers of list entries further right would
    // otherwise distort the result.
    .filter(i => i.positionX < 100)
    .sort((a, b) => b.counts - a.counts);

  return {
    // The most used font size is taken to be the main text size. Optional
    // access: a page without any text item has no heights at all.
    mainFontHeight: sortedHeights[0]?.height,
    // Numbers that keep appearing at the same x are taken to be titles.
    titlePositionX: sortedPositionXs[0]?.positionX,
    // The lowest item is taken to be the page number, but only below 60;
    // otherwise the PDF is assumed to have no page numbers.
    pageNumberPositionY: minPositionY < 60 ? minPositionY : undefined,
  };
}

/**
 * Prepare the collected items for section splitting: drop page-number items
 * and put TITLE_SPLIT in front of every item recognised as a section title.
 *
 * Note: the `str` of title items is rewritten in place.
 *
 * @param {object[]} items - Text items of the whole document.
 * @returns {object[]} The remaining items, in their original order.
 */
function rebuildPdfItems(items) {
  // mainFontHeight is destructured but not used in this function.
  const { titlePositionX, pageNumberPositionY, mainFontHeight } =
    getPageMetaData(items);
  return items
    .map((item, index) => {
      // currentHeight, nextItem and prevItem are only needed by the disabled
      // quote detection further down.
      const { height: currentHeight, str: itemContent, transform } = item;
      const nextItem = items[index + 1];
      const prevItem = items[index - 1];
      const positionX = transform[4];
      const positionY = transform[5];

      // Page numbers are not wanted.
      if (pageNumberPositionY === positionY) {
        return null;
      }

      // An appendix table ("附表...") starts a section of its own.
      if (itemContent.startsWith('附表')) {
        item.str = `${TITLE_SPLIT}${itemContent.trim()}`;
        return item;
      }

      if (!isTitleNo(items, index)) {
        return item;
      }

      // A section title: a title number in the title column, with some
      // tolerance.
      if (Math.abs(positionX - titlePositionX) < 2) {
        item.str = `${TITLE_SPLIT}${itemContent.trim()}`;
        return item;
      }

      // const prevHeight = prevItem?.height;
      // const nextHeight = nextItem.height;

      // Disabled: detection of inline quote references.
      // if (
      //   prevItem &&
      //   currentHeight < prevHeight &&
      //   currentHeight < nextHeight
      // ) {
      //   item.str = `${QUOTE_SPLIT}${itemContent}`;
      //   return item;
      // }

      return item;
    })
    // Remove the nulls left by the dropped page-number items.
    .filter(Boolean);
}

/**
 * Parse a PDF into a flat list of section nodes.
 *
 * @param {string} pdfPath - Path of the PDF file.
 * @returns {Promise<object[]>} The section nodes produced by buildDocTree.
 */
async function buildDocTreeFromPdf(pdfPath) {
  const rawItems = await getPdfItems(pdfPath);
  // Mark the titles, then glue all item texts into one string to split.
  const markedText = rebuildPdfItems(rawItems)
    .map(item => item.str)
    .join('');
  return buildDocTree(markedText);
}

module.exports = {
  buildDocTreeFromPdf,
  getPdfItems,
  rebuildPdfItems,
};


================================================
FILE: utils/tree.js
================================================
const { getSummaryWithCache } = require('./ai');
const { writeKnowledge, writeContentTree } = require('./fs');
const { shortenContent } = require('./content');
const { encode } = require('gpt-3-encoder');

/**
 * Derive the title number of the parent section.
 *
 * @param {string} titleNo - Dotted title number, e.g. "2.3.1".
 * @returns {string} The number without its last segment ("2.3"), or '' for
 *   a top-level number.
 */
function getParentNo(titleNo) {
  const segments = titleNo.split('.');
  segments.pop();
  return segments.join('.');
}

/**
 * Turn the flat section list into a nested tree and compute, for every node,
 * the token length of its whole subtree (`allTokenLength`).
 *
 * A node's own length is its summary length when it has one, its content
 * length otherwise. Nodes are modified in place.
 *
 * @param {object[]} flattenTree - Section nodes with `titleNo`, `children`,
 *   `tokenLength` and optionally `summaryTokenLength`.
 * @returns {object[]} The root nodes (those without a known parent).
 */
function toNestTree(flattenTree) {
  const roots = [];

  // Index of the nodes by title number.
  const nodesMap = {};
  for (const node of flattenTree) {
    nodesMap[node.titleNo] = node;
  }

  const findParent = node => {
    const parentNo = getParentNo(node.titleNo);
    return parentNo ? nodesMap[parentNo] : undefined;
  };

  for (const node of flattenTree) {
    const ownLength = node.summaryTokenLength || node.tokenLength;

    // Descendants handled earlier may already have contributed, hence the
    // accumulation instead of an assignment.
    node.allTokenLength = (node.allTokenLength || 0) + ownLength;

    // Add the node's own length to every known ancestor.
    for (
      let ancestor = findParent(node);
      ancestor;
      ancestor = findParent(ancestor)
    ) {
      ancestor.allTokenLength = (ancestor.allTokenLength || 0) + ownLength;
    }

    // Attach the node to its parent, or make it a root.
    const parent = findParent(node);
    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }
  }

  return roots;
}

/**
 * Give every section of more than 1000 tokens an AI summary, and every
 * summarised section the token length of its summary.
 *
 * Sections are processed one at a time and modified in place.
 *
 * @param {object[]} docTree - Flat list of section nodes.
 * @param {string} pdfName - Name of the PDF; selects the summary cache.
 * @returns {Promise<object[]>} The same list.
 */
async function rebuildTreeWithAISummary(docTree, pdfName) {
  for (const node of docTree) {
    const needsSummary = node.tokenLength > 1000 && !node.summary;
    if (needsSummary) {
      const request = { content: node.content, tokenLength: node.tokenLength };
      node.summary = await getSummaryWithCache(request, pdfName);
      console.log('build summary success', node.titleNo);
    }

    const needsSummaryLength = node.summary && !node.summaryTokenLength;
    if (needsSummaryLength) {
      node.summaryTokenLength = encode(node.summary).length;
    }
  }
  return docTree;
}

/**
 * Summarise the long sections, nest the sections into a tree and persist the
 * tree.
 *
 * @param {object[]} docTree - Flat list of section nodes.
 * @param {string} pdfName - Name of the PDF.
 * @returns {Promise<object[]>} The root nodes of the nested tree.
 */
async function buildNestTreeWithAISummary(docTree, pdfName) {
  const summarised = await rebuildTreeWithAISummary(docTree, pdfName);
  const nestTree = toNestTree(summarised);

  // Keep a copy on disk.
  writeContentTree(pdfName, nestTree);
  return nestTree;
}

/**
 * Merge a node and all of its descendants into one paragraph.
 *
 * Every section contributes a label with its title number followed by its
 * summary (or its content when it has none); sections are separated by '|'
 * and visited depth-first.
 *
 * @param {object} node - Tree node with `titleNo`, `children` and `summary`
 *   and/or `content`.
 * @returns {string} The merged paragraph.
 */
function unionContent(node) {
  const ownText = `第${node.titleNo}节内容:` + (node.summary || node.content);
  return node.children.reduce(
    (merged, child) => merged + '|' + unionContent(child),
    ownText,
  );
}

/**
 * Flatten the nested tree into knowledge paragraphs.
 *
 * A subtree of at most 3000 tokens becomes one paragraph; a larger one is
 * replaced by the paragraphs of its children.
 *
 * @param {object[]} nodes - Tree nodes with `allTokenLength` and `children`.
 * @param {string[]} [contents] - Accumulator to append to; a new array is
 *   used when omitted.
 * @returns {string[]} The accumulator with the paragraphs appended.
 */
function buildContents(nodes, contents) {
  const collected = contents || [];
  nodes.forEach(node => {
    if (node.allTokenLength > 3000) {
      buildContents(node.children, collected);
      return;
    }
    collected.push(unionContent(node));
  });
  return collected;
}

/**
 * Build the knowledge base of a PDF from its flat section list and persist
 * it.
 *
 * @param {object[]} docTree - Flat list of section nodes.
 * @param {string} pdfName - Name of the PDF.
 * @returns {Promise<string[]>} The knowledge paragraphs.
 */
async function buildKnowledgeFromDocTree(docTree, pdfName) {
  const nestTree = await buildNestTreeWithAISummary(docTree, pdfName);
  const paragraphs = buildContents(nestTree);
  // Keep a copy on disk.
  writeKnowledge(pdfName, paragraphs);
  return paragraphs;
}

module.exports = { buildKnowledgeFromDocTree };