Repository: wuomzfx/pdfGPT Branch: main Commit: 09a53a0a2e3b Files: 17 Total size: 25.1 KB Directory structure: gitextract_vsirs4ek/ ├── .gitignore ├── cache/ │ └── index.js ├── config.js ├── index.js ├── package.json ├── readme.md ├── scripts/ │ ├── ask.js │ └── load.js ├── userdict.utf8 └── utils/ ├── ai.js ├── ask.js ├── content.js ├── embedding.js ├── fs.js ├── openai.js ├── pdf.js └── tree.js ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ node_modules/ .DS_Store .vscode/ cache/files/ pdfs/ knowledgeFiles/ answerFiles/ ================================================ FILE: cache/index.js ================================================ const { existsSync, writeFileSync, readFileSync } = require('fs'); const { join } = require('path'); const getPath = name => join(__dirname, `./files/${name}.json`); const getJson = path => { // 不存在,返回空对象 if (!existsSync(path)) { return {}; } // 读文件 let string = readFileSync(path).toString(); let cacheJson = {}; try { // 反序列化 cacheJson = JSON.parse(string); } catch {} return cacheJson; }; function get(name, key) { const path = getPath(name); const json = getJson(path); return json[key]; } function set(name, key, value) { const path = getPath(name); const json = getJson(path); json[key] = value; writeFileSync(path, JSON.stringify(json)); } module.exports = { get, set }; ================================================ FILE: config.js ================================================ module.exports = { apiKey: 'your api key', pdfName: 'your pdf name,不需要 .pdf 结尾', // pdfName: 'hyb', // pdfName: 'e享护', // pdfName: '达尔文', // pdfName: '微保终身重疾', // 医疗险问题 questions: [ // '投保年龄限制', // '能无条件续保20年吗', // '每年的保费会变化吗', // '都能报销什么费用', // '门诊看病能报销吗', // '什么情况下不能理赔报销', // '要去哪些医院才能理赔报销', // '什么是等待期', // '什么是犹豫期', // '医保已经报销了,还能继续报销吗', // '最多能报销多少钱', '如果投保年龄填写错误,理赔时会怎么样', 
], // 重疾险问题 // questions: [ // '投保年龄限制', // '每年的保费会变化吗', // '得什么病能获得赔偿', // '能赔的重大疾病有哪些', // '最多能理赔多少钱', // '什么情况下不能理赔', // '人死了保费能退回吗', // '要去哪些医院才能理赔', // '我总共要投多少钱', // '退保能退钱吗', // ], }; ================================================ FILE: index.js ================================================ const { getPdfName } = require('./utils/fs'); const { buildDocTreeFromPdf } = require('./utils/pdf'); const { buildKnowledgeFromDocTree } = require('./utils/tree'); const { buildKnowledgeEmbeddings } = require('./utils/embedding'); const ask = require('./utils/ask'); async function loadingPdf(pdfPath) { const pdfName = getPdfName(pdfPath); // 构建内容树 const docTree = await buildDocTreeFromPdf(pdfPath); // const fs = require('fs'); // fs.writeFileSync('./temp.json', JSON.stringify(docTree)) // 构建知识库 const knowledge = await buildKnowledgeFromDocTree(docTree, pdfName); // // 构建知识库向量 await buildKnowledgeEmbeddings(knowledge, pdfName); } async function askQuestion(question, pdfName) { console.log(`AI 正在努力回答您的问题『${question}』,请稍作等待...\n`); const answer = await ask(question, pdfName); console.log(`您的问题『${question}』回答如下:\n==========\n${answer}\n==========\n`); return answer; } module.exports = { loadingPdf, askQuestion, }; ================================================ FILE: package.json ================================================ { "name": "pdf-gpt", "version": "1.0.0", "description": "", "main": "index.js", "scripts": { "load": "node ./scripts/load", "ask": "node ./scripts/ask", "test": "echo \"Error: no test specified\" && exit 1" }, "author": "", "license": "ISC", "dependencies": { "@stdlib/blas": "^0.0.12", "gpt-3-encoder": "^1.1.4", "nodejieba": "^2.6.0", "openai": "^3.1.0", "pdfjs-dist": "^3.3.122" }, "repository": "https://github.com/wuomzfx/pdfGPT.git" } ================================================ FILE: readme.md ================================================ ## 如何使用 1. 执行 `npm install` 或 `tnpm install`; 2. 下载一个保险条款 PDF,放在 `pdfs/` 这个目录下; 3. 
在 `config.json` 中,配置你的 `apiKey` 以及你的 PDF 文档名; 4. 针对你的 PDF 文档,修改 `config.json` 中问题 `questions`; 5. 先执行 `npm run load`,如果异常报错了,可以继续重试; 6. 再执行 `npm run ask`; 7. 最终可以在 answerFiles 文件目录下看到答案记录 ================================================ FILE: scripts/ask.js ================================================ const config = require('../config'); const { askQuestion } = require('../index'); const { pdfName, questions } = config; (async () => { for (let index = 0; index < questions.length; index++) { const question = questions[index]; await askQuestion(question, pdfName); } })(); ================================================ FILE: scripts/load.js ================================================ const config = require('../config'); const { loadingPdf } = require('../index'); const { getPdfPath } = require('../utils/fs'); const { pdfName } = config; const pdfPath = getPdfPath(pdfName); loadingPdf(pdfPath); ================================================ FILE: userdict.utf8 ================================================ 我们 重度 轻度 轻症 中症 可选责任 身故或全残 身故或全残保险金 疾病关爱保险金 恶性肿瘤 被保险人 本合同 疾病扩展保险金 基本保险金额 若被保险人 被保险人 经我们认可的医院专科医生 重大疾病 一种或者多种 18周岁 相学长 ================================================ FILE: utils/ai.js ================================================ const crypto = require('crypto'); const { encode } = require('gpt-3-encoder'); const openai = require('./openai'); const cache = require('../cache'); function buildHash(content) { return crypto.createHash('md5').update(content).digest('hex'); } async function createCompletion({ prompt, max_tokens = 1024, temperature = 0, }) { const completion = await openai.createCompletion({ model: 'text-davinci-003', prompt, max_tokens, temperature, }); return strip(completion?.data?.choices?.[0].text, ['\n']).trim(); } // 去头尾指定字符 const strip = (str, chars) => { let newStr = str; chars.forEach(char => { newStr = newStr.replace(new RegExp(`^${char}+|${char}+$`, 'g'), ''); }); return newStr; }; const withCache = (wrappedFn, suffix, 
getContent) => async (arg, cacheFileName) => { const content = getContent(arg); const cacheName = `${cacheFileName}_${suffix}`; // 文本太长,hash一下 const hash = buildHash(content); const cacheValue = cache.get(cacheName, hash); if (cacheValue) { return cacheValue; } const rs = await wrappedFn(arg); cache.set(cacheName, hash, rs); return rs; }; async function getSummary({ content, tokenLength }) { const promptContext = content.indexOf('|上文中a:') >= -1 ? `'''{{content}}'''基于字典翻译并返回内容摘要:` : `'''{{content}}'''基于命名实体识别构建内容摘要:`; const contentTokenLength = tokenLength || encode(content).length; const promptContextTokenLength = encode(promptContext).length; const completion = await openai.createCompletion({ model: 'text-davinci-003', prompt: promptContext.replace('{{content}}', content), // 1000 ~ 4096,最大也不能超过1000 max_tokens: Math.min( 4096 - contentTokenLength - promptContextTokenLength, 1000, ), temperature: 0, }); return strip(completion?.data?.choices?.[0].text, ['\n']); } async function createEmbedding(input) { const [response] = await Promise.all([ openai.createEmbedding({ model: 'text-embedding-ada-002', input: input, }), // 向量化很快,休息一下,防止调用超限(默认最多每分钟60次) await sleep(3000), ]); return response.data.data[0].embedding; } async function askInsQuestion({ question, knowledge }) { const prompt = ` 以下是某保险产品条款的部分 '''${knowledge}''' 请基于对保险的理解与该部分条款内容,回答如下问题: ${question}。 答案: `; const promptTokenLength = encode(prompt).length; return createCompletion({ prompt, max_tokens: 4096 - promptTokenLength }); } // 防止超过每分钟调用限制 const sleep = time => new Promise(resolve => { setTimeout(resolve, time); }); module.exports = { sleep, getSummary, getSummaryWithCache: withCache( getSummary, 'summary', ({ content }) => content, ), createEmbeddingWithCache: withCache( createEmbedding, 'embedding', input => input, ), askInsQuestion, createCompletion, }; ================================================ FILE: utils/ask.js ================================================ const { encode } = 
require('gpt-3-encoder'); const ddot = require('@stdlib/blas/base/ddot'); const { buildQuestionEmbedding } = require('./embedding'); const { readKnowledgeEmbeddings, readKnowledge, writeAnswer } = require('./fs'); const { askInsQuestion } = require('./ai'); function getKnowledge({ questionEmbedding, knowledgeEmbeddings, knowledgeList, }) { const kList = knowledgeEmbeddings .map((knowledge, index) => { const x = new Float64Array(questionEmbedding); const y = new Float64Array(knowledge); return { index, ddot: ddot(x.length, x, 1, y, 1), knowledge: knowledgeList[index], }; }) .sort((a, b) => b.ddot - a.ddot) .filter(k => k.ddot > 0.8); let tokens = 0; const enoughTokenList = kList.filter(k => { tokens += encode(k.knowledge).length; return tokens < 3000; }); return enoughTokenList.map(({ knowledge }) => knowledge).join('\n'); } async function ask(question, pdfName) { const questionEmbedding = await buildQuestionEmbedding(question, pdfName); const knowledgeEmbeddings = readKnowledgeEmbeddings(pdfName); const knowledgeList = readKnowledge(pdfName); const knowledge = getKnowledge({ questionEmbedding, knowledgeEmbeddings, knowledgeList, }); const answer = await askInsQuestion({ question, knowledge }); writeAnswer(pdfName, question, answer); return answer; } module.exports = ask; ================================================ FILE: utils/content.js ================================================ const path = require('path'); const nodejieba = require('nodejieba'); const LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZαβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'.split(''); nodejieba.load({ userDict: path.join(__dirname, '../userdict.utf8'), }); // 判断是否是疾病介绍 function isDiseaseIntro(tokenLength, joinedContent) { // 比较短就不处理了 if (tokenLength < 2000) { return false; } // 粗暴的简单判断 return !!['重大疾病', '中症疾病', '轻症疾病', '特定心脑血管疾病'].find( disease => joinedContent.indexOf(disease) === 0, ); } // 疾病介绍的信息太长了,需要阉割一下,舍弃疾病介绍详情 function shortenDiseaseIntro(content) { const 
titleRegExp = /(?=([0-9]+))/g; const sections = content.split(titleRegExp).map(section => { if (titleRegExp.test(section)) { const [title, ..._] = section.split(' '); return title; } return section; }); return sections.join(''); } function shortenByDictionary(originContent, words, should) { let shortContent = originContent; const dictionary = []; const wordsCounts = words.reduce((acc, cur) => { acc[cur] = (acc[cur] || 0) + 1; return acc; }, {}); Object.keys(wordsCounts).forEach(word => { if (should(wordsCounts[word], word.length)) { dictionary.push(word); shortContent = shortContent.replaceAll( word, `${LETTERS[dictionary.length - 1]}`, ); } }); shortContent = `${shortContent}|上文中,${dictionary.map( (word, index) => `${LETTERS[index]}:${word}`, )}`; return shortContent; } function shortenTableContent(tableContent) { const words = tableContent.split(' '); return shortenByDictionary( tableContent, words, (counts, length) => counts > 3 && length > 3, ); } function shortenSectionContent(sectionContent) { const longContent = sectionContent // 去无不需要文案 .replaceAll('(见释义)', '') // 减少字符 .replaceAll('——', '—') // 全角半角化 .replaceAll('(', '(') .replaceAll(')', ')') .replaceAll(':', ':') .replaceAll(';', ';') .replaceAll('、', '|') .replaceAll(',', ',') .replaceAll('。', '.') .replaceAll('“', `'`) .replaceAll('”', `'`) // 去无意义空格 .replaceAll('. 
', '.') .replaceAll(` '`, `'`) .replaceAll('; ', ';'); const words = nodejieba.cut(longContent); return shortenByDictionary( longContent, words, (counts, length) => counts > 4 && length > 1, ); } function shortenContent(longContent) { if (longContent.split(' ').length > 100) { return shortenTableContent(longContent); } return shortenSectionContent(longContent); } module.exports = { isDiseaseIntro, shortenDiseaseIntro, shortenContent, shortenTableContent, shortenSectionContent, }; ================================================ FILE: utils/embedding.js ================================================ const { createEmbeddingWithCache } = require('./ai'); const { writeKnowledgeEmbeddings } = require('./fs'); async function buildKnowledgeEmbeddings(knowledge, pdfName) { const embeddings = []; for (let index = 0; index < knowledge.length; index++) { if (!embeddings[index]) { const embedding = await createEmbeddingWithCache(knowledge[index], pdfName); embeddings[index] = embedding; console.log('createEmbedding success', index); } } writeKnowledgeEmbeddings(pdfName, embeddings); return embeddings; } async function buildQuestionEmbedding(question, pdfName) { const embedding = await createEmbeddingWithCache(question, pdfName); // console.log('createQuestionEmbedding success:', question); return embedding; } module.exports = { buildKnowledgeEmbeddings, buildQuestionEmbedding }; ================================================ FILE: utils/fs.js ================================================ const { writeFileSync, readFileSync, existsSync, mkdirSync } = require('fs'); const { join } = require('path'); function readJsonFile(path) { try { const string = readFileSync(path).toString(); return JSON.parse(string); } catch { return {}; } } function getPdfName(pdfPath) { return pdfPath.split('/').pop().split('.pdf')[0]; } function getPath(pdfName, fileName) { const relativeDirPath = `../knowledgeFiles/${pdfName}`; const dirPath = join(__dirname, relativeDirPath); // 文件夹初始化 if 
(!existsSync(dirPath)) { mkdirSync(dirPath); } return join(__dirname, `${relativeDirPath}/${fileName}.json`); } function writeContentTree(pdfName, docTree) { writeFileSync(getPath(pdfName, 'contentTree'), JSON.stringify(docTree)); } function writeKnowledge(pdfName, knowledge) { writeFileSync(getPath(pdfName, 'knowledge'), JSON.stringify(knowledge)); } function readKnowledge(pdfName) { return readJsonFile(getPath(pdfName, 'knowledge')); } function writeKnowledgeEmbeddings(pdfName, embeddings) { writeFileSync( getPath(pdfName, 'knowledgeEmbeddings'), JSON.stringify(embeddings), ); } function readKnowledgeEmbeddings(pdfName) { return readJsonFile(getPath(pdfName, 'knowledgeEmbeddings')); } function getPdfPath(pdfName) { return join(__dirname, `../pdfs/${pdfName}.pdf`); } function writeAnswer(pdfName, question, answer) { const answerPath = join(__dirname, `../answerFiles/${pdfName}_answers.json`); if (!existsSync(answerPath)) { writeFileSync(answerPath, JSON.stringify({ [question]: answer })); return; } const answerJson = readJsonFile(answerPath); answerJson[question] = answer; writeFileSync(answerPath, JSON.stringify(answerJson)); } module.exports = { getPdfPath, getPdfName, writeAnswer, readKnowledge, writeKnowledge, writeContentTree, writeKnowledgeEmbeddings, readKnowledgeEmbeddings, }; ================================================ FILE: utils/openai.js ================================================ const { Configuration, OpenAIApi } = require('openai'); const { apiKey } = require('../config'); const configuration = new Configuration({ apiKey, }); const openai = new OpenAIApi(configuration); module.exports = openai; ================================================ FILE: utils/pdf.js ================================================ const pdfjs = require('pdfjs-dist'); const { encode } = require('gpt-3-encoder'); const { isDiseaseIntro, shortenDiseaseIntro, shortenContent, } = require('./content'); // 封面 const PAGE_TYPE_COVER = 0; // 目录 const PAGE_TYPE_CATALOG = 
1; // 正文 const PAGE_TYPE_MAIN = 2; const TITLE_SPLIT = '__TITLE__'; const QUOTE_SPLIT = '__QUOTE__'; const REF_SPLIT = '__REF__'; function buildDocTree(longStr) { const [, ...sections] = longStr.split(TITLE_SPLIT); // 将字符串划分成 section 数组 const treeNodes = sections .map(section => { let [titleNo, ...content] = section.split(' '); if (titleNo.endsWith('.')) { titleNo = titleNo.slice(0, -1); } const matchedTitleNo = titleNo.match(/^\d+(\.\d*)*\.?/)?.[0]; let joinedContent = content.join(' '); // 说明标题中有非纯数字标题的内容,把这部分内容拼接到正文中 if (matchedTitleNo !== titleNo) { const titleContent = titleNo.replace(/^\d+(\.\d*)*\.?/, ''); joinedContent = titleContent + ' ' + joinedContent; } let tokenLength = encode(joinedContent).length; // 疾病介绍内容特别长,可以阉割掉具体疾病的详细信息 if (isDiseaseIntro(tokenLength, joinedContent)) { joinedContent = shortenDiseaseIntro(joinedContent); } else if (tokenLength > 4000) { // 不是疾病介绍也特别长的,采用字典压缩法压缩 joinedContent = shortenContent(joinedContent); } tokenLength = encode(joinedContent).length; return { titleNo: matchedTitleNo || titleNo, content: joinedContent, children: [], refs: [], tokenLength, }; }) // .map(node => { // const { content } = node; // if (content.indexOf(QUOTE_SPLIT)) { // const regex = /__QUOTE__([0-9.]+)/g; // let match; // while ((match = regex.exec(content)) !== null) { // node.refs.push(match[1]); // } // node.content = node.content // .replace(regex, '') // .replace(/第\s*\d+\s*页\s*共\d+页/g, ''); // return node; // } // }); return treeNodes; } function isCatalogPage({ items }) { const pageContent = items.map(i => i.str).join(''); if (pageContent.indexOf('条款目录') > -1) { return true; } if (pageContent.split(/(?=\d+.\d+)/).length > 10) { return true; } } // 将注释内容拼接到正文中 function moveNoteToMain(items) { const { mainFontHeight, titlePositionX, pageNumberPositionY } = getPageMetaData(items); const isRefTitle = item => Math.abs(item.transform[4] - titlePositionX) < 2 && item.height / mainFontHeight < 0.7; const refSplitIndex = items.findIndex(isRefTitle); 
if (refSplitIndex < 0) {
    // No footnote marker on this page: nothing to merge
    return items;
  }
  // Main text: everything before the first footnote marker
  const mainItems = items.slice(0, refSplitIndex);
  // Footnotes: prefix every marker with REF_SPLIT, join the text, then split
  // it back into one "<number> <text>" chunk per footnote
  items
    .slice(refSplitIndex)
    .map(refItem => {
      if (isRefTitle(refItem)) {
        refItem.str = `${REF_SPLIT}${refItem.str.trim()} `;
      }
      return refItem.str;
    })
    .join('')
    .split(REF_SPLIT)
    .forEach(refContent => {
      const [refNo, ...content] = refContent.split(' ');
      if (refNo && content.length) {
        // Find the main-text item whose text is exactly the footnote number
        const mainItem = mainItems.find(i => i.str.trim() === refNo);
        if (!mainItem) {
          return;
        }
        // Replace the number with the bracketed footnote text
        mainItem.str = `[${content.join('')}]`;
      }
    });
  return mainItems;
}

// Collect the text items of all main-text pages of a PDF, in page order.
async function getPdfItems(pdfPath) {
  const pdfItems = [];
  // Starts in the "catalog" state, so the cover branch below only applies
  // if this initial value is changed to PAGE_TYPE_COVER
  let pageType = PAGE_TYPE_CATALOG;
  await pdfjs.getDocument(pdfPath).promise.then(doc => {
    const numPages = doc.numPages;
    let lastPromise = doc.getMetadata();
    const loadPage = function (pageNum) {
      return doc.getPage(pageNum).then(page => {
        return page
          .getTextContent({
            disableCombineTextItems: true,
            // includeMarkedContent: true,
          })
          .then(pageData => {
            // Was on the cover and this page is a catalog page: switch to catalog
            if (pageType === PAGE_TYPE_COVER && isCatalogPage(pageData)) {
              pageType = PAGE_TYPE_CATALOG;
            }
            // Was in the catalog and this page is not one: main text starts
            if (pageType === PAGE_TYPE_CATALOG && !isCatalogPage(pageData)) {
              pageType = PAGE_TYPE_MAIN;
            }
            // From the main text onwards, collect the page's items
            if (pageType === PAGE_TYPE_MAIN) {
              const contentItems = pageData.items.map(i => ({ ...i, pageNum }));
              pdfItems.push(...moveNoteToMain(contentItems));
            }
            page.cleanup();
          });
      });
    };
    // Loading of the first page will wait on metadata and subsequent loadings
    // will wait on the previous pages.
for (let i = 1; i <= numPages; i++) { lastPromise = lastPromise.then(() => loadPage(i)); } return lastPromise; }); return pdfItems; } const isTitleNo = (items, itemIndex) => { const item = items[itemIndex]; const nextItem = items[itemIndex + 1]; const { str: itemContent } = item; // 一般来说,太长字符的肯定不是标题,减少后续的正则校验开销 if (itemContent.length > 20) { return false; } if (nextItem && nextItem.str.trim() === '页') { return item; } return /^\d+(\.\d*)*\.?/.test(itemContent.trim()); // return /^\d+(\.\d*)*\.?$/.test(itemContent.trim()); }; function getPageMetaData(items) { const fontHeightCountMap = {}; const numberPositionXCountMap = {}; let minPositionY = Infinity; items.forEach((cur, index) => { const { height, transform } = cur; const positionX = transform[4]; const positionY = transform[5]; if (!height || !transform) { console.log(cur); } const isTitle = isTitleNo(items, index); fontHeightCountMap[height] = (fontHeightCountMap[height] || 0) + 1; minPositionY = Math.min(minPositionY, positionY); if (isTitle) { numberPositionXCountMap[positionX] = (numberPositionXCountMap[positionX] || 0) + 1; } }, {}); const sortedHeights = Object.keys(fontHeightCountMap) .map(height => { return { height: Number(height), counts: fontHeightCountMap[height], }; }) .sort((a, b) => b.counts - a.counts); const sortedPositionXs = Object.keys(numberPositionXCountMap) .map(positionX => { return { positionX: Number(positionX), counts: numberPositionXCountMap[positionX], }; }) .filter(i => i.positionX < 100) .sort((a, b) => b.counts - a.counts); // 处于较左端的,否则会被有些列表项的数字污染 return { // 使用最多的字体大小,有理由相信,它就是正文字体大小 mainFontHeight: sortedHeights[0].height, // 即是数字,又是持续在一个x坐标体现的,有理由相信,它就是标题数字 titlePositionX: sortedPositionXs?.[0]?.positionX, // 最靠底的部分,有理想相信,它是页码的位置。但要小于60,否则是无页码的PDF pageNumberPositionY: minPositionY < 60 ? 
minPositionY : undefined, }; } function rebuildPdfItems(items) { const { titlePositionX, pageNumberPositionY, mainFontHeight } = getPageMetaData(items); return items .map((item, index) => { const { height: currentHeight, str: itemContent, transform } = item; const nextItem = items[index + 1]; const prevItem = items[index - 1]; const positionX = transform[4]; const positionY = transform[5]; // 页码数据不需要 if (pageNumberPositionY === positionY) { return null; } if (itemContent.startsWith('附表')) { item.str = `${TITLE_SPLIT}${itemContent.trim()}`; return item; } if (!isTitleNo(items, index)) { return item; } // 大标题,允许一定误差 if (Math.abs(positionX - titlePositionX) < 2) { item.str = `${TITLE_SPLIT}${itemContent.trim()}`; return item; } // const prevHeight = prevItem?.height; // const nextHeight = nextItem.height; // 引用注释 // if ( // prevItem && // currentHeight < prevHeight && // currentHeight < nextHeight // ) { // item.str = `${QUOTE_SPLIT}${itemContent}`; // return item; // } return item; }) .filter(Boolean); } async function buildDocTreeFromPdf(pdfPath) { const items = await getPdfItems(pdfPath); const itemsWithTreeInfo = rebuildPdfItems(items); // const fs = require('fs'); // console.log('==='); // fs.writeFileSync('./tempItems.json', JSON.stringify(itemsWithTreeInfo)); return buildDocTree(itemsWithTreeInfo.map(i => i.str).join('')); } module.exports = { buildDocTreeFromPdf, getPdfItems, rebuildPdfItems, }; ================================================ FILE: utils/tree.js ================================================ const { getSummaryWithCache } = require('./ai'); const { writeKnowledge, writeContentTree } = require('./fs'); const { shortenContent } = require('./content'); const { encode } = require('gpt-3-encoder'); function getParentNo(titleNo) { const parentNo = titleNo.split('.').slice(0, -1).join('.'); return parentNo; } // 构建嵌套树 function toNestTree(flattenTree) { const tree = []; // 构建一个节点 map const nodesMap = flattenTree.reduce((acc, cur) => { 
acc[cur.titleNo] = cur;
    return acc;
  }, {});

  // Add `tokenLength` to the running total of every ancestor of `node`
  function updateParentTokenLength(node, tokenLength) {
    const parentNo = getParentNo(node.titleNo);
    if (parentNo && nodesMap[parentNo]) {
      const parentNode = nodesMap[parentNo];
      // Grow the parent's total
      parentNode.allTokenLength =
        (parentNode.allTokenLength || 0) + tokenLength;
      // Continue upwards recursively
      updateParentTokenLength(parentNode, tokenLength);
    }
  }

  // Nest the nodes and compute, for each node, the total token length of
  // the content it covers (its own plus all descendants')
  flattenTree.forEach(node => {
    // A node counts with its summary length when it has one
    const { tokenLength, summaryTokenLength } = node;
    const currentTokenLength = summaryTokenLength || tokenLength;
    // Start from the node's own length; children may already have added
    // theirs, hence the accumulation
    node.allTokenLength = (node.allTokenLength || 0) + currentTokenLength;
    updateParentTokenLength(node, currentTokenLength);
    const parentNo = getParentNo(node.titleNo);
    // Attach the node to its parent, or to the root when it has none
    if (parentNo && nodesMap[parentNo]) {
      const parentNode = nodesMap[parentNo];
      parentNode.children.push(node);
    } else {
      tree.push(node);
    }
  });
  return tree;
}

// Nodes whose text exceeds 1000 tokens get an AI-generated summary
async function rebuildTreeWithAISummary(docTree, pdfName) {
  for (let index = 0; index < docTree.length; index++) {
    const node = docTree[index];
    if (node.tokenLength > 1000 && !node.summary) {
      // Disabled: extra compression for extremely long nodes
      // const { content, tokenLength } =
      //   node.tokenLength < 3600
      //     ? 
node // : { // content: shortenContent(node.content), // }; const { content, tokenLength } = node; node.summary = await getSummaryWithCache( { content, tokenLength }, pdfName, ); console.log('build summary success', node.titleNo); } if (node.summary && !node.summaryTokenLength) { node.summaryTokenLength = encode(node.summary).length; } } return docTree; } // 构建嵌套内容树,并将过长子节点做摘要优化,减少节点内容 async function buildNestTreeWithAISummary(docTree, pdfName) { const tree = await rebuildTreeWithAISummary(docTree, pdfName); const nestTree = toNestTree(tree); // 写入文件 writeContentTree(pdfName, nestTree); return nestTree; } // 将多段内容合并为一段 function unionContent(node) { let content = `第${node.titleNo}节内容:` + (node.summary || node.content); node.children.forEach(child => { content = content + '|' + unionContent(child); }); return content; } // 将嵌套树递归构建为打平的内容段落 function buildContents(nodes, contents) { const newContents = contents || []; for (let index = 0; index < nodes.length; index++) { const node = nodes[index]; if (node.allTokenLength > 3000) { buildContents(node.children, newContents); } else { const content = unionContent(node); newContents.push(content); } } return newContents; } // 构建知识库 async function buildKnowledgeFromDocTree(docTree, pdfName) { const nestTree = await buildNestTreeWithAISummary(docTree, pdfName); // const fs = require('fs'); // fs.writeFileSync('./tempNestTree.json', JSON.stringify(nestTree)); const knowledge = buildContents(nestTree); // 写入文件 writeKnowledge(pdfName, knowledge); return knowledge; } module.exports = { buildKnowledgeFromDocTree };