Repository: filipedeschamps/parse-google-docs-json Branch: master Commit: d95dd1213125 Files: 7 Total size: 14.5 KB Directory structure: gitextract_1al11tqs/ ├── .gitignore ├── LICENSE ├── README.md ├── index.d.ts ├── package.json └── source/ ├── index.js └── parser.js ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Logs logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* lerna-debug.log* # Diagnostic reports (https://nodejs.org/api/report.html) report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json # Runtime data pids *.pid *.seed *.pid.lock # Directory for instrumented libs generated by jscoverage/JSCover lib-cov # Coverage directory used by tools like istanbul coverage *.lcov # nyc test coverage .nyc_output # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) .grunt # Bower dependency directory (https://bower.io/) bower_components # node-waf configuration .lock-wscript # Compiled binary addons (https://nodejs.org/api/addons.html) build/Release # Dependency directories node_modules/ jspm_packages/ # TypeScript v1 declaration files typings/ # TypeScript cache *.tsbuildinfo # Optional npm cache directory .npm # Optional eslint cache .eslintcache # Microbundle cache .rpt2_cache/ .rts2_cache_cjs/ .rts2_cache_es/ .rts2_cache_umd/ # Optional REPL history .node_repl_history # Output of 'npm pack' *.tgz # Yarn Integrity file .yarn-integrity # dotenv environment variables file .env .env.test # parcel-bundler cache (https://parceljs.org/) .cache # Next.js build output .next # Nuxt.js build / generate output .nuxt dist # Gatsby files .cache/ # Comment in the public line in if your project uses Gatsby and *not* Next.js # https://nextjs.org/blog/next-9-1#public-directory-support # public # vuepress build output .vuepress/dist # Serverless directories .serverless/ # FuseBox 
cache .fusebox/ # DynamoDB Local files .dynamodb/ # TernJS port file .tern-port ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2020 Filipe Deschamps Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Parse Google Docs JSON This **Node.js** module authenticates with the **Google API** and parses **Google Docs** documents into human-readable **JSON** or **Markdown**, without the need for cumbersome methods like exporting the document as HTML via the **Google Drive API** and then parsing it back to other formats. # Why When you use **Google Docs API V1**, the [body](https://developers.google.com/docs/api/reference/rest/v1/documents#Body) that comes with the `documents.get` method is completely fragmented. It's a JSON that you need to recursively parse to get the document into a human-readable format. 
For my luck, there's a Gatsby plugin that internally has this implementation already: [gatsby-source-google-docs](https://github.com/cedricdelpoux/gatsby-source-google-docs). So I've extracted this implementation into this module and exposed it with a **Service Authentication**. For more information about this type of authentication, follow this tutorial: [How to authenticate to any Google API](https://flaviocopes.com/google-api-authentication) # Warning This module works like a charm, but it's for personal use, primarily. It will follow semantic version best practices, but will not have any automated tests in the short term. # How to use ```js const parseGoogleDocsJson = require("parse-google-docs-json"); async function start() { const parsed = await parseGoogleDocsJson({ documentId: "1ymKw2OGcMfc02XdEEWdy22a_zUAlCxyN3P5Ab4c", clientEmail: "service@iam.gserviceaccount.com", privateKey: "-----BEGIN PRIVATE KEY...", }); console.log(parsed.toJson()); console.log(parsed.toMarkdown()); } start(); ``` # Environment variables ``` clientEmail = process.env.PARSE_GOOGLE_DOCS_CLIENT_EMAIL privateKey = process.env.PARSE_GOOGLE_DOCS_PRIVATE_KEY ``` ================================================ FILE: index.d.ts ================================================ declare module 'parse-google-docs-json' { interface Configuration { clientEmail?:string, privateKey?:string, documentId?: string } namespace parseGoogleDocs {} function parseGoogleDocs( configuration?: Configuration): { toJson:()=>{ cover: { image: string title: string alt: string } content: any[] metadata: { title: string } }, toMarkdown:()=> string } export = parseGoogleDocs } ================================================ FILE: package.json ================================================ { "name": "parse-google-docs-json", "version": "4.0.0", "description": "This Node.js module authenticates with Google API and parse Google Docs to human-readable JSON or Markdown.", "main": "source/index.js", "scripts": { "test": 
"echo \"Error: no test specified\" && exit 1" }, "repository": { "type": "git", "url": "git+https://github.com/filipedeschamps/parse-google-docs-json.git" }, "keywords": [ "google", "api", "google", "docs", "html", "markdown" ], "author": "Filipe Deschamps", "license": "MIT", "bugs": { "url": "https://github.com/filipedeschamps/parse-google-docs-json/issues" }, "homepage": "https://github.com/filipedeschamps/parse-google-docs-json#readme", "typings": "index.d.ts", "dependencies": { "googleapis": "148.0.0", "json2md": "1.7.1", "lodash.get": "4.4.2", "lodash.last": "3.0.0", "lodash.repeat": "4.1.0", "yamljs": "0.3.0" } } ================================================ FILE: source/index.js ================================================ const { google } = require("googleapis"); const { convertGoogleDocumentToJson, convertJsonToMarkdown, } = require("./parser.js"); async function parseGoogleDocs(configuration = {}) { const clientEmail = configuration.clientEmail || process.env.PARSE_GOOGLE_DOCS_CLIENT_EMAIL; const privateKey = configuration.privateKey || process.env.PARSE_GOOGLE_DOCS_PRIVATE_KEY; const documentId = configuration.documentId; const scopes = ["https://www.googleapis.com/auth/documents.readonly"]; if (!clientEmail) { throw new Error('Please, provide "clientEmail" in the constructor'); } if (!privateKey) { throw new Error('Please, provide "privateKey" in the constructor'); } if (!documentId) { throw new Error('Please, provide "documentId" in the constructor'); } const auth = new google.auth.JWT({ email: clientEmail, key: privateKey, scopes: scopes, }); const docs = google.docs({ version: "v1", auth }); const docsResponse = await docs.documents.get({ documentId: documentId, }); function toJson() { const jsonDocument = convertGoogleDocumentToJson(docsResponse.data); return { metadata: { title: docsResponse.data.title }, ...jsonDocument, }; } function toMarkdown() { const documentInJson = convertGoogleDocumentToJson(docsResponse.data); return 
convertJsonToMarkdown(documentInJson); } return { toJson, toMarkdown, }; } module.exports = parseGoogleDocs; ================================================ FILE: source/parser.js ================================================ const json2md = require("json2md"); const YAML = require("yamljs"); const _last = require("lodash.last"); const _get = require("lodash.get"); const _repeat = require("lodash.repeat"); function getParagraphTag(p) { const tags = { NORMAL_TEXT: "p", SUBTITLE: "blockquote", HEADING_1: "h1", HEADING_2: "h2", HEADING_3: "h3", HEADING_4: "h4", HEADING_5: "h5", }; return tags[p.paragraphStyle.namedStyleType]; } function getListTag(list) { const glyphType = _get(list, [ "listProperties", "nestingLevels", 0, "glyphType", ]); return glyphType !== undefined ? "ol" : "ul"; } function cleanText(text) { return text.replace(/\n/g, "").trim(); } function getNestedListIndent(level, listTag) { const indentType = listTag === "ol" ? "1." : "-"; return `${_repeat(" ", level)}${indentType} `; } function getTextFromParagraph(p) { return p.elements ? p.elements .filter((el) => el.textRun && el.textRun.content !== "\n") .map((el) => (el.textRun ? 
getText(el) : "")) .join("") : ""; } function getTableCellContent(content) { if (!content.length === 0) return ""; return content .map(({ paragraph }) => cleanText(getTextFromParagraph(paragraph))) .join(""); } function getImage(document, element) { const { inlineObjects } = document; if (!inlineObjects || !element.inlineObjectElement) { return null; } const inlineObject = inlineObjects[element.inlineObjectElement.inlineObjectId]; const embeddedObject = inlineObject.inlineObjectProperties.embeddedObject; if (embeddedObject && embeddedObject.imageProperties) { return { source: embeddedObject.imageProperties.contentUri, title: embeddedObject.title || "", alt: embeddedObject.description || "", }; } return null; } function getBulletContent(document, element) { if (element.inlineObjectElement) { const image = getImage(document, element); return `![${image.alt}](${image.source} "${image.title}")`; } return getText(element); } function getText(element, { isHeader = false } = {}) { let text = cleanText(element.textRun.content); const { link, underline, strikethrough, bold, italic, } = element.textRun.textStyle; text = text.replace(/\*/g, "\\*"); text = text.replace(/_/g, "\\_"); if (underline) { // Underline isn't supported in markdown so we'll use emphasis text = `_${text}_`; } if (italic) { text = `_${text}_`; } // Set bold unless it's a header if (bold & !isHeader) { text = `**${text}**`; } if (strikethrough) { text = `~~${text}~~`; } if (link) { return `[${text}](${link.url})`; } return text; } function getCover(document) { const { headers, documentStyle } = document; if ( !documentStyle || !documentStyle.firstPageHeaderId || !headers[documentStyle.firstPageHeaderId] ) { return null; } const headerElement = _get(headers[documentStyle.firstPageHeaderId], [ "content", 0, "paragraph", "elements", 0, ]); const image = getImage(document, headerElement); return image ? 
{ image: image.source, title: image.title, alt: image.alt, } : null; } function convertGoogleDocumentToJson(document) { const { body, footnotes = {} } = document; const cover = getCover(document); const content = []; const footnoteIDs = {}; body.content.forEach(({ paragraph, table }, i) => { // Paragraphs if (paragraph) { const tag = getParagraphTag(paragraph); // Lists if (paragraph.bullet) { const listId = paragraph.bullet.listId; const list = document.lists[listId]; const listTag = getListTag(list); const bulletContent = paragraph.elements .map((el) => getBulletContent(document, el)) .join(" ") .replace(" .", ".") .replace(" ,", ","); const prev = body.content[i - 1]; const prevListId = _get(prev, "paragraph.bullet.listId"); if (prevListId === listId) { const list = _last(content)[listTag]; const { nestingLevel } = paragraph.bullet; if (nestingLevel !== undefined) { // mimic nested lists const lastIndex = list.length - 1; const indent = getNestedListIndent(nestingLevel, listTag); list[lastIndex] += `\n${indent} ${bulletContent}`; } else { list.push(bulletContent); } } else { content.push({ [listTag]: [bulletContent], }); } } // Headings, Images, Texts else if (tag) { let tagContent = []; paragraph.elements.forEach((el) => { // EmbeddedObject if (el.inlineObjectElement) { const image = getImage(document, el); if (image) { tagContent.push({ img: image, }); } } // Headings, Texts else if (el.textRun && el.textRun.content !== "\n") { tagContent.push({ [tag]: getText(el, { isHeader: tag !== "p", }), }); } // Footnotes else if (el.footnoteReference) { tagContent.push({ [tag]: `[^${el.footnoteReference.footnoteNumber}]`, }); footnoteIDs[el.footnoteReference.footnoteId] = el.footnoteReference.footnoteNumber; } }); if (tagContent.every((el) => el[tag] !== undefined)) { content.push({ [tag]: tagContent .map((el) => el[tag]) .join(" ") .replace(" .", ".") .replace(" ,", ","), }); } else { content.push(...tagContent); } } } // Table else if (table && table.tableRows.length 
> 0) { const [thead, ...tbody] = table.tableRows; content.push({ table: { headers: thead.tableCells.map(({ content }) => getTableCellContent(content) ), rows: tbody.map((row) => row.tableCells.map(({ content }) => getTableCellContent(content)) ), }, }); } }); // Footnotes reference section (end of document) let formatedFootnotes = []; Object.entries(footnotes).forEach(([, value]) => { // Concatenate all content const text_items = value.content[0].paragraph.elements.map((element) => getText(element) ); const text = text_items.join(" ").replace(" .", ".").replace(" ,", ","); formatedFootnotes.push({ footnote: { number: footnoteIDs[value.footnoteId], text: text }, }); }); formatedFootnotes.sort( (item1, item2) => parseInt(item1.footnote.number) - parseInt(item2.footnote.number) ); content.push(...formatedFootnotes); return { cover, content, }; } // Add extra converter for footnotes json2md.converters.footnote = function (footnote) { return `[^${footnote.number}]: ${footnote.text}`; }; function convertJsonToMarkdown({ content, metadata }) { // Do NOT move the formatting of the following lines // to prevent markdown parsing errors return `--- ${YAML.stringify(metadata)} --- ${json2md(content)}`; } module.exports = { convertGoogleDocumentToJson, convertJsonToMarkdown };