Repository: filipedeschamps/parse-google-docs-json
Branch: master
Commit: d95dd1213125
Files: 7
Total size: 14.5 KB

Directory structure:
gitextract_1al11tqs/

├── .gitignore
├── LICENSE
├── README.md
├── index.d.ts
├── package.json
└── source/
    ├── index.js
    └── parser.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2020 Filipe Deschamps

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Parse Google Docs JSON

This **Node.js** module authenticates with the **Google API** and parses **Google Docs** into human-readable **JSON** or **Markdown**, without the need for cumbersome methods like exporting the document as HTML via the **Google Drive API** and then parsing it back into other formats.

# Why

When you use **Google Docs API V1**, the [body](https://developers.google.com/docs/api/reference/rest/v1/documents#Body) that comes with the `documents.get` method is completely fragmented. It's a JSON that you need to recursively parse to get the document into a human-readable format. Luckily, there's a Gatsby plugin that already has this implementation internally: [gatsby-source-google-docs](https://github.com/cedricdelpoux/gatsby-source-google-docs). So I've extracted this implementation into this module and exposed it with **Service Account authentication**. For more information about this type of authentication, follow this tutorial: [How to authenticate to any Google API](https://flaviocopes.com/google-api-authentication)

# Warning

This module works like a charm, but it's primarily for personal use. It will follow semantic versioning best practices, but will not have any automated tests in the short term.

# How to use

```js
const parseGoogleDocsJson = require("parse-google-docs-json");

async function start() {
  const parsed = await parseGoogleDocsJson({
    documentId: "1ymKw2OGcMfc02XdEEWdy22a_zUAlCxyN3P5Ab4c",
    clientEmail: "service@iam.gserviceaccount.com",
    privateKey: "-----BEGIN PRIVATE KEY...",
  });

  console.log(parsed.toJson());
  console.log(parsed.toMarkdown());
}

start();
```

# Environment variables

```
clientEmail = process.env.PARSE_GOOGLE_DOCS_CLIENT_EMAIL
privateKey = process.env.PARSE_GOOGLE_DOCS_PRIVATE_KEY
```


================================================
FILE: index.d.ts
================================================
declare module 'parse-google-docs-json' {
  /**
   * Options accepted by `parseGoogleDocs`.
   *
   * `clientEmail` and `privateKey` fall back to the
   * `PARSE_GOOGLE_DOCS_CLIENT_EMAIL` / `PARSE_GOOGLE_DOCS_PRIVATE_KEY`
   * environment variables. `documentId` has no fallback: the function throws
   * when it is missing.
   */
  interface Configuration {
    clientEmail?: string,
    privateKey?: string,
    documentId?: string
  }

  namespace parseGoogleDocs {}

  /**
   * Fetches the document and resolves with converters for it.
   *
   * The implementation is an `async` function, so the result is a Promise and
   * must be awaited before calling `toJson` / `toMarkdown`.
   */
  function parseGoogleDocs( configuration?: Configuration): Promise<{
    toJson:()=>{
      /** `null` when the document has no first-page header image. */
      cover: {
        image: string
        title: string
        alt: string
      } | null
      content: any[]
      metadata: {
        title: string
      }
    },
    toMarkdown:()=> string
  }>

  export = parseGoogleDocs
}

================================================
FILE: package.json
================================================
{
  "name": "parse-google-docs-json",
  "version": "4.0.0",
  "description": "This Node.js module authenticates with Google API and parses Google Docs to human-readable JSON or Markdown.",
  "main": "source/index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/filipedeschamps/parse-google-docs-json.git"
  },
  "keywords": [
    "google",
    "api",
    "google",
    "docs",
    "html",
    "markdown"
  ],
  "author": "Filipe Deschamps",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/filipedeschamps/parse-google-docs-json/issues"
  },
  "homepage": "https://github.com/filipedeschamps/parse-google-docs-json#readme",
  "typings": "index.d.ts",
  "dependencies": {
    "googleapis": "148.0.0",
    "json2md": "1.7.1",
    "lodash.get": "4.4.2",
    "lodash.last": "3.0.0",
    "lodash.repeat": "4.1.0",
    "yamljs": "0.3.0"
  }
}


================================================
FILE: source/index.js
================================================
const { google } = require("googleapis");

const {
  convertGoogleDocumentToJson,
  convertJsonToMarkdown,
} = require("./parser.js");

/**
 * Authenticates with a Google service account, downloads a Google Docs
 * document and returns converters for it.
 *
 * @param {Object} [configuration]
 * @param {string} [configuration.clientEmail] - Service account e-mail.
 *   Falls back to `process.env.PARSE_GOOGLE_DOCS_CLIENT_EMAIL`.
 * @param {string} [configuration.privateKey] - Service account private key.
 *   Falls back to `process.env.PARSE_GOOGLE_DOCS_PRIVATE_KEY`.
 * @param {string} configuration.documentId - Id of the document to fetch.
 * @returns {Promise<{toJson: Function, toMarkdown: Function}>} Converters
 *   bound to the fetched document.
 * @throws {Error} When `clientEmail`, `privateKey` or `documentId` is missing.
 *   Errors from the `documents.get` request propagate unchanged.
 */
async function parseGoogleDocs(configuration = {}) {
  const clientEmail =
    configuration.clientEmail || process.env.PARSE_GOOGLE_DOCS_CLIENT_EMAIL;
  const privateKey =
    configuration.privateKey || process.env.PARSE_GOOGLE_DOCS_PRIVATE_KEY;
  const documentId = configuration.documentId;
  // Read-only access is enough: the document is only fetched, never modified.
  const scopes = ["https://www.googleapis.com/auth/documents.readonly"];

  if (!clientEmail) {
    throw new Error('Please, provide "clientEmail" in the constructor');
  }

  if (!privateKey) {
    throw new Error('Please, provide "privateKey" in the constructor');
  }

  if (!documentId) {
    throw new Error('Please, provide "documentId" in the constructor');
  }

  const auth = new google.auth.JWT({
    email: clientEmail,
    key: privateKey,
    scopes: scopes,
  });

  const docs = google.docs({ version: "v1", auth });

  const docsResponse = await docs.documents.get({
    documentId: documentId,
  });

  // Returns `{ metadata, cover, content }` for the fetched document.
  function toJson() {
    return {
      metadata: { title: docsResponse.data.title },
      ...convertGoogleDocumentToJson(docsResponse.data),
    };
  }

  // Returns the document as Markdown preceded by YAML front matter.
  function toMarkdown() {
    // Use the full JSON (with `metadata`) so the front matter carries the
    // title; the bare parser output has no `metadata` key.
    return convertJsonToMarkdown(toJson());
  }

  return {
    toJson,
    toMarkdown,
  };
}

module.exports = parseGoogleDocs;


================================================
FILE: source/parser.js
================================================
const json2md = require("json2md");
const YAML = require("yamljs");
const _last = require("lodash.last");
const _get = require("lodash.get");
const _repeat = require("lodash.repeat");

/**
 * Maps a paragraph's named style to the tag used in the JSON output.
 *
 * @param {Object} p - Paragraph from the Google Docs body.
 * @returns {string|undefined} Tag name, or `undefined` when the paragraph has
 *   no style or its named style has no mapping (e.g. `TITLE`).
 */
function getParagraphTag(p) {
  const tags = {
    NORMAL_TEXT: "p",
    SUBTITLE: "blockquote",
    HEADING_1: "h1",
    HEADING_2: "h2",
    HEADING_3: "h3",
    HEADING_4: "h4",
    HEADING_5: "h5",
    HEADING_6: "h6",
  };

  // A paragraph without style information simply has no tag.
  const namedStyleType = p.paragraphStyle && p.paragraphStyle.namedStyleType;

  return tags[namedStyleType];
}

// Picks the list tag: "ol" when the first nesting level of the list
// declares a glyphType, "ul" otherwise (including when `list` is missing).
function getListTag(list) {
  const firstLevelGlyphPath = [
    "listProperties",
    "nestingLevels",
    0,
    "glyphType",
  ];

  if (_get(list, firstLevelGlyphPath) === undefined) {
    return "ul";
  }

  return "ol";
}

// Drops every line feed from `text`, then trims surrounding whitespace.
function cleanText(text) {
  const withoutLineFeeds = text.split("\n").join("");
  return withoutLineFeeds.trim();
}

// Builds the prefix of a nested list item: two spaces per nesting level,
// followed by the list marker ("1." for "ol", "-" otherwise) and a space.
function getNestedListIndent(level, listTag) {
  const padding = _repeat("  ", level);
  const marker = listTag === "ol" ? "1." : "-";

  return padding + marker + " ";
}

// Concatenates the formatted text of every text run in the paragraph,
// skipping runs that are a bare line feed and elements that are not text.
// Returns "" when the paragraph has no elements.
function getTextFromParagraph(p) {
  if (!p.elements) {
    return "";
  }

  return p.elements.reduce((text, el) => {
    const isTextRun = el.textRun && el.textRun.content !== "\n";
    return isTextRun ? text + getText(el) : text;
  }, "");
}

/**
 * Flattens the content of a table cell into a single string.
 *
 * @param {Array<Object>} [content] - Structural elements of the cell.
 * @returns {string} Concatenated, cleaned text of the cell's paragraphs;
 *   "" when the cell has no content.
 */
function getTableCellContent(content) {
  if (!content || content.length === 0) return "";

  return (
    content
      // Only paragraphs carry text; other elements (e.g. a nested table)
      // have no `paragraph` key and would otherwise crash the text lookup.
      .filter(({ paragraph }) => paragraph)
      .map(({ paragraph }) => cleanText(getTextFromParagraph(paragraph)))
      .join("")
  );
}

/**
 * Resolves the image referenced by a paragraph element.
 *
 * @param {Object} document - Google Docs document (reads `inlineObjects`).
 * @param {Object} [element] - Paragraph element that may hold an
 *   `inlineObjectElement`.
 * @returns {{source: string, title: string, alt: string}|null} Image data,
 *   or `null` when the element is missing, is not an inline object, points to
 *   an unknown object, or the object is not an image.
 */
function getImage(document, element) {
  const { inlineObjects } = document;

  if (!inlineObjects || !element || !element.inlineObjectElement) {
    return null;
  }

  const inlineObject =
    inlineObjects[element.inlineObjectElement.inlineObjectId];
  // The referenced id may be absent from `inlineObjects`; treat it as "no image".
  const embeddedObject =
    inlineObject &&
    inlineObject.inlineObjectProperties &&
    inlineObject.inlineObjectProperties.embeddedObject;

  if (!embeddedObject || !embeddedObject.imageProperties) {
    return null;
  }

  return {
    source: embeddedObject.imageProperties.contentUri,
    title: embeddedObject.title || "",
    alt: embeddedObject.description || "",
  };
}

/**
 * Renders one element of a list item as Markdown.
 *
 * @param {Object} document - Google Docs document, used to resolve images.
 * @param {Object} element - Paragraph element of the bullet.
 * @returns {string} Markdown image for inline images, formatted text for
 *   text runs, and "" for anything that cannot be rendered (an inline object
 *   that is not an image, or an element without a text run).
 */
function getBulletContent(document, element) {
  if (element.inlineObjectElement) {
    const image = getImage(document, element);
    // getImage returns null for inline objects that are not images.
    return image ? `![${image.alt}](${image.source} "${image.title}")` : "";
  }

  // Elements without a text run have nothing for getText to read.
  if (!element.textRun) {
    return "";
  }

  return getText(element);
}

/**
 * Converts a text run into Markdown, applying its text style.
 *
 * @param {Object} element - Paragraph element holding a `textRun`.
 * @param {Object} [options]
 * @param {boolean} [options.isHeader=false] - When true, bold is not applied.
 * @returns {string} Markdown for the run; "" when the run has no visible text.
 */
function getText(element, { isHeader = false } = {}) {
  let text = cleanText(element.textRun.content);

  // Wrapping empty text in markers would emit literal "****", "__" or "[]()".
  if (text === "") {
    return "";
  }

  const {
    link,
    underline,
    strikethrough,
    bold,
    italic,
  } = element.textRun.textStyle || {};

  // Escape characters that would otherwise be read as emphasis markers.
  text = text.replace(/\*/g, "\\*");
  text = text.replace(/_/g, "\\_");

  // Underline isn't supported in markdown so it is rendered as emphasis too.
  // Apply the marker once: stacking it for underline + italic would produce
  // "__text__", which markdown renders as bold.
  if (underline || italic) {
    text = `_${text}_`;
  }

  // Set bold unless it's a header
  if (bold && !isHeader) {
    text = `**${text}**`;
  }

  if (strikethrough) {
    text = `~~${text}~~`;
  }

  if (link) {
    return `[${text}](${link.url})`;
  }

  return text;
}

/**
 * Extracts the cover image from the first element of the document's
 * first-page header.
 *
 * @param {Object} document - Google Docs document (reads `headers`,
 *   `documentStyle` and, through getImage, `inlineObjects`).
 * @returns {{image: string, title: string, alt: string}|null} Cover data, or
 *   `null` when there is no first-page header or it does not start with an
 *   image.
 */
function getCover(document) {
  const { headers, documentStyle } = document;
  const headerId = documentStyle && documentStyle.firstPageHeaderId;

  if (!headerId || !headers || !headers[headerId]) {
    return null;
  }

  const headerElement = _get(headers[headerId], [
    "content",
    0,
    "paragraph",
    "elements",
    0,
  ]);

  // The header may not start with a paragraph element.
  if (!headerElement) {
    return null;
  }

  const image = getImage(document, headerElement);

  return image
    ? {
        image: image.source,
        title: image.title,
        alt: image.alt,
      }
    : null;
}

// Removes the space that joining text runs with " " leaves before every
// "." and ",".
function tidyJoinedPunctuation(text) {
  return text.replace(/ \./g, ".").replace(/ ,/g, ",");
}

/**
 * Converts a Google Docs document into a flat, json2md-style structure.
 *
 * Walks `document.body.content` and emits, in order: lists (`ol` / `ul`),
 * tagged paragraphs (the tag comes from getParagraphTag), images (`img`) and
 * tables (`table` with `headers` and `rows`). Footnote definitions are
 * appended at the end, sorted by footnote number.
 *
 * @param {Object} document - Document as returned by `documents.get`.
 * @returns {{cover: Object|null, content: Array<Object>}} Cover image data
 *   (see getCover) and the converted content.
 */
function convertGoogleDocumentToJson(document) {
  const { body, footnotes = {}, lists = {} } = document;
  const cover = getCover(document);

  const content = [];
  // Maps footnoteId -> footnoteNumber for every reference found in the body.
  const footnoteIDs = {};

  body.content.forEach(({ paragraph, table }, i) => {
    // Paragraphs
    if (paragraph) {
      const tag = getParagraphTag(paragraph);

      // Lists
      if (paragraph.bullet) {
        const listId = paragraph.bullet.listId;
        const listTag = getListTag(lists[listId]);

        const bulletContent = tidyJoinedPunctuation(
          paragraph.elements
            .map((el) => getBulletContent(document, el))
            .join(" ")
        );

        const prev = body.content[i - 1];
        const prevListId = _get(prev, "paragraph.bullet.listId");

        if (prevListId === listId) {
          // Same list as the previous paragraph: extend the list pushed last.
          const items = _last(content)[listTag];
          const { nestingLevel } = paragraph.bullet;

          if (nestingLevel !== undefined) {
            // mimic nested lists
            const lastIndex = items.length - 1;
            const indent = getNestedListIndent(nestingLevel, listTag);

            items[lastIndex] += `\n${indent} ${bulletContent}`;
          } else {
            items.push(bulletContent);
          }
        } else {
          content.push({
            [listTag]: [bulletContent],
          });
        }
      }

      // Headings, Images, Texts
      else if (tag) {
        const tagContent = [];

        paragraph.elements.forEach((el) => {
          // EmbeddedObject
          if (el.inlineObjectElement) {
            const image = getImage(document, el);

            if (image) {
              tagContent.push({
                img: image,
              });
            }
          }

          // Headings, Texts
          else if (el.textRun && el.textRun.content !== "\n") {
            tagContent.push({
              [tag]: getText(el, {
                isHeader: tag !== "p",
              }),
            });
          }

          // Footnotes
          else if (el.footnoteReference) {
            tagContent.push({
              [tag]: `[^${el.footnoteReference.footnoteNumber}]`,
            });
            footnoteIDs[el.footnoteReference.footnoteId] =
              el.footnoteReference.footnoteNumber;
          }
        });

        // A paragraph made only of text is merged into a single entry; one
        // that contains images is emitted piece by piece.
        if (tagContent.every((el) => el[tag] !== undefined)) {
          content.push({
            [tag]: tidyJoinedPunctuation(
              tagContent.map((el) => el[tag]).join(" ")
            ),
          });
        } else {
          content.push(...tagContent);
        }
      }
    }

    // Table
    else if (table && table.tableRows.length > 0) {
      // The first row is used as the table header.
      const [thead, ...tbody] = table.tableRows;
      content.push({
        table: {
          headers: thead.tableCells.map(({ content }) =>
            getTableCellContent(content)
          ),
          rows: tbody.map((row) =>
            row.tableCells.map(({ content }) => getTableCellContent(content))
          ),
        },
      });
    }
  });

  // Footnotes reference section (end of document)
  const formattedFootnotes = Object.values(footnotes).map((footnote) => {
    // Only the first paragraph of the footnote is read; elements without a
    // text run are skipped because getText cannot render them.
    const elements =
      _get(footnote, ["content", 0, "paragraph", "elements"]) || [];
    const text = tidyJoinedPunctuation(
      elements
        .filter((element) => element.textRun)
        .map((element) => getText(element))
        .join(" ")
    );

    return {
      footnote: { number: footnoteIDs[footnote.footnoteId], text: text },
    };
  });
  formattedFootnotes.sort(
    (item1, item2) =>
      parseInt(item1.footnote.number, 10) - parseInt(item2.footnote.number, 10)
  );
  content.push(...formattedFootnotes);

  return {
    cover,
    content,
  };
}

// Register a json2md converter that renders a `footnote` entry
// ({ number, text }) as a Markdown footnote definition.
json2md.converters.footnote = function ({ number, text }) {
  return `[^${number}]: ${text}`;
};

// Renders the document as Markdown: a YAML front matter block built from
// `metadata`, a blank line, then `content` converted by json2md.
function convertJsonToMarkdown({ content, metadata }) {
  const frontMatter = YAML.stringify(metadata);
  const markdownBody = json2md(content);

  // Line breaks are spelled out explicitly so re-indenting this function
  // cannot change the generated document.
  return "---\n" + frontMatter + "\n---\n\n" + markdownBody;
}

module.exports = { convertGoogleDocumentToJson, convertJsonToMarkdown };