Repository: aws-samples/amazon-textract-response-parser
Branch: master
Commit: a9b44fee2aa9
Files: 178
Total size: 53.3 MB
Directory structure:
gitextract_tsv6824f/
├── .flake8
├── .github/
│ ├── PULL_REQUEST_TEMPLATE.md
│ └── workflows/
│ └── test_pull_request.yml
├── .idea/
│ ├── amazon-textract-response-parser.iml
│ ├── inspectionProfiles/
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── vcs.xml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── src-csharp/
│ ├── LICENSE
│ ├── Program.cs
│ ├── README.md
│ ├── TextractExtensions.cs
│ ├── appsettings.json
│ └── parser.csproj
├── src-js/
│ ├── .eslintrc.js
│ ├── .nvmrc
│ ├── .prettierrc.js
│ ├── CHANGELOG.md
│ ├── README.md
│ ├── bin/
│ │ └── reading-order-diagnostic.js
│ ├── examples/
│ │ ├── README.md
│ │ ├── browser-iife/
│ │ │ ├── main.html
│ │ │ ├── main.js
│ │ │ ├── package.json
│ │ │ └── test.js
│ │ ├── nodejs-import/
│ │ │ ├── main.js
│ │ │ └── package.json
│ │ ├── nodejs-require/
│ │ │ ├── main.js
│ │ │ └── package.json
│ │ └── nodejs-typescript/
│ │ ├── package.json
│ │ ├── src/
│ │ │ └── main.ts
│ │ └── tsconfig.json
│ ├── jest.config.js
│ ├── package.json
│ ├── rollup.config.mjs
│ ├── src/
│ │ ├── api-models/
│ │ │ ├── base.ts
│ │ │ ├── content.ts
│ │ │ ├── document.ts
│ │ │ ├── expense.ts
│ │ │ ├── form.ts
│ │ │ ├── geometry.ts
│ │ │ ├── id.ts
│ │ │ ├── index.ts
│ │ │ ├── layout.ts
│ │ │ ├── query.ts
│ │ │ ├── response.ts
│ │ │ └── table.ts
│ │ ├── base.ts
│ │ ├── content.ts
│ │ ├── document.ts
│ │ ├── expense.ts
│ │ ├── form.ts
│ │ ├── geometry.ts
│ │ ├── id.ts
│ │ ├── index.ts
│ │ ├── layout.ts
│ │ ├── query.ts
│ │ └── table.ts
│ ├── test/
│ │ ├── data/
│ │ │ ├── analyzeid-test-drivers-license-response.json
│ │ │ ├── analyzeid-test-passport-response.json
│ │ │ ├── expense-missing-geoms-response.json
│ │ │ ├── financial-document-response.json
│ │ │ ├── form1005-response.json
│ │ │ ├── invoice-expense-response.json
│ │ │ ├── paystub-response.json
│ │ │ ├── table-example-response.json
│ │ │ ├── test-failed-response.json
│ │ │ ├── test-inprogress-response.json
│ │ │ ├── test-multicol-response-2.json
│ │ │ ├── test-multicol-response.json
│ │ │ ├── test-query-response.json
│ │ │ ├── test-response.json
│ │ │ └── test-twocol-header-footer-response.json
│ │ ├── integ/
│ │ │ └── aws-sdk.test.ts
│ │ ├── tsconfig.json
│ │ └── unit/
│ │ ├── api-models.test.ts
│ │ ├── base.test.ts
│ │ ├── content.test.ts
│ │ ├── corpus/
│ │ │ ├── header-footer.test.ts
│ │ │ └── reading-order.test.ts
│ │ ├── document.test.ts
│ │ ├── expense.test.ts
│ │ ├── form.test.ts
│ │ ├── geometry.test.ts
│ │ ├── id.test.ts
│ │ ├── index.test.ts
│ │ ├── layout.test.ts
│ │ ├── query.test.ts
│ │ └── table.test.ts
│ ├── tsconfig.browser.json
│ ├── tsconfig.cjs.json
│ ├── tsconfig.es.json
│ ├── tsconfig.json
│ └── tsconfig.types.json
└── src-python/
├── .style.yapf
├── .yapfignore
├── README.md
├── a2i/
│ ├── README.md
│ ├── __init__.py
│ ├── a2i-response.json
│ ├── a2irp.py
│ └── a2irptest.py
├── bin/
│ └── amazon-textract-pipeline
├── extras/
│ └── dev.txt
├── setup.cfg
├── setup.py
├── tests/
│ ├── data/
│ │ ├── 180-degree-roation.json
│ │ ├── 2023-Q2-table-model-sample.json
│ │ ├── all_features_with_floating_title_header.json
│ │ ├── analyzeExpenseResponse-multipage.json
│ │ ├── bounding_box_issue.json
│ │ ├── employment-application.json
│ │ ├── gib.json
│ │ ├── gib1.json
│ │ ├── gib_10_degrees.json
│ │ ├── gib__10_degrees.json
│ │ ├── gib__15_degrees.json
│ │ ├── gib__180_degrees.json
│ │ ├── gib__25_degrees.json
│ │ ├── gib__270_degrees.json
│ │ ├── gib__90_degrees.json
│ │ ├── gib__minus_10_degrees.json
│ │ ├── gib_multi_page_table_merge.json
│ │ ├── gib_multi_page_tables.json
│ │ ├── gib_multi_tables_multi_page_sample.json
│ │ ├── in-table-footer.json
│ │ ├── in-table-title.json
│ │ ├── issue_83.json
│ │ ├── lending-doc-output.json
│ │ ├── lending-package-no-signature.json
│ │ ├── little_women_page_1.json
│ │ ├── multi-page-forms-samples-2-page.json
│ │ ├── multi-tables-multi-page-sample.json
│ │ ├── patient_intake_form_sample.json
│ │ ├── paystub_with_signature.json
│ │ ├── queries_sample.json
│ │ ├── request_for_verification_of_employment.json
│ │ ├── table-performance-pretty.json
│ │ ├── tables_with_headers_and_merged_cells.json
│ │ ├── tables_with_headers_out_of_order_cells.json
│ │ ├── tables_with_merged_cells_sample1.json
│ │ ├── tables_with_merged_cells_sample2.json
│ │ ├── test-trp2-analyzeid_sample_multi_page.json
│ │ ├── test-trp2_analyzeid_sample1.json
│ │ ├── test-trp2_analyzeid_sample1_with_OCR.json
│ │ ├── test-trp2_analyzeid_sample2.json
│ │ ├── test_table_merged_text.json
│ │ ├── test_trp2_expense_sample1.json
│ │ ├── test_trp2_expense_sample2.json
│ │ ├── test_trp2_expense_sample3.json
│ │ ├── test_trp2_expense_sample4.json
│ │ └── textract-new-tables-api.json
│ ├── test-response.json
│ ├── test_base_trp2.py
│ ├── test_merged.py
│ ├── test_t_tables.py
│ ├── test_trp.py
│ ├── test_trp2.py
│ ├── test_trp2_analyzeid.py
│ ├── test_trp2_expense.py
│ └── test_trp2_lending.py
├── textract-mapping/
│ ├── README.md
│ ├── __init__.py
│ ├── loan-app-response.json
│ ├── mapping-response.json
│ ├── mapping.py
│ └── mappingtest.py
└── trp/
├── __init__.py
├── t_pipeline.py
├── t_tables.py
├── trp2.py
├── trp2_analyzeid.py
├── trp2_expense.py
└── trp2_lending.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .flake8
================================================
[flake8]
ignore = E501,W503
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
*Issue #, if available:*
*Description of changes:*
By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
================================================
FILE: .github/workflows/test_pull_request.yml
================================================
# Controls when the action will run. Triggers the workflow on pull request
# events that touch files under the src-python folder, or on manual dispatch.
name: Test-Pull-Request
on:
  pull_request:
    types: [assigned, opened, synchronize, reopened]
    paths:
      # A bare "src-python" only matches a path with exactly that name; the
      # glob is needed to match files inside the folder.
      - "src-python/**"
  workflow_dispatch:
# Run the tests
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    defaults:
      run:
        working-directory: ./src-python
    steps:
      # Checks out the repository
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      # Install package locally
      - name: Install package
        run: pip install -e .
      # Install dev dependencies
      - name: Install dependencies
        run: pip install -r ./extras/dev.txt
      # Run tests
      - name: Test
        run: pytest
================================================
FILE: .idea/amazon-textract-response-parser.iml
================================================
================================================
FILE: .idea/inspectionProfiles/profiles_settings.xml
================================================
================================================
FILE: .idea/misc.xml
================================================
================================================
FILE: .idea/modules.xml
================================================
================================================
FILE: .idea/vcs.xml
================================================
================================================
FILE: CODE_OF_CONDUCT.md
================================================
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing Guidelines
Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
documentation, we greatly value feedback and contributions from our community.
Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
information to effectively respond to your bug report or contribution.
## Reporting Bugs/Feature Requests
We welcome you to use the GitHub issue tracker to report bugs or suggest features.
When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-textract-response-parser/issues), or [recently closed](https://github.com/aws-samples/amazon-textract-response-parser/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment
## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
1. You are working against the latest source on the *master* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
To send us a pull request, please:
1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-textract-response-parser/labels/help%20wanted) issues is a great place to start.
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.
## Security issue notifications
If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
## Licensing
See the [LICENSE](https://github.com/aws-samples/amazon-textract-response-parser/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
================================================
FILE: README.md
================================================
# Textract Response Parser
You can use the Textract response parser library to easily parse JSON returned by Amazon Textract. The library parses JSON and provides programming language specific constructs to work with different parts of the document. [textractor](https://github.com/aws-samples/amazon-textract-textractor) is an example of a PoC batch processing tool that takes advantage of the Textract response parser library and generates output in multiple formats.
## Python Usage
For documentation on usage see: [src-python/README.md](src-python/README.md)
## JavaScript/TypeScript Usage
For documentation on usage see: [src-js/README.md](src-js/README.md)
## C# Usage
### Forms
```csharp
document.Pages.ForEach(page => {
Console.WriteLine("Print Lines and Words:");
page.Lines.ForEach(line => {
Console.WriteLine("{0}--{1}", line.Text, line.Confidence);
line.Words.ForEach(word => {
Console.WriteLine("{0}--{1}", word.Text, word.Confidence);
});
});
Console.WriteLine("Print Fields:");
page.Form.Fields.ForEach(f => {
Console.WriteLine("Field: Key: {0}, Value {1}", f.Key, f.Value);
});
Console.WriteLine("Get Field by Key:");
var key = "Phone Number:";
var field = page.Form.GetFieldByKey(key);
if(field != null) {
Console.WriteLine("Field: Key: {0}, Value: {1}", field.Key, field.Value);
}
});
```
### Tables
```csharp
document.Pages.ForEach(page => {
page.Tables.ForEach(table => {
var r = 0;
table.Rows.ForEach(row => {
r++;
var c = 0;
row.Cells.ForEach(cell => {
c++;
Console.WriteLine("Table [{0}][{1}] = {2}--{3}", r, c, cell.Text, cell.Confidence);
});
});
});
});
```
Check out the `src-csharp` folder for instructions on how to run the [.NET Core C#](src-csharp/README.md) samples.
## Other Resources
- [Large scale document processing with Amazon Textract - Reference Architecture](https://github.com/aws-samples/amazon-textract-serverless-large-scale-document-processing)
- [Batch processing tool](https://github.com/aws-samples/amazon-textract-textractor)
- [Code samples](https://github.com/aws-samples/amazon-textract-code-samples)
## License Summary
This sample code is made available under the Apache License, Version 2.0. See the LICENSE file.
================================================
FILE: src-csharp/LICENSE
================================================
Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: src-csharp/Program.cs
================================================
using System;
using Microsoft.Extensions.Configuration;
using Amazon.Textract;
using Amazon.Textract.Model;
using System.Collections.Generic;
using System.Threading.Tasks;
namespace parser {
/// <summary>
/// Console sample: runs a Textract document analysis for FORMS and then for
/// TABLES on one S3 object and prints the parsed result to the console.
/// </summary>
class Program {
// Passed as the S3 bucket name when starting the analysis. It is empty here,
// so presumably it has to be filled in before the sample can run.
const string BucketName = "";
// Passed as the S3 object key of the document to analyse.
const string FormFile = "employmentapp.png";
/// <summary>
/// Entry point. Analyses the document twice (FORMS, then TABLES) and prints
/// lines, words, form fields and table cells with their confidence values.
/// </summary>
static void Main(string[] args) {
var textractAnalysisClient = BuildTextractClient();
var document = PrepareDocument(textractAnalysisClient, "FORMS");
document.Pages.ForEach(page => {
Console.WriteLine("Print Lines and Words:");
page.Lines.ForEach(line => {
Console.WriteLine("{0}--{1}", line.Text, line.Confidence);
line.Words.ForEach(word => {
Console.WriteLine("{0}--{1}", word.Text, word.Confidence);
});
});
Console.WriteLine("Print Fields:");
page.Form.Fields.ForEach(f => {
Console.WriteLine("Field: Key: {0}, Value {1}", f.Key, f.Value);
});
Console.WriteLine("Get Field by Key:");
var key = "Phone Number:";
var field = page.Form.GetFieldByKey(key);
// The lookup result is only printed when a field was found.
if(field != null) {
Console.WriteLine("Field: Key: {0}, Value: {1}", field.Key, field.Value);
}
});
// Second pass: a separate analysis job is started for the TABLES feature.
document = PrepareDocument(textractAnalysisClient, "TABLES");
document.Pages.ForEach(page => {
page.Tables.ForEach(table => {
// r and c are 1-based counters used only for the printed coordinates.
var r = 0;
table.Rows.ForEach(row => {
r++;
var c = 0;
row.Cells.ForEach(cell => {
c++;
Console.WriteLine("Table [{0}][{1}] = {2}--{3}", r, c, cell.Text, cell.Confidence);
});
});
});
});
}
/// <summary>
/// Starts an analysis job for the configured bucket/file with the given
/// feature type, blocks until the job is no longer in progress, and wraps
/// the fetched response in a TextractDocument.
/// </summary>
/// <param name="textractAnalysisClient">Service wrapper used for all Textract calls.</param>
/// <param name="type">Feature type string forwarded to the analysis request (called with "FORMS" and "TABLES" above).</param>
static TextractDocument PrepareDocument(TextractTextAnalysisService textractAnalysisClient, string type) {
var task = textractAnalysisClient.StartDocumentAnalysis(BucketName, FormFile, type);
// Reading Result blocks the calling thread until the start call finishes.
var jobId = task.Result;
textractAnalysisClient.WaitForJobCompletion(jobId);
var results = textractAnalysisClient.GetJobResults(jobId);
return new TextractDocument(results);
}
/// <summary>
/// Builds the Textract service wrapper from configuration read from
/// appsettings.json (required) and environment variables.
/// </summary>
static TextractTextAnalysisService BuildTextractClient() {
var builder = new ConfigurationBuilder()
.SetBasePath(Environment.CurrentDirectory)
.AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
.AddEnvironmentVariables()
.Build();
var awsOptions = builder.GetAWSOptions();
// NOTE(review): the generic type argument of CreateServiceClient appears to
// be missing in this listing — confirm against the original file.
return new TextractTextAnalysisService(awsOptions.CreateServiceClient());
}
}
/// <summary>
/// Thin synchronous-style wrapper around an IAmazonTextract client for
/// starting a document analysis job, polling it, and fetching its result.
/// </summary>
public class TextractTextAnalysisService {
private IAmazonTextract textract;
/// <summary>Stores the Textract client used by every call on this service.</summary>
public TextractTextAnalysisService(IAmazonTextract textract) {
this.textract = textract;
}
/// <summary>
/// Fetches the analysis response for a job, blocking until the call returns.
/// </summary>
/// <param name="jobId">Identifier of the analysis job.</param>
/// <returns>The response of a single GetDocumentAnalysis call.</returns>
public GetDocumentAnalysisResponse GetJobResults(string jobId) {
// Only JobId is set on the request, so exactly one response is fetched.
// NOTE(review): if the service paginates large results, later pages are
// not retrieved here — confirm whether that matters for callers.
var response = this.textract.GetDocumentAnalysisAsync(new GetDocumentAnalysisRequest {
JobId = jobId
});
response.Wait();
return response.Result;
}
/// <summary>
/// Reports whether the job has left the "IN_PROGRESS" state.
/// </summary>
/// <param name="jobId">Identifier of the analysis job.</param>
/// <returns>True for any status other than "IN_PROGRESS".</returns>
public bool IsJobComplete(string jobId) {
var response = this.textract.GetDocumentAnalysisAsync(new GetDocumentAnalysisRequest {
JobId = jobId
});
response.Wait();
// Every status other than IN_PROGRESS counts as complete; presumably that
// includes failure statuses — verify if callers need to tell them apart.
return !response.Result.JobStatus.Equals("IN_PROGRESS");
}
/// <summary>
/// Starts an asynchronous document analysis for one S3 object with a single
/// feature type and returns the job id from the service response.
/// </summary>
/// <param name="bucketName">S3 bucket containing the document.</param>
/// <param name="key">S3 object name of the document.</param>
/// <param name="featureType">The one feature type placed in the request's FeatureTypes list.</param>
// NOTE(review): generic type arguments (on Task and List) appear to be
// missing in this listing — confirm against the original file.
public async Task StartDocumentAnalysis(string bucketName, string key, string featureType) {
var request = new StartDocumentAnalysisRequest();
var s3Object = new S3Object {
Bucket = bucketName,
Name = key
};
request.DocumentLocation = new DocumentLocation {
S3Object = s3Object
};
request.FeatureTypes = new List { featureType };
var response = await this.textract.StartDocumentAnalysisAsync(request);
return response.JobId;
}
/// <summary>
/// Polls IsJobComplete until it returns true, pausing between polls.
/// There is no timeout: the loop only ends when the job leaves IN_PROGRESS.
/// </summary>
/// <param name="jobId">Identifier of the analysis job.</param>
/// <param name="delay">Pause between polls, in milliseconds.</param>
public void WaitForJobCompletion(string jobId, int delay = 5000) {
while(!IsJobComplete(jobId)) {
this.Wait(delay);
}
}
/// <summary>
/// Blocks the current thread for the given number of milliseconds and then
/// writes a "." to the console as a progress marker.
/// </summary>
private void Wait(int delay = 5000) {
Task.Delay(delay).Wait();
Console.Write(".");
}
}
}
================================================
FILE: src-csharp/README.md
================================================
# Usage
## Forms
```csharp
document.Pages.ForEach(page => {
Console.WriteLine("Print Lines and Words:");
page.Lines.ForEach(line => {
Console.WriteLine("{0}--{1}", line.Text, line.Confidence);
line.Words.ForEach(word => {
Console.WriteLine("{0}--{1}", word.Text, word.Confidence);
});
});
Console.WriteLine("Print Fields:");
page.Form.Fields.ForEach(f => {
Console.WriteLine("Field: Key: {0}, Value {1}", f.Key, f.Value);
});
Console.WriteLine("Get Field by Key:");
var key = "Phone Number:";
var field = page.Form.GetFieldByKey(key);
if(field != null) {
Console.WriteLine("Field: Key: {0}, Value: {1}", field.Key, field.Value);
}
});
```
## Tables
```csharp
document.Pages.ForEach(page => {
page.Tables.ForEach(table => {
var r = 0;
table.Rows.ForEach(row => {
r++;
var c = 0;
row.Cells.ForEach(cell => {
c++;
Console.WriteLine("Table [{0}][{1}] = {2}--{3}", r, c, cell.Text, cell.Confidence);
});
});
});
});
```
# Test
## Prerequisites
- [Install](https://dotnet.microsoft.com/download) .NET Core
- [Install](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html)
and
[Configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
AWS CLI
Then
- Download source code to your local machine
- Run the following at a command line inside the source code folder to execute
```
dotnet run
```
# Extra
upload file to S3
```
aws s3 cp test-files/employmentapp.png s3://
```
================================================
FILE: src-csharp/TextractExtensions.cs
================================================
using System;
using System.Collections.Generic;
namespace Amazon.Textract.Model {
/// <summary>
/// A single word taken from a Textract block: its text, confidence,
/// geometry and id, plus a reference to the underlying block.
/// </summary>
public class Word {
    /// <summary>
    /// Builds a word from its block.
    /// </summary>
    /// <param name="block">
    /// Source block. May be null, in which case Text is the empty string and
    /// the remaining properties keep their default values.
    /// </param>
    /// <param name="blocks">All blocks of the document; not used by this constructor.</param>
    public Word(Block block, List blocks) {
        this.Block = block;
        // The null check previously ran only for Text, after block had already
        // been dereferenced for Confidence, Geometry and Id, so it could never
        // take effect. Checking first makes the null-tolerant intent real.
        if(block == null) {
            this.Text = string.Empty;
            return;
        }
        this.Confidence = block.Confidence;
        this.Geometry = block.Geometry;
        this.Id = block.Id;
        this.Text = block.Text;
    }
    public Block Block { get; set; }
    public float Confidence { get; set; }
    public Geometry Geometry { get; set; }
    public string Id { get; set; }
    public string Text { get; set; }
    /// <summary>Returns the word's text.</summary>
    public override string ToString() {
        return Text;
    }
}
/// <summary>
/// Parsed view of a Textract analysis response: keeps a flat list of all
/// blocks, groups the blocks per page, and builds one Page object per group.
/// </summary>
// NOTE(review): generic type arguments appear to have been stripped from this
// listing (e.g. "List>" below) — confirm against the original file.
public class TextractDocument {
// Flat list of every block seen, in response order; used for id lookups.
private List blockMap = new List();
// One entry per page: the PAGE block followed by the blocks that came after it.
List> documentPages = new List>();
/// <summary>
/// Stores the response, groups its blocks by page, and parses each group
/// into a Page.
/// </summary>
/// <param name="response">The analysis response to parse.</param>
public TextractDocument(GetDocumentAnalysisResponse response) {
this.Pages = new List();
this.ResponsePages = new List();
this.ResponsePages.Add(response);
this.ParseDocumentPagesAndBlockMap();
this.Parse();
}
/// <summary>
/// Walks all blocks of all stored responses, adding each to the flat block
/// list and to the group of the most recently seen PAGE block.
/// </summary>
private void ParseDocumentPagesAndBlockMap() {
List documentPage = null;
this.ResponsePages.ForEach(page => {
page.Blocks.ForEach(block => {
this.blockMap.Add(block);
if(block.BlockType == "PAGE") {
// A new PAGE block closes the previous group (if any) and opens a new one.
if(documentPage != null) {
this.documentPages.Add(documentPage);
}
documentPage = new List();
documentPage.Add(block);
} else {
// documentPage is still null until the first PAGE block has been seen;
// this assumes a PAGE block always precedes other blocks — TODO confirm.
documentPage.Add(block);
}
});
});
// Flush the last open group, which no later PAGE block has closed.
if(documentPage != null) {
this.documentPages.Add(documentPage);
}
}
/// <summary>Builds a Page from each block group and appends it to Pages.</summary>
private void Parse() {
this.documentPages.ForEach(documentPage => {
var page = new Page(documentPage, this.blockMap);
this.Pages.Add(page);
});
}
/// <summary>
/// Finds the first block whose Id equals the given id by scanning the flat
/// block list.
/// </summary>
/// <param name="blockId">Id to look for.</param>
/// <returns>The matching block, or null when none matches.</returns>
public Block GetBlockById(string blockId) {
return this.blockMap.Find(x => x.Id == blockId);
}
public List ResponsePages { get; set; }
public List Pages { get; set; }
/// <summary>The per-page block groups built during parsing.</summary>
public List> PageBlocks {
get {
return this.documentPages;
}
}
}
/// <summary>
/// A table built from a Textract block: the block's child cells are grouped
/// into rows according to each cell's RowIndex.
/// </summary>
public class Table {
/// <summary>
/// Copies the block's basic properties and assembles Rows from the blocks
/// referenced by its CHILD relationships.
/// </summary>
/// <param name="block">The table block.</param>
/// <param name="blocks">All blocks of the document, searched by id for each child.</param>
public Table(Block block, List blocks) {
this.Block = block;
this.Confidence = block.Confidence;
this.Geometry = block.Geometry;
this.Id = block.Id;
this.Rows = new List();
// ri tracks the row index of the row currently being filled; it starts at 1.
var ri = 1;
var row = new Row();
var relationships = block.Relationships;
if(relationships != null && relationships.Count > 0) {
relationships.ForEach(r => {
if(r.Type == "CHILD") {
r.Ids.ForEach(id => {
var cell = new Cell(blocks.Find(b => b.Id == id), blocks);
// A cell with a higher row index closes the current row and starts a new
// one. Cells whose RowIndex is not greater than ri are appended to the
// current row, so this relies on cells arriving in row order.
if(cell.RowIndex > ri) {
this.Rows.Add(row);
row = new Row();
ri = cell.RowIndex;
}
row.Cells.Add(cell);
});
// Add the last row of this relationship when it holds at least one cell.
// NOTE(review): row is not replaced afterwards, so a second CHILD
// relationship would keep filling the same Row instance — confirm that a
// table block carries at most one CHILD relationship.
if(row != null && row.Cells.Count > 0)
this.Rows.Add(row);
}
});
}
}
public List Rows { get; set; }
public Block Block { get; set; }
public float Confidence { get; set; }
public Geometry Geometry { get; set; }
public string Id { get; set; }
/// <summary>
/// Renders the table as text: a "Table" header followed by one "Row"
/// section per row, each containing that row's string form.
/// </summary>
public override string ToString() {
var result = new List();
result.Add(string.Format("Table{0}===={0}", Environment.NewLine));
this.Rows.ForEach(r => {
result.Add(string.Format("Row{0}===={0}{1}{0}", Environment.NewLine, r));
});
return string.Join("", result);
}
}
/// <summary>
/// A SELECTION_ELEMENT block (such as a checkbox) reduced to its identity, geometry,
/// confidence and selection status.
/// </summary>
public class SelectionElement {
    public string Id { get; set; }
    public string SelectionStatus { get; set; }
    public float Confidence { get; set; }
    public Geometry Geometry { get; set; }

    /// <param name="block">The block to copy values from.</param>
    /// <param name="blocks">Not read by this constructor.</param>
    public SelectionElement(Block block, List<Block> blocks) {
        Id = block.Id;
        SelectionStatus = block.SelectionStatus;
        Confidence = block.Confidence;
        Geometry = block.Geometry;
    }
}
/// <summary>
/// One table row: an ordered list of cells.
/// </summary>
public class Row {
    public List<Cell> Cells { get; set; }

    /// <summary>Creates a row with no cells.</summary>
    public Row() {
        Cells = new List<Cell>();
    }

    /// <summary>Concatenates every cell's text, each wrapped in square brackets.</summary>
    public override string ToString() {
        var bracketed = Cells.ConvertAll(cell => string.Format("[{0}]", cell));
        return string.Concat(bracketed);
    }
}
/// <summary>
/// One document page: its raw blocks plus the lines, tables and form fields parsed from them.
/// </summary>
public class Page {
    /// <summary>
    /// Parses the page's blocks by block type.
    /// </summary>
    /// <param name="blocks">The blocks belonging to this page.</param>
    /// <param name="blockMap">All blocks of the document, used to resolve relationship IDs.</param>
    public Page(List<Block> blocks, List<Block> blockMap) {
        this.Blocks = blocks;
        this.Text = string.Empty;
        this.Lines = new List<Line>();
        this.Form = new Form();
        this.Tables = new List<Table>();
        this.Content = new List<dynamic>();
        blocks.ForEach(b => {
            if(b.BlockType == "PAGE") {
                this.Geometry = new NewGeometry(b.Geometry);
                this.Id = b.Id;
            } else if(b.BlockType == "LINE") {
                var l = new Line(b, blockMap);
                this.Lines.Add(l);
                this.Content.Add(l);
                this.Text = this.Text + l.Text + Environment.NewLine;
            } else if(b.BlockType == "TABLE") {
                var t = new Table(b, blockMap);
                this.Tables.Add(t);
                this.Content.Add(t);
            } else if(b.BlockType == "KEY_VALUE_SET") {
                // Only KEY blocks create fields; a field is kept only if it resolved a key.
                if(b.EntityTypes.Contains("KEY")) {
                    var f = new Field(b, blockMap);
                    if(f.Key != null) {
                        this.Form.AddField(f);
                        this.Content.Add(f);
                    }
                }
            }
        });
    }

    /// <summary>
    /// Assigns every line to a column by horizontal overlap and returns the lines column by
    /// column, keeping the original line order inside each column.
    /// </summary>
    /// <returns>The lines, each tagged with the index of the column it was assigned to.</returns>
    public List<IndexedText> GetLinesInReadingOrder() {
        var lines = new List<IndexedText>();
        var columns = new List<Column>();
        foreach(var line in this.Lines) {
            var bb = line.Geometry.BoundingBox;
            var bbLeft = bb.Left;
            var bbRight = bb.Left + bb.Width;
            var bbCentre = bb.Left + (bb.Width / 2);
            // A line belongs to the first column whose span contains the line's centre,
            // or whose own centre falls inside the line's span.
            var columnIndex = columns.FindIndex(column => {
                // Column.Right is an absolute edge, not a width, so the centre is the midpoint.
                var columnCentre = column.Left + ((column.Right - column.Left) / 2);
                return (bbCentre > column.Left && bbCentre < column.Right)
                    || (columnCentre > bbLeft && columnCentre < bbRight);
            });
            if(columnIndex < 0) {
                // No existing column matches: this line opens a new one.
                columns.Add(new Column { Left = bbLeft, Right = bbRight });
                columnIndex = columns.Count - 1;
            }
            lines.Add(new IndexedText { ColumnIndex = columnIndex, Text = line.Text });
        }
        // Emit column 0 first, then column 1, and so on. FindAll keeps the relative order
        // of matches, so the ordering inside each column is unchanged.
        var ordered = new List<IndexedText>(lines.Count);
        for(var index = 0; index < columns.Count; index++) {
            ordered.AddRange(lines.FindAll(l => l.ColumnIndex == index));
        }
        return ordered;
    }

    /// <summary>
    /// Joins the text of <see cref="GetLinesInReadingOrder"/>, one line per "\n"-terminated row.
    /// </summary>
    public string GetTextInReadingOrder() {
        var lines = this.GetLinesInReadingOrder();
        var text = string.Empty;
        lines.ForEach(line => {
            text = text + line.Text + "\n";
        });
        return text;
    }

    public List<Block> Blocks { get; set; }
    public string Text { get; set; }
    public List<Line> Lines { get; set; }
    public Form Form { get; set; }
    public List<Table> Tables { get; set; }
    // Lines, tables and fields in the order their blocks were encountered.
    public List<dynamic> Content { get; set; }
    public Geometry Geometry { get; set; }
    public string Id { get; set; }

    /// <summary>Renders a "Page" heading followed by every content item on its own line.</summary>
    public override string ToString() {
        var result = new List<string>();
        result.Add(string.Format("Page{0}===={0}", Environment.NewLine));
        this.Content.ForEach(c => {
            result.Add(string.Format("{1}{0}", Environment.NewLine, c));
        });
        return string.Join("", result);
    }

    /// <summary>Horizontal span of a text column, as absolute left and right edges.</summary>
    public class Column {
        public float Left { get; set; }
        public float Right { get; set; }
        public override string ToString() {
            return string.Format("Left: {0}, Right :{1}", this.Left, this.Right);
        }
    }

    /// <summary>A line's text together with the index of the column it was assigned to.</summary>
    public class IndexedText {
        public int ColumnIndex { get; set; }
        public string Text { get; set; }
        public override string ToString() {
            return string.Format("[{0}] {1}", this.ColumnIndex, this.Text);
        }
    }
}
/// <summary>
/// A <c>Geometry</c> built from another geometry, holding its own <see cref="NewBoundingBox"/>
/// and freshly created polygon points.
/// </summary>
public class NewGeometry : Geometry {
    /// <param name="geometry">The geometry whose bounding box and polygon values are copied.</param>
    public NewGeometry(Geometry geometry) : base() {
        var sourceBox = geometry.BoundingBox;
        var copiedPoints = geometry.Polygon.ConvertAll(pt => new Point {
            X = pt.X,
            Y = pt.Y
        });
        BoundingBox = new NewBoundingBox(sourceBox.Width, sourceBox.Height, sourceBox.Left, sourceBox.Top);
        Polygon = copiedPoints;
    }

    /// <summary>Renders only the bounding box, followed by a line break.</summary>
    public override string ToString() {
        return string.Format("BoundingBox: {0}{1}", BoundingBox, Environment.NewLine);
    }
}
/// <summary>
/// A <c>BoundingBox</c> that is constructed from explicit values and prints them.
/// </summary>
public class NewBoundingBox : BoundingBox {
    /// <summary>Creates a box from its size and its top-left corner.</summary>
    public NewBoundingBox(float width, float height, float left, float top) : base() {
        Left = left;
        Top = top;
        Width = width;
        Height = height;
    }

    /// <summary>Lists width, height, left and top, in that order.</summary>
    public override string ToString() {
        const string layout = "width: {0}, height: {1}, left: {2}, top: {3}";
        return string.Format(layout, Width, Height, Left, Top);
    }
}
/// <summary>
/// A LINE block together with the words referenced by its CHILD relationships.
/// </summary>
// NOTE(review): generic type arguments seem to be missing throughout this listing
// (e.g. `List` with no element type) — confirm against the real source file.
public class Line {
/// <summary>
/// Copies confidence, geometry, id and text from the block and builds the word list.
/// </summary>
/// <param name="block">The LINE block.</param>
/// <param name="blocks">Blocks searched when resolving the CHILD relationship IDs.</param>
public Line(Block block, List blocks) {
this.Block = block;
this.Confidence = block.Confidence;
this.Geometry = block.Geometry;
this.Id = block.Id;
// NOTE(review): `block` has already been dereferenced above, so the null test below
// cannot take its string.Empty branch — confirm whether a null block was meant to be supported.
this.Text = block == null ? string.Empty : block.Text;
this.Words = new List();
var relationships = block.Relationships;
if(relationships != null && relationships.Count > 0) {
relationships.ForEach(r => {
if(r.Type == "CHILD") {
r.Ids.ForEach(id => {
// Only blocks of type WORD with a matching Id are considered. Find returns null when
// nothing matches, and that null is passed straight to the Word constructor.
this.Words.Add(new Word(blocks.Find(b => b.BlockType == "WORD" && b.Id == id), blocks));
});
}
});
}
}
public float Confidence { get; set; }
public Geometry Geometry { get; set; }
public string Id { get; set; }
public List Words { get; set; }
public string Text { get; set; }
public Block Block { get; set; }
/// <summary>
/// Renders the line text followed by its words joined with ", ".
/// The template is a verbatim string, so its line breaks and spacing are part of the output.
/// </summary>
public override string ToString() {
return string.Format(@"
Line{0}===={0}
{1} {0}
Words{0}----{0}
{2}{0}
----
", Environment.NewLine, this.Text, string.Join(", ", this.Words));
}
}
/// <summary>
/// The key-value fields of a page, kept both as an ordered list and as a lookup by key text.
/// </summary>
public class Form {
    public List<Field> Fields { get; set; }
    // Key text -> field. When several fields share a key text, the last one added wins.
    private Dictionary<string, Field> fieldMap;

    /// <summary>Creates an empty form.</summary>
    public Form() {
        this.Fields = new List<Field>();
        this.fieldMap = new Dictionary<string, Field>();
    }

    /// <summary>
    /// Appends a field to <see cref="Fields"/> and indexes it by its key text.
    /// </summary>
    /// <param name="field">The field to add; its <c>Key</c> must not be null.</param>
    public void AddField(Field field) {
        this.Fields.Add(field);
        // Use the indexer rather than Dictionary.Add: documents routinely repeat a key
        // text (e.g. two "Date" fields), and Add would throw ArgumentException on the
        // second one. Every field is still available through Fields.
        this.fieldMap[field.Key.ToString()] = field;
    }

    /// <summary>Looks a field up by its exact key text.</summary>
    /// <returns>The indexed field, or null when no field has that key.</returns>
    public Field GetFieldByKey(string key) {
        return this.fieldMap.GetValueOrDefault(key);
    }

    /// <summary>Returns every field whose key text contains <paramref name="key"/>, ignoring case.</summary>
    public List<Field> SearchFieldsByKey(string key) {
        return this.Fields.FindAll(f => f.Key.ToString().ToLower().Contains(key.ToLower()));
    }

    /// <summary>Renders every field, separated by "\n".</summary>
    public override string ToString() {
        return string.Join("\n", this.Fields);
    }
}
/// <summary>
/// The value side of a form field: its words and selection elements plus their joined text.
/// </summary>
public class FieldValue {
    /// <summary>
    /// Builds the value from the child blocks referenced by a VALUE block.
    /// </summary>
    /// <param name="block">The VALUE block.</param>
    /// <param name="children">IDs of the block's children.</param>
    /// <param name="blocks">Blocks searched when resolving the child IDs.</param>
    public FieldValue(Block block, List<string> children, List<Block> blocks) {
        this.Block = block;
        this.Confidence = block.Confidence;
        this.Geometry = block.Geometry;
        this.Id = block.Id;
        this.Text = string.Empty;
        this.Content = new List<dynamic>();
        var words = new List<string>();
        if(children != null) {
            foreach(var c in children) {
                var wordBlock = blocks.Find(b => b.Id == c);
                if(wordBlock == null) {
                    // The child is not in the supplied list (for example it lives in another
                    // response page); skip it instead of dereferencing null.
                    continue;
                }
                if(wordBlock.BlockType == "WORD") {
                    var w = new Word(wordBlock, blocks);
                    this.Content.Add(w);
                    words.Add(w.Text);
                } else if(wordBlock.BlockType == "SELECTION_ELEMENT") {
                    // A selection element contributes its status string to the text.
                    var selection = new SelectionElement(wordBlock, blocks);
                    this.Content.Add(selection);
                    words.Add(selection.SelectionStatus);
                }
            }
        }
        if(words.Count > 0) {
            this.Text = string.Join(" ", words);
        }
    }

    // Word and SelectionElement instances, in child order.
    public List<dynamic> Content { get; set; }
    public Block Block { get; set; }
    public float Confidence { get; set; }
    public Geometry Geometry { get; set; }
    public string Id { get; set; }
    public string Text { get; set; }

    /// <summary>Returns the joined text.</summary>
    public override string ToString() {
        return Text;
    }
}
/// <summary>
/// The key side of a form field: its words plus their joined text.
/// </summary>
public class FieldKey {
    /// <summary>
    /// Builds the key from the WORD children referenced by a KEY block.
    /// </summary>
    /// <param name="block">The KEY block.</param>
    /// <param name="children">IDs of the block's children.</param>
    /// <param name="blocks">Blocks searched when resolving the child IDs.</param>
    public FieldKey(Block block, List<string> children, List<Block> blocks) {
        this.Block = block;
        this.Confidence = block.Confidence;
        this.Geometry = block.Geometry;
        this.Id = block.Id;
        this.Text = string.Empty;
        this.Content = new List<dynamic>();
        var words = new List<string>();
        if(children != null) {
            foreach(var c in children) {
                var wordBlock = blocks.Find(b => b.Id == c);
                // Skip IDs that cannot be resolved (for example blocks held in another
                // response page) instead of dereferencing null; non-WORD children are ignored.
                if(wordBlock != null && wordBlock.BlockType == "WORD") {
                    var w = new Word(wordBlock, blocks);
                    this.Content.Add(w);
                    words.Add(w.Text);
                }
            }
        }
        if(words.Count > 0) {
            this.Text = string.Join(" ", words);
        }
    }

    // Word instances, in child order.
    public List<dynamic> Content { get; set; }
    public Block Block { get; set; }
    public float Confidence { get; set; }
    public Geometry Geometry { get; set; }
    public string Id { get; set; }
    public string Text { get; set; }

    /// <summary>Returns the joined text.</summary>
    public override string ToString() {
        return Text;
    }
}
/// <summary>
/// A form field: a key and, when one can be resolved, its value.
/// </summary>
public class Field {
/// <summary>
/// Builds the field by walking the relationships of a key block.
/// </summary>
/// <param name="block">The block whose relationships describe the key and its value.</param>
/// <param name="blocks">Blocks searched when resolving relationship IDs.</param>
public Field(Block block, List blocks) {
var relationships = block.Relationships;
if(relationships != null && relationships.Count > 0) {
relationships.ForEach(r => {
if(r.Type == "CHILD") {
// The CHILD relationship lists the blocks that make up the key itself.
this.Key = new FieldKey(block, r.Ids, blocks);
} else if(r.Type == "VALUE") {
// The VALUE relationship points at value blocks; each one's own CHILD
// relationship lists the blocks that make up the value.
r.Ids.ForEach(id => {
// Find returns null when no block has this Id; that case is not handled here.
var v = blocks.Find(b => b.Id == id);
if(v.EntityTypes.Contains("VALUE")) {
var vr = v.Relationships;
if(vr != null && vr.Count > 0) {
vr.ForEach(vc => {
if(vc.Type == "CHILD") {
// Each assignment replaces the previous one, so the last CHILD relationship processed wins.
this.Value = new FieldValue(v, vc.Ids, blocks);
}
});
}
}
});
}
});
}
}
// Stays null when the block has no CHILD relationship.
public FieldKey Key { get; set; }
// Stays null when no value block with a CHILD relationship is found.
public FieldValue Value { get; set; }
/// <summary>
/// Renders the key and value texts, using an empty string for whichever is null.
/// The template is a verbatim string, so its line breaks and spacing are part of the output.
/// </summary>
public override string ToString() {
var k = this.Key == null ? string.Empty : this.Key.ToString();
var v = this.Value == null ? string.Empty : this.Value.ToString();
return string.Format(@"
{0}Field{0}===={0}
Key: {1}, Value: {2}
", Environment.NewLine, k, v);
}
}
/// <summary>
/// A table cell: its position, span, content items and accumulated text.
/// </summary>
public class Cell {
    /// <summary>
    /// Copies position and span from the block and collects its child content.
    /// </summary>
    /// <param name="block">The cell block.</param>
    /// <param name="blocks">Blocks searched when resolving the CHILD relationship IDs.</param>
    public Cell(Block block, List<Block> blocks) {
        this.Block = block;
        this.ColumnIndex = block.ColumnIndex;
        this.ColumnSpan = block.ColumnSpan;
        this.Confidence = block.Confidence;
        this.Content = new List<dynamic>();
        this.Geometry = block.Geometry;
        this.Id = block.Id;
        this.RowIndex = block.RowIndex;
        this.RowSpan = block.RowSpan;
        this.Text = string.Empty;
        var relationships = block.Relationships;
        if(relationships != null) {
            foreach(var r in relationships) {
                if(r.Type == "CHILD") {
                    foreach(var id in r.Ids) {
                        var rb = blocks.Find(b => b.Id == id);
                        if(rb == null) {
                            // The child is not in the supplied list (for example it lives in
                            // another response page); skip it instead of dereferencing null.
                            continue;
                        }
                        if(rb.BlockType == "WORD") {
                            var w = new Word(rb, blocks);
                            this.Content.Add(w);
                            // Each word is followed by a space, including the last one.
                            this.Text = this.Text + w.Text + " ";
                        } else if(rb.BlockType == "SELECTION_ELEMENT") {
                            var se = new SelectionElement(rb, blocks);
                            this.Content.Add(se);
                            // Each selection status is followed by ", ", including the last one.
                            this.Text = this.Text + se.SelectionStatus + ", ";
                        }
                    }
                }
            }
        }
    }

    public int RowIndex { get; set; }
    public int RowSpan { get; set; }
    public int ColumnIndex { get; set; }
    public int ColumnSpan { get; set; }
    // Word and SelectionElement instances, in child order.
    public List<dynamic> Content { get; set; }
    public Block Block { get; set; }
    public float Confidence { get; set; }
    public Geometry Geometry { get; set; }
    public string Id { get; set; }
    public string Text { get; set; }

    /// <summary>Returns the accumulated text.</summary>
    public override string ToString() {
        return this.Text;
    }
}
}
================================================
FILE: src-csharp/appsettings.json
================================================
{
"AWS": {
"Profile": "default",
"Region": "us-west-2"
}
}
================================================
FILE: src-csharp/parser.csproj
================================================
Exenetcoreapp2.2Always
================================================
FILE: src-js/.eslintrc.js
================================================
module.exports = {
parser: "@typescript-eslint/parser", // Specifies the ESLint parser
parserOptions: {
ecmaVersion: 2020, // Allows for the parsing of modern ECMAScript features
sourceType: "module" // Allows for the use of imports
},
plugins: ["@typescript-eslint", "prettier"],
extends: [
// "plugin:@typescript-eslint/recommended", // Uses the recommended rules from the @typescript-eslint/eslint-plugin
// "prettier/@typescript-eslint", // Uses eslint-config-prettier to disable ESLint rules from @typescript-eslint/eslint-plugin that would conflict with prettier
// "plugin:prettier/recommended", // Enables eslint-plugin-prettier and eslint-config-prettier. This will display prettier errors as ESLint errors. Make sure this is always the last configuration in the extends array.
"eslint:recommended",
"plugin:@typescript-eslint/eslint-recommended",
"plugin:@typescript-eslint/recommended",
"prettier",
],
rules: {
// Place to specify ESLint rules. Can be used to overwrite rules specified from the extended config
// e.g. "@typescript-eslint/explicit-function-return-type": "off",
"prettier/prettier": 2,
},
};
================================================
FILE: src-js/.nvmrc
================================================
lts/hydrogen
================================================
FILE: src-js/.prettierrc.js
================================================
// Prettier configuration: wrap lines at 110 columns.
const printWidth = 110;

module.exports = { printWidth };
================================================
FILE: src-js/CHANGELOG.md
================================================
# Changelog
## 0.4.3 (2024-11-19)
### Changed
- Bumped dev dependencies (including `cross-spawn`, `lint-staged`, `rollup`) for dependabot/audit
### Fixed
- `.html()` no longer fails on empty pages with no content (proposed fix for [this AWS re:Post question](https://repost.aws/questions/QU68wHh5vLSkiDC9Vt4lXXsw))
## 0.4.2 (2024-06-28)
### Added
- Filter content by block type in a variety of contexts, with `includeBlockTypes` (allow-list) and `skipBlockTypes` (deny-list) options. These filters are available in the core `iter/listContent()`, `Layout.iter/listItems()` and `LayoutItem.iter/listLayoutChildren()` accessors, but can also be used to hide certain content (like page headers and footers) when you render with `.html({...})`. ([#179](https://github.com/aws-samples/amazon-textract-response-parser/issues/179))
- Low-level relationship traversal via `iter/listRelatedItemsByRelType()` is now supported from `Page`s (PAGE blocks)
- New accessor on `SelectionElement.isSelected`, in convenient boolean format (versus the 2-member `.selectionStatus` enumeration)
- Form `Field.isCheckbox` and `FieldValue.isCheckbox`, check if a K->V field corresponds to a (label)->(checkbox) pair. Also added `{Field/FieldValue}.isSelected` and `.selectionStatus`, which return `null` for non-'checkbox' fields. (Pre-work for [#183](https://github.com/aws-samples/amazon-textract-response-parser/issues/183))
### Changed
- `WithContent` mixin options refactored to more closely mirror `IBlockTypeFilterOpts`, because WithContent now aligns to `iter/listRelatedItemsByRelType()` under the hood. This will give us more fine-grained but standardised control of missing and unexpected non-content child block type handling, per item class... But means some warning/error behaviour when parsing Textract JSON might have shifted a little (hopefully for the better).
- A page's `Layout` no longer keeps any internal list-of-items state, instead referring to the parent `PAGE` block's child relationships directly.
## 0.4.1 (2024-06-04)
### Added
- `iter/listRelatedItemsByRelType()` utility methods on all host-linked block wrapper objects, as most common use-cases for `relatedBlockIdsByRelType()` were just to then fetch the parsed wrapper for the retrieved block ID. Hope to further standardise across `childBlockIds`, `relatedBlockIdsByRelType`, and these new methods in a future release - but this might require some breaking changes to drive consistency in the handling of invalid JSONs (with missing block IDs, etc).
- `iter/listLayoutChildren()` utility methods to generically traverse (nested?) child layout elements. We support generic & recursive access, but today the only known nesting is LAYOUT_LIST->LAYOUT_TEXT.
### Fixed
- `html()`, `str()` and `text` representations of page `Layout` no longer duplicate the content of `LAYOUT_TEXT` children under `LAYOUT_LIST` objects. ([#177](https://github.com/aws-samples/amazon-textract-response-parser/issues/177))
### Deprecated
- Page `Layout.nItems` is ambiguous: Prefer `.nItemsTotal` for previous behaviour (counting all direct and indirect children) or `.nItemsDirect` to count only top-level layout items, excluding those referenced as children by others.
## 0.4.0 (2024-02-06)
### Added
- Load and navigate [Amazon Textract Layout analysis](https://aws.amazon.com/blogs/machine-learning/amazon-textracts-new-layout-feature-introduces-efficiencies-in-general-purpose-and-generative-ai-document-processing-tasks/) data. ([#164](https://github.com/aws-samples/amazon-textract-response-parser/issues/164))
- Serialize individual elements, pages and documents to semantic markup with `.html()` (for page and document level, currently depends on `Layout` being enabled).
- Proper support for [table title and footer elements](https://aws.amazon.com/blogs/machine-learning/announcing-enhanced-table-extractions-with-amazon-textract/) (`TABLE_TITLE` and `TABLE_FOOTER`) linked from tables. ([#171](https://github.com/aws-samples/amazon-textract-response-parser/issues/171))
- Support [signature detection results](https://aws.amazon.com/blogs/machine-learning/detect-signatures-on-documents-or-images-using-the-signatures-feature-in-amazon-textract/) (`SIGNATURE` blocks)
- More complete exposure of Textract API model constructs and `base.ts` utility functions in external-facing TRP API
### Changed
- **(BREAKING)** Previously-exposed `CellBase` class is removed, due to refactoring `Cell` and `MergedCell` to depend more on composable mixins and less on fragile hierarchy of (now internal) `CellBaseGeneric`. Use `Cell | MergedCell` instead for typing.
- `Page`s now explicitly track parsed objects in their scope by block ID, which reduced state tracking requirements for other objects (like `Line`, `Query`) as we work toward supporting more edit/mutation operations. See `IBlockManager.registerParsedItem()` and `.getItemByBlockId()` for details. This may result in some **minor warning & error behavior changes** when handling invalid or incomplete Textract JSON.
- Split out `api-models/document` types to better align with library components, and made some minor typing updates.
### Fixed
- `Table.nCells` now correctly reflects merged cells (instead of just counting all sub-cells).
- Support alternative `KEY` and `VALUE` blocks for Forms K-V data, observed in place of the typical `KEY_VALUE_SET` blocks for some test data files (Was this a temporary API issue? A change going forward? 🤷♂️)
### Deprecated
- `ApiBlockWrapper` base class is now slated to become internal-only: Please let us know if you have use-cases
- Various re-exports from the `api-models/document` sub-module: Prefer importing directly from the top level
- `ApiAsyncJobOuputInProgress` typo superseded by `ApiAsyncJobOutputInProgress`, but original not yet fully removed
## 0.3.1 (2023-08-28)
### Fixed
- Suppress "content may be truncated" warnings when API `NextToken` is present but `null` ([#154](https://github.com/aws-samples/amazon-textract-response-parser/issues/154))
- Fix typed `TABLE_FOOTER` and `TABLE_SECTION_HEADER` EntityType values to match the [API doc](https://docs.aws.amazon.com/textract/latest/dg/API_Block.html) ([#158](https://github.com/aws-samples/amazon-textract-response-parser/issues/158))
## 0.3.0 (2023-07-31)
### Added
- **(BREAKING)** `ignoreMerged` and `repeatMultiRowCells` options on `Table` methods are now wrapped into `opts` objects for better future extensibility and clearer user code.
- Expose the `ignoreMerged` option through `Table.rowAt()`, `Table.iterRows()`, and `Table.listRows()`, to enable navigating table rows ignoring merged cells.
- Page-level access to [Amazon Textract Queries](https://docs.aws.amazon.com/textract/latest/dg/queryresponse.html) results. (Still assessing compositing architecture for a unified document-level view in future) ([#80](https://github.com/aws-samples/amazon-textract-response-parser/issues/80))
- Average OCR (text recognition) confidence is now available on form fields (and their keys and values) as well as tables, table rows, and table cells - via `getOcrConfidence()`, with configurable aggregation method (including minimum, mean, etc.).
- `EntityTypes` for tables and table cells/merged-cells are now accessible through `Table.tableType` property and `Cell.hasEntityTypes()` function - and also added to the underlying API data types. ([#78](https://github.com/aws-samples/amazon-textract-response-parser/issues/78))
### Changed
- **(BREAKING)** UMD module output `dist/umd` removed, following deprecation at v0.2.0 and no requests from users to restore it.
### Fixed
- Corrected wrongly typed `ApiCellBlock.Relationships` from an array of `ApiChildRelationship` to an optional array of same: This field may be omitted altogether when a cell is detected but has no content.
- Corrected wrongly typed `ApiKeyValueSetBlock.EntityTypes` data model from `ApiKeyValueEntityType` to an array of same.
## 0.2.2 (2023-06-19)
### Fixed
- Removed `browser` field from package.json because front end bundlers like webpack use it, and the (IIFE `dist/browser`) build it pointed to was not appropriate for these build systems. Added `jsdelivr` field in its place to help ensure direct-to-browser CDN imports continue to consume the IIFE build by default. ([Issue #139](https://github.com/aws-samples/amazon-textract-response-parser/issues/139))
## 0.2.1 (2023-05-22)
### Fixed
- `.geometry` on Expense result fields is now optional, as the underlying field may not be returned by Amazon Textract in some cases. Typings updated to reflect the fix. ([Issue #102](https://github.com/aws-samples/amazon-textract-response-parser/issues/102))
## 0.2.0 (2022-04-28)
### Added
- Initial support for Amazon Textract [identity document APIs](https://docs.aws.amazon.com/textract/latest/dg/how-it-works-identity.html).
- Document-level Form field access and querying via `TextractDocument.form` in addition to `Page.form`.
- `Page.pageNumber` to find and return 1-based index of the current page in the parent document.
- New ES (esnext) module output in `dist/es` and `module` hint in package.json to encourage compatible tools to use this output.
### Changed
- Use CommonJS `dist/cjs` as default NPM module format instead of previous UMD `dist/umd`.
- Separate type declarations into `dist/types` to reduce duplication and build size.
- Use new [merged table cells](https://aws.amazon.com/about-aws/whats-new/2022/03/amazon-textract-updates-tables-check-detection/) feature by default, rather than classic split cells.
- Eliminate trailing whitespace previously automatically added to Cell.text
### Deprecated
- UMD module output `dist/umd` slated to be removed in a future version: Please let us know if the other format options don't work for you!
## 0.1.2 (2021-12-16)
### Added
- Header and footer segmentation utility (by text `LINE`)
### Changed
- Significantly improved `inReadingOrder` results for multi-column documents.
================================================
FILE: src-js/README.md
================================================
# Textract Response Parser for JavaScript/TypeScript
This library loads [Amazon Textract](https://docs.aws.amazon.com/textract/latest/dg/what-is.html) API response JSONs into structured classes with helper methods, for easier post-processing.
It's designed to work in both NodeJS and browser environments, and to support projects in either JavaScript or TypeScript.
> ⚠️ **Warning:** If you're migrating from another TRP implementation such as the [Textract Response Parser for Python](https://github.com/aws-samples/amazon-textract-response-parser/tree/master/src-python), please note that the APIs and available features may be substantially different. Please let us know if there's a feature you're missing!
## Installation
You can use TRP in your JavaScript or TypeScript NPM projects:
```sh
$ npm install amazon-textract-response-parser
```
```js
// With CommonJS-style require:
const { TextractDocument, TextractIdentity } = require("amazon-textract-response-parser");
// Or ES-style module imports:
import { TextractDocument, TextractExpense } from "amazon-textract-response-parser";
```
...Or link directly in the browser - for example via a CDN like [unpkg](https://unpkg.com/):
```html
<script src="https://unpkg.com/amazon-textract-response-parser"></script>
```
To enable this, the distribution of this library provides multiple builds:
- `dist/cjs` (default `main`), for CommonJS environments like NodeJS - including most front end applications built with tools like React and Webpack.
- `dist/es` (default `module`), for ES6/ES2015/esnext capable environments.
- `dist/browser` (default `jsdelivr` and `unpkg`), for linking directly from browser HTML with no module framework (IIFE).
This means that **deep imports** will depend on your build environment, but are generally discouraged anyway and may not work correctly with TypeScript. Check out the [examples/](examples/README.md) folder on GitHub for some basic starters using the different styles.
## Loading data
Initialize a `TextractDocument` (or `TextractExpense`, `TextractIdentity`) by providing the parsed response JSON object from the corresponding [Amazon Textract APIs](https://docs.aws.amazon.com/textract/latest/dg/API_Reference.html) such as [GetDocumentAnalysis](https://docs.aws.amazon.com/textract/latest/dg/API_GetDocumentAnalysis.html), [AnalyzeID](https://docs.aws.amazon.com/textract/latest/dg/API_AnalyzeID.html), or [AnalyzeExpense](https://docs.aws.amazon.com/textract/latest/dg/API_AnalyzeExpense.html). In most cases, providing an **array** of response objects is also supported (for use when a large Amazon Textract response was split/paginated).
For example, loading a response JSON from file in NodeJS:
```js
fs.readFile("./my-analyze-document-response.json", (err, resBuffer) => {
if (err) throw err;
const doc = new TextractDocument(JSON.parse(resBuffer));
// ...
});
```
If you're using TypeScript, you may need to **typecast** your input JSON while loading it.
> The `ApiResponsePage` input interface exposed and expected by this module is more constrained than - but functionally compatible with - the output types produced by the [AWS SDK for JavaScript Textract Client](https://docs.aws.amazon.com/AWSJavaScriptSDK/v3/latest/clients/client-textract/index.html).
```typescript
import { ApiAnalyzeExpenseResponse } from "amazon-textract-response-parser";
import { TextractClient, AnalyzeExpenseCommand } from "@aws-sdk/client-textract";
const textract = new TextractClient({});
async function main() {
const textractResponse = await textract.send(
new AnalyzeExpenseCommand({
Document: { Bytes: await fs.readFile("...") },
})
);
const expense = new TextractExpense((textractResponse as unknown) as ApiAnalyzeExpenseResponse);
}
```
With your data loaded into a TRP `TextractDocument` or similar, you're ready to take advantage of the higher-level TRP.js functions to navigate and analyze the result.
## Generic document text navigation
In general, this library avoids directly exposing **arrays** in results (see the *Mutation operations* section below). Instead, you can use:
- `.n***` properties to count items
- `.list***()` functions to return a copy of the underlying array
- `.iter***()` functions to iterate through collections, or
- `.***At***()` functions to fetch a specific item from a collection
For example:
```typescript
// Navigate the document hierarchy:
console.log(`Opened doc with ${doc.nPages} pages`);
console.log(
`The first word of the first line is ${doc.pageNumber(1).lineAtIndex(0).wordAtIndex(0).text}`
);
// Iterate through content:
for (const page of doc.iterPages()) {
// (In Textract's output order...)
for (const line of page.iterLines()) {
for (const word of line.iterWords()) {
console.log(word.text);
}
}
}
// ...Or get snapshot arrays instead of iterators, if you need:
const linesArrsByPage = doc.listPages().map((p) => p.listLines());
```
These arrays are in the raw order returned by Amazon Textract, which is not necessarily a logical human reading order - especially for multi-column documents. See the *Layout analysis* and *List text in approximate reading order* sections below for extra content sorting utilities.
## Queries
The results of [Amazon Textract Queries](https://docs.aws.amazon.com/textract/latest/dg/queryresponse.html) are accessible at the page level under `page.queries`. You can `get*` a query by exact question text or alias, or `search*` them by case-insensitive substrings:
```typescript
doc.listPages().forEach((page) => {
// Log a quick human-readable overview of queries & answers:
console.log(page.queries.str());
// Get a query (and its top result's text) by exact alias:
const customer = page.queries.getQueryByAlias("customer_name")?.topResult?.text;
// Get possible results of a query from most to least confident:
const shippingAddrCandidates =
page.queries.getQueryByAlias("shipping_addr")?.listResultsByConfidence() || [];
const shippingAddrTopConf = shippingAddrCandidates[0].confidence;
// Searching matches queries e.g. 'What is the Shipping Address?', 'FIND THE BILLING ADDRESS', etc
const addrQueries = page.queries.searchQueriesByQuestion("address");
});
```
## Forms (Key-Value pairs)
As well as looping through the [form data key-value pairs](https://docs.aws.amazon.com/textract/latest/dg/how-it-works-kvp.html) in the document, you can query fields by key:
```typescript
console.log(doc.form.nFields);
const fields = doc.form.listFields();
// Exact match:
const addr = doc.form.getFieldByKey("Address").value?.text;
// Search key containing (case-insensitive):
const addresses = doc.form.searchFieldsByKey("address");
addresses.forEach((addrField) => { console.log(addrField.key.text); });
```
Note that the `Field.confidence`, `FieldKey.confidence` and `FieldValue.confidence` scores reflect confidence of the **key-value structure detection** model. For aggregated OCR confidence of their **actual text**, use `.getOcrConfidence()` instead.
You can also search form keys at the individual page level, or look up the page number for detected fields:
```typescript
const fieldByDoc = doc.form.getFieldByKey("Address");
console.log(`Detected Address on page ${fieldByDoc.parentPage.pageNumber}`);
const page = doc.pageNumber(1);
const fieldByPage = page.form.getFieldByKey("Address");
```
`field.isCheckbox` is true for fields whose value contains exactly one SelectionElement object, meaning they're a (key=label)->(value=checkbox/radio) pair. For these fields, you can directly use `field.selectionStatus` or `field.isSelected` to look up the value's status. For other (non-checkbox) fields, these accessors return `null`.
## Tables
This library's table navigation tools address **[merged cells](https://docs.aws.amazon.com/textract/latest/dg/how-it-works-tables.html) by default**, for convenience.
```typescript
console.log(page.nTables);
const table = page.tableAtIndex(0);
// Index cells by row, column, or both:
const headerStrs = table.cellsAt(1, null)?.map(cell => cell.text);
const firstColCells = table.cellsAt(null, 1);
const targetCell = table.cellAt(2, 4);
// Iterate over rows/cells:
for (const row of table.iterRows()) {
for (const cell of row.iterCells()) {
console.log(cell.text);
}
}
```
Further configuration arguments can be used to change the treatment of merged cells if needed:
```typescript
// Iterate over rows repeating any cells spanning multiple rows:
for (const row of table.iterRows({repeatMultiRowCells: true})) {}
// Return split sub-cells instead of merged cells when indexing:
const firstColCellFragments = table.cellsAt(null, 1, {ignoreMerged: true});
```
The `Table.confidence`, `Row.getConfidence()` and `Cell.confidence` scores reflect confidence of the **table structure detection** model. For aggregated OCR confidence of the text contained inside, use `.getOcrConfidence()` instead.
Use `Table.tableType` and `Cell.hasEntityTypes()` to explore the more advanced [entity types](https://docs.aws.amazon.com/textract/latest/dg/how-it-works-tables.html) extracted by Amazon Textract: For example column headers, title cells, footer cells, and summary cells:
```typescript
import { ApiTableCellEntityType, ApiTableEntityType } from "amazon-textract-response-parser";
const isSemiStruct = table.tableType === ApiTableEntityType.SemiStructuredTable;
const colHeaders = table.rowAt(1).listCells()
.filter((c) => c.hasEntityTypes(ApiTableCellEntityType.ColumnHeader));
```
For [overall table-level title and footer captions](https://aws.amazon.com/blogs/machine-learning/announcing-enhanced-table-extractions-with-amazon-textract/), see `table.listTitles()` and `table.listFooters()`, etc.
## Layout analysis
[Layout analysis in Amazon Textract](https://aws.amazon.com/blogs/machine-learning/amazon-textracts-new-layout-feature-introduces-efficiencies-in-general-purpose-and-generative-ai-document-processing-tasks/) detects higher-level semantic components than the core text Lines & Words - like paragraphs and headings. If you enabled this analysis, you can access the results through the `page.layout` collection:
```typescript
// Loop through content in implied reading order (from Layout API):
page.layout.listItems().forEach((layItem) => {
console.log(layItem.blockType); // There are different kinds of Layout Item
const textLines = layItem.listTextLines(); // All Layout* items can be queried for text LINEs
const children = layItem.listContent(); // Usually text LINEs, but sometimes other Layout* items
console.log(layItem.text + "\n"); // ...Or you can just pull up the text
});
// Filtering by content type is also supported:
for (const layItem of page.layout.listItems({
skipBlockTypes: [
ApiBlockType.LayoutHeader, ApiBlockType.LayoutFooter, ApiBlockType.LayoutPageNumber
],
})) {
console.log(layItem.text);
}
```
If Forms and/or Tables analyses were also enabled, you'll be able to traverse from the relevant Layout object types to these more detailed representations. **However,** because these are separate analyses, the correspondence may not be 1-to-1 and TRP has to do some reconciliation under the hood:
```typescript
import { ApiBlockType, LayoutKeyValue, LayoutTable } from "amazon-textract-response-parser";
page.layout.listItems().forEach((layItem) => {
if (layItem.blockType === ApiBlockType.LayoutKeyValue) {
const fields = (layItem as LayoutKeyValue).listFields(); // Probably multiple
fields.forEach((field) => console.log(field.key.text));
} else if (layItem.blockType === ApiBlockType.LayoutTable) {
const tables = (layItem as LayoutTable).listTables(); // Probably just 1
tables.forEach((table) => console.log(table.nCells));
}
});
```
### List text in approximate reading order (with or without `Layout`)
Particularly for multi-column documents, the default output sequence for Amazon Textract `LINE`/`WORD` OCR results will likely not be the overall reading order you'd like. For best performance, enable and use the `Layout` analysis because **layout items are returned in implied reading order** as estimated by the AI service.
Alternatively, TRP.js provides a **client-side heuristic algorithm** that can attempt to sort results without Layout. There are even some configuration parameters exposed to help you tune the results for your particular domain, and test harnesses in the [test/unit/corpus folder](test/unit/corpus) to help you experiment via `npm run test:unit`:
```typescript
import { ReadingOrderLayoutMode } from "amazon-textract-response-parser";
// By default, we automatically use `Layout` when it's available and heuristics when it's not:
let textInReadingOrder: string = page.getTextInReadingOrder(); // Just generate text
let pseudoParas = page.getLineClustersInReadingOrder();
// You can force use of `Layout` (throwing an error if none available):
let layText = page.getTextInReadingOrder({ useLayout: ReadingOrderLayoutMode.RequireLayout });
// Or fine-tune heuristic parameters:
let layParas = page.getLineClustersInReadingOrder({
colHOverlapThresh: 0.75,
paraVDistTol: 0.8,
// ...
useLayout: ReadingOrderLayoutMode.IgnoreLayout,
});
// Lines are clustered by "paragraph"/layout element:
for (const pseudoParagraph of pseudoParas) {
for (const line of pseudoParagraph) {
console.log(line.text);
}
console.log(); // Print a gap between "paragraphs"
}
```
When configured to use Layout analysis results, these functions should be equivalent to just looping through your `page.layout.iterItems()` to get the text from each one in order.
### Render documents to semantic markup/markdown
If you'd like to use AI/ML models to further post-process your Amazon Textract results, you have a choice between those that take text-only inputs - and "multi-modal" models that can also ingest structural information (see for example [this Amazon Comprehend feature](https://aws.amazon.com/about-aws/whats-new/2021/09/amazon-comprehend-extract-entities-native-format/) and [this Amazon SageMaker sample](https://github.com/aws-samples/amazon-textract-transformer-pipeline/tree/main)). While multi-modal models may work best on complex structured documents, the pace of research on text-only Large Language Models has historically been faster (perhaps because plain text data is easier to come by and work with).
**Semantic markup like HTML** provides somewhat of a middle ground where we can try to preserve the layout/form/table/etc structure Amazon Textract extracted, but still provide plain text. This may be particularly useful for working with **Generative Large Language Models** (GenAI/LLMs) like those on [Amazon Bedrock](https://aws.amazon.com/bedrock/).
```typescript
// Render HTML for individual components:
console.log(page.listTables()[0].html());
// ...Or for whole pages/documents:
const docHtml = doc.html();
fs.writeFile("./my-doc.html", docHtml, (err) => {});
```
Some caveats to be aware of:
- Top-level `Page.html()` and `TextractDocument.html()` currently depend on Layout analysis being enabled, because the Layout results are used to sequence all the elements together.
- Only HTML is supported currently, but we're keen to add `.markdown()` if there's interest
You can also **filter out** types of content you don't want to include in your HTML.
```typescript
// Most commonly, you'll `skip` high-level layout elements like `LayoutHeader`:
const docHtml = doc.html({
skipBlockTypes: [
ApiBlockType.LayoutHeader, ApiBlockType.LayoutFooter, ApiBlockType.LayoutPageNumber
],
});
// Skipping lower-level blocks is also possible, but can produce weird results:
const docHtmlNoCellsOrSelectors = doc.html({
skipBlockTypes: [ApiBlockType.Cell, ApiBlockType.SelectionElement],
});
// Allow-listing is also possible, but you should include *everything* relevant:
const docTablesHtml = doc.html({
includeBlockTypes: [
ApiBlockType.Page,
ApiBlockType.LayoutTable,
ApiBlockType.Table,
ApiBlockType.Cell,
ApiBlockType.SelectionElement,
ApiBlockType.Word,
],
});
```
If you have feedback about these features, please let us know in the GitHub issues to help prioritise!
### Segment headers and footers from main content
This is another task for which you might find [Textract Layout analysis](https://docs.aws.amazon.com/textract/latest/dg/layoutresponse.html) useful - by looping through layout items and filtering out those of type `LayoutHeader`, `LayoutFooter` and `LayoutPageNumber`.
However, TRP.js also provides a heuristic function you can try instead:
```typescript
const segmented = page.getLinesByLayoutArea(
true // (Also try to sort lines in reading order)
);
console.log("---- HEADER:")
console.log(segmented.header.map((l) => l.text).join("\n"));
console.log("\n---- CONTENT:")
console.log(segmented.content.map((l) => l.text).join("\n"));
console.log("\n---- FOOTER:")
console.log(segmented.footer.map((l) => l.text).join("\n"));
```
**Note:** Unlike the `*InReadingOrder` APIs, this utility has not yet been updated to use Textract Layout analysis when it's available. That behavior might change in future.
### Calculate average skew of page text
Calculating the overall skew of a page can be useful for validation checks: For example to detect and reject a strongly skewed image which might degrade the accuracy of tables, forms, or other downstream analyses.
```typescript
// Check the average angle/skew of detected text:
const skew = page.getModalWordOrientationDegrees();
```
This method aggregates the skew to find the most common angle across all content on the page.
## Signatures
If you enabled [signature detection in Amazon Textract](https://aws.amazon.com/blogs/machine-learning/detect-signatures-on-documents-or-images-using-the-signatures-feature-in-amazon-textract/), you can check for signatures at the page level:
```typescript
// e.g. print number of signatures detected by page:
doc.listPages()
.forEach((page, ix) => { console.log(`${page.nSignatures} signatures on page ${ix+1}`); });
// ...Or get the position of the first signature on the first page:
const bbox = doc.pageNumber(1).listSignatures()[0].geometry.boundingBox;
```
## Expense (invoice and receipt) objects
Since the format of responses for Amazon Textract's [Expense results](https://docs.aws.amazon.com/textract/latest/dg/expensedocuments.html) is very different from the [general document analysis APIs](https://docs.aws.amazon.com/textract/latest/dg/how-it-works-document-layout.html), you can use the separate `TextractExpense` class in this library to process these.
```typescript
const expense = new TextractExpense(textractResponse);
// Iterate through content:
console.log(`Found ${expense.nDocs} expense docs in file`);
const expenseDoc = [...expense.iterDocs()][0];
for (const group of expenseDoc.iterLineItemGroups()) {
for (const item of group.iterLineItems()) {
console.log(`Found line item with ${item.nFields} fields`);
for (const field of item.iterFields()) {
...
}
}
}
// Get snapshot arrays instead of iterators, if you need:
const summaryFieldsArrByDoc = expense.listDocs().map((doc) => doc.listSummaryFields());
const linesArrsByPage = doc.listPages().map((p) => p.listLines())
// Retrieve item fields by their tagged 'type':
const vendorNameFields = expenseDoc.searchSummaryFieldsByType("VENDOR_NAME");
console.log(`Found ${vendorNameFields.length} vendor name fields in doc summary`);
console.log(vendorNameFields[0].fieldType.text); // "VENDOR_NAME"
console.log(vendorNameFields[0].value.text); // e.g. "Amazon.com"
```
## Identity document objects
Similarly to expenses mentioned above, Amazon Textract offers specific APIs for [identity document analysis](https://docs.aws.amazon.com/textract/latest/dg/how-it-works-identity.html). You can use the separate `TextractIdentity` class in this library to process these.
```typescript
import { ApiAnalyzeIdResponse, TextractIdentity } from "amazon-textract-response-parser";
import { TextractClient, AnalyzeIDCommand } from "@aws-sdk/client-textract";
const textract = new TextractClient({});
async function main() {
const textractResponse = await textract.send(
new AnalyzeIDCommand({
Document: { Bytes: await fs.readFile("...") },
})
);
const identity = new TextractIdentity((textractResponse as unknown) as ApiAnalyzeIdResponse);
}
```
The library implements some enumerations of known values (for field types, ID types, and so on) to make processing AnalyzeID responses a little simpler:
```typescript
import { IdDocumentType, IdFieldType } from "amazon-textract-response-parser";
const idDoc = identity.getDocAtIndex(0); // (Or iterate, list docs in a result)
if (idDoc.idType === IdDocumentType.Passport) {
// Fetch fields by known type:
const passNumField = idDoc.getFieldByType(IdFieldType.DocumentNumber);
console.log(
`Passport number ${passNumField.value}, confidence ${passNumField.valueConfidence}%`
);
} else if (idDoc.idType === IdDocumentType.DrivingLicense) {
// ...Or list or iterate the document's fields:
for (const field of idDoc.iterFields()) {
console.log(`${field.fieldTypeRaw}: ${field.valueRaw}`);
}
} else {
// Produce human-readable representations of fields, documents, or whole responses:
console.log(idDoc.str());
}
```
## Mutation operations
Easier analysis and querying of Textract results is useful, but what if you want to augment or edit your Textract JSONs with JS/TS Textract Response Parser?
In general:
- Where the library classes (`TextractDocument`, `Page`, `Word`, etc) offer mutation operations, these should modify the source API JSON object **in-place** and ensure self-consistency.
- For library classes that are backed by a specific object in the source API JSON, you can access it via the `.dict` property (`word.dict`, `table.dict`, etc) but then *you're* responsible for updating any required references in other objects if making changes there.
- Any individual-block-level changes you make to the underlying API JSON should be dynamically reflected in the parsed TRP objects (e.g. overriding word text, coordinates, etc)... But changes that affect inter-block relationships are more likely to cause staleness issues.
In particular for **array properties**, you'll note that TRP generally exposes getters and iterators (such as `table.nRows`, `table.iterRows()`, `table.listRows()`, `table.cellsAt()`) rather than direct access to lists - to avoid implying that arbitrary array mutations (such as `table.rows.pop()`) are properly supported.
## Other features and examples
For more examples on how to use the library, you can refer to the (basic) [examples](examples/) and (more complete) [test](test/) folders on GitHub, and the source code itself. If you have suggestions for additional features that would be useful, please open a GitHub issue!
## Development
The integration tests for this library validate the end-to-end toolchain for calling Amazon Textract and parsing the result, so note that to run the full `npm run test` command:
1. Your environment will need to be configured with a login to AWS (e.g. via the [AWS CLI](https://aws.amazon.com/cli/))
2. Billable API requests may be made
You can alternatively run just the local/unit tests via `npm run test:unit`.
================================================
FILE: src-js/bin/reading-order-diagnostic.js
================================================
/**
* Basic script to extract and save reading-order text from Amazon Textract JSONs.
*
* This script uses the built NodeJS library, so check your build is up-to-date by first running
* `npm run build`! JSON files are read from IN_FOLDER, parsed with the TRP, and reading-order text
* files output to the OUT_FOLDER. This can be a helpful tool for debugging issues with (or writing
* corpus tests for) the 'inReadingOrder' functions.
*/
/* eslint-disable no-undef */
/* eslint-disable @typescript-eslint/no-var-requires */
// Node Built-Ins:
const fs = require("fs");
const path = require("path");
// Local Dependencies:
const { TextractDocument } = require("../dist/cjs");
// Folder of Amazon Textract response JSON files to process (resolved against the working dir):
const IN_FOLDER = "test/data/corpus";
// Folder that receives the generated `.readingorder.txt` / `.readingorder.json` diagnostics:
const OUT_FOLDER = "test/data/corpus-readingorder";
// Horizontal rule printed above and below each "PAGE n" heading in the text output:
const PAGE_RULE = "------------------------------------------------";

// `recursive: true` creates missing parent folders and is a no-op when the folder already exists,
// so no separate existence check is needed.
fs.mkdirSync(OUT_FOLDER, { recursive: true });

for (const file of fs.readdirSync(IN_FOLDER)) {
  // Anything that can't be read and parsed as JSON (including sub-folders) is reported & skipped:
  let response;
  try {
    response = JSON.parse(fs.readFileSync(path.join(IN_FOLDER, file), "utf8"));
  } catch (err) {
    console.error(`Skipping ${file} - doesn't look like valid JSON`);
    continue;
  }

  const doc = new TextractDocument(response);
  const pages = doc.listPages();

  // Strip only the final extension, so inputs like "doc.v1.json" and "doc.v2.json" produce
  // distinct outputs instead of both collapsing to "doc" and overwriting each other:
  const outFileRoot = path.join(OUT_FOLDER, path.parse(file).name);

  // Plain-text output: one headed section per page, separated by blank lines.
  const pageTexts = pages.map((page, ixPage) =>
    [PAGE_RULE, `PAGE ${ixPage + 1}`, PAGE_RULE, page.getTextInReadingOrder()].join("\n")
  );
  fs.writeFileSync(`${outFileRoot}.readingorder.txt`, pageTexts.join("\n\n\n"));

  // JSON output: the text of every line, nested as page > column > cluster > line.
  // NOTE(review): `_getLineClustersByColumn` looks like a private/internal TRP method - confirm
  // it is still available if the library's internals change.
  const clusterTexts = pages.map((page) =>
    page._getLineClustersByColumn().map((col) => col.map((cluster) => cluster.map((line) => line.text)))
  );
  fs.writeFileSync(`${outFileRoot}.readingorder.json`, JSON.stringify(clusterTexts, null, 2));

  console.log(`Done ${file}`);
}
console.log("All done!");
================================================
FILE: src-js/examples/README.md
================================================
# Examples for TRP.js
This folder contains example projects using the Amazon Textract Response Parser for JavaScript/TypeScript from various different build environments, to help you get started.
> ⚠️ **Note:** While all of the example projects reference local API response JSON files, some also make Amazon Textract API calls by default - so running them may incur (typically very small) charges. See [Amazon Textract Pricing](https://aws.amazon.com/textract/pricing/) for details.
## Pre-requisites for running the examples
### Local builds of TRP.js
The projects use the **local build** of the library for pre-publication testing, so you'll need to run `npm run build` in the parent `src-js` folder before they'll work.
To instead switch to published TRP.js versions (if you're using an example as a skeleton for your own project):
- For NodeJS projects, replace the package.json relative path in `"amazon-textract-response-parser": "file:../.."` with a normal version spec like `"amazon-textract-response-parser": "^0.4.3"`, and re-run `npm install`
- For browser IIFE projects, edit the `
Select an Amazon Textract API response JSON file to analyze