Full Code of ZJONSSON/node-etl for AI

Repository: ZJONSSON/node-etl
Branch: master
Commit: 60e7da365898
Files: 79
Total size: 109.8 KB

Directory structure:
node-etl/

├── .github/
│   └── workflows/
│       ├── coverage.yml
│       ├── publish.yml
│       └── test.yml
├── .gitignore
├── .npmignore
├── .vscode/
│   ├── launch.json
│   └── tasks.json
├── Dockerfile
├── README.md
├── docker-compose.yml
├── index.js
├── lib/
│   ├── bigquery/
│   │   └── index.js
│   ├── chain.js
│   ├── cluster.js
│   ├── collect.js
│   ├── csv_parser.js
│   ├── cut.js
│   ├── elasticsearch/
│   │   ├── bulk.js
│   │   ├── find.js
│   │   ├── index.js
│   │   ├── mapping.js
│   │   └── scroll.js
│   ├── expand.js
│   ├── file.js
│   ├── fixed.js
│   ├── index.js
│   ├── inspect.js
│   ├── keepOpen.js
│   ├── mongo/
│   │   ├── bulk.js
│   │   ├── index.js
│   │   ├── insert.js
│   │   └── update.js
│   ├── mysql/
│   │   ├── execute.js
│   │   ├── index.js
│   │   ├── mysql.js
│   │   ├── script.js
│   │   └── upsert.js
│   ├── postgres/
│   │   ├── execute.js
│   │   ├── index.js
│   │   ├── insert.js
│   │   ├── postgres.js
│   │   ├── script.js
│   │   └── upsert.js
│   ├── prescan.js
│   ├── split.js
│   ├── stringify.js
│   ├── timeout.js
│   ├── toFile.js
│   └── tostream.js
├── package.json
├── test/
│   ├── chain-test.js
│   ├── cluster-test.js
│   ├── collect-test.js
│   ├── csv-test.js
│   ├── cut-test.js
│   ├── data-changed.js
│   ├── data.js
│   ├── elastic-retry-test.js
│   ├── elastic-test.js
│   ├── expand-test.js
│   ├── file-test.js
│   ├── fixed-test.js
│   ├── keepopen-test.js
│   ├── lib/
│   │   ├── dataStream.js
│   │   ├── mongo.js
│   │   └── worker.js
│   ├── mongo-insert-test.js
│   ├── mongo-update-test.js
│   ├── mysql-test.js
│   ├── postgres-test.js
│   ├── scan-test.js
│   ├── split-test.js
│   ├── stringify-test.js
│   ├── test.csv
│   ├── test.txt
│   ├── timeout-test.js
│   ├── tofile-test.js
│   └── tostream-test.js
└── test.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/coverage.yml
================================================
name: Node.js CI

on:
  push:
    branches: [ master ]
  workflow_dispatch:

permissions:
  contents: read
  pages: write
  id-token: write

jobs:
  coverage:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Generate coverage report
        uses: actions/setup-node@v3
        with:
          node-version: 18.x
      - run: npm test
      - run: docker exec node-etl-runner-1 bash -c "npx lcov-badge2 .tap/report/lcov.info -o.tap/report/lcov-report/badge.svg"
      - name: Setup Pages
        uses: actions/configure-pages@v3
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v2
        with:
          path: '.tap/report/lcov-report'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v2

================================================
FILE: .github/workflows/publish.yml
================================================
name: Publish to NPM
on:
  release:
    types: [created]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Setup Node
        uses: actions/setup-node@v3
        with:
          node-version: '18.x'
          registry-url: 'https://registry.npmjs.org'
      - name: Publish package on NPM 📦
        run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

================================================
FILE: .github/workflows/test.yml
================================================
name: Node.js CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Unit tests
        uses: actions/setup-node@v3
        with:
          node-version: 18.x
      - run: npm test


================================================
FILE: .gitignore
================================================
/.docker_node_modules/
/node_modules/
/.nyc_output/
/coverage/
/.tap/
package-lock.json

================================================
FILE: .npmignore
================================================
/.tap/
/test/
/.docker_node_modules/
/.vscode/


================================================
FILE: .vscode/launch.json
================================================
{
  // Use IntelliSense to learn about possible attributes.
  // Hover to view descriptions of existing attributes.
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [

    {
      "type": "node",
      "request": "attach",
      "preLaunchTask": "Inspect Current File (Docker)",
      "name": "Debug current file (docker)",
      "localRoot": "${workspaceRoot}",
      "remoteRoot": "/usr/src/app",
     // "address": "172.19.0.100",
      "port": 9229,
  },
  ]
}

================================================
FILE: .vscode/tasks.json
================================================
{
  // See https://go.microsoft.com/fwlink/?LinkId=733558
  // for the documentation about the tasks.json format
  "version": "2.0.0",
  "tasks": [
    {
      "label": "Inspect Current File (Docker)",
      "type": "shell",
      "command": "bash docker exec -it etl-runner-1 pkill -f ':9229';(docker exec  etl-runner-1 node --inspect-brk=173.23.0.100:9229 /usr/src/app/${relativeFile} &);sleep 1",
      "group": "test",
    },
    {
      "label": "Tap Current File (Docker)",
      "type": "shell",
      "command": "docker exec -it etl-runner-1 tap -Rspec /usr/src/app/${relativeFile}",
      "group": "test",
    },
    {
      "label": "All Unit tests (Docker)",
      "type": "shell",
      "command": "npm test",
      "group": "test",
    }
  ]
}

================================================
FILE: Dockerfile
================================================
FROM node:20-bookworm
USER root
RUN apt-get update
RUN wget http://ftp.us.debian.org/debian/pool/main/w/wait-for-it/wait-for-it_0.0~git20180723-1_all.deb
RUN dpkg -i ./wait-for-it_0.0~git20180723-1_all.deb
RUN apt-get install -f
RUN npm install -g tap

================================================
FILE: README.md
================================================
[![NPM Version][npm-image]][npm-url]
[![NPM Downloads][downloads-image]][downloads-url]
[![code coverage](https://zjonsson.github.io/node-etl/badge.svg)](https://zjonsson.github.io/node-etl/) 


[npm-image]: https://img.shields.io/npm/v/etl.svg
[npm-url]: https://npmjs.org/package/etl
[downloads-image]: https://img.shields.io/npm/dm/etl.svg
[downloads-url]: https://npmjs.org/package/etl

ETL is a collection of stream based components that can be piped together to form a complete ETL pipeline with buffering, bulk-inserts and concurrent database streams. See the `test` directory for live examples.

```
npm install etl
```

Introductory example: csv -> elasticsearch 

```js
fs.createReadStream('scores.csv')
  // parse the csv file
  .pipe(etl.csv())
  // map `date` into a javascript date and set unique _id
  .pipe(etl.map(d => {
    d._id = d.person_id;
    d.date = new Date(d.date);
    return d;
  }))
  // collect 1000 records at a time for bulk-insert
  .pipe(etl.collect(1000))  
  // index records into elastic with max 10 concurrent server requests
  .pipe(etl.elastic.index(esClient,'scores','records',{concurrency:10}))
  // Switch from stream to promise chain and report done or error
  .promise()
  .then( () => console.log('done'), e => console.log('error',e));
```

## API Reference
* [Parsers](#parsers)
* [Transforms](#transforms)
* [Database upload](#databases)
  * [Mongodb](#mongo)
  * [Mysql](#mysql)
  * [Postgres](#postgres)
  * [Elasticsearch](#elasticsearch)
  * [BigQuery](#bigquery)
* [Utilities](#utilities)

### Parsers 

<a name="csv" href="#csv">#</a> etl.<b>csv</b>([<i>options</i>])

Parses incoming csv text into individual records.  For parsing options see [csv-parser](https://www.npmjs.com/package/csv-parser).  If `options` contains a `transform` object containing functions, those functions will be applied to the values of any matching keys in the data.  If a key in the `transform` object is set to `null` then the value with that key will not be included in the downstream packets.  If the option `sanitize` is set to true, headers will be trimmed, converted to lowercase, spaces converted to underscores and any blank values (empty strings) will be set to undefined.

A `headers` event will be emitted when headers have been parsed. An event listener can change the headers in place before the stream starts piping out parsed data.

Example

```js
// Here the test.csv is parsed but field dt is converted to date.  Each packet will 
// contain the following properties:  __filename, __path, __line and csv fields
etl.file('test.csv')
  .pipe(etl.csv({
    transform: {
      dt : function(d) {
        return new Date(d);
      }
    }
  }))
  .pipe(...)
```
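
The `headers` event can be used to adjust column names in place before any rows are emitted. A minimal sketch (assuming the same `test.csv`):

```js
etl.file('test.csv')
  .pipe(etl.csv()
    .on('headers', headers => {
      // normalize column names in place before parsing starts
      headers.forEach((h, i) => headers[i] = h.trim().toLowerCase());
    }))
  .pipe(...)
```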

<a name="fixed" href="#fixed">#</a> etl.<b>fixed</b>(<i>layout</i>)

Parses incoming text into objects using a fixed-width layout.   The layout should be an object where each key is a field name to be parsed, mapped to an object with `start`, `end` and/or `length`.  Alternatively each key can map to a plain number, which will be treated as `length`.

If a key contains a `transform` function, it will be applied to the parsed value of that key. 


The layout can also be supplied as an array, where instead of an object key the field name is defined using the `field` property of each element, as sketched below.
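
For example, a sketch of a layout in array form (field names are illustrative):

```js
var layout = [
  {field: 'firstCol', start: 0, end: 10},
  {field: 'nextCol', length: 5, transform: Number},
  {field: 'lastCol', length: 5}
];
```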

The length of a single record will be determined by the highest `end` or `start+length` position.

Each packet will contain `__line`, a number signifying the sequential position of the record.

Example

```js
// Reads the file and parses into individual records.  Each packet will contain the
// following properties:  __filename, __path, __line, firstCol, nextCol, lastCol.
// nextCol values are coerced into numbers here

var layout = {
  firstCol: {start: 0, end:10},
  nextCol: {length:5, transform: Number},
  lastCol: 5
}

etl.file('test.txt')
  .pipe(etl.fixed(layout))
  .pipe(...)
```

If needed, you can get the current line as param in the transform function.

```js

var layout = {
  firstCol: {
    start: 0, end: 10,
    transform: function (value, line) {      
      console.log(`firstCol on line ${line} is: ${value}`); // firstCol on line 1 is: test
      return value;
    }
  }
}

```

### Transforms

<a name="map" href="#map">#</a> etl.<b>map</b>(<i>fn</i>)

The base [`streamz`](http://github.com/ZJONSSON/streamz) object is exposed as `etl.map` to provide a quick way to do mappings on the fly.  Anything `pushed` inside the custom function will go downstream.  Also, if the function has a return value (or a promise with a return value) that is `!== undefined`, that return value will be pushed as well.

Example

```js
// In this example names and ages are normalized into fresh objects pushed
// downstream.  (If we wanted to retain metadata we would use Object.create(d))

etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.map(function(d) {
    this.push({name: d.name_1, age: d.age_1});
    this.push({name: d.name_2, age: d.age_2});
    this.push({name: d.name_3, age: d.age_3});
  }))
```

<a name="split" href="#split">#</a> etl.<b>split</b>([<i>symbol</i>])

Splits the `text` of an incoming stream by the provided symbol into separate packets. The default symbol is a newline, i.e. splitting incoming text into individual lines.  If the supplied symbol is never found, the incoming stream will be buffered until the end, where all content is sent in one chunk.  The prototype of each packet is the first incoming packet for each buffer.

Example

```js
// Reads the file and splits `text` by newline
etl.file('text.txt')
  .pipe(etl.split())
```

<a name="cut" href="#cut">#</a> etl.<b>cut</b>(<i>maxLength</i>)

Cuts incoming text into text snippets of a given maximum length and pushes downstream.
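
Example (a minimal sketch; the text here divides evenly into two snippets):

```js
etl.toStream(['abcdefghijklmnopqrst'])
  .pipe(etl.cut(10))
  .pipe(etl.inspect());
// { text: 'abcdefghij', __line: 0 }   { text: 'klmnopqrst', __line: 1 }
```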

<a name="collect" href="#collect">#</a> etl.<b>collect</b>(<i>count</i> [,<i>maxDuration</i>] [,<i>maxTextLength</i>])

Buffers incoming packets until they reach a specified count and then sends the array of the buffered packets downstream. At the end of the incoming stream, any buffered items are shipped off even if the count has not been reached.   This functionality can come in handy when preparing to bulk-insert into databases.   An optional `maxDuration` can be supplied to signal the maximum amount of time (in ms) that can pass from a new collection starting until the contents are pushed out.  This can come in handy when processing real-time sporadic data where we want the collection to flush early even if the count has not been reached.  Finally, defining an optional `maxTextLength` will cause the stream to keep track of the stringified length of the buffer and push when it goes over the limit.

Example:

```js
var data = [1,2,3,4,5,6,7,8,9,10,11],
    collect = etl.collect(3);

data.forEach(collect.write.bind(collect));
collect.end();

collect.pipe(etl.inspect());
// Should show 4 packets: [1,2,3]   [4,5,6]    [7,8,9]   [10,11]
```

If the first argument (`count`) is a function it will be used as a custom collection function.  This function can add elements to the buffer with `this.buffer.push(data)` and push the buffer downstream with `this._push()`.   When the stream ends, any remaining buffer is pushed automatically.
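
For example, a hypothetical custom function that flushes whenever an incoming record carries a `flush` flag:

```js
var collect = etl.collect(function(d) {
  this.buffer.push(d);
  // flush the buffer downstream whenever a record requests it
  if (d.flush) this._push();
});
```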

<a name="chain" href="#chain">#</a> etl.<b>chain</b>(<i>fn</i>)

Allows a custom subchain of streams to be injected into the pipe using `duplexer3`. You must provide a custom function that takes the inbound stream as a first argument and optionally an outbound stream as the second argument.   You can use the optional outbound stream directly to chain the two streams together, or you can return a stream or a Promise resolved with a stream or values (all of which will be piped down with `etl.toStream`).

Example 1: Simple return of the outbound stream

```js
etl.file('test.csv')
  .pipe(etl.chain(function(inbound) {
    return inbound
      .pipe(etl.csv())
      .pipe(etl.collect(100));
  }))
  .pipe(console.log);
```

Example 2: Using the outbound stream from the arguments
```js
etl.file('test.csv')
  .pipe(etl.chain(function(inbound,outbound) {
    inbound
      .pipe(etl.csv())
      .pipe(etl.collect(100))
      .pipe(outbound);
  }))
  .pipe(console.log);
```

<a name="prescan" href="#prescan">#</a> etl.<b>prescan</b>(<i>size</i>,<i>fn</i>)

Buffers the incoming data until the supplied size is reached (either a number of records for objects, or buffer/string length). When the target size is reached, the supplied function will be called with the buffered data (array) as an argument. After the function has executed and the returned promise (if any) has been resolved, all buffered data will be piped downstream, as well as all subsequent data.

Prescan allows the user to make certain determinations from the incoming data before passing it down, such as inspecting data types across multiple rows.


Example:

```js
// In this example we want to collect all columns from the first 10 rows
// of a newline-delimited JSON file to build a csv header row

let headers = new Set();
fs.createReadStream('data.json')
  .pipe(etl.split()) // split on newline
  .pipe(etl.map(d => JSON.parse(d)))  // parse each line as json
  .pipe(etl.prescan(10,d => 
    // build up headers from the first 10 lines
    d.forEach(d => Object.keys(d).forEach(key => headers.add(key)))
  ))
  .pipe(etl.map(function(d) {
    if (!this.firstline) {
      this.push([...headers].join(',')+'\n');
      this.firstline = true;
    }
    this.push(headers.map(header => d[header]).join(',')+'\n');
  }))
  .pipe(fs.createWriteStream('data.csv'))
```

<a name="expand" href="#expand">#</a> etl.<b>expand</b>([<i>convert</i>])

Throughout the etl pipeline, new packets are generated with incoming packets as prototypes (using `Object.create`).  This means that inherited values are not enumerable and will not show up in stringification by default (although they are available directly).  `etl.expand()` loops through all keys of an incoming packet and explicitly sets any inherited values as regular properties of the object.

Example:

```js
// In this example the `obj` would only show property `c` in stringify
// unless expanded first
var base = {a:1,b:'test'},
    obj = Object.create(base),
    s = etl.streamz();

obj.c = 'visible';
s.end(obj);

s.pipe(etl.expand())
  .pipe(etl.inspect());
```

The optional `convert` option will modify the keys of the new object.  If `convert` is `'uppercase'` or `'lowercase'` the case of the keys will be adjusted accordingly.  If `convert` is a function, the key name will be set to the function's output (and if the output is `undefined`, that particular key will not be included in the new object).
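
For example, continuing the sketch above:

```js
// keys a, b and c will be exposed as A, B and C on the expanded object
s.pipe(etl.expand('uppercase'))
  .pipe(etl.inspect());
```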

<a name="stringify" href="#stringify">#</a> etl.<b>stringify</b>([<i>indent</i>] [,<i>replacer</i>] [,<i>newline</i>])

Transforms incoming packets into JSON stringified versions, with optional `indent` and `replacer`.  If `newline` is true a `\n` will be appended to each packet.
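
Example:

```js
// each packet becomes a string like '{"a":1}\n'
etl.toStream([{a: 1}, {b: 2}])
  .pipe(etl.stringify(0, null, true))
  .pipe(...)
```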

<a name="inspect" href="#inspect">#</a> etl.<b>inspect</b>([<i>options</i>])

Logs incoming packets to console using `util.inspect` (with optional custom options)

<a name="timeout" href="#timeout">#</a> etl.<b>timeout</b>([<i>ms</i>])

A passthrough transform that emits an error if no data has passed through for at least the supplied milliseconds (`ms`).  This is useful to manage pipelines that go stale for some reason and need to be errored out for further inspection.

Example:

```js
// Here the pipeline times out if no data has been flowing to the file for at least 1 second
mongocollection.find({})
  .pipe(lookup)
  .pipe(etl.timeout(1000))
  .pipe(etl.toFile('test.json'))
```


### Databases

#### Mongo

<a name="mongoinsert" href="#mongoinsert">#</a> etl.mongo.<b>insert</b>(<i>collection</i> [,<i>options</i>])

Inserts incoming data into the provided mongodb collection.  The supplied collection can be a promise on a collection.  The options are passed on to both streamz and the mongodb insert command.  By default this object doesn't push anything downstream, but if `pushResults` is set to `true` in options, the results from mongo will be pushed downstream.

Example

```js
// The following inserts data from a csv, 10 records at a time into a mongo collection
// ..assuming mongo has been promisified

var db = mongo.ConnectAsync('mongodb://localhost:27017/testdb');
var collection = db.then(function(db) {
  return db.collection('testcollection');
});

etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.collect(10))
  .pipe(etl.mongo.insert(collection));

```

<a name="mongoupdate" href="#mongoupdate">#</a> etl.mongo.<b>update</b>(<i>collection</i> [,<i>keys</i>] [,<i>options</i>])

Updates incoming data by building a `criteria` from an array of `keys` and the incoming data.   Supplied collection can be a promise and results can be pushed downstream by declaring `pushResults : true`.   The options are passed to mongo so defining `upsert : true` in options will ensure an upsert of the data.

Example

```js
// The following updates incoming persons using the personId as a criteria (100 records at a time)

etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.collect(100))
  .pipe(etl.mongo.update(collection,['personId']));

```

<a name="mongouupsert" href="#mongoupsert">#</a> etl.mongo.<b>upsert</b>(<i>collection</i> [,<i>keys</i>] [,<i>options</i>])

Syntax sugar for `mongo.update` with `{upsert: true}`

By default `update` and `upsert` will take each data object and wrap it within a `$set`.  If you want full control of the mongo update verbs used, you can put them under `$update` in the data object, as sketched below.
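
For example, a hypothetical sketch that takes control of the update verbs:

```js
// upsert on personId; $set and $inc are used as-is instead of a plain $set wrap
etl.toStream([{personId: 1, $update: {$set: {name: 'test'}, $inc: {visits: 1}}}])
  .pipe(etl.mongo.upsert(collection, ['personId']));
```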

#### Mysql

<a name="mysqlupsert" href="#mysqlupsert">#</a> etl.mysql.<b>upsert</b>(<i>pool</i>, <i>schema</i>, <i>table</i> [,<i>options</i>])

Pipeline that scripts incoming packets into bulk sql commands (`etl.mysql.script`) and executes them (`etl.mysql.execute`) using the supplied mysql pool. When the size of each SQL command reaches `maxBuffer` (1mb by default) the command is sent to the server.  Concurrency is managed automatically by the mysql poolSize. 

Example:

```js
etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.mysql.upsert(pool,'testschema','testtable',{concurrency:4 }))
```

<a name="mysqlscript" href="#mysqlscript">#</a> etl.mysql.<b>script</b>(<i>pool</i>, <i>schema</i>, <i>table</i> [,<i>options</i>])

Collects data and builds up a mysql statement to insert/update data until the buffer is more than `maxBuffer` (customizable in options).  When the maxBuffer is reached, a full sql statement is pushed downstream.   When the input stream has ended, any remaining sql statement buffer will be flushed as well.

The script stream first establishes the column names of the table being updated, and as data comes in - it uses only the properties that match column names in the table.

<a name="mysqlexecute" href="#mysqlexecute">#</a> etl.mysql.<b>execute</b>(<i>pool</i> [,<i>options</i>])

This component executes any incoming packets as sql statements using connections from the connection pool. The maximum concurrency is automatically determined by the mysql poolSize, using the combination of callbacks and Promises.

Example:

```js
// The following bulks data from the csv into sql statements and executes them with 
// a maximum of 4 concurrent connections

etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.mysql.script(pool,'testschema','testtable'))
  .pipe(etl.mysql.execute(pool,4))
```

#### Postgres

<a name="postgresupsert" href="#postgresupsert">#</a> etl.postgres.<b>upsert</b>(<i>pool</i>, <i>schema</i>, <i>table</i> [,<i>options</i>])

Pipeline that scripts incoming packets into bulk sql commands (`etl.postgres.script`) and executes them (`etl.postgres.execute`) using the supplied postgres pool. When the size of each SQL command reaches `maxBuffer` (1mb by default) the command is sent to the server.  Concurrency is managed automatically by the postgres poolSize. If a primary key is defined and an incoming data packet contains a primary-key value that already exists in the table, the record will be updated - otherwise the packet will be inserted.


Example:

```js
etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.postgres.upsert(pool,'testschema','testtable',{concurrency:4 }))
```


<a name="postgresscript" href="#postgresscript">#</a> etl.postgres.<b>script</b>(<i>pool</i>, <i>schema</i>, <i>table</i> [,<i>options</i>])

Collects data and builds up a postgres statement to insert/update data until the buffer is more than `maxBuffer` (customizable in options).  When the maxBuffer is reached, a full sql statement is pushed downstream.   When the input stream has ended, any remaining sql statement buffer will be flushed as well.

The script stream first establishes the column names of the table being updated, and as data comes in - it uses only the properties that match column names in the table.

<a name="postgresexecute" href="#postgresexecute">#</a> etl.postgres.<b>execute</b>(<i>pool</i> [,<i>options</i>])

This component executes any incoming packets as sql statements using connections from the connection pool. The maximum concurrency is automatically determined by the postgres poolSize, using the combination of callbacks and Promises.

Example:

```js
// The following bulks data from the csv into sql statements and executes them with 
// a maximum of 4 concurrent connections

etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.postgres.script(pool,'testschema','testtable'))
  .pipe(etl.postgres.execute(pool,4))
```

#### Elasticsearch

<a name="elasticbulk" href="#elasticbulk">#</a> etl.elastic.<b>bulk</b>(<i>action</i>, <i>client</i>, [,<i>index</i>] [,<i>type</i>] [,<i>options</i>])

Transmits incoming packets to elasticsearch, setting the appropriate meta-data depending on the chosen action. Each incoming packet can be an array of documents (or a single document).  Each document should contain a unique `_id`.   To bulk documents together, use `etl.collect(num)` upstream of the elastic adapter.

The results are not pushed downstream unless `pushResults` is defined in the options. The body of the incoming data is included in the results, allowing for easy resubmission upon version conflicts. By defining option `pushErrors` as `true`, only errors will be pushed downstream.  The maximum number of concurrent connections can be defined as option `concurrency`.  If `maxRetries` is defined in options, an error response from the server will result in retries up to the specified limit - after a wait of `retryDelay` or 30 seconds.  This can be useful for long-running upsert operations that might encounter occasional network or timeout errors along the way.  If `debug` is defined true, the error message will be printed to console before retrying.  `maxRetries` should only be used for data with a user-supplied `_id` to prevent potential duplicate records on retry.

An exponential backoff is enabled by defining `backoffDelay` in options. The backoff can be capped by defining `maxBackoffDelay`, and variance can be applied by defining `backoffVariance` (should be between 0 and 1).
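
For example, a sketch combining the retry options (values are illustrative):

```js
etl.elastic.upsert(esClient, 'testindex', undefined, {
  concurrency: 10,
  maxRetries: 5,          // give up after 5 attempts
  backoffDelay: 1000,     // 1s, 2s, 4s, ... between attempts
  maxBackoffDelay: 30000, // cap the delay at 30s
  backoffVariance: 0.2    // apply +/- 10% jitter to each delay
})
```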

If index or type are not specified when the function is called,  they will have to be supplied as `_index` and `_type` properties of each document. The bulk command first looks for `_source` in the document to use as a document body (in case the document originates from a scroll command), alternatively using the document itself as a body.

Available actions are also provided as separate api commands:

* `etl.elastic.index(client,index,type,[options])`
* `etl.elastic.update(client,index,type,[options])`
* `etl.elastic.upsert(client,index,type,[options])`
* `etl.elastic.delete(client,index,type,[options])`
* `etl.elastic.custom(client,index,type,[options])`

Example

```js
etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.collect(100))
  .pipe(etl.elastic.index(esClient,'testindex','testtype'))
```

Another example shows how one index can be copied to another, retaining the `_type` of each document:

```js
console.time('copy');
etl.elastic.scroll(esClient,{index: 'index.a', size: 5000})
  .pipe(etl.collect(1000))
  .pipe(etl.elastic.index(esClient,'index.b',null,{concurrency:10}))
  .promise()
  .then(function() {
    console.timeEnd('copy');
  });
```

If the `custom` action is selected, each packet must be the raw metadata to be sent to elasticsearch, with the optional second line stored in property `body`.
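
Example (a minimal sketch):

```js
// each packet is the raw action metadata; the document itself goes in `body`
etl.toStream([{index: {_index: 'testindex', _id: '1'}, body: {name: 'test'}}])
  .pipe(etl.elastic.custom(esClient))
```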

Since `_type` is no longer allowed in Elasticsearch 7 and later, it should be passed as undefined when targeting those versions.

#### BigQuery

<a name="bigquery" href="#bigquery">#</a> etl.bigquery.<b>insert</b>(<i>table</i>, [,<i>options</i>])

Bulk insert data into BigQuery. This function first downloads the field names for the table and then inserts the matching columns from the incoming data.  The first argument needs to be an instance of the BigQuery `Table` class.  Options can specify the concurrency (i.e. how many concurrent insert requests are allowed).

Example:

```js
const {BigQuery} = require('@google-cloud/bigquery');
const bigquery = new BigQuery(config);
const dataset = bigquery.dataset('my_dataset');
const table = dataset.table('my_table');

etl.file('test.csv')
  .pipe(etl.csv())
  .pipe(etl.collect(100))  // send 100 rows in each packet
  .pipe(etl.bigquery.insert(table, {concurrency: 5}));
```


### Cluster

<a name="clusterschedule" href="#clusterschedule">#</a> etl.cluster.<b>schedule</b>(<i>list</i> [,<i>num_threads</i>] [,<i>reportingInterval</i>])

Schedules a list (array) of tasks to be performed by workers.  Returns a promise on the completion of all the tasks.   The number of threads will default to the number of cpus.  If a reporting interval is defined, progress will be reported via console.log. Should only be run from the master thread.

<a name="clusterprocess" href="#clusterprocess">#</a> etl.cluster.<b>process</b>(<i>data</i> <i>[callback]</i>) 

This function should be overwritten in the worker to perform each task and either return a Promise that is resolved when the task is done or call the optional callback.

<a name="clusterprocess" href="#clusterprocess">#</a> etl.cluster.<b>process</b>(<i>num</i>)

This function sends a numerical value representing progress up to the master (for reporting).  
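
Example (a sketch, where `processFile` is a hypothetical function returning a promise):

```js
const etl = require('etl');

if (etl.cluster.isMaster) {
  // distribute the task list over 2 workers, reporting progress every 1000ms
  etl.cluster.schedule(['a.csv', 'b.csv', 'c.csv'], 2, 1000)
    .then(() => console.log('all done'));
} else {
  // each worker processes one task at a time and reports progress
  etl.cluster.process = function(file) {
    return processFile(file)
      .then(() => etl.cluster.progress(1));
  };
}
```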


### Utilities

<a name="tostream" href="#tostream">#</a> etl.<b>toStream</b>(<i>data</i>)

A helper function that returns a stream that is initialized by writing every element of the supplied data (if array) before being ended.  This allows for an easy transition from a known set of elements to a flowing stream with concurrency control.  The input `data` can also be supplied as a promise or a function and the resulting values will be piped to the returned stream. If the resulting value from a supplied function or promise is a stream, it will be piped downstream.
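
Example:

```js
// start a flowing pipeline from an in-memory array
etl.toStream([{name: 'a'}, {name: 'b'}])
  .pipe(etl.inspect());
```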

<a name="file" href="#file">#</a> etl.<b>file</b>(<i>data</i> [,<i>options</i>])

Opens up a `fileStream` on the specified file and pushes the content downstream.  Each packet has a base prototype of either an optional `info` object provided in options or an empty object.  The following properties are defined for each downstream packet: `__filename`, `__path` and `text` (the incremental contents of the file).

The optional `info` object allows setting generic properties that will, through inheritance, be available in any derived packets downstream.

Example:

```js
// each packet will contain  properties context, __filename, __path and text
etl.file('text.txt',{info: {context: 'test'}})
```

<a name="tofile" href="#tofile">#</a> etl.<b>toFile</b>(<i>filename</i>)

This is a convenience wrapper for `fs.createWriteStream` that returns a `streamz` object.  This allows appending `.promise()` to capture the finish event (or error) in a promise form.

Example:

```js
etl.toStream([1,2,3,4,5])
  .pipe(etl.stringify(0,null,true))
  .pipe(etl.toFile('/tmp/test.txt'))
  .promise()
  .then(function() {
    console.log('done')
  })
  .catch(function(e) {
    console.log('error',e);
  })
```

<a name="keepopen" href="#keepopen">#</a> etl.<b>keepOpen</b>(<i>[timeout]</i>)

`etl.keepOpen([timeout])` is a passthrough component that stays open after receiving an `end`, only to finally close down when no data has passed through for a period of `timeout`.  This can be useful for any pipeline where data from the lower part of the pipeline is pushed back higher up for reprocessing (for example when encountering version conflicts of database documents), as it avoids a `write after end` error.   The default timeout is 1000ms.
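
Example (a minimal sketch):

```js
// `upload` stays open for ~5s of inactivity after `end`, so records written
// back into it from further down the pipeline are not rejected
const upload = etl.keepOpen(5000);
etl.toStream([{_id: 1}, {_id: 2}]).pipe(upload);
upload.pipe(etl.elastic.upsert(esClient, 'testindex', undefined, {concurrency: 10}));
```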

## Testing

A testing environment is provided using docker-compose.

`npm test` starts docker-compose (if not already running) and executes the test suite.

You can run individual tests directly from the Docker container.  To enter the container, type `npm run docker`.

================================================
FILE: docker-compose.yml
================================================
version: "4"

# IMPORTANT
# Make sure you run setup-docker.sh before executing this!!!
#
# Bump the client image version HERE when changing client/Dockerfile

services:
  runner:
    build:
      context: .
      dockerfile: ./Dockerfile
    depends_on:
      - mongodb
      - elasticsearch
      - mysql
      - postgres
    links:
      - mongodb
      - elasticsearch
      - mysql
      - postgres
    working_dir: /usr/src/app
    volumes:
     - .:/usr/src/app
     - ./.docker_node_modules:/usr/src/app/node_modules:Z
    command: sh -c 'tail -f /dev/null'
    ports:
      - "9229:9229"
    expose:
      - 9229

  mongodb:
    image: 'mongo:7'

  mysql:
    image: mysql:8.0
    restart: always
    environment:
      - MYSQL_ROOT_PASSWORD=example
      
    command: ["mysqld", "--default-authentication-plugin=mysql_native_password", "--sql_mode="]
    volumes:
      - mysql-data:/var/lib/mysql

  postgres:
    image: postgres:16
    restart: always
    environment:
      POSTGRES_PASSWORD: example

  elasticsearch:
    image: elasticsearch:8.12.0
    environment: ['http.host=0.0.0.0', 'transport.host=127.0.0.1','xpack.security.enabled=false', 'xpack.security.enrollment.enabled=false']


volumes:
  mysql-data:

================================================
FILE: index.js
================================================
module.exports = require('./lib');

================================================
FILE: lib/bigquery/index.js
================================================
const Streamz = require('streamz');

function insert(table, options) {
  options = options || {};

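  // `cols` starts out as a promise; once the table metadata arrives it is
  // reassigned to the plain array of column names (awaited below before use)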
  let cols = (async () => {
    var metadata = await table.getMetadata();
    cols = metadata[0].schema.fields.map(d => d.name);
    return cols;
  })();

  return Streamz(options.concurrency, async d => {
    await cols;
    const data = [].concat(d).map(d => {
      return cols.reduce( (p,col) => {
        if (d[col] !== undefined) p[col] = d[col];
        return p;
      },{});
    });
    await table.insert(data);  
  });
}

module.exports = {insert};

================================================
FILE: lib/chain.js
================================================
const duplexer3 = require('duplexer3');
const Streamz = require('streamz');
const toStream = require('./tostream');

module.exports = function(fn) {
  const inStream = Streamz();
  const outStream = Streamz();

  if (fn.length > 1)
    fn(inStream,outStream);
  else
    toStream(fn(inStream)).pipe(outStream);

  const stream = duplexer3({objectMode: true},inStream,outStream);

  // Mirror error and promise behaviour from streamz
  stream.on('error',e => {
    if (stream._events.error.length < 2) {
      const pipes = stream._readableState.pipes;
      if (pipes)
        [].concat(pipes).forEach(child => child.emit('error',e));
      else
        throw e;
    }
  });

  stream.promise = Streamz.prototype.promise;

  return stream;
};

================================================
FILE: lib/cluster.js
================================================
// Worker provides a simple framework around the cluster library to orchestrate a multicore ETL pipeline
// A tasklist is scheduled with a number of workers and then a process function is defined to process
// each element

const cluster = require('cluster');
const os = require('os');
const Promise = require('bluebird');

const isMaster = module.exports.isMaster = cluster.isMaster;
const isWorker = module.exports.isWorker = cluster.isWorker;


// Schedule a list of jobs to be distributed to workers
module.exports.schedule = function(list,threads,reporting) {
  let i = 0, last = 0, workers=[], reportInterval;
  if (!isMaster) 
    throw 'No scheduling from a worker';

  threads = threads || os.cpus().length;
  list = [].concat(list);

  function next(worker) {
    const item = list.pop();
    if (!item) {
      if (reporting) console.log('Worker done',worker.num);
      worker.disconnect();
      worker.done.resolve(true);
    } else 
      worker.send(item);
  }

  function createWorker() {
    const worker = cluster.fork();
    
    worker.num = threads;
    worker.done = Promise.defer();
    workers.push(worker.done.promise);
    worker.on('message',function(msg) {
      if (msg.id === 'done')
        next(worker);
      
      if (msg.id === 'progress')
        i+= msg.items;
    });
  }

  while (threads--)
    createWorker();

  cluster.on('online',next);

  if (reporting)
    reportInterval = setInterval(() => {
      console.log(i-last,last);
      last = i;
    },!isNaN(reporting) ? reporting : 1000);

  return Promise.all(workers)
    .then(() => {
      clearInterval(reportInterval);
      return i;
    });
};

// This function should be overwritten in the worker
module.exports.process = function(d,callback) {
  callback();
};

module.exports.progress = function(d) {
  if (isWorker)
    process.send({id:'progress',items:d});
};

if (isWorker)
  process.on('message',d => {
    const done = ()  => process.send({id: 'done'});
    if (module.exports.process.length > 1)
      module.exports.process(d,done);
    else
      Promise.resolve(module.exports.process(d)).then(done);
  });

================================================
FILE: lib/collect.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Collect(maxSize,maxDuration,maxTextLength) {
  if (!(this instanceof Streamz))
    return new Collect(maxSize,maxDuration,maxTextLength);
  Streamz.call(this);
  // Allow a custom collection function as first argument
  if (typeof maxSize === 'function')
    this._fn = maxSize;
  this.maxSize = maxSize;
  this.textLength = 0;
  this.maxTextLength = maxTextLength;
  this.maxDuration = maxDuration;
  this.buffer = [];
}

util.inherits(Collect,Streamz);

Collect.prototype.buffer = undefined;

Collect.prototype._push = function() {
  this.textLength = 0;
  if (this.buffer.length)
    this.push(this.buffer);

  if (this.timeout) {
    clearTimeout(this.timeout);
    this.timeout = undefined;
  }

  this.buffer = [];
};

Collect.prototype._fn = function(d) {
  this.buffer.push(d);
  
  if (this.maxDuration && !this.timeout)
    this.timeout = setTimeout(this._push.bind(this),this.maxDuration);

  if (this.maxTextLength && (this.textLength += JSON.stringify(d).length) > this.maxTextLength)
    this._push();

  if(this.buffer.length >= this.maxSize)
    this._push();
};

Collect.prototype._flush = function(cb) {
  this._push();
  setImmediate( () => Streamz.prototype._flush(cb));
};

module.exports = Collect;

================================================
FILE: lib/csv_parser.js
================================================
const Streamz = require('streamz');
const util = require('util');
const Csv = require('csv-parser');

function Csv_parser(options) {
  if (!(this instanceof Streamz))
    return new Csv_parser(options);
  
  Streamz.call(this);
  
  this.options = options = options || {};
  this.options.transform = options.transform || {};
  this.csv = Csv(options);

  const _compile = this.csv._compile;
  const self = this;

  this.csv._compile = function() {
    if (options.sanitize)
      this.headers = this.headers.map(header => String(header)
        .trim()
        .toLowerCase()
        .replace(/\./g,'')
        .replace(/\'/g,'')
        .replace(/\s+/g,'_')
        .replace(/\u2013|\u2014/g, '-')
        .replace(/\//g,'_')
      );
        
    self.emit('headers',this.headers);
    return _compile.call(this);
  };
  this.csv.on('data',data => this._push(data));
}

util.inherits(Csv_parser,Streamz);

Csv_parser.prototype.base = {};

Csv_parser.prototype.line = 1;

Csv_parser.prototype._fn = function(d) { 
  if (d instanceof Buffer || typeof d !== 'object')
    d = Object.create({},{
      // text should be non-enumerable
      text: {
        value: d.toString('utf8'),
        writable: true,
        configurable: true
      } 
    });
  if (typeof d === 'object')
    this.base = d;
  this.csv.write(d.text || d);
};

Csv_parser.prototype._push = function(d) {
  const obj = Object.create(this.base);
  for (let key in d) {
    if (this.options.sanitize && typeof d[key] === 'string' && !d[key].trim().length) {
      d[key] = undefined;
    } else {
      const transform = this.options.transform[key];
      if (typeof transform === 'function')
        obj[key] = transform(d[key]);
      else if (transform !== null)
        obj[key] = d[key];
    }
  }
  obj.__line = ++this.line;
  this.push(obj);
};

Csv_parser.prototype._flush = function(cb) {
  this.csv.end();
  setImmediate( () => Streamz.prototype._flush(cb));
};

module.exports = Csv_parser;

================================================
FILE: lib/cut.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Cut(maxLen,options) {
  if (!(this instanceof Cut))
    return new Cut(maxLen,options);

  if (!maxLen || isNaN(maxLen))
    throw 'MaxLen not defined';

  Streamz.call(this,options);

  this.maxLen = +maxLen;
  this.options = options || {};
}

util.inherits(Cut,Streamz);

Cut.prototype.buffer = '';

Cut.prototype._proto = {};

Cut.prototype.line = 0;

Cut.prototype._push = function(end) {
  if (!this.buffer.length || (this.buffer.length < this.maxLen && !end))
    return;

  const obj = Object.create(this._proto);
  obj.text = this.buffer.slice(0,this.maxLen);
  obj.__line = this.line++;
  this.push(obj);
  this.buffer = this.buffer.slice(this.maxLen);
  return this._push(end);
};

Cut.prototype._fn = function(d) {
  if (d instanceof Buffer || typeof d !== 'object')
    d = d.toString('utf8');

  if (typeof d === 'object') this._proto = d;
  this.buffer += (typeof d == 'string') ? d : d.text;

  this._push();
};

Cut.prototype._flush = function(cb) {
  this._push(true);
  setImmediate( () => Streamz.prototype._flush(cb));
};

module.exports = Cut;

================================================
FILE: lib/elasticsearch/bulk.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');
const util = require('util');

function Bulk(action,client,index,type,options) {
  if (!(this instanceof Bulk))
    return new Bulk(action,client,index,type,options);

  if (!client)
    throw 'CLIENT_MISSING';

  if (!action)
    throw 'ACTION_MISSING';

  Streamz.call(this,options);
  this.options = options || {};
  if (this.options.pushResult)  // legacy fix
    this.options.pushResults = this.options.pushResult;
  this.action = action;
  this.index = index;
  this.type = type;
  this.client = client;
}

util.inherits(Bulk,Streamz);

Bulk.prototype.getMeta = function(d) {
  const res = {};
  const action = this.action == 'upsert' ? 'update' : this.action;
  const obj = res[action] = {
    _id : d._id,
  };

  delete d._id;

  if (!this.index) {
    obj._index = d._index;
    delete d._index;
  }

  if (!this.type && d._type) {
    obj._type = d._type;
    delete d._type;
  }

  if (!this.parent) {
    obj.parent = d.parent;
    delete d.parent;
  }

  if (!this.routing) {
    obj.routing = d.routing;
    delete d.routing;
  }

  return res;
};

Bulk.prototype._fn = function(d) {
  let itemsSuccessfullyPushed = [];
  let retries;
  let itemsToProcess = [].concat(d).reduce((p,d) => {
    if (this.action == 'custom') {
      const body = d.body;
      delete d.body;
      p.push(d);
      if (body)
        p.push(body);
      return p;
    }

    p.push(this.getMeta(d));
    
    d = d._source || d;

    if (this.action == 'index')
      p.push(d);
    else if (this.action == 'upsert')
      p.push({doc:d,doc_as_upsert:true});
    else if(this.action == 'update')
      p.push({doc:d});
    return p;
  },[]);

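  // On error, wait (a fixed retryDelay or exponential backoff) and re-execute
  // the remaining items, until options.maxRetries is exhausted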
  const processError = e => {
    retries = retries || [];
    const retryNo = retries.length;
    if (!this.options.maxRetries || retryNo >= this.options.maxRetries) {
      if (e) e.retries = retries;
      throw e || 'MAXIMUM_RETRIES';
    }
    if (this.options.debug)
      console.log('Retry',e.message);

    let retryDelay;

    if (this.options.backoffDelay > 0) { 
      retryDelay = this.options.backoffDelay * Math.pow(2,retryNo);

      if (this.options.backoffVariance > 0)
        retryDelay *= (1 + this.options.backoffVariance * (Math.random() -0.5));

      if (this.options.maxBackoffDelay > 0)
        retryDelay = Math.min(retryDelay, this.options.maxBackoffDelay);

    } else {
      retryDelay = this.options.retryDelay || 30000;
    }

    retries.push(retryDelay);

    return Promise.delay(retryDelay).then(execute);
  };

  const execute = () => {
    const params = {
      body : itemsToProcess,
      index: this.index,
      consistency : this.options.consistency,
      refresh : this.options.refresh,
      routing : this.options.routing,
      timeout : this.options.timeout,
      fields : this.options.fields
    };
    // type is forbidden in elasticsearch > 7
    if (this.type) {
      params.type = this.type;
    }
    return this.client.bulk(params)
    .then(e => {
      if (!this.options.pushResults && !this.options.pushErrors)
        return;

      if (e.body) e = e.body;

      // Insert a copy of the original body
      e.items.forEach((e,i) => e.body = itemsToProcess[i * 2 + 1]);

      if (this.options.maxRetries) {
        let itemsToRetry;
        e.items.forEach((item, index) => {
          const verb = item.update || item.index || item.create;
          if (verb && verb.error && verb.error.type !== 'mapper_parsing_exception' && verb.error.type !== 'document_parsing_exception') {
            itemsToRetry = itemsToRetry || [];
            itemsToRetry.push(itemsToProcess[index * 2]);
            itemsToRetry.push(itemsToProcess[index * 2 + 1]);
          }
          else {
            itemsSuccessfullyPushed.push(item);
          }
        });

        if (itemsToRetry) {
          itemsToProcess = itemsToRetry;
          return processError();
        }

        e.items = itemsSuccessfullyPushed;
      }

      if (this.options.pushResults)
        return e;

      const items = e.items.filter(d => {
        const verb = d.update || d.index || d.create;
        d.error = verb.error;
        return verb.status !== 201 && verb.status !== 200;
      });
      return items.length && items || undefined;
    }, e => processError(e));
  };

  return execute();
};

module.exports = Bulk;


================================================
FILE: lib/elasticsearch/find.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Find(client,options) {
  if (!(this instanceof Find))
    return new Find(client,options);

  if (!client)
    throw 'CLIENT_MISSING';

  Streamz.call(this,options);
  this.client = client;
}

util.inherits(Find,Streamz);

Find.prototype.search = function(d) {
  return this.client.search(d);
};

Find.prototype._fn = function(query) {
  return this.search(query)
    .then(d => {
      if (d.body) d = d.body;
      d.hits.hits.forEach(d => {
        d._search = query;
        this.push(d);
      });
    });
};

module.exports = Find;

================================================
FILE: lib/elasticsearch/index.js
================================================
const bulk = require('./bulk');

module.exports = {
  bulk : bulk,
  custom : bulk.bind(bulk,'custom'),
  index : bulk.bind(bulk,'index'),
  update : bulk.bind(null,'update'),
  upsert : bulk.bind(null,'upsert'),
  delete : bulk.bind(bulk,'delete'),
  find : require('./find'),
  mapping : require('./mapping'),
  scroll : require('./scroll')
};

================================================
FILE: lib/elasticsearch/mapping.js
================================================
function parse(obj,path,res) {
  path = path || [];
  res = res || {};

  if (obj.field)
    res[obj.field] = {path:path,type:obj.type};
  
  if(obj.properties) {
    Object.keys(obj.properties).forEach(key => {
      parse(obj.properties[key],path.concat(key),res);
    });
  }
  return res;
}

function populate(map,src) {
  const obj = {};

  function setValue(path,o,val) {
    o = o || {};
    if (path.length == 1) {
      o[path[0]] = val;
      return o;
    } else {
      o[path[0]] = o[path[0]] || {};
      return setValue(path.slice(1),o[path[0]],val);
    }
  }

  Object.keys(map).forEach(key => {
    let value = src[key];
    if (!isNaN(value) && (map[key].type == 'long' || map[key].type == 'float'))
      value = Number(value);
    if (value !== undefined)
      setValue(map[key].path,obj,value);
  });
  return obj;
}


module.exports = {
  parse : parse,
  populate : populate
};

================================================
FILE: lib/elasticsearch/scroll.js
================================================
const Readable = require('stream').Readable;
const util = require('util');

function Scroll(client,query,options) {
  if (!(this instanceof Scroll))
    return new Scroll(client,query,options);

  options = options || {};
  options.objectMode = true;
  Readable.call(this,options);

  query.scroll = query.scroll || '10s';

  this.client = client;
  this.query = query;
  this.options = options;
}

Scroll.prototype.buffer = [];

util.inherits(Scroll,Readable);

Scroll.prototype._read = function() {
  let paused;

  if (this.search)
    return;

  if (!this.scroll_id) 
    this.search = this.client.search(this.query);
  else
    this.search = this.client.scroll({scroll_id: this.scroll_id, scroll: this.query.scroll});
    
  return this.search
    .then(d => {
      if (d.body) d = d.body;
      this.search = undefined;
      this.scroll_id = this.scroll_id || d._scroll_id;
      
      if (!d.hits.hits.length) {
        this.scroll_id = undefined;
        return this.push(null);
      }
      
      d.hits.hits.forEach(d => paused = !this.push(d));

      if (!paused)
        return this._read();
    })
    .catch(e => this.emit('error',e));
};

module.exports = Scroll;

================================================
FILE: lib/expand.js
================================================
const Streamz = require('streamz');
const util = require('util');

function expand(convert) {
  if (!(this instanceof Streamz))
    return new expand(convert);

  Streamz.call(this);

  if (convert == 'uppercase')
    this.convert = function(d) {
      return String(d).toUpperCase();
    };

  else if (convert == 'lowercase')
    this.convert = function(d) {
      return String(d).toLowerCase();
    };

  else
    this.convert = convert;
}

util.inherits(expand,Streamz);

expand.prototype.expand = function(d) {
  for (let key in d) {
    const oldKey = key;
    if (typeof this.convert === 'function')
      key = this.convert(key);

    if (key) {
      if (typeof d[oldKey] === 'object')
        d[key] = this.expand(d[oldKey]);
      else
        d[key] = d[oldKey];
      if (oldKey !== key)
        delete d[oldKey];
    } else
      delete d[oldKey];
  }
  return d;
};

expand.prototype._fn = function(d) {
  return this.expand(Object.create(d));
};

module.exports = expand;

================================================
FILE: lib/file.js
================================================
const fs = require('fs');
const util = require('util');
const Streamz = require('streamz');

function File(file,options) {
  if (!(this instanceof File))
    return new File(file,options);

  Streamz.call(this);

  options = options || {};
  if (options.encoding === undefined)
    options.encoding = 'utf-8';

  fs.createReadStream(file,options)
    .pipe(this);

  let filename = file.split('/');
  filename = filename[filename.length-1];

  this.info = options.info || {};
  this.info.__path = file;
  this.info.__filename = filename;
}

util.inherits(File,Streamz);

File.prototype._fn = function(d) {
  const obj = Object.create(this.info);
  obj.text = d;
  this.push(obj);
};

module.exports = File;

================================================
FILE: lib/fixed.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Fixed(layout,options) {
  if (!(this instanceof Fixed))
    return new Fixed(layout,options);

  Streamz.call(this);

  this.options = options || {};

  let n = 0;

  // If the layout is an array, we reduce to an object
  if(layout.length)
    layout = layout.reduce((p,d) => {
      p[d.field] = d;
      return p;
    },{});

  // Take note of the record length by looking for last `end`
  this.recordLength = Object.keys(layout).reduce((p,key) => {
    if (!isNaN(layout[key]))
      layout[key] = { length: layout[key] };
    
    const item = layout[key];
    if (!item.start)
      item.start = n;
    if (!item.end)
      item.end = item.start + item.length;
    n = item.end || 0;
    return Math.max(p, n);
  },0);

  this.layout = layout;
}

util.inherits(Fixed,Streamz);

Fixed.prototype.__line = 0;

Fixed.prototype._push = function() {
  
  if (!this.buffer || this.buffer.text.length < this.recordLength)
    return;

  const layout = this.layout;

  const obj = Object.create(this.buffer);
  obj.text = obj.text.slice(0,this.recordLength);
  
  Object.keys(layout)
    .forEach(key => {
      const e = layout[key];
      let val = obj.text.slice(e.start,e.end || e.start + e.length).trim();
      if (!val.length)
        return;   
      if (e.transform)
        val = e.transform(val, (this.__line + 1));
      if (val !== undefined)
        obj[key] = val;
    });

  if (this.options.clean)
    delete obj.text;
  else
    obj.__line = ++this.__line;

  this.push(obj);

  this.buffer.text = this.buffer.text.slice(this.recordLength);
  return this._push();
};

Fixed.prototype._fn = function(d) {

  if (d instanceof Buffer || typeof d !== 'object') {
    d = Object.create({},{
      // text should be non-enumerable
      text: {
        value: d.toString('utf8'),
        writable: true,
        configurable: true
      } 
    });
  }

  if (!this.buffer) {
    this.buffer = Object.create(!this.options.clean ? d : {});
    this.buffer.text = '';
  }

  this.buffer.text += d.text;
  this._push();
};

Fixed.prototype._flush = function(cb) {
  this._push();
  setImmediate( () => Streamz.prototype._flush(cb));
};

module.exports = Fixed;

================================================
FILE: lib/index.js
================================================
const Streamz = require('streamz');

module.exports = {
  collect : require('./collect'),
  timeout : require('./timeout'),
  cut : require('./cut'),
  file : require('./file'),
  fixed : require('./fixed'),
  csv_parser : require('./csv_parser'),
  csv: require('./csv_parser'),
  split : require('./split'),
  expand : require('./expand'),
  stringify : require('./stringify'),
  inspect : require('./inspect'),
  mongo : require('./mongo'),
  mysql : require('./mysql'),
  postgres : require('./postgres'),
  elastic : require('./elasticsearch'),
  cluster : require('./cluster'),
  chain : require('./chain'),
  toStream : require('./tostream'),
  toFile : require('./toFile'),
  map : Streamz,
  keepOpen: require('./keepOpen'),
  prescan: require('./prescan'),
  bigquery: require('./bigquery'),
  streamz : Streamz
};


================================================
FILE: lib/inspect.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Inspect(options) {
  if (!(this instanceof Streamz))
    return new Inspect(options);
  Streamz.call(this);

  this.options = options || {};
}

util.inherits(Inspect,Streamz);

Inspect.prototype._fn = function(d) {
  console.log(util.inspect(d,this.options));
};

module.exports = Inspect;

================================================
FILE: lib/keepOpen.js
================================================
const Streamz = require('streamz');
const util = require('util');

function KeepOpen(timeout) {
  if (!(this instanceof Streamz))
    return new KeepOpen(timeout);
  Streamz.call(this, { autoDestroy: false });
  this.timeout = timeout || 1000;
}

util.inherits(KeepOpen,Streamz);

KeepOpen.prototype._fn = function() {
  this.last = new Date();
  return Streamz.prototype._fn.apply(this,arguments);
};

KeepOpen.prototype.end = function(d) {
  if (d !== null && d !== undefined)
    this.write(d);

  let timer = setInterval(() => {
    if (new Date() - this.last > this.timeout) {
      clearInterval(timer);
      Streamz.prototype.end.call(this);
    }
  },this.timeout);
};

module.exports = KeepOpen;

================================================
FILE: lib/mongo/bulk.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');
const util = require('util');

function Update(_c,collection,keys,options) {
  if (!(this instanceof Streamz))
    return new Update(_c,collection,keys,options);

  if (isNaN(_c)) {
    options = keys;
    keys = collection;
    collection = _c;
    _c = undefined;
  }

  if (keys === undefined)
    throw new Error('Missing Keys');

  Streamz.call(this, _c, null, options);
  this.collection = Promise.resolve(collection);
  this.options = options || {};
  this.options.pushResults = this.options.pushResults || this.options.pushResult; // legacy
  this.keys = [].concat(keys);
}

util.inherits(Update,Streamz);

Update.prototype._fn = function(d,cb) {
  this.collection
    .then(collection => {
      const bulk = collection.initializeUnorderedBulkOp();

      [].concat(d || []).forEach(d => {
        const criteria = this.keys.reduce((p,key) => {
          const keyPieces = key.split('.');
          const value = keyPieces.reduce((a, b) => {
            if (a[b] === undefined) {
              throw new Error('Key "' + b + '" not found in data ' + JSON.stringify(d));
            }
            return a[b];
          }, d);

          //if query referencing array, use $elemMatch instead of equality match to prevent issues with upsert
          if(keyPieces[0] == '$push' || keyPieces[0] == '$addToSet')  {
            const arrayProp = keyPieces[1];
            if(p[arrayProp] === undefined) {
              p[arrayProp] = {$elemMatch:{}};
            }
            keyPieces.splice(0,2);
            p[arrayProp]['$elemMatch'][keyPieces.join('.')] = value;
          } else {
            //check if key starts with '$' to remove operator from query key
            if(key.charAt(0) == '$') {
              key = key.substring(key.indexOf('.') + 1);
            }
            p[key] = value;
          }
          return p;
        },{});

        let op = bulk.find(criteria);

        if (this.options.upsert) {
          op = op.upsert();
        }

        op.updateOne(d);
      });

      bulk.execute(this.options.writeConcern,(err,d) => {
        // d may be undefined when execute fails, so guard the logging
        if (d) console.log('inserted', d.insertedCount ?? d.nInserted,'upserted', d.upsertedCount ?? d.nUpserted, 'matched', d.matchedCount ?? d.nMatched, err);
        cb(err,this.options.pushResults && d);
      });
    });
};

module.exports = Update;
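
// The key handling above is easiest to see on a concrete record. Illustrative
// only, with hypothetical field names: given keys = ['account', '$addToSet.items.sku']
// and
//
//   d = { account: 'a1', $addToSet: { items: { sku: 's1', qty: 2 } } }
//
// the reducer builds the find criteria
//
//   { account: 'a1', items: { $elemMatch: { sku: 's1' } } }
//
// i.e. array membership is matched with $elemMatch rather than whole-array
// equality, which keeps upserts from mismatching. Keys under other
// $-operators (e.g. '$set.name') are simply stripped of the operator prefix.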


================================================
FILE: lib/mongo/index.js
================================================
module.exports = {
  insert : require('./insert'),
  update : require('./update'),
  bulk: require('./bulk'),
  upsert : function() {
    const update = require('./update').apply(this,arguments);
    update.options.upsert = true;
    return update;
  }
};

================================================
FILE: lib/mongo/insert.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');
const util = require('util');

function Insert(_c,collection,options) {
  if (!(this instanceof Streamz))
    return new Insert(_c,collection,options);

  if (isNaN(_c)) {
    options = collection;
    collection = _c;
    _c = undefined;
  }

  Streamz.call(this, _c, null, options);
  this.collection = Promise.resolve(collection);
  this.options = options || {};
}

util.inherits(Insert,Streamz);

Insert.prototype._fn = function (d) {
  const operation = Array.isArray(d) ? 'insertMany' : 'insertOne';
  return this.collection
    .then(collection =>collection[operation](d,this.options))
    .then(d => {
      if (this.options.pushResults)
        return d;
    });
};

module.exports = Insert;
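
// Minimal usage sketch (assumes package name `etl` and a mongodb Collection,
// or a promise of one, in `collection`). Arrays use insertMany, single
// records insertOne:
//
//   etl.toStream([{ name: 'a' }, { name: 'b' }])
//     .pipe(etl.collect(10))
//     .pipe(etl.mongo.insert(collection, { pushResults: true }))
//     .promise()
//     .then(results => console.log(results));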

================================================
FILE: lib/mongo/update.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');
const util = require('util');

function Update(_c,collection,keys,options) {
  if (!(this instanceof Streamz))
    return new Update(_c,collection,keys,options);

  if (isNaN(_c)) {
    options = keys;
    keys = collection;
    collection = _c;
    _c = undefined;
  }

  if (keys === undefined)
    throw new Error('Missing Keys');

  Streamz.call(this, _c, null, options);
  this.collection = Promise.resolve(collection);
  this.options = options || {};
  this.options.pushResults = this.options.pushResults || this.options.pushResult; // legacy
  this.keys = [].concat(keys);
}

util.inherits(Update,Streamz);

Update.prototype._fn = function(d) {
  return this.collection
    .then(collection => {
      const bulk = collection.initializeUnorderedBulkOp();

      [].concat(d || []).forEach(d => {
        const criteria = this.keys.reduce((p,key) => {
          if (d[key] === undefined)
            throw new Error('Key not found in data');
          p[key] = d[key];
          return p;
        },{});

        let op = bulk.find(criteria);

        if (this.options.upsert)
          op = op.upsert();

        let payload = (d.$set || d.$addToSet) ? d : {$set: d};
        if (d.$update) payload = d.$update;

        op.updateOne(payload);
      });

      return bulk.execute(this.options.writeConcern);
    })
    .then(d => {
      if (this.options.pushResults)
        return d;
    });
    
};

module.exports = Update;
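
// Usage sketch (assumes package name `etl` and a mongodb Collection in
// `collection`): records are matched on the listed keys and, unless a record
// carries its own $set/$addToSet/$update, applied as { $set: d }:
//
//   etl.toStream([{ name: 'Ann Ellis', age: 36 }])
//     .pipe(etl.collect(100)) // one bulk op per collected batch
//     .pipe(etl.mongo.upsert(collection, ['name'], { pushResults: true }))
//     .promise()
//     .then(d => console.log(d[0].upsertedCount));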

================================================
FILE: lib/mysql/execute.js
================================================
const Mysql = require('./mysql');
const util = require('util');

function Execute(pool,options) {
  if (!(this instanceof Execute))
    return new Execute(pool,options);

  options = options || {};
  Mysql.call(this,pool,options);
}

util.inherits(Execute,Mysql);

Execute.prototype._fn = function(d,cb) {
  return this.query(d,cb)
    .then(d => (this.options.pushResult || this.options.pushResults) && d || undefined);
};

module.exports = Execute;

================================================
FILE: lib/mysql/index.js
================================================
module.exports = {
  mysql : require('./mysql'),
  script : require('./script'),
  execute : require('./execute'),
  upsert : require('./upsert')
};

================================================
FILE: lib/mysql/mysql.js
================================================
const Streamz = require('streamz');
const util = require('util');
const Promise = require('bluebird');

function Mysql(pool,options) {
  if (!(this instanceof Mysql))
    return new Mysql(pool,options);

  if (!pool)
    throw 'POOL_MISSING';

  Streamz.call(this,options);
  this.pool = pool;
  this.options = options || {};
}

util.inherits(Mysql,Streamz);

Mysql.prototype.getConnection = function() {
  return Promise.fromNode(this.pool.getConnection.bind(this.pool))
    .disposer(connection => connection.release());
};

Mysql.prototype.query = function(query,cb) {
  return Promise.using(this.getConnection(),connection => {
    // Trigger callback when we get a connection, not when we (later) get results
    // allowing overall concurrency to be controlled by the mysql pool
    if (typeof cb === 'function') cb();
    return Promise.fromNode(callback => connection.query(query,callback));
  });
};

Mysql.prototype.stream = function(query,cb) {
  const passThrough = Streamz();

  Promise.using(this.getConnection(),connection => {
    // Trigger callback when we get a connection, not when we (later) get results
    // allowing overall concurrency to be controlled by the mysql pool
    if (typeof cb === 'function') cb();
    return new Promise((resolve,reject) => {
      connection.query(query)
        .stream()
        .on('end',resolve)
        .on('error',reject)
        .pipe(passThrough);
      
      passThrough.on('error', () => connection.destroy());
    });
  })
  .catch(e => passThrough.emit('error',e));

  return passThrough;  
};

module.exports = Mysql;

================================================
FILE: lib/mysql/script.js
================================================
const Mysql = require('./mysql');
const util = require('util');

function Script(pool,schema,table,options) {
  if (!(this instanceof Script))
    return new Script(pool,schema,table,options);

  Mysql.call(this,pool,options);

  this.schema = schema;
  this.table = table;
  this.columns = this.getColumns();
  this.prefix = this.options.prefix || 'REPLACE INTO ';
  this.maxBuffer = this.options.maxBuffer || 1024 * 1024;
}

util.inherits(Script,Mysql);

Script.prototype.getColumns = function() {
  const sql = 'SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE '+
            `TABLE_SCHEMA="${this.schema}" AND TABLE_NAME="${this.table}";`;

  return this.query(sql)
    .then(d => {
      if (!d.length)
        throw 'TABLE_NOT_FOUND';
      return d.map(d => d.COLUMN_NAME);
    });
};

Script.prototype.buffer = undefined;

Script.prototype._push = function() {
  if (this.buffer)
    this.push(this.buffer);
  this.buffer = undefined;
};

Script.prototype._fn = function(d) {
  return this.columns.then(columns => {
    if (!this.buffer)
      this.buffer = `${this.prefix} \`${this.schema}\`.\`${this.table}\` (\`${columns.join('`,`')}\`) VALUES `;
    else
      this.buffer += ', ';

    this.buffer += '('+columns.map(key => {
      const value = d[key];
      if (typeof value === 'undefined')
        return 'DEFAULT';
      else
        return this.pool.escape(value);
    })
    .join(',')+')';

    if (this.buffer.length >= this.maxBuffer)
      this._push();
  });
};

Script.prototype._flush = function(cb) {
  this._push();
  setImmediate(() => Mysql.prototype._flush(cb));
};

module.exports = Script;


================================================
FILE: lib/mysql/upsert.js
================================================
const chain = require('../chain');
const script = require('./script');
const execute = require('./execute');

module.exports = function upsert(pool,schema,table,options) {
  return chain(incoming => incoming
    .pipe(script(pool,schema,table,options))
    .pipe(execute(pool,options))
  );
};
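
// End-to-end sketch (assumes package name `etl` and a hypothetical
// my_schema.my_table whose columns match the incoming records): script
// renders a REPLACE INTO statement from INFORMATION_SCHEMA columns and
// execute runs it against the pool:
//
//   const mysql = require('mysql');
//   const etl = require('etl');
//
//   const pool = mysql.createPool({ host: 'localhost', user: 'root', password: 'example' });
//
//   etl.toStream([{ name: 'Ann Ellis', age: 36 }])
//     .pipe(etl.mysql.upsert(pool, 'my_schema', 'my_table', { pushResults: true }))
//     .promise()
//     .then(d => console.log(d[0].affectedRows))
//     .then(() => pool.end());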

================================================
FILE: lib/postgres/execute.js
================================================
const Postgres = require('./postgres');

class Execute extends Postgres {
  constructor(pool, options = {}) {
    super(pool, options);
  }

  _fn(d, cb) {
    // TODO make transaction or use {maxBuffer:1} in options
    // console.log(d);
    return this.query(d, cb)
      .then(d => (this.options.pushResult || this.options.pushResults) && d || undefined);
  }
}

module.exports = (...params) => new Execute(...params);


================================================
FILE: lib/postgres/index.js
================================================
module.exports = {
  postgres : require('./postgres'),
  script : require('./script'),
  execute : require('./execute'),
  upsert : require('./upsert'),
  insert: require('./insert')
};

================================================
FILE: lib/postgres/insert.js
================================================
const chain = require('../chain');
const script = require('./script');
const execute = require('./execute');

module.exports = function upsert(pool,schema,table,options) {
  options = Object.assign({}, options, {upsert: false});
  return chain(inStream => inStream
    .pipe(script(pool, schema, table, options))
    .pipe(execute(pool, options))
  );
};


================================================
FILE: lib/postgres/postgres.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');

class Postgres extends Streamz {
  constructor(pool, options = {}) {
    super(options);

    if (!pool)
      throw 'POOL_MISSING';

    this.pool = pool;
    this.options = options;
  }

  getConnection() {
    return Promise.fromNode(this.pool.connect.bind(this.pool))
      .disposer(connection => connection.release());
  }

  query(query, cb) {
    return Promise.using(this.getConnection(), connection => {
      // Trigger callback when we get a connection, not when we (later) get results
      // allowing overall concurrency to be controlled by the Postgres pool
      if (typeof cb === 'function') cb();
      return Promise.fromNode(callback => connection.query(query, callback));
    });
  }

  stream(query, cb) {
    const passThrough = Streamz();
    const state = query && query.cursor ? query.cursor.state : query.state;

    if (!state || state !== 'initialized') {
      passThrough.emit('error', new Error('Query should be QueryStream'));
      return passThrough;
    }

    Promise.using(this.getConnection(), connection => {
      // Trigger callback when we get a connection, not when we (later) get results
      // allowing overall concurrency to be controlled by the Postgres pool
      if (typeof cb === 'function') cb();
      return new Promise((resolve, reject) => {
        connection.query(query)
          .on('end', resolve)
          .on('error', reject)
          .pipe(passThrough);
      });
    })
    .catch(e => passThrough.emit('error', e));

    return passThrough;
  }
}

module.exports = Postgres;


================================================
FILE: lib/postgres/script.js
================================================
const Postgres = require('./postgres');

const quoteString = (str) => `"${str}"`;

class Script extends Postgres {
  constructor(pool, schema, table, options = {}) {
    super(pool, options);

    this.schema = schema;
    this.table = table;
    this.columns = this.getColumns();
    this.upsert = options.upsert;
    if (this.upsert) this.pkeys = this.getPrimaryKeys();
    this.prefix = this.options.prefix || 'INSERT INTO';
    this.maxBuffer = options.maxBuffer || 1024 * 1024;
  }

  getColumns() {
    const sql = 'SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE ' +
      `TABLE_SCHEMA='${this.schema}' AND TABLE_NAME='${this.table}';`;

    return this.query(sql)
      .then(d => {
        d = d.rows;
        if (!d.length)
          throw 'TABLE_NOT_FOUND';
        return d.map(d => d.column_name);
      });
  }

  getPrimaryKeys() {
    const sql = 'SELECT a.attname' +
      ' FROM   pg_index i' +
      ' JOIN   pg_attribute a ON a.attrelid = i.indrelid' +
      ' AND a.attnum = ANY(i.indkey)' +
      ` WHERE  i.indrelid = '${this.schema}.${this.table}'::regclass` +
      ' AND    i.indisprimary;';

    return this.query(sql)
      .then(d => d.rows.map(d => d.attname))
      .catch(e => {
        if (e.message && e.message.includes('op ANY/ALL'))
          return [];
        else
          throw e;
      });
  }

  _push() {
    if (this.buffer) {
      this.push(this.buffer);
    }
    this.buffer = undefined;
  }

  _fn(record) {
    return Promise.all([this.columns, this.pkeys]).then(data => {
      const columns = data[0];
      const pkeys = data[1] || [];
      // Note: only the first element of an array batch is used here; records
      // are expected to arrive individually rather than pre-collected
      const d = (Array.isArray(record)) ? record[0] : record;

      if (typeof d === 'undefined')
        return;

      if (!this.buffer)
        this.buffer = `${this.prefix} ${this.schema}.${this.table} ( ${columns.map(quoteString).join(',')} ) VALUES `;
      else
        this.buffer += ', ';

      this.buffer += '(' + columns.map(key => {
        const value = d[key];
        if (typeof value === 'undefined')
          return 'DEFAULT';
        else if (value === null)
          return 'null';
        else if (typeof value === 'object')
          return escapeLiteral(JSON.stringify(value));
        else
          return escapeLiteral(value);
      })
        .join(',') + ')';

      if (this.upsert) {
        let tmp_arr = [];
        for (let i = 0, l = columns.length; i < l; i++) {
          const value = d[columns[i]];
          if (typeof value === 'undefined')
            continue;

          let sql = `"${columns[i]}" =`;
          if (value === null)
            sql += 'null';
          else if (typeof value === 'object')
            sql += escapeLiteral(JSON.stringify(value));
          else
            sql += escapeLiteral(value);

          tmp_arr.push(sql);
        }
        if (tmp_arr.length && pkeys.length) {
          this.buffer += ` ON CONFLICT (${pkeys.map(quoteString).join(', ')}) DO UPDATE SET ${tmp_arr.join(', ')}`;
        }

        this._push();
      }
      if (this.buffer && this.buffer.length > this.maxBuffer) this._push();

    });
  }

  _flush(cb) {
    this._push();
    setImmediate(() => Postgres.prototype._flush(cb));
  }
}

// https://github.com/brianc/node-postgres/blob/83a946f61cb9e74c7f499e44d03a268c399bd623/lib/client.js
function escapeLiteral(str) {
  let hasBackslash = false;
  let escaped = '\'';

  if (typeof str !== 'string')
    return str;

  for (let i = 0; i < str.length; i++) {
    const c = str[i];
    if (c === '\'') {
      escaped += c + c;
    } else if (c === '\\') {
      escaped += c + c;
      hasBackslash = true;
    } else {
      escaped += c;
    }
  }

  escaped += '\'';

  if (hasBackslash === true)
    escaped = ' E' + escaped;

  return escaped;
}

module.exports = (...params) => new Script(...params);
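
// To make the buffered output concrete, an illustration (hypothetical
// schema/table names, whitespace simplified). For d = { name: 'Ann', age: 36 }
// with columns (name, age, dt), primary key (name) and upsert enabled, the
// statement pushed downstream looks like:
//
//   INSERT INTO my_schema.my_table ( "name","age","dt" ) VALUES
//   ('Ann',36,DEFAULT) ON CONFLICT ("name") DO UPDATE SET "name" ='Ann', "age" =36
//
// Missing columns become DEFAULT, objects are JSON-stringified, and strings
// pass through escapeLiteral above.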


================================================
FILE: lib/postgres/upsert.js
================================================
const chain = require('../chain');
const script = require('./script');
const execute = require('./execute');

module.exports = function upsert(pool,schema,table,options) {
  options = Object.assign({}, options, {upsert: true});
  return chain(inStream => inStream
    .pipe(script(pool, schema, table, options))
    .pipe(execute(pool, options))
  );
};
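
// End-to-end sketch (assumes package name `etl` and a hypothetical
// my_schema.my_table with a primary key; the ON CONFLICT clause targets the
// primary-key columns discovered by script):
//
//   const pg = require('pg');
//   const etl = require('etl');
//
//   const pool = new pg.Pool({ host: 'localhost', database: 'postgres', user: 'postgres' });
//
//   etl.toStream([{ name: 'Ann Ellis', age: 36 }])
//     .pipe(etl.postgres.upsert(pool, 'my_schema', 'my_table', { pushResults: true }))
//     .promise()
//     .then(d => console.log(d[0].rowCount))
//     .then(() => pool.end());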


================================================
FILE: lib/prescan.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');
const util = require('util');

function Prescan(count,fn) {
  if (!(this instanceof Prescan))
    return new Prescan(count,fn);

  Streamz.call(this);
  // Buffer the first 'count' records and hand them to fn before streaming on
  this.count = count;
  this.fn = fn;
  this.buffer = [];
  this.i = 0;
}

util.inherits(Prescan,Streamz);

Prescan.prototype.buffer = undefined;

Prescan.prototype._push = function() {
  if (!this.buffer)
    return Promise.resolve();

  const buffer = this.buffer;
  this.buffer = undefined;

  return Promise.try(() =>this.fn(buffer))
  .then(() => buffer.forEach(d => this.push(d)));
};

Prescan.prototype._fn = function(d) {
  if (!this.buffer)
    return d;

  this.i +=  d.length || 1;
  this.buffer.push(d);

  if (this.i >= this.count)
    return this._push();
};

Prescan.prototype._flush = function(cb) {
  this._push()
    .then( () => setImmediate( () => Streamz.prototype._flush(cb)));
};

module.exports = Prescan;
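
// Usage sketch (assumes package name `etl`): the first `count` records are
// buffered, handed to fn, and only released downstream once fn resolves,
// e.g. to infer a schema before any record is written:
//
//   etl.toStream([1, 2, 3, 4, 5])
//     .pipe(etl.prescan(3, first => console.log('prescanned', first))) // [1, 2, 3]
//     .pipe(etl.map(d => d))
//     .promise()
//     .then(d => console.log(d)); // all five records still flow through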

================================================
FILE: lib/split.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Split(symbol) {
  if (!(this instanceof Streamz))
    return new Split(symbol);

  Streamz.call(this);
  this.symbol = symbol || '\n';
}

util.inherits(Split,Streamz);

Split.prototype.buffer = '';

Split.prototype.__line = 0;

Split.prototype._push = function() {
  if (this.buffer) {
    this.buffer.__line = this.__line++;
    this.push(this.buffer);
  }
  delete this.buffer;
};

Split.prototype._fn = function(d) {
  if (d instanceof Buffer || typeof d !== 'object')
    d = { text: d.toString('utf8') };

  if (!this.buffer) {
    this.buffer = Object.create(d);
    this.buffer.text = '';
  }

  const buffer = (this.buffer.text += d.text).split(this.symbol);

  buffer.slice(0,buffer.length-1)
    .forEach(d => {
      const obj = Object.create(this.buffer);
      obj.text = d;
      obj.__line = this.__line++;
      this.push(obj);
    });

  this.buffer.text = buffer[buffer.length-1];
};

Split.prototype._flush = function(cb) {
  this._push();
  setImmediate( () => Streamz.prototype._flush(cb));
};

module.exports = Split;
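
// Usage sketch (assumes package name `etl`): incoming text is re-chunked on
// the symbol (newline by default) and each emitted line is numbered via __line:
//
//   etl.toStream(['a\nb', 'c\nd'])
//     .pipe(etl.split())
//     .pipe(etl.map(d => `${d.__line}: ${d.text}`))
//     .promise()
//     .then(d => console.log(d)); // ['0: a', '1: bc', '2: d']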

================================================
FILE: lib/stringify.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Stringify(indent,replacer,newline) {
  if (!(this instanceof Streamz))
    return new Stringify(indent,replacer,newline);
  Streamz.call(this);
  this.indent = indent;
  this.replacer = replacer;
  this.newline = newline;
}

util.inherits(Stringify,Streamz);

Stringify.prototype._fn = function(d) {
  return JSON.stringify(d,this.replacer,this.indent) + (this.newline ? '\n' : '');
};

module.exports = Stringify;
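
// Usage sketch (assumes package name `etl` and a hypothetical output path),
// serializing each record as newline-delimited JSON:
//
//   etl.toStream([{ a: 1 }, { b: 2 }])
//     .pipe(etl.stringify(0, null, true)) // no indent, no replacer, trailing newline
//     .pipe(etl.toFile('/tmp/out.ndjson'))
//     .promise()
//     .then(() => console.log('written'));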

================================================
FILE: lib/timeout.js
================================================
const Streamz = require('streamz');
const util = require('util');

function Timeout(ms) {
  if (!(this instanceof Streamz))
    return new Timeout(ms);
  Streamz.call(this);
  
  this.interval = setInterval(() => {
    if (this.last && (new Date()) - this.last > ms) {
      this.emit('error','ETL_TIMEOUT');
      clearInterval(this.interval);
    }
  },ms);
}

util.inherits(Timeout,Streamz);

Timeout.prototype._fn = function(d) {
  this.last = new Date();
  return d;
};

Timeout.prototype._flush = function(cb) {
  clearInterval(this.interval);
  return Streamz.prototype._flush.call(this,cb);
};

module.exports = Timeout;
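
// Watchdog sketch (assumes package name `etl`): if more than `ms` elapses
// between records, the stream emits an ETL_TIMEOUT error:
//
//   etl.toStream([1, 2, 3])
//     .pipe(etl.timeout(1000)) // errors if records are ever more than 1s apart
//     .promise()
//     .then(d => console.log(d), e => console.error(e));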

================================================
FILE: lib/toFile.js
================================================
const Streamz = require('streamz');
const chain = require('./chain');
const fs = require('fs');

function toFile(filename) {  
  return chain(inStream => {
    const stream = Streamz();

    inStream
      .pipe(fs.createWriteStream(filename))
      .on('error',e => stream.emit('error',e))
      .on('finish',() => stream.end(true));

    return stream;
  });
}

module.exports = toFile;

================================================
FILE: lib/tostream.js
================================================
const Streamz = require('streamz');
const Promise = require('bluebird');

function toStream(data) {
  const stream = Streamz();

  if (typeof data == 'function')
    data = Promise.try(data.bind(stream));

  Promise.resolve(data)
    .then(d => {
      if (d && typeof d.pipe == 'function')
        return d.pipe(stream);
      else if (d !== undefined)
        [].concat(d).forEach(d => stream.write(d));
      stream.end();
    })
    .catch(e => stream.emit('error',e));

  return stream;
}

module.exports = toStream;
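
// toStream accepts an array (or a single value), a promise, a function or
// another readable stream. Short sketch (assumes package name `etl`):
//
//   etl.toStream([1, 2, 3]).pipe(etl.inspect());                         // values written one by one
//   etl.toStream(() => Promise.resolve(['a', 'b'])).pipe(etl.inspect()); // function resolved first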

================================================
FILE: package.json
================================================
{
  "name": "etl",
  "version": "0.8.0",
  "description": "Collection of stream-based components that form an ETL pipeline",
  "main": "index.js",
  "author": "Ziggy Jonsson (http://github.com/zjonsson/)",
  "repository": {
    "type": "git",
    "url": "http://github.com/ZJONSSON/node-etl"
  },
  "scripts": {
    "test": "docker compose up -d --no-recreate --quiet-pull;docker exec node-etl-runner-1 bash ./test.sh",
    "docker": "docker compose up -d --no-recreate --quiet-pull;docker exec node-etl-runner-1 bash"
  },
  "license": "MIT",
  "dependencies": {
    "bluebird": "~3.5.0",
    "csv-parser": "~1.8.0",
    "duplexer3": "^0.1.4",
    "moment": "~2.29.4",
    "streamz": "~1.8.10"
  },
  "devDependencies": {
    "@elastic/elasticsearch": "^8.11.0",
    "mongodb": "^6.3.0",
    "mysql": "^2.18.1",
    "pg": "^8.11.3",
    "pg-query-stream": "~1.0.0",
    "tap": "^19.0.2",
    "url-js": "^0.2.5"
  }
}


================================================
FILE: test/chain-test.js
================================================
const etl = require('../index');
const dataStream = require('./lib/dataStream');
const t = require('tap');

const data = [1,2,3,4,5,6,7,8,9,10,11];

const expected = [
  [1,2,3],
  [4,5,6],
  [7,8,9],
  [10,11]
];

t.test('chain', async t => {

  t.test('returning a stream', async t => {
    const d = await dataStream(data)
      .pipe(etl.chain(stream => stream.pipe(etl.collect(3))))
      .promise();

    t.same(d,expected,'returning stream is piped down');
  });

  t.test('using the second argument as outstream', async t => {
    const d = await dataStream(data)
      .pipe(etl.chain((stream,out) =>
        stream.pipe(etl.collect(3)).pipe(out)
      ))
      .promise();

    t.same(d,expected,'outstream is piped down');
  });

  t.test('returning a promise', async t => {
    const d = await dataStream(data)
      .pipe(etl.chain(stream =>
        stream
          .pipe(etl.collect(3))
          .promise()
      ))
      .promise();
    t.same(d,expected,'pipes down promise results');
  });

  t.test('errors in subchain', async t => {
    const chain = etl.map();
    setTimeout( () => chain.end('test'));

    const e = await chain
      .pipe(etl.map())
      .pipe(etl.chain(stream =>
        stream
          .pipe(etl.map(() => { throw 'ERROR';}))
          .pipe(etl.collect(3))
      ))
      .promise()
      .then(() => { throw 'Should error';},String);

    t.same(e,'ERROR','bubble down');
  });
});

================================================
FILE: test/cluster-test.js
================================================
const etl = require('../index');
const cluster = require('cluster');
const path = require('path');
const t = require('tap');

cluster.setupMaster({
  exec : path.join(__dirname,'lib','worker.js')
});


t.test('cluster', async t => {
  const d = await etl.cluster.schedule([1,2,3,4,5],3);
  t.equal(d, 15, 'should schedule tasks');
});


================================================
FILE: test/collect-test.js
================================================
const Promise = require('bluebird');
const etl = require('../index');
const dataStream = require('./lib/dataStream');
const t = require('tap');

const data = [1,2,3,4,5,6,7,8,9,10,11];

t.test('collect', {jobs: 10}, async t => {
  
  t.test('etl.collect(3)', async t => {
    const expected = [
      [1,2,3],
      [4,5,6],
      [7,8,9],
      [10,11]
    ];

    const d = await dataStream(data).pipe(etl.collect(3)).promise();
    t.same(d,expected,'collects 3 records at a time');
  });


  t.test('etl.collect(9999) for small dataset', async t => {
    const expected = [data];

    const d = await dataStream(data)
      .pipe(etl.collect(9999))
      .promise();

    t.same(d,expected,'returns everything in the array');
  });
  
  t.test('etl.collect(1)', async t => {
    const expected = data.map(d => [d]);

    const d = await dataStream(data)
      .pipe(etl.collect(1))
      .promise();

    t.same(d,expected,'returns array of one element arrays');
  });

  t.test('maxDuration', async t => {
    const d = await etl.toStream([1,2,3,4,5,6,7,8,9,10])
      .pipe(etl.map(d => Promise.delay(100).then(() => d)))
      .pipe(etl.collect(5,300))
      .promise();

    t.same(d,[[1,2,3],[4,5,6],[7,8,9],[10]],'pushes on timeouts');
  });

  t.test('maxTextLength', async t => {
    const data = [
      {text:'test'},
      {text:'test'},
      {text:'this is a really long string'},
      {text:'test'}
    ];

    const d = await etl.toStream(data)
      .pipe(etl.collect(1000,1000,15))
      .promise();

    t.same(d,[
      [data[0],data[1]],
      [data[2]],
      [data[3]]
    ],'pushes when text reaches max');
  });

  t.test('custom fn', async t => {
    const expected = [
      [1,2,3],
      [4,5,6],
      [7,8,9,10,11]
    ];

    const d = await dataStream(data)
      .pipe(etl.collect(function(d) {
        this.buffer.push(d);
        if (d < 7 && this.buffer.length > 2)
          this._push();
      }))
      .promise();

    t.same(d,expected,'runs and flush pushes remaining buffer');
  });
});

================================================
FILE: test/csv-test.js
================================================
const etl = require('../index');
const path = require('path');
const t = require('tap');
const data = require('./data');

t.test('csv',async t => {
  const csv = etl.csv_parser({
    sanitize: true,
    transform: {
      dt: d => new Date(d)
    }
  });

  etl.file(path.join(__dirname,'test.csv')).pipe(csv);

  // Adjust expected values to the csv
  const expected = data.copy().map(function(d) {
    d.__line = d.__line +1;
    d.__filename = 'test.csv';

    // Clear out __path and text as they are volatile
    d.__path = undefined;
    d.text = undefined;
    
    return d;
  });

  
  const d = await csv.pipe(etl.expand()).promise();
  
  d.forEach(d => {
    d.__path = undefined;
    d.text = undefined;
  });

  t.same(d,expected,'parses data correctly');
});

================================================
FILE: test/cut-test.js
================================================
const etl = require('../index');
const t = require('tap');

const data = [ '1','2345678','9012345678','9'];

const expected = [
  { text: '1234', __line: 0, __filename: 'text.txt' },
  { text: '5678', __line: 1, __filename: 'text.txt' },
  { text: '9012', __line: 2, __filename: 'text.txt' },
  { text: '3456', __line: 3, __filename: 'text.txt' },
  { text: '789', __line: 4, __filename: 'text.txt' },
];

t.test('cut(4)',async t => {
  const cut = etl.cut(4);

  data.forEach((d,i) => setTimeout(() => {
    cut.write(i == 2 ? d : {text:d,__filename:'text.txt'});
    if (i == data.length-1)
      cut.end();
  }));

  const d = await cut.pipe(etl.expand()).promise();
  t.same(d[0],expected[0],'splits text into packets with maxlength 4');
});


================================================
FILE: test/data-changed.js
================================================
const PassThrough = require('stream').PassThrough;

const data = {
  data : [
    { text: 'Nathaniel Olson 51    1/10/2025',
      name: 'Nathaniel Olson',
      age: 51,
      dt: new Date('1/10/2025'),
      __line: 1,
      __filename: 'test.txt'
    },
    { text: 'Ann Ellis       35    10/2/2035',
      name: 'Ann Ellis',
      age: 35,
      dt: new Date('10/2/2035'),
      __line: 2,
      __filename: 'test.txt'
    },
    { text: 'Willie Freeman  38     4/1/2016',
      name: 'Willie Freeman',
      age: 38,
      dt: new Date('4/1/2016'),
      __line: 3,
      __filename: 'test.txt'
    }
  ],

  fixed: ['Nathaniel O','lson 52    1/10/202','5Ann Ellis       36  ','  10/2/203','5Willie F','reeman  38     4/1/20','16'],
  layout : { 
    name : 16,
    age : { length : 3,  transform: Number },
    dt : { length : 12, transform: function(d) { return new Date(d); } }
  }
};

data.copy = function() {
  return this.data.map(function(d) {
    return Object.keys(d).reduce(function(p,key) {
      p[key] = d[key];
      return p;
    },{});
  });
};

data.stream = function(options) {
  const s = PassThrough({objectMode:true});
  const data = this.copy();

  data.forEach(function(d,i) {
    setTimeout(function() {
      s.write(options && options.clone ? Object.create(d) : d);
      if (i == data.length-1)
        s.end();
    },i*10);
  });
  return s;
};


for (let key in data.data) 
  Object.freeze(data.data[key]);


module.exports = data;

================================================
FILE: test/data.js
================================================
const PassThrough = require('stream').PassThrough;

const data = {
  data : [
    { text: 'Nathaniel Olson 52    1/10/2025',
      name: 'Nathaniel Olson',
      age: 52,
      dt: new Date('1/10/2025'),
      __line: 1,
      __filename: 'test.txt'
    },
    { text: 'Ann Ellis       36    10/2/2035',
      name: 'Ann Ellis',
      age: 36,
      dt: new Date('10/2/2035'),
      __line: 2,
      __filename: 'test.txt'
    },
    { text: 'Willie Freeman  38     4/1/2016',
      name: 'Willie Freeman',
      age: 38,
      dt: new Date('4/1/2016'),
      __line: 3,
      __filename: 'test.txt'
    }
  ],

  fixed: ['Nathaniel O','lson 52    1/10/202','5Ann Ellis       36  ','  10/2/203','5Willie F','reeman  38     4/1/20','16'],
  layout : { 
    name : 16,
    age : { length : 3,  transform: Number },
    dt : { length : 12, transform: function(d) { return new Date(d); } }
  }
};

data.copy = function() {
  return this.data.map(function(d) {
    return Object.keys(d).reduce(function(p,key) {
      p[key] = d[key];
      return p;
    },{});
  });
};

data.stream = function(options) {
  const s = PassThrough({objectMode:true});
  const data = this.copy();

  data.forEach(function(d,i) {
    setTimeout(function() {
      s.write(options && options.clone ? Object.create(d) : d);
      if (i == data.length-1)
        s.end();
    },i*10);
  });
  return s;
};


for (let key in data.data) 
  Object.freeze(data.data[key]);


module.exports = data;

================================================
FILE: test/elastic-retry-test.js
================================================
const etl = require('../index');
const Promise = require('bluebird');
const t = require('tap');

let data = [], i=0;
let retries = 0;

// Mock an elastic client
const client = {
  bulk : async function(options) {
    await Promise.delay(100);
    if (i++ == 2) {
      retries++;
      throw 'NETWORK_ERROR';
    }
    else {
      data = data.concat(options.body);
      return {
        items: options.body.slice(0, options.body.length/2).map(function() { return {};})
      };
    }
  }
};

const backoffFailingClient = {
  bulk: async function() {
    throw {message: 'network_error'};
  }
};

t.test('elastic bulk insert retry',async t => {
  const upsert = etl.elastic.upsert(client,'test','test',{pushResults:true,maxRetries:1,retryDelay:10,concurrency:10});

  await etl.toStream([1,2,3,4,5,6,7,8,9,10].map(function(d) { return {_id:d,num:d};}))
    .pipe(etl.collect(2))
    .pipe(upsert)
    .promise();
    
  data = data.filter(d => d.update)
    .map(d => d.update._id);

  t.same(retries, 1, 'A single retry is made');
  t.same(data,[1,2,3,4,7,8,9,10,5,6],'retries on error');
});

t.test('elastic bulk insert retry single item',async t => {  

  // Mock an elastic client
  const singleFailureRetryClient = {
    bulk : async function(options) {
      await Promise.delay(100);
      if (options.body.length === 4) {
        return {
          items: [
            {
              update: { error: 'unable to insert' }
            },
            {
              update: { _id: options.body[2].update._id }
            }
          ]
        };
      }
      else if (options.body.length === 2) {
        return {
          items: [{update: { _id: options.body[0].update._id }}]
        };
      }
    }
  };

  const upsert = etl.elastic.upsert(singleFailureRetryClient,'test','test',{pushResults:true,maxRetries:1,retryDelay:10,concurrency:10});

  let data = await etl.toStream([1,2,3,4,5,6,7,8,9,10].map(function(d) { return {_id:d,num:d};}))
    .pipe(etl.collect(2))
    .pipe(upsert)
    .promise();
  
  data = [].concat.apply([],data.map(d => d.items));
  
  data = data.filter(d => d.update)
    .map(d => d.update._id);

  t.same(data,[2,1,4,3,6,5,8,7,10,9],'retries on error');
});


t.test('backoff retries', async t => {
  const options = {pushResults:true,maxRetries:7, backoffDelay:10, maxBackoffDelay: 250};
  const upsert = etl.elastic.upsert(backoffFailingClient,'test','test', options);
  let err;

  await etl.toStream([1,2,3].map(function(d) { return {_id:d,num:d};}))
    .pipe(etl.collect(3))
    .pipe(upsert)
    .promise()
    .catch(e => err = e);

  t.same(err.retries, [10, 20, 40, 80, 160, 250, 250], 'exponential backoff with a cap');
});

t.test('backoff retries with variance', async t => {
  const options = {pushResults:true,maxRetries:7, backoffDelay:10, backoffVariance: 0.1, maxBackoffDelay: 250};
  const upsert = etl.elastic.upsert(backoffFailingClient,'test','test', options);
  let err;

  await etl.toStream([1,2,3].map(function(d) { return {_id:d,num:d};}))
    .pipe(etl.collect(3))
    .pipe(upsert)
    .promise()
    .catch(e => err = e);

  const mean = [10, 20, 40, 80, 160, 250, 250];

  t.ok(err.retries.every( (d,i) => {
    const expected = mean[i]; 
    const low = expected * (1 - options.backoffVariance);
    const high = expected * (1 + options.backoffVariance);
    return low < d && d < Math.min(high, 251);
  }));

  t.same(err.retries.length, options.maxRetries, 'maxRetries performed');
});


================================================
FILE: test/elastic-test.js
================================================
const etl = require('../index');
const data = require('./data');
const Promise = require('bluebird');
const elasticsearch = require('@elastic/elasticsearch');
const t = require('tap');

const client = new elasticsearch.Client({
  node: 'http://elasticsearch:9200'
});

function convertHits(d) {
  return d.map(d => {
    d = d._source;
    d.dt = new Date(d.dt);
    return d;
  })
  .sort((a,b) => a.__line - b.__line);
}

t.test('elastic', async t => {
  await client.indices.delete({index:'test'}).catch(Object);
  await client.indices.delete({index:'testretries'}).catch(Object);

  t.test('#getMeta', t => {
    const bulk = etl.elastic.bulk('index', {});
    const d = {_id: '1', parent: 'testparent', routing: 'testrouting', _index: 'testindex', _type: 'testtype'};
    const metaData = bulk.getMeta(d);
    t.ok(metaData.hasOwnProperty('index'),'returns Object with key "index"');
    t.same(metaData.index.parent,'testparent','includes parent=testparent');
    t.same(metaData.index.routing,'testrouting','includes routing=testrouting');
    t.same(metaData.index._id,'1','includes _id=1');
    t.same(metaData.index._type,'testtype','includes type=testtype');
    t.same(metaData.index._index,'testindex','includes index=testindex');
    t.end();
  });

  t.test('pipe into etl.elastic.index()',async t => {
    let i = 0;
    const upsert = etl.elastic.index(client,'test',undefined,{pushResults:true});

    const d = await data.stream()
      .pipe(etl.map(d => {
        d._id = i++;
        return d;
      }))
      .pipe(etl.collect(100))
      .pipe(upsert)
      .promise();

    t.same(d[0].items.length,3,'record count matches');
    t.same(d[0].items[0].body,data.data[0],'data matches');
  });

  t.test('retrieve data with client.search()', async t => {
    await Promise.delay(2000); 
    let d = await client.search({index:'test'});
    if (d.body) d = d.body;
    const values = convertHits(d.hits.hits);
    t.same(values,data.data,'data matches');
  });

  t.test('etl.elastic.find()', async t => {
    const find = etl.elastic.find(client);
    find.end({index:'test'});

    let d = await find.promise();  
    const values = convertHits(d);
    if (d.body) d = d.body;
    t.same(values,data.data,'returns original data');
  });

  t.test('etl.elastic.scroll()', async t => {
    const scroll = etl.elastic.scroll(client,{index: 'test', size: 1},{ highWaterMark: 0 });
    // setting highWaterMark to zero and size = 1 allows us to test for backpressure
    // a missing scroll_id would indicate that scrolling has finished pre-emptively
    const d = await scroll.pipe(etl.map(d => {
      t.ok(scroll.scroll_id && scroll.scroll_id.length,'Scroll id available - backpressure is managed');
      return Promise.delay(200).then(() => d);
    },{highWaterMark: 0}))
    .promise();

    t.same(scroll.scroll_id,undefined,'scrolling has finished');  // scrolling has finished
    t.same(convertHits(d),data.data,'returns original data');
  });

  t.test('No retry on mapping exception', async t => {
    const upsert = etl.elastic.index(client,'testretries',undefined,{maxRetries: 1, retryDelay:1, pushErrors: true});
    let results = upsert.pipe(etl.map()).promise();
    upsert.write({number:2});
    upsert.write({number: 'not a number'});
    upsert.end();

    results = await results;
    t.same(results[0][0].error.type, 'document_parsing_exception');
    t.end();
  });
});


================================================
FILE: test/expand-test.js
================================================
const etl = require('../index');
const data = require('./data');
const t = require('tap');


t.test('expanded', {jobs: 10}, async t => {
  t.test('stream without expanded', async t => {
    const d = await data.stream({clone:true}).pipe(etl.map()).promise();
    d.forEach(d => t.same(Object.keys(d),[],'no prototype keys'));
  });

  t.test('stream with expanded', async t => {
    const d = await data.stream({clone:true}).pipe(etl.expand()).promise();
    t.same(d,data.data,'prototype data is available');
  });

  t.test('etl.expand(\'uppercase\')', async t => {
    const ukeys = Object.keys(data.data[0]).map(key => key.toUpperCase());

    const d = await data.stream({clone:true})
      .pipe(etl.expand('uppercase'))
      .promise();

    t.same(Object.keys(d[0]),ukeys,'converts keys to uppercase');
  });

  t.test('etl.expand(\'lowercase\')',async t => {
    const lkeys = Object.keys(data.data[0]).map(key => key.toLowerCase());
    
    const d = await data.stream({clone:true})
      .pipe(etl.expand('lowercase'))
      .promise();
    
    d.forEach(d => t.same(Object.keys(d),lkeys,'converts keys to lowercase'));
  });

  t.test('etl.expand(customTransform)',async t => {
    function customTransform(key) {
      if (key == '__line') return;
      return 'custom_'+key;
    }

    const ckeys = Object.keys(data.data[0])
      .filter(key => key !== '__line')
      .map(key => 'custom_'+key);

    const d = await data.stream({clone:true})
      .pipe(etl.expand(customTransform))
      .promise();
    
    d.forEach(d => t.same(Object.keys(d),ckeys, 'transforms keys'));
  });
});

================================================
FILE: test/file-test.js
================================================
const etl = require('../index');
const path = require('path');
const data = require('./data');
const t = require('tap');

t.test('file', async t => {
  const file = await etl.file(path.join(__dirname,'test.txt')).promise();
    
  t.test('results', t => {
    t.same(file[0].text,data.fixed.join(''),'return data');
    t.same(file[0].__filename,'test.txt','contain file information');
    t.end();
  });
    
  t.test('piped',async t => {
    const expected = data.copy().map(d => {
      d.__path = undefined;
      d.category = 'A';
      return d;
    });

    const d = await etl.file(path.join(__dirname,'test.txt'),{info:{category:'A'}})
      .pipe(etl.fixed(data.layout))
      .pipe(etl.expand())
      .promise();
    
    d.forEach(d => d.__path = undefined);
    t.same(d,expected,'returns data');
  });
});

================================================
FILE: test/fixed-test.js
================================================
const etl = require('../index');
const data = require('./data');
const t = require('tap');

t.test('fixed layout', {jobs: 10}, async t => {
  t.test('defined as object', async t => {
    const d = await data.stream()
      .pipe(etl.map(d => ({text: d.text, __filename: d.__filename})))
      .pipe(etl.fixed(data.layout))
      .pipe(etl.expand())
      .promise();

    t.same(d,data.data,'splits incoming data into columns');
  });
  

  t.test('defined as an array', async t => {
    const layout = Object.keys(data.layout).map(key => {
      let val = data.layout[key];
      if (!isNaN(val)) val = {length:val};
      val.field = key;
      return val;
    });

    const d = await data.stream()
      .pipe(etl.map(d => ({text: d.text, __filename: d.__filename})))
      .pipe(etl.fixed(layout))
      .pipe(etl.expand())
      .promise();
      
    t.same(d,data.data,'splits incoming data into columns');
  });  
});

================================================
FILE: test/keepopen-test.js
================================================
const etl = require('../index');
const Promise = require('bluebird');
const t = require('tap');

t.test('keepOpen', { jobs: 10 }, async t => {
  t.test('after end',async t => {
    const p = etl.keepOpen(500);
    const d = await etl.toStream([1,999,[3,4]])
      .pipe(p)
      .pipe(etl.map(d => {
        if (d === 999)
          Promise.delay(200)
           .then(() => p.write(2));
        else
          return d;
      }))
      .promise();

    t.same(d,[1,[3,4],2],'stays open as long as data arrives before timeout');
  });

  t.test('data arrives after timeout', async t => {
    const p = etl.keepOpen(100);

    const e = await etl.toStream([1,undefined,[3,4]])
      .pipe(p)
      .pipe(etl.map(d => {
        if (d === undefined)
          return Promise.delay(1000)
           .then(p.write.bind(p,2));
        else
          return d;
      }))
      .promise()
      .then(function() { throw 'Should Error'; },Object);

    t.ok(e.message === 'stream.push() after EOF' || e.message === 'write after end','should error');
  });  
});

================================================
FILE: test/lib/dataStream.js
================================================
const PassThrough = require('stream').PassThrough;

module.exports = function dataStream(data) {
  const s = PassThrough({objectMode:true});
  // Simulate async data stream
  Promise.resolve().then(async() => {
    for (let i = 0; i < data.length; i++) {
      s.write(data[i]);
      await new Promise(resolve => setTimeout(resolve,10))
    }
    s.end();
  });
  return s;
};

================================================
FILE: test/lib/mongo.js
================================================

const mongodbClient = require("mongodb").MongoClient;

let client;

async function getMongodbDriver() {

  if (!client) {
    client = await mongodbClient.connect('mongodb://mongodb:27017/etl_tests', {});
  }

  return client.db();
}

async function getCollection(collectionName) {
  const db = await getMongodbDriver();
  return db.collection(collectionName);
}

async function clear() {
  const db = await getMongodbDriver();
  await Promise.all(
    [
      db.collection("insert").deleteMany({}),
      db.collection("update-empty").deleteMany({}),
      db.collection("update-populated").deleteMany({}),
      db.collection("upsert").deleteMany({}),
      db.collection("upsert2").deleteMany({}),
      db.collection("upsert3").deleteMany({})
    ]);
  await client.close();
}

module.exports = {
  getCollection,
  clear
};



================================================
FILE: test/lib/worker.js
================================================
const etl = require('../../index');
const Promise = require('bluebird');

if (etl.cluster.isWorker) {
  etl.cluster.process = d => Promise.delay(100)
    .then(() => etl.cluster.progress(d));
}


================================================
FILE: test/mongo-insert-test.js
================================================
const etl = require('../index');
const data = require('./data');
const {getCollection, clear} = require('./lib/mongo');
const t = require('tap');
const Promise = require('bluebird');

t.test('mongo.insert', async t => {

  t.teardown(() => t.end());
  
  t.test('piping data into mongo.insert', async t => {
    const collection = await getCollection('insert');
    const d = await data.stream()
                    .pipe(etl.mongo.insert(collection,{pushResults:true}))
                    .promise();
    t.same(d.length,3,'returns results');
    d.forEach(d => t.ok(d.acknowledged && d.insertedId,'inserts each record'));
  });

  t.test('mongo collection', async t => {
    const collection = await getCollection('insert');
    const d = await collection.find({},{ projection: {_id:0}}).toArray();

    t.same(d,data.data,'reveals data');
  });

  t.test('pushResults == false and collection as promise', async t => {
    const collection = await getCollection('insert');
    const d = await data.stream(etl.mongo.insert(collection))
                .pipe(etl.mongo.insert(collection))
                .promise();

    t.same(d,[],'returns nothing');
  });

  t.test('error in collection', async t => {
    const collection = Promise.reject({message: 'CONNECTION_ERROR'});
    collection.suppressUnhandledRejections();
    const e = await etl.toStream({test:true})
      .pipe(etl.mongo.update(collection,'_id'))
      .promise()
      .then(() => {throw 'SHOULD_ERROR';}, Object);

    t.same(e.message,'CONNECTION_ERROR','should bubble down');
  });
})
.then(() => clear())
.then(() => t.end())
.catch(e => {
  if (e.message.includes('ECONNREFUSED'))
    console.warn('Warning: MongoDB server not available');
  else
    console.warn(e.message);
});

  

================================================
FILE: test/mongo-update-test.js
================================================
const etl = require('../index');
const Promise = require('bluebird');
const data = require('./data');
const {getCollection, clear} = require('./lib/mongo');
const t = require('tap');

t.test('mongo update', async t => {
  t.teardown(() => t.end());

  t.test('single record', async t => {
    const collection = await getCollection('update-empty');

    t.test('missing keys', async t => {
      const e = await Promise.try(() => etl.mongo.update(collection))
        .then(() => { throw 'Should result in an error';},Object);
      t.same(e.message,'Missing Keys','Errors');
    });

    t.test('upsert', async t => {
      const insert = etl.mongo.update(collection,['__line'],{upsert: true, pushResults:true});
      const data = etl.map();
      data.end({name:'single record',__line:999});

      const d = await data.pipe(insert).promise();

      t.same(d[0].upsertedCount,1,'upserts one record');
    });

    t.test('updates into mongo', async t => {
      const insert = etl.mongo.update(collection,['__line'],{pushResults:true});
      const data = etl.map();
      data.end({name:'updated single record',__line:999});

      const d = await data.pipe(insert).promise();

      t.same(d[0].modifiedCount,1);
    });
  });

  t.test('bulk', async t => {
    const collection = await getCollection('update-empty');

    t.test('on an empty collection', async t => {
      const update = etl.mongo.update(collection,['name'],{pushResults:true});

      let d = await data.stream()
              .pipe(etl.collect(100))
              .pipe(update)
              .promise();
          
      d = d[0];
      t.same(d.insertedCount,0,'inserts no records');
      t.same(d.upsertedCount,0,'upserts no records');
      t.same(d.matchedCount,0,'matched no records');
    });

    t.test('with pushresults == false',async t => {
      const collection = await getCollection('update-empty');
      const update = etl.mongo.update(collection,['name']);

      const d = await data.stream()
        .pipe(update)
        .promise();
    
      t.same(d,[],'pushes nothing downstream');
    });

    t.test('on a populated collection', {autoend: true}, async t => {
      const collection = await getCollection('update-populated');

      t.test('update', async t => {
        await collection.insertMany(data.copy());
        const update = etl.mongo.update(collection,['name'],{pushResults:true});

        let d = await data.stream()
          .pipe(etl.map(function(d) {
            if (d.name == 'Nathaniel Olson')
              d.name = 'Not Found';
            d.newfield='newfield';
            return d;
          }))
          .pipe(etl.collect(100))
          .pipe(update)
          .promise();
                
        d = d[0];
        t.same(d.modifiedCount,2,'modified 2 records');
        t.same(d.insertedCount,0,'inserted zero records');
      });

      t.test('using find', async t => {
        const collection = await getCollection('update-populated');
        const d = await collection.find({},{projection: {_id:false}}).toArray();

        const expected = data.copy().map(function(d,i) {
          if (i) d.newfield = 'newfield';
          return d;
        });
        t.same(d,expected,'results were saved');
      });
    });

    t.test('using upsert option', {autoend: true}, async t => {
      const collection = await getCollection('upsert');

      t.test('upsert', async t => {  
        const upsert = etl.mongo.update(collection,['name'],{pushResults:true,upsert:true});
        let d = await data.stream()
          .pipe(etl.collect(100))
          .pipe(upsert)
          .promise();

        d = d[0];
        t.same(d.upsertedCount,3,'3 upserted');
        t.same(d.matchedCount,0, '0 matched');
      });

      t.test('find',async t => {
        const collection = await getCollection('upsert');
        const d = await collection.find({},{projection : {_id:false}}).toArray();
          
        t.same(d,data.data,'results are saved');
      });
    });

    t.test('using upsert function',{autoend: true}, async t => {
      const collection = await getCollection('upsert2');

      t.test('should populate', async t => {
        const upsert = etl.mongo.upsert(collection,['name'],{pushResults:true});

        let d = await data.stream()
          .pipe(etl.collect(100))
          .pipe(upsert)
          .promise();

        d = d[0];
        t.same(d.upsertedCount,3,'upserts 3 records');
        t.same(d.matchedCount,0,'matches 0 records');
      });

      t.test('find',async t =>  {
        const d = await collection.find({},{projection: {_id:false}}).toArray();
        t.same(d,data.data,'results are saved');
      });
    });
  });

  t.test('using $update with upsert function', async t => {
    const collection = await getCollection('upsert3');
    t.test('should populate', async t => {
      const upsert = etl.mongo.upsert(collection,['name'],{pushResults:true});

      let d = await data.stream()
        .pipe(etl.map(d => Object.assign( {name: d.name, $update:{$set: d}})))
        .pipe(etl.collect(100))
        .pipe(upsert)
        .promise();

      d = d[0];
      t.same(d.upsertedCount,3,'upserts 3 records');
      t.same(d.matchedCount,0,'matches 0 records');
      t.end();
    });

    t.test('find',async t =>  {
      const d = await collection.find({}, {projection: {_id: false}}).toArray();          
      t.same(d,data.data,'results are saved');
    });
  });

  t.test('error in collection', async t => {
    const collection = Promise.reject(new Error('CONNECTION_ERROR'));
    const e = await etl.toStream({test:true})
      .pipe(etl.mongo.update(collection,'_id'))
      .promise()
      .then(() => { throw 'SHOULD_ERROR';}, Object);
    t.same(e.message,'CONNECTION_ERROR','passes down');
  });
})
.then(() => clear())
.then(() => t.end())
.catch(e => {
  if (e.message.includes('ECONNREFUSED'))
    console.warn('Warning: MongoDB server not available');
  else
    console.warn(e.message);
});

================================================
FILE: test/mysql-test.js
================================================
const etl = require('../index');
const mysql = require('mysql');
const data = require('./data');
const t = require('tap');

const pool = mysql.createPool({
  host: 'mysql',
  connectionLimit : 10,
  user: 'root',
  password: 'example'
});

const p = etl.mysql.execute(pool);

const before = async function() {
  await p.query('DROP DATABASE IF EXISTS `circle_test-1`');
  await p.query('CREATE DATABASE `circle_test-1`');
  await p.query('DROP TABLE IF EXISTS `circle_test-1`.`test-2`;');
  await p.query(
    'CREATE TABLE `circle_test-1`.`test-2` ('+
    'name varchar(45) DEFAULT NULL,'+
    'age int(11) DEFAULT NULL,'+
    'dt datetime DEFAULT NULL '+
    ')'
  );
};
 
t.test('mysql', {timeout: 20000}, async function(t) {
  await before();

  t.test('inserts', async function(t) {
    const d = await data.stream()
      .pipe(etl.mysql.upsert(pool,'circle_test-1','test-2',{pushResults:true}))
      .promise();

    t.same(d[0].affectedRows,3,'returns right length');
  });

  t.test('selecting back',async function(t){
    const d = await p.query('SELECT * from `circle_test-1`.`test-2`');

    t.same(d,data.data.map(d => ({
      name : d.name,
      age : d.age,
      dt : d.dt          
    })));
  });

  t.test('streaming works',async function(t) {
    const d = await p.stream('select * from `circle_test-1`.`test-2`').promise();
    t.same(d.length,3);
  });

})
.catch(console.log)
.then(() => pool.end());

================================================
FILE: test/postgres-test.js
================================================
const etl = require('../index');
const pg = require('pg');
const data = require('./data');
const dataChanged = require('./data-changed');
const QueryStream = require('pg-query-stream');
const t = require('tap');

const pool = new pg.Pool({
  host: 'postgres',
  port: 5432,
  database: 'postgres',
  user: 'postgres',
  password: 'example'
});

const p = etl.postgres.execute(pool);

const before = p.query('CREATE SCHEMA circle_test_schema')
  .catch(Object)
  .then(() => p.query('DROP TABLE IF EXISTS circle_test_schema.test;'))
  .then(()  => p.query(
    'CREATE TABLE circle_test_schema.test ('+
    'name varchar(45),'+
    'age integer,'+
    'dt date, '+
    'CONSTRAINT test_pkey PRIMARY KEY (name)'+
    ')'
  ));

t.test('postgres', async t => {
  await before;
  t.test('inserts', async t => {
    const d = await data.stream()
      .pipe(etl.postgres.insert(pool,'circle_test_schema','test',{pushResults:true}))
      .promise();

    t.same(d.length,1,'Only one request sent');
    t.same(d[0].rowCount,3,'rowCount is correct');
  });

  t.test('and records are verified',async t => {
    const expected = data.data.map(d => ({
      name : d.name,
      age : d.age,
      dt : d.dt
    }))
    .sort((a,b) => a.age - b.age);

    const d = await p.query('SELECT * from circle_test_schema.test order by age');
    t.same(d.rows,expected,'records verified');
  });

  t.test('upserts', async t => {
    // Remove delete, leave previous data in table to match name PKEY
    //await p.query('DELETE from circle_test_schema.test');
    const d = await dataChanged.stream()
      .pipe(etl.postgres.upsert(pool,'circle_test_schema','test',{pushResults:true}))
      .promise();

    t.same(d.length,3,'Three requests sent');
    t.same(d[0].rowCount,1,'rowCount is correct');
  });

  t.test('and Upsert records are verified',async t => {
    //Reverify against changed data
    const expected = dataChanged.data.map(d => ({
      name : d.name,
      age : d.age,
      dt : d.dt
    }))
    .sort((a,b) => a.age - b.age);

    const d = await p.query('SELECT * from circle_test_schema.test order by age');
    t.same(d.rows,expected,'records verified');
  });


  t.test('streaming', async t => {
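    // node-postgres streams rows through the separate pg-query-stream package;
    // p.stream() consumes the QueryStream instance directly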
    const d = await p.stream(new QueryStream('select * from circle_test_schema.test'))
      .pipe(etl.map())
      .promise();

    t.same(d.length,3,'streams all three rows');
  });
})
.catch(console.log)
.then(() => pool.end());


================================================
FILE: test/scan-test.js
================================================
const etl = require('../index');
const Promise = require('bluebird');
const t = require('tap');

const dataStream = require('./lib/dataStream');
const data = [1,2,3,4,5,6,7,8,9,10,11];

t.test('prescan', { jobs: 10 }, async t => {
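  // etl.prescan(n, fn) buffers incoming records until n items (n characters for
  // string input) have accumulated or the stream ends, calls fn with the buffer,
  // and waits for fn to resolve before letting data flow downstream (asserted below)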

  t.test('etl.prescan(3) with stream of objects',async t => {
    let prescanned,firstRecord;
    const d = await dataStream(data)
      .pipe(etl.prescan(3,d => Promise.delay(500).then(() => prescanned = d)))
      .pipe(etl.map(d => {
        if (!firstRecord)
          firstRecord = prescanned || [];
        return d;
      }))
      .promise();

    t.same(prescanned,[1,2,3],'first 3 records prescanned');
    t.same(firstRecord,[1,2,3],'prescan finished before streaming down');
    t.same(d,data,'all data is piped down');
  });

  t.test('etl.prescan(100) with stream of 10 objects',async t => {
    let prescanned,firstRecord;
    const d = await dataStream(data)
      .pipe(etl.prescan(100,d => Promise.delay(500).then(() => prescanned = d)))
      .pipe(etl.map(d => {
        if (!firstRecord)
          firstRecord = prescanned || [];
        return d;
      }))
      .promise();

    t.same(prescanned,data,'all records prescanned (stream ended before 100)');
    t.same(firstRecord,data,'prescan finished before streaming down');
    t.same(d,data,'all data is piped down');
  });

  t.test('etl.prescan(30) with strings',async t => {
    const text = [
      'Lorem ipsum dolor sit amet, ',
      'consectetur adipiscing elit, ',
      'sed do eiusmod tempor incididunt ',
      'ut labore et dolore magna aliqua.' 
    ];

    let prescanned,firstRecord;
    const d = await dataStream(text)
      .pipe(etl.prescan(30,d => Promise.delay(10).then(() => prescanned = d)))
      .pipe(etl.map(d => {
        if (!firstRecord)
          firstRecord = prescanned || [];
        return d;
      }))
      .promise();

    t.same(prescanned,text.slice(0,2),'first 2 chunks prescanned');
    t.same(firstRecord,text.slice(0,2),'prescan finished before streaming down');
    t.same(d,text,'all data is piped down');
  });
});

================================================
FILE: test/split-test.js
================================================
const etl = require('../index');
const t = require('tap');

const data = [
  'here are\nso',
  'me chunks\n',
  'of data\nDATA1   \nDATA2\nDATA3\n    '
];

t.test('etl.split()', async t => {
  const expected = [
    { text: 'here are', __line: 0, __filename: 'text.txt' },
    { text: 'some chunks', __line: 1, __filename: 'text.txt' },
    { text: 'of data', __line: 2, __filename: 'text.txt' },
    { text: 'DATA1   ', __line: 3, __filename: 'text.txt' },
    { text: 'DATA2', __line: 4, __filename: 'text.txt' },
    { text: 'DATA3', __line: 5, __filename: 'text.txt' },
    { text: '    ', __line: 6, __filename: 'text.txt' }
  ];
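
  // etl.split() re-splits incoming text on newlines by default, numbering output
  // records via __line and carrying __filename forward across chunk boundaries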

  const split = etl.split();

  data.forEach((d,i) => {
    setTimeout(() => {
      split.write(i == 2 ? d : {text:d,__filename:'text.txt'});
      if (i == data.length-1)
        split.end();
    });
  });

  const d = await split.pipe(etl.expand()).promise();
  t.same(d,expected,'splits data on newlines by default');
});

t.test('split(\'a\')', async t => {
  const split = etl.split('a');

  const expected = [
    { text: 'here ', __line: 0, __filename: 'text.txt' },
    { text: 're\nsome chunks\nof d', __line: 1, __filename: 'text.txt' },
    { text: 't', __line: 2, __filename: 'text.txt' },
    { text: '\nDATA1   \nDATA2\nDATA3\n    ',__line: 3, __filename: 'text.txt' }
  ];

  data.forEach((d,i) => {
    setTimeout(() => {
      split.write(i == 2 ? d : {text:d,__filename:'text.txt'});
      if (i == data.length-1)
        split.end();
    });
  });

  const d = await split.pipe(etl.expand()).promise();
  t.same(d,expected,'splits on a');
});

t.test('etl.split empty', async t => {
  const p = etl.map();
  p.end();

  const data = await p.pipe(etl.split()).promise();
  t.same(data,[]);
});

================================================
FILE: test/stringify-test.js
================================================
const etl = require('../index');
const t = require('tap');

const data = [
  {a:1,b:'test1'},
  {a:2,b:'test2'}
];

function dataStream() {
  const s = etl.streamz();
  data.forEach(d => s.write(d));
  s.end();
  return s;
}

t.test('stringify', { jobs: 10 }, async t => {
  t.test('etl.stringify()',async t => {
    const stringify = etl.stringify();
    const expected = [ '{"a":1,"b":"test1"}', '{"a":2,"b":"test2"}' ];
    const d = await dataStream().pipe(stringify).promise();

    t.same(d,expected,'returns stringified object');
  });

  t.test('etl.stringify(d)', async t => {
    const stringify = etl.stringify(2);
    const expected = [ '{\n  "a": 1,\n  "b": "test1"\n}', '{\n  "a": 2,\n  "b": "test2"\n}' ];
    const d = await dataStream().pipe(stringify).promise();

    t.same(d,expected, 'returns indented object');
  });
});

================================================
FILE: test/test.csv
================================================
"name   ","Age","DT"
"Nathaniel Olson",52,"1/10/2025",
"Ann Ellis",36,"10/2/2035"
"Willie Freeman",38,"4/1/2016"


================================================
FILE: test/test.txt
================================================
Nathaniel Olson 52    1/10/2025Ann Ellis       36    10/2/2035Willie Freeman  38     4/1/2016

================================================
FILE: test/timeout-test.js
================================================
const Promise = require('bluebird');
const etl = require('../index');
const t = require('tap');

t.test('timeout', { jobs: 10 }, async t => {
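  // etl.timeout(ms) rejects with 'ETL_TIMEOUT' when more than ms elapses between
  // records, and clears its polling interval whether or not it fires (asserted below)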
  t.test('with no delay', async t => {
    const timeout = etl.timeout(100);
    const d = await etl.toStream([1,2,3,4])
      .pipe(etl.map(d => Promise.delay(30,d)))
      .pipe(timeout)
      .promise();

    t.same(d,[1,2,3,4],'results are as expected');
    t.same(timeout.interval._idleTimeout,-1,'cleans up setInterval');
  });
    
  t.test('with a triggering delay',async t => {
    const timeout = etl.timeout(100);   
    const e = await etl.toStream([1,2,3,4])
      // Delay by 250ms when we see '4'
      .pipe(etl.map(d => Promise.delay(d === 4 ? 250 : 30,d)))
      .pipe(timeout)
      .promise()
      .then(() => { throw 'SHOULD_ERROR'; }, String);

    t.same(e,'ETL_TIMEOUT','emits timeout');
    t.same(timeout.interval._idleTimeout,-1,'cleans up setInterval');
  });
});

================================================
FILE: test/tofile-test.js
================================================
const etl = require('../index');
const path = require('path');
const os = require('os');
const t = require('tap');

const filename = path.join(os.tmpdir(),Number(new Date())+'.txt');

t.test('toFile', async t => {
  t.test('piping into', async t => {
    const d = await etl.toStream([1,2,[3,4]])
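      // etl.stringify(indent, replacer, newline): newline=true appends '\n' after each record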
      .pipe(etl.stringify(0,null,true))
      .pipe(etl.toFile(filename))
      .promise();

    t.same(d,[true],'returns true when done');
  });

  t.test('reading file', async t => {
    const d = await etl.file(filename).promise();
    t.same(d[0].text,'1\n2\n[3,4]\n','verifies content');
  });
});

================================================
FILE: test/tostream-test.js
================================================
const etl = require('../index');
const t = require('tap');
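
// etl.toStream accepts static arrays, plain functions, promises, or functions
// returning streams; values pushed via this.push inside a function are emitted
// alongside its return value (each case is exercised below)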

t.test('toStream', { jobs: 10 }, async t => {
  t.test('static data',t => {
    t.test('array',async t => {
      const d = await etl.toStream([1,2,[3,4]]).pipe(etl.map(d => [d])).promise();
      t.same(d,[[1],[2],[[3,4]]],'streams elements');
    });

    t.test('no data',async t => {
      const d = await etl.toStream().promise();
      t.same(d,[],'returns empty stream');
    });

    t.end();
  });

  t.test('function input', t => {
    t.test('returning an array',async t => {
      const d = await etl.toStream(() => [1,2,[3,4]]).promise();
      t.same(d,[1,2,[3,4]],'streams the array');
    });

    t.test('`this.push` and function results', async t => {
      const d = await etl.toStream(function() {
        this.push(1);
        this.push(2);
        return [[3,4]];
      }).promise();

      t.same(d,[1,2,[3,4]],'streams combined data');
    });

    t.test('undefined return', async t => {
      const d = await etl.toStream(function() {}).promise();
      t.same(d,[],'returns empty stream');
    });

    t.end();
  });

  t.test('promise input', t => {
    t.test('resolving to an array', async t => {
      const d = await etl.toStream(Promise.resolve([1,2,[3,4]])).promise();
      t.same(d,[1,2,[3,4]],'streams elements');
    });

    t.test('resolving to undefined', async t => {
      const d = await etl.toStream(Promise.resolve()).promise();
      t.same(d,[],'returns empty stream');
    });

    t.end();
  });

  t.test('function returning a stream', t => {
    t.test('with data', async t => {
      const d = await etl.toStream(() => etl.toStream([1,2,[3,4]])).promise();
      t.same(d,[1,2,[3,4]],'returns data');
    });

    t.test('with no data', async t => {
      const d = await etl.toStream(() => etl.toStream()).promise();
      t.same(d,[],'returns empty stream');
    });

    t.end();
  });

  t.test('error in the function', async t => {
    const e = await etl.toStream(() => { throw 'ERROR'; })
      .promise()
      .then(() => { throw 'SHOULD_ERROR';}, String);
    
    t.same(e,'ERROR','is passed downstream');
  });
});

================================================
FILE: test.sh
================================================
npm install
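# wait-for-it blocks until each backing service (defined in docker-compose.yml) is accepting connections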
wait-for-it elasticsearch:9200
wait-for-it mongodb:27017
wait-for-it mysql:3306
wait-for-it postgres:5432
# cluster test needs to be run from master thread
node test/cluster-test.js | tap test --exclude=*cluster* jobs=5  --coverage-report=lcov --allow-incomplete-coverage
================================================
SYMBOL INDEX (50 symbols across 33 files)
================================================

FILE: lib/bigquery/index.js
  function insert (line 3) | function insert(table, options) {

FILE: lib/cluster.js
  function next (line 22) | function next(worker) {
  function createWorker (line 32) | function createWorker() {

FILE: lib/collect.js
  function Collect (line 4) | function Collect(maxSize,maxDuration,maxTextLength) {

FILE: lib/csv_parser.js
  function Csv_parser (line 5) | function Csv_parser(options) {

FILE: lib/cut.js
  function Cut (line 4) | function Cut(maxLen,options) {

FILE: lib/elasticsearch/bulk.js
  function Bulk (line 5) | function Bulk(action,client,index,type,options) {

FILE: lib/elasticsearch/find.js
  function Find (line 4) | function Find(client,options) {

FILE: lib/elasticsearch/mapping.js
  function parse (line 1) | function parse(obj,path,res) {
  function populate (line 16) | function populate(map,src) {

FILE: lib/elasticsearch/scroll.js
  function Scroll (line 4) | function Scroll(client,query,options) {

FILE: lib/expand.js
  function expand (line 4) | function expand(convert) {

FILE: lib/file.js
  function File (line 5) | function File(file,options) {

FILE: lib/fixed.js
  function Fixed (line 4) | function Fixed(layout,options) {

FILE: lib/inspect.js
  function Inspect (line 4) | function Inspect(options) {

FILE: lib/keepOpen.js
  function KeepOpen (line 4) | function KeepOpen(timeout) {

FILE: lib/mongo/bulk.js
  function Update (line 5) | function Update(_c,collection,keys,options) {

FILE: lib/mongo/insert.js
  function Insert (line 5) | function Insert(_c,collection,options) {

FILE: lib/mongo/update.js
  function Update (line 5) | function Update(_c,collection,keys,options) {

FILE: lib/mysql/execute.js
  function Execute (line 4) | function Execute(pool,options) {

FILE: lib/mysql/mysql.js
  function Mysql (line 5) | function Mysql(pool,options) {

FILE: lib/mysql/script.js
  function Script (line 4) | function Script(pool,schema,table,options) {

FILE: lib/postgres/execute.js
  class Execute (line 3) | class Execute extends Postgres {
    method constructor (line 4) | constructor(pool, options = {}) {
    method _fn (line 8) | _fn(d, cb) {

FILE: lib/postgres/postgres.js
  class Postgres (line 4) | class Postgres extends Streamz {
    method constructor (line 5) | constructor(pool, options = {}) {
    method getConnection (line 15) | getConnection() {
    method query (line 20) | query(query, cb) {
    method stream (line 29) | stream(query, cb) {

FILE: lib/postgres/script.js
  class Script (line 5) | class Script extends Postgres {
    method constructor (line 6) | constructor(pool, schema, table, options) {
    method getColumns (line 18) | getColumns() {
    method getPrimaryKeys (line 31) | getPrimaryKeys() {
    method _push (line 49) | _push() {
    method _fn (line 56) | _fn(record) {
    method _flush (line 111) | _flush(cb) {
  function escapeLiteral (line 118) | function escapeLiteral(str) {

FILE: lib/prescan.js
  function Prescan (line 5) | function Prescan(count,fn) {

FILE: lib/split.js
  function Split (line 4) | function Split(symbol) {

FILE: lib/stringify.js
  function Stringify (line 4) | function Stringify(indent,replacer,newline) {

FILE: lib/timeout.js
  function Timeout (line 4) | function Timeout(ms) {

FILE: lib/toFile.js
  function toFile (line 5) | function toFile(filename) {

FILE: lib/tostream.js
  function toStream (line 4) | function toStream(data) {

FILE: test/elastic-test.js
  function convertHits (line 11) | function convertHits(d) {

FILE: test/expand-test.js
  function customTransform (line 38) | function customTransform(key) {

FILE: test/lib/mongo.js
  function getMongodbDriver (line 6) | async function getMongodbDriver() {
  function getCollection (line 15) | async function getCollection(collectionName) {
  function clear (line 20) | async function clear() {

FILE: test/stringify-test.js
  function dataStream (line 9) | function dataStream() {