Repository: clarkgrubb/data-tools
Branch: master
Commit: d9702c9df50b
Files: 219
Total size: 3.2 MB
Directory structure:
gitextract_t4c_0bd0/
├── .gitignore
├── .pylintrc
├── LICENSE
├── Makefile
├── README.md
├── data_tools/
│ ├── __init__.py
│ ├── check-tsv
│ ├── convert_date.py
│ ├── counting_sort.py
│ ├── csv-to-postgres
│ ├── csv_to_json.py
│ ├── csv_to_xlsx.py
│ ├── date_fill.py
│ ├── date_seq.py
│ ├── header-sort
│ ├── highlight.py
│ ├── html_table_to_csv.py
│ ├── join_tsv.py
│ ├── json-diff
│ ├── normalize_utf8.py
│ ├── postgres-to-csv
│ ├── reservoir_sample.py
│ ├── set-diff.sh
│ ├── set-intersect
│ ├── tokenize
│ ├── trim_tsv.py
│ ├── tsv-header
│ ├── tsv_to_json.py
│ ├── xlsx_to_csv.py
│ └── yaml_to_json.py
├── doc/
│ ├── check-tsv.1.md
│ ├── convert-date.1.md
│ ├── counting-sort.1.md
│ ├── csv-to-json.1.md
│ ├── csv-to-postgres.1.md
│ ├── csv-to-tab.1.md
│ ├── csv-to-xlsx.1.md
│ ├── date-seq.1.md
│ ├── header-sort.1.md
│ ├── highlight.1.md
│ ├── html-table-to-csv.1.md
│ ├── join-tsv.1.md
│ ├── json-diff.1.md
│ ├── normalize-utf8.1.md
│ ├── postgres-to-csv.1.md
│ ├── reservoir-sample.1.md
│ ├── set-diff.1.md
│ ├── set-intersect.1.md
│ ├── tab-to-csv.1.md
│ ├── tokenize.1.md
│ ├── trim-tsv.1.md
│ ├── tsv-header.1.md
│ ├── tsv-to-json.1.md
│ ├── utf8-category.1.md
│ ├── utf8-script.1.md
│ └── xlsx-to-csv.1.md
├── man/
│ ├── check-tsv.1
│ ├── convert-date.1
│ ├── counting-sort.1
│ ├── csv-to-json.1
│ ├── csv-to-postgres.1
│ ├── csv-to-tab.1
│ ├── csv-to-xlsx.1
│ ├── date-seq.1
│ ├── header-sort.1
│ ├── highlight.1
│ ├── html-table-to-csv.1
│ ├── iso_8859-1.7
│ ├── join-tsv.1
│ ├── json-diff.1
│ ├── normalize-utf8.1
│ ├── postgres-to-csv.1
│ ├── reservoir-sample.1
│ ├── set-diff.1
│ ├── set-intersect.1
│ ├── tab-to-csv.1
│ ├── tokenize.1
│ ├── trim-tsv.1
│ ├── tsv-header.1
│ ├── tsv-to-json.1
│ ├── utf8-category.1
│ ├── utf8-script.1
│ └── xlsx-to-csv.1
├── requirements.txt
├── setup.py
├── src/
│ ├── csv-to-tab/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── csv_to_tab.c
│ │ ├── state.dot
│ │ └── test/
│ │ ├── expected.output/
│ │ │ ├── backslash.default.tab
│ │ │ ├── backslash.escape.tab
│ │ │ ├── backslash.replace.tab
│ │ │ ├── backslash.strip.tab
│ │ │ ├── cr.escape.tab
│ │ │ ├── cr.replace.tab
│ │ │ ├── cr.strip.tab
│ │ │ ├── four.tab
│ │ │ ├── newline.escape.tab
│ │ │ ├── newline.replace.tab
│ │ │ ├── newline.strip.tab
│ │ │ ├── one.tab
│ │ │ ├── tab.escape.tab
│ │ │ ├── tab.replace.tab
│ │ │ ├── tab.strip.tab
│ │ │ ├── three.tab
│ │ │ └── two.tab
│ │ └── input/
│ │ ├── backslash.csv
│ │ ├── cr.csv
│ │ ├── four.csv
│ │ ├── newline.csv
│ │ ├── one.csv
│ │ ├── tab.csv
│ │ ├── three.csv
│ │ └── two.csv
│ ├── json-pluck/
│ │ ├── Makefile
│ │ ├── json_pluck.c
│ │ └── test/
│ │ ├── expected.output/
│ │ │ ├── sample.json
│ │ │ └── sample2.json
│ │ └── input/
│ │ ├── sample.json
│ │ └── sample2.json
│ ├── tab-to-csv/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── tab_to_csv.c
│ │ └── test/
│ │ ├── expected.output/
│ │ │ ├── backslash.default.csv
│ │ │ ├── backslash.unescape.csv
│ │ │ ├── cr.default.csv
│ │ │ ├── cr.unescape.csv
│ │ │ ├── newline.default.csv
│ │ │ ├── newline.unescape.csv
│ │ │ ├── one.csv
│ │ │ ├── tab.default.csv
│ │ │ └── tab.unescape.csv
│ │ └── input/
│ │ ├── backslash.tab
│ │ ├── cr.tab
│ │ ├── newline.tab
│ │ ├── one.tab
│ │ └── tab.tab
│ └── utf8-script/
│ ├── Makefile
│ ├── README.md
│ ├── Scripts.txt
│ ├── UnicodeData.txt
│ ├── generate_category.rb
│ ├── generate_script.rb
│ ├── test/
│ │ ├── utf8-category/
│ │ │ ├── expected.output/
│ │ │ │ └── one.txt
│ │ │ └── input/
│ │ │ └── one.txt
│ │ └── utf8-script/
│ │ ├── expected.output/
│ │ │ └── one.txt
│ │ └── input/
│ │ └── one.txt
│ ├── utf8_category.c
│ ├── utf8_category.c.erb
│ ├── utf8_script.c
│ └── utf8_script.c.erb
└── test/
├── check_tsv/
│ ├── input.bad.tsv
│ └── input.good.tsv
├── check_yaml/
│ ├── bad.yaml
│ └── good.yaml
├── convert_date/
│ └── input.txt
├── counting_sort/
│ └── input.txt
├── csv_files/
│ ├── no-header.csv
│ ├── no-quote.csv
│ ├── quoted-chars.csv
│ ├── single-quote.csv
│ ├── unequal-rows.csv
│ └── unicode.csv
├── csv_to_json/
│ └── test.csv
├── csv_to_postgres/
│ ├── customers.csv
│ └── customers.sql
├── csv_to_tab/
│ ├── expected.escape.tab
│ ├── expected.strip.tab
│ ├── expected.tab
│ └── expected.unicode.tab
├── date_fill/
│ ├── expected.output.tsv
│ └── input.tsv
├── highlight/
│ ├── expected.output.txt
│ └── input.txt
├── html_table_to_csv/
│ ├── expected.test.csv
│ └── test.html
├── join_tsv/
│ ├── expected.output.NULL_VALUE.tsv
│ ├── expected.output.diff.tsv
│ ├── expected.output.left.tsv
│ ├── expected.output.left2.tsv
│ ├── expected.output.right.tsv
│ ├── expected.output.tsv
│ ├── input1.NULL_VALUE.tsv
│ ├── input1.diff.tsv
│ ├── input1.left.tsv
│ ├── input1.null.tsv
│ ├── input1.tsv
│ ├── input2.NULL_VALUE.tsv
│ ├── input2.diff.tsv
│ ├── input2.left.tsv
│ ├── input2.null.tsv
│ └── input2.tsv
├── json_diff/
│ ├── 1a.json
│ ├── 1b.json
│ ├── 2a.json
│ ├── 2b.json
│ ├── expected.output1.txt
│ └── expected.output2.txt
├── normalize_utf8/
│ ├── expected.output.nfc.txt
│ ├── expected.output.nfd.txt
│ ├── expected.output.txt
│ └── input.txt
├── reservoir_sample/
│ ├── expected.output.txt
│ └── input.txt
├── trim_tsv/
│ ├── expected.trim_tsv.tsv
│ └── input.tsv
├── tsv_header/
│ ├── expected.output.txt
│ └── input.tsv
├── tsv_to_csv/
│ └── escapes.tsv
├── tsv_to_json/
│ └── test.tsv
├── xlsx_to_csv/
│ ├── expected.3r3c.csv
│ ├── expected.dates.csv
│ ├── expected.list.out
│ ├── expected.spaces.csv
│ ├── expected.unicode.csv
│ ├── test.xls
│ └── test.xlsx
└── yaml_to_json/
└── input.yaml
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
output
src/csv-to-tab/csv-to-tab
src/tab-to-csv/tab-to-csv
src/json-pluck/json-pluck
ve
build
data_tools.egg-info
dist
src/utf8-script/utf8-script
src/utf8-script/utf8-category
================================================
FILE: .pylintrc
================================================
# Keep warnings which flag usages which are wrong or useless.
#
# Keep style warnings if we agree with them and they can always be fixed.
#
[MESSAGES CONTROL]
disable=invalid-name,redefined-outer-name,superfluous-parens,too-many-arguments,too-many-branches,too-many-locals,duplicate-code,too-few-public-methods,too-many-public-methods,no-self-use,too-many-return-statements,too-many-statements,too-many-instance-attributes,too-many-lines,too-many-boolean-expressions
================================================
FILE: LICENSE
================================================
Copyright (C) 2014 Clark Grubb
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: Makefile
================================================
MAKEFLAGS += --warn-undefined-variables
SHELL := bash
.SHELLFLAGS := -e -o pipefail -c
.DEFAULT_GOAL := all
.DELETE_ON_ERROR:
.SUFFIXES:
LOCAL_INSTALL_DIR ?= $(shell if [ -d ~/Local/bin ]; then echo ~/Local/bin; else echo /usr/local/bin; fi)
LOCAL_MAN_DIR ?= $(shell if [ -d ~/Local/man ]; then echo ~/Local/man; else echo /usr/local/share/man; fi)
local_man1_dir := $(LOCAL_MAN_DIR)/man1
man1_source := $(wildcard doc/*.1.md)
man1_targets := $(patsubst doc/%.md,man/%,$(man1_source))
pwd := $(shell pwd)
src := $(pwd)/src
VPATH = test
ve := . ve/bin/activate
ve:
python3 -m venv ve
. ve/bin/activate && pip install -r requirements.txt
.PHONY: utf8-script
utf8-script:
(cd src/$@; make)
.PHONY: csv-to-tab
csv-to-tab:
(cd src/$@; make)
.PHONY: json-pluck
json-pluck:
(cd src/$@; make)
.PHONY: tab-to-csv
tab-to-csv:
(cd src/$@; make)
.PHONY: build.c
build.c: utf8-script csv-to-tab tab-to-csv json-pluck
.PHONY: build
build: ve build.c
# To generate the man pages `pandoc` must be installed. On Mac go to
#
# http://johnmacfarlane.net/pandoc/installing.html
#
# and download the installer. On Ubuntu there is a package:
#
# $ sudo apt-get install pandoc
#
# An uninstalled man page can be viewed with the man command:
#
# $ man doc/foo.1
#
man/%.1: doc/%.1.md
pandoc -s -w man $< -o $@
.PHONY: man_targets
man_targets: $(man1_targets)
$(local_man1_dir):
mkdir -p $@
.PHONY: install.script
install.script:
./setup.py sdist
.PHONY: install.c
install.c: build.c
cp src/csv-to-tab/csv-to-tab $(LOCAL_INSTALL_DIR)
cp src/json-pluck/json-pluck $(LOCAL_INSTALL_DIR)
cp src/tab-to-csv/tab-to-csv $(LOCAL_INSTALL_DIR)
cp src/utf8-script/utf8-category $(LOCAL_INSTALL_DIR)
cp src/utf8-script/utf8-script $(LOCAL_INSTALL_DIR)
.PHONY: install.man
install.man: $(local_man1_dir)
if [ ! -d $(LOCAL_MAN_DIR)/man1 ]; then \
echo directory does not exist: $(LOCAL_MAN_DIR)/man1; \
false; \
fi
for target in $(man1_targets); \
do \
cp $$target $(LOCAL_MAN_DIR)/man1; \
done
.PHONY: install
install:
@echo
@echo 'To install Python and Bash scripts:'
@echo
@echo ' $$ virtualenv ve'
@echo ' $$ . ve/bin/activate'
@echo ' $$ ./setup.py sdist'
@echo ' $$ pip3 install dist/data-tools-0.1.0.tar.gz'
@echo
@echo 'To install C tools:'
@echo
@echo ' $$ make install.c'
@echo
@echo 'To install man pages:'
@echo
@echo ' $$ make install.man'
@echo
.PHONY: all
all: install
output:
mkdir -p $@
output/%:
mkdir -p $@
.PHONY: test.check_tsv
test.check_tsv:
./data_tools/check-tsv test/check_tsv/input.good.tsv
! ./data_tools/check-tsv test/check_tsv/input.bad.tsv
.PHONY: test.convert_date
test.convert_date: ve
$(ve) && cat test/convert_date/input.txt | ./data_tools/convert_date.py -i %s \
| ./data_tools/convert_date.py -i %Y-%m-%dT%H:%M:%S -o %s \
| diff - test/convert_date/input.txt
$(ve) && cat test/convert_date/input.txt | ./data_tools/convert_date.py -i %s \
| ./data_tools/convert_date.py -o %s \
| diff - test/convert_date/input.txt
.PHONY: test.counting_sort
test.counting_sort: counting_sort/input.txt ve | output/counting_sort
$(ve) && ./data_tools/counting_sort.py $< > output/counting_sort/output.txt
sort $< > output/counting_sort/expected.output.txt
diff output/counting_sort/output.txt output/counting_sort/expected.output.txt
.PHONY: test.csv_to_json
test.csv_to_json: csv_to_json/test.csv ve | output/csv_to_json
$(ve) && ./data_tools/csv_to_json.py $< > output/csv_to_json/test.csv_to_json.json
$(ve) && echo $$'λ,two\nthree,four' | ./data_tools/csv_to_json.py > output/csv_to_json/unicode.json
$(ve) && echo $$'λ,two\nthree,four' \
| ./data_tools/csv_to_json.py --header=first,second > output/csv_to_json/unicode2.json
.PHONY: test.csv_to_tab
test.csv_to_tab: | csv-to-tab output/csv_to_tab
echo -n $$'one,two\nthree,four' | ./src/csv-to-tab/csv-to-tab > output/csv_to_tab/test.csv_to_tab.tab
diff test/csv_to_tab/expected.tab output/csv_to_tab/test.csv_to_tab.tab
echo $$'λ,two\nthree,four' | ./src/csv-to-tab/csv-to-tab > output/csv_to_tab/unicode.tab
diff test/csv_to_tab/expected.unicode.tab output/csv_to_tab/unicode.tab
echo -n $$'one,two\ttwo\nthree,four' | ./src/csv-to-tab/csv-to-tab --escape > output/csv_to_tab/test.csv_to_tab.escape.tab
diff test/csv_to_tab/expected.escape.tab output/csv_to_tab/test.csv_to_tab.escape.tab
.PHONY: test.csv_to_xlsx
test.csv_to_xlsx: ve | output/csv_to_xlsx
$(ve) && ./data_tools/csv_to_xlsx.py -o output/csv_to_xlsx/output.xlsx \
test/csv_files/no-header.csv \
test/csv_files/unicode.csv
.PHONY: test.date_fill
test.date_fill: ve | output/date_fill
$(ve) && ./data_tools/date_fill.py --date-column=0 --format=%Y-%m-%dT%H -i test/date_fill/input.tsv \
> output/date_fill/output.tsv
diff output/date_fill/output.tsv test/date_fill/expected.output.tsv
.PHONY: test.highlight
test.highlight: highlight/input.txt ve | output/highlight
$(ve) && ./data_tools/highlight.py control < $< > output/highlight/output1.txt
diff test/highlight/expected.output.txt output/highlight/output1.txt
$(ve) && ./data_tools/highlight.py control $< > output/highlight/output2.txt
diff test/highlight/expected.output.txt output/highlight/output2.txt
$(ve) && ./data_tools/highlight.py -r control < $< > output/highlight/output3.txt
diff test/highlight/expected.output.txt output/highlight/output3.txt
$(ve) && ./data_tools/highlight.py -r control $< > output/highlight/output4.txt
diff test/highlight/expected.output.txt output/highlight/output4.txt
.PHONY: test.html_table_to_csv
test.html_table_to_csv: ve | output/html_table_to_csv
$(ve) && ./data_tools/html_table_to_csv.py \
< test/html_table_to_csv/test.html \
> output/html_table_to_csv/output.test.csv
diff output/html_table_to_csv/output.test.csv test/html_table_to_csv/expected.test.csv
.PHONY: test.join_tsv
test.join_tsv: ve | output/join_tsv
$(ve) && ./data_tools/join_tsv.py --column=url \
test/join_tsv/input1.tsv \
test/join_tsv/input2.tsv \
> output/join_tsv/output.tsv
diff test/join_tsv/expected.output.tsv output/join_tsv/output.tsv
#
$(ve) && ./data_tools/join_tsv.py --column=url \
test/join_tsv/input1.null.tsv \
test/join_tsv/input2.null.tsv \
> output/join_tsv/output.null.tsv
diff test/join_tsv/expected.output.tsv output/join_tsv/output.null.tsv
#
$(ve) && ./data_tools/join_tsv.py --column=url --left \
test/join_tsv/input1.left.tsv \
test/join_tsv/input2.left.tsv \
> output/join_tsv/output.left.tsv
diff test/join_tsv/expected.output.left.tsv output/join_tsv/output.left.tsv
#
$(ve) && ./data_tools/join_tsv.py --column=url --left \
test/join_tsv/input2.left.tsv \
test/join_tsv/input1.left.tsv \
> output/join_tsv/output.left.tsv
diff test/join_tsv/expected.output.left2.tsv output/join_tsv/output.left.tsv
#
$(ve) && ./data_tools/join_tsv.py --column=url --right \
test/join_tsv/input2.left.tsv \
test/join_tsv/input1.left.tsv \
> output/join_tsv/output.right.tsv
diff test/join_tsv/expected.output.right.tsv output/join_tsv/output.right.tsv
#
$(ve) && ./data_tools/join_tsv.py --column=url --null=NULL \
test/join_tsv/input1.NULL_VALUE.tsv \
test/join_tsv/input2.NULL_VALUE.tsv \
> output/join_tsv/output.NULL_VALUE.tsv
diff test/join_tsv/expected.output.NULL_VALUE.tsv output/join_tsv/output.NULL_VALUE.tsv
#
$(ve) && ./data_tools/join_tsv.py --left-column=url1 --right-column=url2 \
test/join_tsv/input1.diff.tsv \
test/join_tsv/input2.diff.tsv \
> output/join_tsv/output.diff.tsv
diff test/join_tsv/expected.output.diff.tsv output/join_tsv/output.diff.tsv
.PHONY: test.json_diff
test.json_diff: | output/json_diff
-./data_tools/json-diff test/json_diff/1a.json test/json_diff/1b.json > output/json_diff/output1.txt
diff -w test/json_diff/expected.output1.txt output/json_diff/output1.txt
-./data_tools/json-diff test/json_diff/2a.json test/json_diff/2b.json > output/json_diff/output2.txt
diff -w test/json_diff/expected.output2.txt output/json_diff/output2.txt
.PHONY: test.normalize_utf8
test.normalize_utf8: normalize_utf8/input.txt ve | output/normalize_utf8
$(ve) && ./data_tools/normalize_utf8.py < $< > output/normalize_utf8/output.nfc.txt
diff test/normalize_utf8/expected.output.nfc.txt output/normalize_utf8/output.nfc.txt
$(ve) && ./data_tools/normalize_utf8.py $< > output/normalize_utf8/output.nfc.2.txt
diff test/normalize_utf8/expected.output.nfc.txt output/normalize_utf8/output.nfc.2.txt
$(ve) && ./data_tools/normalize_utf8.py --nfd < $< > output/normalize_utf8/output.nfd.txt
diff test/normalize_utf8/expected.output.nfd.txt output/normalize_utf8/output.nfd.txt
.PHONY: test.reservoir_sample
test.reservoir_sample: reservoir_sample/input.txt ve | output/reservoir_sample
$(ve) && ./data_tools/reservoir_sample.py -r 17 -s 3 < $< > output/reservoir_sample/output.txt
diff test/reservoir_sample/expected.output.txt output/reservoir_sample/output.txt
.PHONY: test.tsv_header
test.tsv_header: | output/tsv_header
./data_tools/tsv-header test/tsv_header/input.tsv > output/tsv_header/output.txt
diff test/tsv_header/expected.output.txt output/tsv_header/output.txt
.PHONY: test.trim_tsv
test.trim_tsv: ve | output/trim_tsv
$(ve) && echo -n $$' one \t two \n three \t four' \
| ./data_tools/trim_tsv.py > output/trim_tsv/trim_tsv.tsv
diff test/trim_tsv/expected.trim_tsv.tsv output/trim_tsv/trim_tsv.tsv
$(ve) && ./data_tools/trim_tsv.py test/trim_tsv/input.tsv > output/trim_tsv/output2.tsv
diff test/trim_tsv/expected.trim_tsv.tsv output/trim_tsv/output2.tsv
#.PHONY: test.tab_to_csv
#test.tab_to_csv: tab_to_csv/escapes.tsv | tab-to-csv csv-to-tab output/tab_to_csv
# ./data_tools/tab-to-csv/tab-to-csv -u $< | ./data_tools/csv-to-tab/csv-to-tab -e > output/tab_to_csv/escape.tsv
# diff $< output/tab_to_csv/escape.tsv
.PHONY: test.tsv_to_json
test.tsv_to_json: tsv_to_json/test.tsv ve | output/tsv_to_json
$(ve) && ./data_tools/tsv_to_json.py $< > output/tsv_to_json/test.tsv_to_json.json
.PHONY: test.xlsx_to_csv
test.xlsx_to_csv: xlsx_to_csv/test.xlsx ve | output/xlsx_to_csv
$(ve) && ./data_tools/xlsx_to_csv.py --list $< > output/xlsx_to_csv/list.out
$(ve) && ./data_tools/xlsx_to_csv.py --sheet=three_rows_three_cols $< output/xlsx_to_csv/3r3c.csv
$(ve) && ./data_tools/xlsx_to_csv.py --sheet=unicode $< output/xlsx_to_csv/unicode.csv
$(ve) && ./data_tools/xlsx_to_csv.py --sheet=spaces $< output/xlsx_to_csv/spaces.csv
$(ve) && ./data_tools/xlsx_to_csv.py --sheet=dates $< output/xlsx_to_csv/dates.csv
diff output/xlsx_to_csv/list.out test/xlsx_to_csv/expected.list.out
diff output/xlsx_to_csv/3r3c.csv test/xlsx_to_csv/expected.3r3c.csv
diff output/xlsx_to_csv/unicode.csv test/xlsx_to_csv/expected.unicode.csv
diff output/xlsx_to_csv/spaces.csv test/xlsx_to_csv/expected.spaces.csv
diff output/xlsx_to_csv/dates.csv test/xlsx_to_csv/expected.dates.csv
.PHONY: test.yaml_to_json
test.yaml_to_json: yaml_to_json/input.yaml ve | output/yaml_to_json
$(ve) && ./data_tools/yaml_to_json.py $< > output/yaml_to_json/output1.json
$(ve) && ./data_tools/yaml_to_json.py < $< > output/yaml_to_json/output2.json
python_base := convert_date counting_sort csv_to_json csv_to_tab
python_base += csv_to_xlsx date_fill highlight html_table_to_csv join_tsv
python_base += normalize_utf8 reservoir_sample trim_tsv tsv_to_json
python_base += xlsx_to_csv yaml_to_json
python_tests := $(patsubst %,test.%,$(python_base))
.PHONY: test.c
test.c:
cd src/csv-to-tab && make test
cd src/json-pluck && make test
cd src/tab-to-csv && make test
cd src/utf8-script && make test
.PHONY: test.python
test.python: $(python_tests)
.PHONY: test.shell
test.shell: test.check_tsv test.json_diff test.tsv_header
.PHONY: test
test: test.python test.shell
.PHONY: pep8
pep8: ve
. ./ve/bin/activate && find data_tools -name '*.py' \
| xargs pep8 --max-line-length=100
.PHONY: pylint
pylint: ve
. ./ve/bin/activate && find data_tools -name '*.py' \
| xargs pylint --rcfile .pylintrc --disable=missing-docstring
shell_scripts := $(shell grep -l '/usr/bin/env bash' data_tools/* 2> /dev/null)
.PHONY: shellcheck
shellcheck:
echo $(shell_scripts) | xargs shellcheck
.PHONY: check.c
check.c:
cd src/csv-to-tab && make check
cd src/json-pluck && make check
cd src/tab-to-csv && make check
cd src/utf8-script && make check
.PHONY: check
check: pylint pep8 shellcheck check.c test
.PHONY: clean.test
clean.test:
-rm -rf output
cd src/csv-to-tab && make $@
cd src/json-pluck && make $@
cd src/tab-to-csv && make $@
cd src/utf8-script && make $@
.PHONY: clean.build
clean.build:
rm -rf ve
-find . -name '*.pyc' | xargs rm
cd src/csv-to-tab && make $@
cd src/json-pluck && make $@
cd src/tab-to-csv && make $@
cd src/utf8-script && make $@
.PHONY: clean.generate
clean.generate:
cd src/utf8-script && make $@
.PHONY: clean
clean: clean.test
================================================
FILE: README.md
================================================
[summary](#summary) | [setup](#setup) | [how to run](#how-to-run)
[.txt](#txt) | [.tsv](#tsv) | [.tab](#tab) | [.csv](#csv) | [.xlsx](#xlsx) | [.json](#json) | [.yaml](#yaml) | [.html](#html) | [.xml](#xml)
[plain text](#plaintext) | [encodings](#encodings) | [newlines](#newlines) | [relational formats](#relational-fmt) | [joins](#joins) | [keys](#keys) | [hierarchical formats](#hierarchical-fmt)
<a name="summary"/>
# SUMMARY
Command line tools for data extraction, data manipulation, and file format conversion.
check-tsv           verify rows in TSV file are same length
convert-date        convert dates in tabular data using strftime-style formats
counting-sort       sort a file using counting sort
csv-to-json         convert CSV to JSON
csv-to-postgres     import a CSV file into a PostgreSQL table
csv-to-tab          convert CSV to tab delimited
csv-to-xlsx         convert CSV files to XLSX worksheets
date-fill           fill in missing rows in a TSV file with a time series column
date-seq            create a sequence of dates
header-sort         sort file, keeping header in place
highlight           highlight text matching REGEX
html-table-to-csv   extract table content from HTML file as CSV
join-tsv            perform a relational join on two TSV files
json-pluck          convert JSON array to JSON stream
json-diff           show differences between two JSON documents
normalize-utf8      write UTF-8 encoded input to standard out in normalized form
postgres-to-csv     write a PostgreSQL table to stdout in CSV format
reservoir-sample    select N lines from standard input randomly
set-diff            find lines in first file which are not in the second
set-intersect       find lines common to two files
tab-to-csv          convert tab delimited file to CSV
tokenize            extract words from English language text
trim-tsv            trim whitespace from fields of TSV file
tsv-header          show TSV header with ordinal position of each column
tsv-to-json         convert TSV to JSON
utf8-category       tally UTF-8 encoded characters by general category
utf8-script         tally UTF-8 encoded characters by script
utf8-viewer         display Unicode points and optionally names of UTF-8 encoded file
xls-to-csv          convert XLS to CSV
xlsx-to-csv         convert XLSX to CSV
yaml-to-json        convert YAML to JSON
The *data tools* are for working with data at the command line. They are meant to complement the tools you already have. Use `help SUBCOMMAND` to see the help page for a *data tool* or browse the help pages on [GitHub](https://github.com/clarkgrubb/data-tools/tree/master/doc).
Command line tools are composable when the output of one command can be the input of another. The output can be redirected to a file whose path is passed as an argument, or the commands can be connected by a shell pipe. Use of a pipe is *tacit programming*: it relieves the programmer of the need to name a file. Furthermore the byte stream is private to the commands on either side of the pipe.
Only tools which read from standard input or write to standard output can participate in a pipeline. Tools in a pipeline must agree on the *format* of the data in the byte stream. The *data tools* support these formats: `.txt`, `.tsv`, `.tab`, `.csv`, `.xls`, `.xlsx`, `.json`, `.yaml`, `.html`, and `.xml`. Some of the *data tools* are *format conversion tools* to be used to convert from one format to another.
<a name="setup"/>
# SETUP
`python3`, `pip3`, `virtualenv`, and `gcc` are required.
To install Python and Bash scripts in a virtual environment:
$ virtualenv ve
$ . ve/bin/activate
$ git clone git@github.com:clarkgrubb/data-tools.git
$ cd data-tools
$ pip3 install -r requirements.txt
$ ./setup.py sdist
$ pip3 install dist/data_tools-0.1.0.tar.gz
To install `utf8-viewer`:
$ git clone git@github.com:clarkgrubb/utf8-viewer.git
$ cd utf8-viewer
$ make install
To install C tools:
$ make install.c
To install man pages:
$ make install.man
<a name="how-to-run"/>
# HOW TO RUN
check-tsv [TSV_FILE]
convert-date [-i FMT] [-o FMT] [-c COLUMN] [-H]
counting-sort [FILE]
csv-to-json [-d DELIMITER] [-q QUOTECHAR] [CSV_FILE]
csv-to-postgres -f CSV_PATH -t TABLE [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]
csv-to-tab [-e|-x|-r] [CSV_FILE]
csv-to-xlsx -o XLSX_FILE CSV_FILE ...
date-fill --date-column NUM --format FORMAT
date-seq [--format=FMT] [--weekdays=DAY[,DAY]...] YYYY[MM[DD[HH]]] YYYY[MM[DD[HH]]]
header-sort [OPTIONS] FILE
highlight REGEX [FILE]
highlight (--red|--green|--yellow|--blue|--magenta|--cyan)=REGEX ... [FILE]
html-table-to-csv [-t TABLE_NUM] [FILE]
join-tsv -c NAME [-l|-r|-f] [-n VALUE] TSV_FILE1 TSV_FILE2
json-pluck < FILE
json-diff [DIFF_OPTIONS] JSON_FILE1 JSON_FILE2
normalize-utf8 [--nfc|--nfd|--nfkc|--nfkd] [FILE]
postgres-to-csv -t TABLE [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]
reservoir-sample [-r SEED] -s NUM [FILE]
set-diff FILE1 FILE2
set-intersect FILE1 FILE2
tab-to-csv [-u] [TAB_DELIMITED_FILE]
tokenize [-n]
trim-tsv [TSV_FILE]
tsv-header [TSV_FILE]
tsv-to-json [TSV_FILE]
utf8-category [-l|--long-names] [-c|--count-ascii|-s|--skip-ascii]
utf8-script [-c|--count-ascii|-s|--skip-ascii]
utf8-viewer [-b|-c|-n] [-w NUM] [FILE]
utf8-viewer [-b|-c|-n] -a BYTE ...
xls-to-csv <same as xlsx-to-csv>
xlsx-to-csv [--date-format=DATE_FMT] XLSX_FILE DIRECTORY
xlsx-to-csv [--date-format=DATE_FMT] --sheet=SHEET XLSX_FILE [OUTPUT_FILE]
xlsx-to-csv --list XLSX_FILE
yaml-to-json [FILE]
<a name="plaintext"/><a name="txt"/>
# PLAIN TEXT
*Plain text* is a sequence of bytes which use an encoding to represent printable characters.
If no other suffix is appropriate, `.txt` is a reasonable suffix for a plain text file.
In plain text, control characters other than for line endings and perhaps tabs are disallowed. Long lines are discouraged.
<a name="encodings"/>
# ENCODINGS
[iconv](#iconv) | [bad bytes](#bad-bytes) | [utf-8](#utf-8) | [utf-16](#utf-16) | [unicode](#unicode)
<a name="iconv"/>
## iconv
The *data tools* expect and produce UTF-8 encoded data. 8-bit encoded ASCII is valid UTF-8.
Use `iconv` to convert a file in a different encoding:
$ echo あ | iconv -f UTF-8 -t UTF-16 > hiragana.utf16.txt
$ iconv -t UTF-8 -f UTF-16 hiragana.utf16.txt
To get a list of supported encodings:
$ iconv -l
<a name="bad-bytes"/>
## bad bytes
Not all sequences of bytes are valid UTF-8; the *data tools* throw exceptions when invalid bytes are encountered. A drastic way to deal with the problem is to strip the invalid bytes:
$ printf 'bad bytes: \ud835\n' | iconv -c -f UTF-8 -t UTF-8
This command strips all non-ASCII characters:
$ echo 'αλφα alpha' | iconv -cs -f UTF-8 -t ASCII
`utf8-viewer` will render invalid UTF-8 bytes with black squares. The black square is itself a Unicode character (U+25A0), so there is ambiguity. The Unicode points are displayed next to the rendered characters, however, and the point will be ---- for invalid characters.
$ utf8-viewer /bin/ls
When a file is in an unknown encoding, one can inspect it byte-by-byte.
`od -b` displays the bytes in octal:
$ od -b /bin/ls
`od -b` is an unequivocal way to look at the data. It removes the confusion caused by the character encoding assumed by the display. On the other hand it is difficult to make sense of octal bytes.
If some of the bytes in a file are ASCII, such as when the encoding is one of the many 8-bit extensions of ASCII, then `od -c` will display the file in an unequivocal yet easier-to-interpret way:
$ ruby -e '(0..255).each { |i| print i.chr }' | iconv -f mac -t utf8 | od -c
`od -c` uses C backslash sequences or octal bytes for non-ASCII and non-printing ASCII characters.
`xxd` displays the data in rows of 16 bytes. Each row is displayed in 3 columns. The first column is the hex offset of the first byte in the row, the second column is the bytes in hex, and third column is the ASCII characters for the bytes, with a period `.` standing in for control characters and upper 8-bit bytes. The `-c` flag changes the number of bytes per row:
$ xxd /bin/ls
$ xxd -c 32 /bin/ls
The `-i` flag will convert the data to a C source literal. The `-r` flag will convert the output of `xxd` back to the original binary format:
$ xxd -i /bin/ls
$ xxd /bin/ls | xxd -r
The `-s` flag and the `-l` flag specify the start byte and the total number of bytes to display:
$ xxd -s 10 -l 20 /bin/ls
Another way to pick out bytes from a file is `dd`:
$ dd bs=1 iseek=10 count=20 if=/etc/passwd 2> /dev/null
`cat -te` uses a unique escape sequence for each byte, but unlike `od`, it does not display
a fixed number of bytes per line; the mapping from input to output is not injective. Still, since it doesn't introduce line breaks at regular intervals, it may be easier to interpret. An example:
$ ruby -e '(0..255).each { |i| print i.chr }' | iconv -f mac -t utf8 | cat -te
`cat -t` renders printable ASCII and newlines; it uses `^` notation for other control characters. Some versions of `cat -t`
use Emacs style `M-X` notation for upper 8-bit bytes. In this case, `X` will be what `cat -t` would have used to render
the character if the upper bit were zero, with the exception of `^J` being used for newline.
The Ruby interpreter can be pressed into service as a tool for performing base conversion:
$ ruby -e 'puts "316".to_i(8).to_s(16)'
ce
The `bc` calculator can also be used:
$ echo $'obase=16\n\nibase=8\n316' | bc
CE
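Python works just as well; `0o316` is an octal integer literal, and `format` with `'x'` renders it in hexadecimal:
$ python3 -c 'print(format(0o316, "x"))'
ce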
<a name="utf-8"/>
## utf-8
The `utf8-viewer` *data tool* provides an easy way to determine the Unicode points of a sequence of UTF-8 bytes.
$ utf8-viewer foo.txt
If you want to see the character for a Unicode point, use `printf`:
$ printf '\u03bb\n'
`zsh` and recent versions of `bash` honor the `\u` backslash escape sequence in `$' '` strings:
$ echo $'\u03bb'
If you have access to `python` or `ruby`:
$ python -c 'print(u"\u03bb")'
$ ruby -e 'puts "\u03bb"'
The *data tools* provide `utf8-category` and `utf8-script`, which summarize the characters by general category and script:
$ echo 'It is greater than ∞!' | utf8-category -l
1 Control
14 Lowercase_Letter
1 Uppercase_Letter
5 Other_Punctuation
1 Math_Symbol
$ echo 'αλφα βητα foo bar' | utf8-script
4 Common
8 Greek
6 Latin
Both tools have `-c` and `-s` flags for counting ASCII characters separately or omitting them from the tally entirely.
<a name="utf-16"/>
## utf-16
Unicode points above `U+FFFF` are represented in UTF-16 by a pair of 16-bit characters called _surrogates_:
$ echo -n 𝒷 | iconv -f utf-8 -t utf-16 | xxd
0000000: feff d835 dcb7
The first 16-bit character is the byte order mark (BOM). The second 16-bit character is the _high surrogate_, and the third 16-bit character is the _low surrogate_. A high surrogate is in the range 0xD800 to 0xDBFF, and a low surrogate is in the range 0xDC00 to 0xDFFF.
In some programming languages, one must use surrogates to insert a point from the supplementary planes in a string literal:
$ scala
scala> "\ud835\udcb7"
res0: String = 𝒷
Here is Python code describing the conversion from surrogates to code point and back:
def to_high_and_low_surrogate(n):
    assert(0xFFFF < n <= 0x10FFFF)
    high = 0xD800 + (((n - 0x10000) >> 10) & 0x3FF)
    low = 0xDC00 + ((n - 0x10000) & 0x3FF)
    return high, low

def to_code_point(high, low):
    assert(0xD800 <= high < 0xDC00)
    assert(0xDC00 <= low <= 0xDFFF)
    return (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
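As a quick sanity check, the formulas round-trip the point U+1D4B7 (𝒷) to the surrogate pair seen in the `xxd` output above:

high, low = to_high_and_low_surrogate(0x1D4B7)
assert (high, low) == (0xD835, 0xDCB7)
assert to_code_point(high, low) == 0x1D4B7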
<a name="unicode"/>
## unicode
How to look up a Unicode point:
$ curl ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt > /tmp/UnicodeData.txt
$ awk -F';' '$1 == "03BB"' /tmp/UnicodeData.txt
03BB;GREEK SMALL LETTER LAMDA;Ll;0;L;;;;;N;GREEK SMALL LETTER LAMBDA;;039B;;039B
`UnicodeData.txt` is a useful file and possibly deserves a dedicated path on the file system.
The first three fields are "Point", "Name", and "[General Category](http://www.unicode.org/reports/tr44/#General_Category_Values)".
Unicode contains all the characters one is likely to need, but writing code which handles the entire Unicode character
set correctly is sometimes impractical. One might opt to reject characters which are not needed instead. A character
frequency table such as used when breaking ciphers is useful in this context:
$ cat /etc/passwd | ruby -ne '$_.split("").each { |ch| puts "#{ch} #{ch.ord}" }' | sort | uniq -c | sort -nr
Unicode contains different character sequences which are
rendered the same way. An example is SMALL LETTER C WITH CEDILLA `ç`,
which can be represented as a single character: U+00E7 or as SMALL LETTER C
followed by COMBINING CEDILLA: U+0063 U+0327.
When performing a string comparison, the two sequences should often
be regarded as identical. The easiest way to accomplish this is to put
the strings to be compared into a normalized form. The Unicode standard defines
[four normal forms](http://unicode.org/reports/tr15/). The *data tool* `normalize-utf8` can be used to put a UTF-8 encoded file or stream into any of them.
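In Python, for example, the standard library's `unicodedata` module implements all four normal forms; a minimal check of the `ç` example above:

import unicodedata

precomposed = '\u00e7'   # SMALL LETTER C WITH CEDILLA as a single code point
decomposed = 'c\u0327'   # SMALL LETTER C followed by COMBINING CEDILLA
assert precomposed != decomposed
assert unicodedata.normalize('NFC', decomposed) == precomposed
assert unicodedata.normalize('NFD', precomposed) == decomposed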
<a name="newlines"/>
# NEWLINES
[eol markers](#eol-markers) | [set operations](#set-op) | [highlighting](#highlighting) | [sequences](#seq) | [sampling](#sampling)
<a name="eol-markers"/>
## eol markers
The *data tools* interpret LF, CRLF, or CR as end-of-line markers in input. The *data tools* use LF as the end-of-line marker in output. To convert LF line endings to CRLF or CR line endings:
$ sed 's/$'"/$(echo \\\r)/"
$ tr '\n' '\r'
To convert CRLF or CR line endings to LF line endings:
$ tr -d '\r'
$ tr '\r' '\n'
For conversions in either direction, another option is the `dos2unix` and `unix2dos` tools, which might need to be installed; check whether your package manager has a `dos2unix` package. These tools take paths as arguments and modify the files in place:
dos2unix
unix2dos
The Unicode Consortium provides a [complete list](http://www.unicode.org/standard/reports/tr13/tr13-5.html) of Unicode characters that might be treated as EOL markers. In a line-delimited file format these characters should be escaped or removed.
The terminal wraps long lines without any indication that it has done so. The `cut` command and the environment variable `COLUMNS` can be used to truncate long lines instead:
$ cut -c 1-$COLUMNS FILE
$ cut -c $(( $COLUMNS + 1 ))-$(( 2 * $COLUMNS )) FILE
<a name="set-op"/>
## set operations
*Data tools* are provided for finding the lines which two files share in common, or which are exclusive to the first file:
$ set-intersect FILE1 FILE2
$ set-diff FILE1 FILE2
The `cat` command can be used to find the union of two files, with an optional `sort -u` to remove duplicate lines:
$ cat FILE1 FILE2 | sort -u
<a name="highlighting"/>
## highlighting
When inspecting files at the command line, `grep` and `less` are invaluable. `grep` can highlight the search pattern in red:
$ grep --color=always root /etc/passwd
The `highlight` command does the same thing, except that it also prints lines which don't match
the pattern. Also it supports multiple patterns, each with its own color:
$ highlight --red root --green daemon --blue /bin/bash /etc/passwd
Both `grep` and `highlight` use [ANSI Escapes](http://www.ecma-international.org/publications/standards/Ecma-048.htm). If you are paging through the output, use `less -R` to render the escape sequences correctly.
<a name="seq"/>
## sequences
The `seq` command can generate a newline delimited arithmetic sequence:
$ seq 1 3
1
2
3
Zero-padded:
$ seq -w 08 11
08
09
10
11
Step values other than one:
$ seq 1 .5 2
1
1.5
2
The `seq` is useful in conjunction with a shell `for` loop. This will create a hundred empty files:
$ for i in $(seq -w 1 100); do touch foo.$i; done
It is also useful at times to be able to iterate through a sequence of dates. The *data tools* provide `date-seq` for this. For example, suppose that you wanted to fetch a set of URLs which contained a date:
$ for date in $(date-seq --format='%Y/%m/%d' 20130101 20130131)
> do mkdir -p $date
> curl "http://blog.foo.com/${date}" > ${date}/index.html
> done
`date-seq` can iterate though years, months, days, hours, minutes, or seconds. When iterating through days, the `--weekdays` flag can be used to specify days of the week. See the [man page](https://github.com/clarkgrubb/data-tools/blob/master/doc/date-seq.1.md) for details.
<a name="sampling"/>
## sampling
It is desirable at times to take a random sample of lines from a file. Simply taking the first *N* lines often does not yield a representative sample. Instead one should shuffle the file first:
$ sort -R foo.txt | head -3
On large files, randomly shuffling a file is slow. Also, the `sort` installed on Mac OS X does not have the `-R` flag. One can use `awk` to select a random percentage of lines from a file:
$ awk 'rand() < 0.01' foo.txt
This is faster than shuffling the file, but does not produce a precise sample size, even if you know the number of lines in the file.
An efficient and unbiased way to select an exact number of lines from a file is to use reservoir sampling. The *data tool* `reservoir-sample` implements it:
$ reservoir-sample --size 3 < /etc/passwd
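The underlying idea (Algorithm R) fits in a few lines of Python; this is a sketch of the technique, not the repo's implementation:

import random
import sys

def reservoir_sample(lines, size, seed=None):
    rng = random.Random(seed)
    reservoir = []
    for i, line in enumerate(lines):
        if i < size:
            reservoir.append(line)   # fill the reservoir with the first `size` lines
        else:
            j = rng.randint(0, i)    # line i+1 survives with probability size/(i+1)
            if j < size:
                reservoir[j] = line
    return reservoir

if __name__ == '__main__':
    sys.stdout.writelines(reservoir_sample(sys.stdin, 3))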
<a name="relational-fmt"/>
# RELATIONAL FORMATS
[tsv](#tsv) | [csv](#csv) | [xlsx](#xlsx)
Much that can be done with a SQL SELECT statement in a database can also be done with `awk`, `sort`, and `join`.
Relational data can be stored in flat files in a variety of ways. On Unix, the `/etc/passwd` file stores records one per line, with colons (:) separating the seven fields. We can use `awk` to query the file.
Get the root entry from `/etc/passwd`:
$ awk -F: '$1 == "root"' /etc/passwd
Count the number of users by their login shell:
$ awk -F: '{cnt[$7] += 1} END {for (sh in cnt) print sh, cnt[sh]}' /etc/passwd
The `/etc/passwd` file format, though venerable, has an ad hoc flavor. In the following sections we consider three formats which are widely used for relational data.
<a name="tsv"/>
## tsv
The IANA, which is responsible for registering MIME types, has a [specification for TSV](http://www.iana.org/assignments/media-types/text/tab-separated-values). Records are newline delimited and fields are tab-delimited. There is no mechanism for escaping or quoting tabs and newlines. Despite this limitation, we prefer to convert the other formats to TSV because `awk`, `sort`, and `join` cannot easily manipulate the other formats. By default Hadoop uses tabs as a field separator.
Trailing spaces in fields can be hidden by tabs, causing joins to mysteriously fail. `cat -te` can be used to expose trailing spaces. The *data tool* `trim-tsv` can be used to clean up a TSV file.
The fact that tabs are visually identical to spaces means that in many applications they *can* be replaced by spaces. This makes tabs available for delimiting fields. One could use a non-printing character, but most applications do not display non-printing characters well: inspecting the data is harder.
Here is how to align the columns of a tab delimited file:
$ tr ':' '\t' < /etc/passwd | column -t -s $'\t'
The default field separator for `awk` is whitespace. The correct way to use `awk` on a TSV is like this:
$ awk 'BEGIN {FS="\t"; OFS="\t"} ...'
The IANA spec says that a TSV file must have a header. Self-describing data is a good practice. On the other hand the header is at times inconvenient—when sorting the file, for example. The repo provides the `header-sort` command to sort a file while keeping the header in place. When we must remove the header, we label the file with a `.tab` suffix instead of a `.tsv` suffix.
Even if a file has a header, `awk` scripts must refer to columns by number instead of name. The following code displays the header names with their numbers:
$ head -1 foo.tsv | tr '\t' '\n' | nl
Python and similar languages have a `split` method which is ideal for parsing a TSV file:
with open(path) as f:
    header = f.readline().rstrip('\r\n').split('\t')
    for line in f:
        fields = line.rstrip('\r\n').split('\t')
        ...
CSV libraries are sometimes used to read TSV files. This works when the delimiter can be changed from a comma to a tab. The practice is incorrect if the library does not also allow the quote character to be set to none.
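In Python, for example, the `csv` module can read a TSV file correctly because quoting can be disabled entirely (`foo.tsv` below is a placeholder path):

import csv

with open('foo.tsv', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        print(row)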
The `join` method in Python and similar languages can be used to generate a TSV file. Remember to check for prohibited characters in the data:
import re

RX_PROHIBITED = re.compile(u'[\f\n\r\t\v\x85\u2028\u2029]')

def tsv_replace(field, replace_char=' '):
    return RX_PROHIBITED.sub(replace_char, field)

with open(path, 'w') as f:
    for row in rows:
        f.write(u'\t'.join([tsv_replace(field) for field in row]))
        f.write(u'\n')
<a name="csv"/>
## csv
The CSV format is described in [RFC 4180](http://www.ietf.org/rfc/rfc4180.txt).
Note that CSV files do not necessarily have headers. This is perhaps because CSV files are an export format for spreadsheets.
RFC 4180 defines the EOL marker as CRLF. The *data tools* use LF as the EOL marker, however. If you want to conform to the spec, run the output through `unix2dos`. Also note that the final CRLF is optional.
CSV provides a mechanism for quoting commas and EOL markers. Double quotes are used, and double quotes themselves are escaped by doubling them.
The *data tools* repo provides utilities for converting between TSV and CSV:
csv-to-tab
tab-to-csv
Converting from CSV to TSV is problematic if the fields contain tabs or newlines. By default `csv-to-tab` will fail if it encounters any. There are flags to tell `csv-to-tab` to strip, backslash escape, replace with space, or replace with space and squeeze. See the [man page](https://github.com/clarkgrubb/data-tools/blob/master/doc/csv-to-tab.1.md).
<a name="xlsx"/>
## xlsx
XLSX is the default format used by Excel since 2007. Other spreadsheet applications can read it.
XLSX is a ZIP archive of mostly XML files. The `unzip -l` command can be used to list the contents of an XLSX file.
Excel provides the ability to export data in a CSV or TSV format. One exports by choosing the format when saving the workbook. The CSV formats all use 8-bit encodings and are not recommended since Excel spreadsheets can contain Unicode data. To export as TSV, look for the "Unicode Text" or "UTF-16 Unicode Text" option. The file suffix will be `.txt`. The character encoding is UTF-16 and can be converted using `iconv`:
$ iconv -f utf-16 -t utf-8 < foo.txt > foo.tsv
Using Excel to export the data requires having Excel, which is not free. Also Excel must be run in a desktop environment and is difficult to automate. The *data tools* include the script `xlsx-to-csv` so the operation can be performed at the command line. To extract the sheets from a workbook as CSV files, run this:
$ xlsx-to-csv WORKBOOK.xlsx OUTPUT_DIR
The directory OUTPUT_DIR will be created and must not already exist.
One can list the sheet names and extract a single sheet to a CSV file:
$ xlsx-to-csv --list WORKBOOK.xlsx
$ xlsx-to-csv --sheet=SHEET WORKBOOK.xlsx SHEET.csv
By default dates are written in `%Y-%m-%dT%H:%M:%S` format. This can be changed using the `--date-format` flag. See `man strftime` for instructions on how to specify a date format.
The tool `xls-to-csv` is available for converting the older (pre 2007) Excel spreadsheet to CSV. It has the same interface as `xlsx-to-csv`.
The tool `csv-to-xlsx` is available for creating XLSX workbooks. Each CSV file on the command line becomes a worksheet in the workbook. The worksheet names are derived from the CSV file names; see the man page for details.
Importing UTF-8 encoded data into Excel is not effortless. What I have found to work is to convert the data to a tab delimited `.tab` format, but change the suffix to `.txt` since otherwise Excel will not allow the path to be selected. Then use `File | Import...` and select `Text file`. After the file path is selected, Excel drops the user into a wizard which allows the format of the file to be specified. The default file origin on Macintosh is `Macintosh`, which is an 8-bit encoding. Change it to `Unicode (UTF-8)`. Select `Delimited`. On the second screen, set `Delimiters` to `Tab`, and `Text qualifier`, which controls the quote character, to `{none}`. The optional third screen allows the user to set the date formats of the columns.
<a name="joins"/>
# JOINS
[tab](#join-tab) | [tsv](#join-tsv) | [sqlite](#sqlite) | [postgres](#postgres) | [r](#join-r) | [pandas](#join-pandas) | [hive](#hive) | [spark](#spark)
<a name="join-tab"/>
## tab
To illustrate joining at the command line we create some tab delimited files:
$ grep -v '^#' /etc/passwd | tr ':' '\t' > /tmp/pw.tab
$ grep -v '^#' /etc/group | tr ':' '\t' > /tmp/grp.tab
Here is an example of using `sort` and `join` to join by group id:
$ sort -t $'\t' -k 4,4 /tmp/pw.tab > /tmp/pw.sort.tab
$ sort -t $'\t' -k 3,3 /tmp/grp.tab > /tmp/grp.sort.tab
$ join -t $'\t' -1 4 -2 3 /tmp/pw.sort.tab /tmp/grp.sort.tab
This is tedious because (1) each file must be sorted by the join column, (2) the field delimiter must be specified for each invocation of `sort` and `join`, and (3) the join column index must be determined and specified.
<a name="join-tsv"/>
## tsv
`sort` and `join` don't handle files with headers correctly. Since TSV files have headers, the *data tools* include a `join-tsv` command.
To illustrate using `join-tsv` let's create some TSV files:
$ ( echo $'name\tpw\tuid\tgid\tgecos\thome\tshell'; grep -v '^#' /etc/passwd | tr ':' '\t' ) > /tmp/pw.tsv
$ ( echo $'name\tpw\tgid\tlist'; grep -v '^#' /etc/group | tr ':' '\t' ) > /tmp/grp.tsv
If the join column has the same name in both files, it can be specified with the `-c` or `--column` flag:
$ join-tsv --column=gid /tmp/pw.tsv /tmp/grp.tsv
The output is in TSV format, and in particular it has a header. The order of columns is (1) join column, (2) left file columns other than the join column, (3) right file columns other than the join column. If the join column has different names in the two files, the left name is used in the output.
`join-tsv` reads the smaller of the two files into memory.
`join-tsv` treats an empty string as the null value by default. It can perform left, right, or full outer joins. See the [man page](https://github.com/clarkgrubb/data-tools/blob/master/doc/join-tsv.1.md) for details.
<a name="sqlite"/>
## sqlite
Using SQLite to perform a join:
$ sqlite3
> create table pw ( name text, pw text, uid int, gid int, gecos text, home text, shell text );
> create table grp ( name text, pw text, gid int, list text );
> .separator \t
> .import /tmp/pw.tab pw
> .import /tmp/grp.tab grp
> .mode csv
> .output /tmp/pw_grp.csv
> select * from pw join grp on pw.gid = grp.gid;
There is no way to escape the separator when importing files into SQLite.
<a name="postgres"/>
## postgres
$ tab-to-csv < /tmp/pw.tab > /tmp/pw.csv
$ tab-to-csv < /tmp/grp.tab > /tmp/grp.csv
$ psql
> create table pw ( name text, pw text, uid int, gid int, gecos text, home text, shell text );
> create table grp ( name text, pw text, gid int, list text );
$ ( echo 'copy pw from stdin with (format csv); '; cat /tmp/pw.csv ) | psql
$ ( echo 'copy grp from stdin with (format csv); '; cat /tmp/grp.csv ) | psql
$ psql
> create table pw_grp as select pw.name as pw_name, grp.name as grp_name from pw join grp on pw.gid = grp.gid;
$ echo 'copy pw_grp to stdout with (format csv);' | psql > /tmp/pw_grp.csv
<a name="join-r"/>
## r
Using R to perform a join:
$ /usr/bin/r
> pw = read.delim('/tmp/pw.tsv', quote='')
> grp = read.delim('/tmp/grp.tsv', quote='')
> j = merge(pw, grp, by.x='gid', by.y='gid')
> write.table(j, '/tmp/pw_grp.tsv', row.names=F, sep='\t', quote=F)
<a name="join-pandas"/>
## pandas
Using the Python library *pandas* to perform a join:
$ python
> import pandas as pd
> pw = pd.read_table('/tmp/pw.tsv')
> grp = pd.read_table('/tmp/grp.tsv')
> j = pd.merge(pw, grp, left_on='gid', right_on='gid')
> j.to_csv('/tmp/pw_grp.tsv', sep='\t', index=False)
<a name="hive"/>
## hive
[Hive functions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-explode)
The native format of Hive is to use `^A` (`%x01`) as field delimiters and newlines as row delimiters.
$ hive -hiveconf mapred.job.tracker=local -hiveconf fs.default.name=file:///tmp -hiveconf hive.metastore.warehouse.dir=file:///tmp/test_hive
> create table passwd ( user string, passwd string, uid int, gid int, gecos string, home string, shell string ) row format delimited fields terminated by '\t' stored as textfile;
> load data local inpath '/tmp/pw.tab' overwrite into table passwd;
> create table group ( group string, passwd string, gid int, members string ) row format delimited fields terminated by '\t' stored as textfile;
> load data local inpath '/tmp/grp.tab' overwrite into table group;
> insert overwrite local directory '/tmp/pw_grp' row format delimited fields terminated by '\t' select * from passwd p join group g on p.gid = g.gid;
Suppose that we have data in a file which is not in first normal form. Here the 4th column contains the names of the children, separated by pipes:
$ cat /tmp/families.tab
Simpson Homer Marge Bart|Lisa|Maggie
Cleaver Ward June Wally|Beaver
We can use Hive to normalize the data:
> create table families (surname string, father string, mother string, children string) row format delimited fields terminated by '\t' stored as textfile;
> load data local inpath '/tmp/families.tab' overwrite into table families;
> create table families2 as select surname, father, mother, split(children, '\\|') as children from families;
> select surname, father, mother, child from families2 lateral view explode(children) foo as child;
The result is the table:
Simpson Homer Marge Bart
Simpson Homer Marge Lisa
Simpson Homer Marge Maggie
Cleaver Ward June Wally
Cleaver Ward June Beaver
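For comparison, the same normalization is a short Python script; a sketch which reads the tab-delimited file on standard input:

import sys

for line in sys.stdin:
    surname, father, mother, children = line.rstrip('\n').split('\t')
    for child in children.split('|'):
        print('\t'.join([surname, father, mother, child]))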
<a name="spark"/>
## spark
$ spark-shell
> val pw = sc.textFile("/etc/passwd").filter(line => line(0) != '#').map(line => line.split(":"))
> val grp = sc.textFile("/etc/group").filter(line => line(0) != '#').map(line => line.split(":"))
> val pw_gid = pw.map(row => (row(3), row))
> val grp_gid = grp.map(row => (row(2), row))
> val j = pw_gid.join(grp_gid).map(tup => List(tup._1) ++ tup._2._1 ++ tup._2._2)
> j.map(row => row.mkString("\t")).saveAsTextFile("/tmp/pw_grp")
<a name="keys"/>
# KEYS
A *candidate key* is a minimal set of columns which can be used to uniquely identify rows. A primary key is a candidate key, and other candidate keys can be declared using a uniqueness constraint. When a candidate key is declared the database rejects inserts and updates that would violate the uniqueness constraint.
Candidate keys are a property of the data; they aren't necessarily declared in the schema. To verify a candidate key, one checks whether the number of rows in the table is the same as the number of distinct values of a column or a set of columns:
> SELECT COUNT(*) FROM customers;
> SELECT COUNT(DISTINCT name) FROM customers;
> SELECT COUNT(*) FROM (SELECT DISTINCT first_name, last_name FROM customers) t;
Strictly speaking, one should also verify that no proper subset of the columns is also a candidate key.
Usually a join condition is a test of equality between one or more columns from the left relation and the same number of columns from the right relation.
> SELECT c.name, sum(o.amount) FROM customers c JOIN orders o ON c.id = o.customer_id GROUP BY c.name;
Typically it is an error condition if the join columns for neither the left nor the right relation are a candidate key. Consider the following perverse query:
> SELECT c.name, sum(o.amount) FROM customers c JOIN orders o ON c.name = o.customer_name GROUP BY c.name;
If there were _n_ customers with the same name, then the amount associated with their common name would be _n times_ the sum of their orders. Incidentally, keeping the name of the customer in the orders relation is a violation of second normal form if a unique identifier for the customer is already in the orders table.
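A toy demonstration of this fan-out, using SQLite from Python (the tables and values are made up for illustration):

import sqlite3

db = sqlite3.connect(':memory:')
db.executescript('''
    CREATE TABLE customers (id INTEGER, name TEXT);
    CREATE TABLE orders (customer_id INTEGER, customer_name TEXT, amount INTEGER);
    INSERT INTO customers VALUES (1, 'John Smith'), (2, 'John Smith');
    INSERT INTO orders VALUES (1, 'John Smith', 10), (2, 'John Smith', 20);
''')
row = db.execute('''SELECT c.name, sum(o.amount) FROM customers c
                    JOIN orders o ON c.name = o.customer_name
                    GROUP BY c.name''').fetchone()
print(row)   # ('John Smith', 60): twice the true total of 30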
<a name="hierarchical-fmt"/>
# HIERARCHICAL FORMATS
[json](#json) | [yaml](#yaml) | [html](#html) | [xml](#xml)
<a name="json"/>
## json
[json.org](http://json.org/)
The MongoDB export format is a file of serialized JSON objects, one per line. Whitespace can be added or removed anywhere in a serialized JSON object without changing the data the JSON object represents; the exception is inside strings, where whitespace is significant and newlines must be escaped anyway. Thus it is always possible to write a JSON object on a single line.
It is easier for most clients to process a large data set in the MongoDB export format, e.g.
{"foo": 1}
{"bar": 2}
{"baz": 3}
...
than to process a large JSON array, e.g.
[{"foo":1},{"bar":2},{"baz":3},...]
This is because the latter format forces most clients to read the entire data set into memory. The *data tools* repo contains a utility for dealing with a large JSON array:
json-pluck < big_array.json > mongo_format.json
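A consumer of the MongoDB export format can then parse one object at a time; a minimal Python sketch:

import json
import sys

for line in sys.stdin:
    obj = json.loads(line)   # one object per line; memory use stays bounded
    print(obj)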
The following *data tools* are provided to convert CSV or TSV files to the MongoDB export format. In the case of `csv-to-json`, the CSV file must have a header:
csv-to-json
tsv-to-json
`python -mjson.tool` can be used to pretty print JSON and test whether the JSON is well formed.
$ echo '{"foo": 1, "bar": 2, "baz": [1, 2, 3]}' | python -mjson.tool
{
    "bar": 2,
    "baz": [
        1,
        2,
        3
    ],
    "foo": 1
}
Other tools for pretty printing JSON are `jq` and `json` which can be installed using the operating system package manager and `npm`, respectively:
$ echo '{"foo": 1, "bar": 2, "baz": [1, 2, 3]}' | jq '.'
$ echo '{"foo": 1, "bar": 2, "baz": [1, 2, 3]}' | json
The `json-diff` script uses `python -mjson.tool` and `diff` to compare two JSON documents.
The utility `jq` can be used to convert JSON to TSV.
$ echo '["foo", "bar", "baz"]' | jq -r 'join("\t")'
When processing JSON, a first task might be to determine what the top level keys in each object are:
$ echo $'{"foo":1,"bar":2}\n{"foo":1,"baz":3}' | jq -r 'keys | .[]' | sort | uniq -c
This command lists the top level keys and their values:
$ echo $'{"foo":1,"bar":2}\n{"foo":1,"baz":3}' | jq -r 'to_entries | .[] | [.key, .value] | join("\t")'
foo 1
bar 2
foo 1
baz 3
This command counts how often top level keys are used with values of a certain type:
$ echo $'{"foo":1,"bar":2}\n{"foo":"one","bar":3}' | jq -r 'to_entries | .[] | [.key, (.value | type)] | join("\t")' | sort | uniq -c
2 bar number
1 foo number
1 foo string
The following two JSON objects contain the same information:
{
    "name": "John Smith",
    "address": {
        "street": "123 Main",
        "city": "Jamestown",
        "state": "VA"
    }
}

{
    "name": "John Smith",
    "address_street": "123 Main",
    "address_city": "Jamestown",
    "address_state": "VA"
}
If you want to insert the data into a database table, the second version might be preferred. Here is how to make the conversion with `jq`:
$ cat <<EOF > embedded.json
{
    "name": "John Smith",
    "address": {
        "street": "123 Main",
        "city": "Jamestown",
        "state": "VA"
    }
}
EOF
$ cat embedded.json | jq '[paths(scalars) as $p | { "key": $p | join("_"), "value": getpath($p)}] | from_entries'
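The same flattening is straightforward in Python; `flatten` here is a hypothetical helper, not one of the *data tools*:

import json

def flatten(obj, prefix=''):
    flat = {}
    for key, value in obj.items():
        name = prefix + '_' + key if prefix else key
        if isinstance(value, dict):
            flat.update(flatten(value, name))   # recurse, joining key paths with '_'
        else:
            flat[name] = value
    return flat

with open('embedded.json') as f:
    print(json.dumps(flatten(json.load(f)), indent=4))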
The [JSON Schema](https://json-schema.org/) standard can be used to make sure that JSON data is as expected:
$ brew install check-jsonschema
$ echo '{"type": "object", "required": ["id"]}' > schema.json
$ echo '[1,2,3]' | check-jsonschema --schemafile schema.json -
Schema validation errors were encountered.
-::$: [1, 2, 3] is not of type 'object'
$ echo '{"foo": 3}' | check-jsonschema --schemafile schema.json -
Schema validation errors were encountered.
-::$: 'id' is a required property
Producers of JSON should give some thought to making the data easier to understand and process. Arrays can be used for tuples of data, but objects will be clearer since the elements will be labeled. Having arrays contain elements of the same type will make the code that consumes the JSON simpler. Similarly, if the values associated with a key in an array of JSON objects is always the same, the code that consumes them will be simpler.
JSON objects serve two purposes: for mappings which are inherent in the data, such as an actual dictionary where the keys are the words and the values are definitions of the words, and for the more typical case where the object represents a tuple of data and the keys are names chosen by the developer. In the latter case, following good practices in regards to naming will make the data easier to understand. Using plural nouns for keys whose values are arrays will provide a strong hint to the consumer what the type of the value is.
Some clients treat a key with a null value and the absence of the key the same. That is, if the client looks up the value for a key in an object and the key isn't present, a null is returned. For this reason, the producer could opt to simply omit keys with null values and keep the data concise. Other clients might treat the two situations differently, but the code might not handle one of the situations correctly. In this case consistency on the part of the producer can result in simpler code and fewer bugs in the consumer.
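In Python, for example, a consumer which uses `dict.get` cannot distinguish the two cases, while a membership test can:

import json

a = json.loads('{"foo": 1, "bar": null}')
b = json.loads('{"foo": 1}')
assert a.get('bar') is None and b.get('bar') is None   # indistinguishable via get()
assert 'bar' in a and 'bar' not in b                   # distinguishable via membership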
<a name="yaml"/>
## yaml
To process YAML, convert it to JSON and use tools such as `jq` and `json`:
$ yaml-to-json .travis.yml | jq '.script'
This can also be used to verify that YAML is valid.
<a name="html"/>
## html
[CSS selectors](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors) can be provided to `pup` to extract parts of an HTML document:
$ brew install pup
$ curl https://google.com | pup a
<a href="https://www.google.com/">
here
</a>
When used without arguments, `pup` cleans up the HTML and prints it with a configurable amount of indentation:
$ echo '<html><body><p>Hello' | pup --indent 4
<html>
<head>
</head>
<body>
<p>
Hello
</p>
</body>
</html>
To extract the content of an HTML table from an HTML document:
$ curl 'http://hyperpolyglot.org/scripting' | html-table-to-csv -t 0
The `-t` flag specifies which table to extract. By default the first table, numbered 0, is extracted.
<a name="xml"/>
## xml
JSON objects can be used to represent a row of relational data:
{
"street": "123 Main",
"city": "Jamestwon",
"state": "MA"
}
XML seems to provide at least two different ways to do the same thing:
<address street="123 Main" city="Jamestown" state="MA"/>
<address>
<street>123 Main</street>
<city>Jamestown</city>
<state>MA</state>
</address>
When converting JSON to XML, a potential difficulty is that JSON allows the keys of objects to be arbitrary strings, whereas XML tag names and attribute names cannot contain any of the characters ``! "#$%&'()*+,/;<=>?@[\]^`{|}~``. Furthermore they cannot begin with a hyphen, period, or numeric digit.
In JSON strings, the double quote `"` and backslash `\` characters must be escaped with ``\"`` and ``\\`` sequences.
In XML, the greater than `>`, less than `<`, and ampersand `&` characters must be escaped with the ``&gt;``, ``&lt;``, and ``&amp;`` entities. But again there is no way to escape these characters in tag names and attribute names.
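One way to escape text for XML from the command line is Python's `xml.sax.saxutils` module, which escapes `&`, `<`, and `>`:
$ python3 -c 'from xml.sax.saxutils import escape; print(escape("1 < 2 & 4 > 3"))'
1 &lt; 2 &amp; 4 &gt; 3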
To check whether an XML file is *well-formed*, use:
$ xmllint FILE.xml
To pretty-print XML:
$ xmllint --format FILE.xml
To extract an element using an [XPath](https://developer.mozilla.org/en-US/docs/Web/XML/XPath) expression:
$ cat <<EOF > books2.xml
<books>
<book id="1" category="linux">
<title lang="en">Linux Device Drivers</title>
<author>Jonathan Corbet</author>
</book>
<book id="4" category="novel">
<title lang="fr">The Little Prince</title>
<author>Antoine de Saint-Exupéry</author>
</book>
</books>
EOF
$ xmllint --xpath "//title[@lang='fr']" books2.xml
<title lang="fr">The Little Prince</title>
XML has schemas, and an XML document is *valid* if it conforms to one. However, the move from DTDs to XML schemas means one must deal with namespaces, which are complicated. Libraries such as libxml2 don't implement namespaces completely.
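If an XSD schema is available, `xmllint` can check a document against it; a sketch, assuming the schema is in `schema.xsd`:
$ xmllint --noout --schema schema.xsd FILE.xml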
================================================
FILE: data_tools/__init__.py
================================================
================================================
FILE: data_tools/check-tsv
================================================
#!/usr/bin/env bash
set -eu -o pipefail
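# Count rows by number of fields; exit 0 only if every row has the same field count.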
script='BEGIN {FS="\t"; OFS="\t"} {cnt[NF] += 1} END {for (i in cnt) print i, cnt[i]; if (length(cnt) == 1) exit 0; else exit 1}'
if [ "$#" -gt 1 ]
then
echo "USAGE: tsv-check PATH" >&2
exit 1
elif [ "$#" -eq 1 ]
then
awk "$script" "$1"
else
awk "$script"
fi
================================================
FILE: data_tools/convert_date.py
================================================
#!/usr/bin/env python3
import argparse
import datetime
import sys
DEFAULT_FMT1 = '%Y-%m-%dT%H:%M:%S'
DEFAULT_FMT2 = '%s'
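# If only one of -i/-o is given and it is one of these defaults,
# the other defaults to its counterpart (Unix epoch <-> ISO 8601).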
def convert(input_fmt, output_fmt, s):
if input_fmt == '%s':
dt = datetime.datetime.fromtimestamp(int(s))
else:
dt = datetime.datetime.strptime(s, input_fmt)
return dt.strftime(output_fmt)
def convert_date(fin, fout, input_fmt, output_fmt, column):
for lineno, line in enumerate(fin, start=1):
row = line.rstrip().split('\t')
if column > len(row) - 1:
sys.stderr.write(
"Line number {} does not have {} columns\n".format(
lineno, column))
sys.exit(1)
try:
row[column] = convert(input_fmt, output_fmt, row[column])
except ValueError as e:
sys.stderr.write(
"On line number {}: {}\n".format(
lineno, str(e)))
sys.exit(1)
fout.write('\t'.join(row))
fout.write('\n')
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--header', '-H',
dest='header',
action='store_true')
parser.add_argument('--input-format', '-i',
dest='input_fmt',
default=None)
parser.add_argument('--output-format', '-o',
dest='output_fmt',
default=None)
parser.add_argument('--column', '-c',
dest='column',
type=int,
default=0)
args = parser.parse_args()
input_fmt = args.input_fmt
output_fmt = args.output_fmt
if input_fmt is None:
if output_fmt == DEFAULT_FMT1:
input_fmt = DEFAULT_FMT2
elif output_fmt == DEFAULT_FMT2:
input_fmt = DEFAULT_FMT1
else:
sys.stderr.write('Use -i to set strftime-style input format.\n')
sys.exit(1)
if output_fmt is None:
if input_fmt == DEFAULT_FMT1:
output_fmt = DEFAULT_FMT2
elif input_fmt == DEFAULT_FMT2:
output_fmt = DEFAULT_FMT1
else:
sys.stderr.write('Use -o to set strftime-style output format.\n')
sys.exit(1)
if args.header:
header = sys.stdin.readline()
sys.stdout.write(header)
convert_date(sys.stdin,
sys.stdout,
input_fmt,
output_fmt,
args.column)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/counting_sort.py
================================================
#!/usr/bin/env python3
import collections
import sys
def usage():
sys.stderr.write("USAGE: counting-sort [FILE]\n")
sys.exit(1)
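# Tally how many times each distinct line occurs, then write each line
# in sorted order as many times as it occurred.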
def counting_sort(input_stream, output_stream):
buckets = collections.defaultdict(int)
for key in input_stream:
buckets[key] += 1
for key in sorted(buckets.keys()):
for _ in range(0, buckets[key]):
output_stream.write(key)
def main():
if len(sys.argv) == 1:
counting_sort(sys.stdin, sys.stdout)
elif len(sys.argv) == 2:
if sys.argv[1] == '--help':
usage()
else:
with open(sys.argv[1]) as input_stream:
counting_sort(input_stream, sys.stdout)
else:
usage()
if __name__ == '__main__':
main()
================================================
FILE: data_tools/csv-to-postgres
================================================
#!/usr/bin/env bash
set -eu -o pipefail
psql=psql
table=
path=
while getopts "d:f:h:p:t:U:wW" opt
do
case "$opt" in
d) psql="$psql -d $OPTARG" ;;
f) path="$OPTARG" ;;
h) psql="$psql -h $OPTARG" ;;
p) psql="$psql -p $OPTARG" ;;
t) table="$OPTARG" ;;
U) psql="$psql -U $OPTARG" ;;
w) psql="$psql -w" ;;
W) psql="$psql -W" ;;
?) table='' ; break ;;
esac
done
if [ -z "$table" ] || [ -z "$path" ]
then
echo "USAGE: csv-to-postgres -f CSV_PATH -t TABLE" >&2
echo >&2
echo " Flags passed to psql:" >&2
echo " [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]" >&2
echo >&2
echo " Environment variable PGPASSWORD can be used to set password." >&2
echo >&2
exit 1
fi
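# Prepend a COPY ... FROM STDIN statement and stream the CSV after it into psql.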
( echo 'copy '"$table"' from stdin with (format csv); '; cat "$path" ) | $psql
================================================
FILE: data_tools/csv_to_json.py
================================================
#!/usr/bin/env python3
import argparse
import csv
import json
import sys
ENCODING = 'utf-8'
def csv_to_json(input_stream, output_stream, header_str, delimiter, quotechar):
reader = csv.reader(input_stream, delimiter=delimiter, quotechar=quotechar)
if header_str:
header = header_str.split(',')
else:
header = reader.__next__()
for row in reader:
output_stream.write(json.dumps(dict(zip(header, row))))
output_stream.write('\n')
def main():
parser = argparse.ArgumentParser()
parser.add_argument('input', nargs='?')
parser.add_argument('--delimiter', '-d',
dest='delimiter',
default=',')
parser.add_argument('--header',
dest='header',
metavar='NAME[,NAME..]')
parser.add_argument('--quotechar', '-q',
dest='quotechar',
default='"')
args = parser.parse_args()
if args.input:
f = open(args.input, encoding=ENCODING)
else:
f = sys.stdin
csv_to_json(f, sys.stdout, args.header, args.delimiter, args.quotechar)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/csv_to_xlsx.py
================================================
#!/usr/bin/env python3
import argparse
import csv
import re
import sys
import openpyxl
REGEX_CSV_SUFFIX = re.compile(r'\.csv$', re.I)  # pylint: disable=no-member
REGEX_XLSX_SUFFIX = re.compile(r'\.xlsx$', re.I)  # pylint: disable=no-member
REGEX_INVALID_SHEETNAME_CHARS = re.compile(r'[][*?/\\.]')
REGEX_SPACES = re.compile(' +')
MAX_SHEETNAME_LENGTH = 31
ENCODING = 'utf-8'
START_INDEX = 0 if openpyxl.__version__.startswith('1.') else 1
def path_to_sheetname(path):
sheetname = REGEX_CSV_SUFFIX.sub('', path)
sheetname = REGEX_INVALID_SHEETNAME_CHARS.sub(' ', sheetname)
sheetname = REGEX_SPACES.sub(' ', sheetname)
return sheetname[0:MAX_SHEETNAME_LENGTH].strip()
def csv_to_xlsx(input_files, output_file):
wb = openpyxl.Workbook()
sheetnames = {}
for filenum, input_file in enumerate(input_files):
with open(input_file, encoding=ENCODING) as f:
rows = csv.reader(f, dialect=csv.excel)
if filenum == 0:
ws = wb.get_active_sheet()
else:
ws = wb.create_sheet()
sheetname = path_to_sheetname(input_file)
if sheetname in sheetnames:
raise ValueError('files {} and {} result in the same sheet '
'name: "{}"'.format(input_file,
sheetnames[sheetname],
sheetname))
sheetnames[sheetname] = input_file
ws.title = sheetname
for rownum, row in enumerate(rows, start=START_INDEX):
for colnum, value in enumerate(row, start=START_INDEX):
# WHAT ABOUT DATES
ws.cell(row=rownum, column=colnum).value = value # pylint: disable=no-member
wb.save(output_file)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('input_files',
nargs='+',
metavar='CSV_FILE')
parser.add_argument('--output-file', '-o',
dest='output_file',
required=True)
args = parser.parse_args()
if not REGEX_XLSX_SUFFIX.search(args.output_file):
sys.stderr.write('ERROR: output file must have .xlsx '
'suffix: {}\n'.format(args.output_file))
sys.exit(1)
csv_to_xlsx(args.input_files, args.output_file)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/date_fill.py
================================================
#!/usr/bin/env python3
import argparse
import datetime
import re
import sys
import pprint
PP = pprint.PrettyPrinter()
REPORT_STATS = True
FILL_SEPARATOR = ','
DEFAULT_FILL_VALUE = '0'
RX_DIRECTIVE = re.compile(r'%.')
YEAR_DIRECTIVES = set('%y %Y %g %G'.split())
MONTH_DIRECTIVES = set('%b %B %C %d %d %h %m'.split())
DAY_DIRECTIVES = set('%a %A %D %j %u %U %v %V %w %W %x'.split())
HOUR_DIRECTIVES = set('%H %i %k %l %p'.split())
MINUTE_DIRECTIVES = set('%M %R'.split())
SECOND_DIRECTIVES = set('%c %r %s %S %T %X'.split())
def make_year_iterator(start,
end,
fmt):
def date_iter(start_i, end_i):
i = start_i
while True:
if i > end_i:
return
dt = datetime.datetime.strptime(str(i), fmt)
yield dt
i += 1
return date_iter(int(start), int(end))
def make_month_iterator(start,
end,
fmt):
start_yyyy = int(start[0:4])
start_mm = int(start[4:6])
end_yyyy = int(end[0:4])
end_mm = int(end[4:6])
def date_iter(start_yyyy, start_mm, end_yyyy, end_mm):
yyyy = start_yyyy
mm = start_mm
while True:
if yyyy > end_yyyy or (yyyy == end_yyyy and mm > end_mm):
return
dt = datetime.datetime.strptime('%04d%02d' % (yyyy, mm), fmt)
yield dt
mm += 1
if mm == 13:
mm = 1
yyyy += 1
return date_iter(start_yyyy, start_mm, end_yyyy, end_mm)
def make_date_iterator(start_dt,
end_dt,
delta):
def date_iter(start_dt, end_dt):
dt = start_dt
while True:
if dt > end_dt:
return
yield dt
dt += delta
return date_iter(start_dt, end_dt)
def load_rows(input_path, date_column, no_header):
date_to_line = {}
header = None
max_columns = 0
with open(input_path) as f:
if not no_header:
header = f.readline()
for line in f:
data = line.rstrip().split('\t')
if len(data) > max_columns:
max_columns = len(data)
date_to_line[data[date_column]] = line.rstrip()
return date_to_line, header, max_columns
def date_fill(input_path,
date_column,
no_header,
fill_values,
start,
end,
output_fmt,
output_stream):
date_to_line, header, max_column = load_rows(input_path,
date_column,
no_header)
if fill_values:
fill_row = fill_values.split(FILL_SEPARATOR)
else:
fill_row = [DEFAULT_FILL_VALUE] * max_column
start_dt = None
end_dt = None
for date, _ in date_to_line.items():
dt = datetime.datetime.strptime(date, output_fmt)
if not start_dt or dt < start_dt:
start_dt = dt
if not end_dt or dt > end_dt:
end_dt = dt
if start:
start_dt = datetime.datetime.strptime(start, output_fmt)
if end:
end_dt = datetime.datetime.strptime(end, output_fmt)
date_iter = None
directives = set(RX_DIRECTIVE.findall(output_fmt))
if directives & SECOND_DIRECTIVES:
delta = datetime.timedelta(seconds=1)
trunc_fmt = '%Y%m%d%H%M%S'
elif directives & MINUTE_DIRECTIVES:
delta = datetime.timedelta(minutes=1)
trunc_fmt = '%Y%m%d%H%M'
elif directives & HOUR_DIRECTIVES:
delta = datetime.timedelta(hours=1)
trunc_fmt = '%Y%m%d%H'
elif directives & DAY_DIRECTIVES:
delta = datetime.timedelta(days=1)
trunc_fmt = '%Y%m%d'
elif directives & MONTH_DIRECTIVES:
trunc_fmt = '%Y%m'
date_iter = make_month_iterator(start_dt.strftime(trunc_fmt),
end_dt.strftime(trunc_fmt),
output_fmt)
elif directives & YEAR_DIRECTIVES:
trunc_fmt = '%Y'
date_iter = make_year_iterator(start_dt.strftime(trunc_fmt),
end_dt.strftime(trunc_fmt),
output_fmt)
else:
raise Exception('no recognized directives in format: {}'.format(
output_fmt))
if header is not None:
output_stream.write(header)
if not start_dt or not end_dt:
return
if not date_iter:
start_s = start_dt.strftime(trunc_fmt)
start_dt = datetime.datetime.strptime(start_s, trunc_fmt)
end_s = end_dt.strftime(trunc_fmt)
end_dt = datetime.datetime.strptime(end_s, trunc_fmt)
date_iter = make_date_iterator(start_dt, end_dt, delta)
line_count = 0
fill_count = 0
for dt in date_iter:
date = dt.strftime(output_fmt)
line = date_to_line.get(date, None)
if line is None:
fill_count += 1
row = list(fill_row)
row[date_column] = date
line = '\t'.join(row)
output_stream.write(line)
output_stream.write('\n')
line_count += 1
if REPORT_STATS:
output_stream.flush()
sys.stderr.write(
'retained non-header lines: {}\n'
'filled lines: {}\n'
'dropped lines: {}\n'.format(
line_count,
fill_count,
len(date_to_line.keys()) - (line_count - fill_count)))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--start', '-s', help='YYYY[MM[DD[HH[MI[SS]]]]]',
dest='start',
default=None)
parser.add_argument('--end', '-e', help='YYYY[MM[DD[HH[MI[SS]]]]]',
dest='end',
default=None)
parser.add_argument('--date-column', '-d',
help='column containing date: first column is 0',
type=int,
dest='date_column')
parser.add_argument('--format', '-f',
dest='format', help='strftime style format for output',
default=None)
parser.add_argument('--no-header', '-H', help='if file has no header',
dest='no_header',
action='store_true')
parser.add_argument('--input-path', '-i',
dest='input_path', help='input path for TSV',
required=True)
parser.add_argument('--fill-values', '-v',
help='comma-separated list of fill values',
dest='fill_values',
default='')
args = parser.parse_args()
date_fill(args.input_path,
args.date_column,
args.no_header,
args.fill_values,
args.start,
args.end,
args.format,
sys.stdout)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/date_seq.py
================================================
#!/usr/bin/env python3
import argparse
import datetime
import re
import sys
import pprint
PP = pprint.PrettyPrinter()
REGEX_INPUT_DATE = re.compile(r'^\d{4,14}$')
WEEKDAY_TO_NUMBER = {
'mon': 1,
'tue': 2,
'wed': 3,
'thu': 4,
'fri': 5,
'sat': 6,
'sun': 7
}
def check(dt, fmt, regex_date_filter):
if not regex_date_filter:
return True
if regex_date_filter.match(dt.strftime(fmt)):
return True
return False
def make_year_iterator(start,
end,
regex_date_filter,
fmt):
def date_iter(start_i, end_i):
i = start_i
while True:
if i > end_i:
return
dt = datetime.datetime.strptime(str(i), fmt)
if check(dt, fmt, regex_date_filter):
yield dt
i += 1
return date_iter(int(start), int(end))
def make_month_iterator(start,
end,
regex_date_filter,
fmt):
start_yyyy = int(start[0:4])
start_mm = int(start[4:6])
end_yyyy = int(end[0:4])
end_mm = int(end[4:6])
def date_iter(start_yyyy, start_mm, end_yyyy, end_mm):
yyyy = start_yyyy
mm = start_mm
while True:
if yyyy > end_yyyy or (yyyy == end_yyyy and mm > end_mm):
return
dt = datetime.datetime.strptime('%04d%02d' % (yyyy, mm), fmt)
if check(dt, fmt, regex_date_filter):
yield dt
mm += 1
if mm == 13:
mm = 1
yyyy += 1
return date_iter(start_yyyy, start_mm, end_yyyy, end_mm)
def make_date_iterator(start,
end,
weekday_numbers,
regex_date_filter,
delta,
fmt):
start_dt = datetime.datetime.strptime(start, fmt)
end_dt = datetime.datetime.strptime(end, fmt)
def date_iter(start_dt, end_dt):
dt = start_dt
while True:
if dt > end_dt:
return
if not weekday_numbers:
if check(dt, fmt, regex_date_filter):
yield dt
else:
weekday_number = int(dt.strftime('%u'))
if weekday_number in weekday_numbers:
if check(dt, fmt, regex_date_filter):
yield dt
dt += delta
return date_iter(start_dt, end_dt)
def date_seq(start,
end,
weekdays,
date_filter,
output_fmt,
output_stream):
if len(start) != len(end):
raise Exception('Start and end date must be same length')
if not REGEX_INPUT_DATE.search(start):
raise Exception(
'Start date must be in YYYY[MM[DD[HH[MI[SS]]]]] format.')
if not REGEX_INPUT_DATE.search(end):
raise Exception('End date must be in YYYY[MM[DD[HH[MI[SS]]]]] format.')
if weekdays:
weekday_numbers = [WEEKDAY_TO_NUMBER[wkday.lower()[0:3]]
for wkday
in weekdays.split(',')]
else:
weekday_numbers = []
regex_date_filter = re.compile(date_filter) if date_filter else None
date_iter = None
if len(start) == 4:
fmt = '%Y'
date_iter = make_year_iterator(start, end, regex_date_filter, fmt)
elif len(start) == 6:
fmt = '%Y%m'
date_iter = make_month_iterator(start, end, regex_date_filter, fmt)
elif len(start) == 8:
delta = datetime.timedelta(days=1)
fmt = '%Y%m%d'
elif len(start) == 10:
delta = datetime.timedelta(hours=1)
fmt = '%Y%m%d%H'
elif len(start) == 12:
delta = datetime.timedelta(minutes=1)
fmt = '%Y%m%d%H%M'
elif len(start) == 14:
delta = datetime.timedelta(seconds=1)
fmt = '%Y%m%d%H%M%S'
else:
raise Exception('unexpected argument length: {}'.format(len(start)))
if not date_iter:
date_iter = make_date_iterator(start,
end,
weekday_numbers,
regex_date_filter,
delta,
fmt)
if output_fmt is None:
output_fmt = fmt
for dt in date_iter:
output_stream.write(dt.strftime(output_fmt) + '\n')
def main():
parser = argparse.ArgumentParser()
parser.add_argument('start', help='YYYY[MM[DD[HH[MI[SS]]]]]')
parser.add_argument('end', help='YYYY[MM[DD[HH[MI[SS]]]]]')
parser.add_argument('--format', '-f',
dest='format', help='strftime style format for output',
default=None)
parser.add_argument('--regex', '-r',
dest='date_filter', help='date filter regex.',
default=None)
parser.add_argument('--weekdays', '-w',
dest='weekdays', help='comma separated: Sun,Mon,...',
default=None)
args = parser.parse_args()
date_seq(args.start,
args.end,
args.weekdays,
args.date_filter,
args.format,
sys.stdout)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/header-sort
================================================
#!/usr/bin/env bash
if [ $# -eq 0 ]
then
echo "USAGE: $0 [OPTIONS] FILE"
exit 1
fi
file="${!#}"
if [ ! -f "$file" ]
then
echo "USAGE: $0 [OPTIONS] FILE"
exit 1
fi
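# Print the header line unmodified, then sort the remaining lines;
# "${@:1:$((${#}-1))}" passes every argument except the trailing FILE to sort.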
head -1 "$file"
tail -n +2 "$file" | sort "${@:1:$((${#}-1))}"
================================================
FILE: data_tools/highlight.py
================================================
#!/usr/bin/env python3
import argparse
import re
import sys
NORMAL = '\033[m'
BLACK_FOREGROUND = '\033[01;30m'
RED_FOREGROUND = '\033[01;31m'
GREEN_FOREGROUND = '\033[01;32m'
YELLOW_FOREGROUND = '\033[01;33m'
BLUE_FOREGROUND = '\033[01;34m'
MAGENTA_FOREGROUND = '\033[01;35m'
CYAN_FOREGROUND = '\033[01;36m'
WHITE_FOREGROUND = '\033[01;37m'
BLACK_BACKGROUND = '\033[01;40m'
RED_BACKGROUND = '\033[01;41m'
GREEN_BACKGROUND = '\033[01;42m'
YELLOW_BACKGROUND = '\033[01;43m'
BLUE_BACKGROUND = '\033[01;44m'
MAGENTA_BACKGROUND = '\033[01;45m'
CYAN_BACKGROUND = '\033[01;46m'
WHITE_BACKGROUND = '\033[01;47m'
BOLD = '\033[01;1m'
ITALIC = '\033[01;3m'
UNDERLINE = '\033[01;4m'
INVERT = '\033[01;7m'
def highlight(input_stream, output_stream, esc_seq_to_pattern):
for line in input_stream:
output_line = line
for esc_seq, pattern in esc_seq_to_pattern.items():
rx = re.compile("({})".format(pattern))
output_line = rx.sub('{}\\1{}'.format(esc_seq, NORMAL),
output_line)
output_stream.write(output_line)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('positional', nargs='*')
parser.add_argument('--black',
dest='black',
metavar='PATTERN')
parser.add_argument('--red', '-r',
dest='red',
metavar='PATTERN')
parser.add_argument('--green', '-g',
dest='green',
metavar='PATTERN')
parser.add_argument('--yellow', '-y',
dest='yellow',
metavar='PATTERN')
parser.add_argument('--blue', '-b',
dest='blue',
metavar='PATTERN')
parser.add_argument('--magenta', '-m',
dest='magenta',
metavar='PATTERN')
parser.add_argument('--cyan', '-c',
dest='cyan',
metavar='PATTERN')
parser.add_argument('--white', '-w',
dest='white',
metavar='PATTERN')
parser.add_argument('--black-background',
dest='black_background',
metavar='PATTERN')
parser.add_argument('--red-background',
dest='red_background',
metavar='PATTERN')
parser.add_argument('--green-background',
dest='green_background',
metavar='PATTERN')
parser.add_argument('--yellow-background',
dest='yellow_background',
metavar='PATTERN')
parser.add_argument('--blue-background',
dest='blue_background',
metavar='PATTERN')
parser.add_argument('--magenta-background',
dest='magenta_background',
metavar='PATTERN')
parser.add_argument('--cyan-background',
dest='cyan_background',
metavar='PATTERN')
parser.add_argument('--white-background',
dest='white_background',
metavar='PATTERN')
parser.add_argument('--normal',
dest='normal',
metavar='PATTERN')
parser.add_argument('--bold',
dest='bold',
metavar='PATTERN')
parser.add_argument('--italic',
dest='italic',
metavar='PATTERN')
parser.add_argument('--underline',
dest='underline',
metavar='PATTERN')
parser.add_argument('--invert', '--reverse',
dest='invert',
metavar='PATTERN')
args = parser.parse_args()
pattern = None
input_path = None
if len(args.positional) == 1:
if args.red or args.black or args.green or args.yellow or args.blue \
or args.magenta or args.cyan or args.white \
or args.black_background or args.red_background \
or args.green_background or args.yellow_background \
or args.blue_background or args.magenta_background \
or args.cyan_background or args.white_background \
or args.normal or args.bold or args.italic or args.underline \
or args.invert:
input_path = args.positional[0]
else:
pattern = args.positional[0]
elif len(args.positional) == 2:
pattern, input_path = args.positional
elif len(args.positional) > 2:
sys.stderr.write('USAGE: highlight [OPTIONS] [PATTERN] [FILE]\n')
sys.exit(1)
esc_seq_to_pattern = {}
if pattern and args.red:
raise Exception('--red|-r cannot be used with default pattern')
if pattern:
esc_seq_to_pattern[RED_FOREGROUND] = pattern
if args.red:
esc_seq_to_pattern[RED_FOREGROUND] = args.red
if args.black:
esc_seq_to_pattern[BLACK_FOREGROUND] = args.black
if args.green:
esc_seq_to_pattern[GREEN_FOREGROUND] = args.green
if args.yellow:
esc_seq_to_pattern[YELLOW_FOREGROUND] = args.yellow
if args.blue:
esc_seq_to_pattern[BLUE_FOREGROUND] = args.blue
if args.magenta:
esc_seq_to_pattern[MAGENTA_FOREGROUND] = args.magenta
if args.cyan:
esc_seq_to_pattern[CYAN_FOREGROUND] = args.cyan
if args.white:
esc_seq_to_pattern[WHITE_FOREGROUND] = args.white
if args.black_background:
esc_seq_to_pattern[BLACK_BACKGROUND] = args.black_background
if args.red_background:
esc_seq_to_pattern[RED_BACKGROUND] = args.red_background
if args.green_background:
esc_seq_to_pattern[GREEN_BACKGROUND] = args.green_background
if args.yellow_background:
esc_seq_to_pattern[YELLOW_BACKGROUND] = args.yellow_background
if args.blue_background:
esc_seq_to_pattern[BLUE_BACKGROUND] = args.blue_background
if args.magenta_background:
esc_seq_to_pattern[MAGENTA_BACKGROUND] = args.magenta_background
if args.cyan_background:
esc_seq_to_pattern[CYAN_BACKGROUND] = args.cyan_background
if args.white_background:
esc_seq_to_pattern[WHITE_BACKGROUND] = args.white_background
if args.normal:
esc_seq_to_pattern[NORMAL] = args.normal
if args.bold:
esc_seq_to_pattern[BOLD] = args.bold
if args.italic:
esc_seq_to_pattern[ITALIC] = args.italic
if args.underline:
esc_seq_to_pattern[UNDERLINE] = args.underline
if args.invert:
esc_seq_to_pattern[INVERT] = args.invert
if not esc_seq_to_pattern:
sys.stderr.write("No PATTERN specified.\n")
parser.print_help()
sys.exit(1)
if input_path:
with open(input_path) as f:
highlight(f, sys.stdout, esc_seq_to_pattern)
else:
highlight(sys.stdin, sys.stdout, esc_seq_to_pattern)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/html_table_to_csv.py
================================================
#!/usr/bin/env python3
import argparse
import csv
import re
import sys
from typing import IO
import bs4
RX_TH_OR_TD = re.compile(r'^(th|td)$')
def html_table_to_csv(input_f: IO, output_f: IO, table_num: int) -> None:
doc = bs4.BeautifulSoup(input_f.read(), 'html5lib')
tables = doc.find_all('table')
try:
table = tables[table_num]
trows = table.find_all('tr')
csv_writer = csv.writer(output_f)
for trow in trows:
cells = trow.find_all(RX_TH_OR_TD)
csv_writer.writerow([cell.text.strip() for cell in cells])
except IndexError:
sys.stderr.write('ERROR: no table at index {}\n'.format(table_num))
sys.exit(1)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--table', '-t',
dest='table',
type=int,
default=0)
parser.add_argument('input_path', nargs='?', default='')
args = parser.parse_args()
if args.input_path:
with open(args.input_path) as f:
html_table_to_csv(f, sys.stdout, args.table)
else:
html_table_to_csv(sys.stdin, sys.stdout, args.table)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/join_tsv.py
================================================
#!/usr/bin/env python3
import argparse
import collections
import os
import sys
ENCODING = 'utf-8'
BIG_FIRST = 1
BIG_LAST = 2
JOIN_INNER = 1
JOIN_LEFT = 2
JOIN_RIGHT = 3
JOIN_FULL = 4
DEFAULT_OUTER_NULL = ''
def header_and_column_to_rows(path, column):
with open(path, encoding=ENCODING) as f:
column_to_rows = collections.defaultdict(list)
header = f.readline().rstrip('\r\n').split('\t')
row_len = len(header)
column_index = None
try:
column_index = header.index(column)
except ValueError:
raise Exception('{} does not have a {} column'.format(
path, column))
del(header[column_index])
for lineno, line in enumerate(f, start=2):
fields = line.rstrip('\r\n').split('\t')
if len(fields) != row_len:
raise Exception('row {} does not have {} fields: {}'.format(
lineno,
row_len,
line))
column_value = fields[column_index]
del(fields[column_index])
column_to_rows[column_value].append(fields)
return header, column_to_rows
def print_row(join_value, fields1, fields2, f):
f.write(join_value)
f.write('\t')
f.write('\t'.join(fields1))
f.write('\t')
f.write('\t'.join(fields2))
f.write('\n')
def join_tsv(left_join_column,
right_join_column,
null,
join_type,
path1,
path2,
output_stream,
outer_null):
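# Read the smaller file into memory; stream the larger file past it row by row.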
if os.path.getsize(path1) > os.path.getsize(path2):
big, small, file_order = path1, path2, BIG_FIRST
big_join_column = left_join_column
small_join_column = right_join_column
else:
big, small, file_order = path2, path1, BIG_LAST
big_join_column = right_join_column
small_join_column = left_join_column
outer_join_big, outer_join_small = False, False
small_header, column_to_rows = header_and_column_to_rows(small,
small_join_column)
EMPTY_SMALL_HEADER = [outer_null] * len(small_header)
if join_type == JOIN_FULL:
outer_join_big, outer_join_small = True, True
elif join_type == JOIN_LEFT:
outer_join_big = file_order == BIG_FIRST
outer_join_small = file_order != BIG_FIRST
elif join_type == JOIN_RIGHT:
outer_join_small = file_order == BIG_FIRST
outer_join_big = file_order != BIG_FIRST
with open(big, encoding=ENCODING) as f:
big_header = f.readline().rstrip('\r\n').split('\t')
row_len = len(big_header)
column_index = None
try:
column_index = big_header.index(big_join_column)
except ValueError:
raise Exception('{} does not have a {} column'.format(
big, big_join_column))
del(big_header[column_index])
EMPTY_BIG_HEADER = [outer_null] * len(big_header)
print_row(left_join_column,
big_header if file_order == BIG_FIRST else small_header,
small_header if file_order == BIG_FIRST else big_header,
output_stream)
# used if outer_join_small is True
join_values = set()
for lineno, line in enumerate(f, start=2):
big_fields = line.rstrip('\r\n').split('\t')
if len(big_fields) != row_len:
raise Exception('row {} does not have {} fields: {}'.format(
lineno,
row_len,
line))
join_value = big_fields[column_index]
del(big_fields[column_index])
if join_value != null:
small_rows = column_to_rows.get(join_value,
[EMPTY_SMALL_HEADER]
if outer_join_big
else [])
if outer_join_small:
join_values.add(join_value)
for small_fields in small_rows:
print_row(
join_value,
big_fields if file_order == BIG_FIRST
else small_fields,
small_fields if file_order == BIG_FIRST
else big_fields,
output_stream)
if outer_join_small:
big_fields = EMPTY_BIG_HEADER
for join_value, small_rows in column_to_rows.items():
if join_value not in join_values:
for small_fields in small_rows:
print_row(
join_value,
big_fields if file_order == BIG_FIRST
else small_fields,
small_fields if file_order == BIG_FIRST
else big_fields,
output_stream)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('files',
nargs='+',
metavar='TSV_FILE')
parser.add_argument('--column', '-c', '-C',
dest='column')
parser.add_argument('--left', '-l',
dest='left',
action='store_true',
default=False)
parser.add_argument('--left-column', '-L',
dest='left_column',
default=None)
parser.add_argument('--right', '-r',
dest='right',
action='store_true',
default=False)
parser.add_argument('--right-column', '-R',
dest='right_column',
default=None)
parser.add_argument('--full', '-f',
dest='full',
action='store_true',
default=False)
parser.add_argument('--null', '-n',
dest='null',
default='')
parser.add_argument('--outer-null', '-o',
dest='outer_null',
default=DEFAULT_OUTER_NULL)
parser.add_argument('--no-null', '-N',
dest='no_null',
action='store_true',
default=False)
args = parser.parse_args()
if len(args.files) != 2:
sys.stderr.write('must be two files, not {}\n'.format(args.files))
parser.print_help()
sys.exit(1)
left_join_column = None
right_join_column = None
if args.column:
if args.left_column or args.right_column:
sys.stderr.write('--column flag is incompatible with --left-column'
' and --right-column flags\n')
parser.print_help()
sys.exit(1)
left_join_column, right_join_column = args.column, args.column
if args.left_column:
left_join_column = args.left_column
if args.right_column:
right_join_column = args.right_column
if not left_join_column or not right_join_column:
sys.stderr.write('must specify join column(s)\n')
parser.print_help()
sys.exit(1)
join_type = JOIN_INNER
flag_cnt = 0
if args.left:
join_type = JOIN_LEFT
flag_cnt += 1
if args.right:
join_type = JOIN_RIGHT
flag_cnt += 1
if args.full:
join_type = JOIN_FULL
flag_cnt += 1
if flag_cnt > 1:
sys.stderr.write('left, right or full join flags are exclusive\n')
parser.print_help()
sys.exit(1)
join_tsv(left_join_column,
right_join_column,
None if args.no_null else args.null,
join_type,
args.files[0],
args.files[1],
sys.stdout,
args.outer_null)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/json-diff
================================================
#!/usr/bin/env bash
set -eu -o pipefail
if [ "$#" -lt 2 ]
then
echo "USAGE: json-diff [DIFF_OPTIONS] PATH1 PATH2" 1>&2
exit 2
fi
args=("$@")
file1=${args[$(( $# - 2 ))]}
file2=${args[$(( $# - 1 ))]}
unset args[$(( $# - 1 ))]
unset args[$(( $# - 2 ))]
normalized1=$(mktemp)
normalized2=$(mktemp)
function cleanup {
rm -f "$normalized1" "$normalized2"
}
trap cleanup ERR
function cleanup_and_exit {
cleanup
exit "$1"
}
if ! python3 -mjson.tool --sort-keys < "$file1" > "$normalized1"
then
cleanup_and_exit 2
fi
if ! python3 -mjson.tool --sort-keys < "$file2" > "$normalized2"
then
cleanup_and_exit 2
fi
set +u
diff "${args[@]}" "$normalized1" "$normalized2"
diff_retval=$?
set -u
cleanup_and_exit "$diff_retval"
================================================
FILE: data_tools/normalize_utf8.py
================================================
#!/usr/bin/env python3
import argparse
import sys
import unicodedata
ENCODING = 'utf-8'
NFC = 'NFC'
NFD = 'NFD'
NFKC = 'NFKC'
NFKD = 'NFKD'
def normalize_utf8(input_stream, output_stream, normalization_form):
"""
Form must be 'NFC', 'NFD', 'NFKC', or 'NFKD'.
Normalization forms are explained at
http://unicode.org/reports/tr15/
"""
for line in input_stream:
output_stream.write(unicodedata.normalize(normalization_form, line))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('positional_args',
nargs='?')
parser.add_argument('--nfc',
action='store_true',
dest='nfc')
parser.add_argument('--nfd',
action='store_true',
dest='nfd')
parser.add_argument('--nfkc',
action='store_true',
dest='nfkc')
parser.add_argument('--nfkd',
action='store_true',
dest='nfkd')
args = parser.parse_args()
flag_count = 0
if args.nfc:
flag_count += 1
normalization_form = NFC
if args.nfd:
flag_count += 1
normalization_form = NFD
if args.nfkc:
flag_count += 1
normalization_form = NFKC
if args.nfkd:
flag_count += 1
normalization_form = NFKD
if flag_count == 0:
args.nfc = True
normalization_form = NFC
if flag_count > 1:
sys.stderr.write('At most one normalization flag can be used.\n')
parser.print_usage(sys.stderr)
sys.exit(1)
if args.positional_args:
fin = open(args.positional_args, encoding='utf-8')
else:
fin = sys.stdin
normalize_utf8(fin, sys.stdout, normalization_form)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/postgres-to-csv
================================================
#!/usr/bin/env bash
set -eu -o pipefail
psql=psql
table=
while getopts "d:h:p:t:U:wW" opt
do
case "$opt" in
d) psql="$psql -d $OPTARG" ;;
h) psql="$psql -h $OPTARG" ;;
p) psql="$psql -p $OPTARG" ;;
t) table="$OPTARG" ;;
U) psql="$psql -U $OPTARG" ;;
w) psql="$psql -w" ;;
W) psql="$psql -W" ;;
?) table='' ; break ;;
esac
done
if [ -z "$table" ]
then
echo "USAGE: postgres-to-csv -t TABLE" >&2
echo >&2
echo " Flags passed to psql:" >&2
echo " [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]" >&2
echo >&2
echo " Environment variable PGPASSWORD can be used to set password." >&2
echo >&2
exit 1
fi
echo 'copy '"$table"' to stdout with (format csv);' | $psql
================================================
FILE: data_tools/reservoir_sample.py
================================================
#!/usr/bin/env python3
import argparse
import random
import sys
def reservoir_sample(count, input_stream, output_stream):
n = None
output = []
try:
n = int(count)
if n < 1:
raise ValueError
except ValueError:
raise Exception('argument not a positive integer')
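# Algorithm R: keep the first n lines; for each later line i (0-based),
# replace a random reservoir slot with probability n/(i+1).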
for i, line in enumerate(input_stream):
if i < n:
output.append(line)
else:
choice = random.randint(0, i)
if choice < n:
output[choice] = line
for line in output:
output_stream.write(line)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('positional',
nargs='?',
metavar='FILE')
parser.add_argument('--size', '-s',
dest='size',
type=int,
metavar='NUM',
required=True)
parser.add_argument('--random-seed', '-r',
dest='random_seed',
default=None)
args = parser.parse_args()
if args.random_seed:
random.seed(args.random_seed)
if args.positional:
with open(args.positional) as f:
reservoir_sample(args.size, f, sys.stdout)
else:
reservoir_sample(args.size, sys.stdin, sys.stdout)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/set-diff.sh
================================================
#!/usr/bin/env bash
if [ $# -ne 2 ]
then
echo "USAGE: set-diff FILE1 FILE2"
exit 1
fi
sorted1=$(mktemp)
sorted2=$(mktemp)
sort -u "$1" > "$sorted1"
sort -u "$2" > "$sorted2"
comm -23 "$sorted1" "$sorted2"
rm "$sorted1" "$sorted2"
================================================
FILE: data_tools/set-intersect
================================================
#!/usr/bin/env bash
if [ $# -ne 2 ]
then
echo "USAGE: set-intersect FILE1 FILE2"
exit 1
fi
sorted1=$(mktemp)
sorted2=$(mktemp)
sort -u "$1" > "$sorted1"
sort -u "$2" > "$sorted2"
comm -12 "$sorted1" "$sorted2"
rm "$sorted1" "$sorted2"
================================================
FILE: data_tools/tokenize
================================================
#!/usr/bin/env bash
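# Replace every character that is not alphanumeric or one of
# ' * + - / = \ ^ _ ` | ~ (the octal escapes below) with a space,
# then squeeze runs of spaces; with -n, put each token on its own line.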
if [ "$#" -eq 1 ] && [ "$1" = "-n" ]
then
tr -C '0-9a-zA-Z\047\052\053\055\057\075\134\136\137\140\174\176' ' ' | tr -s ' ' | tr ' ' '\n'
elif [ "$#" -eq 0 ]
then
tr -C '0-9a-zA-Z\047\052\053\055\057\075\134\136\137\140\174\176' ' ' | tr -s ' '
else
echo "USAGE: tokenize [-n]" 1>&2
exit 1
fi
================================================
FILE: data_tools/trim_tsv.py
================================================
#!/usr/bin/env python3
import sys
DELIMITER = '\t'
def trim_tsv(input_stream, output_stream):
for line in input_stream:
row = line.rstrip('\r\n').split(DELIMITER)
data = [field.strip() for field in row]
output_stream.write(DELIMITER.join(data))
output_stream.write('\n')
def main():
if len(sys.argv) == 1:
trim_tsv(sys.stdin, sys.stdout)
elif len(sys.argv) == 2:
with open(sys.argv[1]) as f:
trim_tsv(f, sys.stdout)
else:
sys.stderr.write("USAGE: trim-tsv [FILE]\n")
sys.exit(1)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/tsv-header
================================================
#!/usr/bin/env bash
set -eu -o pipefail
if [ "$#" -gt 1 ]
then
echo "USAGE: tsv-head PATH" >&2
exit 1
elif [ "$#" -eq 1 ]
then
head -1 "$1" | tr '\t' '\n' | awk '{printf "%6d\t%s\n", NR, $0}'
else
head -1 | tr '\t' '\n' | awk '{printf "%6d\t%s\n", NR, $0}'
fi
================================================
FILE: data_tools/tsv_to_json.py
================================================
#!/usr/bin/env python3
import json
import sys
ENCODING = 'utf-8'
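# Characters treated as line terminators: form feed, line feed, carriage
# return, vertical tab, NEL (U+0085), LINE SEPARATOR (U+2028), and
# PARAGRAPH SEPARATOR (U+2029).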
NEWLINE_CHARS = u'\f\n\r\v\x85\u2028\u2029'
def main():
if len(sys.argv) == 1:
f = sys.stdin
elif len(sys.argv) == 2:
if sys.argv[1] == '--help':
sys.stderr.write('USAGE: tsv-to-json [TSV_FILE]\n')
sys.exit(1)
f = open(sys.argv[1], encoding=ENCODING)
else:
sys.stderr.write("USAGE: tsv_to_json.py [FILE]")
sys.exit(1)
header = f.readline().rstrip(NEWLINE_CHARS).split('\t')
for lineno, line in enumerate(f, start=2):
fields = line.rstrip(NEWLINE_CHARS).split('\t')
if len(fields) != len(header):
raise Exception('incorrect number of fields at line {}: {}'.format(
lineno,
line))
print(json.dumps(dict(zip(header, fields))))
if __name__ == '__main__':
main()
================================================
FILE: data_tools/xlsx_to_csv.py
================================================
#!/usr/bin/env python3
import argparse
import datetime
import csv
import os
import pprint
import sys
import xlrd
DATE_FMT = '%Y-%m-%dT%H:%M:%S'
ENCODING = 'utf-8'
CSV_SUFFIX = '.csv'
PP = pprint.PrettyPrinter()
def list_xlsx_sheets(xlsx_path, output_stream):
book = xlrd.open_workbook(xlsx_path)
for sheet in sorted(book.sheet_names()):
output_stream.write(sheet)
output_stream.write('\n')
def sheet_name_to_filename(sheet_name):
return sheet_name + CSV_SUFFIX
def cell_to_str(cell, date_fmt, datemode):
if cell.ctype == xlrd.XL_CELL_DATE:
dt = datetime.datetime(*xlrd.xldate_as_tuple(cell.value, datemode))
return dt.strftime(date_fmt)
elif cell.ctype == xlrd.XL_CELL_NUMBER:
if cell.value == int(cell.value):
return str(int(cell.value))
else:
return str(cell.value)
else:
return str(cell.value)
def xlsx_book_to_csv(book, sheet_path, sheet_name, date_fmt):
sheet = book.sheet_by_name(sheet_name)
if sheet_path == '-':
f = sys.stdout
else:
f = open(sheet_path, 'w')
csvw = csv.writer(f, dialect=csv.excel)
for rownum in range(0, sheet.nrows):
row = [cell_to_str(cell, date_fmt, book.datemode)
for cell
in sheet.row(rownum)]
csvw.writerow(row)
if sheet_path != '-':
f.close()
def xlsx_path_to_csv(xlsx_path, sheet_path, sheet_name, date_fmt):
book = xlrd.open_workbook(xlsx_path)
xlsx_book_to_csv(book, sheet_path, sheet_name, date_fmt)
def xlsx_path_to_csvs(xlsx_path, dir_path, date_fmt):
book = xlrd.open_workbook(xlsx_path)
for sheet_name in book.sheet_names():
sheet_path = os.path.join(dir_path,
sheet_name_to_filename(sheet_name))
xlsx_book_to_csv(book, sheet_path, sheet_name, date_fmt)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('paths',
nargs='*',
metavar='PATH')
parser.add_argument('--date-format', '-d',
dest='date_fmt',
default=DATE_FMT)
parser.add_argument('--sheet', '-s',
dest='sheet')
parser.add_argument('--list', '-l',
dest='list',
action='store_true')
args = parser.parse_args()
if args.list:
if len(args.paths) != 1:
raise Exception("USAGE: xlsx-to-csv --list XLSX_FILE")
list_xlsx_sheets(args.paths[0], sys.stdout)
elif args.sheet:
if len(args.paths) == 1:
xlsx_path = args.paths[0]
output_path = sheet_name_to_filename(args.sheet)
elif len(args.paths) == 2:
xlsx_path = args.paths[0]
output_path = args.paths[1]
else:
raise Exception("USAGE xlsx-to-csv --sheet=NAME XLSX_FILE "
"[OUTPUT_FILE]")
xlsx_path_to_csv(xlsx_path, output_path, args.sheet, args.date_fmt)
else:
if len(args.paths) != 2:
raise Exception("USAGE: xlsx-to-csv XLSX_FILE OUTPUT_DIR")
if os.path.exists(args.paths[1]):
sys.stderr.write(
'Something is already at the output path: {}\n'.format(
args.paths[1]))
sys.exit(1)
os.makedirs(args.paths[1])
xlsx_path_to_csvs(args.paths[0], args.paths[1], args.date_fmt)
if __name__ == '__main__':
main()
================================================
FILE: data_tools/yaml_to_json.py
================================================
#!/usr/bin/env python3
import json
import sys
import yaml
def main():
try:
if len(sys.argv) > 2:
sys.stderr.write("USAGE: yaml-to-json [FILE]\n")
sys.exit(1)
elif len(sys.argv) == 2:
with open(sys.argv[1]) as f:
print(json.dumps(yaml.safe_load(f.read())))
else:
print(json.dumps(yaml.safe_load(sys.stdin.read())))
except yaml.scanner.ScannerError as e:
sys.stderr.write(str(e))
sys.exit(1)
if __name__ == '__main__':
main()
================================================
FILE: doc/check-tsv.1.md
================================================
% CHECK-TSV(1)
% Clark Grubb
% March 6, 2015
# NAME
check-tsv - check whether all rows in a TSV file have the same number of columns
# SYNOPSIS
check-tsv [TSV_FILE]
# DESCRIPTION
Check whether all rows in a TSV file have the same number of columns.
The exit status is 0 if all rows have the same number of fields and 1 if they do not.
For each number of fields the number of rows is printed out.
# OPTIONS
none
# SEE ALSO
`awk` (1), `tawk` (1), `tsv-header` (1), `trim-tsv` (1)
================================================
FILE: doc/convert-date.1.md
================================================
% CONVERT-DATE(1)
% Clark Grubb
% July 18, 2015
# NAME
convert-date - convert the date format of a column of tab-delimited data
# SYNOPSIS
convert-date [-i FMT] [-o FMT] [-c COLUMN] [-H]
# DESCRIPTION
Convert the dates in a specified column of a tab-delimited file.
By default the first column is processed. Use the `-c` flag to
specify a different column (the first column is indexed as zero).
If the file has a header, use the `-H` flag to not process it.
Use the `-i` and `-o` flags to set the input and output date formats.
If one of the formats is set to `%s` (i.e. Unix epoch) and the other
format is unspecified, it will be set to `%Y-%m-%dT%H:%M:%S` (ISO 8601 format).
Conversely, if one of the formats is set to `%Y-%m-%dT%H:%M:%S` and the
other is unspecified, it will be set to `%s`.
# OPTIONS
-c COLUMN
: the number (zero-based) of the column to convert.
-H / --header
: do not process the first line
-i FMT / --input-format FMT
: the strftime-style format used to parse the input
-o FMT / --output-format FMT
: the strftime-style format used to format the output.
# SEE ALSO
`date` (1), `strftime` (3), `strptime` (3)
================================================
FILE: doc/counting-sort.1.md
================================================
% COUNTING-SORT(1)
% Clark Grubb
% May 6, 2014
# NAME
counting-sort - perform counting sort on a file or standard input
# SYNOPSIS
counting-sort [FILE]
# DESCRIPTION
Counting sort is fast when the number of distinct values is small
compared to the total number of values. For example, when sorting
a file with 3M rows but only 300 distinct values, the regular `sort`
takes 2m30s whereas `counting-sort` only takes 3s.
`counting-sort` only does a lexical sort.
# OPTIONS
None
# SEE ALSO
`sort` (1)
http://en.wikipedia.org/wiki/Counting_sort
================================================
FILE: doc/csv-to-json.1.md
================================================
% CSV-TO-JSON(1)
% Clark Grubb
% June 4, 2013
# NAME
csv-to-json - convert CSV to JSON
# SYNOPSIS
csv-to-json OPTIONS [CSV_FILE]
# DESCRIPTION
Read a CSV file from the file specified on the command line, or from standard input, and write the corresponding JSON to standard output.
Each line of output contains a serialized JSON object. The values of the object come from the corresponding row of the CSV file; the header is used for the keys. The \--header flag should be used if the CSV file does not have a header.
# OPTIONS
-d DELIMITER, \--delimiter=DELIMITER
: Used to read CSV files which use DELIMITER to separate fields instead of a comma.
\--header=NAME[,NAME...]
: comma-separated list of column names
-q QUOTECHAR, \--quotechar=QUOTECHAR
: Used to read CSV files which use QUOTECHAR to quote fields instead of double quotes.
# SEE ALSO
`tsv-to-json` (1), `json-ruby` (1)
http://www.ietf.org/rfc/rfc4180.txt
http://json.org
================================================
FILE: doc/csv-to-postgres.1.md
================================================
% CSV-TO-POSTGRES(1)
% Clark Grubb
% March 21, 2015
# NAME
csv-to-postgres - import a CSV file to a PostgreSQL table
# SYNOPSIS
csv-to-postgres -f CSV_PATH -t TABLE [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]
# DESCRIPTION
Import a CSV file into a PostgreSQL table.
The table is not emptied before the new rows are imported.
All options other than `-t` and `-f` are passed to `psql`.
The `PGPASSWORD` environment variable can be used to pass the PostgreSQL
password to `psql`.
# OPTIONS
-d DB
: name of the PostgreSQL database
-f PATH
: path of the CSV file
-h HOST
: PostgreSQL server host
-p PORT
: the port number.
-t TABLE
: the name of the PostgreSQL table to import into
-U USER
: the PostgreSQL user to connect as
-w
: do not prompt for PostgreSQL password
-W
: prompt for PostgreSQL password
# SEE ALSO
`postgres-to-csv` (1)
================================================
FILE: doc/csv-to-tab.1.md
================================================
% CSV-TO-TAB(1)
% Clark Grubb
% February 16, 2013
# NAME
csv-to-tab - convert CSV to tab delimited
# SYNOPSIS
csv-to-tab OPTIONS [CSV_FILE]
# DESCRIPTION
Read a CSV file from the file specified on the command line, or from standard input, and write the corresponding tab delimited file to standard output.
In the tab delimited format, fields are delimited by tabs and records are terminated by an end-of-line marker. `csv-to-tab` uses newline as the end-of-line marker.
There is no mechanism for quoting tabs or newlines, and by default `csv-to-tab` will fail if they occur in the fields of the CSV file.
# OPTIONS
-e, \--escape
: Use backslash escape sequences to escape tabs, carriage returns, newlines, and backslashes.
-r, \--replace
: Replace tabs and characters that should be interpreted as newlines with spaces. The characters treated as newlines are: \\f \\n \\r \\v \\x85 \\u2028 \\u2029.
-x, \--strip
: Remove tabs, carriage returns, and newlines in fields.
# SEE ALSO
`tab-to-csv` (1)
http://www.ietf.org/rfc/rfc4180.txt
http://www.iana.org/assignments/media-types/text/tab-separated-values
================================================
FILE: doc/csv-to-xlsx.1.md
================================================
% CSV-TO-XLSX(1)
% Clark Grubb
% November 7, 2013
# NAME
csv-to-xlsx - convert CSV files to XLSX worksheets
# SYNOPSIS
csv-to-xlsx -o|--output-file XLSX\_PATH CSV\_PATH ...
# DESCRIPTION
Create an XLSX workbook from the CSV files specified on the command line.
Each CSV file becomes a worksheet in the workbook.
The names of the worksheets are derived from CSV file names. Excel worksheet names are limited to 31 characters and these characters are forbidden:
[ ] * ? / \ .
`csv-to-xlsx` replaces forbidden characters with spaces, squeezes multiple adjacent spaces to a single space, truncates to 31 characters, and trims marginal space. If this results in multiple sheets with the same name, an error is generated.
XLSX is the default format used by Excel 2007 and later.
# OPTIONS
-o PATH, \--output-file PATH
: the PATH of the XLSX file to create. It must have an .xlsx suffix.
# SEE ALSO
`xlsx-to-csv` (1)
http://www.ietf.org/rfc/rfc4180.txt
http://www.ecma-international.org/publications/standards/Ecma-376.htm
================================================
FILE: doc/date-seq.1.md
================================================
% DATE-SEQ(1)
% Clark Grubb
% June 17, 2013
# NAME
date-seq - print sequence of dates or times
# SYNOPSIS
date-seq [--format=FMT] [--weekdays=DAY[,DAY]...] YYYY[MM[DD[HH]]] YYYY[MM[DD[HH]]]
# DESCRIPTION
Generate a sequence of dates or times.
The command takes two arguments: the start date and the end date. The generated sequence is inclusive.
The format of the date arguments is YYYY[MM[DD[HH[MI[SS]]]]]. As little as the year or as much as the second can be specified. The end date must be the same length as the start date.
If the arguments have a YYYYMMDD format, the sequence will consist of days. If the arguments have a YYYYMMDDHH format, the sequence will consist of hours. Sequences of years, months, minutes, or seconds are also possible.
# OPTIONS
--format
: `strftime` style format string to control output.
--regex
: a regular expression which can be used to filter the sequence. The regular expression should be written to apply to the YYYY[MM[DD[HH[MI[SS]]]]] format, not the output format specified by the --format flag.
--weekdays
: comma separated list of weekdays. Dates for days outside the list are excluded.
# EXAMPLES
Every Monday, Wednesday, and Friday in October 2012:
date-seq --weekdays=Mon,Wed,Fri 20121001 20121031
Every fourth day starting October 1, 2012:
date-seq 20121001 20121101 | awk 'NR % 4 == 1'
The second day of each month of 2012 in YYYY-MM-DD format:
date-seq --format='%F' --regex='.{6}02' 20120101 20121231
The 30 most recent days in YYYYMMDD format:
date-seq 20100101 $(date +'%Y%m%d') | tail -30
# SEE ALSO
`strftime` (3), `seq` (1), `grep` (1), `awk` (1)
================================================
FILE: doc/header-sort.1.md
================================================
% HEADER-SORT(1)
% Clark Grubb
% June 4, 2013
# NAME
header-sort - sort file with header
# SYNOPSIS
header-sort \[OPTIONS\] FILE
# DESCRIPTION
Like `sort`, but the position of the first line is preserved.
# OPTIONS
See `sort` for available options.
# SEE ALSO
`sort` (1)
================================================
FILE: doc/highlight.1.md
================================================
% HIGHLIGHT(1)
% Clark Grubb
% September 12, 2013
# NAME
highlight - highlight text in a stream matching a regular expression
# SYNOPSIS
highlight REGEX [FILE]
highlight (--red|--green|--yellow|--blue|--magenta|--cyan|--white|--black)=REGEX ... [FILE]
highlight (-r|-g|-y|-b|-m|-c|-w)=REGEX ... [FILE]
highlight (--red-background|--green-background|--yellow-background)=REGEX ... [FILE]
highlight (--blue-background|--magenta-background|--cyan-background)=REGEX ... [FILE]
highlight (--white-background|--black-background)=REGEX ... [FILE]
highlight (--bold|--italic|--underline|--invert|--reverse)=REGEX ... [FILE]
# DESCRIPTION
Reads lines from file or standard input and writes them to standard out with any
substrings matching REGEX highlighted in red.
This is similar to `grep --color=always REGEX`, but grep will not print
lines which don't match REGEX at all.
The default color is red. The other choices are green, yellow, blue, magenta,
cyan, white, and black.
It is also possible to set the background highlight color or to invert
the video.
Furthermore it is possible to set some text effects: bold, italic, or underline.
Multiple patterns can be specified, but the results when patterns overlap are
unpredictable.
# EXAMPLES
Highlight which shells users are using:
highlight -r /bin/bash -g /bin/sh -b /usr/bin/zsh -m /bin/false < /etc/passwd
# OPTIONS
-r REGEX, \--red=REGEX
: highlight text matching REGEX in red.
-g REGEX, \--green=REGEX
: highlight text matching REGEX in green.
-y REGEX, \--yellow=REGEX
: highlight text matching REGEX in yellow.
-b REGEX, \--blue=REGEX
: highlight text matching REGEX in blue.
-m REGEX, \--magenta=REGEX
: highlight text matching REGEX in magenta.
-c REGEX, \--cyan=REGEX
: highlight text matching REGEX in cyan.
-w REGEX, \--white=REGEX
: highlight text matching REGEX in white.
\--black=REGEX
: highlight text matching REGEX in black.
\--red-background=REGEX
: highlight background of text matching REGEX in red.
\--green-background=REGEX
: highlight background of text matching REGEX in green.
\--yellow-background=REGEX
: highlight background of text matching REGEX in yellow.
\--blue-background=REGEX
: highlight background of text matching REGEX in blue.
\--magenta-background=REGEX
: highlight background of text matching REGEX in magenta.
\--cyan-background=REGEX
: highlight background of text matching REGEX in cyan.
\--white-background=REGEX
: highlight background of text matching REGEX in white.
\--black-background=REGEX
: highlight background of text matching REGEX in black.
\--bold=REGEX
: put text matching REGEX in bold text.
\--italic=REGEX
: put text matching REGEX in italic text. My terminal does not support this, however.
\--underline=REGEX
: underline text matching REGEX.
\--invert=REGEX, \--reverse=REGEX
: highlight text matching REGEX with reverse video.
# SEE ALSO
`grep` (1)
================================================
FILE: doc/html-table-to-csv.1.md
================================================
% HTML-TABLE-TO-CSV(1)
% Clark Grubb
% March 26, 2017
# NAME
html-table-to-csv - convert an HTML table to CSV
# SYNOPSIS
html-table-to-csv [-t TABLE\_NUM] [HTML_FILE]
# DESCRIPTION
Read an HTML file from the file specified on the command line, or from standard input, extract the contents of a table in the document, and write the corresponding CSV to standard output.
# OPTIONS
-t TABLE\_NUM, \--table=TABLE\_NUM
: Used to specify which table to extract from the HTML document. By default the first table, numbered 0, is extracted.
# SEE ALSO
http://www.ietf.org/rfc/rfc4180.txt
================================================
FILE: doc/join-tsv.1.md
================================================
% JOIN-TSV(1)
% Clark Grubb
% October 21, 2013
# NAME
join-tsv - perform a relation join on two TSV files
# SYNOPSIS
join-tsv --column=NAME [--null=VALUE|--no-null] [--left|--right|--full] TSV\_FILE1 TSV\_FILE2
# DESCRIPTION
Perform a relation join on two TSV files. The output is written to standard output in TSV format.
`join-tsv` assumes that TSV\_FILE1 and TSV\_FILE2 are in accordance with the IANA MIME type specification.
`join-tsv` is easier to use than `join` when working with TSV files because it preserves the headers. It allows specifying the join column by name. If the join column names differ, the column name of the left (i.e. first) file is used in the output.
`join-tsv` performs the join by reading the smaller file into memory. `join-tsv` can perform left, right, or full outer joins.
The default null value is the empty string. It is not used as a join value. It can be changed to something else with the `--null` flag. The `--no-null` flag can be used to treat all strings including the empty string as join values.
# OPTIONS
-C NAME, \--column=NAME
: the name of the join columns if they are the same. If they differ, use the -L and -R flags.
-L NAME, \--left-column=NAME
: used to specify the name of the join column in the left (i.e. first) TSV file.
-R NAME, \--right-column=NAME
: used to specify the name of the join column in the right (i.e. second) TSV file.
-f, \--full
: Perform a full outer join. Rows in either file with no matching row in the other file will be included in the output.
-l, \--left
: Perform a left outer join. Rows in TSV\_FILE1 with no matching row in TSV\_FILE2 will be included in the output.
-r, \--right
: Perform a right outer join. Rows in TSV\_FILE2 with no matching row in TSV\_FILE1 will be included in the output.
-n VALUE, \--null=VALUE
: use VALUE as the null value. The default null value is the empty string.
-N, \--no-null
: no null value. The empty string can be used as a join value.
-o, \--outer-null
: the null value used in outer joins.
# SEE ALSO
`join` (1)
http://www.iana.org/assignments/media-types/text/tab-separated-values
================================================
FILE: doc/json-diff.1.md
================================================
% JSON-DIFF(1)
% Clark Grubb
% July 29, 2014
# NAME
json-diff - run diff on two JSON documents
# SYNOPSIS
json-diff [DIFF_OPTIONS] PATH1 PATH2
# DESCRIPTION
Run `diff` on two JSON documents. Each document is normalized using `python -mjson.tool`.
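The same normalize-then-compare idea in pure Python (the script itself shells out to `diff`; `sort_keys` is used here so that key order is stable on every Python version):

    import difflib
    import json
    def json_diff(path1, path2):
        def normalized(path):
            with open(path) as f:
                doc = json.load(f)
            return json.dumps(doc, indent=4, sort_keys=True).splitlines()
        return '\n'.join(difflib.unified_diff(
            normalized(path1), normalized(path2),
            fromfile=path1, tofile=path2, lineterm=''))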
# OPTIONS
Any options are passed to `diff`.
# SEE ALSO
`diff` (1)
================================================
FILE: doc/normalize-utf8.1.md
================================================
% NORMALIZE-UTF8(1)
% Clark Grubb
% February 8, 2014
# NAME
normalize-utf8 - convert UTF-8 encoded files or standard input to a normalized form
# SYNOPSIS
normalize-utf8 [--nfc|--nfd|--nfkc|--nfkd] [FILE]
# DESCRIPTION
Put UTF-8 encoded Unicode text into a normalized form.
Unicode contains different character sequences which are
rendered the same way. An example is LATIN SMALL LETTER C WITH CEDILLA,
which can be represented as a single character, U+00E7, or as LATIN SMALL LETTER C
followed by COMBINING CEDILLA: U+0063 U+0327. When
performing a string comparison, the two sequences should often
be regarded as identical. If the strings being compared have
been put into normal form, then a simple string comparison can be
used.
The Unicode standard defines four normalization forms. NFC (Normal Form C),
which is the default format used by `normalize-utf8`, favors single character
representations over multiple character representations containing
combining marks. NFC is also called W3C normalization.
Conversely, NFD (Normal Form D) favors multiple character representations
consisting of a simple character representation followed by a combining mark. Converting
a string to NFD is faster because the algorithm for converting a string to NFC starts by
converting it to NFD.
NFKC and NFKD conflate compatibility composites. These are sequences which are
visually distinct but semantically the same. Examples are the ff and ffi ligatures.
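Python's `unicodedata` module implements the same four forms, which makes the behavior easy to demonstrate:

    import unicodedata
    composed = '\u00e7'     # LATIN SMALL LETTER C WITH CEDILLA
    decomposed = 'c\u0327'  # LATIN SMALL LETTER C + COMBINING CEDILLA
    assert composed != decomposed
    assert unicodedata.normalize('NFC', decomposed) == composed
    assert unicodedata.normalize('NFD', composed) == decomposed
    # NFKC conflates compatibility composites such as the ffi ligature:
    assert unicodedata.normalize('NFKC', '\ufb03') == 'ffi'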
# OPTIONS
--nfc
: write input to standard out in Normal Form C
--nfd
: write input to standard out in Normal Form D
--nfkc
: write input to standard out in Normal Form KC
--nfkd
: write input to standard out in Normal Form KD
# SEE ALSO
`utf8-viewer` (1)
http://unicode.org/reports/tr15/
http://www.unicode.org/reports/tr36/
================================================
FILE: doc/postgres-to-csv.1.md
================================================
% POSTGRES-TO-CSV(1)
% Clark Grubb
% March 21, 2015
# NAME
postgres-to-csv - export a PostgreSQL table to a CSV file
# SYNOPSIS
postgres-to-csv -t TABLE [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]
# DESCRIPTION
Write a PostgreSQL table to standard out in CSV format.
All options other than `-t` are passed to `psql`.
The `PGPASSWORD` environment variable can be used to pass the PostgreSQL
password to `psql`.
# OPTIONS
-d DB
: name of the PostgreSQL database
-h HOST
: PostgreSQL server host
-p PORT
: the port number.
-t TABLE
: the name of the PostgreSQL table to export
-U USER
: the PostgreSQL user to connect as
-w
: do not prompt for PostgreSQL password
-W
: prompt for PostgreSQL password
# SEE ALSO
`csv-to-postgres` (1)
================================================
FILE: doc/reservoir-sample.1.md
================================================
% RESERVOIR-SAMPLE(1)
% Clark Grubb
% October 13, 2013
# NAME
reservoir-sample - sample lines from file or standard input
# SYNOPSIS
reservoir-sample [-r SEED|--random-seed=SEED] (-s NUM|--size=NUM) [FILE]
# DESCRIPTION
Select NUM lines randomly from FILE or standard input. Each line is equally likely to be chosen.
The script uses reservoir sampling. It is more efficient than randomly shuffling the file
with `sort -R` and then taking the first N lines with `head`.
To select a sample size which is proportional to the size of the input, use `awk`:
    awk 'rand() < 0.1'
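The algorithm (Algorithm R) keeps a reservoir of NUM lines and replaces a random entry with decreasing probability as more lines are read; a sketch, not necessarily the script's exact code:

    import random
    import sys
    def reservoir_sample(lines, size, seed=None):
        rng = random.Random(seed)
        sample = []
        for i, line in enumerate(lines):
            if i < size:
                sample.append(line)    # fill the reservoir
            else:
                j = rng.randint(0, i)  # uniform over the i+1 lines seen
                if j < size:
                    sample[j] = line   # replace with probability size/(i+1)
        return sample
    sys.stdout.writelines(reservoir_sample(sys.stdin, 10))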
# OPTIONS
-r SEED, \--random-seed=SEED
: a seed value to be passed to the random number generator.
-s NUM, \--size=NUM
: the size of the sample to select
# SEE ALSO
`sort` (1), `awk` (1), `shuf` (1)
https://en.wikipedia.org/wiki/Reservoir_sampling
================================================
FILE: doc/set-diff.1.md
================================================
% SET-DIFF(1)
% Clark Grubb
% May 6, 2013
# NAME
set-diff - find lines in first file which are not in the second
# SYNOPSIS
set-diff FILE1 FILE2
# DESCRIPTION
List the lines which are in the first file and not in the second.
The lines are output in a sorted order and not necessarily the order of the first file.
If the files are already sorted, it is faster to use `comm -23`.
`comm -23` gives erroneous results with no warning if the input files are not sorted.
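Conceptually the operation is a set difference over whole lines, with sorted output:

    def set_diff(path1, path2):
        with open(path1) as f1, open(path2) as f2:
            return sorted(set(f1) - set(f2))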
# OPTIONS
None
# SEE ALSO
`comm` (1)
`set-intersect` (1)
================================================
FILE: doc/set-intersect.1.md
================================================
% SET-INTERSECT(1)
% Clark Grubb
% May 6, 2013
# NAME
set-intersect - find lines common to two files
# SYNOPSIS
set-intersect FILE1 FILE2
# DESCRIPTION
List the lines which are in both the first file and the second file.
If the files are already sorted, it is faster to use `comm -12`.
`comm -12` gives erroneous results with no warning if the input files
are not sorted.
# OPTIONS
None
# SEE ALSO
`comm` (1)
`set-diff` (1)
================================================
FILE: doc/tab-to-csv.1.md
================================================
% TAB-TO-CSV(1)
% Clark Grubb
% February 16, 2013
# NAME
tab-to-csv - convert tab delimited to CSV
# SYNOPSIS
tab-to-csv OPTIONS [TSV_FILE]
# DESCRIPTION
Read a tab delimited file from the file specified on the command line or from standard input and write the corresponding CSV file to standard output.
In the tab delimited format fields are delimited by tabs and records are terminated by an end-of-line marker.
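The default conversion is illustrated by this Python sketch, where the `csv` module supplies quoting only where CSV requires it (the tool itself is written in C):

    import csv
    import sys
    writer = csv.writer(sys.stdout)
    for line in sys.stdin:
        writer.writerow(line.rstrip('\n').split('\t'))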
# OPTIONS
-u, \--unescape
: Interpret the following backslash sequences when encountered in the data: \n, \r, \t, \\.
# SEE ALSO
`csv-to-tab` (1)
http://www.ietf.org/rfc/rfc4180.txt
http://www.iana.org/assignments/media-types/text/tab-separated-values
================================================
FILE: doc/tokenize.1.md
================================================
% TOKENIZE(1)
% Clark Grubb
% February 15, 2015
# NAME
tokenize - extract words from English language text
# SYNOPSIS
tokenize [-n]
# DESCRIPTION
Extract words from English language text. Words consist of adjacent letters, numbers, and
these punctuation characters:
    '*+-/=\^_`|~
Control characters and these punctuation characters delimit words and are removed:
    !#$%&(),:;<>?@[]{}
Space characters also delimit words. The words are written out separated by spaces unless
the `-n` flag is used, in which case they are separated by newlines.
Non-ASCII characters delimit words and are removed. It might be desirable to replace accented Latin characters with the unaccented versions. This command can be used:
    $ iconv -f utf-8 -t ascii//TRANSLIT
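The word rule above can be approximated by a single ASCII-only regular expression; a Python sketch (illustration only):

    import re
    import sys
    WORD = re.compile(r"[A-Za-z0-9'*+\-/=\\^_`|~]+")  # ASCII-only by design
    words = WORD.findall(sys.stdin.read())
    print(' '.join(words))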
# OPTIONS
-n
: write the words out one per line.
# SEE ALSO
`iconv` (1)
================================================
FILE: doc/trim-tsv.1.md
================================================
% TRIM-TSV(1)
% Clark Grubb
% September 25, 2013
# NAME
trim-tsv - trim whitespace from fields in a tab delimited file
# SYNOPSIS
trim-tsv [TSV_FILE]
# DESCRIPTION
Trim whitespace from fields in a tab delimited file. If no path is specified on the command line, the tool reads from standard input.
# OPTIONS
none
# SEE ALSO
`tawk` (1)
================================================
FILE: doc/tsv-header.1.md
================================================
% TSV-HEADER(1)
% Clark Grubb
% March 6, 2015
# NAME
tsv-header - number the columns in a TSV header
# SYNOPSIS
tsv-header [TSV_FILE]
# DESCRIPTION
Display the columns of a TSV file header, one per line, with their ordinal positions.
This is useful for mapping `awk` script variables, e.g. `$1`, `$2`, ..., to column names.
# OPTIONS
none
# SEE ALSO
`awk` (1), `tawk` (1), `check-tsv` (1)
================================================
FILE: doc/tsv-to-json.1.md
================================================
% TSV-TO-JSON(1)
% Clark Grubb
% June 4, 2013
# NAME
tsv-to-json - convert TSV to JSON
# SYNOPSIS
tsv-to-json OPTIONS [TSV_FILE]
# DESCRIPTION
Read a TSV file from the file specified on the command line or from standard input and write the corresponding JSON to standard output.
Each row of the JSON output contains a serialized JSON object. The values of the object come from the corresponding row of the TSV file, and the header is used for the keys.
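The whole transformation fits in a few lines of Python; a sketch, not the tool's exact code:

    import json
    import sys
    rows = (line.rstrip('\n').split('\t') for line in sys.stdin)
    header = next(rows)
    for row in rows:
        print(json.dumps(dict(zip(header, row))))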
# OPTIONS
None
# SEE ALSO
`csv-to-json` (1), `json-ruby` (1)
http://www.iana.org/assignments/media-types/text/tab-separated-values
http://json.org
================================================
FILE: doc/utf8-category.1.md
================================================
% UTF8-CATEGORY(1)
% Clark Grubb
% February 14, 2015
# NAME
utf8-category - tally UTF-8 encoded characters by general category
# SYNOPSIS
utf8-category [-l|\--long-names] [-c|\--count-ascii|-s|\--skip-ascii]
# DESCRIPTION
Tally the UTF-8 encoded characters in the standard input stream by general category.
    Abbr  Long                   Description
    ----  ----                   -----------
    Lu    Uppercase_Letter       an uppercase letter
    Ll    Lowercase_Letter       a lowercase letter
    Lt    Titlecase_Letter       a digraphic character, with first part uppercase
    LC    Cased_Letter           Lu | Ll | Lt
    Lm    Modifier_Letter        a modifier letter
    Lo    Other_Letter           other letters, including syllables and ideographs
    L     Letter                 Lu | Ll | Lt | Lm | Lo
    Mn    Nonspacing_Mark        a nonspacing combining mark (zero advance width)
    Mc    Spacing_Mark           a spacing combining mark (positive advance width)
    Me    Enclosing_Mark         an enclosing combining mark
    M     Mark                   Mn | Mc | Me
    Nd    Decimal_Number         a decimal digit
    Nl    Letter_Number          a letterlike numeric character
    No    Other_Number           a numeric character of other type
    N     Number                 Nd | Nl | No
    Pc    Connector_Punctuation  a connecting punctuation mark, like a tie
    Pd    Dash_Punctuation       a dash or hyphen punctuation mark
    Ps    Open_Punctuation       an opening punctuation mark (of a pair)
    Pe    Close_Punctuation      a closing punctuation mark (of a pair)
    Pi    Initial_Punctuation    an initial quotation mark
    Pf    Final_Punctuation      a final quotation mark
    Po    Other_Punctuation      a punctuation mark of other type
    P     Punctuation            Pc | Pd | Ps | Pe | Pi | Pf | Po
    Sm    Math_Symbol            a symbol of mathematical use
    Sc    Currency_Symbol        a currency sign
    Sk    Modifier_Symbol        a non-letterlike modifier symbol
    So    Other_Symbol           a symbol of other type
    S     Symbol                 Sm | Sc | Sk | So
    Zs    Space_Separator        a space character (of various non-zero widths)
    Zl    Line_Separator         U+2028 LINE SEPARATOR only
    Zp    Paragraph_Separator    U+2029 PARAGRAPH SEPARATOR only
    Z     Separator              Zs | Zl | Zp
    Cc    Control                a C0 or C1 control code
    Cf    Format                 a format control character
    Cs    Surrogate              a surrogate code point
    Co    Private_Use            a private-use character
    Cn    Unassigned             a reserved unassigned code point or a noncharacter
    C     Other                  Cc | Cf | Cs | Co | Cn
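Python's `unicodedata.category` returns these two-character abbreviations, so the tally can be sketched as follows (the real tool's output format may differ):

    import sys
    import unicodedata
    from collections import Counter
    counts = Counter(unicodedata.category(c) for c in sys.stdin.read())
    for category, count in counts.most_common():
        print(category, count, sep='\t')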
# OPTIONS
-c, \--count-ascii
: treat ASCII characters as a separate general category called "ASCII".
-l, \--long-names
: use long names for the general categories instead of the two character abbreviations.
-s, \--skip-ascii
: skip ASCII characters. Only characters with Unicode point U+0080 and higher are counted.
# SEE ALSO
http://unicode.org/reports/tr44/#General_Category_Values
================================================
FILE: doc/utf8-script.1.md
================================================
% UTF8-SCRIPT(1)
% Clark Grubb
% February 14, 2015
# NAME
utf8-script - tally UTF-8 encoded characters by Unicode script
# SYNOPSIS
utf8-script [-c|\--count-ascii|-s|\--skip-ascii]
# DESCRIPTION
Tally the UTF-8 encoded characters in the standard input stream by Unicode script.
# OPTIONS
-c, \--count-ascii
: treat ASCII characters as a separate script called "ASCII".
-s, \--skip-ascii
: skip ASCII characters. Only characters with Unicode point U+0080 and higher are counted.
# SEE ALSO
http://unicode.org/Public/UNIDATA/Scripts.txt
================================================
FILE: doc/xlsx-to-csv.1.md
================================================
% XLSX-TO-CSV(1)
% Clark Grubb
% May 4, 2013
# NAME
xlsx-to-csv - convert .xlsx to .csv
# SYNOPSIS
xlsx-to-csv XLSX\_FILE OUTPUT_DIR
xlsx-to-csv --sheet=SHEET XLSX\_FILE [OUTPUT\_FILE]
xlsx-to-csv --list XLSX\_FILE
# DESCRIPTION
Read a .xlsx file and create a .csv file in OUTPUT\_DIR for each worksheet.
OUTPUT\_DIR must not already exist.
Output is UTF-8 encoded.
.xlsx files are the format used by Excel since 2007. The .xlsx file format is defined by ECMA-376. An .xlsx file is a ZIP archive of a directory containing XML documents. The `unzip -l` command can be used to list the contents of a ZIP archive and hence of an .xlsx file.
`xlsx-to-csv` also works on .xls files which were used by Excel before 2007.
The tool can easily take a minute or more to process a large (~100MB) workbook. Unfortunately, it takes about this long just to list the sheet names with the `--list` flag.
Hence it is more efficient to extract all of the sheets from a large workbook even if only one of the sheets is needed.
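A sketch of the .xlsx path using openpyxl, which appears in the project's requirements.txt (an illustration; the real tool also handles .xls files via xlrd and applies the date format option):

    import csv
    import os
    from openpyxl import load_workbook
    def xlsx_to_csv(xlsx_path, output_dir):
        os.makedirs(output_dir)  # raises an error if OUTPUT_DIR exists
        wb = load_workbook(xlsx_path, read_only=True)
        for ws in wb.worksheets:
            out_path = os.path.join(output_dir, ws.title + '.csv')
            with open(out_path, 'w', newline='') as f:
                writer = csv.writer(f)
                for row in ws.iter_rows():
                    writer.writerow(cell.value for cell in row)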
# OPTIONS
--list
: list the sheets in XLSX\_FILE
--sheet=SHEET
: only convert SHEET to a .csv file.
--date-format=STRFTIME_FMT
: a `strftime` style format to be used for Excel dates. The default is the ISO 8601 format: '%Y-%m-%dT%H:%M:%S'.
# SEE ALSO
`csv-to-tab` (1), `strftime` (3)
http://www.ecma-international.org/publications/standards/Ecma-376.htm
================================================
FILE: man/check-tsv.1
================================================
.TH CHECK\-TSV 1 "March 6, 2015"
.SH NAME
.PP
check\-tsv \- check whether all rows in a TSV file have the same number
of columns
.SH SYNOPSIS
.PP
check\-tsv [TSV_FILE]
.SH DESCRIPTION
.PP
Check whether all rows in a TSV file have the same number of columns.
.PP
The exit status is 0 if all rows have the same number of fields and 1 if
they do not.
.PP
For each number of fields the number of rows is printed out.
.SH OPTIONS
.PP
none
.SH SEE ALSO
.PP
\f[C]awk\f[] (1), \f[C]tawk\f[] (1), \f[C]tsv\-header\f[] (1),
\f[C]trim\-tsv\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/convert-date.1
================================================
.TH "CONVERT\-DATE" "1" "July 18, 2015" "" ""
.SH NAME
.PP
convert\-date \- convert the date format of a column of tab\-delimited
data
.SH SYNOPSIS
.PP
convert\-date [\-i FMT] [\-o FMT] [\-c COLUMN] [\-H]
.SH DESCRIPTION
.PP
Convert the dates in a specified column of a tab\-delimited file.
.PP
By default the first column is processed.
Use the \f[C]\-c\f[] flag to specify a different column (the first
column is indexed as zero).
.PP
If the file has a header, use the \f[C]\-H\f[] flag to not process it.
.PP
Use the \f[C]\-i\f[] and \f[C]\-o\f[] flags to set the input and output
date formats.
.PP
If one of the formats is set to \f[C]%s\f[] (i.e.
Unix epoch) and the other format is unspecified, it will be set to
\f[C]%Y\-%m\-%dT%H:%M:%S\f[] (ISO 8601 format).
.PP
Conversely, if one of the formats is set to \f[C]%Y\-%m\-%dT%H:%M:%S\f[]
and the other is unspecified it will be set to \f[C]%s\f[].
.SH OPTIONS
.TP
.B \-c COLUMN
the column number (zero\-based) of the column to convert.
.RS
.RE
.TP
.B \-H / \-\-header
do not process the first line
.RS
.RE
.TP
.B \-i FMT / \-\-input\-format FMT
the strftime\-style format used to parse the input
.RS
.RE
.TP
.B \-o FMT / \-\-output\-format FMT
the strftime\-style format used to format the output.
.RS
.RE
.SH SEE ALSO
.PP
\f[C]date\f[] (1), \f[C]strftime\f[] (3), \f[C]strptime\f[] (3)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/counting-sort.1
================================================
.TH COUNTING\-SORT 1 "May 6, 2014"
.SH NAME
.PP
counting\-sort \- perform counting sort on a file or standard input
.SH SYNOPSIS
.PP
counting\-sort [FILE]
.SH DESCRIPTION
.PP
Counting sort is fast when the number of distinct values is small
compared to the total number of values.
For example, when sorting a file with 3M rows but only 300 distinct
values, the regular \f[C]sort\f[] takes 2m30s whereas
\f[C]counting\-sort\f[] only takes 3s.
.PP
\f[C]counting\-sort\f[] only does a lexical sort.
.SH OPTIONS
.PP
None
.SH SEE ALSO
.PP
\f[C]sort\f[] (1)
.PP
http://en.wikipedia.org/wiki/Counting_sort
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/csv-to-json.1
================================================
.TH CSV-TO-JSON 1 "June 4, 2013"
.SH NAME
.PP
csv-to-json - convert CSV to JSON
.SH SYNOPSIS
.PP
csv-to-json OPTIONS [CSV_FILE]
.SH DESCRIPTION
.PP
Read a CSV file from the file specified on the command line or from
standard input and write the corresponding JSON to standard output.
.PP
Each row of the JSON output contains a serialized JSON object.
The values of the object come from the corresponding row of the CSV
file; the header is used for the keys.
The --header flag should be used if the CSV file does not have a header.
.SH OPTIONS
.PP
-d DELIMITER, --delimiter=DELIMITER : Used to read CSV files which use
DELIMITER to separate fields instead of a comma.
.PP
--header=NAME[,NAME...] : comma-separated list of column names
.PP
-q QUOTECHAR, --quotechar=QUOTECHAR : Used to read CSV files which use
QUOTECHAR to quote fields instead of double quotes.
.SH SEE ALSO
.PP
\f[C]tsv-to-json\f[] (1), \f[C]json-ruby\f[] (1)
.PP
http://www.ietf.org/rfc/rfc4180.txt
.PP
http://json.org
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/csv-to-postgres.1
================================================
.TH CSV-TO-POSTGRES 1 "March 21, 2015"
.SH NAME
.PP
csv-to-postgres - import a CSV file to a PostgreSQL table
.SH SYNOPSIS
.PP
csv-to-postgres -f CSV_PATH -t TABLE [-d DB] [-h HOST] [-p PORT] [-U
USER] [-w|-W]
.SH DESCRIPTION
.PP
Import a CSV file into a PostgreSQL table.
.PP
The table is not emptied before the new rows are imported.
.PP
All options other than \f[C]-t\f[] and \f[C]-f\f[] are passed to
\f[C]psql\f[].
.PP
The \f[C]PGPASSWORD\f[] environment variable can be used to pass the
PostgreSQL password to \f[C]psql\f[].
.SH OPTIONS
.PP
-d DB : name of the PostgreSQL database
.PP
-f PATH : path of the CSV file
.PP
-h HOST : PostgreSQL server host
.PP
-p PORT : the port number.
.PP
-t TABLE : the name of the PostgreSQL table to export
.PP
-U USER : the PostgreSQL user to connect as
.PP
-w : do not prompt for PostgreSQL password
.PP
-W : prompt for PostgreSQL password
.SH SEE ALSO
.PP
\f[C]postgres-to-csv\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/csv-to-tab.1
================================================
.TH "CSV\-TO\-TAB" "1" "February 16, 2013" "" ""
.SH NAME
.PP
csv\-to\-tab \- convert CSV to tab delimited
.SH SYNOPSIS
.PP
csv\-to\-tab OPTIONS [CSV_FILE]
.SH DESCRIPTION
.PP
Read a CSV file from the file specified on the command line or from
standard input and write the corresponding tab delimited file to standard
output.
.PP
In the tab delimited format fields are delimited by tabs and records are
terminated by an end\-of\-line marker.
\f[C]csv\-to\-tab\f[] uses newline as the end\-of\-line marker.
.PP
There is no mechanism for quoting tabs or newlines, and by default
\f[C]csv\-to\-tab\f[] will fail if they occur in the fields of the CSV
file.
.SH OPTIONS
.TP
.B \-e, \-\-escape
Use backslash escape sequences to escape tabs, carriage returns,
newlines, and backslashes.
.RS
.RE
.TP
.B \-r, \-\-replace
Replace tabs and characters that should be interpreted as newlines with
spaces.
The characters treated as newlines are: \\f \\n \\r \\v \\x85 \\u2028
\\u2029.
.RS
.RE
.TP
.B \-x, \-\-strip
Remove tabs, carriage returns, and newlines in fields.
.RS
.RE
.SH SEE ALSO
.PP
\f[C]tab\-to\-csv\f[] (1)
.PP
http://www.ietf.org/rfc/rfc4180.txt
.PP
http://www.iana.org/assignments/media\-types/text/tab\-separated\-values
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/csv-to-xlsx.1
================================================
.TH CSV\-TO\-XLSX 1 "November 7, 2013"
.SH NAME
.PP
csv\-to\-xlsx \- convert CSV files to XLSX worksheets
.SH SYNOPSIS
.PP
csv\-to\-xlsx \-o|\-\-output\-file XLSX_PATH CSV_PATH ...
.SH DESCRIPTION
.PP
Create an XLSX workbook from the CSV files specified on the command
line.
.PP
Each CSV file becomes a worksheet in the workbook.
.PP
The names of the worksheets are derived from CSV file names.
Excel worksheet names are limited to 31 characters and these characters
are forbidden:
.IP
.nf
\f[C]
[\ ]\ *\ ?\ /\ \\\ .
\f[]
.fi
.PP
\f[C]csv\-to\-xlsx\f[] replaces forbidden characters with spaces,
squeezes multiple adjacent spaces to a single space, truncates to 31
characters, and trims marginal space.
If this results in multiple sheets with the same name an error is
generated.
.PP
XLSX is the default format used by Excel 2007 and later.
.SH OPTIONS
.PP
\-o PATH, \-\-output\-file PATH : the PATH of the XLSX file to create.
It must have an .xlsx suffix.
.SH SEE ALSO
.PP
\f[C]xlsx\-to\-csv\f[] (1)
.PP
http://www.ietf.org/rfc/rfc4180.txt
.PP
http://www.ecma\-international.org/publications/standards/Ecma\-376.htm
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/date-seq.1
================================================
.TH DATE\-SEQ 1 "June 17, 2013"
.SH NAME
.PP
date\-seq \- print sequence of dates or times
.SH SYNOPSIS
.PP
date\-seq [\-\-format=FMT][\-\-weekdays=DAY[,DAY]...] YYYY[MM[DD[HH]]]
YYYY[MM[DD[HH]]]
.SH DESCRIPTION
.PP
Generate a sequence of dates or times.
.PP
The command takes two arguments: the start date and the end date.
The generated sequence is inclusive.
.PP
The format of the date arguments is YYYY[MM[DD[HH[MI[SS]]]]].
As little as the year or as much as the second can be specified.
The end date must be the same length as the start date.
.PP
If the arguments have a YYYYMMDD format, the sequence will consist of
days.
If the arguments have a YYYYMMDDHH format, the sequence will consist of
hours.
Sequences of years, months, minutes, or seconds are also possible.
.SH OPTIONS
.PP
\-\-format : \f[C]strftime\f[] style format string to control output.
.PP
\-\-regex : a regular expression which can be used to filter the
sequence.
The regular expression should be written to apply to the
YYYY[MM[DD[HH[MI[SS]]]]] format, not the output format specified by the
\-\-format flag.
.PP
\-\-weekdays : comma separated list of weekdays.
Dates for days outside the list are excluded.
.SH EXAMPLES
.PP
Every Monday, Wednesday, and Friday in October 2012:
.IP
.nf
\f[C]
date\-seq\ \-\-weekdays=Mon,Wed,Fri\ 20121001\ 20121031
\f[]
.fi
.PP
Every fourth day starting October 1, 2012:
.IP
.nf
\f[C]
date\-seq\ 20121001\ 20121101\ |\ awk\ \[aq]NR\ %\ 4\ ==\ 0\[aq]
\f[]
.fi
.PP
The second day of each month of 2012 in YYYY\-MM\-DD format:
.IP
.nf
\f[C]
date\-seq\ \-\-format=\[aq]%F\[aq]\ \-\-regex=\[aq].{6}02\[aq]\ 20120101\ 20121231
\f[]
.fi
.PP
The 30 most recent days in YYYYMMDD format:
.IP
.nf
\f[C]
date\-seq\ 20100101\ $(date\ +\[aq]%Y%m%d\[aq])\ |\ tail\ \-30
\f[]
.fi
.SH SEE ALSO
.PP
\f[C]strftime\f[] (3), \f[C]seq\f[] (1), \f[C]grep\f[] (1), \f[C]awk\f[]
(1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/header-sort.1
================================================
.TH HEADER\-SORT 1 "June 4, 2013"
.SH NAME
.PP
header\-sort \- sort file with header
.SH SYNOPSIS
.PP
header\-sort [OPTIONS] FILE
.SH DESCRIPTION
.PP
Like \f[C]sort\f[], but the position of the first line is preserved.
.SH OPTIONS
.PP
See \f[C]sort\f[] for available options.
.SH SEE ALSO
.PP
\f[C]sort\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/highlight.1
================================================
.TH HIGHLIGHT 1 "September 12, 2013"
.SH NAME
.PP
highlight \- highlight text in a stream matching a regular expression
.SH SYNOPSIS
.PP
highlight REGEX [FILE]
.PP
highlight
(\-\-red|\-\-green|\-\-yellow|\-\-blue|\-\-magenta|\-\-cyan|\-\-white|\-\-black)=REGEX
...
[FILE]
.PP
highlight (\-r|\-g|\-y|\-b|\-m|\-c|\-w)=REGEX ...
[FILE]
.PP
highlight
(\-\-red\-background|\-\-green\-background|\-\-yellow\-background)=REGEX
...
[FILE]
.PP
highlight
(\-\-blue\-background|\-\-magenta\-background|\-\-cyan\-background)=REGEX
...
[FILE]
.PP
highlight (\-\-white\-background|\-\-black\-background)=REGEX ...
[FILE]
.PP
highlight (\-\-bold|\-\-italic|\-\-underline|\-\-invert|\-\-reverse)=REGEX ...
[FILE]
.SH DESCRIPTION
.PP
Reads lines from file or standard input and writes them to standard out
with any substrings matching REGEX highlighted in red.
.PP
This is similar to \f[C]grep\ \-\-color=always\ REGEX\f[], but grep will
not print lines which don\[aq]t match REGEX at all.
.PP
The default color is red.
The other choices are green, yellow, blue, magenta, cyan, white, and
black.
.PP
It is also possible to set the background highlight color or to invert
the video.
.PP
Furthermore it is possible to set some text effects: bold, italic, or
underline.
.PP
Multiple patterns can be specified, but the results when patterns
overlap are unpredictable.
.SH EXAMPLES
.PP
Highlight which shells users are using:
.IP
.nf
\f[C]
highlight\ \-r\ /bin/bash\ \-g\ /bin/sh\ \-b\ /usr/bin/zsh\ \-m\ /bin/false\ <\ /etc/passwd
\f[]
.fi
.SH OPTIONS
.PP
\-r REGEX, \-\-red=REGEX : highlight text matching REGEX in red.
.PP
\-g REGEX, \-\-green=REGEX : highlight text matching REGEX in green.
.PP
\-y REGEX, \-\-yellow=REGEX : highlight text matching REGEX in yellow.
.PP
\-b REGEX, \-\-blue=REGEX : highlight text matching REGEX in blue.
.PP
\-m REGEX, \-\-magenta=REGEX : highlight text matching REGEX in magenta.
.PP
\-c REGEX, \-\-cyan=REGEX : highlight text matching REGEX in cyan.
.PP
\-w REGEX, \-\-white=REGEX : highlight text matching REGEX in white.
.PP
\-\-black=REGEX : highlight text matching REGEX in black.
.PP
\-\-red\-background=REGEX : highlight background of text matching REGEX
in red.
.PP
\-\-green\-background=REGEX : highlight background of text matching
REGEX in green.
.PP
\-\-yellow\-background=REGEX : highlight background of text matching
REGEX in yellow.
.PP
\-\-blue\-background=REGEX : highlight background of text matching REGEX
in blue.
.PP
\-\-magenta\-background=REGEX : highlight background of text matching
REGEX in magenta.
.PP
\-\-cyan\-background=REGEX : highlight background of text matching REGEX
in cyan.
.PP
\-\-white\-background=REGEX : highlight background of text matching
REGEX in white.
.PP
\-\-black\-background=REGEX : highlight background of text matching
REGEX in black.
.PP
\-\-bold=REGEX : put text matching REGEX in bold text.
.PP
\-\-italic=REGEX : put text matching REGEX in italic text.
My terminal does not support this, however.
.PP
\-\-underline=REGEX : underline text matching REGEX.
.PP
\-\-invert=REGEX, \-\-reverse=REGEX : highlight text matching REGEX with
reverse video.
.SH SEE ALSO
.PP
\f[C]grep\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/html-table-to-csv.1
================================================
.\" Automatically generated by Pandoc 1.19.2.1
.\"
.TH "HTML\-TABLE\-TO\-CSV" "1" "March 26, 2017" "" ""
.hy
.SH NAME
.PP
html\-table\-to\-csv \- convert an HTML table to CSV
.SH SYNOPSIS
.PP
html\-table\-to\-csv [\-t TABLE_NUM] [HTML_FILE]
.SH DESCRIPTION
.PP
Read an HTML file from the file specified on the command line or from
standard input, extract the contents of a table in the document, and
write the corresponding CSV to standard output.
.SH OPTIONS
.TP
.B \-t TABLE_NUM, \-\-table=TABLE_NUM
Used to specify which table to extract from the HTML document.
By default the first table, numbered 0, is extracted.
.RS
.RE
.SH SEE ALSO
.PP
http://www.ietf.org/rfc/rfc4180.txt
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/iso_8859-1.7
================================================
'\" t
.\" Copyright 1993-1995 Daniel Quinlan (quinlan@yggdrasil.com)
.\"
.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
.\" This is free documentation; you can redistribute it and/or
.\" modify it under the terms of the GNU General Public License as
.\" published by the Free Software Foundation; either version 2 of
.\" the License, or (at your option) any later version.
.\"
.\" The GNU General Public License's references to "object code"
.\" and "executables" are to be interpreted as the output of any
.\" document formatting or typesetting system, including
.\" intermediate and printed output.
.\"
.\" This manual is distributed in the hope that it will be useful,
.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
.\" GNU General Public License for more details.
.\"
.\" You should have received a copy of the GNU General Public
.\" License along with this manual; if not, see
.\" <http://www.gnu.org/licenses/>.
.\" %%%LICENSE_END
.\"
.\" Slightly rearranged, aeb, 950713
.\" Updated, dpo, 990531
.TH ISO_8859-1 7 2012-08-14 "Linux" "Linux Programmer's Manual"
.nh
.SH NAME
iso_8859-1 \- ISO 8859-1 character set encoded in octal, decimal,
and hexadecimal
.SH DESCRIPTION
The ISO 8859 standard includes several 8-bit extensions to the ASCII
character set (also known as ISO 646-IRV).
Especially important is
ISO 8859-1, the "Latin Alphabet No. 1", which has become widely
implemented and may already be seen as the de-facto standard ASCII
replacement.
.P
ISO 8859-1 supports the following languages: Afrikaans, Basque,
Catalan, Danish, Dutch, English, Faeroese, Finnish, French, Galician,
German, Icelandic, Irish, Italian, Norwegian, Portuguese, Scottish,
Spanish, and Swedish.
.P
Note that the ISO 8859-1 characters are also the first 256 characters
of ISO 10646 (Unicode).
.SS ISO 8859 alphabets
The full set of ISO 8859 alphabets includes:
.TS
l l.
ISO 8859-1 West European languages (Latin-1)
ISO 8859-2 Central and East European languages (Latin-2)
ISO 8859-3 Southeast European and miscellaneous languages (Latin-3)
ISO 8859-4 Scandinavian/Baltic languages (Latin-4)
ISO 8859-5 Latin/Cyrillic
ISO 8859-6 Latin/Arabic
ISO 8859-7 Latin/Greek
ISO 8859-8 Latin/Hebrew
ISO 8859-9 Latin-1 modification for Turkish (Latin-5)
ISO 8859-10 Lappish/Nordic/Eskimo languages (Latin-6)
ISO 8859-11 Latin/Thai
ISO 8859-13 Baltic Rim languages (Latin-7)
ISO 8859-14 Celtic (Latin-8)
ISO 8859-15 West European languages (Latin-9)
ISO 8859-16 Romanian (Latin-10)
.TE
.SS ISO 8859-1 characters
The following table displays the characters in ISO 8859-1 (Latin-1),
which are printable and unlisted in the
.BR ascii (7)
manual page.
The fourth column will only show the proper glyphs
in an environment configured for ISO 8859-1.
.TS
l2 l2 l2 c2 lp-1.
Oct Dec Hex Char Description
_
240 160 A0 NO-BREAK SPACE
241 161 A1 INVERTED EXCLAMATION MARK
242 162 A2 CENT SIGN
243 163 A3 POUND SIGN
244 164 A4 CURRENCY SIGN
245 165 A5 YEN SIGN
246 166 A6 BROKEN BAR
247 167 A7 SECTION SIGN
250 168 A8 DIAERESIS
251 169 A9 COPYRIGHT SIGN
252 170 AA FEMININE ORDINAL INDICATOR
253 171 AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
254 172 AC NOT SIGN
255 173 AD SOFT HYPHEN (shown as a hyphen at line breaks) [1]
256 174 AE REGISTERED SIGN
257 175 AF MACRON
260 176 B0 DEGREE SIGN
261 177 B1 PLUS-MINUS SIGN
262 178 B2 SUPERSCRIPT TWO
263 179 B3 SUPERSCRIPT THREE
264 180 B4 ACUTE ACCENT
265 181 B5 MICRO SIGN
266 182 B6 PILCROW SIGN
267 183 B7 MIDDLE DOT
270 184 B8 CEDILLA
271 185 B9 SUPERSCRIPT ONE
272 186 BA MASCULINE ORDINAL INDICATOR
273 187 BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
274 188 BC VULGAR FRACTION ONE QUARTER
275 189 BD VULGAR FRACTION ONE HALF
276 190 BE VULGAR FRACTION THREE QUARTERS
277 191 BF INVERTED QUESTION MARK
300 192 C0 LATIN CAPITAL LETTER A WITH GRAVE
301 193 C1 LATIN CAPITAL LETTER A WITH ACUTE
302 194 C2 LATIN CAPITAL LETTER A WITH CIRCUMFLEX
303 195 C3 LATIN CAPITAL LETTER A WITH TILDE
304 196 C4 LATIN CAPITAL LETTER A WITH DIAERESIS
305 197 C5 LATIN CAPITAL LETTER A WITH RING ABOVE
306 198 C6 LATIN CAPITAL LETTER AE
307 199 C7 LATIN CAPITAL LETTER C WITH CEDILLA
310 200 C8 LATIN CAPITAL LETTER E WITH GRAVE
311 201 C9 LATIN CAPITAL LETTER E WITH ACUTE
312 202 CA LATIN CAPITAL LETTER E WITH CIRCUMFLEX
313 203 CB LATIN CAPITAL LETTER E WITH DIAERESIS
314 204 CC LATIN CAPITAL LETTER I WITH GRAVE
315 205 CD LATIN CAPITAL LETTER I WITH ACUTE
316 206 CE LATIN CAPITAL LETTER I WITH CIRCUMFLEX
317 207 CF LATIN CAPITAL LETTER I WITH DIAERESIS
320 208 D0 LATIN CAPITAL LETTER ETH
321 209 D1 LATIN CAPITAL LETTER N WITH TILDE
322 210 D2 LATIN CAPITAL LETTER O WITH GRAVE
323 211 D3 LATIN CAPITAL LETTER O WITH ACUTE
324 212 D4 LATIN CAPITAL LETTER O WITH CIRCUMFLEX
325 213 D5 LATIN CAPITAL LETTER O WITH TILDE
326 214 D6 LATIN CAPITAL LETTER O WITH DIAERESIS
327 215 D7 MULTIPLICATION SIGN
330 216 D8 LATIN CAPITAL LETTER O WITH STROKE
331 217 D9 LATIN CAPITAL LETTER U WITH GRAVE
332 218 DA LATIN CAPITAL LETTER U WITH ACUTE
333 219 DB LATIN CAPITAL LETTER U WITH CIRCUMFLEX
334 220 DC LATIN CAPITAL LETTER U WITH DIAERESIS
335 221 DD LATIN CAPITAL LETTER Y WITH ACUTE
336 222 DE LATIN CAPITAL LETTER THORN
337 223 DF LATIN SMALL LETTER SHARP S
340 224 E0 LATIN SMALL LETTER A WITH GRAVE
341 225 E1 LATIN SMALL LETTER A WITH ACUTE
342 226 E2 LATIN SMALL LETTER A WITH CIRCUMFLEX
343 227 E3 LATIN SMALL LETTER A WITH TILDE
344 228 E4 LATIN SMALL LETTER A WITH DIAERESIS
345 229 E5 LATIN SMALL LETTER A WITH RING ABOVE
346 230 E6 LATIN SMALL LETTER AE
347 231 E7 LATIN SMALL LETTER C WITH CEDILLA
350 232 E8 LATIN SMALL LETTER E WITH GRAVE
351 233 E9 LATIN SMALL LETTER E WITH ACUTE
352 234 EA LATIN SMALL LETTER E WITH CIRCUMFLEX
353 235 EB LATIN SMALL LETTER E WITH DIAERESIS
354 236 EC LATIN SMALL LETTER I WITH GRAVE
355 237 ED LATIN SMALL LETTER I WITH ACUTE
356 238 EE LATIN SMALL LETTER I WITH CIRCUMFLEX
357 239 EF LATIN SMALL LETTER I WITH DIAERESIS
360 240 F0 LATIN SMALL LETTER ETH
361 241 F1 LATIN SMALL LETTER N WITH TILDE
362 242 F2 LATIN SMALL LETTER O WITH GRAVE
363 243 F3 LATIN SMALL LETTER O WITH ACUTE
364 244 F4 LATIN SMALL LETTER O WITH CIRCUMFLEX
365 245 F5 LATIN SMALL LETTER O WITH TILDE
366 246 F6 LATIN SMALL LETTER O WITH DIAERESIS
367 247 F7 DIVISION SIGN
370 248 F8 LATIN SMALL LETTER O WITH STROKE
371 249 F9 LATIN SMALL LETTER U WITH GRAVE
372 250 FA LATIN SMALL LETTER U WITH ACUTE
373 251 FB LATIN SMALL LETTER U WITH CIRCUMFLEX
374 252 FC LATIN SMALL LETTER U WITH DIAERESIS
375 253 FD LATIN SMALL LETTER Y WITH ACUTE
376 254 FE LATIN SMALL LETTER THORN
377 255 FF LATIN SMALL LETTER Y WITH DIAERESIS
.TE
.IP [1] 4
See
.BR groff_char (7)
(soft hyphen) and the standard ISO 8859-1 ("shy",
paragraph 6.3.3)
or the equivalent version from your national standardization body.
.SH SEE ALSO
.BR ascii (7),
.BR iso_8859-15 (7)
.SH COLOPHON
This page is part of release 3.54 of the Linux
.I man-pages
project.
A description of the project,
and information about reporting bugs,
can be found at
\%http://www.kernel.org/doc/man\-pages/.
================================================
FILE: man/join-tsv.1
================================================
.TH JOIN-TSV 1 "October 21, 2013"
.SH NAME
.PP
join-tsv - perform a relation join on two TSV files
.SH SYNOPSIS
.PP
join-tsv --column=NAME [--null=VALUE|--no-null] [--left|--right|--full]
TSV_FILE1 TSV_FILE2
.SH DESCRIPTION
.PP
Perform a relation join on two TSV files.
The output is written to standard output in TSV format.
.PP
\f[C]join-tsv\f[] assumes that TSV_FILE1 and TSV_FILE2 are in accordance
with the IANA MIME type specification.
.PP
\f[C]join-tsv\f[] is easier to use than \f[C]join\f[] when working with
TSV files because it preserves the headers.
It allows specifying the join column by name.
If the join column names differ, the column name of the left (i.e.
first) file is used in the output.
.PP
\f[C]join-tsv\f[] performs the join by reading the smaller file into
memory.
\f[C]join-tsv\f[] can perform left, right, or full outer joins.
.PP
The default null value is the empty string.
It is not used as a join value.
It can be changed to something else with the \f[C]--null\f[] flag.
The \f[C]--no-null\f[] flag can be used to treat all strings including
the empty string as join values.
.SH OPTIONS
.PP
-C NAME, --column=NAME : the name of the join columns if they are the
same.
If they differ, use the -L and -R flags.
.PP
-L NAME, --left-column=NAME : used to specify the name of the join
column in the left (i.e.
first) TSV file.
.PP
-R NAME, --right-column=NAME : used to specify the name of the join column in the
right (i.e.
second) TSV file.
.PP
-f, --full : Perform a full outer join.
Rows with a null join value in TSV_FILE1 or TSV_FILE2 will be included
in the output.
.PP
-l, --left : Perform a left outer join.
Rows with a null join value in TSV_FILE1 will be included in the output.
.PP
-r, --right : Perform a right outer join.
Rows with a null join value in TSV_FILE2 will be included in the output.
.PP
-n VALUE, --null=VALUE : use VALUE as the null value.
The default null value is the empty string.
.PP
-N, --no-null : no null value.
The empty string can be used as a join value.
.PP
-o, --outer-null : the null value used in outer joins.
.SH SEE ALSO
.PP
\f[C]join\f[] (1)
.PP
http://www.iana.org/assignments/media-types/text/tab-separated-values
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/json-diff.1
================================================
.TH JSON\-DIFF 1 "July 29, 2014"
.SH NAME
.PP
json\-diff \- run diff on two JSON documents
.SH SYNOPSIS
.PP
json\-diff [DIFF_OPTIONS] PATH1 PATH2
.SH DESCRIPTION
.PP
Run \f[C]diff\f[] on two JSON documents.
Each document is normalized using \f[C]python\ \-mjson.tool\f[].
.SH OPTIONS
.PP
Any options are passed to \f[C]diff\f[].
.SH SEE ALSO
.PP
\f[C]diff\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/normalize-utf8.1
================================================
.TH NORMALIZE-UTF8 1 "February 8, 2014"
.SH NAME
.PP
normalize-utf8 - convert UTF-8 encoded files or standard input to a
normalized form
.SH SYNOPSIS
.PP
normalize-utf8 [--nfc|--nfd|--nfkc|--nfkd] [FILE]
.SH DESCRIPTION
.PP
Put UTF-8 encoded Unicode text into a normalized form.
.PP
Unicode contains different character sequences which are rendered the
same way.
An example is LATIN SMALL LETTER C WITH CEDILLA, which can be represented
as a single character, U+00E7, or as LATIN SMALL LETTER C followed by
COMBINING CEDILLA: U+0063 U+0327.
When performing a string comparison, the two sequences should often be
regarded as identical.
If the strings being compared have been put into normal form, then a
simple string comparison can be used.
.PP
The Unicode standard defines four normalization forms.
NFC (Normal Form C), which is the default format used by
\f[C]normalize-utf8\f[], favors single character representations over
multiple character representations containing combining marks.
NFC is also called W3C normalization.
.PP
Conversely, NFD (Normal Form D) favors multiple character
representations consisting of a simple character representation followed
by a combining mark.
Converting a string to NFD is faster because the algorithm for
converting a string to NFC starts by converting it to NFD.
.PP
NFKC and NFKD conflate compatibility composites.
These are sequences which are visually distinct but semantically the
same.
Examples are the ff and ffi ligatures.
.SH OPTIONS
.PP
--nfc : write input to standard out in Normal Form C
.PP
--nfd : write input to standard out in Normal Form D
.PP
--nfkc : write input to standard out in Normal Form KC
.PP
--nfkd : write input to standard out in Normal Form KD
.SH SEE ALSO
.PP
\f[C]utf8-viewer\f[] (1)
.PP
http://unicode.org/reports/tr15/
.PP
http://www.unicode.org/reports/tr36/
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/postgres-to-csv.1
================================================
.TH POSTGRES-TO-CSV 1 "March 21, 2015"
.SH NAME
.PP
postgres-to-csv - export a PostgreSQL table to a CSV file
.SH SYNOPSIS
.PP
postgres-to-csv -t TABLE [-d DB] [-h HOST] [-p PORT] [-U USER] [-w|-W]
.SH DESCRIPTION
.PP
Write a PostgreSQL table to standard out in CSV format.
.PP
All options other than \f[C]-t\f[] are passed to \f[C]psql\f[].
.PP
The \f[C]PGPASSWORD\f[] environment variable can be used to pass the
PostgreSQL password to \f[C]psql\f[].
.SH OPTIONS
.PP
-d DB : name of the PostgreSQL database
.PP
-h HOST : PostgreSQL server host
.PP
-p PORT : the port number.
.PP
-t TABLE : the name of the PostgreSQL table to export
.PP
-U USER : the PostgreSQL user to connect as
.PP
-w : do not prompt for PostgreSQL password
.PP
-W : prompt for PostgreSQL password
.SH SEE ALSO
.PP
\f[C]csv-to-postgres\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/reservoir-sample.1
================================================
.TH RESERVOIR-SAMPLE 1 "October 13, 2013"
.SH NAME
.PP
reservoir-sample - sample lines from file or standard input
.SH SYNOPSIS
.PP
reservoir-sample [-r SEED|--random-seed=SEED] (-s NUM|--size=NUM) [FILE]
.SH DESCRIPTION
.PP
Select NUM lines randomly from FILE or standard input.
Each line is equally likely to be chosen.
.PP
The script uses reservoir sampling.
It is more efficient than randomly shuffling the file with
\f[C]sort\ -R\f[] and then taking the first N lines with \f[C]head\f[].
.PP
To select a sample size which is proportional to the size of the input,
use \f[C]awk\f[]:
.IP
.nf
\f[C]
awk\ \[aq]rand()\ <\ 0.1\[aq]
\f[]
.fi
.SH OPTIONS
.PP
-r SEED, --random-seed=SEED : a seed value to be passed to the random
number generator.
.PP
-s NUM, --size=NUM : the size of the sample to select
.SH SEE ALSO
.PP
\f[C]sort\f[] (1), \f[C]awk\f[] (1), \f[C]shuf\f[] (1)
.PP
https://en.wikipedia.org/wiki/Reservoir_sampling
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/set-diff.1
================================================
.TH SET\-DIFF 1 "May 6, 2013"
.SH NAME
.PP
set\-diff \- find lines in first file which are not in the second
.SH SYNOPSIS
.PP
set\-diff FILE1 FILE2
.SH DESCRIPTION
.PP
List the lines which are in the first file and not in the second.
.PP
The lines are output in a sorted order and not necessarily the order of
the first file.
.PP
If the files are already sorted, it is faster to use
\f[C]comm\ \-23\f[].
.PP
\f[C]comm\ \-23\f[] gives erroneous results with no warning if the input
files are not sorted.
.SH OPTIONS
.PP
None
.SH SEE ALSO
.PP
\f[C]comm\f[] (1)
.PP
\f[C]set\-intersect\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/set-intersect.1
================================================
.TH SET\-INTERSECT 1 "May 6, 2013"
.SH NAME
.PP
set\-intersect \- find lines common to two files
.SH SYNOPSIS
.PP
set\-intersect FILE1 FILE2
.SH DESCRIPTION
.PP
List the lines which are in both the first file and the second file.
.PP
If the files are already sorted, it is faster to use
\f[C]comm\ \-12\f[].
.PP
\f[C]comm\ \-12\f[] gives erroneous results with no warning if the input
files are not sorted.
.SH OPTIONS
.PP
None
.SH SEE ALSO
.PP
\f[C]comm\f[] (1)
.PP
\f[C]set\-diff\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/tab-to-csv.1
================================================
.TH TAB-TO-CSV 1 "February 16, 2013"
.SH NAME
.PP
tab-to-csv - convert tab delimited to CSV
.SH SYNOPSIS
.PP
tab-to-csv OPTIONS [TSV_FILE]
.SH DESCRIPTION
.PP
Read a tab delimited file from the file specified on the command line or
from standard input and write the corresponding CSV file to standard
output.
.PP
In the tab delimited format fields are delimited by tabs and records are
terminated by an end-of-line marker.
.SH OPTIONS
.PP
-u, --unescape : Interpret the following backslash sequences when
encountered in the data: \\n, \\r, \\t, \\\\.
.SH SEE ALSO
.PP
\f[C]csv-to-tab\f[] (1)
.PP
http://www.ietf.org/rfc/rfc4180.txt
.PP
http://www.iana.org/assignments/media-types/text/tab-separated-values
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/tokenize.1
================================================
.TH TOKENIZE 1 "February 15, 2015"
.SH NAME
.PP
tokenize - extract words from English language text
.SH SYNOPSIS
.PP
tokenize [-n]
.SH DESCRIPTION
.PP
Extract words from English language text.
Words consist of adjacent letters, numbers, and these punctuation
characters:
.IP
.nf
\f[C]
\[aq]*+-/=\\^_`|~
\f[]
.fi
.PP
Control characters and these punctuation characters delimit words and
are removed:
.IP
.nf
\f[C]
!#$%&(),:;<>?\@[]{}
\f[]
.fi
.PP
Space characters also delimit words.
The words are written out separated by spaces unless the \f[C]-n\f[]
flag is used, in which case they are separated by newlines.
.PP
Non-ASCII characters delimit words and are removed.
It might be desirable to replace accented Latin characters with the
unaccented versions.
This command can be used:
.IP
.nf
\f[C]
$\ iconv\ -f\ utf-8\ -t\ ascii//TRANSLIT
\f[]
.fi
.SH OPTIONS
.PP
-n : write the words out one per line.
.SH SEE ALSO
.PP
\f[C]iconv\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/trim-tsv.1
================================================
.TH TRIM\-TSV 1 "September 25, 2013"
.SH NAME
.PP
trim\-tsv \- trim whitespace from fields in a tab delimited file
.SH SYNOPSIS
.PP
trim\-tsv [TSV_FILE]
.SH DESCRIPTION
.PP
Trim whitespace from fields in a tab delimited file.
If no path is specified on the command line, the tool reads from
standard input.
.SH OPTIONS
.PP
none
.SH SEE ALSO
.PP
\f[C]tawk\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/tsv-header.1
================================================
.TH TSV\-HEADER 1 "March 6, 2015"
.SH NAME
.PP
tsv\-header \- number the columns in a TSV header
.SH SYNOPSIS
.PP
tsv\-header [TSV_FILE]
.SH DESCRIPTION
.PP
Display the columns of a TSV file header, one per line, with their
ordinal positions.
.PP
This is useful for mapping \f[C]awk\f[] script variables, e.g.
\f[C]$1\f[], \f[C]$2\f[], ..., to column names.
.SH OPTIONS
.PP
none
.SH SEE ALSO
.PP
\f[C]awk\f[] (1), \f[C]tawk\f[] (1), \f[C]check\-tsv\f[] (1)
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/tsv-to-json.1
================================================
.TH TSV-TO-JSON 1 "June 4, 2013"
.SH NAME
.PP
tsv-to-json - convert TSV to JSON
.SH SYNOPSIS
.PP
tsv-to-json OPTIONS [TSV_FILE]
.SH DESCRIPTION
.PP
Read a TSV file from the file specified on the command line or from
standard input and write the corresponding JSON to standard output.
.PP
Each row of the JSON output contains a serialized JSON object.
The values of the object come from the corresponding row of the TSV
file, and the header is used for the keys.
.SH OPTIONS
.PP
None
.SH SEE ALSO
.PP
\f[C]csv-to-json\f[] (1), \f[C]json-ruby\f[] (1)
.PP
http://www.iana.org/assignments/media-types/text/tab-separated-values
.PP
http://json.org
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/utf8-category.1
================================================
.TH UTF8-CATEGORY 1 "February 14, 2015"
.SH NAME
.PP
utf8-category - tally UTF-8 encoded characters by general category
.SH SYNOPSIS
.PP
utf8-category [-l|--long-names] [-c|--count-ascii|-s|--skip-ascii]
.SH DESCRIPTION
.PP
Tally the UTF-8 encoded characters in the standard input stream by
general category.
.IP
.nf
\f[C]
\ Abbr\ \ Long\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Description
\ ---\ \ \ ----\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ -----------
\ Lu\ \ \ \ Uppercase_Letter\ \ \ \ \ \ \ \ an\ uppercase\ letter
\ Ll\ \ \ \ Lowercase_Letter\ \ \ \ \ \ \ \ a\ lowercase\ letter
\ Lt\ \ \ \ Titlecase_Letter\ \ \ \ \ \ \ \ a\ digraphic\ character,\ with\ first\ part\ uppercase
\ LC\ \ \ \ Cased_Letter\ \ \ \ \ \ \ \ \ \ \ \ Lu\ |\ Ll\ |\ Lt
\ Lm\ \ \ \ Modifier_Letter\ \ \ \ \ \ \ \ \ a\ modifier\ letter
\ Lo\ \ \ \ Other_Letter\ \ \ \ \ \ \ \ \ \ \ \ other\ letters,\ including\ syllables\ and\ ideographs
\ L\ \ \ \ \ Letter\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Lu\ |\ Ll\ |\ Lt\ |\ Lm\ |\ Lo
\ Mn\ \ \ \ Nonspacing_Mark\ \ \ \ \ \ \ \ \ a\ nonspacing\ combining\ mark\ (zero\ advance\ width)
\ Mc\ \ \ \ Spacing_Mark\ \ \ \ \ \ \ \ \ \ \ \ a\ spacing\ combining\ mark\ (positive\ advance\ width)
\ Me\ \ \ \ Enclosing_Mark\ \ \ \ \ \ \ \ \ \ an\ enclosing\ combining\ mark
\ M\ \ \ \ \ Mark\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Mn\ |\ Mc\ |\ Me
\ Nd\ \ \ \ Decimal_Number\ \ \ \ \ \ \ \ \ \ a\ decimal\ digit
\ Nl\ \ \ \ Letter_Number\ \ \ \ \ \ \ \ \ \ \ a\ letterlike\ numeric\ character
\ No\ \ \ \ Other_Number\ \ \ \ \ \ \ \ \ \ \ \ a\ numeric\ character\ of\ other\ type
\ N\ \ \ \ \ Number\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Nd\ |\ Nl\ |\ No
\ Pc\ \ \ \ Connector_Punctuation\ \ \ a\ connecting\ punctuation\ mark,\ like\ a\ tie
\ Pd\ \ \ \ Dash_Punctuation\ \ \ \ \ \ \ \ a\ dash\ or\ hyphen\ punctuation\ mark
\ Ps\ \ \ \ Open_Punctuation\ \ \ \ \ \ \ \ an\ opening\ punctuation\ mark\ (of\ a\ pair)
\ Pe\ \ \ \ Close_Punctuation\ \ \ \ \ \ \ a\ closing\ punctuation\ mark\ (of\ a\ pair)
\ Pi\ \ \ \ Initial_Punctuation\ \ \ \ \ an\ initial\ quotation\ mark
\ Pf\ \ \ \ Final_Punctuation\ \ \ \ \ \ \ a\ final\ quotation\ mark
\ Po\ \ \ \ Other_Punctuation\ \ \ \ \ \ \ a\ punctuation\ mark\ of\ other\ type
\ P\ \ \ \ \ Punctuation\ \ \ \ \ \ \ \ \ \ \ \ \ Pc\ |\ Pd\ |\ Ps\ |\ Pe\ |\ Pi\ |\ Pf\ |\ Po
\ Sm\ \ \ \ Math_Symbol\ \ \ \ \ \ \ \ \ \ \ \ \ a\ symbol\ of\ mathematical\ use
\ Sc\ \ \ \ Currency_Symbol\ \ \ \ \ \ \ \ \ a\ currency\ sign
\ Sk\ \ \ \ Modifier_Symbol\ \ \ \ \ \ \ \ \ a\ non-letterlike\ modifier\ symbol
\ So\ \ \ \ Other_Symbol\ \ \ \ \ \ \ \ \ \ \ \ a\ symbol\ of\ other\ type
\ S\ \ \ \ \ Symbol\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Sm\ |\ Sc\ |\ Sk\ |\ So
\ Zs\ \ \ \ Space_Separator\ \ \ \ \ \ \ \ \ a\ space\ character\ (of\ various\ non-zero\ widths)
\ Zl\ \ \ \ Line_Separator\ \ \ \ \ \ \ \ \ \ U+2028\ LINE\ SEPARATOR\ only
\ Zp\ \ \ \ Paragraph_Separator\ \ \ \ \ U+2029\ PARAGRAPH\ SEPARATOR\ only
\ Z\ \ \ \ \ Separator\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Zs\ |\ Zl\ |\ Zp
\ Cc\ \ \ \ Control\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ a\ C0\ or\ C1\ control\ code
\ Cf\ \ \ \ Format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ a\ format\ control\ character
\ Cs\ \ \ \ Surrogate\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ a\ surrogate\ code\ point
\ Co\ \ \ \ Private_Use\ \ \ \ \ \ \ \ \ \ \ \ \ a\ private-use\ character
\ Cn\ \ \ \ Unassigned\ \ \ \ \ \ \ \ \ \ \ \ \ \ a\ reserved\ unassigned\ code\ point\ or\ a\ noncharacter
\ C\ \ \ \ \ Other\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Cc\ |\ Cf\ |\ Cs\ |\ Co\ |\ Cn
\f[]
.fi
.SH OPTIONS
.PP
-c, --count-ascii : treat ASCII characters as a separate general
category called "ASCII".
.PP
-l, --long-names : use long names for the general categories instead of
the two character abbreviations.
.PP
-s, --skip-ascii : skip ASCII characters.
Only characters with Unicode point U+0080 and higher are counted.
.SH SEE ALSO
.PP
http://unicode.org/reports/tr44/#General_Category_Values
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/utf8-script.1
================================================
.TH UTF8-SCRIPT 1 "February 14, 2015"
.SH NAME
.PP
utf8-script - tally UTF-8 encoded characters by Unicode script
.SH SYNOPSIS
.PP
utf8-script [-c|--count-ascii|-s|--skip-ascii]
.SH DESCRIPTION
.PP
Tally the UTF-8 encoded characters in the standard input stream by Unicode script.
.SH OPTIONS
.PP
-c, --count-ascii : treat ASCII characters as a separate script called
"ASCII".
.PP
-s, --skip-ascii : skip ASCII characters.
Only characters with Unicode point U+0080 and higher are counted.
.SH SEE ALSO
.PP
http://unicode.org/Public/UNIDATA/Scripts.txt
.SH AUTHORS
Clark Grubb.
================================================
FILE: man/xlsx-to-csv.1
================================================
.TH XLSX-TO-CSV 1 "May 4, 2013"
.SH NAME
.PP
xlsx-to-csv - convert .xlsx to .csv
.SH SYNOPSIS
.IP
.nf
\f[C]
xlsx-to-csv\ XLSX\\_FILE\ OUTPUT_DIR
xlsx-to-csv\ --sheet=SHEET\ XLSX\\_FILE\ [OUTPUT\\_FILE]
xlsx-to-csv\ --list\ XLSX\\_FILE
\f[]
.fi
.SH DESCRIPTION
.PP
Read a .xlsx file and create a .csv file in OUTPUT_DIR for each
worksheet.
.PP
OUTPUT_DIR must not already exist.
.PP
Output is UTF-8 encoded.
.PP
\&.xlsx files are the format used by Excel since 2007.
The .xlsx file format is defined by ECMA-376.
An .xlsx file is a ZIP archive of a directory containing XML documents.
The \f[C]unzip\ -l\f[] command can be used to list the contents of a ZIP
archive and hence an .xlsx file.
.PP
\f[C]xlsx-to-csv\f[] also works on .xls files which were used by Excel
before 2007.
.PP
The tool can easily take a minute or more to process a large (~100MB)
workbook.
Unfortunately, it takes about this long just to list the sheet names
with the \f[C]--list\f[] flag.
Hence it is more efficient to extract all of the sheets from a large
workbook even if only one of the sheets is needed.
.SH OPTIONS
.PP
--list : list the sheets in XLSX_FILE
.PP
--sheet=SHEET : only convert SHEET to a .csv file.
.PP
--date-format=STRFTIME_FMT : a \f[C]strftime\f[] style format to be used
for Excel dates.
The default is the ISO 8601 format: \[aq]%Y-%m-%dT%H:%M:%S\[aq].
.SH SEE ALSO
.PP
\f[C]csv-to-tab\f[] (1), \f[C]strftime\f[] (3)
.PP
http://www.ecma-international.org/publications/standards/Ecma-376.htm
.SH AUTHORS
Clark Grubb.
================================================
FILE: requirements.txt
================================================
beautifulsoup4==4.13.4
html5lib==1.1
lxml==6.0.0
openpyxl==2.4.5
pep8==1.7.0
pylint==1.6.5
PyYAML==6.0.2
setuptools==80.9.0
xlrd==1.0.0
================================================
FILE: setup.py
================================================
#!/usr/bin/env python3
import sys
from setuptools import setup
SHELL_TOOLS = [
    'data_tools/check-tsv',
    'data_tools/csv-to-postgres',
    'data_tools/header-sort',
    'data_tools/json-diff',
    'data_tools/postgres-to-csv',
    'data_tools/set-intersect',
    'data_tools/tokenize',
    'data_tools/tsv-header'
]
C_TOOLS = [
    'src/csv-to-tab/csv-to-tab',
    'src/json-pluck/json-pluck',
    'src/tab-to-csv/tab-to-csv',
    'src/utf8-script/utf8-category',
    'src/utf8-script/utf8-script'
]
def build():
    pass
scripts = []
if sys.platform != 'win32':
    build()
    scripts = SHELL_TOOLS
setup(
    name='data-tools',
    version='0.1.0',
    description='File format conversion tools',
    url='https://github.com/clarkgrubb/data-tools',
    author='Clark Grubb',
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.5'
    ],
    keywords='file format conversion data tools json csv tsv',
    packages=[
        'data_tools'
    ],
    scripts=scripts,
    entry_points={
        'console_scripts': [
            'csv-to-json = data_tools.csv_to_json:main',
            'convert-date = data_tools.convert_date:main',
            'counting-sort = data_tools.counting_sort:main',
            'csv-to-xlsx = data_tools.csv_to_xlsx:main',
            'date-fill = data_tools.date_fill:main',
            'date-seq = data_tools.date_seq:main',
            'highlight = data_tools.highlight:main',
            'html-table-to-csv = data_tools.html_table_to_csv:main',
            'join-tsv = data_tools.join_tsv:main',
            'normalize-utf8 = data_tools.normalize_utf8:main',
            'reservoir-sample = data_tools.reservoir_sample:main',
            'trim-tsv = data_tools.trim_tsv:main',
            'tsv-to-json = data_tools.tsv_to_json:main',
            'xlsx-to-csv = data_tools.xlsx_to_csv:main',
            'yaml-to-json = data_tools.yaml_to_json:main'
        ]
    },
    install_requires=[
        'beautifulsoup4>=4.5.3',
        'html5lib>=0.999999999',
        'openpyxl>=2.4.5',
        'PyYAML>=3.12',
        'xlrd==1.0.0',
    ],
    python_requires='>=3.4'
)
================================================
FILE: src/csv-to-tab/Makefile
================================================
MAKEFLAGS += --warn-undefined-variables
SHELL := bash
.SHELLFLAGS := -eu -o pipefail -c
.DEFAULT_GOAL := all
.DELETE_ON_ERROR:
.SUFFIXES:
csv-to-tab: csv_to_tab.c
	gcc -O2 -o $@ $<
.PHONY: all
all: csv-to-tab
output:
	mkdir $@
output/%.tab: test/input/%.csv csv-to-tab | output
	./csv-to-tab < $< > $@
	diff test/expected.output/$*.tab $@
tests := one two three four
test_files := $(patsubst %,output/%.tab,$(tests))
.PHONY: test.default
test.default: csv-to-tab
	./csv-to-tab -x < test/input/backslash.csv > output/backslash.default.tab
	diff test/expected.output/backslash.default.tab output/backslash.default.tab
.PHONY: test.escape
test.escape: csv-to-tab
	./csv-to-tab -e < test/input/tab.csv > output/tab.escape.tab
	diff test/expected.output/tab.escape.tab output/tab.escape.tab
	./csv-to-tab -e < test/input/cr.csv > output/cr.escape.tab
	diff test/expected.output/cr.escape.tab output/cr.escape.tab
	./csv-to-tab -e < test/input/newline.csv > output/newline.escape.tab
	diff test/expected.output/newline.escape.tab output/newline.escape.tab
	./csv-to-tab -e < test/input/backslash.csv > output/backslash.escape.tab
	diff test/expected.output/backslash.escape.tab output/backslash.escape.tab
.PHONY: test.replace
test.replace: csv-to-tab
	./csv-to-tab -r < test/input/tab.csv > output/tab.replace.tab
	diff test/expected.output/tab.replace.tab output/tab.replace.tab
	./csv-to-tab -r < test/input/cr.csv > output/cr.replace.tab
	diff test/expected.output/cr.replace.tab output/cr.replace.tab
	./csv-to-tab -r < test/input/newline.csv > output/newline.replace.tab
	diff test/expected.output/newline.replace.tab output/newline.replace.tab
	./csv-to-tab -r < test/input/backslash.csv > output/backslash.replace.tab
	diff test/expected.output/backslash.replace.tab output/backslash.replace.tab
.PHONY: test.strip
test.strip: csv-to-tab
	./csv-to-tab -x < test/input/tab.csv > output/tab.strip.tab
	diff test/expected.output/tab.strip.tab output/tab.strip.tab
	./csv-to-tab -x < test/input/cr.csv > output/cr.strip.tab
	diff test/expected.output/cr.strip.tab output/cr.strip.tab
	./csv-to-tab -x < test/input/newline.csv > output/newline.strip.tab
	diff test/expected.output/newline.strip.tab output/newline.strip.tab
	./csv-to-tab -x < test/input/backslash.csv > output/backslash.strip.tab
	diff test/expected.output/backslash.strip.tab output/backslash.strip.tab
.PHONY: test
test: $(test_files) test.default test.escape test.replace test.strip
.PHONY: cppcheck
cppcheck:
	cppcheck --enable=all csv_to_tab.c
.PHONY: check
check: cppcheck test
state.png: state.dot
	dot -Tpng < $< > $@
.PHONY: clean.build
clean.build:
	rm csv-to-tab
.PHONY: clean.test
clean.test:
	rm -rf output
.PHONY: clean
clean: clean.test
================================================
FILE: src/csv-to-tab/README.md
================================================
# OVERVIEW
Convert a UTF-8 encoded CSV file to a UTF-8 encoded tab delimited file.
# DATA DEFINITIONS
CSV format: [RFC 4180](https://tools.ietf.org/html/rfc4180).
TSV format: [IANA](https://www.iana.org/assignments/media-types/text/tab-separated-values).
TAB format: same as the TSV format, except that the header is optional.
The TSV format requires a header, but the CSV format does not. Since it is difficult for a program to verify that a header is present, the tool is named `csv-to-tab` instead of `csv-to-tsv`.
The TSV format also requires every row to have the same number of fields, as does the CSV format. The command does not check this.
CSV files are supposed to use CRLF to terminate records. As a convenience, the code also accepts a document which uses LF to terminate records, or even a mix of CRLF and LF. The code does not accept a bare CR as a record terminator, even though some CSV producers (e.g. Excel on Mac) emit this format; accepting a mix of CRLF, LF, and CR would make the number of line endings ambiguous. Use `sed` to convert such a file.
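As a minimal sketch (not part of this tool), the same conversion in Python, assuming the file fits in memory; a CR not followed by LF is rewritten as LF:

```python
import re
import sys

# Read the whole document and rewrite bare CR line endings as LF,
# leaving CRLF pairs untouched.
data = sys.stdin.buffer.read()
sys.stdout.buffer.write(re.sub(b"\r(?!\n)", b"\n", data))
```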
The TSV format does not specify the exact format of the EOL. We use LF.
Fields in a TSV file cannot contain tabs or end-of-line characters. To be safe, we prohibit any character that the Unicode consortium says might be interpreted as an EOL. The default behavior is to fail on such a character; flags are provided to strip it, backslash escape it, or replace it with a space, as illustrated below.
# CHARACTER ENCODING
The data standards we follow don't specify the character encoding. We use UTF-8. Use `iconv` to convert from another encoding.
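For example, `iconv -f ISO-8859-1 -t UTF-8 input.csv > output.csv` converts a Latin-1 file (the source encoding here is only an illustration).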
# FLAGS
-x, --strip : remove the offending character
-r, --replace : replace the offending character with a space
-e, --escape : backslash escape the offending character
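For example, the effect of each mode on a field containing a literal tab, sketched in Python; the expected strings match the `tab.*.tab` fixtures in `test/expected.output/`:

```python
# Effect of each invalid-character treatment on the field "one\ttwo";
# the expected values match tab.strip.tab, tab.replace.tab, and
# tab.escape.tab in test/expected.output/.
field = "one\ttwo"
assert field.replace("\t", "") == "onetwo"        # -x, --strip
assert field.replace("\t", " ") == "one two"      # -r, --replace
assert field.replace("\t", "\\t") == "one\\ttwo"  # -e, --escape
```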
# STATE MACHINE
This describes the state transitions of the C code in the default mode; i.e. when no flags are set.
<img src="state.png">
# TODO
* Enforce that each row has same number of fields. (optional flag?)
* Restore -p N/--pad=N parameter.
================================================
FILE: src/csv-to-tab/csv_to_tab.c
================================================
#include <errno.h>
#include <getopt.h>
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
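/*
 * Parser states.  outside_field: at the start of a field (or of the
 * input); quoted_field: inside a double-quoted field;
 * quoted_field_after_dquote: a '"' was seen inside a quoted field and
 * the next character decides whether it escaped another '"' or closed
 * the field; unquoted_field: inside an unquoted field;
 * unquoted_field_after_cr: a CR was seen in an unquoted field and the
 * next character decides whether it was data or a record terminator;
 * before_newline: a CR that must be followed by LF was seen.
 */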
enum parse_state {
outside_field,
quoted_field,
quoted_field_after_dquote,
unquoted_field,
unquoted_field_after_cr,
before_newline
};
enum invalid_char {
invalid_char_fail,
invalid_char_escape,
invalid_char_replace,
invalid_char_strip
};
void
fatal(char *msg, size_t lineno, size_t offsetno, size_t src_lineno) {
fprintf(stderr, "ERROR: line: %zu: offset: %zu: source: %zu: %s\n",
lineno, offsetno, src_lineno, msg);
exit(1);
}
void
handle_invalid_char(wint_t replacement,
char *msg,
enum invalid_char invalid_char_treatment,
size_t lineno,
size_t offsetno,
size_t src_lineno) {
if (invalid_char_treatment == invalid_char_fail)
fatal(msg, lineno, offsetno, src_lineno);
else if (invalid_char_treatment == invalid_char_escape) {
putwchar(L'\\');
putwchar(replacement);
}
else if (invalid_char_treatment == invalid_char_replace)
putwchar(L' ');
else if (invalid_char_treatment == invalid_char_strip)
return;
else
fatal("unexpected invalid character treatment", lineno, offsetno, src_lineno);
}
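/*
 * Convert CSV on input_stream to tab-delimited output on stdout.
 * A CR is never emitted when it is read; the parser waits for the
 * next character to decide whether the CR terminated a record (CRLF)
 * or was a bare carriage return embedded in the data, which is
 * treated as an invalid character.
 */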
int
csv_to_tab(FILE *input_stream, enum invalid_char invalid_char_treatment) {
wint_t ch;
enum parse_state state = outside_field;
size_t lineno = 1, offsetno = 0;
while ((ch = fgetwc(input_stream)) != WEOF) {
offsetno += 1;
switch (ch) {
case L'\t':
switch (state) {
case outside_field:
state = unquoted_field;
handle_invalid_char(L't', "tab in data", invalid_char_treatment, lineno, offsetno,
__LINE__);
break;
case quoted_field:
case unquoted_field:
handle_invalid_char(L't', "tab in data", invalid_char_treatment, lineno, offsetno,
__LINE__);
break;
case unquoted_field_after_cr:
state = unquoted_field;
handle_invalid_char(L'r', "carriage return in data", invalid_char_treatment, lineno,
offsetno, __LINE__);
handle_invalid_char(L't', "tab in data", invalid_char_treatment, lineno, offsetno,
__LINE__);
break;
case quoted_field_after_dquote:
case before_newline:
fatal("unexpected tab", lineno, offsetno, __LINE__);
break;
default:
fatal("unexpected state", lineno, offsetno, __LINE__);
}
break;
case L'\\':
switch (state) {
case unquoted_field_after_cr:
state = unquoted_field;
handle_invalid_char(L'r', "carriage return in data", invalid_char_treatment, lineno,
offsetno, __LINE__);
if (invalid_char_treatment == invalid_char_escape)
putwchar(L'\\');
putwchar(L'\\');
break;
case outside_field:
state = unquoted_field;
if (invalid_char_treatment == invalid_char_escape)
putwchar(L'\\');
putwchar(L'\\');
break;
case quoted_field:
case unquoted_field:
if (invalid_char_treatment == invalid_char_escape)
putwchar(L'\\');
putwchar(L'\\');
break;
case quoted_field_after_dquote:
case before_newline:
fatal("unexpected backslash", lineno, offsetno, __LINE__);
break;
default:
fatal("unexpected state", lineno, offsetno, __LINE__);
}
break;
case L'"':
switch (state) {
case outside_field:
state = quoted_field;
break;
case quoted_field:
state = quoted_field_after_dquote;
break;
case quoted_field_after_dquote:
putwchar(L'"');
state = quoted_field;
break;
case unquoted_field_after_cr:
case unquoted_field:
case before_newline:
fatal("unexpected double quote", lineno, offsetno, __LINE__);
break;
default:
fatal("unexpected state", lineno, offsetno, __LINE__);
}
break;
case L',':
switch (state) {
case outside_field:
putwchar(L'\t');
break;
case quoted_field:
putwchar(ch);
break;
case quoted_field_after_dquote:
putwchar(L'\t');
state = outside_field;
break;
case unquoted_field_after_cr:
handle_invalid_char(L'r', "carriage return in data", invalid_char_treatment, lineno,
offsetno, __LINE__);
putwchar(L'\t');
state = outside_field;
break;
case unquoted_field:
putwchar(L'\t');
state = outside_field;
break;
case before_newline:
fatal("unexpected comma", lineno, offsetno, __LINE__);
default:
fatal("unexpected state", lineno, offsetno, __LINE__);
}
break;
case L'\n':
lineno += 1;
offsetno = 0;
switch (state) {
case quoted_field:
handle_invalid_char(L'n', "newline in data", invalid_char_treatment, lineno, offsetno,
__LINE__);
break;
case outside_field:
case quoted_field_after_dquote:
case unquoted_field:
case before_newline:
case unquoted_field_after_cr:
putwchar(L'\n');
state = outside_field;
break;
default:
fatal("unexpected state", lineno, offsetno, __LINE__);
}
break;
case L'\r':
switch (state) {
case quoted_field:
/* TODO: flag for escaping or replacing */
break;
case quoted_field_after_dquote:
case outside_field:
state = before_newline;
break;
case unquoted_field_after_cr:
handle_invalid_char(L'r', "carriage return in data", invalid_char_treatment, lineno,
offsetno, __LINE__);
state = unquoted_field_after_cr;
break;
case unquoted_field:
state = unquoted_field_after_cr;
break;
default:
fatal("unexpected carriage return", lineno, offsetno, __LINE__);
}
break;
default:
switch (state) {
case outside_field:
putwchar(ch);
state = unquoted_field;
break;
case quoted_field:
putwchar(ch);
break;
case quoted_field_after_dquote:
fatal("unescaped double quote", lineno, offsetno, __LINE__);
break;
case unquoted_field_after_cr:
state = unquoted_field;
handle_invalid_char(L'r', "carriage return in data", invalid_char_treatment, lineno,
offsetno, __LINE__);
putwchar(ch);
break;
case unquoted_field:
putwchar(ch);
break;
case before_newline:
default:
fatal("unexpected state", lineno, offsetno, __LINE__);
}
}
}
if (state == quoted_field)
fatal("unterminated double quote", lineno, offsetno, __LINE__);
if (ferror(input_stream)) {
perror("error reading input stream");
exit(1);
}
return 0;
}
int
main(int argc, char **argv) {
static struct option long_opts[] = {
{"escape", no_argument, NULL, 'e'},
{"replace", no_argument, NULL, 'r'},
{"strip", no_argument, NULL, 'x'},
{0, 0, 0, 0}
};
int opti;
enum invalid_char invalid_char_treatment = invalid_char_fail;
setlocale(LC_ALL, "");
while (1) {
int ch = getopt_long(argc, argv, "erx", long_opts, &opti);
if (-1 == ch) {
break;
}
switch (ch) {
case 'e':
invalid_char_treatment = invalid_char_escape;
break;
case 'x':
invalid_char_treatment = invalid_char_strip;
break;
case 'r':
invalid_char_treatment = invalid_char_replace;
break;
default:
fprintf(stderr, "unexpected arg: %d\n", ch);
exit(1);
}
}
FILE *f;
if (optind == argc)
f = stdin;
else if (optind == argc - 1) {
f = fopen(argv[optind], "r");
if (!f) {
fprintf(stderr, "error opening %s: %s\n", argv[optind], strerror(errno));
exit(1);
}
}
else {
fprintf(stderr, "USAGE: csv-to-tab [--escape|--strip|--replace] [PATH]\n");
exit(1);
}
return csv_to_tab(f, invalid_char_treatment);
}
================================================
FILE: src/csv-to-tab/state.dot
================================================
digraph {
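// Default-mode transitions of csv_to_tab.c; "<other>" stands for any
// character that has no edge of its own, "<eof>" for end of input.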
start -> outside_field;
outside_field -> outside_field [label="\\n"];
outside_field -> before_newline [label="\\r"];
outside_field -> outside_field [label=","];
outside_field -> finish [label="<eof>"];
outside_field -> unquoted_field [label="\\"];
outside_field -> unquoted_field [label="<other>"];
outside_field -> fatal [label="\\t"];
outside_field -> quoted_field [label="\""];
unquoted_field -> outside_field [label=","];
unquoted_field -> outside_field [label="\\n"];
unquoted_field -> unquoted_field [label="<other>"];
unquoted_field -> finish [label="<eof>"];
unquoted_field -> fatal [label="\""];
unquoted_field -> unquoted_field [label="\\"];
unquoted_field -> fatal [label="\\t"];
unquoted_field -> unquoted_field_after_cr [label="\\r"];
unquoted_field_after_cr -> outside_field [label=","];
unquoted_field_after_cr -> outside_field [label="\\n"];
unquoted_field_after_cr -> unquoted_field [label="<other>"];
unquoted_field_after_cr -> finish [label="<eof>"];
unquoted_field_after_cr -> fatal [label="\""];
unquoted_field_after_cr -> unquoted_field [label="\\"];
unquoted_field_after_cr -> fatal [label="\\t"];
unquoted_field_after_cr -> unquoted_field_after_cr [label="\\r"];
quoted_field -> quoted_field [label="<other>"];
quoted_field -> quoted_field_after_dquote [label="\""];
quoted_field -> fatal [label="<eof>"];
quoted_field -> quoted_field [label="\\"];
quoted_field -> fatal [label="\\t"];
quoted_field -> fatal [label="\\n"];
quoted_field -> fatal [label="\\r"];
quoted_field_after_dquote -> quoted_field [label="\""];
quoted_field_after_dquote -> outside_field [label=","];
quoted_field_after_dquote -> outside_field [label="\\n"];
quoted_field_after_dquote -> before_newline [label="\\r"];
quoted_field_after_dquote -> fatal [label="<other>"];
before_newline -> outside_field [label="\\n"];
before_newline -> fatal [label="<other>"];
}
================================================
FILE: src/csv-to-tab/test/expected.output/backslash.default.tab
================================================
one\two three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/backslash.escape.tab
================================================
one\\two three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/backslash.replace.tab
================================================
one\two three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/backslash.strip.tab
================================================
one\two three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/cr.escape.tab
================================================
one\rtwo three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/cr.replace.tab
================================================
one two three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/cr.strip.tab
================================================
onetwo three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/four.tab
================================================
hello "bob" one two
three four five
================================================
FILE: src/csv-to-tab/test/expected.output/newline.escape.tab
================================================
one\ntwo three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/newline.replace.tab
================================================
one two three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/newline.strip.tab
================================================
onetwo three
four five
================================================
FILE: src/csv-to-tab/test/expected.output/one.tab
================================================
one two
three four
================================================
FILE: src/csv-to-tab/test/expected.output/tab.escape.tab
================================================
one\ttwo three
four five
================================================
DIRECTORY STRUCTURE
================================================
gitextract_t4c_0bd0/
├── .gitignore
├── .pylintrc
├── LICENSE
├── Makefile
├── README.md
├── data_tools/
│ ├── __init__.py
│ ├── check-tsv
│ ├── convert_date.py
│ ├── counting_sort.py
│ ├── csv-to-postgres
│ ├── csv_to_json.py
│ ├── csv_to_xlsx.py
│ ├── date_fill.py
│ ├── date_seq.py
│ ├── header-sort
│ ├── highlight.py
│ ├── html_table_to_csv.py
│ ├── join_tsv.py
│ ├── json-diff
│ ├── normalize_utf8.py
│ ├── postgres-to-csv
│ ├── reservoir_sample.py
│ ├── set-diff.sh
│ ├── set-intersect
│ ├── tokenize
│ ├── trim_tsv.py
│ ├── tsv-header
│ ├── tsv_to_json.py
│ ├── xlsx_to_csv.py
│ └── yaml_to_json.py
├── doc/
│ ├── check-tsv.1.md
│ ├── convert-date.1.md
│ ├── counting-sort.1.md
│ ├── csv-to-json.1.md
│ ├── csv-to-postgres.1.md
│ ├── csv-to-tab.1.md
│ ├── csv-to-xlsx.1.md
│ ├── date-seq.1.md
│ ├── header-sort.1.md
│ ├── highlight.1.md
│ ├── html-table-to-csv.1.md
│ ├── join-tsv.1.md
│ ├── json-diff.1.md
│ ├── normalize-utf8.1.md
│ ├── postgres-to-csv.1.md
│ ├── reservoir-sample.1.md
│ ├── set-diff.1.md
│ ├── set-intersect.1.md
│ ├── tab-to-csv.1.md
│ ├── tokenize.1.md
│ ├── trim-tsv.1.md
│ ├── tsv-header.1.md
│ ├── tsv-to-json.1.md
│ ├── utf8-category.1.md
│ ├── utf8-script.1.md
│ └── xlsx-to-csv.1.md
├── man/
│ ├── check-tsv.1
│ ├── convert-date.1
│ ├── counting-sort.1
│ ├── csv-to-json.1
│ ├── csv-to-postgres.1
│ ├── csv-to-tab.1
│ ├── csv-to-xlsx.1
│ ├── date-seq.1
│ ├── header-sort.1
│ ├── highlight.1
│ ├── html-table-to-csv.1
│ ├── iso_8859-1.7
│ ├── join-tsv.1
│ ├── json-diff.1
│ ├── normalize-utf8.1
│ ├── postgres-to-csv.1
│ ├── reservoir-sample.1
│ ├── set-diff.1
│ ├── set-intersect.1
│ ├── tab-to-csv.1
│ ├── tokenize.1
│ ├── trim-tsv.1
│ ├── tsv-header.1
│ ├── tsv-to-json.1
│ ├── utf8-category.1
│ ├── utf8-script.1
│ └── xlsx-to-csv.1
├── requirements.txt
├── setup.py
├── src/
│ ├── csv-to-tab/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── csv_to_tab.c
│ │ ├── state.dot
│ │ └── test/
│ │ ├── expected.output/
│ │ │ ├── backslash.default.tab
│ │ │ ├── backslash.escape.tab
│ │ │ ├── backslash.replace.tab
│ │ │ ├── backslash.strip.tab
│ │ │ ├── cr.escape.tab
│ │ │ ├── cr.replace.tab
│ │ │ ├── cr.strip.tab
│ │ │ ├── four.tab
│ │ │ ├── newline.escape.tab
│ │ │ ├── newline.replace.tab
│ │ │ ├── newline.strip.tab
│ │ │ ├── one.tab
│ │ │ ├── tab.escape.tab
│ │ │ ├── tab.replace.tab
│ │ │ ├── tab.strip.tab
│ │ │ ├── three.tab
│ │ │ └── two.tab
│ │ └── input/
│ │ ├── backslash.csv
│ │ ├── cr.csv
│ │ ├── four.csv
│ │ ├── newline.csv
│ │ ├── one.csv
│ │ ├── tab.csv
│ │ ├── three.csv
│ │ └── two.csv
│ ├── json-pluck/
│ │ ├── Makefile
│ │ ├── json_pluck.c
│ │ └── test/
│ │ ├── expected.output/
│ │ │ ├── sample.json
│ │ │ └── sample2.json
│ │ └── input/
│ │ ├── sample.json
│ │ └── sample2.json
│ ├── tab-to-csv/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── tab_to_csv.c
│ │ └── test/
│ │ ├── expected.output/
│ │ │ ├── backslash.default.csv
│ │ │ ├── backslash.unescape.csv
│ │ │ ├── cr.default.csv
│ │ │ ├── cr.unescape.csv
│ │ │ ├── newline.default.csv
│ │ │ ├── newline.unescape.csv
│ │ │ ├── one.csv
│ │ │ ├── tab.default.csv
│ │ │ └── tab.unescape.csv
│ │ └── input/
│ │ ├── backslash.tab
│ │ ├── cr.tab
│ │ ├── newline.tab
│ │ ├── one.tab
│ │ └── tab.tab
│ └── utf8-script/
│ ├── Makefile
│ ├── README.md
│ ├── Scripts.txt
│ ├── UnicodeData.txt
│ ├── generate_category.rb
│ ├── generate_script.rb
│ ├── test/
│ │ ├── utf8-category/
│ │ │ ├── expected.output/
│ │ │ │ └── one.txt
│ │ │ └── input/
│ │ │ └── one.txt
│ │ └── utf8-script/
│ │ ├── expected.output/
│ │ │ └── one.txt
│ │ └── input/
│ │ └── one.txt
│ ├── utf8_category.c
│ ├── utf8_category.c.erb
│ ├── utf8_script.c
│ └── utf8_script.c.erb
└── test/
├── check_tsv/
│ ├── input.bad.tsv
│ └── input.good.tsv
├── check_yaml/
│ ├── bad.yaml
│ └── good.yaml
├── convert_date/
│ └── input.txt
├── counting_sort/
│ └── input.txt
├── csv_files/
│ ├── no-header.csv
│ ├── no-quote.csv
│ ├── quoted-chars.csv
│ ├── single-quote.csv
│ ├── unequal-rows.csv
│ └── unicode.csv
├── csv_to_json/
│ └── test.csv
├── csv_to_postgres/
│ ├── customers.csv
│ └── customers.sql
├── csv_to_tab/
│ ├── expected.escape.tab
│ ├── expected.strip.tab
│ ├── expected.tab
│ └── expected.unicode.tab
├── date_fill/
│ ├── expected.output.tsv
│ └── input.tsv
├── highlight/
│ ├── expected.output.txt
│ └── input.txt
├── html_table_to_csv/
│ ├── expected.test.csv
│ └── test.html
├── join_tsv/
│ ├── expected.output.NULL_VALUE.tsv
│ ├── expected.output.diff.tsv
│ ├── expected.output.left.tsv
│ ├── expected.output.left2.tsv
│ ├── expected.output.right.tsv
│ ├── expected.output.tsv
│ ├── input1.NULL_VALUE.tsv
│ ├── input1.diff.tsv
│ ├── input1.left.tsv
│ ├── input1.null.tsv
│ ├── input1.tsv
│ ├── input2.NULL_VALUE.tsv
│ ├── input2.diff.tsv
│ ├── input2.left.tsv
│ ├── input2.null.tsv
│ └── input2.tsv
├── json_diff/
│ ├── 1a.json
│ ├── 1b.json
│ ├── 2a.json
│ ├── 2b.json
│ ├── expected.output1.txt
│ └── expected.output2.txt
├── normalize_utf8/
│ ├── expected.output.nfc.txt
│ ├── expected.output.nfd.txt
│ ├── expected.output.txt
│ └── input.txt
├── reservoir_sample/
│ ├── expected.output.txt
│ └── input.txt
├── trim_tsv/
│ ├── expected.trim_tsv.tsv
│ └── input.tsv
├── tsv_header/
│ ├── expected.output.txt
│ └── input.tsv
├── tsv_to_csv/
│ └── escapes.tsv
├── tsv_to_json/
│ └── test.tsv
├── xlsx_to_csv/
│ ├── expected.3r3c.csv
│ ├── expected.dates.csv
│ ├── expected.list.out
│ ├── expected.spaces.csv
│ ├── expected.unicode.csv
│ ├── test.xls
│ └── test.xlsx
└── yaml_to_json/
└── input.yaml
SYMBOL INDEX (113 symbols across 24 files)
FILE: data_tools/convert_date.py
function convert (line 11) | def convert(input_fmt, output_fmt, s):
function convert_date (line 20) | def convert_date(fin, fout, input_fmt, output_fmt, column):
function main (line 39) | def main():
FILE: data_tools/counting_sort.py
function usage (line 7) | def usage():
function counting_sort (line 12) | def counting_sort(input_stream, output_stream):
function main (line 21) | def main():
FILE: data_tools/csv_to_json.py
function csv_to_json (line 10) | def csv_to_json(input_stream, output_stream, header_str, delimiter, quot...
function main (line 23) | def main():
FILE: data_tools/csv_to_xlsx.py
function path_to_sheetname (line 19) | def path_to_sheetname(path):
function csv_to_xlsx (line 27) | def csv_to_xlsx(input_files, output_file):
function main (line 54) | def main():
FILE: data_tools/date_fill.py
function make_year_iterator (line 24) | def make_year_iterator(start,
function make_month_iterator (line 39) | def make_month_iterator(start,
function make_date_iterator (line 64) | def make_date_iterator(start_dt,
function load_rows (line 79) | def load_rows(input_path, date_column, no_header):
function date_fill (line 95) | def date_fill(input_path,
function main (line 197) | def main():
FILE: data_tools/date_seq.py
function check (line 23) | def check(dt, fmt, regex_date_filter):
function make_year_iterator (line 31) | def make_year_iterator(start,
function make_month_iterator (line 48) | def make_month_iterator(start,
function make_date_iterator (line 75) | def make_date_iterator(start,
function date_seq (line 103) | def date_seq(start,
function main (line 166) | def main():
FILE: data_tools/highlight.py
function highlight (line 32) | def highlight(input_stream, output_stream, esc_seq_to_pattern):
function main (line 42) | def main():
FILE: data_tools/html_table_to_csv.py
function html_table_to_csv (line 13) | def html_table_to_csv(input_f: IO, output_f: IO, table_num: int) -> None:
function main (line 28) | def main():
FILE: data_tools/join_tsv.py
function header_and_column_to_rows (line 18) | def header_and_column_to_rows(path, column):
function print_row (line 46) | def print_row(join_value, fields1, fields2, f):
function join_tsv (line 55) | def join_tsv(left_join_column,
function main (line 149) | def main():
FILE: data_tools/normalize_utf8.py
function normalize_utf8 (line 13) | def normalize_utf8(input_stream, output_stream, normalization_form):
function main (line 26) | def main():
FILE: data_tools/reservoir_sample.py
function reservoir_sample (line 7) | def reservoir_sample(count, input_stream, output_stream):
function main (line 31) | def main():
FILE: data_tools/trim_tsv.py
function trim_tsv (line 8) | def trim_tsv(input_stream, output_stream):
function main (line 16) | def main():
FILE: data_tools/tsv_to_json.py
function main (line 9) | def main():
FILE: data_tools/xlsx_to_csv.py
function list_xlsx_sheets (line 16) | def list_xlsx_sheets(xlsx_path, output_stream):
function sheet_name_to_filename (line 23) | def sheet_name_to_filename(sheet_name):
function cell_to_str (line 27) | def cell_to_str(cell, date_fmt, datemode):
function xlsx_book_to_csv (line 40) | def xlsx_book_to_csv(book, sheet_path, sheet_name, date_fmt):
function xlsx_path_to_csv (line 56) | def xlsx_path_to_csv(xlsx_path, sheet_path, sheet_name, date_fmt):
function xlsx_path_to_csvs (line 61) | def xlsx_path_to_csvs(xlsx_path, dir_path, date_fmt):
function main (line 69) | def main():
FILE: data_tools/yaml_to_json.py
function main (line 8) | def main():
FILE: setup.py
function build (line 24) | def build():
FILE: src/csv-to-tab/csv_to_tab.c
type parse_state (line 10) | enum parse_state {
type invalid_char (line 19) | enum invalid_char {
function fatal (line 26) | void
function handle_invalid_char (line 33) | void
function csv_to_tab (line 54) | int
function main (line 256) | int
FILE: src/json-pluck/json_pluck.c
function fatal (line 7) | void
type json_type (line 14) | typedef bool json_type;
type json_type_stack (line 19) | typedef struct {
function init_stack (line 25) | void
function push_stack (line 32) | void
function json_type (line 43) | json_type
function json_type (line 52) | json_type
function json_pluck (line 60) | int
function main (line 201) | int
FILE: src/tab-to-csv/tab_to_csv.c
function write_wchars (line 13) | static inline void
function tab_to_csv (line 41) | int tab_to_csv(FILE *input_stream, bool unescape_char) {
function main (line 219) | int
FILE: src/utf8-script/generate_category.rb
class DecisionNode (line 49) | class DecisionNode
method initialize (line 52) | def initialize(categories, left, right, x = nil)
method best_x (line 71) | def best_x
method render_increment_counts (line 88) | def render_increment_counts(indent = INDENT + INDENT)
class Categories (line 103) | class Categories
method initialize (line 109) | def initialize(path)
method dump (line 139) | def dump
method count (line 143) | def count
method render_enum (line 147) | def render_enum
method render_category_strings (line 156) | def render_category_strings
method render_long_category_strings (line 164) | def render_long_category_strings
method starts (line 178) | def starts
method ranges (line 183) | def ranges
method ranges_intersecting_interval (line 187) | def ranges_intersecting_interval(start, ending)
method name_of_intersecting_category (line 195) | def name_of_intersecting_category(start, ending)
method probability (line 204) | def probability(start, ending)
method entropy (line 208) | def entropy(start, ending)
method add_unknown_ranges (line 213) | def add_unknown_ranges
method collapse_ranges (line 242) | def collapse_ranges
function generate (line 265) | def generate(categories, tree, template, output_stream)
FILE: src/utf8-script/generate_script.rb
class DecisionNode (line 11) | class DecisionNode
method initialize (line 14) | def initialize(scripts, left, right, x = nil)
method best_x (line 33) | def best_x
method render_increment_counts (line 50) | def render_increment_counts(indent = INDENT + INDENT)
class Scripts (line 65) | class Scripts
method initialize (line 74) | def initialize(path)
method dump (line 96) | def dump
method count (line 100) | def count
method render_enum (line 104) | def render_enum
method render_script_strings (line 113) | def render_script_strings
method starts (line 121) | def starts
method ranges (line 126) | def ranges
method ranges_intersecting_interval (line 130) | def ranges_intersecting_interval(start, ending)
method name_of_intersecting_script (line 138) | def name_of_intersecting_script(start, ending)
method probability (line 147) | def probability(start, ending)
method entropy (line 151) | def entropy(start, ending)
method add_unknown_ranges (line 156) | def add_unknown_ranges
method collapse_ranges (line 185) | def collapse_ranges
function generate (line 208) | def generate(scripts, tree, template, output_stream)
FILE: src/utf8-script/utf8_category.c
type option (line 9) | struct option
type unicode_category (line 16) | enum unicode_category {
function usage (line 115) | void
function main (line 121) | int
FILE: src/utf8-script/utf8_script.c
type option (line 9) | struct option
type unicode_script (line 15) | enum unicode_script {
function usage (line 363) | void
function main (line 369) | int
FILE: test/csv_to_postgres/customers.sql
type customers (line 1) | create table customers ( name text, id integer, address text )
Condensed preview — 219 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (3,399K chars).
[
{
"path": ".gitignore",
"chars": 176,
"preview": "output\nsrc/csv-to-tab/csv-to-tab\nsrc/tab-to-csv/tab-to-csv\nsrc/json-pluck/json-pluck\nve\nbuild\ndata_tools.egg-info\ndist\ns"
},
{
"path": ".pylintrc",
"chars": 466,
"preview": "# Keep warnings which flag usages which are wrong or useless.\n#\n# Keep style warnings if we agree with them and they can"
},
{
"path": "LICENSE",
"chars": 1056,
"preview": "Copyright (C) 2014 Clark Grubb\n\n\nPermission is hereby granted, free of charge, to any person obtaining\na copy of this so"
},
{
"path": "Makefile",
"chars": 12762,
"preview": "MAKEFLAGS += --warn-undefined-variables\nSHELL := bash\n.SHELLFLAGS := -e -o pipefail -c\n.DEFAULT_GOAL := all\n.DELETE_ON_E"
},
{
"path": "README.md",
"chars": 42463,
"preview": "[summary](#summary) | [setup](#setup) | [how to run](#how-to-run)\n\n[.txt](#txt) | [.tsv](#tsv) | [.tab](#tab) | [.csv](#"
},
{
"path": "data_tools/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "data_tools/check-tsv",
"chars": 318,
"preview": "#!/usr/bin/env bash\n\nset -eu -o pipefail\n\nscript='BEGIN {FS=\"\\t\"; OFS=\"\\t\"} {cnt[NF] += 1} END {for (i in cnt) print i, "
},
{
"path": "data_tools/convert_date.py",
"chars": 2556,
"preview": "#!/usr/bin/env python3\n\nimport argparse\nimport datetime\nimport sys\n\nDEFAULT_FMT1 = '%Y-%m-%dT%H:%M:%S'\nDEFAULT_FMT2 = '%"
},
{
"path": "data_tools/counting_sort.py",
"chars": 763,
"preview": "#!/usr/bin/env python3\n\nimport collections\nimport sys\n\n\ndef usage():\n sys.stderr.write(\"USAGE: counting-sort [FILE]\\n"
},
{
"path": "data_tools/csv-to-postgres",
"chars": 856,
"preview": "#!/usr/bin/env bash\n\nset -eu -o pipefail\n\npsql=psql\ntable=\npath=\n\nwhile getopts \"d:f:h:p:t:U:wW\" opt\ndo\n case \"$opt\" "
},
{
"path": "data_tools/csv_to_json.py",
"chars": 1199,
"preview": "#!/usr/bin/env python3\nimport argparse\nimport csv\nimport json\nimport sys\n\nENCODING = 'utf-8'\n\n\ndef csv_to_json(input_str"
},
{
"path": "data_tools/csv_to_xlsx.py",
"chars": 2464,
"preview": "#!/usr/bin/env python3\n\nimport argparse\nimport csv\nimport re\nimport sys\n\nimport openpyxl\n\nREGEX_CSV_SUFFIX = re.compile("
},
{
"path": "data_tools/date_fill.py",
"chars": 7108,
"preview": "#!/usr/bin/env python3\n\nimport argparse\nimport datetime\nimport re\nimport sys\n\nimport pprint\nPP = pprint.PrettyPrinter()\n"
},
{
"path": "data_tools/date_seq.py",
"chars": 5406,
"preview": "#!/usr/bin/env python3\n\nimport argparse\nimport datetime\nimport re\nimport sys\n\nimport pprint\nPP = pprint.PrettyPrinter()\n"
},
{
"path": "data_tools/header-sort",
"chars": 246,
"preview": "#!/usr/bin/env bash\n\nif [ $# -eq 0 ]\nthen\n echo \"USAGE: $0 [OPTIONS] FILE\"\n exit 1\nfi\n\nfile=\"${!#}\"\n\nif [ ! -f \"$f"
},
{
"path": "data_tools/highlight.py",
"chars": 7078,
"preview": "#!/usr/bin/env python3\n\nimport argparse\nimport re\nimport sys\n\nNORMAL = '\\033[m'\nBLACK_FOREGROUND = '\\033[01;30m'\nRED_FOR"
},
{
"path": "data_tools/html_table_to_csv.py",
"chars": 1223,
"preview": "#!/usr/bin/env python3\nimport argparse\nimport csv\nimport re\nimport sys\nfrom typing import IO\n\nimport bs4\n\nRX_TH_OR_TD = "
},
{
"path": "data_tools/join_tsv.py",
"chars": 8021,
"preview": "#!/usr/bin/env python3\n\nimport argparse\nimport collections\nimport os\nimport sys\n\nENCODING = 'utf-8'\nBIG_FIRST = 1\nBIG_LA"
},
{
"path": "data_tools/json-diff",
"chars": 748,
"preview": "#!/usr/bin/env bash\n\nset -eu -o pipefail\n\nif [ \"$#\" -lt 2 ]\nthen\n echo \"USAGE: json-diff [DIFF_OPTIONS] PATH1 PATH2\" "
},
{
"path": "data_tools/normalize_utf8.py",
"chars": 1870,
"preview": "#!/usr/bin/env python3\nimport argparse\nimport sys\nimport unicodedata\n\nENCODING = 'utf-8'\nNFC = 'NFC'\nNFD = 'NFD'\nNFKC = "
},
{
"path": "data_tools/postgres-to-csv",
"chars": 770,
"preview": "#!/usr/bin/env bash\n\nset -eu -o pipefail\n\npsql=psql\ntable=\n\nwhile getopts \"d:h:p:t:U:wW\" opt\ndo\n case \"$opt\" in\n "
},
{
"path": "data_tools/reservoir_sample.py",
"chars": 1400,
"preview": "#!/usr/bin/env python3\nimport argparse\nimport random\nimport sys\n\n\ndef reservoir_sample(count, input_stream, output_strea"
},
{
"path": "data_tools/set-diff.sh",
"chars": 241,
"preview": "#!/usr/bin/env bash\n\nif [ $# -ne 2 ]\nthen\n echo \"USAGE: set-diff FILE1 FILE2\"\n exit 1\nfi\n\nsorted1=$(mktemp)\nsorted"
},
{
"path": "data_tools/set-intersect",
"chars": 246,
"preview": "#!/usr/bin/env bash\n\nif [ $# -ne 2 ]\nthen\n echo \"USAGE: set-intersect FILE1 FILE2\"\n exit 1\nfi\n\nsorted1=$(mktemp)\ns"
},
{
"path": "data_tools/tokenize",
"chars": 330,
"preview": "#!/usr/bin/env bash\n\nif [ \"$#\" -eq 1 ] && [ \"$1\" = \"-n\" ]\nthen\n tr -C '0-9a-zA-Z\\047\\052\\053\\055\\057\\075\\134\\136\\137\\"
},
{
"path": "data_tools/trim_tsv.py",
"chars": 617,
"preview": "#!/usr/bin/env python3\n\nimport sys\n\nDELIMITER = '\\t'\n\n\ndef trim_tsv(input_stream, output_stream):\n for line in input_"
},
{
"path": "data_tools/tsv-header",
"chars": 278,
"preview": "#!/usr/bin/env bash\n\nset -eu -o pipefail\n\nif [ \"$#\" -gt 1 ]\nthen\n echo \"USAGE: tsv-head PATH\" >&2\n exit 1\nelif [ \""
},
{
"path": "data_tools/tsv_to_json.py",
"chars": 877,
"preview": "#!/usr/bin/env python3\nimport json\nimport sys\n\nENCODING = 'utf-8'\nNEWLINE_CHARS = u'\\f\\n\\r\\v\\x85\\u2028\\u2029'\n\n\ndef main"
},
{
"path": "data_tools/xlsx_to_csv.py",
"chars": 3503,
"preview": "#!/usr/bin/env python3\nimport argparse\nimport datetime\nimport csv\nimport os\nimport pprint\nimport sys\nimport xlrd\n\nDATE_F"
},
{
"path": "data_tools/yaml_to_json.py",
"chars": 523,
"preview": "#!/usr/bin/env python3\n\nimport json\nimport sys\nimport yaml\n\n\ndef main():\n try:\n if (len(sys.argv) > 2):\n "
},
{
"path": "doc/check-tsv.1.md",
"chars": 490,
"preview": "% CHECK-TSV(1)\n% Clark Grubb\n% March 6, 2015\n\n# NAME\n\ncheck-tsv - check whether all rows in a TSV file have the same num"
},
{
"path": "doc/convert-date.1.md",
"chars": 1154,
"preview": "% CONVERT-DATE(1)\n% Clark Grubb\n% July 18, 2015\n\n# NAME\n\nconvert-date - convert the date format of a column of tab-delim"
},
{
"path": "doc/counting-sort.1.md",
"chars": 554,
"preview": "% COUNTING-SORT(1)\n% Clark Grubb\n% May 6, 2014\n\n# NAME\n\ncounting-sort - perform counting sort on a file or standard inpu"
},
{
"path": "doc/csv-to-json.1.md",
"chars": 945,
"preview": "% CSV-TO-JSON(1)\n% Clark Grubb\n% June 4, 2013\n\n\n# NAME\n\ncsv-to-json - convert CSV to JSON\n\n# SYNOPSIS\n\ncsv-to-json OPTIO"
},
{
"path": "doc/csv-to-postgres.1.md",
"chars": 851,
"preview": "% CSV-TO-POSTGRES(1)\n% Clark Grubb\n% March 21, 2015\n\n# NAME\n\ncsv-to-postgres - import a CSV file to a PostgreSQL table\n\n"
},
{
"path": "doc/csv-to-tab.1.md",
"chars": 1122,
"preview": "% CSV-TO-TAB(1)\n% Clark Grubb\n% February 16, 2013\n\n\n# NAME\n\ncsv-to-tab - convert CSV to tab delimited\n\n# SYNOPSIS\n\ncsv-t"
},
{
"path": "doc/csv-to-xlsx.1.md",
"chars": 1046,
"preview": "% CSV-TO-XLSX(1)\n% Clark Grubb\n% November 7, 2013\n\n\n# NAME\n\ncsv-to-xlsx - convert CSV files to XLSX worksheets\n\n# SYNOPS"
},
{
"path": "doc/date-seq.1.md",
"chars": 1651,
"preview": "% DATE-SEQ(1)\n% Clark Grubb\n% June 17, 2013\n\n\n# NAME\n\ndate-seq - print sequence of dates or times\n\n# SYNOPSIS\n\ndate-seq "
},
{
"path": "doc/header-sort.1.md",
"chars": 282,
"preview": "% HEADER-SORT(1)\n% Clark Grubb\n% June 4, 2013\n\n\n# NAME\n\nheader-sort - sort file with header\n\n# SYNOPSIS\n\nheader-sort \\[O"
},
{
"path": "doc/highlight.1.md",
"chars": 2899,
"preview": "% HIGHLIGHT(1)\n% Clark Grubb\n% September 12, 2013\n\n\n# NAME\n\nhighlight - highlight text in a stream maching a regular exp"
},
{
"path": "doc/html-table-to-csv.1.md",
"chars": 568,
"preview": "% HTML-TABLE-TO-CSV(1)\n% Clark Grubb\n% March 26, 2017\n\n\n# NAME\n\nhtml-table-to-csv - convert CSV to JSON\n\n# SYNOPSIS\n\nhtm"
},
{
"path": "doc/join-tsv.1.md",
"chars": 2106,
"preview": "% JOIN-TSV(1)\n% Clark Grubb\n% October 21, 2013\n\n\n# NAME\n\njoin-tsv - perform a relation join on two TSV files\n\n# SYNOPSIS"
},
{
"path": "doc/json-diff.1.md",
"chars": 325,
"preview": "% JSON-DIFF(1)\n% Clark Grubb\n% July 29, 2014\n\n\n# NAME\n\njson-diff - run diff on two JSON documents\n\n# SYNOPSIS\n\njson-diff"
},
{
"path": "doc/normalize-utf8.1.md",
"chars": 1790,
"preview": "% NORMALIZE-UTF8(1)\n% Clark Grubb\n% February 8, 2014\n\n\n# NAME\n\nnormalize-utf8 - convert UTF-8 encoded files or standard "
},
{
"path": "doc/postgres-to-csv.1.md",
"chars": 751,
"preview": "% POSTGRES-TO-CSV(1)\n% Clark Grubb\n% March 21, 2015\n\n# NAME\n\npostgres-to-csv - export a PostgreSQL table to a CSV file\n\n"
},
{
"path": "doc/reservoir-sample.1.md",
"chars": 835,
"preview": "% RESERVOIR-SAMPLE(1)\n% Clark Grubb\n% October 13, 2013\n\n# NAME\n\nreservoir-sample - sample lines from file or standard in"
},
{
"path": "doc/set-diff.1.md",
"chars": 538,
"preview": "% SET-DIFF(1)\n% Clark Grubb\n% May 6, 2013\n\n\n# NAME\n\nset-diff - find lines in first file which are not in the second\n\n# S"
},
{
"path": "doc/set-intersect.1.md",
"chars": 438,
"preview": "% SET-INTERSECT(1)\n% Clark Grubb\n% May 6, 2013\n\n\n# NAME\n\nset-intersect - find lines common to two files\n\n# SYNOPSIS\n\nset"
},
{
"path": "doc/tab-to-csv.1.md",
"chars": 671,
"preview": "% TAB-TO-CSV(1)\n% Clark Grubb\n% February 16, 2013\n\n\n# NAME\n\ntab-to-csv - convert tab delimited to CSV\n\n# SYNOPSIS\n\ntab-t"
},
{
"path": "doc/tokenize.1.md",
"chars": 846,
"preview": "% TOKENIZE(1)\n% Clark Grubb\n% February 15, 2015\n\n# NAME\n\ntokenize - extract words from English language text\n\n# SYNOPSIS"
},
{
"path": "doc/trim-tsv.1.md",
"chars": 347,
"preview": "% TRIM-TSV(1)\n% Clark Grubb\n% September 25, 2013\n\n\n# NAME\n\ntrim-tsv - trim whitespace from fields in a tab delimited fil"
},
{
"path": "doc/tsv-header.1.md",
"chars": 399,
"preview": "% TSV-HEADER(1)\n% Clark Grubb\n% March 6, 2015\n\n# NAME\n\ntsv-header - number the columns in a TSV header\n\n# SYNOPSIS\n\ntsv-"
},
{
"path": "doc/tsv-to-json.1.md",
"chars": 605,
"preview": "% TSV-TO-JSON(1)\n% Clark Grubb\n% June 4, 2013\n\n\n# NAME\n\ntsv-to-json - convert TSV to JSON\n\n# SYNOPSIS\n\ntsv-to-json OPTIO"
},
{
"path": "doc/utf8-category.1.md",
"chars": 3230,
"preview": "% UTF8-SCRIPT(1)\n% Clark Grubb\n% February 14, 2015\n\n\n# NAME\n\nutf8-script - tally UTF-8 encoded characters by general cat"
},
{
"path": "doc/utf8-script.1.md",
"chars": 518,
"preview": "% UTF8-SCRIPT(1)\n% Clark Grubb\n% February 14, 2015\n\n\n# NAME\n\nutf8-script - tally characters by UTF-8 script \n\n# SYNOPSIS"
},
{
"path": "doc/xlsx-to-csv.1.md",
"chars": 1411,
"preview": "% XLSX-TO-CSV(1)\n% Clark Grubb\n% May 4, 2013\n\n# NAME\n\nxlsx-to-csv - convert .xlsx to .csv\n\n# SYNOPSIS\n\n xlsx-to-csv X"
},
{
"path": "man/check-tsv.1",
"chars": 563,
"preview": ".TH CHECK\\-TSV 1 \"March 6, 2015\" \n.SH NAME\n.PP\ncheck\\-tsv \\- check whether all rows in a TSV file have the same number\no"
},
{
"path": "man/convert-date.1",
"chars": 1364,
"preview": ".TH \"CONVERT\\-DATE\" \"1\" \"July 18, 2015\" \"\" \"\"\n.SH NAME\n.PP\nconvert\\-date \\- convert the date format of a column of tab\\-"
},
{
"path": "man/counting-sort.1",
"chars": 625,
"preview": ".TH COUNTING\\-SORT 1 \"May 6, 2014\" \n.SH NAME\n.PP\ncounting\\-sort \\- perform counting sort on a file or standard input\n.SH"
},
{
"path": "man/csv-to-json.1",
"chars": 1000,
"preview": ".TH CSV-TO-JSON 1 \"June 4, 2013\" \n.SH NAME\n.PP\ncsv-to-json - convert CSV to JSON\n.SH SYNOPSIS\n.PP\ncsv-to-json OPTIONS [C"
},
{
"path": "man/csv-to-postgres.1",
"chars": 955,
"preview": ".TH CSV-TO-POSTGRES 1 \"March 21, 2015\" \n.SH NAME\n.PP\ncsv-to-postgres - import a CSV file to a PostgreSQL table\n.SH SYNOP"
},
{
"path": "man/csv-to-tab.1",
"chars": 1250,
"preview": ".TH \"CSV\\-TO\\-TAB\" \"1\" \"February 16, 2013\" \"\" \"\"\n.SH NAME\n.PP\ncsv\\-to\\-tab \\- convert CSV to tab delimited\n.SH SYNOPSIS\n"
},
{
"path": "man/csv-to-xlsx.1",
"chars": 1145,
"preview": ".TH CSV\\-TO\\-XLSX 1 \"November 7, 2013\" \n.SH NAME\n.PP\ncsv\\-to\\-xlsx \\- convert CSV files to XLSX worksheets\n.SH SYNOPSIS\n"
},
{
"path": "man/date-seq.1",
"chars": 1894,
"preview": ".TH DATE\\-SEQ 1 \"June 17, 2013\" \n.SH NAME\n.PP\ndate\\-seq \\- print sequence of dates or times\n.SH SYNOPSIS\n.PP\ndate\\-seq ["
},
{
"path": "man/header-sort.1",
"chars": 337,
"preview": ".TH HEADER\\-SORT 1 \"June 4, 2013\" \n.SH NAME\n.PP\nheader\\-sort \\- sort file with header\n.SH SYNOPSIS\n.PP\nheader\\-sort [OPT"
},
{
"path": "man/highlight.1",
"chars": 3168,
"preview": ".TH HIGHLIGHT 1 \"September 12, 2013\" \n.SH NAME\n.PP\nhighlight \\- highlight text in a stream maching a regular expression\n"
},
{
"path": "man/html-table-to-csv.1",
"chars": 682,
"preview": ".\\\" Automatically generated by Pandoc 1.19.2.1\n.\\\"\n.TH \"HTML\\-TABLE\\-TO\\-CSV\" \"1\" \"March 26, 2017\" \"\" \"\"\n.hy\n.SH NAME\n.P"
},
{
"path": "man/iso_8859-1.7",
"chars": 7157,
"preview": "'\\\" t\n.\\\" Copyright 1993-1995 Daniel Quinlan (quinlan@yggdrasil.com)\n.\\\"\n.\\\" %%%LICENSE_START(GPLv2+_DOC_FULL)\n.\\\" This "
},
{
"path": "man/join-tsv.1",
"chars": 2202,
"preview": ".TH JOIN-TSV 1 \"October 21, 2013\" \n.SH NAME\n.PP\njoin-tsv - perform a relation join on two TSV files\n.SH SYNOPSIS\n.PP\njoi"
},
{
"path": "man/json-diff.1",
"chars": 390,
"preview": ".TH JSON\\-DIFF 1 \"July 29, 2014\" \n.SH NAME\n.PP\njson\\-diff \\- run diff on two JSON documents\n.SH SYNOPSIS\n.PP\njson\\-diff "
},
{
"path": "man/normalize-utf8.1",
"chars": 1853,
"preview": ".TH NORMALIZE-UTF8 1 \"February 8, 2014\" \n.SH NAME\n.PP\nnormalize-utf8 - convert UTF-8 encoded files or standard input to "
},
{
"path": "man/postgres-to-csv.1",
"chars": 842,
"preview": ".TH POSTGRES-TO-CSV 1 \"March 21, 2015\" \n.SH NAME\n.PP\npostgres-to-csv - export a PostgreSQL table to a CSV file\n.SH SYNOP"
},
{
"path": "man/reservoir-sample.1",
"chars": 948,
"preview": ".TH RESERVOIR-SAMPLE 1 \"October 13, 2013\" \n.SH NAME\n.PP\nreservoir-sample - sample lines from file or standard input\n.SH "
},
{
"path": "man/set-diff.1",
"chars": 617,
"preview": ".TH SET\\-DIFF 1 \"May 6, 2013\" \n.SH NAME\n.PP\nset\\-diff \\- find lines in first file which are not in the second\n.SH SYNOPS"
},
{
"path": "man/set-intersect.1",
"chars": 516,
"preview": ".TH SET\\-INTERSECT 1 \"May 6, 2013\" \n.SH NAME\n.PP\nset\\-intersect \\- find lines common to two files\n.SH SYNOPSIS\n.PP\nset\\-"
},
{
"path": "man/tab-to-csv.1",
"chars": 712,
"preview": ".TH TAB-TO-CSV 1 \"February 16, 2013\" \n.SH NAME\n.PP\ntab-to-csv - convert tab delimited to CSV\n.SH SYNOPSIS\n.PP\ntab-to-csv"
},
{
"path": "man/tokenize.1",
"chars": 963,
"preview": ".TH TOKENIZE 1 \"February 15, 2015\" \n.SH NAME\n.PP\ntokenize - extract words from English language text\n.SH SYNOPSIS\n.PP\nto"
},
{
"path": "man/trim-tsv.1",
"chars": 389,
"preview": ".TH TRIM\\-TSV 1 \"September 25, 2013\" \n.SH NAME\n.PP\ntrim\\-tsv \\- trim whitespace from fields in a tab delimited file\n.SH "
},
{
"path": "man/tsv-header.1",
"chars": 482,
"preview": ".TH TSV\\-HEADER 1 \"March 6, 2015\" \n.SH NAME\n.PP\ntsv\\-header \\- number the columns in a TSV header\n.SH SYNOPSIS\n.PP\ntsv\\-"
},
{
"path": "man/tsv-to-json.1",
"chars": 659,
"preview": ".TH TSV-TO-JSON 1 \"June 4, 2013\" \n.SH NAME\n.PP\ntsv-to-json - convert TSV to JSON\n.SH SYNOPSIS\n.PP\ntsv-to-json OPTIONS [T"
},
{
"path": "man/utf8-category.1",
"chars": 3969,
"preview": ".TH UTF8-SCRIPT 1 \"February 14, 2015\" \n.SH NAME\n.PP\nutf8-script - tally UTF-8 encoded characters by general category\n.SH"
},
{
"path": "man/utf8-script.1",
"chars": 546,
"preview": ".TH UTF8-SCRIPT 1 \"February 14, 2015\" \n.SH NAME\n.PP\nutf8-script - tally characters by UTF-8 script\n.SH SYNOPSIS\n.PP\nutf8"
},
{
"path": "man/xlsx-to-csv.1",
"chars": 1503,
"preview": ".TH XLSX-TO-CSV 1 \"May 4, 2013\" \n.SH NAME\n.PP\nxlsx-to-csv - convert .xlsx to .csv\n.SH SYNOPSIS\n.IP\n.nf\n\\f[C]\nxlsx-to-csv"
},
{
"path": "requirements.txt",
"chars": 136,
"preview": "beautifulsoup4==4.13.4\nhtml5lib==1.1\nlxml==6.0.0\nopenpyxl==2.4.5\npep8==1.7.0\npylint==1.6.5\nPyYAML==6.0.2\nsetuptools==80."
},
{
"path": "setup.py",
"chars": 2273,
"preview": "#!/usr/bin/env python3\nimport sys\nfrom setuptools import setup\n\nSHELL_TOOLS = [\n 'data_tools/check-tsv',\n 'data_to"
},
{
"path": "src/csv-to-tab/Makefile",
"chars": 2734,
"preview": "MAKEFLAGS += --warn-undefined-variables\nSHELL := bash\n.SHELLFLAGS := -eu -o pipefail -c\n.DEFAULT_GOAL := all\n.DELETE_ON_"
},
{
"path": "src/csv-to-tab/README.md",
"chars": 1911,
"preview": "# OVERVIEW\n\nConvert a UTF-8 encoded CSV file to a UTF-8 encoded tab delimited file.\n\n# DATA DEFINITIONS\n\nCSV format: [RF"
},
{
"path": "src/csv-to-tab/csv_to_tab.c",
"chars": 8368,
"preview": "#include <errno.h>\n#include <getopt.h>\n#include <locale.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#"
},
{
"path": "src/csv-to-tab/state.dot",
"chars": 2187,
"preview": "digraph {\n start -> outside_field;\n\n outside_field -> outside_field [label=\"\\\\n\"];\n outside_field -"
},
{
"path": "src/csv-to-tab/test/expected.output/backslash.default.tab",
"chars": 24,
"preview": "one\\two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/backslash.escape.tab",
"chars": 25,
"preview": "one\\\\two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/backslash.replace.tab",
"chars": 24,
"preview": "one\\two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/backslash.strip.tab",
"chars": 24,
"preview": "one\\two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/cr.escape.tab",
"chars": 25,
"preview": "one\\rtwo\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/cr.replace.tab",
"chars": 24,
"preview": "one two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/cr.strip.tab",
"chars": 23,
"preview": "onetwo\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/four.tab",
"chars": 36,
"preview": "hello \"bob\"\tone\ttwo\nthree\tfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/newline.escape.tab",
"chars": 25,
"preview": "one\\ntwo\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/newline.replace.tab",
"chars": 24,
"preview": "one two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/newline.strip.tab",
"chars": 23,
"preview": "onetwo\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/one.tab",
"chars": 19,
"preview": "one\ttwo\nthree\tfour\n"
},
{
"path": "src/csv-to-tab/test/expected.output/tab.escape.tab",
"chars": 25,
"preview": "one\\ttwo\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/tab.replace.tab",
"chars": 24,
"preview": "one two\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/tab.strip.tab",
"chars": 23,
"preview": "onetwo\tthree\nfour\tfive\n"
},
{
"path": "src/csv-to-tab/test/expected.output/three.tab",
"chars": 402,
"preview": "id\ttimestamp\tdatetime\tobject.content\turl\tprovider.name\tmeta.lang\tmeta.demographics.country\tmeta.sentiment\ttopics.id\tlink"
},
{
"path": "src/csv-to-tab/test/expected.output/two.tab",
"chars": 28,
"preview": "one one\ttwo\nthree\tfour four\n"
},
{
"path": "src/csv-to-tab/test/input/backslash.csv",
"chars": 24,
"preview": "one\\two,three\nfour,five\n"
},
{
"path": "src/csv-to-tab/test/input/cr.csv",
"chars": 24,
"preview": "one\rtwo,three\nfour,five\n"
},
{
"path": "src/csv-to-tab/test/input/four.csv",
"chars": 40,
"preview": "\"hello \"\"bob\"\"\",one,two\nthree,four,five\n"
},
{
"path": "src/csv-to-tab/test/input/newline.csv",
"chars": 26,
"preview": "\"one\ntwo\",three\nfour,five\n"
},
{
"path": "src/csv-to-tab/test/input/one.csv",
"chars": 19,
"preview": "one,two\nthree,four\n"
},
{
"path": "src/csv-to-tab/test/input/tab.csv",
"chars": 24,
"preview": "one\ttwo,three\nfour,five\n"
},
{
"path": "src/csv-to-tab/test/input/three.csv",
"chars": 406,
"preview": "id,timestamp,datetime,object.content,url,provider.name,meta.lang,meta.demographics.country,meta.sentiment,topics.id,link"
},
{
"path": "src/csv-to-tab/test/input/two.csv",
"chars": 32,
"preview": "\"one one\",two\nthree,\"four four\"\n"
},
{
"path": "src/json-pluck/Makefile",
"chars": 601,
"preview": "MAKEFLAGS += --warn-undefined-variables\nSHELL := bash\n.SHELLFLAGS := -eu -o pipefail -c\n.DEFAULT_GOAL := all\n.DELETE_ON_"
},
{
"path": "src/json-pluck/json_pluck.c",
"chars": 4250,
"preview": "#include <locale.h>\n#include <stdio.h>\n#include <stdbool.h>\n#include <stdlib.h>\n#include <wchar.h>\n\nvoid\nfatal(char *msg"
},
{
"path": "src/json-pluck/test/expected.output/sample.json",
"chars": 51,
"preview": "\"foo\"\n\"bar\"\n\"]\"\n[\"baz\",\"bum\"]\n{\"whatever\":[3,4,5]}\n"
},
{
"path": "src/json-pluck/test/expected.output/sample2.json",
"chars": 26,
"preview": "\"one\"\n{\"two\":2,\"three\":3}\n"
},
{
"path": "src/json-pluck/test/input/sample.json",
"chars": 58,
"preview": "[\"foo\", \"bar\", \"]\", [\"baz\", \"bum\"], {\"whatever\": [3,4,5]}]"
},
{
"path": "src/json-pluck/test/input/sample2.json",
"chars": 32,
"preview": "[\"one\", {\"two\": 2, \"three\": 3}]\n"
},
{
"path": "src/tab-to-csv/.gitignore",
"chars": 16,
"preview": "fast-tsv-to-csv\n"
},
{
"path": "src/tab-to-csv/Makefile",
"chars": 1886,
"preview": "MAKEFLAGS += --warn-undefined-variables\nSHELL := bash\n.SHELLFLAGS := -eu -o pipefail -c\n.DEFAULT_GOAL := all\n.DELETE_ON_"
},
{
"path": "src/tab-to-csv/README.md",
"chars": 1247,
"preview": "# OVERVIEW\n\nConvert a UTF-8 encoded TSV file to a UTF-8 encoded CSV file.\n\n# DATA DEFINITIONS\n\nTSV format: [IANA](https:"
},
{
"path": "src/tab-to-csv/tab_to_csv.c",
"chars": 6245,
"preview": "#include <errno.h>\n#include <getopt.h>\n#include <locale.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#"
},
{
"path": "src/tab-to-csv/test/expected.output/backslash.default.csv",
"chars": 24,
"preview": "foo\\\\foo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/backslash.unescape.csv",
"chars": 23,
"preview": "foo\\foo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/cr.default.csv",
"chars": 24,
"preview": "foo\\rfoo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/cr.unescape.csv",
"chars": 25,
"preview": "\"foo\rfoo\",bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/newline.default.csv",
"chars": 24,
"preview": "foo\\nfoo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/newline.unescape.csv",
"chars": 25,
"preview": "\"foo\nfoo\",bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/one.csv",
"chars": 19,
"preview": "foo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/tab.default.csv",
"chars": 24,
"preview": "foo\\tfoo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/expected.output/tab.unescape.csv",
"chars": 23,
"preview": "foo\tfoo,bar\r\nbaz,quux\r\n"
},
{
"path": "src/tab-to-csv/test/input/backslash.tab",
"chars": 22,
"preview": "foo\\\\foo\tbar\nbaz\tquux\n"
},
{
"path": "src/tab-to-csv/test/input/cr.tab",
"chars": 22,
"preview": "foo\\rfoo\tbar\nbaz\tquux\n"
},
{
"path": "src/tab-to-csv/test/input/newline.tab",
"chars": 22,
"preview": "foo\\nfoo\tbar\nbaz\tquux\n"
},
{
"path": "src/tab-to-csv/test/input/one.tab",
"chars": 17,
"preview": "foo\tbar\nbaz\tquux\n"
},
{
"path": "src/tab-to-csv/test/input/tab.tab",
"chars": 22,
"preview": "foo\\tfoo\tbar\nbaz\tquux\n"
},
{
"path": "src/utf8-script/Makefile",
"chars": 1424,
"preview": "MAKEFLAGS += --warn-undefined-variables\nSHELL := bash\n.SHELLFLAGS := -eu -o pipefail -c\n.DEFAULT_GOAL := all\n.DELETE_ON_"
},
{
"path": "src/utf8-script/README.md",
"chars": 3334,
"preview": "# Overview\n\nThis utility takes UTF-8 encoded input and tallies the characters\nby Unicode script. For example:\n\n $ ec"
},
{
"path": "src/utf8-script/Scripts.txt",
"chars": 189586,
"preview": "# Scripts-16.0.0.txt\n# Date: 2024-04-30, 21:48:40 GMT\n# © 2024 Unicode®, Inc.\n# Unicode and the Unicode Logo are registe"
},
{
"path": "src/utf8-script/UnicodeData.txt",
"chars": 2175362,
"preview": "0000;<control>;Cc;0;BN;;;;;N;NULL;;;;\n0001;<control>;Cc;0;BN;;;;;N;START OF HEADING;;;;\n0002;<control>;Cc;0;BN;;;;;N;STA"
},
{
"path": "src/utf8-script/generate_category.rb",
"chars": 7173,
"preview": "#!/usr/bin/env ruby\n\nrequire 'erb'\nrequire 'pp'\n\nINDENT = ' '.freeze\n\nLONG_CATEGORIES = {\n 'Lu' => 'Uppercase_Letter',"
},
{
"path": "src/utf8-script/generate_script.rb",
"chars": 5601,
"preview": "#!/usr/bin/env ruby\n\nrequire 'erb'\nrequire 'pp'\n\nEncoding.default_external = Encoding::UTF_8\nEncoding.default_internal ="
},
{
"path": "src/utf8-script/test/utf8-category/expected.output/one.txt",
"chars": 16,
"preview": "1\tCc\n14\tLl\n3\tZs\n"
},
{
"path": "src/utf8-script/test/utf8-category/input/one.txt",
"chars": 18,
"preview": "αλφα βετα foo bar\n"
},
{
"path": "src/utf8-script/test/utf8-script/expected.output/one.txt",
"chars": 25,
"preview": "4\tCommon\n8\tGreek\n6\tLatin\n"
},
{
"path": "src/utf8-script/test/utf8-script/input/one.txt",
"chars": 18,
"preview": "αλφα βετα foo bar\n"
},
{
"path": "src/utf8-script/utf8_category.c",
"chars": 451874,
"preview": "#include <getopt.h>\n#include <locale.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <wchar.h>\n#"
},
{
"path": "src/utf8-script/utf8_category.c.erb",
"chars": 1937,
"preview": "#include <getopt.h>\n#include <locale.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <wchar.h>\n#"
},
{
"path": "src/utf8-script/utf8_script.c",
"chars": 261987,
"preview": "#include <getopt.h>\n#include <locale.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <wchar.h>\n#"
},
{
"path": "src/utf8-script/utf8_script.c.erb",
"chars": 1617,
"preview": "#include <getopt.h>\n#include <locale.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <wchar.h>\n#"
},
{
"path": "test/check_tsv/input.bad.tsv",
"chars": 0,
"preview": ""
},
{
"path": "test/check_tsv/input.good.tsv",
"chars": 18,
"preview": "one\ttwo\nthree\tfour"
},
{
"path": "test/check_yaml/bad.yaml",
"chars": 25,
"preview": "foo: \"this is open\nbar: 3"
},
{
"path": "test/check_yaml/good.yaml",
"chars": 46,
"preview": "foo:\n - one\n - two\n - three\n - four\nbar: 3"
},
{
"path": "test/convert_date/input.txt",
"chars": 7,
"preview": "100000\n"
},
{
"path": "test/counting_sort/input.txt",
"chars": 89,
"preview": "one\none\ntwo\nthree\nfour\nfive\nfive\none\ntwo\nthree\none\nsix\nsix\none\none\none\none\ntwo\ntwo\nthree\n"
},
{
"path": "test/csv_files/no-header.csv",
"chars": 41,
"preview": "1,2,3,4,hello\n5,6,7,8,nil\n9,10,11,12,foo\n"
},
{
"path": "test/csv_files/no-quote.csv",
"chars": 78,
"preview": "a,b,c,d,message\n1,2,3,4,'hello world'\n5,6,7,8,'you don''t say'\n9,10,11,12,bar\n"
},
{
"path": "test/csv_files/quoted-chars.csv",
"chars": 65,
"preview": "1,2,3,\"foo\nbar\"\n4,5,6,baz\n7,8,9,\"one\ttwo\"\n10,11,12,\"three \n four\""
},
{
"path": "test/csv_files/single-quote.csv",
"chars": 79,
"preview": "a,b,c,d,message\n1,2,3,4,'hello, world'\n5,6,7,8,'you don''t say'\n9,10,11,12,bar\n"
},
{
"path": "test/csv_files/unequal-rows.csv",
"chars": 82,
"preview": "1,2,3,4,hello\n5,6,7,8,nil\n5,6,7,8,nil\n5,6,7,8,nil\n5,6,7,8,nil\n5,6,7,8,nil\n9,10,11\n"
},
{
"path": "test/csv_files/unicode.csv",
"chars": 26,
"preview": "αλφα\r\nβητα\r\nγαμμα\r\nδελτα\r\n"
},
{
"path": "test/csv_to_json/test.csv",
"chars": 80,
"preview": "a,b,c,d,message\n1,2,3,4,\"hello, world\"\n5,6,7,8,\"he said \"\"yes\"\"\"\n9,10,11,12,foo\n"
},
{
"path": "test/csv_to_postgres/customers.csv",
"chars": 63,
"preview": "John Smith,123,\"Jamestown, VA\"\nMary Smith,456,\"Millerstown, PA\""
},
{
"path": "test/csv_to_postgres/customers.sql",
"chars": 64,
"preview": "create table customers ( name text, id integer, address text );\n"
},
{
"path": "test/csv_to_tab/expected.escape.tab",
"chars": 23,
"preview": "one\ttwo\\ttwo\nthree\tfour"
},
{
"path": "test/csv_to_tab/expected.strip.tab",
"chars": 21,
"preview": "one\ttwotwo\nthree\tfour"
},
{
"path": "test/csv_to_tab/expected.tab",
"chars": 18,
"preview": "one\ttwo\nthree\tfour"
},
{
"path": "test/csv_to_tab/expected.unicode.tab",
"chars": 17,
"preview": "λ\ttwo\nthree\tfour\n"
},
{
"path": "test/date_fill/expected.output.tsv",
"chars": 233,
"preview": "2015-07-29T00\t6\n2015-07-29T01\t49\n2015-07-29T02\t0\n2015-07-29T03\t45\n2015-07-29T04\t56\n2015-07-29T05\t55\n2015-07-29T06\t59\n201"
},
{
"path": "test/date_fill/input.tsv",
"chars": 152,
"preview": "2015-07-29T00\t6\n2015-07-29T01\t49\n2015-07-29T03\t45\n2015-07-29T04\t56\n2015-07-29T05\t55\n2015-07-29T06\t59\n2015-07-29T08\t135\n2"
},
{
"path": "test/highlight/expected.output.txt",
"chars": 563,
"preview": "0000;<\u001b[01;31mcontrol\u001b[m>;Cc;0;BN;;;;;N;NULL;;;;\n0001;<\u001b[01;31mcontrol\u001b[m>;Cc;0;BN;;;;;N;START OF HEADING;;;;\n0002;<\u001b[01"
},
{
"path": "test/highlight/input.txt",
"chars": 453,
"preview": "0000;<control>;Cc;0;BN;;;;;N;NULL;;;;\n0001;<control>;Cc;0;BN;;;;;N;START OF HEADING;;;;\n0002;<control>;Cc;0;BN;;;;;N;STA"
},
{
"path": "test/html_table_to_csv/expected.test.csv",
"chars": 14,
"preview": "foo,bar\r\n1,2\r\n"
},
{
"path": "test/html_table_to_csv/test.html",
"chars": 74,
"preview": "<html><body><table><tr><th>foo<th>bar<tr><td>1<td>2</table></body></html>\n"
},
{
"path": "test/join_tsv/expected.output.NULL_VALUE.tsv",
"chars": 70,
"preview": "url\ttitle\tscore\nhttp://google.com\tGoogle\t33\nhttp://yahoo.com\tYahoo\t77\n"
},
{
"path": "test/join_tsv/expected.output.diff.tsv",
"chars": 71,
"preview": "url1\ttitle\tscore\nhttp://google.com\tGoogle\t33\nhttp://yahoo.com\tYahoo\t77\n"
},
{
"path": "test/join_tsv/expected.output.left.tsv",
"chars": 68,
"preview": "url\ttitle\tscore\nhttp://google.com\tGoogle\t33\nhttp://yahoo.com\tYahoo\t\n"
},
{
"path": "test/join_tsv/expected.output.left2.tsv",
"chars": 44,
"preview": "url\tscore\ttitle\nhttp://google.com\t33\tGoogle\n"
},
{
"path": "test/join_tsv/expected.output.right.tsv",
"chars": 68,
"preview": "url\tscore\ttitle\nhttp://google.com\t33\tGoogle\nhttp://yahoo.com\t\tYahoo\n"
},
{
"path": "test/join_tsv/expected.output.tsv",
"chars": 70,
"preview": "url\ttitle\tscore\nhttp://google.com\tGoogle\t33\nhttp://yahoo.com\tYahoo\t77\n"
},
{
"path": "test/join_tsv/input1.NULL_VALUE.tsv",
"chars": 72,
"preview": "url\ttitle\nNULL\tnull url\nhttp://google.com\tGoogle\nhttp://yahoo.com\tYahoo\n"
},
{
"path": "test/join_tsv/input1.diff.tsv",
"chars": 59,
"preview": "url1\ttitle\nhttp://google.com\tGoogle\nhttp://yahoo.com\tYahoo\n"
},
{
"path": "test/join_tsv/input1.left.tsv",
"chars": 58,
"preview": "url\ttitle\nhttp://google.com\tGoogle\nhttp://yahoo.com\tYahoo\n"
},
{
"path": "test/join_tsv/input1.null.tsv",
"chars": 68,
"preview": "url\ttitle\n\tNULL URL\nhttp://google.com\tGoogle\nhttp://yahoo.com\tYahoo\n"
},
{
"path": "test/join_tsv/input1.tsv",
"chars": 58,
"preview": "url\ttitle\nhttp://google.com\tGoogle\nhttp://yahoo.com\tYahoo\n"
},
{
"path": "test/join_tsv/input2.NULL_VALUE.tsv",
"chars": 58,
"preview": "url\tscore\nhttp://google.com\t33\nhttp://yahoo.com\t77\nNULL\t0\n"
},
{
"path": "test/join_tsv/input2.diff.tsv",
"chars": 52,
"preview": "url2\tscore\nhttp://google.com\t33\nhttp://yahoo.com\t77\n"
},
{
"path": "test/join_tsv/input2.left.tsv",
"chars": 31,
"preview": "url\tscore\nhttp://google.com\t33\n"
},
{
"path": "test/join_tsv/input2.null.tsv",
"chars": 54,
"preview": "url\tscore\nhttp://google.com\t33\nhttp://yahoo.com\t77\n\t0\n"
},
{
"path": "test/join_tsv/input2.tsv",
"chars": 51,
"preview": "url\tscore\nhttp://google.com\t33\nhttp://yahoo.com\t77\n"
},
{
"path": "test/json_diff/1a.json",
"chars": 21,
"preview": "{\"foo\": 1, \"bar\": 2}\n"
},
{
"path": "test/json_diff/1b.json",
"chars": 21,
"preview": "{\"bar\": 2, \"foo\": 1}\n"
},
{
"path": "test/json_diff/2a.json",
"chars": 21,
"preview": "{\"foo\": 1, \"bar\": 2}\n"
},
{
"path": "test/json_diff/2b.json",
"chars": 11,
"preview": "{\"foo\": 1}\n"
},
{
"path": "test/json_diff/expected.output1.txt",
"chars": 0,
"preview": ""
},
{
"path": "test/json_diff/expected.output2.txt",
"chars": 20,
"preview": "2d1\n< \"bar\": 2,\n"
},
{
"path": "test/normalize_utf8/expected.output.nfc.txt",
"chars": 9,
"preview": "français\n"
}
]
// ... and 19 more files (truncated in this extraction)
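Notes on the test fixtures

The csv_to_tab fixtures above (expected.escape.tab, expected.strip.tab) imply how a tab embedded in a CSV field is treated when converting to tab-delimited output: in escape mode it becomes the two characters backslash-t, and in strip mode it is deleted. The actual tool is the C program src/csv-to-tab/csv_to_tab.c; the following is only a minimal Python sketch of the semantics the fixtures imply, with the mode names taken from the test file names (a replace mode also appears under src/csv-to-tab/test) and everything else an assumption:

import csv
import sys

# Escape the backslash first so escapes introduced below
# are not themselves re-escaped.
ESCAPES = [("\\", "\\\\"), ("\t", "\\t"), ("\n", "\\n"), ("\r", "\\r")]

def sanitize(field, mode):
    """Make a CSV field safe to emit in a tab-delimited row."""
    if mode == "escape":
        for ch, esc in ESCAPES:
            field = field.replace(ch, esc)
    elif mode == "strip":          # delete the offending characters
        for ch in "\t\n\r":
            field = field.replace(ch, "")
    elif mode == "replace":        # substitute a space for each
        for ch in "\t\n\r":
            field = field.replace(ch, " ")
    return field

def csv_to_tab(stream, mode="escape"):
    # csv.reader handles quoted fields with embedded newlines and commas.
    for row in csv.reader(stream):
        print("\t".join(sanitize(field, mode) for field in row))

if __name__ == "__main__":
    csv_to_tab(sys.stdin, sys.argv[1] if len(sys.argv) > 1 else "escape")

On a row whose second field contains a literal tab, escape mode yields the line in expected.escape.tab (one, tab, two backslash-t two) and strip mode yields the line in expected.strip.tab.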
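The date_fill fixtures show the hour 2015-07-29T02 missing from input.tsv but present with a count of 0 in expected.output.tsv, so data_tools/date_fill.py evidently inserts rows for timestamps missing from the sequence. A sketch of that behavior, assuming hourly granularity and a hard-coded %Y-%m-%dT%H format; the real script presumably takes these as options:

import sys
from datetime import datetime, timedelta

def date_fill(stream, fmt="%Y-%m-%dT%H", step=timedelta(hours=1), fill="0"):
    """Copy timestamp<TAB>value rows to stdout, inserting a
    timestamp<TAB>fill row for every step missing from the sequence."""
    prev = None
    for line in stream:
        stamp, value = line.rstrip("\n").split("\t")
        current = datetime.strptime(stamp, fmt)
        if prev is not None:
            expected = prev + step
            while expected < current:      # emit the gap rows
                print(f"{expected.strftime(fmt)}\t{fill}")
                expected += step
        print(f"{stamp}\t{value}")
        prev = current

if __name__ == "__main__":
    date_fill(sys.stdin)

Fed input.tsv above (T01 followed by T03), this emits 2015-07-29T02 with a 0, matching the visible portion of expected.output.tsv.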
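The join_tsv fixtures describe a relational join of two TSV files on a shared column: expected.output.tsv is an inner join of input1.tsv and input2.tsv on url, and expected.output.left.tsv keeps http://yahoo.com with an empty score when the right side has no match. A minimal sketch of that core behavior; the actual data_tools/join_tsv.py also handles null keys, configurable null values, and differently named key columns (per the null, NULL_VALUE, and diff fixtures), none of which is modeled here:

import csv
import sys

def read_tsv(path):
    with open(path, newline="") as f:
        rows = list(csv.reader(f, delimiter="\t"))
    return rows[0], rows[1:]          # header, data rows

def join_tsv(left_path, right_path, key, kind="inner"):
    lhdr, lrows = read_tsv(left_path)
    rhdr, rrows = read_tsv(right_path)
    lk, rk = lhdr.index(key), rhdr.index(key)
    # Index the right table by its key column.
    by_key = {row[rk]: row for row in rrows}
    out = [lhdr + [c for i, c in enumerate(rhdr) if i != rk]]
    for row in lrows:
        match = by_key.get(row[lk])
        if match is not None:
            extra = [c for i, c in enumerate(match) if i != rk]
        elif kind == "left":          # left join: pad with empty fields
            extra = [""] * (len(rhdr) - 1)
        else:                         # inner join: skip unmatched rows
            continue
        out.append(row + extra)
    csv.writer(sys.stdout, delimiter="\t", lineterminator="\n").writerows(out)

if __name__ == "__main__":
    join_tsv(sys.argv[1], sys.argv[2], sys.argv[3],
             sys.argv[4] if len(sys.argv) > 4 else "inner")

Run with kind "left" on input1.left.tsv and input2.left.tsv, this reproduces expected.output.left.tsv: the yahoo row survives with an empty score column.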
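The json_diff fixtures pin down two behaviors: 1a.json and 1b.json differ only in key order and produce an empty diff (expected.output1.txt is empty), while 2b.json, which drops the "bar" key, produces classic diff(1) output (2d1 followed by < "bar": 2,). Both are consistent with canonicalizing each document (sorted keys, one key per line) and then diffing the resulting text. A sketch under that assumption; whether data_tools/json-diff actually works this way is not shown in these previews:

import json
import os
import subprocess
import sys
import tempfile

def canonical(path):
    """Load a JSON file and re-serialize it with sorted keys, one key
    per line, so key order alone cannot produce a spurious diff."""
    with open(path) as f:
        obj = json.load(f)
    return json.dumps(obj, sort_keys=True, indent=0) + "\n"

def json_diff(path_a, path_b):
    """Write the canonical forms to temp files and diff(1) them."""
    names = []
    for path in (path_a, path_b):
        with tempfile.NamedTemporaryFile("w", suffix=".json",
                                         delete=False) as tmp:
            tmp.write(canonical(path))
            names.append(tmp.name)
    try:
        result = subprocess.run(["diff"] + names,
                                capture_output=True, text=True)
        sys.stdout.write(result.stdout)
        return result.returncode      # 0: identical, 1: different
    finally:
        for name in names:
            os.unlink(name)

if __name__ == "__main__":
    sys.exit(json_diff(sys.argv[1], sys.argv[2]))

On 1a.json versus 1b.json this prints nothing and exits 0; on 2a.json versus 2b.json it should print exactly the two lines previewed in expected.output2.txt.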