Repository: harelba/q
Branch: master
Commit: 03e8b3950557
Files: 56
Total size: 643.0 KB

Directory structure:
gitextract_x4ti_kab/

├── .github/
│   ├── FUNDING.yml
│   └── workflows/
│       └── build-and-package.yaml
├── .gitignore
├── LICENSE
├── QSQL-NOTES.md
├── README.markdown
├── benchmark-config.sh
├── bin/
│   ├── .qrc
│   ├── __init__.py
│   ├── q.bat
│   └── q.py
├── conftest.py
├── dist/
│   ├── fpm-config
│   ├── test-rpm-inside-container.sh
│   ├── test-using-deb.sh
│   └── test-using-rpm.sh
├── doc/
│   ├── AUTHORS
│   ├── IMPLEMENTATION.markdown
│   ├── LICENSE
│   ├── RATIONALE.markdown
│   ├── THANKS
│   └── USAGE.markdown
├── examples/
│   ├── EXAMPLES.markdown
│   ├── exampledatafile
│   └── group-emails-example
├── mkdocs/
│   ├── README.md
│   ├── docs/
│   │   ├── about.md
│   │   ├── fsg9b9b1.txt
│   │   ├── google0efeb4ff0a886e81.html
│   │   ├── index.md
│   │   ├── index_cn.md
│   │   ├── js/
│   │   │   └── google-analytics.js
│   │   └── stylesheets/
│   │       └── extra.css
│   ├── generate-web-site.sh
│   ├── mkdocs.yml
│   ├── requirements.txt
│   └── theme/
│       └── main.html
├── prepare-benchmark-env
├── pyoxidizer.bzl
├── pytest.ini
├── requirements.txt
├── run-benchmark
├── run-coverage.sh
├── run-tests.sh
├── setup.py
├── test/
│   ├── BENCHMARK.md
│   ├── __init__.py
│   ├── benchmark-results/
│   │   └── source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/
│   │       └── 2020-09-17-v2.0.17/
│   │           ├── octosql_v0.3.0.benchmark-results
│   │           ├── q-benchmark-2.7.18.benchmark-results
│   │           ├── q-benchmark-3.6.4.benchmark-results
│   │           ├── q-benchmark-3.7.9.benchmark-results
│   │           ├── q-benchmark-3.8.5.benchmark-results
│   │           ├── summary.benchmark-results
│   │           └── textql_2.0.3.benchmark-results
│   └── test_suite.py
└── test-requirements.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/FUNDING.yml
================================================
# Supported funding model platforms for this project

github: "harelba"


================================================
FILE: .github/workflows/build-and-package.yaml
================================================
# Workflow that builds, tests and packages q, then publishes the artifacts.
name: BuildAndPackage

on:
  # Pushes to master and pushes of version tags (v*) start the workflow.
  push:
    branches:
      - master
    tags:
      - 'v*'
  # Pull requests against master start it too, unless they only touch docs.
  pull_request:
    branches:
      - master
    paths-ignore:
      - '*.md'
      - '*.markdown'
      - 'mkdocs/**/*'
    # NOTE(review): tag filters are documented for push events only - confirm
    # whether this entry has any effect on pull_request.
    tags-ignore:
      - '*'

jobs:
  # Computes the q version and the release flag; other jobs read them through
  # `needs.version_info.outputs.q_version` / `.is_release`.
  version_info:
    # NOTE(review): GitHub appears to have retired the ubuntu-18.04 hosted
    # image - confirm and bump the runner label across this workflow if so.
    runs-on: ubuntu-18.04
    outputs:
      q_version: ${{ steps.vars.outputs.q_version }}
      is_release: ${{ steps.vars.outputs.is_release }}
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - id: vars
        run: |
          set -x -e

          echo "github event ref is ${{ github.ref }}"

          # Step outputs are appended to the $GITHUB_OUTPUT environment file;
          # the older `::set-output` workflow command is deprecated.
          if [ "x${{ startsWith(github.ref, 'refs/tags/v') }}" == "xtrue" ]
          then
            echo "Trigger was a version tag - ${{ github.ref }}"
            # Strip the refs/tags/v prefix so only the bare version remains
            echo "q_version=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
            echo "is_release=true" >> "$GITHUB_OUTPUT"
          else
            # For testing version propagation inside the PR
            echo "Either branch or a non-version tag - setting version to 0.0.0"
            echo "q_version=0.0.0" >> "$GITHUB_OUTPUT"
            echo "is_release=false" >> "$GITHUB_OUTPUT"
          fi

  # Debugging aid: prints everything the version_info job exposed.
  check_version_info:
    needs: version_info
    runs-on: ubuntu-18.04
    steps:
      - name: test q_version
        run: |
          set -e -x

          echo "outputs: ${{ toJson(needs.version_info) }}"

  # Renders doc/USAGE.markdown with ronn, gzips the result and publishes it
  # as the `q-man-page` artifact for the packaging jobs.
  create-man:
    runs-on: ubuntu-18.04
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install Ruby
      uses: ruby/setup-ruby@v1
      with:
        ruby-version: "2.6"
    - name: Create man page
      run: |
        set -x -e
        gem install ronn

        ronn doc/USAGE.markdown
        # Must be gzipped, otherwise debian does not install it
        gzip doc/USAGE
    - name: Upload man page
      uses: actions/upload-artifact@v1.0.0
      with:
        path: doc/USAGE.gz
        name: q-man-page

  # Builds the standalone Linux q executable with PyOxidizer, smoke-tests it
  # and publishes it as the `linux-q` artifact.
  build-linux:
    runs-on: ubuntu-18.04
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Cache pyox
      uses: actions/cache@v2
      with:
        key: ${{ runner.os }}-pyox
        path: |
          ~/.cache/pyoxidizer
    - name: Install pyoxidizer
      run: |
        set -e -x

        sudo apt-get update
        sudo apt-get install -y zip sqlite3 rpm

        curl -o pyoxidizer.zip -L "https://github.com/indygreg/PyOxidizer/releases/download/pyoxidizer%2F0.17/pyoxidizer-0.17.0-linux_x86_64.zip"
        unzip pyoxidizer.zip
        chmod +x ./pyoxidizer
    - name: Create Q Executable - Linux
      # The `seq | q` pipeline below is a quick sanity run of the fresh binary.
      run: |
        set -e -x

        ./pyoxidizer build --release

        export Q_EXECUTABLE=./build/x86_64-unknown-linux-gnu/release/install/q
        chmod 755 $Q_EXECUTABLE

        seq 1 100 | $Q_EXECUTABLE -c 1 "select sum(c1),count(*) from -" -S test.sqlite

        mkdir -p packages/linux/
        cp $Q_EXECUTABLE packages/linux/linux-q
    - name: Upload Linux Executable
      uses: actions/upload-artifact@v1.0.0
      with:
        path: packages/linux/linux-q
        name: linux-q

  # Runs the test suite against the executable produced by build-linux.
  test-linux:
    runs-on: ubuntu-18.04
    needs: build-linux
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install Python for Testing
      uses: actions/setup-python@v2
      with:
        architecture: "x64"
        python-version: "3.8.12"
    - name: Prepare Testing
      run: |
        set -e -x

        pip3 install -r test-requirements.txt
    - name: Download Linux Executable
      uses: actions/download-artifact@v2
      with:
        name: linux-q
    - name: Run Tests on Linux Executable
      # Q_EXECUTABLE points run-tests.sh at the downloaded binary.
      run: |
        set -x -e

        find ./ -ls

        chmod 755 ./linux-q

        Q_EXECUTABLE=`pwd`/linux-q Q_SKIP_EXECUTABLE_VALIDATION=true ./run-tests.sh -v

  # Wraps the tested Linux executable and the gzipped man page into a .deb
  # using fpm, then uploads the package as an artifact named after the file.
  package-linux-deb:
    needs: [test-linux, create-man, version_info]
    runs-on: ubuntu-18.04
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install Ruby
      uses: ruby/setup-ruby@v1
      with:
        ruby-version: '2.6'
    - name: Download man page
      uses: actions/download-artifact@v2
      with:
        name: q-man-page
    - name: Download Linux Executable
      uses: actions/download-artifact@v2
      with:
        name: linux-q
    - name: Build DEB Package
      run: |
        set -e -x

        mkdir -p packages/linux/

        find ./ -ls

        chmod 755 ./linux-q

        export q_version=${{ needs.version_info.outputs.q_version }}

        gem install fpm
        cp dist/fpm-config ~/.fpm
        fpm -s dir -t deb --deb-use-file-permissions -p packages/linux/q-text-as-data-${q_version}-1.x86_64.deb --version ${q_version} ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
    - name: Upload DEB Package
      uses: actions/upload-artifact@v1.0.0
      with:
        name: q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb
        path: packages/linux/q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb

  # Downloads the .deb built by package-linux-deb and hands it to
  # dist/test-using-deb.sh.
  test-deb-packaging:
    needs: [package-linux-deb, version_info]
    runs-on: ubuntu-18.04
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Download DEB
      uses: actions/download-artifact@v2
      with:
        name: q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb
    - name: Install Python for Testing
      uses: actions/setup-python@v2
      with:
        architecture: "x64"
        python-version: "3.8.12"
    - name: Prepare Testing
      run: |
        set -e -x

        pip3 install -r test-requirements.txt
    - name: Test DEB Package Installation
      run: ./dist/test-using-deb.sh ./q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb

  # Wraps the tested Linux executable and the gzipped man page into an .rpm
  # using fpm, then uploads the package as an artifact named after the file.
  package-linux-rpm:
    runs-on: ubuntu-18.04
    needs: [test-linux, create-man, version_info]
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install Ruby
      uses: ruby/setup-ruby@v1
      with:
        ruby-version: "2.6"
    - name: Download man page
      uses: actions/download-artifact@v2
      with:
        name: q-man-page
    - name: Download Linux Executable
      uses: actions/download-artifact@v2
      with:
        name: linux-q
    - name: Build RPM Package
      run: |
        set -e -x

        mkdir -p packages/linux


        chmod 755 ./linux-q

        export q_version=${{ needs.version_info.outputs.q_version }}

        gem install fpm
        cp dist/fpm-config ~/.fpm
        fpm -s dir -t rpm --rpm-use-file-permissions -p packages/linux/q-text-as-data-${q_version}.x86_64.rpm --version ${q_version} ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
    - name: Upload RPM Package
      uses: actions/upload-artifact@v1.0.0
      with:
        path: packages/linux/q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm
        name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm

  # Downloads the .rpm built by package-linux-rpm and hands it to
  # dist/test-using-rpm.sh.
  test-rpm-packaging:
    needs: [package-linux-rpm, version_info]
    runs-on: ubuntu-18.04
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Download RPM
      uses: actions/download-artifact@v2
      with:
        name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm
    - name: Retest using RPM
      run: ./dist/test-using-rpm.sh ./q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm

  # Builds the standalone macOS q executable with PyOxidizer, smoke-tests it
  # and publishes it as the `macos-q` artifact.
  build-mac:
    runs-on: macos-11
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Cache pyox
      uses: actions/cache@v2
      with:
        key: ${{ runner.os }}-pyox
        path: |
          ~/.cache/pyoxidizer
    - name: Install pyoxidizer
      # The binary is moved out of the archive's macos-universal/ directory.
      run: |
        set -e -x

        curl -o  pyoxidizer.zip -L "https://github.com/indygreg/PyOxidizer/releases/download/pyoxidizer%2F0.17/pyoxidizer-0.17.0-macos-universal.zip"
        unzip pyoxidizer.zip
        mv macos-universal/pyoxidizer ./pyoxidizer

        chmod +x ./pyoxidizer
    - name: Create Q Executable - Mac
      run: |
        set -e -x

        ./pyoxidizer build --release

        export Q_EXECUTABLE=./build/x86_64-apple-darwin/release/install/q
        chmod 755 $Q_EXECUTABLE

        seq 1 100 | $Q_EXECUTABLE -c 1 "select sum(c1),count(*) from -" -S test.sqlite

        mkdir -p packages/macos/
        cp $Q_EXECUTABLE packages/macos/macos-q
    - name: Upload MacOS Executable
      uses: actions/upload-artifact@v1.0.0
      with:
        path: packages/macos/macos-q
        name: macos-q

  # Runs the test suite against the executable produced by build-mac.
  test-mac:
    runs-on: macos-11
    needs: build-mac
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install Python for Testing
      uses: actions/setup-python@v2
      with:
        architecture: "x64"
        python-version: "3.8.12"
    - name: Prepare Testing
      run: |
        set -e -x

        pip3 install wheel

        pip3 install -r test-requirements.txt
    - name: Download MacOS Executable
      uses: actions/download-artifact@v2
      with:
        name: macos-q
    - name: Run Tests on MacOS Executable
      # Q_EXECUTABLE points run-tests.sh at the downloaded binary.
      run: |
        set -e -x

        chmod 755 ./macos-q

        Q_EXECUTABLE=`pwd`/macos-q Q_SKIP_EXECUTABLE_VALIDATION=true ./run-tests.sh -v

  # Placeholder job: it only prints why no macOS package is produced here.
  not-package-mac:
    runs-on: macos-11
    # create-man is not a dependency: the brew formula generates the man page on its own
    needs: [test-mac]
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Not Packaging Mac
      run: |
        echo "homebrew mac cannot be packaged from the source code itself, due to the package build process of homebrew. See https://github.com/harelba/homebrew-q"

  # Placeholder job: it only prints why macOS packaging is not tested here.
  not-test-mac-packaging:
    runs-on: macos-11
    needs: not-package-mac
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Not Testing Mac Packaging
      run: |
        echo "homebrew mac packaging cannot be tested here, due to the package build process of homebrew. See https://github.com/harelba/homebrew-q"

  # Builds the standalone Windows q executable with PyOxidizer (passing the
  # computed version as Q_VERSION), smoke-tests it and publishes it as the
  # `win-q.exe` artifact.
  build-windows:
    runs-on: windows-latest
    needs: version_info
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install MSVC build tools
      uses: ilammy/msvc-dev-cmd@v1
    - name: Install Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.8.10'
        architecture: 'x64'
    - name: Install pyoxidizer
      shell: bash
      run: |
        set -x -e

        python3 -V
        pip3 -V

        pip3 install pyoxidizer
    - name: Create Q Executable - Windows
      shell: bash
      run: |
        set -e -x

        pyoxidizer build --release --var Q_VERSION ${{ needs.version_info.outputs.q_version }}

        export Q_EXECUTABLE=./build/x86_64-pc-windows-msvc/release/install/q
        chmod 755 $Q_EXECUTABLE

        seq 1 100 | $Q_EXECUTABLE -c 1 "select sum(c1),count(*) from -" -S test.sqlite

        mkdir -p packages/windows/
        cp $Q_EXECUTABLE packages/windows/win-q.exe

        find ./ -ls
    # This step uploads the Windows binary, so it is named accordingly.
    - name: Upload Windows Executable
      uses: actions/upload-artifact@v1.0.0
      with:
        name: win-q.exe
        path: packages/windows/win-q.exe

  # Sanity run of the Windows executable only; the step is allowed to fail
  # (continue-on-error) without failing the job.
  not-really-test-windows:
    runs-on: windows-latest
    needs: build-windows
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install Python for Testing
      uses: actions/setup-python@v2
      with:
        architecture: "x64"
        python-version: "3.8"
    - name: Download Windows Executable
      uses: actions/download-artifact@v2
      with:
        name: win-q.exe
    - name: Not-Really-Test Windows
      continue-on-error: true
      shell: bash
      run: |
        echo "Tests are not compatible with Windows (path separators, tmp folder names etc.). Only a sanity wil be tested"

        chmod +x ./win-q.exe

        seq 1 10000 | ./win-q.exe -c 1 "select sum(c1),count(*) from -" -S some-db.sqlite

  # Builds the Windows MSI through PyOxidizer's msi_installer target and
  # uploads it as an artifact named after the file.
  package-windows:
    runs-on: windows-latest
    needs: [create-man, not-really-test-windows, version_info]
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Install MSVC build tools
      uses: ilammy/msvc-dev-cmd@v1
    - name: Install Python
      uses: actions/setup-python@v2
      with:
        architecture: "x64"
        python-version: "3.8.10"
    - name: Install pyoxidizer
      shell: bash
      run: |
        set -x -e

        python3 -V
        pip3 -V

        pip3 install pyoxidizer
    - name: Create Q MSI - Windows
      shell: bash
      run: |
        set -e -x

        pyoxidizer build --release msi_installer --var Q_VERSION ${{ needs.version_info.outputs.q_version }}

        export Q_MSI=./build/x86_64-pc-windows-msvc/release/msi_installer/q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
        chmod 755 $Q_MSI

        mkdir -p packages/windows/
        cp $Q_MSI packages/windows/q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi

    - name: Upload Windows MSI
      uses: actions/upload-artifact@v1.0.0
      with:
        path: packages/windows/q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
        name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi

  # Installs and then uninstalls the MSI produced by package-windows. Both
  # steps are best-effort (continue-on-error), so a failure does not fail
  # the job.
  test-windows-packaging:
    needs: [package-windows, version_info]
    runs-on: windows-latest
    steps:
    - name: Checkout
      uses: actions/checkout@v2
    - name: Download Windows Package
      uses: actions/download-artifact@v2
      with:
        name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
    - name: Test Install of MSI
      continue-on-error: true
      shell: powershell
      run: |
        $process = Start-Process msiexec.exe -ArgumentList "/i q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi -l* msi-install.log /norestart /quiet" -PassThru -Wait
        $process.ExitCode
        gc msi-install.log

        exit $process.ExitCode
    - name: Test Uninstall of MSI
      continue-on-error: true
      shell: powershell
      # msiexec uninstalls with /x; the previous /u is not the uninstall switch.
      run: |
        $process = Start-Process msiexec.exe -ArgumentList "/x q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi /norestart /quiet" -PassThru -Wait
        $process.ExitCode
        exit $process.ExitCode

  # Publishes every artifact as the rolling "latest" pre-release whenever the
  # run was not triggered by a version tag.
  perform-prerelease:
    if: needs.version_info.outputs.is_release == 'false'
    runs-on: ubuntu-latest
    # Depends on the package-X jobs rather than the test-X-packaging jobs, so
    # artifacts get uploaded whether or not the packaging tests succeeded.
    needs: [package-linux-deb, package-linux-rpm, not-package-mac, package-windows, version_info]
    steps:
    - name: Download All Artifacts
      uses: actions/download-artifact@v2
      with:
        path: artifacts/
    - name: Timestamp pre-release
      run: |
        set -e -x

        echo "Workflow finished at $(date)" >> artifacts/workflow-finish-time.txt
    - name: Create pre-release
      uses: marvinpinto/action-automatic-releases@v1.2.1
      with:
        repo_token: "${{ secrets.GITHUB_TOKEN }}"
        title: "Next Release Development Build"
        automatic_release_tag: "latest"
        prerelease: true
        files: |
          artifacts/**/*

  # Publishes every artifact as a full release; runs only for version tags
  # and only after all packaging test jobs have finished.
  perform-release:
    if: needs.version_info.outputs.is_release == 'true'
    runs-on: ubuntu-latest
    needs: [not-test-mac-packaging, test-deb-packaging, test-rpm-packaging, test-windows-packaging, version_info]
    steps:
    - name: Download All Artifacts
      uses: actions/download-artifact@v2
      with:
        path: artifacts/
    - uses: marvinpinto/action-automatic-releases@v1.2.1
      with:
        repo_token: "${{ secrets.GITHUB_TOKEN }}"
        prerelease: false
        files: |
          artifacts/**/*


================================================
FILE: .gitignore
================================================
# Paths and patterns that git should leave untracked.
# NOTE(review): entries look like build/packaging output, test and coverage
# by-products, benchmark data and editor/OS files - confirm before pruning.
build
q.spec
q.1
*.pyc
.vagrant
rpm_build_area
*.deb
setup.exe
win_output
win_build
packages
.idea/
dist/windows/
generated-site/
benchmark_data.tar.gz
_benchmark_data/
q.egg-info/
.pytest_cache/
*.qsql
htmlcov/
*.sqlite
*.tar.gz
.coverage
.DS_Store
*.egg


================================================
FILE: LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    {one line to give the program's name and a brief idea of what it does.}
    Copyright (C) {year}  {name of author}

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    {project}  Copyright (C) {year}  {fullname}
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.


================================================
FILE: QSQL-NOTES.md
================================================

## Major changes and additions in the new `3.x` version
This is the list of new/changed functionality in this version. Large changes, please make sure to read the details if you're already using q.

* **Automatic Immutable Caching** - Automatic caching of data files (into `<my-csv-filename>.qsql` files), with huge speedups for medium/large files. Enabled through `-C readwrite` or `-C read`
* **Direct querying of standard sqlite databases** - Just use it as a table name in the query. Format is `select ... from <sqlitedb_filename>:::<table_name>`, or just `<sqlitedb_filename>` if the database contains only one table. Multiple separate sqlite databases are fully supported in the same query.
* **Direct querying of the `qsql` cache files** - The user can query directly from the `qsql` files, removing the need for the original files. Just use `select ... from <my-csv-filename>.qsql`. Please wait until the non-beta version is out before thinking about deleting any of your original files...
* **Revamped `.qrc` mechanism** - allows opting-in to caching without specifying it in every query. By default, caching is **disabled**, for backward compatibility and for finding usability issues.
* **Save-to-db is now reusable for queries** - `--save-db-to-disk` option (`-S`) has been enhanced to match the new capabilities. You can query the resulting file directly through q, using the method mentioned above (it's just a standard sqlite database).
* **Only python3 is supported from now on** - Shouldn't be an issue, since q is a self-contained binary executable which has its own python embedded in it. Internally, q is now packaged with Python 3.8. After everything cools down, I'll probably bump this to 3.9/3.10.
* **Minimal Linux Version Bumped** - Works with CentOS 8, Ubuntu 18.04+, Debian 10+. Currently only for x86_64. Depends on glibc version 2.25+. Haven't tested it on other architectures. Releasing builds for other architectures will be possible later on.
* **Completely revamped binary packaging** - Using [pyoxidizer](https://github.com/indygreg/PyOxidizer)

The following sections provide the details of each of the new functionalities in this major version.

## Automatic caching of data files
Speeding up subsequent reads from the same file by several orders of magnitude by automatically creating an immutable cache file for each tabular text file.  

For example, reading a 0.9GB file with 1M rows and 100 columns without caching takes ~50 seconds. When the cache exists, querying the same file will take around ~1-2 seconds. Obviously, the cache can be used in order to perform any query and not just the original query that was used for creating the cache.

When caching is enabled, the cache is created on the first read of a file, and used automatically when reading it in other queries. A separate cache is being created for each file that is being used, allowing reuse in multiple use-cases. For example, if two csv files each have their own cache file from previous queries, then running a query that JOINs these two files would use the caches as well (without loading the data into memory), speeding it up considerably.

The tradeoff for using cache files is disk space - A new file with the postfix `.qsql` is created and automatically detected and used in queries as needed. This file is essentially a standard sqlite file (with some additional metadata tables), and can be used directly by any standard sqlite tool later on.

For backward compatibility, the caching option is not turned on by default. You'd need to use the new `-C <mode>` to determine the caching mode. Available options are as follows:
* `none` - The default,  provides the original q's behaviour without caching
* `read` - Only reads cache files if they exist, but doesn't create any new ones
* `readwrite` - Uses cache files if they exist, or creates new ones if they don't. Writing new cache files doesn't interfere with the actual run of the query, so this option can be used in order to dynamically create the cache files if they don't exist

Content signatures are stored in the caches, allowing q to detect a state where the original file has been modified after the cache has been created. q will issue an error if this happens. For now, just delete the `.qsql` file in order to recreate the cache. In the future, another `-C` option would be added to automatically recreate the updated cache in such a case. Notice that the content signature contains various q flags which affect parsing, so make sure to use the same parameters to q when performing the queries, otherwise q will issue an error.

Notice that when running with `-A`, the cache is not written, even when `-C` is set to `readwrite`. This is due to the fact that `-A` does not really read the entire content of the files. For now, if you'd like to just prepare the cache without running the actual query, you can run it with a `select 1` query or something, although in terms of speed it will mostly not matter. If there's demand for adding an explicit `prepare caches only` option, I'll consider adding it.

## Revamped `.qrc` mechanism
Adding `-C <mode>` for each query can be cumbersome at some point, so the `.qrc` file has been revamped for easy addition of default parameters. 

For example, if you want the caching behaviour to be `read` all the time, then just add a `~/.qrc` file, and set the following in it:
```
[options]
caching_mode=read
```

All other flags and parameters to q can be controlled by the `.qrc` file. To see the proper names for each parameter, run `q --dump-defaults` and it will dump a default `.qrc` file that contains all parameters to `stdout`.

## Direct querying of standard sqlite databases
q now supports direct querying of standard sqlite databases. The syntax for accessing a table inside an sqlite database is `<sqlite-filename>:::<table_name>`. A query can contain any mix of sqlite files, qsql files or regular delimited files.

For example, this command joins two tables from two separate sqlite databases:
```
$ q "select count(*) from mydatabase1.sqlite:::mytable1 a left join mydatabase2.sqlite:::mytable2 b on (a.c1 = b.c1)"
```

Running queries on sqlite databases does not usually entail loading the data into memory. Databases are attached to a virtual database and queried directly from disk. This means that querying speed is practically identical to standard sqlite access. This is also true when multiple sqlite databases are used in a single query. The same mechanism is being used by q whenever it uses a qsql file (either directly or as a cache of a delimited file).

sqlite itself does have a pre-compiled limit of the number of databases that can be attached simultaneously. If this limit is reached, then q will attach as many databases as possible, and then continue processing by loading additional tables into memory in order to execute the query. The standard limit in sqlite3 (unless compiled specifically with another limit) is 10 databases. This allows q to access as many as 8 user databases without having to load any data into memory (2 databases are always used for q's internal logic). Using more databases in a single query than this pre-compiled sqlite limit would slow things down, since some of the data would go into memory, but the query should still provide correct results.

Whenever the sqlite database file contains only one table, the table name part can be omitted, and the user can specify only the sqlite-filename as the table name. For example, querying an sqlite database `mydatabase.sqlite` that only has one table `mytable` is possible with `q "SELECT ... FROM mydatabase.sqlite"`. There's no need to specify the table name in this case.

Since `.qsql` files are also standard sqlite files, they can be queried directly as well. This allows the user to actually delete the original CSV file and use the caches as if they were the original files. For example:

```
$ q "select count(*) from myfile.csv.qsql"
```

Notice that there's no need to write the `:::<table-name>` as part of the table name, since `qsql` files that are created as caches contain only one table (e.g. the table matching the original file).

Running a query that uses an sqlite/qsql database without specifying a table name will fail if there is more than one table in the database, showing the list of existing tables. This can be used in order to detect which tables exist in the database without resorting to other tools. For example:
```
$ q "select * from chinook.db:::blah"
Table blah could not be found in sqlite file chinook.db . Existing table names: albums,sqlite_sequence,artists,customers,employees,genres,invoices,invoice_items,media_types,playlists,playlist_track,tracks,sqlite_stat1
```

## Storing source data into a disk database
The `-S` option (`--save-db-to-disk`) has been modified to match the new capabilities. It works with all types of input tables/files, and writes the output database as a standard sqlite database. I've considered making the output a multi-table `qsql` file (e.g. with the additional metadata that q uses), but some things still need to be ironed out in order to make these qsql files work seamlessly with all other aspects of q. This will probably happen in the next version.  

This database can be accessed directly by q later on, by providing `<sqlite-database>:::<table-name>` as the table name in the query. The table names that are chosen match the original file names, but go through the following process:
* The names are normalised in order to be compatible with sqlite restrictions (e.g. `x.csv` is normalised to `x_dot_csv`)
* Duplicate table names are de-duplicated by adding `_<sequence-number>` to their names (e.g. two different csv files in separate folders which both have the name `companies` will be written to the file as `companies` and `companies_2`)

This table-name normalisation happens also inside `.qsql` cache files, but in most cases there won't be any need to know these table names, since q automatically detects table names for databases which have a single-table.

## File-concatenation and wildcard-matching features - Breaking change
File concatenation using '+' has been removed in this version, which is a breaking change.

This was a controversial feature anyway, and can be done using standard SQL relatively easily. It also complicated the caching implementation significantly, and it seemed that it was not worth it. If there's demand for bringing this feature back, please write to me and I'll consider re-adding it. 

If you have a case of using file concatenation, you can use the following SQL instead:
```
# Instead of writing
$ q "select * from myfile1+myfile2"
# Use the following:
$ q "select * from (select * from myfile1 UNION ALL select * from myfile2)"
```

This will provide the same results, but the error checking is a bit less robust, so be mindful on whether you're performing the right query on the right files.

Conceptually, this is similar to wildcard matching (e.g. `select * from myfolder/myfile*`), but I have decided to leave wildcard-matching intact, since it seems to be a more common use-case. Cache creation and use is limited for now when using wildcards. Use the same method as described above for file concatenation if you wanna make sure that caches are being used.

After this version is fully stabilised, I'll make more efforts to consolidate wildcard (and perhaps concatenation) to fully utilise caching seamlessly.

## Code runs only on python 3
Removed the dual py2/py3 support. Since q is packaged as a self-contained executable, along with python 3.8 itself, then this is not needed anymore.

Users who for some reason still use q's main source code file directly and use python 2 would need to stay with the 2.0.19 release, which is the last one supporting python 2. In some next version, q's code structure is going to change significantly anyway in order to become a standard python module, so using the main source code file directly would not be possible.

If you are such a user, and this decision hurts you considerably, please ping me.


================================================
FILE: README.markdown
================================================
[![Build and Package](https://github.com/harelba/q/workflows/BuildAndPackage/badge.svg?branch=master)](https://github.com/harelba/q/actions?query=branch%3Amaster)

# q - Text as Data
q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.

q allows the following:

* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file. 
* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory

The following table shows the impact of using caching:

|    Rows   | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement |
|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:|
| 5,000,000 |   100   |   4.8GB   |    4 minutes, 47 seconds   |       1.92 seconds      |        x149       |
| 1,000,000 |   100   |   983MB   |        50.9 seconds        |      0.461 seconds      |        x110       |
| 1,000,000 |    50   |   477MB   |        27.1 seconds        |      0.272 seconds      |        x99        |
|  100,000  |   100   |    99MB   |         5.2 seconds        |      0.141 seconds      |        x36        |
|  100,000  |    50   |    48MB   |         2.7 seconds        |      0.105 seconds      |        x25        |

Notice that for the current version, caching is **not enabled** by default, since the caches take disk space. Use `-C readwrite` or `-C read` to enable it for a query, or add `caching_mode` to `.qrc` to set a new default.
 
q's web site is [https://harelba.github.io/q/](https://harelba.github.io/q/) or [https://q.textasdata.wiki](https://q.textasdata.wiki). It contains everything you need to download and use q immediately.


## Usage Examples
q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and provides full support for multiple character encodings.

Here are some example commands to get the idea:

```bash
$ q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3"

$ ps -ef | q -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"

$ q "select count(*) from some_db.sqlite3:::albums a left join another_db.sqlite3:::tracks t on (a.album_id = t.album_id)"
```

Detailed examples are [here](https://harelba.github.io/q/#examples).

## Installation
**New Major Version `3.1.6` is out with a lot of significant additions.**

Instructions for all OSs are [here](https://harelba.github.io/q/#installation).

The previous version `2.0.19` can still be downloaded from [here](https://github.com/harelba/q/releases/tag/2.0.19).

## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.

Linkedin: [Harel Ben Attia](https://www.linkedin.com/in/harelba/)

Twitter: [@harelba](https://twitter.com/harelba)

Email: [harelba@gmail.com](mailto:harelba@gmail.com)

q on twitter: [#qtextasdata](https://twitter.com/hashtag/qtextasdata?src=hashtag_click)

Patreon: [harelba](https://www.patreon.com/harelba) - All the money received is donated to the [Center for the Prevention and Treatment of Domestic Violence](https://www.gov.il/he/departments/bureaus/molsa-almab-ramla) in my hometown - Ramla, Israel.


================================================
FILE: benchmark-config.sh
================================================
#!/bin/bash

# Bash array of python versions.
# NOTE(review): presumably read by the benchmark scripts after sourcing this file - confirm against callers.
BENCHMARK_PYTHON_VERSIONS=(3.8.5)


================================================
FILE: bin/.qrc
================================================
#
# q options ini file. Put either in your home folder as .qrc or in the working directory 
#   (both will be merged in that order)
#
# All options should reside in an [options] section
#
# Available options:
# * delimiter - escaped string (e.g. use \t for tab or \x20 for space)
# * output_delimiter - escaped string (e.g. use \t for tab or \x20 for space)
# * gzipped - boolean True or False
# * beautify - boolean True or False
# * skip_header - boolean True or False
# * formatting - regular string - post-query formatting - see docs for details
# * encoding - regular string - required encoding.
#
# All options have a matching command line option. See --help for details on defaults

[options]
#delimiter: \t
#output_delimiter: \t
#gzipped: False
#beautify: True
#skip_header: False
#formatting: 1=%4.3f,2=%4.3f
#encoding: UTF-8


================================================
FILE: bin/__init__.py
================================================
#!/usr/bin/env python


================================================
FILE: bin/q.bat
================================================
@echo off

rem Launcher for q on Windows - forwards all command line arguments to the q file located in the same folder as this script.
setlocal
rem Prefer the python.exe located one folder above this script if it exists, otherwise use the python found on the PATH.
if exist "%~dp0..\python.exe" ( "%~dp0..\python" "%~dp0q" %* ) else ( python "%~dp0q" %* )
endlocal


================================================
FILE: bin/q.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#   Copyright (C) 2012-2021 Harel Ben-Attia
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 3, or (at your option)
#   any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details (doc/LICENSE contains
#   a copy of it)
#
#
# Name      : q (With respect to The Q Continuum)
# Author    : Harel Ben-Attia - harelba@gmail.com, harelba @ github, @harelba on twitter
#
#
# q allows performing SQL-like statements on tabular text data.
#
# Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
#
# Full Documentation and details in https://harelba.github.io/q/
#
# Run with --help for command line details
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import OrderedDict
from sqlite3.dbapi2 import OperationalError
from uuid import uuid4

q_version = '3.1.6'

#__all__ = [ 'QTextAsData' ]

import os
import sys
import sqlite3
import glob
from argparse import ArgumentParser
import codecs
import locale
import time
import re
from six.moves import configparser, range, filter
import traceback
import csv
import uuid
import math
import six
import io
import json
import datetime
import hashlib

# q runs on python 3 only - fail fast if it is somehow executed under python 2
if six.PY2:
    assert False, 'Python 2 is no longer supported by q'

# python 2 type names which are still referenced further down in this file
long = int
unicode = six.text_type

# Debug output is enabled by a non-empty Q_DEBUG environment variable or by -V on the command line
DEBUG = bool(os.environ.get('Q_DEBUG', None)) or '-V' in sys.argv
# Set to True in order to make sqlprint() write its arguments to stderr
SQL_DEBUG = False

def _make_stderr_printer(level_tag):
    """Build a print-like function that prefixes a UTC timestamp and level_tag and writes to stderr."""
    def _print_to_stderr(*args,**kwargs):
        print(datetime.datetime.utcnow().isoformat(),level_tag,*args,file=sys.stderr,**kwargs)
    return _print_to_stderr

def _print_nothing(*args,**kwargs):
    """Silent stand-in used when the matching debug flag is off."""
    pass

# xprint and iprint are active only in DEBUG mode, sqlprint only when SQL_DEBUG is set
xprint = _make_stderr_printer(" DEBUG ") if DEBUG else _print_nothing
iprint = _make_stderr_printer(" INFO ") if DEBUG else _print_nothing
sqlprint = _make_stderr_printer(" SQL ") if SQL_DEBUG else _print_nothing


def get_stdout_encoding(encoding_override=None):
    """Return the encoding to use for standard output.

    An explicit encoding_override wins, unless it is None or the literal
    string 'none'. Otherwise the encoding of sys.stdout is used when it is
    a tty, and the locale's preferred encoding when it is not.
    """
    if encoding_override is None or encoding_override == 'none':
        return sys.stdout.encoding if sys.stdout.isatty() else locale.getpreferredencoding()
    return encoding_override

SHOW_SQL = False

# Maps the algorithm number accepted by the sha() user function to the matching hashlib constructor
sha_algorithms = {
    1 : hashlib.sha1,
    224: hashlib.sha224,
    256: hashlib.sha256,
    384: hashlib.sha384,
    # 386 was a typo for 384 - kept as an alias so that existing queries keep working
    386: hashlib.sha384,
    512: hashlib.sha512
}

def sha(data,algorithm,encoding):
    """Return the hex digest of the text form of data, hashed with the requested sha variant.

    algorithm is one of the keys of sha_algorithms, and encoding is the codec
    used to turn the text into bytes before hashing.

    Any failure (unknown algorithm, unknown encoding, un-encodable text) is
    printed and None is returned instead of raising.
    """
    try:
        f = sha_algorithms[algorithm]
        # q runs on python 3 only, so the text type is always str
        return f(str(data).encode(encoding)).hexdigest()
    except Exception as e:
        print(e)

# For backward compatibility only (doesn't handle encoding well enough)
def sha1(data):
    """Return the hex sha1 digest of the utf-8 encoded text form of data."""
    utf8_bytes = str(data).encode('utf-8')
    digest = hashlib.sha1(utf8_bytes)
    return digest.hexdigest()

# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
def regexp(regular_expression, data):
    """Tell whether regular_expression is found anywhere in data.

    Non-string values are converted to their string form before searching.
    A None value never matches.
    """
    if data is None:
        return False
    text = data if isinstance(data, str) else str(data)
    return re.search(regular_expression, text) is not None

def regexp_extract(regular_expression, data,group_number):
    """Return the content of one capture group of the first match of regular_expression in data.

    group_number is a zero-based index into the capture groups of the match
    (index 0 is the first parenthesised group, not the whole match).

    Returns None when data is None or when there is no match, which matches
    the documented `<substring|null>` result. Raises IndexError when
    group_number is not a valid capture group index for a matching expression.
    """
    if data is None:
        # Previously returned False here, which ended up as 0 instead of NULL in query results
        return None
    if not isinstance(data, str):
        data = str(data)
    m = re.search(regular_expression, data)
    if m is None:
        return None
    return m.groups()[group_number]

def md5(data,encoding):
    """Return the hex md5 digest of the text form of data, encoded using encoding."""
    encoded_bytes = str(data).encode(encoding)
    return hashlib.md5(encoded_bytes).hexdigest()

def sqrt(data):
    """Return the square root of data, as computed by math.sqrt."""
    root = math.sqrt(data)
    return root

def power(data,p):
    """Return data raised to the power of p."""
    return pow(data, p)

def file_ext(data):
    """Return the extension of the filename in data (including the leading dot), or None for None."""
    if data is not None:
        _, extension = os.path.splitext(data)
        return extension
    return None

def file_folder(data):
    """Return the folder part of the path in data, or None for None."""
    return None if data is None else os.path.dirname(data)

def file_basename(data):
    """Return the last component of the path in data (extension included), or None for None."""
    return None if data is None else os.path.basename(data)
    
def file_basename_no_ext(data):
    """Return the last component of the path in data with its extension removed, or None for None."""
    if data is not None:
        path_without_ext, _ = os.path.splitext(data)
        return os.path.basename(path_without_ext)
    return None

def percentile(l, p):
    """Return the p-th percentile (p as a fraction) of the already sorted list l.

    The result is linearly interpolated between the two neighbouring items
    when the requested position falls between them. Returns None for an
    empty list.
    """
    # TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
    if not l:
        return None
    position = p*(len(l) - 1)
    lower = math.floor(position)
    upper = math.ceil(position)
    if upper == lower:
        # The position is exactly on an item, so no interpolation is needed
        return l[int(position)]
    lower_weight = upper - position
    upper_weight = position - lower
    return lower_weight * l[int(lower)] + upper_weight * l[int(upper)]

# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
class StrictPercentile(object):
    """Aggregate that collects all values of a group and returns their percentile.

    The percentile fraction is taken from the first step() call of the group.
    """
    def __init__(self):
        self.values = []
        self.p = None

    def step(self,value,p):
        if self.p is None:
            self.p = p
        # Ignore nulls, like the stddev aggregates do. Sorting a list which mixes None
        # with numbers would raise a TypeError in finalize()
        if value is None:
            return
        self.values.append(value)

    def finalize(self):
        # No usable values, or a missing/out of range percentile, yield a null result
        if not self.values or self.p is None or self.p < 0 or self.p > 1:
            return None
        return percentile(sorted(self.values),self.p)

class StdevPopulation(object):
    """Aggregate calculating the population standard deviation incrementally.

    M holds the running mean, S the running sum of squared differences from
    the mean, and k the number of values seen so far.
    """
    def __init__(self):
        self.M = 0.0
        self.S = 0.0
        self.k = 0

    def step(self, value):
        try:
            # Ignore nulls
            if value is None:
                return
            val = float(value) # raises ValueError for non-numeric strings, handled below
            tM = self.M
            self.k += 1
            self.M += ((val - tM) / self.k)
            self.S += ((val - tM) * (val - self.M))
        except ValueError:
            # TODO propagate udf errors to console
            raise Exception("Data is not numeric when calculating stddev (%s)" % value)

    def finalize(self):
        # The population variant divides by k, so only an empty group needs guarding.
        # A single value has a well defined population standard deviation of 0.0
        if self.k < 1:
            return None
        else:
            return math.sqrt(self.S / (self.k))

class StdevSample(object):
    """Aggregate calculating the sample standard deviation incrementally.

    M holds the running mean, S the running sum of squared differences from
    the mean, and k the number of values seen so far.
    """
    def __init__(self):
        self.M, self.S, self.k = 0.0, 0.0, 0

    def step(self, value):
        # Nulls do not take part in the calculation
        if value is None:
            return
        try:
            numeric_value = float(value)
        except ValueError:
            # TODO propagate udf errors to console
            raise Exception("Data is not numeric when calculating stddev (%s)" % value)
        previous_mean = self.M
        self.k += 1
        self.M += ((numeric_value - previous_mean) / self.k)
        self.S += ((numeric_value - previous_mean) * (numeric_value - self.M))

    def finalize(self):
        # The sample variant divides by k-1, so at least two values are required
        if self.k > 1:
            return math.sqrt(self.S / (self.k-1))
        return None

class FunctionType(object):
    """Kinds of user defined functions: REGULAR for plain functions, AGG for aggregate classes."""
    REGULAR, AGG = 1, 2

class UserFunctionDef(object):
    """Description of one user defined function: its type, name, help texts, implementation and arity."""
    def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
        # Identification and help texts
        self.func_type, self.name = func_type, name
        self.usage, self.description = usage, description
        # The implementation (a function or an aggregate class) and its number of parameters
        self.func_or_obj, self.param_count = func_or_obj, param_count

# All the user defined functions which q adds to sqlite. The arguments of a query-level call
# are passed positionally to the implementation, so each usage text must list the parameters
# in the same order as the implementation's signature.
user_functions = [
    UserFunctionDef(FunctionType.REGULAR,
                    "regexp","regexp(<regular_expression>,<expr>) = <1|0>",
                    "Find regexp in string expression. Returns 1 if found or 0 if not",
                    regexp,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "regexp_extract","regexp_extract(<regular_expression>,<expr>,group_number) = <substring|null>",
                    "Get regexp capture group content",
                    regexp_extract,
                    3),
    # The implementation's signature is sha(data,algorithm,encoding), so the algorithm comes before the encoding
    UserFunctionDef(FunctionType.REGULAR,
                    "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
                    "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
                    sha,
                    3),
    UserFunctionDef(FunctionType.REGULAR,
                    "sha1","sha1(<expr>) = <hex-string-of-sha>",
                    "Exists for backward compatibility only, since it doesn't handle encoding properly. Calculates sha1 of some expression",
                    sha1,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
                    "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
                    md5,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "sqrt","sqrt(<expr>) = <square-root>",
                    "Calculate the square root of the expression",
                    sqrt,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "power","power(<expr1>,<expr2>) = <expr1-to-the-power-of-expr2>",
                    "Raise expr1 to the power of expr2",
                    power,
                    2),
    UserFunctionDef(FunctionType.REGULAR,
                    "file_ext","file_ext(<expr>) = <filename-extension-or-empty-string>",
                    "Get the extension of a filename",
                    file_ext,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "file_folder","file_folder(<expr>) = <folder-name-of-filename>",
                    "Get the folder part of a filename",
                    file_folder,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "file_basename","file_basename(<expr>) = <basename-of-filename-including-extension>",
                    "Get the basename of a filename, including extension if any",
                    file_basename,
                    1),
    UserFunctionDef(FunctionType.REGULAR,
                    "file_basename_no_ext","file_basename_no_ext(<expr>) = <basename-of-filename-without-extension>",
                    "Get the basename of a filename, without the extension if there is one",
                    file_basename_no_ext,
                    1),
    UserFunctionDef(FunctionType.AGG,
                    "percentile","percentile(<expr>,<percentile-in-the-range-0-to-1>) = <percentile-value>",
                    "Calculate the strict percentile of a set of values.",
                    StrictPercentile,
                    2),
    UserFunctionDef(FunctionType.AGG,
                    "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
                    "Calculate the population standard deviation of a set of values",
                    StdevPopulation,
                    1),
    UserFunctionDef(FunctionType.AGG,
                    "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
                    "Calculate the sample standard deviation of a set of values",
                    StdevSample,
                    1)
]

def print_user_functions():
    """Print the name, usage and description of every entry of user_functions to standard output."""
    for udf in user_functions:
        help_lines = [
            "Function: %s" % udf.name,
            "     Usage: %s" % udf.usage,
            "     Description: %s" % udf.description,
        ]
        print("\n".join(help_lines))

class Sqlite3DBResults(object):
    """Holds the rows returned by a query along with the names of the result columns."""
    def __init__(self,query_column_names,results):
        self.query_column_names, self.results = query_column_names, results

    def __str__(self):
        result_count = len(self.results)
        column_names_str = str(self.query_column_names)
        return "Sqlite3DBResults<result_count=%d,query_column_names=%s>" % (result_count,column_names_str)
    __repr__ = __str__

def get_sqlite_type_affinity(sqlite_type):
    """Return the affinity name (INTEGER, TEXT, BLOB, REAL or NUMERIC) for a declared column type.

    The declared type is matched case-insensitively against substrings, and
    the first matching rule wins. NUMERIC is the fallback.
    """
    declared_type = sqlite_type.upper()
    # The order of the rules matters - e.g. 'FLOATING POINT' contains 'INT' and hence maps to INTEGER
    affinity_rules = (
        ('INTEGER', ('INT',)),
        ('TEXT', ('CHAR', 'TEXT', 'CLOB')),
        ('BLOB', ('BLOB',)),
        ('REAL', ('REAL', 'FLOA', 'DOUB')),
    )
    for affinity, markers in affinity_rules:
        if any(marker in declared_type for marker in markers):
            return affinity
    return 'NUMERIC'

def sqlite_type_to_python_type(sqlite_type):
    """Return the python type which matches the affinity of the declared sqlite column type."""
    python_type_by_affinity = dict(
        INTEGER=long,
        TEXT=unicode,
        BLOB=bytes,
        REAL=float,
        NUMERIC=float,
    )
    affinity = get_sqlite_type_affinity(sqlite_type)
    return python_type_by_affinity[affinity]


class Sqlite3DB(object):
    """Wrapper around one sqlite3 connection and a single cursor which is shared by all queries.

    On creation it registers all of q's user defined functions on the
    connection, and optionally creates the `_qcatalog` table. Each row of
    that table links a table name to its content signature, creation time,
    source type and source.
    """
    # TODO Add metadata table with qsql file version

    QCATALOG_TABLE_NAME = '_qcatalog'
    NUMERIC_COLUMN_TYPES =  {int, long, float}
    PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', long : 'INT' , float: 'REAL', None: 'TEXT' }


    def __str__(self):
        return "Sqlite3DB<url=%s>" % self.sqlite_db_url
    __repr__ = __str__

    def __init__(self, db_id, sqlite_db_url, sqlite_db_filename, create_qcatalog, show_sql=SHOW_SQL):
        self.show_sql = show_sql
        self.create_qcatalog = create_qcatalog

        self.db_id = db_id
        # TODO Is this needed anymore?
        self.sqlite_db_filename = sqlite_db_filename
        self.sqlite_db_url = sqlite_db_url
        # sqlite_db_url is opened as a URI
        self.conn = sqlite3.connect(self.sqlite_db_url, uri=True)
        # Counter used for generating unique temp_table_<n> names
        self.last_temp_table_id = 10000
        self.cursor = self.conn.cursor()
        self.add_user_functions()

        if create_qcatalog:
            self.create_qcatalog_table()
        else:
            xprint('Not creating qcatalog for db_id %s' % db_id)

    def retrieve_all_table_names(self):
        """Return the names of all the tables which exist in this database."""
        return [x[0] for x in self.execute_and_fetch("select tbl_name from sqlite_master where type='table'").results]

    def get_sqlite_table_info(self,table_name):
        """Return the rows of `PRAGMA table_info` for table_name."""
        return self.execute_and_fetch('PRAGMA table_info(%s)' % table_name).results

    def get_sqlite_database_list(self):
        """Return the rows of `pragma database_list` for this connection."""
        return self.execute_and_fetch('pragma database_list').results

    def find_new_table_name(self,planned_table_name):
        """Return planned_table_name, or the first of planned_table_name_2 .. _999 which is not taken yet.

        Raises an Exception when all the candidates already exist.
        """
        # A set makes each of the (up to 999) membership checks cheap
        existing_table_names = set(self.retrieve_all_table_names())

        for index in range(1,1000):
            # The first candidate is the planned name itself, without any suffix
            suffix = '' if index == 1 else '_%s' % index

            table_name_attempt = '%s%s' % (planned_table_name,suffix)

            if table_name_attempt not in existing_table_names:
                xprint("Found free table name %s in db %s for planned table name %s" % (table_name_attempt,self.db_id,planned_table_name))
                return table_name_attempt

        # TODO Add test for this
        raise Exception('Cannot find free table name in db %s for planned table name %s' % (self.db_id,planned_table_name))

    def create_qcatalog_table(self):
        """Create the qcatalog table, unless it already exists."""
        if not self.qcatalog_table_exists():
            xprint("qcatalog table does not exist. Creating it")
            self.conn.execute("""CREATE TABLE %s ( 
                               qcatalog_entry_id text not null primary key,
                               content_signature_key text,
                               temp_table_name text,
                               content_signature text,
                               creation_time text,
                               source_type text,
                               source text)""" % self.QCATALOG_TABLE_NAME).fetchall()
        else:
            xprint("qcatalog table already exists. No need to create it")

    def qcatalog_table_exists(self):
        return sqlite_table_exists(self.conn,self.QCATALOG_TABLE_NAME)

    def calculate_content_signature_key(self,content_signature):
        """Return the sha1 hex digest of the json form (with sorted keys) of content_signature.

        content_signature must be an OrderedDict.
        """
        assert type(content_signature) == OrderedDict
        pp = json.dumps(content_signature,sort_keys=True)
        xprint("Calculating content signature for:",pp,six.b(pp))
        return hashlib.sha1(six.b(pp)).hexdigest()

    def add_to_qcatalog_table(self, temp_table_name, content_signature, creation_time,source_type, source):
        """Insert a new qcatalog entry for temp_table_name, and commit it."""
        assert source is not None
        assert source_type is not None
        content_signature_key = self.calculate_content_signature_key(content_signature)
        xprint("db_id: %s Adding to qcatalog table: %s. Calculated signature key %s" % (self.db_id, temp_table_name,content_signature_key))
        self.execute_and_fetch(
            'INSERT INTO %s (qcatalog_entry_id,content_signature_key, temp_table_name,content_signature,creation_time,source_type,source) VALUES (?,?,?,?,?,?,?)' % self.QCATALOG_TABLE_NAME,
                              (str(uuid4()),content_signature_key,temp_table_name,json.dumps(content_signature),creation_time,source_type,source))
        # Ensure transaction is completed
        self.conn.commit()

    def get_from_qcatalog(self, content_signature):
        """Return the qcatalog entry whose key matches content_signature as a dict, or None if there is none.

        Raises an Exception when more than one entry matches.
        """
        content_signature_key = self.calculate_content_signature_key(content_signature)
        xprint("Finding table in db_id %s that matches content signature key %s" % (self.db_id,content_signature_key))

        field_names = ["content_signature_key", "temp_table_name", "content_signature", "creation_time","source_type","source","qcatalog_entry_id"]

        q = "SELECT %s FROM %s where content_signature_key = ?" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
        # execute_and_fetch() either returns a results object or raises, so there is no None case
        r = self.execute_and_fetch(q,(content_signature_key,))

        if len(r.results) == 0:
            return None

        if len(r.results) > 1:
            raise Exception("Bug - Exactly one result should have been provided: %s" % str(r.results))

        return dict(zip(field_names,r.results[0]))

    def get_from_qcatalog_using_table_name(self, temp_table_name):
        """Return the qcatalog entry of temp_table_name as a dict, or None if there is none.

        Raises an Exception when more than one entry matches, or when the
        stored content signature key does not match the key calculated from
        the stored content signature.
        """
        xprint("getting from qcatalog using table name")

        field_names = ["content_signature", "temp_table_name","creation_time","source_type","source","content_signature_key","qcatalog_entry_id"]

        q = "SELECT %s FROM %s where temp_table_name = ?" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
        xprint("Query from qcatalog %s params %s" % (q,temp_table_name))
        r = self.execute_and_fetch(q,(temp_table_name,))
        xprint("results: ",r.results)

        if len(r.results) == 0:
            return None

        if len(r.results) > 1:
            raise Exception("Bug - Exactly one result should have been provided: %s" % str(r.results))

        d = dict(zip(field_names,r.results[0]))
        # content_signature should be the first in the list of field_names
        cs = OrderedDict(json.loads(r.results[0][0]))
        if self.calculate_content_signature_key(cs) != d['content_signature_key']:
            raise Exception('Table contains an invalid entry - content signature key is not matching the actual content signature')
        return d

    def get_all_from_qcatalog(self):
        """Return all the qcatalog entries as a list of dicts.

        The content_signature_key of each entry is calculated from its stored content signature.
        """
        xprint("getting all entries from qcatalog")

        field_names = ["temp_table_name", "content_signature", "creation_time","source_type","source","qcatalog_entry_id"]

        q = "SELECT %s FROM %s" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
        xprint("Query from qcatalog %s" % q)
        r = self.execute_and_fetch(q)

        def convert(res):
            d = dict(zip(field_names, res))
            # content_signature is the second field in field_names
            cs = OrderedDict(json.loads(res[1]))
            d['content_signature_key'] = self.calculate_content_signature_key(cs)
            return d

        return [convert(row) for row in r.results]

    def done(self):
        """Commit and close the connection. Failures are logged in debug mode and re-raised."""
        xprint("Closing database %s" % self.db_id)
        try:
            self.conn.commit()
            self.conn.close()
            xprint("Database %s closed" % self.db_id)
        except Exception:
            xprint("Could not close database %s" % self.db_id)
            raise

    def add_user_functions(self):
        """Register every entry of user_functions on the connection.

        Classes are registered as aggregates, and any other callable as a regular function.
        """
        for udf in user_functions:
            # The class check must come first, since classes are callable as well
            if isinstance(udf.func_or_obj, type):
                self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj)
            elif callable(udf.func_or_obj):
                self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj)
            else:
                raise Exception("Invalid user function definition %s" % str(udf))

    def is_numeric_type(self, column_type):
        return column_type in Sqlite3DB.NUMERIC_COLUMN_TYPES

    def update_many(self, sql, params):
        """Run sql once for each parameter set in params, using the shared cursor."""
        sqlprint(sql, " params: " + str(params))
        self.cursor.executemany(sql, params)
        self.cursor.fetchall()

    def execute_and_fetch(self, q,params = None):
        """Run the query q (with optional params) and return all its results as a Sqlite3DBResults.

        sqlite's OperationalError is converted to SqliteOperationalErrorException,
        which also carries the query and its params in the message.
        """
        if self.show_sql:
            print(repr(q))
        try:
            if params is None:
                self.cursor.execute(q)
            else:
                self.cursor.execute(q,params)
            # description is None for statements which do not return rows
            if self.cursor.description is not None:
                query_column_names = [c[0] for c in self.cursor.description]
            else:
                query_column_names = None
            result = self.cursor.fetchall()
        except OperationalError as e:
            raise SqliteOperationalErrorException("Failed executing sqlite query %s with params %s . error: %s" % (q,params,str(e)),e)
        return Sqlite3DBResults(query_column_names,result)

    def _get_as_list_str(self, l):
        """Return the names in l as a comma separated list of double-quoted sql identifiers."""
        return ",".join(['"%s"' % x.replace('"', '""') for x in l])

    def generate_insert_row(self, table_name, column_names):
        """Return a parameterized INSERT statement for the given table and columns."""
        col_names_str = self._get_as_list_str(column_names)
        question_marks = ", ".join(["?"] * len(column_names))
        return 'INSERT INTO %s (%s) VALUES (%s)' % (table_name, col_names_str, question_marks)

    # column_names is a separate list so that the order of the columns is preserved,
    # while column_dict maps each column name to its python type
    def generate_create_table(self, table_name, column_names, column_dict):
        """Return a CREATE TABLE statement for table_name with the given columns."""
        # Convert dict from python types to db types
        column_name_to_db_type = dict(
            (n, Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES[t]) for n, t in column_dict.items())
        column_defs = ','.join(['"%s" %s' % (
            n.replace('"', '""'), column_name_to_db_type[n]) for n in column_names])
        return 'CREATE TABLE %s (%s)' % (table_name, column_defs)

    def generate_temp_table_name(self):
        """Advance the temp table counter and return the matching temp_table_<n> name."""
        # WTF - From my own past mutable-self
        self.last_temp_table_id += 1
        tn = "temp_table_%s" % self.last_temp_table_id
        return tn

    def generate_drop_table(self, table_name):
        return "DROP TABLE %s" % table_name

    def drop_table(self, table_name):
        return self.execute_and_fetch(self.generate_drop_table(table_name))

    def attach_and_copy_table(self, from_db, relevant_table,stop_after_analysis):
        """Copy relevant_table from from_db into a new temp table of this database and return its name.

        from_db is attached temporarily for the duration of the copy. When
        stop_after_analysis is set, only the first 100 rows are copied.
        """
        xprint("Attaching %s into db %s and copying table %s into it" % (from_db,self,relevant_table))
        temp_db_id = 'temp_db_id'
        q = "attach '%s' as %s" % (from_db.sqlite_db_url,temp_db_id)
        xprint("Attach query: %s" % q)
        self.execute_and_fetch(q)

        detach_query = "detach database %s" % temp_db_id

        # The counter itself is advanced only after the copy has succeeded
        new_temp_table_name = 'temp_table_%s' % (self.last_temp_table_id + 1)
        fully_qualified_table_name = '%s.%s' % (temp_db_id,relevant_table)

        limit = ' limit 100' if stop_after_analysis else ''

        copy_query = 'create table %s as select * from %s %s' % (new_temp_table_name,fully_qualified_table_name,limit)
        try:
            copy_results = self.execute_and_fetch(copy_query)
        except Exception:
            # Do not leave the source db attached, otherwise the next attach using the same
            # alias would fail. This is best-effort, so that the original error is the one
            # which reaches the caller
            xprint("Copy of %s failed. Detaching %s" % (fully_qualified_table_name,temp_db_id))
            try:
                self.execute_and_fetch(detach_query)
            except Exception as detach_error:
                xprint("Could not detach %s after the failed copy: %s" % (temp_db_id,detach_error))
            raise
        xprint("Copied %s.%s into %s in db_id %s. Results %s" % (temp_db_id,relevant_table,new_temp_table_name,self.db_id,copy_results))
        self.last_temp_table_id += 1

        xprint("Copied table into %s. Detaching db that was attached temporarily" % self.db_id)

        xprint("detach query: %s" % detach_query)
        c = self.execute_and_fetch(detach_query)
        xprint(c)
        return new_temp_table_name


class CouldNotConvertStringToNumericValueException(Exception):
    """Error carrying a message in `msg` (presumably about a string that could not be converted to a number)."""

    def __init__(self, msg):
        self.msg = msg

    # Was named `__str`, which python name-mangles into a private method that str() never calls
    def __str__(self):
        return repr(self.msg)

class SqliteOperationalErrorException(Exception):
    """Error carrying a message in `msg` along with the underlying error in `original_error`."""

    def __init__(self, msg,original_error):
        self.msg = msg
        self.original_error = original_error

    # Was named `__str`, which python name-mangles into a private method that str() never calls
    def __str__(self):
        return repr(self.msg) + "//" + repr(self.original_error)

class IncorrectDefaultValueException(Exception):
    """Error carrying the type and name of an option along with the actual value it had."""

    def __init__(self, option_type,option,actual_value):
        self.option_type, self.option, self.actual_value = option_type, option, actual_value

    def __str__(self):
        # The string form is the same as the repr of the exception
        return "%r" % (self,)

class NonExistentTableNameInQsql(Exception):
    """Error carrying a qsql filename, a table name, and the table names which do exist."""

    def __init__(self, qsql_filename,table_name,existing_table_names):
        self.qsql_filename, self.table_name = qsql_filename, table_name
        self.existing_table_names = existing_table_names

class NonExistentTableNameInSqlite(Exception):
    """Error carrying a filename (stored as `qsql_filename`), a table name, and the table names which do exist."""

    def __init__(self, qsql_filename,table_name,existing_table_names):
        self.qsql_filename, self.table_name = qsql_filename, table_name
        self.existing_table_names = existing_table_names

class TooManyTablesInQsqlException(Exception):
    """Error carrying a qsql filename and the table names which exist in it."""

    def __init__(self, qsql_filename,existing_table_names):
        self.qsql_filename, self.existing_table_names = qsql_filename, existing_table_names

class NoTableInQsqlExcption(Exception):
    """Error carrying the name of a qsql file in `qsql_filename`."""

    def __init__(self, qsql_filename):
        self.qsql_filename = qsql_filename

class TooManyTablesInSqliteException(Exception):
    """Error carrying a filename (stored as `qsql_filename`) and the table names which exist in it."""

    def __init__(self, qsql_filename,existing_table_names):
        self.qsql_filename, self.existing_table_names = qsql_filename, existing_table_names

class NoTablesInSqliteException(Exception):
    """Error carrying the name of a sqlite file in `sqlite_filename`."""

    def __init__(self, sqlite_filename):
        self.sqlite_filename = sqlite_filename

class ColumnMaxLengthLimitExceededException(Exception):
    """Error carrying a message in `msg` (presumably about a column exceeding the max length limit)."""

    def __init__(self, msg):
        self.msg = msg

    # Was named `__str`, which python name-mangles into a private method that str() never calls
    def __str__(self):
        return repr(self.msg)

class CouldNotParseInputException(Exception):
    """Error carrying a message in `msg` (presumably about input which could not be parsed)."""

    def __init__(self, msg):
        self.msg = msg

    # Was named `__str`, which python name-mangles into a private method that str() never calls
    def __str__(self):
        return repr(self.msg)

class BadHeaderException(Exception):
    """Error carrying a message in `msg` (presumably about an invalid header row)."""

    def __init__(self, msg):
        self.msg = msg

    # Was named `__str`, which python name-mangles into a private method that str() never calls
    def __str__(self):
        return repr(self.msg)

class EncodedQueryException(Exception):
    """Error related to an encoded query; the description is stored in `msg`."""

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        # Must be spelled __str__ (with trailing underscores): a method named
        # "__str" is name-mangled and never used by str().
        return repr(self.msg)


class CannotUnzipDataStreamException(Exception):
    """Argument-less error type; judging by its name it concerns unzipping a data stream."""

    def __init__(self):
        """Accepts no arguments and stores no state."""

class UniversalNewlinesExistException(Exception):
    """Argument-less error type signalling that universal newlines exist in the input."""

    def __init__(self):
        """Accepts no arguments and stores no state."""

class EmptyDataException(Exception):
    """Argument-less error type; judging by its name it signals empty data."""

    def __init__(self):
        """Accepts no arguments and stores no state."""

class MissingHeaderException(Exception):
    """Error about a missing header; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg

class InvalidQueryException(Exception):
    """Error about an invalid query; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg

class TooManyAttachedDatabasesException(Exception):
    """Error about too many attached databases; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg

class FileNotFoundException(Exception):
    """Error about a file that could not be found; the description is stored in `msg`."""

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        # Must be spelled __str__ (with trailing underscores): a method named
        # "__str" is name-mangled and never used by str().
        return repr(self.msg)

class UnknownFileTypeException(Exception):
    """Error about a file whose type cannot be detected; the description is stored in `msg`."""

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        # Must be spelled __str__ (with trailing underscores): a method named
        # "__str" is name-mangled and never used by str().
        return repr(self.msg)


class ColumnCountMismatchException(Exception):
    """Error about a column count that does not match; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg

class ContentSignatureNotFoundException(Exception):
    """Error about a content signature that was not found; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg

class StrictModeColumnCountMismatchException(Exception):
    """Column count mismatch error for strict mode.

    Stores the atomic filename, the expected and actual column counts and the
    number of lines read.
    """

    def __init__(self, atomic_fn, expected_col_count, actual_col_count, lines_read):
        self.atomic_fn, self.lines_read = atomic_fn, lines_read
        self.expected_col_count, self.actual_col_count = expected_col_count, actual_col_count

class FluffyModeColumnCountMismatchException(Exception):
    """Column count mismatch error for fluffy mode.

    Stores the atomic filename, the expected and actual column counts and the
    number of lines read.
    """

    def __init__(self, atomic_fn, expected_col_count, actual_col_count, lines_read):
        self.atomic_fn, self.lines_read = atomic_fn, lines_read
        self.expected_col_count, self.actual_col_count = expected_col_count, actual_col_count

class ContentSignatureDiffersException(Exception):
    """Error about two content signatures that differ on a specific key.

    Stores both filenames, a filenames string, the differing key and the two
    values (source value and signature value).
    """

    def __init__(self, original_filename, other_filename, filenames_str, key, source_value, signature_value):
        # The files being compared
        self.original_filename, self.other_filename = original_filename, other_filename
        self.filenames_str = filenames_str
        # What differed, and the two conflicting values
        self.key = key
        self.source_value, self.signature_value = source_value, signature_value


class ContentSignatureDataDiffersException(Exception):
    """Error about content signatures whose data differs; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg


class InvalidQSqliteFileException(Exception):
    """Error about an invalid qsqlite file; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg


class MaximumSourceFilesExceededException(Exception):
    """Error about exceeding the maximum number of source files; the description is stored in `msg`."""

    def __init__(self, msg):
        # Keep the description available to error handlers
        self.msg = msg


# Simplistic Sql "parsing" class... We'll eventually require a real SQL parser which will provide us with a parse tree
#
# A "qtable" is a filename which behaves like an SQL table...
class Sql(object):
    """Simplistic holder of a query and of the qtables it references.

    The query text is split on whitespace. Every token that follows a FROM or
    JOIN keyword is treated as a qtable name (unless it opens a parenthesis),
    normalized, and its position is recorded so that it can later be replaced
    with the effective database table name.
    """

    def __init__(self, sql, data_streams):
        """Split `sql` into parts and record the qtables it references.

        :param sql: the query text. Currently supports only standard SELECT statements
        :param data_streams: object exposing `is_data_stream(name)`, used when normalizing qtable names
        :raises InvalidQueryException: if a FROM/JOIN is not followed by a table name
        """
        # Holds original SQL
        self.sql = sql
        # Holds sql parts
        self.sql_parts = sql.split()
        self.data_streams = data_streams

        self.qtable_metadata_dict = OrderedDict()

        # List of (normalized) qtable names, in order of appearance, possibly with repetitions
        self.qtable_names = []
        # Dict from qtable names to their positions in sql_parts. Value here is a *list* of positions,
        # since it is possible that the same qtable_name (file) is referenced in multiple positions
        # and we don't want the database table to be recreated for each
        # reference
        self.qtable_name_positions = {}
        # Dict from qtable names to their effective (actual database) table
        # names
        self.qtable_name_effective_table_names = {}

        self.query_column_names = None

        # Go over all sql parts
        idx = 0
        while idx < len(self.sql_parts):
            part = self.sql_parts[idx]
            # Only FROM and JOIN introduce a table name
            if part.upper() not in ['FROM', 'JOIN']:
                idx += 1
                continue

            # A FROM/JOIN at the very end has nothing after it - just fail
            if idx == len(self.sql_parts) - 1:
                raise InvalidQueryException(
                    'FROM/JOIN is missing a table name after it')

            qtable_name = self.sql_parts[idx + 1]
            # In most cases the next part will be only the qtable name.
            # We handle one special case here, where this is a subquery as a column: "SELECT (SELECT ... FROM qtable),100 FROM ...".
            # In that case, there will be an ending parenthesis as part of the name, and we want to handle this case gracefully.
            # This is obviously a hack of a hack :) Just until we have
            # complete parsing capabilities
            if ')' in qtable_name:
                closing_idx = qtable_name.index(')')
                leftover = qtable_name[closing_idx:]
                self.sql_parts.insert(idx + 2, leftover)
                qtable_name = qtable_name[:closing_idx]
                self.sql_parts[idx + 1] = qtable_name

            # Nothing is left when the token started with ')' (e.g. "FROM )").
            # Report it as an invalid query instead of failing on qtable_name[0] below
            if qtable_name == '':
                raise InvalidQueryException(
                    'FROM/JOIN is missing a table name after it')

            # An opening parenthesis means this is not a table name - just move on to the next part
            if qtable_name[0] == '(':
                idx += 1
                continue

            normalized_qtable_name = self.normalize_qtable_name(qtable_name)
            xprint("Normalized qtable name for %s is %s" % (qtable_name,normalized_qtable_name))
            self.qtable_names += [normalized_qtable_name]

            if normalized_qtable_name not in self.qtable_name_positions.keys():
                self.qtable_name_positions[normalized_qtable_name] = []

            self.qtable_name_positions[normalized_qtable_name].append(idx + 1)
            self.sql_parts[idx + 1] = normalized_qtable_name
            idx += 2
        xprint("Final sql parts: %s" % self.sql_parts)

    def normalize_qtable_name(self,qtable_name):
        """Return the canonical form of a qtable name.

        Data stream names are returned unchanged. Otherwise the file part is
        converted to a real absolute path; for the "file:::table" form only
        the part before the first ":::" is converted.
        """
        if self.data_streams.is_data_stream(qtable_name):
            return qtable_name

        if ':::' in qtable_name:
            qsql_filename, table_name = qtable_name.split(":::", 1)
            return '%s:::%s' % (os.path.realpath(os.path.abspath(qsql_filename)),table_name)
        else:
            return os.path.realpath(os.path.abspath(qtable_name))

    def set_effective_table_name(self, qtable_name, effective_table_name):
        """Register the database table name to use for `qtable_name`.

        Setting the same value again is allowed; trying to change an already
        registered value raises an Exception.
        """
        if qtable_name in self.qtable_name_effective_table_names.keys():
            if self.qtable_name_effective_table_names[qtable_name] != effective_table_name:
                raise Exception(
                    "Already set effective table name for qtable %s. Trying to change the effective table name from %s to %s" %
                    (qtable_name,self.qtable_name_effective_table_names[qtable_name],effective_table_name))

        xprint("Setting effective table name for %s - effective table name is set to %s" % (qtable_name,effective_table_name))
        self.qtable_name_effective_table_names[
            qtable_name] = effective_table_name

    def get_effective_sql(self,table_name_mapping=None):
        """Return the query text with every qtable replaced by its effective table name.

        :param table_name_mapping: optional dict; when provided, each effective
            table name is looked up in it and the mapped value is used instead
        """
        # Every referenced qtable must have an effective table name by now. The lookup is
        # done on the registered names (not on the dict keys), so both a missing entry and
        # an entry set to None are detected here.
        unresolved = [name for name in self.qtable_name_positions
                      if self.qtable_name_effective_table_names.get(name) is None]
        if len(unresolved) != 0:
            assert False, 'There are qtables without effective tables'

        effective_sql = [x for x in self.sql_parts]

        xprint("Effective table names",self.qtable_name_effective_table_names)
        for qtable_name, positions in self.qtable_name_positions.items():
            xprint("Positions for qtable name %s are %s" % (qtable_name,positions))
            for pos in positions:
                if table_name_mapping is not None:
                    x = self.qtable_name_effective_table_names[qtable_name]
                    effective_sql[pos] = table_name_mapping[x]
                else:
                    effective_sql[pos] = self.qtable_name_effective_table_names[qtable_name]

        return " ".join(effective_sql)

    def get_qtable_name_effective_table_names(self):
        """Return the dict mapping qtable names to effective table names."""
        return self.qtable_name_effective_table_names

    def execute_and_fetch(self, db):
        """Run the effective sql through `db.execute_and_fetch` and return its result."""
        x = self.get_effective_sql()
        xprint("Final query: %s" % x)
        db_results_obj = db.execute_and_fetch(x)
        return db_results_obj

    def materialize_using(self,loaded_table_structures_dict):
        """Set the effective table name of every qtable from its loaded table structure.

        :param loaded_table_structures_dict: dict from qtable name to an object
            exposing `db_id` and `get_table_name_for_querying()`
        """
        xprint("Materializing sql object: %s" % str(self.qtable_names))
        xprint("loaded table structures dict %s" % loaded_table_structures_dict)
        for qtable_name in self.qtable_names:
            table_structure = loaded_table_structures_dict[qtable_name]

            table_name_in_disk_db = table_structure.get_table_name_for_querying()

            effective_table_name = '%s.%s' % (table_structure.db_id, table_name_in_disk_db)

            # for a single file - no need to create a union, just use the table name
            self.set_effective_table_name(qtable_name, effective_table_name)
            xprint("Materialized filename %s to effective table name %s" % (qtable_name,effective_table_name))


class TableColumnInferer(object):
    """Infers column count, column names and column types from sample rows.

    Rows are fed one at a time through analyze(); the analysis itself runs once
    100 rows have been collected, or when force_analysis() is called.
    """

    def __init__(self, input_params):
        """Copy the relevant settings out of `input_params`.

        Reads parsing_mode, skip_header, expected_column_count, delimiter and
        disable_column_type_detection from it.
        """
        self.inferred = False
        self.mode = input_params.parsing_mode
        self.rows = []
        self.skip_header = input_params.skip_header
        self.header_row = None
        self.header_row_filename = None
        self.expected_column_count = input_params.expected_column_count
        self.input_delimiter = input_params.delimiter
        self.disable_column_type_detection = input_params.disable_column_type_detection

    def _generate_content_signature(self):
        """Return an OrderedDict describing the inferer's state and sample rows."""
        return OrderedDict({
            "inferred": self.inferred,
            "mode": self.mode,
            "rows": "\n".join([",".join(x) for x in self.rows]),
            "skip_header": self.skip_header,
            "header_row": self.header_row,
            "expected_column_count": self.expected_column_count,
            "input_delimiter": self.input_delimiter,
            "disable_column_type_detection": self.disable_column_type_detection
        })

    def analyze(self, filename, col_vals):
        """Add one row to the sample; run the analysis once 100 rows were collected.

        When skip_header is set, the first row received is kept as the header
        row (along with its filename) instead of being added to the sample.

        :return: True if the analysis has been performed by this call, False otherwise
        """
        if self.inferred:
            assert False, "Already inferred columns"

        if self.skip_header and self.header_row is None:
            self.header_row = col_vals
            self.header_row_filename = filename
        else:
            self.rows.append(col_vals)

        if len(self.rows) < 100:
            return False

        self.do_analysis()
        return True

    def force_analysis(self):
        # This method is called whenever there is no more data, and an analysis needs
        # to be performed immediately, regardless of the amount of sample data that has
        # been collected
        self.do_analysis()

    def determine_type_of_value(self, value):
        """Return the type detected for a single value: int, float, str or None.

        None is returned for None and for values that are empty after stripping.
        When column type detection is disabled, str is always returned.
        """
        if self.disable_column_type_detection:
            return str

        if value is not None:
            value = value.strip()
        if value == '' or value is None:
            return None

        # Only conversion failures mean "not a number". Anything else (e.g. KeyboardInterrupt)
        # must not be swallowed here.
        try:
            int(value)
            # Python 3 has a single integer type, so there is no separate long type to report
            return int
        except (ValueError, TypeError):
            pass

        try:
            float(value)
            return float
        except (ValueError, TypeError):
            pass

        return str

    def determine_type_of_value_list(self, value_list):
        """Return the single type that represents all the values of a column.

        Nulls are ignored when the other values agree on a type, a mix of int
        and float becomes float, and any other mix becomes str.
        """
        type_list = [self.determine_type_of_value(v) for v in value_list]
        if len(set(type_list)) == 1:
            # all the sample lines are of the same type
            return type_list[0]
        else:
            # check for the number of types without nulls,
            type_list_without_nulls = list(filter(
                lambda x: x is not None, type_list))
            # If all the sample lines are of the same type,
            if len(set(type_list_without_nulls)) == 1:
                # return it
                return type_list_without_nulls[0]
            else:
                # If there are only two types, one float an one int, then choose a float type
                if len(set(type_list_without_nulls)) == 2 and float in type_list_without_nulls and int in type_list_without_nulls:
                    return float
                return str

    def do_analysis(self):
        """Run the analysis matching the parsing mode, then infer column types and names."""
        if self.mode == 'strict':
            self._do_strict_analysis()
        elif self.mode in ['relaxed']:
            self._do_relaxed_analysis()
        else:
            raise Exception('Unknown parsing mode %s' % self.mode)

        if self.column_count == 1 and self.expected_column_count != 1 and self.expected_column_count is not None:
            print(f"Warning: column count is one (expected column count is {self.expected_column_count} - did you provide the correct delimiter?", file=sys.stderr)

        # NOTE(review): in strict mode _do_strict_analysis() has already called infer_column_types(),
        # so it runs twice and its header warning may be printed twice - confirm whether this is intended
        self.infer_column_types()
        self.infer_column_names()
        self.inferred = True

    def validate_column_names(self, value_list):
        """Return a list of (column name, error description) tuples for the invalid names.

        At most one error is reported per column name, and a duplicated name is
        reported only once.
        """
        column_name_errors = []
        for v in value_list:
            if v is None:
                # we allow column names to be None, in relaxed mode it'll be filled with default names.
                # RLRL
                continue
            if ',' in v:
                column_name_errors.append(
                    (v, "Column name cannot contain commas"))
                continue
            if self.input_delimiter in v:
                column_name_errors.append(
                    (v, "Column name cannot contain the input delimiter. Please make sure you've set the correct delimiter"))
                continue
            if '\n' in v:
                column_name_errors.append(
                    (v, "Column name cannot contain newline"))
                continue
            if v != v.strip():
                column_name_errors.append(
                    (v, "Column name contains leading/trailing spaces"))
                continue
            try:
                v.encode("utf-8", "strict").decode("utf-8")
            except UnicodeError:
                # Only encoding/decoding failures mean the name is not UTF-8 compatible
                column_name_errors.append(
                    (v, "Column name must be UTF-8 Compatible"))
                continue
            # We're checking for column duplication for each field in order to be able to still provide it along with other errors
            if value_list.count(v) > 1:
                entry = (v, "Column name is duplicated")
                # Don't duplicate the error report itself
                if entry not in column_name_errors:
                    column_name_errors.append(entry)
                continue
            nul_index = v.find("\x00")
            if nul_index >= 0:
                column_name_errors.append(
                    (v, "Column name cannot contain NUL"))
                continue
            t = self.determine_type_of_value(v)
            if t != str:
                column_name_errors.append((v, "Column name must be a string"))
        return column_name_errors

    def infer_column_names(self):
        """Set self.column_names from the header row, or generate c1..cN when there is no header.

        :raises BadHeaderException: if the header row contains invalid column names
        :raises ColumnCountMismatchException: in strict mode, if the header length differs from the column count
        """
        if self.header_row is not None:
            column_name_errors = self.validate_column_names(self.header_row)
            if len(column_name_errors) > 0:
                raise BadHeaderException("Header must contain only strings and not numbers or empty strings: '%s'\n%s" % (
                    ",".join(self.header_row), "\n".join(["'%s': %s" % (x, y) for x, y in column_name_errors])))

            # use header row in order to name columns
            if len(self.header_row) < self.column_count:
                if self.mode == 'strict':
                    raise ColumnCountMismatchException("Strict mode. Header row contains less columns than expected column count(%s vs %s)" % (
                        len(self.header_row), self.column_count))
                elif self.mode in ['relaxed']:
                    # in relaxed mode, add columns to fill the missing ones
                    self.header_row = self.header_row + \
                        ['c%s' % (x + len(self.header_row) + 1)
                         for x in range(self.column_count - len(self.header_row))]
            elif len(self.header_row) > self.column_count:
                if self.mode == 'strict':
                    raise ColumnCountMismatchException("Strict mode. Header row contains more columns than expected column count (%s vs %s)" % (
                        len(self.header_row), self.column_count))
                elif self.mode in ['relaxed']:
                    # In relaxed mode, just cut the extra column names
                    self.header_row = self.header_row[:self.column_count]
            self.column_names = self.header_row
        else:
            # Column names are cX starting from 1
            self.column_names = ['c%s' % (i + 1)
                                 for i in range(self.column_count)]

    def _do_relaxed_analysis(self):
        """Determine self.column_count for relaxed mode.

        Without sample rows the header length is used (0 when there is no
        header). Otherwise the expected column count wins when it was given,
        and the longest sample row is used when it was not.
        """
        column_count_list = [len(col_vals) for col_vals in self.rows]

        if len(self.rows) == 0:
            if self.header_row is None:
                self.column_count = 0
            else:
                self.column_count = len(self.header_row)
        else:
            if self.expected_column_count is not None:
                self.column_count = self.expected_column_count
            else:
                # If not specified, we'll take the largest row in the sample rows
                self.column_count = max(column_count_list)

    def get_column_count_summary(self, column_count_list):
        """Return a text such as "3 rows with 2 columns, 1 rows with 4 columns"."""
        counts = {}
        for column_count in column_count_list:
            counts[column_count] = counts.get(column_count, 0) + 1
        return six.u(", ").join([six.u("{} rows with {} columns".format(v, k)) for k, v in six.iteritems(counts)])

    def _do_strict_analysis(self):
        """Determine self.column_count for strict mode.

        :raises ColumnCountMismatchException: if the sample rows do not all have
            the same length, or if that length differs from the expected column count
        """
        column_count_list = [len(col_vals) for col_vals in self.rows]

        if len(set(column_count_list)) != 1:
            raise ColumnCountMismatchException('Strict mode. Column Count is expected to identical. Multiple column counts exist at the first part of the file. Try to check your delimiter, or change to relaxed mode. Details: %s' % (
                self.get_column_count_summary(column_count_list)))

        self.column_count = len(self.rows[0])

        if self.expected_column_count is not None and self.column_count != self.expected_column_count:
            raise ColumnCountMismatchException('Strict mode. Column count is expected to be %s but is %s' % (
                self.expected_column_count, self.column_count))

        self.infer_column_types()

    def infer_column_types(self):
        """Set self.column_types from the sample rows.

        The types are also computed without the first sample row
        (self.column_types2). When the two differ, no header was requested and
        all columns were detected as str, a warning is written to stderr.
        """
        assert self.column_count > -1
        self.column_types = []
        self.column_types2 = []
        for column_number in range(self.column_count):
            # Rows that are too short contribute None for the missing columns
            column_value_list = [
                row[column_number] if column_number < len(row) else None for row in self.rows]
            column_type = self.determine_type_of_value_list(column_value_list)
            self.column_types.append(column_type)

            column_value_list2 = [row[column_number] if column_number < len(
                row) else None for row in self.rows[1:]]
            column_type2 = self.determine_type_of_value_list(
                column_value_list2)
            self.column_types2.append(column_type2)

        comparison = map(
            lambda x: x[0] == x[1], zip(self.column_types, self.column_types2))
        if False in comparison and not self.skip_header:
            number_of_column_types = len(set(self.column_types))
            if number_of_column_types == 1 and list(set(self.column_types))[0] == str:
                print('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data', file=sys.stderr)

    def get_column_dict(self):
        """Return an OrderedDict from column name to column type."""
        return OrderedDict(zip(self.column_names, self.column_types))

    def get_column_count(self):
        return self.column_count

    def get_column_names(self):
        return self.column_names

    def get_column_types(self):
        return self.column_types


def py3_encoded_csv_reader(encoding, f, dialect, row_data_only=False, **kwargs):
    """Generator over the csv rows read from `f`.

    Yields the bare row when `row_data_only` is set, otherwise a tuple of
    (f.filename(), f.isfirstline(), row). Low-level reading errors are
    translated into the matching q exceptions. `encoding` is accepted but not
    used by this function.
    """
    try:
        xprint("f is %s" % str(f))
        xprint("dialect is %s" % dialect)
        rows = csv.reader(f, dialect, **kwargs)

        for row in rows:
            if row_data_only:
                yield row
            else:
                yield (f.filename(),f.isfirstline(),row)

    except UnicodeDecodeError as decode_error:
        raise CouldNotParseInputException(decode_error)
    except ValueError as value_error:
        # TODO Add test for this
        description = str(value_error)
        if description.startswith('could not convert string to'):
            raise CouldNotConvertStringToNumericValueException(description)
        raise CouldNotParseInputException(description)
    except Exception as generic_error:
        description = str(generic_error)
        if description.startswith("field larger than field limit"):
            raise ColumnMaxLengthLimitExceededException(description)
        if 'universal-newline' in description:
            raise UniversalNewlinesExistException()
        # Anything else is propagated untouched
        raise

# Alias under which the rest of the code refers to the csv reader
encoded_csv_reader = py3_encoded_csv_reader

def normalized_filename(filename):
    """Return `filename` as is - no normalization is performed."""
    result = filename
    return result

class TableCreatorState(object):
    """String constants naming the states of a table creator."""

    # Initial state
    INITIALIZED = 'INITIALIZED'
    # The analysis has been performed
    ANALYZED = 'ANALYZED'
    # All the data has been read
    FULLY_READ = 'FULLY_READ'

class MaterializedStateType(object):
    """String constants naming the kinds of materialized state."""

    # Type has not been determined
    UNKNOWN = 'unknown'

    # File-based kinds
    DELIMITED_FILE = 'delimited-file'
    QSQL_FILE = 'qsql-file'
    SQLITE_FILE = 'sqlite-file'

    # Data provided as a data stream rather than as a file
    DATA_STREAM = 'data-stream'

class TableSourceType(object):
    """String constants naming where the data of a table comes from."""

    # Delimited files, without and with a matching qsql file that is not being used
    DELIMITED_FILE = 'file'
    DELIMITED_FILE_WITH_UNUSED_QSQL = 'file-with-unused-qsql'

    # qsql files, referenced directly or found next to an original file
    QSQL_FILE = 'qsql-file'
    QSQL_FILE_WITH_ORIGINAL = 'qsql-file-with-original'

    SQLITE_FILE = 'sqlite-file'
    DATA_STREAM = 'data-stream'

def skip_BOM(f):
    """Consume the 3-byte UTF-8 BOM from `f.buffer`.

    Raises an Exception if the bytes read are not the BOM, or if reading fails.
    """
    try:
        leading_bytes = f.buffer.read(3)
        bom_found = leading_bytes == six.b('\xef\xbb\xbf')

        if not bom_found:
            # TODO Add test for this (propagates to try:except)
            raise Exception('Value of BOM is not as expected - Value is "%s"' % str(leading_bytes))
    except Exception as failure:
        # TODO Add a test for this
        raise Exception('Tried to skip BOM for "utf-8-sig" encoding and failed. Error message is ' + str(failure))

def detect_qtable_name_source_info(qtable_name,data_streams,read_caching_enabled):
    """Detect what kind of source a qtable name refers to.

    Returns a tuple of (MaterializedStateType value, TableSourceType value, source info tuple).
    The checks are made in this order: data stream, explicit "file:::table"
    reference, qsql file, sqlite file, and finally a delimited file (for which
    a matching "<name>.qsql" file is used only if read caching is enabled).

    Raises FileNotFoundException / UnknownFileTypeException for an explicit
    "file:::table" reference whose file is missing or is of an unknown type.
    """
    data_stream = data_streams.get_for_filename(qtable_name)
    xprint("Found data stream %s" % data_stream)

    if data_stream is not None:
        return MaterializedStateType.DATA_STREAM, TableSourceType.DATA_STREAM,(data_stream,)

    # Explicit reference to a table inside a file
    if ':::' in qtable_name:
        qsql_filename, table_name = qtable_name.split(":::", 1)
        if not os.path.exists(qsql_filename):
            raise FileNotFoundException("Could not find file %s" % qsql_filename)

        if is_qsql_file(qsql_filename):
            return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE, (qsql_filename, table_name,)
        if is_sqlite_file(qsql_filename):
            return MaterializedStateType.SQLITE_FILE, TableSourceType.SQLITE_FILE, (qsql_filename, table_name,)
        raise UnknownFileTypeException("Cannot detect the type of table %s" % qtable_name)

    # A file name without a table name
    if is_qsql_file(qtable_name):
        return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE, (qtable_name, None)
    if is_sqlite_file(qtable_name):
        return MaterializedStateType.SQLITE_FILE, TableSourceType.SQLITE_FILE, (qtable_name, None)

    # Otherwise it's a delimited file, which might have a qsql file sitting next to it
    matching_qsql_file_candidate = qtable_name + '.qsql'
    if not is_qsql_file(matching_qsql_file_candidate):
        return MaterializedStateType.DELIMITED_FILE, TableSourceType.DELIMITED_FILE, (qtable_name, None)

    if read_caching_enabled:
        xprint("Found matching qsql file for original file %s (matching file %s) and read caching is enabled. Using it" % (qtable_name,matching_qsql_file_candidate))
        return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE_WITH_ORIGINAL, (matching_qsql_file_candidate, None)

    xprint("Found matching qsql file for original file %s (matching file %s), but read caching is disabled. Not using it" % (qtable_name,matching_qsql_file_candidate))
    return MaterializedStateType.DELIMITED_FILE, TableSourceType.DELIMITED_FILE_WITH_UNUSED_QSQL, (qtable_name, None)


def is_sqlite_file(filename):
    """Return True if `filename` exists and starts with the 16-byte sqlite3 magic header."""
    if not os.path.exists(filename):
        return False

    # The context manager closes the file even if reading from it fails
    with open(filename,'rb') as f:
        magic = f.read(16)
    return magic == b"SQLite format 3\x00"

def sqlite_table_exists(cursor,table_name):
    """Return True if sqlite_master of the cursor's database has exactly one table entry named `table_name`."""
    # The name is passed as a bound parameter, so names containing quotes are matched
    # literally instead of breaking (or altering) the statement
    results = cursor.execute("select count(*) from sqlite_master where type='table' and tbl_name == ?", (table_name,)).fetchall()
    return results[0][0] == 1

def is_qsql_file(filename):
    """Return True if `filename` is a sqlite file in which the qcatalog table exists."""
    if not is_sqlite_file(filename):
        return False

    db = Sqlite3DB('check_qsql_db',filename,filename,create_qcatalog=False)
    try:
        return db.qcatalog_table_exists()
    finally:
        # Release the database object even if the catalog check fails
        db.done()

def normalize_filename_to_table_name(filename):
    """Convert a filename into a table name.

    A "t_" prefix is added to names starting with a digit, a trailing
    .qsql/.sqlite/.sqlite3 extension (case insensitive) is removed, and special
    characters are replaced with textual tokens such as "_dash_" and "_dot_".
    """
    xprint("Normalizing filename %s" % filename)
    if filename[0].isdigit():
        xprint("Filename starts with a digit, adding prefix")
        filename = 't_%s' % filename

    # Strip at most one of the known extensions
    lowercase_name = filename.lower()
    for extension in (".qsql", ".sqlite", ".sqlite3"):
        if lowercase_name.endswith(extension):
            filename = filename[:-len(extension)]
            break

    # None of the tokens contains a character that is replaced by another entry,
    # so the replacements do not affect one another
    replacements = (
        ("-", "_dash_"),
        (".", "_dot_"),
        ("?", "_qm_"),
        ("/", "_slash_"),
        ("\\", "_backslash_"),
        (":", "_colon_"),
        (" ", "_space_"),
        ("+", "_plus_"),
    )
    table_name = filename
    for character, token in replacements:
        table_name = table_name.replace(character, token)
    return table_name

def validate_content_signature(original_filename, source_signature,other_filename, content_signature,scope=None,dump=False):
    """Verify that every entry of `source_signature` has an equal entry in `content_signature`.

    Values of type OrderedDict are compared recursively. Returns None when the
    signatures match.

    :param original_filename: name of the file `source_signature` belongs to (used in error reporting)
    :param source_signature: the signature whose keys drive the comparison
    :param other_filename: name of the file `content_signature` belongs to (used in error reporting)
    :param content_signature: the signature being checked
    :param scope: list of the keys leading to the current nesting level
    :param dump: when set, both signatures are written using xprint
    :raises ContentSignatureDataDiffersException: if a key is missing, or if the 'rows' values differ
    :raises ContentSignatureDiffersException: if the values of any other key differ
    """
    if dump:
        xprint("Comparing: source value: %s target value: %s" % (source_signature,content_signature))

    s = "%s vs %s:" % (original_filename,other_filename)
    if scope is None:
        scope = []
    for k in source_signature:
        # A missing key is reported in the same way for plain and for nested values. It must be
        # checked before descending, otherwise a missing nested entry would end up as a KeyError
        if k not in content_signature:
            raise ContentSignatureDataDiffersException("%s Content Signatures differ. %s is missing from content signature" % (s,k))

        if type(source_signature[k]) == OrderedDict:
            validate_content_signature(original_filename, source_signature[k],other_filename, content_signature[k],scope + [k])
        elif source_signature[k] != content_signature[k]:
            if k == 'rows':
                raise ContentSignatureDataDiffersException("%s Content Signatures differ at %s.%s (actual analysis data differs)" % (s,".".join(scope),k))
            else:
                raise ContentSignatureDiffersException(original_filename, other_filename, original_filename,".".join(scope + [k]),source_signature[k],content_signature[k])

class DelimitedFileReader(object):
    """Reads rows out of a list of delimited files, or out of an externally provided file object."""

    def __init__(self,atomic_fns, input_params, dialect, f = None,external_f_name = None):
        """
        :param atomic_fns: list of filenames to read; must be empty when `f` is provided
        :param input_params: input settings (encoding, gzip and newline related flags)
        :param dialect: csv dialect passed on to the csv reader
        :param f: optional, already open, file object to read from instead of opening files
        :param external_f_name: name reported in the generated rows when `f` is provided
        """
        if f is not None:
            assert len(atomic_fns) == 0

        self.atomic_fns = atomic_fns
        self.input_params = input_params
        self.dialect = dialect

        self.f = f
        self.lines_read = 0
        self.file_number = -1

        self.skipped_bom = False

        self.is_open = f is not None

        self.external_f = f is not None
        self.external_f_name = external_f_name

    def get_lines_read(self):
        return self.lines_read

    def get_size_hash(self):
        """Return the comma separated sizes of the files, or a constant marker when there are no files."""
        if self.atomic_fns is None or len(self.atomic_fns) == 0:
            return "data-stream-size"
        else:
            return ",".join(map(str,[os.stat(atomic_fn).st_size for atomic_fn in self.atomic_fns]))

    def get_last_modification_time_hash(self):
        """Return a sha1 of the files' modification times followed by '///' and the times themselves.

        A constant marker is returned when there are no files.
        """
        if self.atomic_fns is None or len(self.atomic_fns) == 0:
            return "data stream-lmt"
        else:
            x = ",".join(map(lambda x: ':%s:' % x,[os.stat(x).st_mtime_ns for x in self.atomic_fns]))
            res = hashlib.sha1(six.b(x)).hexdigest() + '///' + x
            xprint("Hash of last modification time is %s" % res)
            return res

    def open_file(self):
        """Open all the files as one fileinput stream, store it in self.f and return it.

        Nothing is done (and None is returned) when an external file object was provided.
        """
        if self.external_f:
            xprint("External f has been provided. No need to open the file")
            return

        # TODO Support universal newlines for gzipped and stdin data as well

        xprint("XX Opening file %s" % ",".join(self.atomic_fns))
        import fileinput

        def q_openhook(filename, mode):
            # The mode provided by fileinput is ignored - the files are opened according to the input params
            if self.input_params.gzipped_input or filename.endswith('.gz'):
                import gzip
                f = gzip.open(filename,mode='rt',encoding=self.input_params.input_encoding)
            else:
                if six.PY3:
                    # Text mode with newline=None already translates all newline styles. The 'U' mode
                    # flag added nothing on top of that and is rejected with a ValueError as of
                    # Python 3.11, so plain 'r' is used regardless of with_universal_newlines
                    f = io.open(filename, 'r', newline=None, encoding=self.input_params.input_encoding)
                else:
                    if self.input_params.with_universal_newlines:
                        file_opening_mode = 'rbU'
                    else:
                        file_opening_mode = 'rb'
                    f = open(filename, file_opening_mode)

            # NOTE(review): skipped_bom is never set to True in this class, so the BOM is skipped for every opened file
            if self.input_params.input_encoding == 'utf-8-sig' and not self.skipped_bom:
                skip_BOM(f)

            return f

        f = fileinput.input(self.atomic_fns,mode='rb',openhook=q_openhook)

        self.f = f
        self.is_open = True
        xprint("Actually opened file %s" % self.f)
        return f

    def close_file(self):
        """Close the underlying file object; raises an Exception if it has never been opened."""
        if not self.is_open:
            # TODO Convert to assertion
            raise Exception("Bug - file should already be open: %s" % ",".join(self.atomic_fns))

        self.f.close()
        xprint("XX Closed file %s" % ",".join(self.atomic_fns))

    def generate_rows(self):
        """Generator yielding (file name, file number, is first line, column values) for each row read."""
        csv_reader = encoded_csv_reader(self.input_params.input_encoding, self.f, dialect=self.dialect,row_data_only=self.external_f)
        try:
            # TODO Some order with regard to separating data-streams for actual files
            if self.external_f:
                for col_vals in csv_reader:
                    self.lines_read += 1
                    # NOTE(review): lines_read has just been incremented, so the "is first line" flag below
                    # is always False for external file objects - confirm whether callers rely on that
                    yield self.external_f_name,0, self.lines_read == 0, col_vals
            else:
                for file_name,is_first_line,col_vals in csv_reader:
                    # The first line of each file marks the move to the next file number
                    if is_first_line:
                        self.file_number = self.file_number + 1
                    self.lines_read += 1
                    yield file_name,self.file_number,is_first_line,col_vals
        except ColumnMaxLengthLimitExceededException as e:
            msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (",".join(self.atomic_fns),self.lines_read + 1,self.input_params.input_encoding)
            raise ColumnMaxLengthLimitExceededException(msg)
        except UniversalNewlinesExistException as e2:
            # No need to translate the exception, but we want it to be explicitly defined here for clarity
            raise UniversalNewlinesExistException()

class MaterializedState(object):
    """Base class for the materialized-state objects of a single queried table.

    Holds the bookkeeping that all concrete states share (source information,
    the database chosen for the table and timing data). Concrete subclasses are
    expected to provide get_planned_table_name(), choose_db_to_use() and
    make_data_available().
    """
    def __init__(self, table_source_type,qtable_name, engine_id):
        xprint("Creating new MS: %s %s" % (id(self), qtable_name))

        # Identity of the table as given by the caller
        self.table_source_type = table_source_type
        self.qtable_name = qtable_name
        self.engine_id = engine_id

        # The database that will hold/serve the table. Set by choose_db_to_use()
        self.db_to_use = None
        self.db_id = None

        # Description of where the data came from
        self.source_type = None
        self.source = None

        self.mfs_structure = None

        # Timing information, filled by initialize() and finalize()
        self.start_time = None
        self.end_time = None
        self.duration = None

        self.effective_table_name = None

    def get_materialized_state_type(self):
        """Return the type tag of this state. The base class is UNKNOWN."""
        return MaterializedStateType.UNKNOWN

    def get_planned_table_name(self):
        """Return the preferred table name. Must be provided by subclasses."""
        assert False, 'not implemented'

    def autodetect_table_name(self):
        """Find a table name that is not yet taken inside db_to_use.

        The planned table name is tried first as-is, and then with the suffixes
        _2 up to _999. Raises an Exception when all of the candidates are taken.
        """
        xprint("Autodetecting table name. db_to_use=%s" % self.db_to_use)
        taken_names = self.db_to_use.retrieve_all_table_names()
        xprint("Existing table names: %s" % taken_names)

        attempt = 1
        while attempt < 1000:
            # The first attempt uses the bare planned name
            suffix = '' if attempt == 1 else '_%s' % attempt

            candidate = '%s%s' % (self.get_planned_table_name(),suffix)
            xprint("Table name attempt: index=%s name=%s" % (attempt,candidate))

            if candidate not in taken_names:
                xprint("Found free table name %s for source type %s source %s" % (candidate,self.source_type,self.source))
                return candidate

            attempt += 1

        raise Exception('Cannot find free table name for source type %s source %s' % (self.source_type,self.source))

    def initialize(self):
        """Record the time in which handling of this state has started."""
        self.start_time = time.time()

    def finalize(self):
        """Record the end time and calculate the total duration since initialize()."""
        self.end_time = time.time()
        self.duration = self.end_time - self.start_time

    def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
        """Decide which database will hold the table. Must be provided by subclasses."""
        assert False, 'not implemented'

    def make_data_available(self,stop_after_analysis):
        """Make the table data queryable. Must be provided by subclasses."""
        assert False, 'not implemented'

class MaterializedDelimitedFileState(MaterializedState):
    """Materialized state of a table which is backed by delimited text file(s).

    The qtable name is resolved into a list of concrete files (either directly or
    through globbing). The files are then analyzed and loaded into a sqlite
    database using a TableCreator, and optionally stored on disk as a .qsql cache.
    """
    def __init__(self, table_source_type,qtable_name, input_params, dialect_id,engine_id,target_table_name=None):
        super().__init__(table_source_type,qtable_name,engine_id)

        self.input_params = input_params
        self.dialect_id = dialect_id
        self.target_table_name = target_table_name

        self.content_signature = None

        self.atomic_fns = None

        self.can_store_as_cached = None

    def get_materialized_state_type(self):
        return MaterializedStateType.DELIMITED_FILE

    def initialize(self):
        """Resolve the actual file list and create the reader that will read it."""
        super(MaterializedDelimitedFileState, self).initialize()

        self.atomic_fns = self.materialize_file_list(self.qtable_name)
        self.delimited_file_reader = DelimitedFileReader(self.atomic_fns,self.input_params,self.dialect_id)

        self.source_type = self.table_source_type
        self.source = ",".join(self.atomic_fns)

    def materialize_file_list(self,qtable_name):
        """Translate a qtable name into a list of absolute paths of the files to read.

        Files ending with .qsql are filtered out of the result.

        Raises:
            FileNotFoundException: when the name is neither an existing path nor a
                glob pattern that matches at least one file.
            MaximumSourceFilesExceededException: when more than 500 files remain
                after filtering.
        """
        # First check if the file exists without globbing. This will ensure that we don't support non-existent files
        if os.path.exists(qtable_name):
            # If it exists, then just use it
            found_files = [qtable_name]
        else:
            # If not, then try with globs (and sort for predictability)
            found_files = sorted(glob.glob(qtable_name))
            if len(found_files) == 0:
                raise FileNotFoundException(
                    "No files matching '%s' have been found" % qtable_name)

        # deduplicate with matching qsql files
        filtered_file_list = [fn for fn in found_files if not fn.endswith('.qsql')]
        xprint("Filtered qsql files from glob search. Original file count: %s new file count: %s" % (len(found_files),len(filtered_file_list)))

        file_count = len(filtered_file_list)
        # If this proves to be a problem for users in terms of usability, then we'll just materialize the files
        # into the adhoc db, as with the db attach limit of sqlite
        if file_count > 500:
            msg = "Maximum source files for table must be 500. Table is name is %s Number of actual files is %s" % (qtable_name,file_count)
            raise MaximumSourceFilesExceededException(msg)

        return [os.path.abspath(fn) for fn in filtered_file_list]

    def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
        """Select the database that will contain the table.

        When forced_db_to_use is provided, the table is placed inside it under an
        autodetected free name, and storing a disk cache is disabled. Otherwise, a
        dedicated shared in-memory database is created for the table.
        """
        if forced_db_to_use is not None:
            self.db_id = forced_db_to_use.db_id
            self.db_to_use = forced_db_to_use
            self.can_store_as_cached = False
            assert self.target_table_name is None
            self.target_table_name = self.autodetect_table_name()
            return

        self.can_store_as_cached = True

        self.db_id = '%s' % self._generate_db_name(self.atomic_fns[0])
        xprint("Database id is %s" % self.db_id)
        self.db_to_use = Sqlite3DB(self.db_id, 'file:%s?mode=memory&cache=shared' % self.db_id, 'memory<%s>' % self.db_id,create_qcatalog=True)

        if self.target_table_name is None:
            self.target_table_name = self.autodetect_table_name()

    def __analyze_delimited_file(self,database_info):
        """Analyze the file(s), store the content signature and register it in the qcatalog.

        Returns the TableCreator which performed the analysis.
        """
        xprint("Analyzing delimited file")
        # The target table name must have been chosen by choose_db_to_use() by now
        assert self.target_table_name is not None
        target_sqlite_table_name = self.target_table_name

        xprint("Target sqlite table name is %s" % target_sqlite_table_name)
        # Create the matching database table and populate it
        table_creator = TableCreator(self.qtable_name, self.delimited_file_reader,self.input_params, sqlite_db=database_info.sqlite_db,
                                     target_sqlite_table_name=target_sqlite_table_name)
        table_creator.perform_analyze(self.dialect_id)
        xprint("after perform_analyze")
        self.content_signature = table_creator._generate_content_signature()

        now = datetime.datetime.utcnow().isoformat()

        database_info.sqlite_db.add_to_qcatalog_table(target_sqlite_table_name,
                                          self.content_signature,
                                          now,
                                          self.source_type,
                                          self.source)
        return table_creator

    def _generate_disk_db_filename(self, filenames_str):
        """Return the name of the .qsql cache file which matches the provided filename."""
        return '%s.qsql' % (os.path.abspath(filenames_str).replace("+","__"))

    def _get_should_read_from_cache(self, disk_db_filename):
        """Reading from cache requires both read caching to be enabled and the cache file to exist."""
        disk_db_file_exists = os.path.exists(disk_db_filename)

        return self.input_params.read_caching and disk_db_file_exists

    def calculate_should_read_from_cache(self):
        """Return a (disk db filename, should read from cache) pair for this table."""
        # TODO cache filename is chosen according to first filename only, which makes multi-file (glob) caching difficult
        #  cache writing is blocked for now in these cases. Will be added in the future (see save_cache_to_disk_if_needed)
        disk_db_filename = self._generate_disk_db_filename(self.atomic_fns[0])
        should_read_from_cache = self._get_should_read_from_cache(disk_db_filename)
        xprint("should read from cache %s" % should_read_from_cache)
        return disk_db_filename,should_read_from_cache

    def get_planned_table_name(self):
        return normalize_filename_to_table_name(os.path.basename(self.atomic_fns[0]))

    def make_data_available(self,stop_after_analysis):
        """Analyze the input and, unless stop_after_analysis is set, load it fully.

        Returns a (DatabaseInfo, table name) pair. The reader is closed on the way
        out, regardless of whether the analysis/loading has succeeded or failed.
        """
        xprint("In make_data_available. db_id %s db_to_use %s" % (self.db_id,self.db_to_use))
        assert self.db_id is not None

        disk_db_filename, should_read_from_cache = self.calculate_should_read_from_cache()
        xprint("disk_db_filename=%s should_read_from_cache=%s" % (disk_db_filename,should_read_from_cache))

        database_info = DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True)
        xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))

        self.delimited_file_reader.open_file()
        try:
            table_creator = self.__analyze_delimited_file(database_info)

            self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, self.atomic_fns, self.db_id,
                                                                 table_creator.column_inferer.get_column_names(),
                                                                 table_creator.column_inferer.get_column_types(),
                                                                 None,
                                                                 self.target_table_name,
                                                                 self.source_type,
                                                                 self.source,
                                                                 self.get_planned_table_name())

            content_signature = table_creator.content_signature
            content_signature_key = self.db_to_use.calculate_content_signature_key(content_signature)
            xprint("table creator signature key: %s" % content_signature_key)

            relevant_table = self.db_to_use.get_from_qcatalog(content_signature)['temp_table_name']

            if not stop_after_analysis:
                table_creator.perform_read_fully(self.dialect_id)

                self.save_cache_to_disk_if_needed(disk_db_filename, table_creator)
        finally:
            # Release the input even when analysis or loading fails, so the handle doesn't leak
            self.delimited_file_reader.close_file()

        return database_info, relevant_table

    def save_cache_to_disk_if_needed(self, disk_db_filename, table_creator):
        """Write a .qsql cache file when write caching is on and the table sits in its own db."""
        if len(self.atomic_fns) > 1:
            xprint("Cannot save cache for multi-files for now, deciding auto-naming for cache is challenging. Will be added in the future.")
            return

        effective_write_caching = self.input_params.write_caching
        if effective_write_caching:
            if self.can_store_as_cached:
                assert self.table_source_type != TableSourceType.DELIMITED_FILE_WITH_UNUSED_QSQL
                xprint("Going to write file cache for %s. Disk filename is %s" % (",".join(self.atomic_fns), disk_db_filename))
                self._store_qsql(table_creator.sqlite_db, disk_db_filename)
            else:
                xprint("Database has been provided externally. Skipping storing a cached version of the data")

    def _store_qsql(self, source_sqlite_db, disk_db_filename):
        """Back up the source database into a sqlite file on disk."""
        xprint("Storing data as disk db")
        disk_db_conn = sqlite3.connect(disk_db_filename)
        try:
            # The connection context manager only commits/rolls back, it does not close the connection
            with disk_db_conn:
                source_sqlite_db.conn.backup(disk_db_conn)
            xprint("Written db to disk: disk db filename %s" % (disk_db_filename))
        finally:
            # Make sure the disk connection is released even if the backup has failed
            disk_db_conn.close()

    def _generate_db_name(self, qtable_name):
        return 'e_%s_fn_%s' % (self.engine_id,normalize_filename_to_table_name(qtable_name))


class MaterialiedDataStreamState(MaterializedDelimitedFileState):
    """Materialized state of a table whose rows arrive from an already-open data stream.

    The rows are always loaded into the provided stream_target_db, and no disk
    caching (neither reading nor writing) takes place.
    """
    def __init__(self, table_source_type, qtable_name, input_params, dialect_id, engine_id, data_stream, stream_target_db): ## should pass adhoc_db
        assert data_stream is not None

        super().__init__(table_source_type, qtable_name, input_params, dialect_id, engine_id,target_table_name=None)

        self.data_stream = data_stream
        self.stream_target_db = stream_target_db

        # The table name is always autodetected later on, in choose_db_to_use()
        self.target_table_name = None

    def get_planned_table_name(self):
        normalized_source = normalize_filename_to_table_name(self.source)
        return 'data_stream_%s' % (normalized_source)

    def get_materialized_state_type(self):
        return MaterializedStateType.DATA_STREAM

    def initialize(self):
        """Start the timing and wrap the stream with a reader. Gzipped streams are rejected."""
        self.start_time = time.time()
        if self.input_params.gzipped_input:
            raise CannotUnzipDataStreamException()

        self.source_type = self.table_source_type
        self.source = self.data_stream.stream_id

        # No file names here - the reader gets the stream itself, named after the stream id
        self.delimited_file_reader = DelimitedFileReader([], self.input_params, self.dialect_id, f=self.data_stream.stream,external_f_name=self.source)

    def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
        """Always use the stream target db. Forcing a different db is not supported."""
        assert forced_db_to_use is None

        target_db = self.stream_target_db
        self.db_id = target_db.db_id
        self.db_to_use = target_db

        self.target_table_name = self.autodetect_table_name()

    def calculate_should_read_from_cache(self):
        # No disk_db_filename, and no reading from cache when reading a datastream
        return None, False

    def finalize(self):
        super().finalize()

    def save_cache_to_disk_if_needed(self, disk_db_filename, table_creator):
        xprint("Saving to cache is disabled for data streams")


class MaterializedSqliteState(MaterializedState):
    """Materialized state of a table which resides inside an existing sqlite file.

    The table name is either provided explicitly (and validated against the
    file), or autodetected when the file contains exactly one table.
    """
    def __init__(self,table_source_type,qtable_name,sqlite_filename,table_name, engine_id):
        super(MaterializedSqliteState, self).__init__(table_source_type,qtable_name,engine_id)
        self.sqlite_filename = sqlite_filename
        self.table_name = table_name

        self.table_name_autodetected = None

    def initialize(self):
        """Autodetect the table name if it has not been provided, otherwise validate it."""
        super(MaterializedSqliteState, self).initialize()

        self.table_name_autodetected = False
        if self.table_name is None:
            self.table_name = self.autodetect_table_name()
            self.table_name_autodetected = True
            return

        self.validate_table_name()

    def get_planned_table_name(self):
        if self.table_name_autodetected:
            return normalize_filename_to_table_name(os.path.basename(self.qtable_name))
        else:
            return self.table_name

    def autodetect_table_name(self):
        """Return the name of the only table in the sqlite file.

        Raises:
            NoTablesInSqliteException: when the file contains no tables.
            TooManyTablesInSqliteException: when the file contains more than one table.
        """
        db = Sqlite3DB('temp_db','file:%s?immutable=1' % self.sqlite_filename,self.sqlite_filename,create_qcatalog=False)
        try:
            table_names = list(sorted(db.retrieve_all_table_names()))
            if len(table_names) == 1:
                return table_names[0]
            elif len(table_names) == 0:
                raise NoTablesInSqliteException(self.sqlite_filename)
            else:
                raise TooManyTablesInSqliteException(self.sqlite_filename,table_names)
        finally:
            db.done()

    def validate_table_name(self):
        """Make sure that the requested table exists in the sqlite file (case insensitive).

        Raises:
            NonExistentTableNameInSqlite: when the table cannot be found.
        """
        db = Sqlite3DB('temp_db', 'file:%s?immutable=1' % self.sqlite_filename, self.sqlite_filename,
                       create_qcatalog=False)
        try:
            table_names = list(db.retrieve_all_table_names())
            if self.table_name.lower() not in map(lambda x:x.lower(),table_names):
                raise NonExistentTableNameInSqlite(self.sqlite_filename, self.table_name, table_names)
        finally:
            db.done()

    def finalize(self):
        super(MaterializedSqliteState, self).finalize()

    def get_materialized_state_type(self):
        return MaterializedStateType.SQLITE_FILE

    def _generate_qsql_only_db_name__temp(self, filenames_str):
        return 'e_%s_fn_%s' % (self.engine_id,hashlib.sha1(six.b(filenames_str)).hexdigest())

    def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
        """Open the sqlite file as the db to use, or copy the table into the forced db if provided."""
        self.source = self.sqlite_filename
        self.source_type = self.table_source_type

        self.db_id = '%s' % self._generate_qsql_only_db_name__temp(self.qtable_name)

        x = 'file:%s?immutable=1' % self.sqlite_filename
        self.db_to_use = Sqlite3DB(self.db_id, x, self.sqlite_filename,create_qcatalog=False)

        if forced_db_to_use:
            xprint("Forced sqlite db_to_use %s" % forced_db_to_use)
            new_table_name = forced_db_to_use.attach_and_copy_table(self.db_to_use,self.table_name,stop_after_analysis)
            self.table_name = new_table_name
            self.db_id = forced_db_to_use.db_id
            self.db_to_use = forced_db_to_use

        return

    def make_data_available(self,stop_after_analysis):
        """Build the table structure information and return a (DatabaseInfo, table name) pair."""
        xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))

        database_info,relevant_table = DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True), self.table_name

        column_names, column_types, sqlite_column_types = self._extract_information()

        self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, [self.qtable_name], self.db_id,
                                                             column_names, column_types, sqlite_column_types,
                                                             self.table_name,
                                                             self.source_type,self.source,
                                                             self.get_planned_table_name())
        return database_info, relevant_table

    def _extract_information(self):
        """Read the column names and types of the table from the database.

        Returns:
            A (column names, python column types, lowercased sqlite column types) triple.
        """
        table_list = self.db_to_use.retrieve_all_table_names()
        if len(table_list) == 1:
            # The list contains plain table-name strings (the rest of this class treats the
            # elements as strings), so the element itself is the name. Indexing into it once
            # more would only take the first character of the name.
            table_name = table_list[0]
            xprint("Only one table in sqlite database, choosing it: %s" % table_name)
        else:
            # self.table_name has either been autodetected, or validated as an existing table up the stack
            table_name = self.table_name
            xprint("Multiple tables in sqlite file. Using provided table name %s" % self.table_name)

        table_info = self.db_to_use.get_sqlite_table_info(table_name)
        xprint('Table info is %s' % table_info)
        column_names = list(map(lambda x: x[1], table_info))
        sqlite_column_types = list(map(lambda x: x[2].lower(),table_info))
        column_types = list(map(lambda x: sqlite_type_to_python_type(x[2]), table_info))
        xprint("Column names and types for table %s: %s" % (table_name, list(zip(column_names, zip(sqlite_column_types,column_types)))))
        self.content_signature = OrderedDict()

        return column_names, column_types, sqlite_column_types


class MaterializedQsqlState(MaterializedState):
    """Materialized state of a table which is read from a .qsql file.

    A qsql file is a sqlite database which also contains a qcatalog table,
    describing the tables inside it (table name, content signature, creation time
    and source information).
    """
    def __init__(self,table_source_type,qtable_name,qsql_filename,table_name, engine_id,input_params,dialect_id):
        super(MaterializedQsqlState, self).__init__(table_source_type,qtable_name,engine_id)
        self.qsql_filename = qsql_filename
        self.table_name = table_name

        # These are for cases where the qsql file is just a cache and the original is still there, used for content
        # validation
        self.input_params = input_params
        self.dialect_id = dialect_id

        self.table_name_autodetected = None

    def initialize(self):
        """Autodetect the table name if it has not been provided, otherwise validate it."""
        super(MaterializedQsqlState, self).initialize()

        self.table_name_autodetected = False
        if self.table_name is None:
            self.table_name = self.autodetect_table_name()
            self.table_name_autodetected = True
            return

        self.validate_table_name()

    def get_planned_table_name(self):
        if self.table_name_autodetected:
            return normalize_filename_to_table_name(os.path.basename(self.qtable_name))
        else:
            return self.table_name

    def autodetect_table_name(self):
        """Return the name of the only table which is registered in the qcatalog of the file.

        Raises:
            NoTableInQsqlExcption: when the qcatalog is empty.
            TooManyTablesInQsqlException: when the qcatalog contains more than one table.
        """
        db = Sqlite3DB('temp_db','file:%s?immutable=1' % self.qsql_filename,self.qsql_filename,create_qcatalog=False)
        try:
            # Checked inside the try block, so the temp db is released even if the check fails
            assert db.qcatalog_table_exists()
            qcatalog_entries = db.get_all_from_qcatalog()
            if len(qcatalog_entries) == 0:
                raise NoTableInQsqlExcption(self.qsql_filename)
            elif len(qcatalog_entries) == 1:
                return qcatalog_entries[0]['temp_table_name']
            else:
                # TODO Add a test for this
                table_names = list(sorted([x['temp_table_name'] for x in qcatalog_entries]))
                raise TooManyTablesInQsqlException(self.qsql_filename,table_names)
        finally:
            db.done()

    def validate_table_name(self):
        """Make sure that the requested table is registered in the qcatalog of the file.

        Raises:
            NonExistentTableNameInQsql: when there is no qcatalog entry for the table.
        """
        db = Sqlite3DB('temp_db', 'file:%s?immutable=1' % self.qsql_filename, self.qsql_filename,
                       create_qcatalog=False)
        try:
            # Checked inside the try block, so the temp db is released even if the check fails
            assert db.qcatalog_table_exists()
            entry = db.get_from_qcatalog_using_table_name(self.table_name)
            if entry is None:
                qcatalog_entries = db.get_all_from_qcatalog()
                table_names = list(sorted([x['temp_table_name'] for x in qcatalog_entries]))
                raise NonExistentTableNameInQsql(self.qsql_filename,self.table_name,table_names)
        finally:
            db.done()

    def finalize(self):
        super(MaterializedQsqlState, self).finalize()

    def get_materialized_state_type(self):
        return MaterializedStateType.QSQL_FILE

    def _generate_qsql_only_db_name__temp(self, filenames_str):
        return 'e_%s_fn_%s' % (self.engine_id,hashlib.sha1(six.b(filenames_str)).hexdigest())

    def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
        """Open the qsql file as the db to use.

        When forced_db_to_use is provided, the table is copied into it along with
        its qcatalog entry, and the forced db becomes the db to use.
        """
        self.source = self.qsql_filename
        self.source_type = self.table_source_type

        self.db_id = '%s' % self._generate_qsql_only_db_name__temp(self.qtable_name)

        db_url = 'file:%s?immutable=1' % self.qsql_filename
        self.db_to_use = Sqlite3DB(self.db_id, db_url, self.qsql_filename,create_qcatalog=False)

        if forced_db_to_use:
            xprint("Forced qsql to use forced_db: %s" % forced_db_to_use)

            # TODO RLRL Move query to Sqlite3DB
            all_table_names = [(x[0],x[1]) for x in self.db_to_use.execute_and_fetch("select content_signature_key,temp_table_name from %s" % self.db_to_use.QCATALOG_TABLE_NAME).results]
            # Only the table name is needed out of the matching entry. An IndexError here means
            # that the table has no qcatalog entry
            _,t = [entry for entry in all_table_names if entry[1] == self.table_name][0]
            xprint("Copying table %s from db_id %s" % (t,self.db_id))
            d = self.db_to_use.get_from_qcatalog_using_table_name(t)

            new_table_name = forced_db_to_use.attach_and_copy_table(self.db_to_use,self.table_name,stop_after_analysis)

            xprint("CS",d['content_signature'])
            cs = OrderedDict(json.loads(d['content_signature']))
            forced_db_to_use.add_to_qcatalog_table(new_table_name, cs, d['creation_time'],
                                    d['source_type'], d['source'])

            self.table_name = new_table_name
            self.db_id = forced_db_to_use.db_id
            self.db_to_use = forced_db_to_use

        return

    def make_data_available(self,stop_after_analysis):
        """Validate the cache if needed, build the table structure information and
        return a (DatabaseInfo, table name) pair."""
        xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))

        database_info,relevant_table = self._read_table_from_cache(stop_after_analysis)

        column_names, column_types, sqlite_column_types = self._extract_information()

        self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, [self.qtable_name], self.db_id,
                                                             column_names, column_types, sqlite_column_types,
                                                             self.table_name,
                                                             self.source_type,self.source,
                                                             self.get_planned_table_name())
        return database_info, relevant_table

    def _extract_information(self):
        """Read the column names/types of the table, and its content signature from the qcatalog.

        Returns:
            A (column names, python column types, lowercased sqlite column types) triple.
        """
        assert self.db_to_use.qcatalog_table_exists()
        table_info = self.db_to_use.get_sqlite_table_info(self.table_name)
        xprint('table_name=%s Table info is %s' % (self.table_name,table_info))

        qcatalog_entry = self.db_to_use.get_from_qcatalog_using_table_name(self.table_name)

        column_names = [col[1] for col in table_info]
        sqlite_column_types = [col[2].lower() for col in table_info]
        column_types = [sqlite_type_to_python_type(col[2]) for col in table_info]
        self.content_signature = OrderedDict(
            **json.loads(qcatalog_entry['content_signature']))
        xprint('Inferred column names and types from qsql: %s' % list(zip(column_names, zip(sqlite_column_types,column_types))))

        return column_names, column_types, sqlite_column_types

    def _backing_original_file_exists(self):
        # Name-based check only: the qsql filename is the qtable name with a .qsql suffix
        return '%s.qsql' % self.qtable_name == self.qsql_filename

    def _read_table_from_cache(self, stop_after_analysis):
        """Validate the qsql content against the original file when the qsql acts as its cache.

        The validation is done by analyzing the original file using a temporary
        MaterializedDelimitedFileState, and comparing its content signature to the
        one stored in the qcatalog. Returns a (DatabaseInfo, table name) pair.
        """
        if self._backing_original_file_exists():
            xprint("Found a matching source file for qsql file with qtable name %s. Checking content signature by creating a temp MFDS + analysis" % self.qtable_name)
            mdfs = MaterializedDelimitedFileState(TableSourceType.DELIMITED_FILE,self.qtable_name,self.input_params,self.dialect_id,self.engine_id,target_table_name=None)
            mdfs.initialize()
            mdfs.choose_db_to_use(forced_db_to_use=None,stop_after_analysis=stop_after_analysis)
            _,_ = mdfs.make_data_available(stop_after_analysis=True)

            original_file_content_signature = mdfs.content_signature
            original_file_content_signature_key = self.db_to_use.calculate_content_signature_key(original_file_content_signature)

            qcatalog_entry = self.db_to_use.get_from_qcatalog_using_table_name(self.table_name)

            if qcatalog_entry is None:
                raise Exception('missing content signature!')

            xprint("Actual Signature Key: %s Expected Signature Key: %s" % (qcatalog_entry['content_signature_key'],original_file_content_signature_key))
            actual_content_signature = json.loads(qcatalog_entry['content_signature'])

            xprint("Validating content signatures: original %s vs qsql %s" % (original_file_content_signature,actual_content_signature))
            validate_content_signature(self.qtable_name, original_file_content_signature, self.qsql_filename, actual_content_signature,dump=True)
            mdfs.finalize()
        return DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True), self.table_name


class MaterializedStateTableStructure(object):
    """Plain holder of the structure information of a materialized table.

    When sqlite_column_types is None, the sqlite type names are derived from the
    python column types, using Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES (lowercased).
    """
    def __init__(self,qtable_name, atomic_fns, db_id, column_names, python_column_types, sqlite_column_types, table_name_for_querying,source_type,source,planned_table_name):
        self.qtable_name = qtable_name
        self.atomic_fns = atomic_fns
        self.db_id = db_id
        self.column_names = column_names
        self.python_column_types = python_column_types
        self.table_name_for_querying = table_name_for_querying
        self.source_type = source_type
        self.source = source
        self.planned_table_name = planned_table_name

        # Fall back to deriving the sqlite types from the python types when they are not provided
        if sqlite_column_types is None:
            type_names = Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES
            sqlite_column_types = [type_names[python_type].lower() for python_type in python_column_types]
        self.sqlite_column_types = sqlite_column_types

    def get_table_name_for_querying(self):
        return self.table_name_for_querying

    def __str__(self):
        return "MaterializedStateTableStructure<%s>" % (vars(self),)
    __repr__ = __str__

class TableCreator(object):
    def __str__(self):
        return "TableCreator<%s>" % str(self)
    __repr__ = __str__

    def __init__(self, qtable_name, delimited_file_reader,input_params,sqlite_db=None,target_sqlite_table_name=None):
        """Set up a table creator in the INITIALIZED state.

        qtable_name and delimited_file_reader identify what is loaded and
        where rows are read from. input_params supplies the parsing settings
        that are copied onto this instance. sqlite_db and
        target_sqlite_table_name identify where the rows are written to.
        """
        # What is being loaded and where the rows come from
        self.qtable_name = qtable_name
        self.delimited_file_reader = delimited_file_reader

        # Target database. Despite the None default, a database object is
        # effectively required, since its db_id is read right away
        self.sqlite_db = sqlite_db
        self.db_id = sqlite_db.db_id
        self.target_sqlite_table_name = target_sqlite_table_name

        # Parsing settings, copied from the input parameters
        self.skip_header = input_params.skip_header
        self.gzipped = input_params.gzipped_input
        self.table_created = False
        self.encoding = input_params.input_encoding
        self.mode = input_params.parsing_mode
        self.expected_column_count = input_params.expected_column_count
        self.input_delimiter = input_params.delimiter
        self.with_universal_newlines = input_params.with_universal_newlines

        self.column_inferer = TableColumnInferer(input_params)

        # Rows seen before the table exists, and rows waiting to be inserted
        self.pre_creation_rows = []
        self.buffered_inserts = []
        self.effective_column_names = None

        # Indices of the columns that contain numeric types. Initialized lazily,
        # so the column inferer can do its work before this information is needed
        self.numeric_column_indices = None

        self.state = TableCreatorState.INITIALIZED
        self.content_signature = None

    def _generate_content_signature(self):
        """Build the content signature of the analyzed input.

        The signature combines the parsing settings of this instance, the
        signature of the column inferer and the size and last modification
        time hashes reported by the delimited file reader.

        Returns:
            An OrderedDict with the signature entries, in a fixed order.

        Raises:
            Exception: if the table has not been analyzed yet.
        """
        if self.state != TableCreatorState.ANALYZED:
            # TODO Change to assertion
            raise Exception('Bug - Wrong state %s. Table needs to be analyzed before a content signature can be calculated' % self.state)

        size = self.delimited_file_reader.get_size_hash()
        last_modification_time = self.delimited_file_reader.get_last_modification_time_hash()

        # Built from a sequence of pairs, so the key order is guaranteed by
        # OrderedDict itself and does not depend on dict literal ordering
        return OrderedDict([
            ("_signature_version", "v1"),
            ("skip_header", self.skip_header),
            ("gzipped", self.gzipped),
            ("with_universal_newlines", self.with_universal_newlines),
            ("encoding", self.encoding),
            ("mode", self.mode),
            ("expected_column_count", self.expected_column_count),
            ("input_delimiter", self.input_delimiter),
            ("inferer", self.column_inferer._generate_content_signature()),
            ("original_file_size", size),
            ("last_modification_time", last_modification_time),
        ])

    def validate_extra_header_if_needed(self, file_number, filename,col_vals):
        """Check the first line of a non-first file against the known header.

        Returns True when a header row is already known, which means that
        col_vals is an extra header line. Returns False when headers are not
        skipped, when this is the first file, or when no header is known yet.

        Raises:
            BadHeaderException: if col_vals differs from the known header.
        """
        xprint("HHX validate",file_number,filename,col_vals)
        if not self.skip_header:
            xprint("No need to validate header")
            return False

        if file_number == 0:
            xprint("First file, no need to validate extra header")
            return False

        known_header = self.column_inferer.header_row
        if known_header is None:
            xprint("Header doesn't already exist")
            return False

        xprint("Validating extra header")
        if tuple(known_header) != tuple(col_vals):
            raise BadHeaderException("Extra header '{}' in file '{}' mismatches original header '{}' from file '{}'. Table name is '{}'".format(
                ",".join(col_vals),filename,
                ",".join(self.column_inferer.header_row),
                self.column_inferer.header_row_filename,
                self.qtable_name))
        xprint("header already exists: %s" % self.column_inferer.header_row)
        return True

    def _populate(self,dialect,stop_after_analysis=False):
        """Read rows from the delimited file reader and insert them into the table.

        With stop_after_analysis, reading stops as soon as the column inferer
        reports that it has inferred the columns. Buffered inserts are flushed
        on every exit path. The dialect parameter is not used here.

        Raises:
            MissingHeaderException: a header is expected but no lines were read.
            ColumnCountMismatchException: translated from the strict/fluffy mode
                column count mismatch exceptions.
        """
        total_data_lines_read = 0
        reader = self.delimited_file_reader
        try:
            for file_name,file_number,is_first_line,col_vals in reader.generate_rows():
                # The first line of a file may be an extra header which needs to be skipped
                if is_first_line and self.validate_extra_header_if_needed(file_number,file_name,col_vals):
                    continue
                self._insert_row(file_name, col_vals)
                if stop_after_analysis and self.column_inferer.inferred:
                    xprint("Stopping after analysis")
                    return
            if reader.get_lines_read() == 0 and self.skip_header:
                raise MissingHeaderException("Header line is expected but missing in file %s" % ",".join(reader.atomic_fns))

            total_data_lines_read += reader.lines_read - (1 if self.skip_header else 0)
            xprint("Total Data lines read %s" % total_data_lines_read)
        except StrictModeColumnCountMismatchException as e:
            raise ColumnCountMismatchException(
                'Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed modes or check your delimiter' % (
                e.expected_col_count, e.actual_col_count, normalized_filename(e.atomic_fn), e.lines_read))
        except FluffyModeColumnCountMismatchException as e:
            raise ColumnCountMismatchException(
                'Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % (
                normalized_filename(e.atomic_fn), e.lines_read, e.actual_col_count, e.expected_col_count))
        finally:
            # Runs on normal completion, on the early return and on errors
            self._flush_inserts()

        # All the rows were read without the table being created, so force its creation
        if not self.table_created:
            self.column_inferer.force_analysis()
            self._do_create_table(self.qtable_name)

        self.sqlite_db.conn.commit()

    def perform_analyze(self, dialect):
        """Analyze the input and move from the INITIALIZED to the ANALYZED state.

        After the analysis, the content signature of the input is generated
        and stored on the instance.

        Raises:
            Exception: if the table creator is not in the INITIALIZED state.
        """
        xprint("Analyzing... %s" % dialect)
        if self.state != TableCreatorState.INITIALIZED:
            # TODO Convert to assertion
            raise Exception('Bug - Wrong state %s' % self.state)

        self._populate(dialect,stop_after_analysis=True)
        self.state = TableCreatorState.ANALYZED

        self.content_signature = self._generate_content_signature()
        content_signature_key = self.sqlite_db.calculate_content_signature_key(self.content_signature)
        xprint("Setting content signature after analysis: %s" % content_signature_key)

    def perform_read_fully(self, dialect):
        """Read the entire input and move from the ANALYZED to the FULLY_READ state.

        Raises:
            Exception: if the table creator is not in the ANALYZED state.
        """
        if self.state != TableCreatorState.ANALYZED:
            # TODO Convert to assertion
            raise Exception('Bug - Wrong state %s' % self.state)

        self._populate(dialect,stop_after_analysis=False)
        self.state = TableCreatorState.FULLY_READ

    def _flush_pre_creation_rows(self, filename):
        for i, col_vals in enumerate(self.pre_creation_rows):
            if self.skip_header and i == 0:
                # skip header line
                continue
            self._insert_row(filename, col_vals)
        self._flush_inserts()
        self.pre_creation_rows = []

    def _insert_row(self, filename, col_vals):
        # If table has not been created yet
        if not self.table_created:
            # Try to create it along with another "example" line of data
            self.try_to_create_table(filename, col_vals)

        # If the table is still not created, then we don't have enough data, just
        # store the data and return
        if not self.table_created:
            self.pre_creation_rows.append(col_vals)
            return


        # The table already exists, so we can just add a new row
        self._insert_row_i(col_vals)

    def initialize_numeric_column_indices_if_needed(self):
        # Lazy initialization of numeric column indices
        if self.numeric_column_indices is None:
            column_types = self.column_inferer.get_column_types()
            self.numeric_column_indices = [idx for idx, column_type in enumerate(
                column_types) if self.sqlite_db.is_numeric_type(column_type)]

    def nullify_values_if_needed(self, col_vals):
        new_vals = col_vals[:]
        col_count = len(col_vals)
        for i in self.numeric_column_indices:
            if i >= col_count:
                continue
            v = col_vals[i]
            if v == '':
                new_vals[i] = None
        return new_vals

    def normalize_col_vals(self, col_vals):
        # Make sure that numeric column indices are initializd
        self.initialize_numeric_column_indices_if_needed()

        col_vals = self.nullify_values_if_needed(col_vals)

        expected_col_count = self.column_inferer.get_column_count()
        actual_col_count = len(col_vals)
        if self.mode == 'strict':
            if actual_col_count != expected_col_count:
                raise StrictModeColumnCountMismatchException(",".join(self.delimited_file_reader.atomic_fns), expected_col_count,actual_col_count,self.delimited_file_reader.get_lines_read())
            return col_vals

        # in all non strict mode, we add dummy data to missing columns

        if actual_col_count < expected_col_count:
            col_vals = col_vals + \
                [None for x in range(expected_col_count - actual_col_count)]

        # in relaxed mode, we merge all extra columns to the last column value
        if self.mode == 'relaxed':
            if actual_col_count > expected_col_count:
                xxx = col_vals[:expected_col_count - 1] + \
                    [self.input_delimiter.join([v if v  is not None else '' for v in
                        col_vals[expected_col_count - 1:]])]
                return xxx
            else:
                return col_vals

        assert False, "Unidentified parsing mode %s" % self.mode

    def _insert_row_i(self, col_vals):
        col_vals = self.normalize_col_vals(col_vals)

        if self.effective_column_names is None:
            self.effective_column_names = self.column_inferer.column_names[:len(col_vals)]

        if len(self.effective_column_names) > 0:
            self.buffered_inserts.append(col_vals)
        else:
            self.buffered_inserts.append([""])

        if len(self.buffered_inserts) < 5000:
            return
        self._flush_inserts()

    def _flush_inserts(self):
        # If the table is still not created, then we don't have enough data
        if not self.table_created:
            return

        if len(self.buffered_inserts) > 0:
            insert_row_stmt = self.sqlite_db.generate_insert_row(
                self.target_sqlite_table_name, self.effective_column_names)

            self.sqlite_db.update_many(insert_row_stmt, self.buffered_inserts)
        self.buffered_inserts = []

    def try_to_create_table(self, filename, col_vals):
        if self.table_created:
            # TODO Convert to assertion
            raise Exception('Table is already created')

        # Add that line to the column inferer
        result = self.column_inferer.analyze(filename, col_vals)
        # If inferer succeeded,
        if result:
            self._do_create_table(filename)
        else:
            pass  # We don't have enough information for creating the table yet

    def _do_create_table(self,filename):
        # Get the column definition dict from the inferer
        column_dict = self.column_inferer.get_column_dict()

        # Guard against empty tables (instead of preventing the creation, just create with a dummy column)
        if len(column_dict) == 0:
            column_dict = { 'dummy_column_for_empty_tables' : str }
            ordered_column_names = [ 'dummy_column_for_empty_tables' ]
        else:
            ordered_column_names = self.column_inferer.get_column_names()

        # Create the CREATE TABLE statement
        create_table_stmt = self.sqlite_db.generate_create_table(
            self.target_sqlite_table_name, ordered_column_names, column_dict)
        # And create the table itself
        self.sqlite_db.execute_and_fetch(create_table_stmt)
        # Mark the table as created
        self.table_created = True
        self._flush_pre_creation_rows(filename)


def determine_max_col_lengths(m,output_field_quoting_func,output_delimiter):
    """Return the maximum length of the quoted output values of each column.

    m is a sequence of rows. The number of columns is taken from the first
    row. Each value is passed through output_field_quoting_func, along with
    output_delimiter, before its length is measured. An empty m gives [].
    """
    if len(m) == 0:
        return []

    column_count = len(m[0])
    max_lengths = [0] * column_count
    for row in m:
        for col_index in range(column_count):
            # TODO Optimize this
            rendered = "{}".format(output_field_quoting_func(output_delimiter,row[col_index]))
            max_lengths[col_index] = max(max_lengths[col_index], len(rendered))
    return max_lengths

def print_credentials():
    """Print the q version, the Python version and the copyright details to stderr."""
    python_version = " // ".join([str(x).strip() for x in sys.version.split("\n")])
    credential_lines = (
        "q version %s" % q_version,
        "Python: %s" % python_version,
        "Copyright (C) 2012-2021 Harel Ben-Attia (harelba@gmail.com, @harelba on twitter)",
        "https://harelba.github.io/q/",
    )
    for line in credential_lines:
        print(line, file=sys.stderr)
    # Trailing empty line
    print(file=sys.stderr)

class QWarning(object):
    """Plain holder for a warning: the related exception and a message."""
    def __init__(self,exception,msg):
        self.exception = exception
        self.msg = msg

class QError(object):
    """Holder for an error: the exception, a message, an error code and a traceback.

    The traceback text is captured with traceback.format_exc() at the time
    the instance is created.
    """
    def __init__(self,exception,msg,errorcode):
        self.exception = exception
        self.msg = msg
        self.errorcode = errorcode
        self.traceback = traceback.format_exc()

    def __str__(self):
        details = (self.errorcode,self.msg,self.exception,str(self.traceback))
        return "QError<errorcode=%s,msg=%s,exception=%s,traceback=%s>" % details
    __repr__ = __str__

class QMetadata(object):
    """Holder for the metadata of a query result.

    Args:
        table_structures: table structures dictionary. When omitted (or None),
            a new empty dict is created for this instance.
        new_table_structures: dictionary of the newly loaded table structures.
            When omitted (or None), a new empty dict is created for this instance.
        output_column_name_list: the names of the output columns, if any.
    """
    def __init__(self,table_structures=None,new_table_structures=None,output_column_name_list=None):
        # Fresh dicts are created per instance. A dict literal as the default
        # value would be a single object shared by all the instances that
        # rely on the default, so a change through one would leak to the others
        self.table_structures = table_structures if table_structures is not None else {}
        self.new_table_structures = new_table_structures if new_table_structures is not None else {}
        self.output_column_name_list = output_column_name_list

    def __str__(self):
        # The closing bracket was missing from the original format string
        return "QMetadata<%s>" % (self.__dict__)
    __repr__ = __str__

class QOutput(object):
    """Holder for the outcome of a query: data, metadata, warnings and error.

    The status attribute is 'ok' when no error is given, and 'error' otherwise.

    Args:
        data: the result rows, or None.
        metadata: the metadata of the result, or None.
        warnings: list of warnings. When omitted (or None), a new empty list
            is created for this instance.
        error: an error object which has a msg attribute, or None.
    """
    def __init__(self,data=None,metadata=None,warnings=None,error=None):
        self.data = data
        self.metadata = metadata

        # A fresh list per instance. A list literal as the default value would
        # be shared between all the instances that rely on the default
        self.warnings = warnings if warnings is not None else []
        self.error = error
        if error is None:
            self.status = 'ok'
        else:
            self.status = 'error'

    def __str__(self):
        s = []
        s.append('status=%s' % self.status)
        if self.error is not None:
            s.append("error=%s" % self.error.msg)
        if len(self.warnings) > 0:
            s.append("warning_count=%s" % len(self.warnings))
        if self.data is not None:
            s.append("row_count=%s" % len(self.data))
        else:
            s.append("row_count=None")
        if self.metadata is not None:
            s.append("metadata=<%s>" % self.metadata)
        else:
            s.append("metadata=None")
        return "QOutput<%s>" % ",".join(s)
    __repr__ = __str__

class QInputParams(object):
    """The set of parameters that controls how input data is read and parsed.

    The stdin_file and stdin_filename parameters are accepted by the
    constructor, but they are not stored on the instance.
    """
    def __init__(self,skip_header=False,
            delimiter=' ',input_encoding='UTF-8',gzipped_input=False,with_universal_newlines=False,parsing_mode='relaxed',
            expected_column_count=None,keep_leading_whitespace_in_values=False,
            disable_double_double_quoting=False,disable_escaped_double_quoting=False,
            disable_column_type_detection=False,
            input_quoting_mode='minimal',stdin_file=None,stdin_filename='-',
            max_column_length_limit=131072,
            read_caching=False,
            write_caching=False,
            max_attached_sqlite_databases = 10):
        # The assignment order is kept stable, since __str__ dumps __dict__
        # and its output follows the attribute creation order

        # Basic file format
        self.skip_header = skip_header
        self.delimiter = delimiter
        self.input_encoding = input_encoding
        self.gzipped_input = gzipped_input
        self.with_universal_newlines = with_universal_newlines

        # Column handling
        self.parsing_mode = parsing_mode
        self.expected_column_count = expected_column_count
        self.keep_leading_whitespace_in_values = keep_leading_whitespace_in_values

        # Quoting and type detection
        self.disable_double_double_quoting = disable_double_double_quoting
        self.disable_escaped_double_quoting = disable_escaped_double_quoting
        self.input_quoting_mode = input_quoting_mode
        self.disable_column_type_detection = disable_column_type_detection

        # Limits and caching
        self.max_column_length_limit = max_column_length_limit
        self.read_caching = read_caching
        self.write_caching = write_caching
        self.max_attached_sqlite_databases = max_attached_sqlite_databases

    def merged_with(self,input_params):
        """Return a new instance: a copy of self, overridden by input_params.

        When input_params is None, the result is just a copy of self.
        Otherwise every attribute of input_params replaces the matching one.
        """
        merged = QInputParams(**vars(self))
        if input_params is None:
            return merged
        merged.__dict__.update(**vars(input_params))
        return merged

    def __str__(self):
        attributes = str(self.__dict__)
        return "QInputParams<%s>" % attributes

    def __repr__(self):
        return "QInputParams(...)"

class DataStream(object):
    """Plain holder for a data stream: its id, its filename and the stream itself."""
    # TODO Can stream-id be removed?
    def __init__(self,stream_id,filename,stream):
        self.stream_id = stream_id
        self.filename = filename
        self.stream = stream

    def __str__(self):
        details = (self.stream_id,self.filename,self.stream)
        return "QDataStream<stream_id=%s,filename=%s,stream=%s>" % details
    __repr__ = __str__


class DataStreams(object):
    """A validated dictionary of DataStream objects, keyed by strings."""
    def __init__(self, data_streams_dict):
        assert type(data_streams_dict) == dict
        self.validate(data_streams_dict)
        self.data_streams_dict = data_streams_dict

    def validate(self,d):
        """Raise an Exception unless every key is a str and every value is a DataStream."""
        for k in d:
            v = d[k]
            if not (type(k) == str and type(v) == DataStream):
                raise Exception('Bug - Invalid dict: %s' % str(d))

    def get_for_filename(self, filename):
        """Return the data stream stored under filename, or None if there is none."""
        xprint("Data streams dict is %s. Trying to find %s" % (self.data_streams_dict,filename))
        return self.data_streams_dict.get(filename)

    def is_data_stream(self,filename):
        """Tell whether a data stream is stored under filename."""
        return filename in self.data_streams_dict

class DatabaseInfo(object):
    """Plain holder for a database: its id, its database object and a needs_closing flag."""
    def __init__(self,db_id,sqlite_db,needs_closing):
        self.db_id = db_id
        self.sqlite_db = sqlite_db
        self.needs_closing = needs_closing

    def __str__(self):
        details = (self.sqlite_db,self.needs_closing)
        return "DatabaseInfo<sqlite_db=%s,needs_closing=%s>" % details
    __repr__ = __str__

class QTextAsData(object):
    def __init__(self,default_input_params=QInputParams(),data_streams_dict=None):
        """Create an engine with its own query-level and adhoc in-memory databases.

        default_input_params are stored as the defaults of this engine.
        data_streams_dict, when given, is wrapped in a DataStreams object;
        otherwise an empty DataStreams object is used.
        """
        # A unique id, which is used as part of the database ids of this engine
        self.engine_id = str(uuid.uuid4()).replace("-","_")

        self.default_input_params = default_input_params
        xprint("Default input params: %s" % self.default_input_params)

        self.loaded_table_structures_dict = OrderedDict()
        self.databases = OrderedDict()

        self.data_streams = DataStreams(data_streams_dict if data_streams_dict is not None else {})

        # Create the query-level database
        self.query_level_db_id = 'query_e_%s' % self.engine_id
        query_level_db_name = 'file:%s?mode=memory&cache=shared' % self.query_level_db_id
        self.query_level_db = Sqlite3DB(self.query_level_db_id,query_level_db_name,'<query-level-db>',create_qcatalog=True)

        # Create the adhoc database and attach it to the query-level one
        self.adhoc_db_id = 'adhoc_e_%s' % self.engine_id
        self.adhoc_db_name = 'file:%s?mode=memory&cache=shared' % self.adhoc_db_id
        self.adhoc_db = Sqlite3DB(self.adhoc_db_id,self.adhoc_db_name,'<adhoc-db>',create_qcatalog=True)
        self.query_level_db.conn.execute("attach '%s' as %s" % (self.adhoc_db_name,self.adhoc_db_id))

        # Both databases are registered as databases that need closing
        for db_id, sqlite_db in ((self.query_level_db_id,self.query_level_db),(self.adhoc_db_id,self.adhoc_db)):
            self.add_db_to_database_list(DatabaseInfo(db_id,sqlite_db,needs_closing=True))

    def done(self):
        """Close the registered databases that need closing, in reverse registration order."""
        xprint("Inside done: Database list is %s" % self.databases)
        for db_id in reversed(self.databases.keys()):
            database_info = self.databases[db_id]
            if not database_info.needs_closing:
                xprint("No need to close database %s" % db_id)
                continue

            xprint("Gonna close database %s - %s" % (db_id,database_info))
            database_info.sqlite_db.done()
            xprint("Database %s has been closed" % db_id)
        xprint("Closed all databases")

    # Maps the supported input quoting mode names to the quoting constants of the csv module
    input_quoting_modes = {   'minimal' : csv.QUOTE_MINIMAL,
                        'all' : csv.QUOTE_ALL,
                        # nonnumeric is not supported for input quoting modes, since we determine the data types
                        # ourselves instead of letting the csv module try to identify the types
                        'none' : csv.QUOTE_NONE }

    def determine_proper_dialect(self,input_params):
        """Build the csv dialect settings dictionary for the given input params.

        The result always contains skipinitialspace, delimiter, quotechar,
        quoting and doublequote. An escapechar of a backslash is added when
        input_params.disable_escaped_double_quoting is set.
        """
        quoting = QTextAsData.input_quoting_modes[input_params.input_quoting_mode]

        dialect = {
            # Leading whitespace is skipped, unless asked to keep it
            'skipinitialspace': not input_params.keep_leading_whitespace_in_values,
            'delimiter': input_params.delimiter,
            'quotechar': '"',
        }
        dialect['quoting'] = quoting
        # NOTE(review): the disable_* flags are used as they are, and not negated.
        # Presumably the command line option parsing inverts them - confirm
        # against the code that creates the input params
        dialect['doublequote'] = input_params.disable_double_double_quoting

        if input_params.disable_escaped_double_quoting:
            dialect['escapechar'] = '\\'

        return dialect

    def get_dialect_id(self,filename):
        """Return the dialect id for filename, which is 'q_dialect_' followed by the filename."""
        return 'q_dialect_%s' % filename

    def _open_files_and_get_mfss(self,qtable_name,input_params,dialect):
        """Create the materialized state object that matches the source of qtable_name.

        The source is detected by detect_qtable_name_source_info, and one
        state object is created according to the detected type.

        Returns:
            A list which contains the single materialized state object.
        """
        materialized_state_type,table_source_type,source_info = detect_qtable_name_source_info(qtable_name,self.data_streams,read_caching_enabled=input_params.read_caching)
        xprint("Detected source type %s source info %s" % (materialized_state_type,source_info))

        if materialized_state_type == MaterializedStateType.DATA_STREAM:
            (data_stream,) = source_info
            effective_qtable_name = data_stream.stream_id
            ms = MaterialiedDataStreamState(table_source_type,qtable_name,input_params,dialect,self.engine_id,data_stream,stream_target_db=self.adhoc_db)
        elif materialized_state_type == MaterializedStateType.QSQL_FILE:
            (qsql_filename,table_name) = source_info
            effective_qtable_name = '%s:::%s' % (qsql_filename, table_name)
            ms = MaterializedQsqlState(table_source_type,qtable_name, qsql_filename=qsql_filename, table_name=table_name,
                                       engine_id=self.engine_id, input_params=input_params, dialect_id=dialect)
        elif materialized_state_type == MaterializedStateType.SQLITE_FILE:
            (sqlite_filename,table_name) = source_info
            effective_qtable_name = '%s:::%s' % (sqlite_filename, table_name)
            ms = MaterializedSqliteState(table_source_type,qtable_name, sqlite_filename=sqlite_filename, table_name=table_name,
                                       engine_id=self.engine_id)
        elif materialized_state_type == MaterializedStateType.DELIMITED_FILE:
            (source_qtable_name,_) = source_info
            effective_qtable_name = source_qtable_name
            ms = MaterializedDelimitedFileState(table_source_type,source_qtable_name, input_params, dialect, self.engine_id)
        else:
            assert False, "Unknown file type for qtable %s should have exited with an exception" % (qtable_name)

        # A single entry dictionary, keyed by the effective qtable name
        materialized_file_dict = OrderedDict()
        assert effective_qtable_name not in materialized_file_dict
        materialized_file_dict[effective_qtable_name] = ms

        xprint("MS dict: %s" % str(materialized_file_dict))

        return list(materialized_file_dict.values())

    def _load_mfs(self,mfs,input_params,dialect_id,stop_after_analysis):
        """Load a materialized state object and make its data available for querying.

        The state is initialized, a database is chosen for it, and its data is
        made available. Unless the adhoc db is used, or copying is preferred
        over attaching, the chosen database is attached to the query-level db
        and registered in the database list. The dialect_id parameter is not
        used here.

        Returns:
            A tuple of (mfs.source, mfs.source_type).
        """
        xprint("Loading MFS:", mfs)

        materialized_state_type = mfs.get_materialized_state_type()
        xprint("Detected materialized state type for %s: %s" % (mfs.qtable_name,materialized_state_type))

        mfs.initialize()

        # Data streams never get a forced db. Other sources are forced into the
        # adhoc db when only analyzing, or when copying is preferred over attaching
        forced_db_to_use = None
        if not materialized_state_type in [MaterializedStateType.DATA_STREAM]:
            if stop_after_analysis or self.should_copy_instead_of_attach(input_params):
                xprint("Should copy instead of attaching. Forcing db to use to adhoc db")
                forced_db_to_use = self.adhoc_db

        mfs.choose_db_to_use(forced_db_to_use,stop_after_analysis)
        xprint("Chosen db to use: source %s source_type %s db_id %s db_to_use %s" % (mfs.source,mfs.source_type,mfs.db_id,mfs.db_to_use))

        # The second element of the result is not needed here
        database_info,_ = mfs.make_data_available(stop_after_analysis)

        if not self.is_adhoc_db(mfs.db_to_use) and not self.should_copy_instead_of_attach(input_params):
            if not self.already_attached_to_query_level_db(mfs.db_to_use):
                self.attach_to_db(mfs.db_to_use, self.query_level_db)
                self.add_db_to_database_list(database_info)
            else:
                # The message used to be printed with a raw %s, since the argument was missing
                xprint("DB %s is already attached to query level db. No need to attach it again." % mfs.db_to_use.db_id)

        mfs.finalize()

        xprint("MFS Loaded")

        return mfs.source,mfs.source_type

    def add_db_to_database_list(self,database_info):
        """Register a database under its db_id.

        Registering the very same database object again is a no-op.

        Raises:
            Exception: if a different database object is already registered
                under the same db_id.
        """
        db_id = database_info.db_id
        assert db_id is not None
        assert database_info.sqlite_db is not None

        existing = self.databases[db_id] if db_id in self.databases else None
        if existing is None:
            self.databases[db_id] = database_info
            return

        # TODO Convert to assertion
        if database_info.sqlite_db is not existing.sqlite_db:
            raise Exception('Bug - database already in database list: db_id %s: old %s new %s' % (db_id,self.databases[db_id],database_info))

    def is_adhoc_db(self,db_to_use):
        """Tell whether db_to_use is the adhoc db of this engine, by comparing db ids."""
        return db_to_use.db_id == self.adhoc_db_id

    def should_copy_instead_of_attach(self,input_params):
        """Tell whether data should be copied instead of attaching another database.

        True once the number of entries in the database list of the query-level
        db reaches input_params.max_attached_sqlite_databases.
        """
        database_list = self.query_level_db.get_sqlite_database_list()
        attached_database_count = len(database_list)
        should_copy = attached_database_count >= input_params.max_attached_sqlite_databases
        xprint("should_copy_instead_of_attach: attached_database_count=%s should_copy=%s" % (attached_database_count,should_copy))
        return should_copy

    def _load_data(self,qtable_name,input_params=QInputParams(),stop_after_analysis=False):
        """Load the data of qtable_name, unless it has already been loaded.

        A csv dialect is registered for the table and its materialized state
        object is created on every call, before the loaded tables are checked.

        Returns:
            The structure of the newly loaded table, or None if the table
            has already been loaded.
        """
        xprint("Attempting to load data for materialized file names %s" % qtable_name)

        # Register the csv dialect which matches the input params
        q_dialect = self.determine_proper_dialect(input_params)
        xprint("Dialect is %s" % q_dialect)
        dialect_id = self.get_dialect_id(qtable_name)
        csv.register_dialect(dialect_id, **q_dialect)

        xprint("qtable metadata for loading is %s" % qtable_name)
        mfss = self._open_files_and_get_mfss(qtable_name, input_params, dialect_id)
        assert len(mfss) == 1, "one MS now encapsulated an entire table"
        mfs = mfss[0]

        xprint("MFS to load: %s" % mfs)

        if qtable_name in self.loaded_table_structures_dict:
            xprint("Atomic filename %s found. no need to load" % qtable_name)
            return None

        xprint("qtable %s not found - loading" % qtable_name)

        self._load_mfs(mfs, input_params, dialect_id, stop_after_analysis)
        xprint("Loaded: source-type %s source %s mfs_structure %s" % (mfs.source_type, mfs.source, mfs.mfs_structure))

        assert qtable_name not in self.loaded_table_structures_dict, "loaded_table_structures_dict has been changed to have a non-list value"
        self.loaded_table_structures_dict[qtable_name] = mfs.mfs_structure

        return mfs.mfs_structure

    def already_attached_to_query_level_db(self,db_to_attach):
        """Tell whether the db id of db_to_attach appears in the database list of the query-level db."""
        # The second element of each database list entry is compared to the db id
        attached_db_ids = [entry[1] for entry in self.query_level_db.get_sqlite_database_list()]
        return db_to_attach.db_id in attached_db_ids

    def attach_to_db(self, target_db, source_db):
        """Attach target_db to source_db, under the name target_db.db_id.

        The attach statement is executed on source_db, and it refers to
        target_db through its sqlite_db_url.

        Raises:
            TooManyAttachedDatabasesException: if sqlite reports that there are
                too many attached databases.
            SqliteOperationalErrorException: for any other operational error,
                which is propagated as it is.
        """
        q = "attach '%s' as %s" % (target_db.sqlite_db_url,target_db.db_id)
        xprint("Attach query: %s" % q)
        try:
            source_db.execute_and_fetch(q)
        except SqliteOperationalErrorException as e:
            if 'too many attached databases' in str(e):
                raise TooManyAttachedDatabasesException('There are too many attached databases. Use a proper --max-attached-sqlite-databases parameter which is below the maximum. Original error: %s' % str(e))
            # Other operational errors used to be swallowed silently here, leaving
            # the database unattached while the caller carried on. Re-raise them
            raise

    def detach_from_db(self, target_db, source_db):
        """Detach the database named target_db.db_id from source_db.

        Errors raised while executing the statement propagate to the caller.
        """
        detach_query = "detach %s" % (target_db.db_id)
        xprint("Detach query: %s" % detach_query)
        source_db.execute_and_fetch(detach_query)

    def load_data(self,filename,input_params=QInputParams(),stop_after_analysis=False):
        """Load the data of filename by delegating to _load_data, returning its result."""
        return self._load_data(filename,input_params,stop_after_analysis=stop_after_analysis)

    def _ensure_data_is_loaded_for_sql(self,sql_object,input_params,data_streams=None,stop_after_analysis=False):
        """Load the data of every table name that sql_object refers to.

        The data_streams parameter is not used here.

        Returns:
            An OrderedDict which maps the table names that were loaded by this
            call to their structures. Tables that were already loaded are
            not included.
        """
        xprint("Ensuring Data load")
        new_table_structures = OrderedDict()

        for qtable_name in sql_object.qtable_names:
            tss = self._load_data(qtable_name,input_params,stop_after_analysis=stop_after_analysis)
            if tss is None:
                # The table has already been loaded before
                continue
            xprint("New Table Structures:",new_table_structures)
            assert qtable_name not in new_table_structures, "new_table_structures was changed not to contain a list as a value"
            new_table_structures[qtable_name] = tss

        return new_table_structures

    def materialize_query_level_db(self,save_db_to_disk_filename,sql_object):
        """Copy the tables that the query uses into a sqlite database on disk.

        For each table of the query, the database that holds it is attached to
        the materialized database (unless it is the query-level db), the table
        is copied under a new name, and the database is detached again.

        Returns:
            An OrderedDict which maps each effective table name (in the format
            db_id.table_name) to the name of its copy in the materialized database.
        """
        # TODO More robust creation - Create the file in a separate folder and move it to the target location only after success

        materialized_db = Sqlite3DB("materialized","file:%s" % save_db_to_disk_filename,save_db_to_disk_filename,create_qcatalog=False)
        table_name_mapping = OrderedDict()

        # For each table in the query
        effective_table_names = sql_object.get_qtable_name_effective_table_names()

        for qtable_name in effective_table_names:
            # table name, in the format db_id.table_name
            effective_table_name_for_qtable_name = effective_table_names[qtable_name]

            source_db_id, actual_table_name_in_db = effective_table_name_for_qtable_name.split(".", 1)
            # The DatabaseInfo instance for this db
            source_database = self.databases[source_db_id]

            # The same condition guards both the attach and the detach. The detach
            # used to compare the db id with the database object itself, which is
            # always unequal, so a detach was attempted even without an attach
            needs_attaching = source_db_id != self.query_level_db_id
            if needs_attaching:
                self.attach_to_db(source_database.sqlite_db,materialized_db)

            ts = self.loaded_table_structures_dict[qtable_name]
            proposed_new_table_name = ts.planned_table_name
            xprint("Proposed table name is %s" % proposed_new_table_name)

            new_table_name = materialized_db.find_new_table_name(proposed_new_table_name)

            xprint("Materializing",source_db_id,actual_table_name_in_db,"as",new_table_name)
            # Copy the table into the materialized database
            materialized_db.execute_and_fetch('CREATE TABLE %s AS SELECT * FROM %s' % (new_table_name,effective_table_name_for_qtable_name))

            table_name_mapping[effective_table_name_for_qtable_name] = new_table_name

            # TODO RLRL Preparation for writing materialized database as a qsql file
            # if source_database.sqlite_db.qcatalog_table_exists():
            #     qcatalog_entry = source_database.sqlite_db.get_from_qcatalog_using_table_name(actual_table_name_in_db)
            #     # TODO RLRL Encapsulate dictionary transform inside qcatalog access methods
            #     materialized_db.add_to_qcatalog_table(new_table_name,OrderedDict(json.loads(qcatalog_entry['content_signature'])),
            #                                           qcatalog_entry['creation_time'],
            #                                           qcatalog_entry['source_type'],
            #                                           qcatalog_entry['source_type'])
            #     xprint("PQX Added to qcatalog",source_db_id,actual_table_name_in_db,'as',new_table_name)
            # else:
            #     xprint("PQX Skipped adding to qcatalog",source_db_id,actual_table_name_in_db)

            if needs_attaching:
                self.detach_from_db(source_database.sqlite_db,materialized_db)

        return table_name_mapping

    def validate_query(self,sql_object,table_structures):
        """
        Check that the table structures backing every qtable of the query agree
        on their column names and column types.

        :param sql_object: the parsed query; only its `qtable_names` are read here
        :param table_structures: mapping of qtable name to its table structure
        :raises BadHeaderException: if two structures of the same qtable differ
        """
        for qtable_name in sql_object.qtable_names:
            # A single structure is looked up per qtable name, so the comparison
            # branches below can only fire if this list ever holds more than one entry
            candidates = [table_structures[qtable_name]]

            agreed_names = None
            agreed_types = None
            for structure in candidates:
                current_names = structure.column_names
                current_types = structure.python_column_types

                xprint("Comparing column names: %s with %s" % (agreed_names,current_names))
                if agreed_names is None:
                    agreed_names = current_names
                elif agreed_names != current_names:
                    raise BadHeaderException("Column names differ for table %s: %s vs %s" % (
                        qtable_name, ",".join(agreed_names), ",".join(current_names)))

                xprint("Comparing column types: %s with %s" % (agreed_types,current_types))
                if agreed_types is None:
                    agreed_types = current_types
                elif agreed_types != current_types:
                    raise BadHeaderException("Column types differ for table %s: %s vs %s" % (
                        qtable_name, ",".join(agreed_types), ",".join(current_types)))

                xprint("All column names match for qtable name %s: column names: %s column types: %s" % (structure.qtable_name,agreed_names,agreed_types))

        xprint("Query validated")

    def _execute(self,query_str,input_params=None,data_streams=None,stop_after_analysis=False,save_db_to_disk_filename=None):
        """
        Load the data required by a query, validate it and run it, translating
        every known failure into a QError carried by the returned QOutput.

        :param query_str: the query text. If it is not a unicode string, a utf-8
                          decode is attempted before giving up with error code 91
        :param input_params: per-call input parameters, merged over `self.default_input_params`
        :param data_streams: passed as-is to `_ensure_data_is_loaded_for_sql`
        :param stop_after_analysis: passed as-is to `_ensure_data_is_loaded_for_sql`
        :param save_db_to_disk_filename: when not None, the query data is materialized
                          into that file, instructions are printed to stderr, and an
                          empty QOutput is returned without running the query
        :return: a QOutput. On failure its `data` is None and `error`/`warnings` are set
        """
        warnings = []
        error = None

        db_results_obj = None

        effective_input_params = self.default_input_params.merged_with(input_params)

        if type(query_str) != unicode:
            try:
                # Heuristic attempt to auto convert the query to unicode before failing
                query_str = query_str.decode('utf-8')
            except Exception:
                # Only real failures of the heuristic are reported; interrupts propagate
                error = QError(EncodedQueryException(''),"Query should be in unicode. Please make sure to provide a unicode literal string or decode it using proper the character encoding.",91)
                return QOutput(error = error)


        try:
            # Create SQL statement
            sql_object = Sql('%s' % query_str, self.data_streams)

            load_start_time = time.time()
            iprint("Going to ensure data is loaded. Currently loaded tables: %s" % str(self.loaded_table_structures_dict))
            new_table_structures = self._ensure_data_is_loaded_for_sql(sql_object,effective_input_params,data_streams,stop_after_analysis=stop_after_analysis)
            iprint("Ensured data is loaded. loaded tables: %s" % self.loaded_table_structures_dict)

            self.validate_query(sql_object,self.loaded_table_structures_dict)

            iprint("Query validated")

            sql_object.materialize_using(self.loaded_table_structures_dict)

            iprint("Materialized sql object")

            if save_db_to_disk_filename is not None:
                xprint("Saving query data to disk")
                dump_start_time = time.time()
                table_name_mapping = self.materialize_query_level_db(save_db_to_disk_filename,sql_object)
                print("Data has been saved into %s . Saving has taken %4.3f seconds" % (save_db_to_disk_filename,time.time()-dump_start_time), file=sys.stderr)
                effective_sql = sql_object.get_effective_sql(table_name_mapping)
                print("Query to run on the database: %s;" % effective_sql, file=sys.stderr)
                command_line = 'echo "%s" | sqlite3 %s' % (effective_sql,save_db_to_disk_filename)
                print("You can run the query directly from the command line using the following command: %s" % command_line, file=sys.stderr)

                # TODO Propagate dump results using a different output class instead of an empty one
                return QOutput()

            # Ensure that adhoc db is not in the middle of a transaction
            self.adhoc_db.conn.commit()

            all_databases = self.query_level_db.get_sqlite_database_list()
            xprint("Query level db: databases %s" % all_databases)

            # Execute the query and fetch the data
            db_results_obj = sql_object.execute_and_fetch(self.query_level_db)
            iprint("Query executed")

            if len(db_results_obj.results) == 0:
                warnings.append(QWarning(None, "Warning - data is empty"))

            return QOutput(
                data = db_results_obj.results,
                metadata = QMetadata(
                    table_structures=self.loaded_table_structures_dict,
                    new_table_structures=new_table_structures,
                    output_column_name_list=db_results_obj.query_column_names),
                warnings = warnings,
                error = error)
        except InvalidQueryException as e:
            error = QError(e,str(e),118)
        except MissingHeaderException as e:
            error = QError(e,e.msg,117)
        except FileNotFoundException as e:
            error = QError(e,e.msg,30)
        except SqliteOperationalErrorException as e:
            xprint("Sqlite Operational error: %s" % e)
            msg = str(e.original_error)
            error = QError(e,"query error: %s" % msg,1)
            if "no such column" in msg and effective_input_params.skip_header:
                # The codec that strips a UTF-8 BOM is named utf-8-sig
                warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names. Another issue might be that the file contains a BOM. Files that are encoded with UTF8 and contain a BOM can be read by specifying `-e utf-8-sig` in the command line. Support for non-UTF8 encoding will be provided in the future.'))
        except ColumnCountMismatchException as e:
            error = QError(e,e.msg,2)
        except (UnicodeDecodeError, UnicodeError) as e:
            error = QError(e,"Cannot decode data. Try to change the encoding by setting it using the -e parameter. Error:%s" % e,3)
        except BadHeaderException as e:
            error = QError(e,"Bad header row: %s" % e.msg,35)
        except CannotUnzipDataStreamException as e:
            error = QError(e,"Cannot decompress standard input. Pipe the input through zcat in order to decompress.",36)
        except UniversalNewlinesExistException as e:
            error = QError(e,"Data contains universal newlines. Run q with -U to use universal newlines. Please note that q still doesn't support universal newlines for .gz files or for stdin. Route the data through a regular file to use -U.",103)
        # deprecated, but shouldn't be used:  error = QError(e,"Standard Input must be provided in order to use it as a table",61)
        except CouldNotConvertStringToNumericValueException as e:
            error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58)
        except CouldNotParseInputException as e:
            error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59)
        except ColumnMaxLengthLimitExceededException as e:
            error = QError(e,e.msg,31)
        # deprecated, but shouldn't be used: error = QError(e,e.msg,79)
        except ContentSignatureDiffersException as e:
            error = QError(e,"%s vs %s: Content Signatures for table %s differ at %s (source value '%s' disk signature value '%s')" %
                           (e.original_filename,e.other_filename,e.filenames_str,e.key,e.source_value,e.signature_value),80)
        except ContentSignatureDataDiffersException as e:
            error = QError(e,e.msg,81)
        except MaximumSourceFilesExceededException as e:
            error = QError(e,e.msg,82)
        except ContentSignatureNotFoundException as e:
            error = QError(e,e.msg,83)
        except NonExistentTableNameInQsql as e:
            msg = "Table %s could not be found in qsql file %s . Existing table names: %s" % (e.table_name,e.qsql_filename,",".join(e.existing_table_names))
            error = QError(e,msg,84)
        except NonExistentTableNameInSqlite as e:
            msg = "Table %s could not be found in sqlite file %s . Existing table names: %s" % (e.table_name,e.qsql_filename,",".join(e.existing_table_names))
            error = QError(e,msg,85)
        except TooManyTablesInQsqlException as e:
            msg = "Could not autodetect table name in qsql file. Existing Tables %s" % ",".join(e.existing_table_names)
            error = QError(e,msg,86)
        except NoTableInQsqlExcption as e:
            msg = "Could not autodetect table name in qsql file. File contains no record of a table"
            error = QError(e,msg,97)
        except TooManyTablesInSqliteException as e:
            msg = "Could not autodetect table name in sqlite file %s . Existing tables: %s" % (e.qsql_filename,",".join(e.existing_table_names))
            error = QError(e,msg,87)
        except NoTablesInSqliteException as e:
            msg = "sqlite file %s has no tables" % e.sqlite_filename
            error = QError(e,msg,88)
        except TooManyAttachedDatabasesException as e:
            msg = str(e)
            error = QError(e,msg,89)
        except UnknownFileTypeException as e:
            msg = str(e)
            error = QError(e,msg,95)
        except KeyboardInterrupt as e:
            # An interrupt is reported as a warning only - no error is set
            warnings.append(QWarning(e,"Interrupted"))
        except Exception as e:
            global DEBUG
            if DEBUG:
                xprint(traceback.format_exc())
            error = QError(e,repr(e),199)

        # Reached only when one of the handlers above has run
        return QOutput(data=None,warnings = warnings,error = error , metadata=QMetadata(table_structures=self.loaded_table_structures_dict,new_table_structures=self.loaded_table_structures_dict,output_column_name_list=[]))

    def execute(self,query_str,input_params=None,save_db_to_disk_filename=None):
        """
        Run a query in full (not analysis-only) mode and return the resulting output
        object. Thin wrapper over `_execute`.
        """
        return self._execute(query_str,
                             input_params,
                             save_db_to_disk_filename=save_db_to_disk_filename,
                             stop_after_analysis=False)

    def unload(self):
        """
        Best-effort drop of every loaded table, followed by resetting the
        registry of loaded table structures to an empty OrderedDict.
        """
        # TODO This would fail, since table structures are just value objects now. Will be fixed as part of making q a full python module
        for table_creator in list(self.loaded_table_structures_dict.values()):
            try:
                table_creator.drop_table()
            except Exception:
                # Support no-table select queries. Only regular errors are ignored,
                # so that KeyboardInterrupt/SystemExit are not silently swallowed
                pass
        self.loaded_table_structures_dict = OrderedDict()

    def analyze(self,query_str,input_params=None,data_streams=None):
        """
        Run the query through `_execute` with `stop_after_analysis=True` and return
        the resulting output object.
        """
        return self._execute(query_str,
                             input_params,
                             stop_after_analysis=True,
                             data_streams=data_streams)

def escape_double_quotes_if_needed(v):
    """Return `v` with every double-quote character doubled."""
    double_quote = six.u('"')
    return v.replace(double_quote, double_quote * 2)

def quote_none_func(output_delimiter,v):
    """Quoting function for the 'none' mode: the value is returned untouched."""
    return v

def quote_minimal_func(output_delimiter,v):
    """
    Quoting function for the 'minimal' mode: a string value is wrapped in double
    quotes only if it contains the output delimiter, a newline or a double quote.
    Any other value (including None) is returned as is.
    """
    if v is None:
        return v

    value_type = type(v)
    is_text = value_type == str or value_type == unicode
    if not is_text:
        return v

    must_quote = (output_delimiter in v) or ('\n' in v) or ('"' in v)
    if must_quote:
        return six.u('"{}"').format(escape_double_quotes_if_needed(v))
    return v

def quote_nonnumeric_func(output_delimiter,v):
    """
    Quoting function for the 'nonnumeric' mode: every string value is wrapped in
    double quotes, while None and non-string values are returned as is.
    """
    if v is not None and (type(v) == str or type(v) == unicode):
        return six.u('"{}"').format(escape_double_quotes_if_needed(v))
    return v

def quote_all_func(output_delimiter,v):
    """
    Quoting function for the 'all' mode: every value is wrapped in double quotes.
    String values get their inner double quotes escaped first.
    """
    is_text = type(v) == str or type(v) == unicode
    content = escape_double_quotes_if_needed(v) if is_text else v
    return six.u('"{}"').format(content)

class QOutputParams(object):
    """Plain value object holding the parameters that control how output is printed."""

    def __init__(self,
                 delimiter=' ',
                 beautify=False,
                 output_quoting_mode='minimal',
                 formatting=None,
                 output_header=False,
                 encoding=None):
        # Assignment order is kept stable, since __str__ exposes the instance dict
        self.delimiter = delimiter
        self.beautify = beautify
        self.output_quoting_mode = output_quoting_mode
        self.formatting = formatting
        self.output_header = output_header
        self.encoding = encoding

    def __str__(self):
        return "QOutputParams<{}>".format(str(self.__dict__))

    def __repr__(self):
        return "QOutputParams(...)"

class QOutputPrinter(object):
    """
    Prints the outcome of a query - data rows, errors, warnings or analysis
    information - into file-like objects, according to a QOutputParams instance.
    """
    # Maps the name of each output quoting mode to the function implementing it
    output_quoting_modes = {   'minimal' : quote_minimal_func,
                        'all' : quote_all_func,
                        'nonnumeric' : quote_nonnumeric_func,
                        'none' : quote_none_func }

    def __init__(self,output_params,show_tracebacks=False):
        """
        :param output_params: output parameters; `output_quoting_mode` must be one of
                              the keys of `output_quoting_modes` (KeyError otherwise)
        :param show_tracebacks: when True, the traceback of an error is printed as well
        """
        self.output_params = output_params
        self.show_tracebacks = show_tracebacks

        self.output_field_quoting_func = QOutputPrinter.output_quoting_modes[output_params.output_quoting_mode]

    def print_errors_and_warnings(self,f,results):
        """Print the error of `results` (if its status is 'error') and all its warnings into `f`."""
        if results.status == 'error':
            error = results.error
            print(error.msg, file=f)
            if self.show_tracebacks:
                print(error.traceback, file=f)

        for warning in results.warnings:
            print("%s" % warning.msg, file=f)

    def print_analysis(self,f_out,f_err,results):
        """
        Print errors/warnings into `f_err`, and then, for each table in the results'
        metadata, its source and its column names and sqlite types into `f_out`.
        """
        self.print_errors_and_warnings(f_err,results)

        if results.metadata is None:
            return

        if results.metadata.table_structures is None:
            return

        for qtable_name in results.metadata.table_structures:
            table_structures = results.metadata.table_structures[qtable_name]
            print("Table: %s" % qtable_name,file=f_out)
            print("  Sources:",file=f_out)
            dl = results.metadata.new_table_structures[qtable_name]
            print("    source_type: %s source: %s" % (dl.source_type,dl.source),file=f_out)
            print("  Fields:",file=f_out)
            for n,t in zip(table_structures.column_names,table_structures.sqlite_column_types):
                print("    `%s` - %s" % (n,t), file=f_out)

    def print_output(self,f_out,f_err,results):
        """
        Print the results through `_print_output`, exiting with code 3 on encoding
        errors and ignoring broken pipes (errno 32) and keyboard interrupts.
        """
        try:
            self._print_output(f_out,f_err,results)
        except (UnicodeEncodeError, UnicodeError) as e:
            print("Cannot encode data. Error:%s" % e, file=f_err)
            sys.exit(3)
        except IOError as e:
            if e.errno == 32:
                # broken pipe, that's ok
                pass
            else:
                # don't miss other problems for now
                raise
        except KeyboardInterrupt:
            pass

    def _print_output(self,f_out,f_err,results):
        """
        Print errors/warnings into `f_err` and the data rows into `f_out`.

        Each row is written as its formatted columns joined by the output delimiter.
        Per-column format strings come from the `formatting` output parameter
        (`X=fmt,Y=fmt`, X/Y being 1-based column numbers). Nothing is written when
        `results.data` is None.

        NOTE: when a header row is written, it is inserted into `results.data` itself.
        """
        self.print_errors_and_warnings(f_err,results)

        data = results.data

        if data is None:
            return

        # A header row exists only if it was requested AND the column names are known
        header = None
        if self.output_params.output_header and results.metadata.output_column_name_list is not None:
            header = results.metadata.output_column_name_list

        # If the user requested beautifying the output
        if self.output_params.beautify:
            if header is not None:
                data_with_possible_headers = data + [tuple(header)]
            else:
                data_with_possible_headers = data
            max_lengths = determine_max_col_lengths(data_with_possible_headers,self.output_field_quoting_func,self.output_params.delimiter)

        formatting_dict = {}
        if self.output_params.formatting:
            for entry in self.output_params.formatting.split(","):
                # Split on the first '=' only - the format string itself may contain '='
                # (e.g. the '=' alignment flag of the format mini-language)
                parts = entry.split("=",1)
                formatting_dict[parts[0]] = parts[1]

        try:
            if header is not None:
                data.insert(0,header)
            for rownum, row in enumerate(data):
                row_str = []
                # User formatting is meant for data values, not for the header row
                skip_formatting = rownum == 0 and header is not None
                for i, col in enumerate(row):
                    column_key = str(i + 1)
                    if column_key in formatting_dict and not skip_formatting:
                        fmt_str = formatting_dict[column_key]
                    elif self.output_params.beautify:
                        fmt_str = six.u("{{0:<{}}}").format(max_lengths[i])
                    else:
                        fmt_str = six.u("{}")

                    if col is not None:
                        quoted_value = self.output_field_quoting_func(self.output_params.delimiter,col)
                        row_str.append(fmt_str.format(quoted_value))
                    else:
                        # None values are printed as empty strings
                        row_str.append(fmt_str.format(""))

                line = six.u(self.output_params.delimiter).join(row_str) + six.u("\n")
                f_out.write(line)
        except (UnicodeEncodeError, UnicodeError) as e:
            print("Cannot encode data. Error:%s" % e, file=sys.stderr)
            sys.exit(3)
        except TypeError as e:
            print("Error while formatting output: %s" % e, file=sys.stderr)
            sys.exit(4)
        except IOError as e:
            if e.errno == 32:
                # broken pipe, that's ok
                pass
            else:
                # don't miss other problem for now
                raise
        except KeyboardInterrupt:
            pass

        try:
            # Prevent python bug when order of pipe shutdowns is reversed
            f_out.flush()
        except IOError as e:
            pass

def get_option_with_default(p, option_type, option, default):
    """
    Read an option from the `options` section of a config parser.

    :param p: the config parser holding the .qrc content
    :param option_type: one of 'boolean', 'int' or 'string'
    :param option: the option name
    :param default: returned when the option does not exist
    :return: the typed value, `default` if the option is missing, or None if the
             stored value is the literal string 'None'
    :raises IncorrectDefaultValueException: if the stored value cannot be converted
    :raises Exception: for an unknown `option_type`
    """
    section = 'options'
    getter_names = {'boolean': 'getboolean', 'int': 'getint', 'string': 'get'}
    try:
        if not p.has_option(section, option):
            return default
        if p.get(section, option) == 'None':
            return None
        if option_type not in getter_names:
            raise Exception("Unknown option type %s " % option_type)
        typed_getter = getattr(p, getter_names[option_type])
        return typed_getter(section, option)
    except ValueError:
        raise IncorrectDefaultValueException(option_type,option,p.get("options",option))

# Name of the environment variable which overrides the location of the .qrc file.
# parse_qrc_file() treats the special value 'None' as "do not read any .qrc file".
QRC_FILENAME_ENVVAR = 'QRC_FILENAME'

def dump_default_values_as_qrc(parser,exclusions):
    """
    Print the default value of every command line parameter to stdout, in .qrc
    format: an `[options]` header followed by sorted `name=value` lines.

    :param parser: the argument parser; parsed with no arguments to get the defaults
    :param exclusions: parameter names that should not be printed
    """
    defaults = parser.parse_args([]).__dict__
    # The positional query arguments are not a configurable default
    defaults.pop('leftover')

    print("[options]",file=sys.stdout)
    included_names = [name for name in sorted(defaults) if name not in exclusions]
    for name in included_names:
        print("%s=%s" % (name,defaults[name]),file=sys.stdout)

# Usage text handed to the command line ArgumentParser (see initialize_command_line_parser)
USAGE_TEXT = """
	q <flags> <query>

	Example Execution for a delimited file:

		q "select * from myfile.csv"

	Example Execution for an sqlite3 database:

		q "select * from mydatabase.sqlite:::my_table_name"

            or

		q "select * from mydatabase.sqlite"

            if the database file contains only one table

	Auto-caching of delimited files can be activated through `-C readwrite` (writes new caches if needed)  or `-C read` (only reads existing cache files)

	Setting the default caching mode (`-C`) can be done by writing a `~/.qrc` file. See docs for more info.
	
q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.

q allows the following:

* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file
* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory

Changing the default values for parameters can be done by creating a `~/.qrc` file. Run q with `--dump-defaults` in order to dump a default `.qrc` file into stdout.

See https://github.com/harelba/q for more details.

"""

def run_standalone():
    """
    Entry point for command line execution: reads the .qrc file, parses the
    command line, runs the requested queries and exits the process.
    """
    sqlite3.enable_callback_tracebacks(True)

    qrc_parser, qrc_filename = parse_qrc_file()
    query_args, options, parser = initialize_command_line_parser(qrc_parser, qrc_filename)

    # Informational flags - each of these exits the process if its flag is set
    dump_defaults_and_stop__if_needed(options, parser)
    dump_version_and_stop__if_needed(options)

    output_stream, default_input_params, q_output_printer, query_strs = parse_options(query_args, options)

    q_engine = QTextAsData(default_input_params=default_input_params,
                           data_streams_dict=initialize_default_data_streams())

    execute_queries(output_stream, options, q_engine, q_output_printer, query_strs)
    q_engine.done()

    sys.exit(0)


def dump_version_and_stop__if_needed(options):
    """Print the credentials/version info and exit with code 0 if `options.version` is set."""
    if not options.version:
        return
    print_credentials()
    sys.exit(0)


def dump_defaults_and_stop__if_needed(options, parser):
    """Dump the default parameter values in .qrc format and exit with code 0 if `options.dump_defaults` is set."""
    if not options.dump_defaults:
        return
    # NOTE(review): parsed option names use underscores (e.g. `dump_defaults`), so the
    # 'dump-defaults' exclusion presumably never matches - confirm before changing
    excluded_names = ['dump-defaults', 'version']
    dump_default_values_as_qrc(parser, excluded_names)
    sys.exit(0)


def execute_queries(STDOUT, options, q_engine, q_output_printer, query_strs):
    """
    Run each query in order, printing either its analysis or its output.
    The process exits with the error code of the first query that fails.
    """
    for query_str in query_strs:
        if not options.analyze_only:
            q_output = q_engine.execute(query_str, save_db_to_disk_filename=options.save_db_to_disk_filename)
            q_output_printer.print_output(STDOUT, sys.stderr, q_output)
        else:
            q_output = q_engine.analyze(query_str)
            q_output_printer.print_analysis(STDOUT, sys.stderr, q_output)

        query_failed = q_output.status == 'error'
        if query_failed:
            sys.exit(q_output.error.errorcode)


def initialize_command_line_parser(p, qrc_filename):
    """
    Build the command line ArgumentParser and parse the process arguments.

    The default of each command line parameter is taken from the .qrc content
    if the option exists there, and from a hard-coded fallback otherwise.

    :param p: the config parser holding the .qrc content
    :param qrc_filename: name of the .qrc file, used only for error reporting
    :return: a tuple of (leftover positional arguments, parsed arguments namespace, the parser)

    Exits the process with code 199 if the .qrc file contains a value that
    cannot be converted to the type of its option.
    """
    try:
        # General defaults
        default_verbose = get_option_with_default(p, 'boolean', 'verbose', False)
        default_save_db_to_disk = get_option_with_default(p, 'string', 'save_db_to_disk_filename', None)
        default_caching_mode = get_option_with_default(p, 'string', 'caching_mode', 'none')

        # Input related defaults
        default_skip_header = get_option_with_default(p, 'boolean', 'skip_header', False)
        default_delimiter = get_option_with_default(p, 'string', 'delimiter', None)
        default_pipe_delimited = get_option_with_default(p, 'boolean', 'pipe_delimited', False)
        default_tab_delimited = get_option_with_default(p, 'boolean', 'tab_delimited', False)
        default_encoding = get_option_with_default(p, 'string', 'encoding', 'UTF-8')
        default_gzipped = get_option_with_default(p, 'boolean', 'gzipped', False)
        default_analyze_only = get_option_with_default(p, 'boolean', 'analyze_only', False)
        default_mode = get_option_with_default(p, 'string', 'mode', "relaxed")
        default_column_count = get_option_with_default(p, 'string', 'column_count', None)
        default_keep_leading_whitespace_in_values = get_option_with_default(p, 'boolean',
                                                                            'keep_leading_whitespace_in_values', False)
        default_disable_double_double_quoting = get_option_with_default(p, 'boolean', 'disable_double_double_quoting',
                                                                        True)
        default_disable_escaped_double_quoting = get_option_with_default(p, 'boolean', 'disable_escaped_double_quoting',
                                                                         True)
        default_disable_column_type_detection = get_option_with_default(p, 'boolean', 'disable_column_type_detection',
                                                                        False)
        default_input_quoting_mode = get_option_with_default(p, 'string', 'input_quoting_mode', 'minimal')
        default_max_column_length_limit = get_option_with_default(p, 'int', 'max_column_length_limit', 131072)
        default_with_universal_newlines = get_option_with_default(p, 'boolean', 'with_universal_newlines', False)

        # Output related defaults
        default_output_delimiter = get_option_with_default(p, 'string', 'output_delimiter', None)
        default_pipe_delimited_output = get_option_with_default(p, 'boolean', 'pipe_delimited_output', False)
        default_tab_delimited_output = get_option_with_default(p, 'boolean', 'tab_delimited_output', False)
        default_output_header = get_option_with_default(p, 'boolean', 'output_header', False)
        default_beautify = get_option_with_default(p, 'boolean', 'beautify', False)
        default_formatting = get_option_with_default(p, 'string', 'formatting', None)
        default_output_encoding = get_option_with_default(p, 'string', 'output_encoding', 'none')
        default_output_quoting_mode = get_option_with_default(p, 'string', 'output_quoting_mode', 'minimal')
        default_list_user_functions = get_option_with_default(p, 'boolean', 'list_user_functions', False)
        default_overwrite_qsql = get_option_with_default(p, 'boolean', 'overwrite_qsql', False)

        # Query related defaults
        default_query_filename = get_option_with_default(p, 'string', 'query_filename', None)
        default_query_encoding = get_option_with_default(p, 'string', 'query_encoding', locale.getpreferredencoding())
        default_max_attached_sqlite_databases = get_option_with_default(p,'int','max_attached_sqlite_databases', 10)
    except IncorrectDefaultValueException as e:
        # NOTE(review): this message is printed to stdout and not to stderr - confirm that this is intended
        print("Incorrect value '%s' for option %s in .qrc file %s (option type is %s)" % (
        e.actual_value, e.option, qrc_filename, e.option_type))
        sys.exit(199)
    parser = ArgumentParser(prog="q",usage=USAGE_TEXT)
    parser.add_argument("-v", "--version", action="store_true", help="Print version")
    parser.add_argument("-V", "--verbose", default=default_verbose, action="store_true",
                      help="Print debug info in case of problems")
    parser.add_argument("-S", "--save-db-to-disk", dest="save_db_to_disk_filename", default=default_save_db_to_disk,
                      help="Save database to an sqlite database file")
    parser.add_argument("-C", "--caching-mode", default=default_caching_mode,
                      help="Choose the autocaching mode (none/read/readwrite). Autocaches files to disk db so further queries will be faster. Caching is done to a side-file with the same name of the table, but with an added extension .qsql")
    parser.add_argument("--dump-defaults", action="store_true",
                      help="Dump all default values for parameters and exit. Can be used in order to make sure .qrc file content is being read properly.")
    parser.add_argument("--max-attached-sqlite-databases", default=default_max_attached_sqlite_databases,type=int,
                      help="Set the maximum number of concurrently-attached sqlite dbs. This is a compile time definition of sqlite. q's performance will slow down once this limit is reached for a query, since it will perform table copies in order to avoid that limit.")
    # -----------------------------------------------
    input_data_option_group = parser.add_argument_group("Input Data Options")
    input_data_option_group.add_argument("-H", "--skip-header", default=default_skip_header,
                                       action="store_true",
                                       help="Skip header row. This has been changed from earlier version - Only one header row is supported, and the header row is used for column naming")
    input_data_option_group.add_argument("-d", "--delimiter", default=default_delimiter,
                                       help="Field delimiter. If none specified, then space is used as the delimiter.")
    input_data_option_group.add_argument("-p", "--pipe-delimited", default=default_pipe_delimited,
                                       action="store_true",
                                       help="Same as -d '|'. Added for convenience and readability")
    input_data_option_group.add_argument("-t", "--tab-delimited", default=default_tab_delimited,
                                       action="store_true",
                                       help="Same as -d <tab>. Just a shorthand for handling standard tab delimited file You can use $'\\t' if you want (this is how Linux expects to provide tabs in the command line")
    input_data_option_group.add_argument("-e", "--encoding", default=default_encoding,
                                       help="Input file encoding. Defaults to UTF-8. set to none for not setting any encoding - faster, but at your own risk...")
    input_data_option_group.add_argument("-z", "--gzipped", default=default_gzipped, action="store_true",
                                       help="Data is gzipped. Useful for reading from stdin. For files, .gz means automatic gunzipping")
    input_data_option_group.add_argument("-A", "--analyze-only", default=default_analyze_only,
                                       action='store_true',
                                       help="Analyze sample input and provide information about data types")
    input_data_option_group.add_argument("-m", "--mode", default=default_mode,
                                       help="Data parsing mode. fluffy, relaxed and strict. In strict mode, the -c column-count parameter must be supplied as well")
    input_data_option_group.add_argument("-c", "--column-count", default=default_column_count,
                                       help="Specific column count when using relaxed or strict mode")
    input_data_option_group.add_argument("-k", "--keep-leading-whitespace", dest="keep_leading_whitespace_in_values",
                                       default=default_keep_leading_whitespace_in_values, action="store_true",
                                       help="Keep leading whitespace in values. Default behavior strips leading whitespace off values, in order to provide out-of-the-box usability for simple use cases. If you need to preserve whitespace, use this flag.")
    # The two flags below use store_false over a default of True, so passing the flag
    # stores False in an attribute whose name starts with `disable_`.
    # NOTE(review): the attribute therefore seems to mean "support enabled" - confirm against its consumers
    input_data_option_group.add_argument("--disable-double-double-quoting", 
                                       default=default_disable_double_double_quoting, action="store_false",
                                       help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
    input_data_option_group.add_argument("--disable-escaped-double-quoting", 
                                       default=default_disable_escaped_double_quoting, action="store_false",
                                       help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
    input_data_option_group.add_argument("--as-text", dest="disable_column_type_detection",
                                       default=default_disable_column_type_detection, action="store_true",
                                       help="Don't detect column types - All columns will be treated as text columns")
    input_data_option_group.add_argument("-w", "--input-quoting-mode", 
                                       default=default_input_quoting_mode,
                                       help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
    # No type is set here, so a value given on the command line is stored as a string
    input_data_option_group.add_argument("-M", "--max-column-length-limit", 
                                       default=default_max_column_length_limit,
                                       help="Sets the maximum column length.")
    input_data_option_group.add_argument("-U", "--with-universal-newlines", 
                                       default=default_with_universal_newlines, action="store_true",
                                       help="Expect universal newlines in the data. Limitation: -U works only with regular files for now, stdin or .gz files are not supported yet.")
    # -----------------------------------------------
    output_data_option_group = parser.add_argument_group("Output Options")
    output_data_option_group.add_argument("-D", "--output-delimiter", 
                                        default=default_output_delimiter,
                                        help="Field delimiter for output. If none specified, then the -d delimiter is used if present, or space if no delimiter is specified")
    output_data_option_group.add_argument("-P", "--pipe-delimited-output", 
                                        default=default_pipe_delimited_output, action="store_true",
                                        help="Same as -D '|'. Added for convenience and readability.")
    output_data_option_group.add_argument("-T", "--tab-delimited-output", 
                                        default=default_tab_delimited_output, action="store_true",
                                        help="Same as -D <tab>. Just a shorthand for outputting tab delimited output. You can use -D $'\\t' if you want.")
    output_data_option_group.add_argument("-O", "--output-header", default=default_output_header,
                                        action="store_true",
                                        help="Output header line. Output column-names are determined from the query itself. Use column aliases in order to set your column names in the query. For example, 'select name FirstName,value1/value2 MyCalculation from ...'. This can be used even if there was no header in the input.")
    output_data_option_group.add_argument("-b", "--beautify", default=default_beautify,
                                        action="store_true",
                                        help="Beautify output according to actual values. Might be slow...")
    output_data_option_group.add_argument("-f", "--formatting", default=default_formatting,
                                        help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.")
    output_data_option_group.add_argument("-E", "--output-encoding", 
                                        default=default_output_encoding,
                                        help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
    output_data_option_group.add_argument("-W", "--output-quoting-mode", 
                                        default=default_output_quoting_mode,
                                        help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
    output_data_option_group.add_argument("-L", "--list-user-functions", 
                                        default=default_list_user_functions, action="store_true",
                                        help="List all user functions")
    # NOTE(review): no action is set here, so argparse expects a value after --overwrite-qsql
    # and stores it as a string - confirm that this is intended
    parser.add_argument("--overwrite-qsql", default=default_overwrite_qsql,
                      help="When used, qsql files (both caches and store-to-db) will be overwritten if they already exist. Use with care.")
    # -----------------------------------------------
    query_option_group = parser.add_argument_group("Query Related Options")
    query_option_group.add_argument("-q", "--query-filename", default=default_query_filename,
                                  help="Read query from the provided filename instead of the command line, possibly using the provided query encoding (using -Q).")
    query_option_group.add_argument("-Q", "--query-encoding", default=default_query_encoding,
                                  help="query text encoding. Experimental. Please send your feedback on this")
    # -----------------------------------------------
    # Everything which is not a flag is collected as the positional query arguments
    parser.add_argument('leftover', nargs='*')
    args = parser.parse_args()
    return args.leftover, args, parser


def parse_qrc_file():
    """Load the qrc configuration file(s) into a ConfigParser.

    The file location is chosen as follows:

    * If the environment variable named by QRC_FILENAME_ENVVAR is not set,
      both ``~/.qrc`` (user-expanded) and ``.qrc`` are handed to the parser.
    * If it is set to the literal string ``'None'``, nothing is read at all.
    * Otherwise its value is used as the file path; the process exits with
      code 244 if that path does not exist.

    Returns a ``(parser, qrc_filename)`` tuple, where ``qrc_filename`` is
    either the env var value (possibly ``'None'``) or the expanded ``~/.qrc``.
    """
    config = configparser.ConfigParser()

    if QRC_FILENAME_ENVVAR not in os.environ:
        # Default locations
        qrc_filename = os.path.expanduser('~/.qrc')
        config.read([qrc_filename, '.qrc'])
        return config, qrc_filename

    qrc_filename = os.environ[QRC_FILENAME_ENVVAR]
    if qrc_filename == 'None':
        # special handling of 'None' env var value for QRC_FILENAME. Allows to eliminate the default ~/.qrc reading
        return config, qrc_filename

    xprint("qrc filename is %s" % qrc_filename)
    if not os.path.exists(qrc_filename):
        print('QRC_FILENAME env var exists, but cannot find qrc file at %s' % qrc_filename, file=sys.stderr)
        sys.exit(244)

    config.read([qrc_filename])
    return config, qrc_filename


def initialize_default_data_streams():
    """Return the initial data streams mapping.

    The mapping contains a single entry, keyed by '-', holding a DataStream
    constructed from 'stdin', '-' and the process's sys.stdin.
    """
    stdin_stream = DataStream('stdin', '-', sys.stdin)
    return {'-': stdin_stream}


def parse_options(args, options):
    """Validate the parsed command line options and build the runtime objects.

    Parameters:
        args: the leftover positional command line arguments, treated as the
            query strings when no query file is given.
        options: the parsed options namespace. The ``delimiter`` and
            ``output_delimiter`` attributes are normalized in place.

    Returns:
        A tuple ``(STDOUT, default_input_params, q_output_printer, query_strs)``:
        the encoded output stream writer, a QInputParams instance, a
        QOutputPrinter instance and the list of queries.

    Any validation failure prints a message to stderr and terminates the
    process through ``sys.exit`` with a dedicated exit code (1, 3, 5, 10, 13,
    31, 55, 56, 77, 85, 90, 99, 119 or 200). Listing the user functions exits
    with code 0.
    """
    if options.list_user_functions:
        print_user_functions()
        sys.exit(0)
    if len(args) == 0 and options.query_filename is None:
        print_credentials()
        print("Must provide at least one query in the command line, or through a file with the -q parameter",
              file=sys.stderr)
        sys.exit(1)
    if options.query_filename is not None:
        if len(args) != 0:
            print("Can't provide both a query file and a query on the command line", file=sys.stderr)
            sys.exit(1)
        try:
            # Context manager guarantees the handle is closed even if read() fails
            with open(options.query_filename, 'rb') as f:
                query_strs = [f.read()]
        except Exception:
            print("Could not read query from file %s" % options.query_filename, file=sys.stderr)
            sys.exit(1)
    else:
        # Command line queries are turned into bytes so that the query
        # encoding handling below is shared with the query-file path
        if sys.stdin.encoding is not None:
            query_strs = [x.encode(sys.stdin.encoding) for x in args]
        else:
            query_strs = args
    if options.query_encoding is not None and options.query_encoding != 'none':
        try:
            for idx in range(len(query_strs)):
                query_strs[idx] = query_strs[idx].decode(options.query_encoding).strip()

                # sys.exit raises SystemExit, which is not caught by the
                # "except Exception" clause below
                if len(query_strs[idx]) == 0:
                    print("Query cannot be empty (query number %s)" % (idx + 1), file=sys.stderr)
                    sys.exit(1)

        except Exception:
            print("Could not decode query number %s using the provided query encoding (%s)" % (
            idx + 1, options.query_encoding), file=sys.stderr)
            sys.exit(3)
    ###
    if options.mode not in ['relaxed', 'strict']:
        print("Parsing mode can either be relaxed or strict", file=sys.stderr)
        sys.exit(13)
    output_encoding = get_stdout_encoding(options.output_encoding)
    try:
        if six.PY3:
            STDOUT = codecs.getwriter(output_encoding)(sys.stdout.buffer)
        else:
            STDOUT = codecs.getwriter(output_encoding)(sys.stdout)
    except Exception:
        print("Could not create output stream using output encoding %s" % (output_encoding), file=sys.stderr)
        sys.exit(200)
    # If the user flagged for a tab-delimited file then set the delimiter to tab
    if options.tab_delimited:
        if options.delimiter is not None and options.delimiter != '\t':
            print("Warning: -t parameter overrides -d parameter (%s)" % options.delimiter, file=sys.stderr)
        options.delimiter = '\t'
    # If the user flagged for a pipe-delimited file then set the delimiter to pipe
    if options.pipe_delimited:
        if options.delimiter is not None and options.delimiter != '|':
            print("Warning: -p parameter overrides -d parameter (%s)" % options.delimiter, file=sys.stderr)
        options.delimiter = '|'
    if options.delimiter is None:
        options.delimiter = ' '
    elif len(options.delimiter) != 1:
        print("Delimiter must be one character only", file=sys.stderr)
        sys.exit(5)
    if options.tab_delimited_output:
        if options.output_delimiter is not None and options.output_delimiter != '\t':
            print("Warning: -T parameter overrides -D parameter (%s)" % options.output_delimiter, file=sys.stderr)
        options.output_delimiter = '\t'
    if options.pipe_delimited_output:
        if options.output_delimiter is not None and options.output_delimiter != '|':
            print("Warning: -P parameter overrides -D parameter (%s)" % options.output_delimiter, file=sys.stderr)
        options.output_delimiter = '|'
    if not options.output_delimiter:
        # No output delimiter has been specified, so fall back
        if options.delimiter:
            # if an input delimiter is specified, then we use it as the output as
            # well
            options.output_delimiter = options.delimiter
        else:
            # if no input delimiter is specified, then we use space as the default
            # (since no input delimiter means any whitespace)
            options.output_delimiter = " "
    try:
        max_column_length_limit = int(options.max_column_length_limit)
    except (TypeError, ValueError):
        print("Max column length limit must be an integer larger than 2 (%s)" % options.max_column_length_limit,
              file=sys.stderr)
        sys.exit(31)
    if max_column_length_limit < 3:
        print("Maximum column length must be larger than 2",file=sys.stderr)
        sys.exit(31)

    csv.field_size_limit(max_column_length_limit)
    xprint("Max column length limit is %s" % options.max_column_length_limit)

    if options.input_quoting_mode not in list(QTextAsData.input_quoting_modes.keys()):
        print("Input quoting mode can only be one of %s. It cannot be set to '%s'" % (
        ",".join(sorted(QTextAsData.input_quoting_modes.keys())), options.input_quoting_mode), file=sys.stderr)
        sys.exit(55)
    if options.output_quoting_mode not in list(QOutputPrinter.output_quoting_modes.keys()):
        # Report the offending *output* quoting mode value
        print("Output quoting mode can only be one of %s. It cannot be set to '%s'" % (
        ",".join(QOutputPrinter.output_quoting_modes.keys()), options.output_quoting_mode), file=sys.stderr)
        sys.exit(56)
    if options.column_count is not None:
        expected_column_count = int(options.column_count)
        # The upper bound reuses the already validated max column length limit
        if expected_column_count < 1 or expected_column_count > max_column_length_limit:
            print("Column count must be between 1 and %s" % max_column_length_limit,file=sys.stderr)
            sys.exit(90)
    else:
        # infer automatically
        expected_column_count = None
    if options.encoding != 'none':
        try:
            codecs.lookup(options.encoding)
        except LookupError:
            print("Encoding %s could not be found" % options.encoding, file=sys.stderr)
            sys.exit(10)
    if options.save_db_to_disk_filename is not None:
        if options.analyze_only:
            print("Cannot save database to disk when running with -A (analyze-only) option.", file=sys.stderr)
            sys.exit(119)

        print("Going to save data into a disk database: %s" % options.save_db_to_disk_filename, file=sys.stderr)
        if os.path.exists(options.save_db_to_disk_filename):
            print("Disk database file %s already exists." % options.save_db_to_disk_filename, file=sys.stderr)
            sys.exit(77)
    # sys.exit(78) Deprecated, but shouldn't be reused
    if options.caching_mode not in ['none', 'read', 'readwrite']:
        print("caching mode must be none,read or readwrite",file=sys.stderr)
        sys.exit(85)
    read_caching = options.caching_mode in ['read', 'readwrite']
    write_caching = options.caching_mode in ['readwrite']

    if options.max_attached_sqlite_databases <= 3:
        # Error messages go to stderr, like all other validation failures in this function
        print("Max attached sqlite databases must be larger than 3", file=sys.stderr)
        sys.exit(99)

    default_input_params = QInputParams(skip_header=options.skip_header,
                                        delimiter=options.delimiter,
                                        input_encoding=options.encoding,
                                        gzipped_input=options.gzipped,
                                        with_universal_newlines=options.with_universal_newlines,
                                        parsing_mode=options.mode,
                                        expected_column_count=expected_column_count,
                                        keep_leading_whitespace_in_values=options.keep_leading_whitespace_in_values,
                                        disable_double_double_quoting=options.disable_double_double_quoting,
                                        disable_escaped_double_quoting=options.disable_escaped_double_quoting,
                                        input_quoting_mode=options.input_quoting_mode,
                                        disable_column_type_detection=options.disable_column_type_detection,
                                        max_column_length_limit=max_column_length_limit,
                                        read_caching=read_caching,
                                        write_caching=write_caching,
                                        max_attached_sqlite_databases=options.max_attached_sqlite_databases)

    output_params = QOutputParams(
        delimiter=options.output_delimiter,
        beautify=options.beautify,
        output_quoting_mode=options.output_quoting_mode,
        formatting=options.formatting,
        output_header=options.output_header,
        encoding=output_encoding)
    q_output_printer = QOutputPrinter(output_params, show_tracebacks=DEBUG)

    return STDOUT, default_input_params, q_output_printer, query_strs


# Run the command line entry point only when this file is executed as a
# script, and not when it is imported as a module.
if __name__ == '__main__':
    run_standalone()


================================================
FILE: conftest.py
================================================
#!/usr/bin/env python

# Required so pytest can find files properly


================================================
FILE: dist/fpm-config
================================================
-s dir
--name q-text-as-data
--license GPLv3
--architecture x86_64
--description "q allows to perform SQL-like statements on tabular text data."
--url https://github.com/harelba/q
--maintainer "Harel Ben-Attia <harelba@gmail.com>"


================================================
FILE: dist/test-rpm-inside-container.sh
================================================
#!/bin/bash
#
# Installs the test prerequisites, installs the RPM package given as the first
# argument, and then runs ./run-tests.sh against the installed `q` executable.
#
# Usage: test-rpm-inside-container.sh <rpm-file>
set -x
set -e

# Fail early, before the package installation steps, if no RPM was provided
RPM_FILE="${1:?Usage: $0 <rpm-file>}"

yum install -y python38 sqlite perl gcc python3-devel sqlite-devel
pip3 install -r test-requirements.txt

# Quoted, so that a path containing spaces or glob characters stays one argument
rpm -i "${RPM_FILE}"
Q_EXECUTABLE=q Q_SKIP_EXECUTABLE_VALIDATION=true ./run-tests.sh -v


================================================
FILE: dist/test-using-deb.sh
================================================
#!/bin/bash
#
# Installs the deb package given as the first argument (using sudo), and then
# runs ./run-tests.sh against the installed `q` executable.
#
# Usage: test-using-deb.sh <deb-file>

set -x
set -e

# Fail early with a usage message if no package file was provided
DEB_FILE="${1:?Usage: $0 <deb-file>}"

# Quoted, so that a path containing spaces or glob characters stays one argument
sudo dpkg -i "${DEB_FILE}"
Q_EXECUTABLE=q Q_SKIP_EXECUTABLE_VALIDATION=true ./run-tests.sh -v


================================================
FILE: dist/test-using-rpm.sh
================================================
#!/bin/bash
#
# Runs dist/test-rpm-inside-container.sh inside a centos:8 docker container,
# with the current directory mounted as /q-sources (also the working folder).
#
# Usage: test-using-rpm.sh <rpm-location>
# NOTE(review): the rpm location is passed as-is into the container, so it
# presumably needs to be resolvable from /q-sources - confirm with callers.

set -x
set -e

# Fail early with a usage message if no RPM location was provided
RPM_LOCATION="${1:?Usage: $0 <rpm-location>}"

# $(...) replaces the legacy backticks, and all expansions are quoted so that
# paths containing spaces or glob characters are passed as single arguments
docker run -i -v "$(pwd)":/q-sources -w /q-sources centos:8 /bin/bash -e -x ./dist/test-rpm-inside-container.sh "${RPM_LOCATION}"


================================================
FILE: doc/AUTHORS
================================================
  Copyright (C) 2012-2014 Harel Ben-Attia (harelba@gmail.com, @harelba on twitter)

Harel Ben-Attia <harelba@gmail.com> wrote the main program


================================================
FILE: doc/IMPLEMENTATION.markdown
================================================
# q - Treating Text as a Database 

## Implementation 

The current implementation is written in Python using an in-memory database, in order to prevent the need for external dependencies. The implementation itself supports SELECT statements, including JOINs (Subqueries are supported only in the WHERE clause for now).

Please note that there are currently no checks or bounds on data size - it's up to the user to make sure things don't get too big.

Please make sure to read the limitations section as well.

Code-wise, I'm planning a big refactoring, and I have added a full test suite in the latest version, so it'll be easier to do properly.

## Tests

The code includes a test suite (`test/test_suite.py`) runnable through `./run-tests.sh`. If you're planning on sending a pull request, I'd appreciate it if you could make sure that it doesn't fail. Additional ideas related to testing are most welcome.

## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.

Harel Ben-Attia, harelba@gmail.com, [@harelba](https://twitter.com/harelba) on Twitter


================================================
FILE: doc/LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    {one line to give the program's name and a brief idea of what it does.}
    Copyright (C) {year}  {name of author}

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    {project}  Copyright (C) {year}  {fullname}
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.


================================================
FILE: doc/RATIONALE.markdown
================================================
# q - Treating Text as a Database 

## Why aren't other Linux tools enough?
The standard Linux tools are amazing and I use them all the time, but the whole idea of Linux is mixing-and-matching the best tools for each part of the job. This tool adds the declarative power of SQL to the Linux toolset, without losing any of the other tools' benefits. In fact, I often use q together with other Linux tools, the same way I pipe awk/sed and grep together all the time.

One additional thing to note is that many Linux tools treat text as text and not as data. In that sense, you can look at q as a meta-tool which provides access to all the data-related tools that SQL provides (e.g. expressions, ordering, grouping, aggregation etc.).

## Philosophy
This tool has been designed with general Linux/Unix design principles in mind. If you're interested in these general design principles, read the amazing book http://catb.org/~esr/writings/taoup/ and specifically http://catb.org/~esr/writings/taoup/html/ch01s06.html. If you believe that the way this tool works goes strongly against any of the principles, I would love to hear your view about it.

## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.

Harel Ben-Attia, harelba@gmail.com, [@harelba](https://twitter.com/harelba) on Twitter


================================================
FILE: doc/THANKS
================================================
  Copyright (C) 2012-2014 Harel Ben-Attia (harelba@gmail.com, @harelba on twitter)

Jens Neu (jens@zeeroos.de) - For writing the initial RPM package spec
barsnick (https://github.com/barsnick) - Thanks for additional RPM help
StreakyCobra (https://github.com/StreakyCobra) - For providing Arch Linux RPMs


================================================
FILE: doc/USAGE.markdown
================================================
# q - Text as Data

## SYNOPSIS
	`q <flags> <query>`

	Example Execution for a delimited file:

		q "select * from myfile.csv"

	Example Execution for an sqlite3 database:

		q "select * from mydatabase.sqlite:::my_table_name"

            or

		q "select * from mydatabase.sqlite"

            if the database file contains only one table

	Auto-caching of delimited files can be activated through `-C readwrite` (writes new caches if needed)  or `-C read` (only reads existing cache files)

	Setting the default caching mode (`-C`) can be done by writing a `~/.qrc` file. See docs for more info.
	
## DESCRIPTION
q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.

q allows the following:

* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file
* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory

Query should be an SQL-like query which contains filenames instead of table names (or - for stdin). The query itself should be provided as one parameter to the tool (i.e. enclosed in quotes).

The following filename types are supported:

* Delimited-file filenames, including relative/absolute paths
* sqlite3 database filenames, with an additional `:::<table_name>` for accessing a specific table. If a database contains only one table, then denoting the table name is not needed. Examples: `mydatabase.sqlite3:::users_table` or `my_single_table_database.sqlite`.

Use `-H` to signify that the input contains a header line. Column names will be detected automatically in that case, and can be used in the query. If this option is not provided, columns will be named cX, starting with 1 (e.g. q "SELECT c3,c8 from ...").

Use `-d` to specify the input delimiter.

Column types are auto detected by the tool, no casting is needed.

Please note that column names that include spaces need to be used in the query with back-ticks, as per the sqlite standard.

Query/Input/Output encodings are fully supported (and q tries to provide out-of-the-box usability in that area). Please use `-e`,`-E` and `-Q` to control encoding if needed.

All sqlite3 SQL constructs are supported, including joins across files (use an alias for each table), with the exception of CTE (for now).

See https://github.com/harelba/q for more details.

## QUERY
q gets one parameter - An SQL-like query. 

Any standard SQL expression, condition (both WHERE and HAVING), GROUP BY, ORDER BY etc. are allowed.

JOINs are supported and Subqueries are supported in the WHERE clause, but unfortunately not in the FROM clause for now. Use table aliases when performing JOINs.

The SQL syntax itself is sqlite's syntax. For details look at https://www.sqlite.org/lang.html or search the net for examples.

**NOTE:** Full type detection is implemented, so there is no need for any explicit casting.

**NOTE2:** When using the `-O` output header option, use column name aliases if you want to control the output column names. For example, `q -O -H "select count(*) cnt,sum(value1) as mysum from -"` would output `cnt` and `mysum` as the output header column names.

## RUNTIME OPTIONS
q can also get some runtime flags. The following parameters can be used, all optional:

```
Options:
  -h, --help            show this help message and exit
  -v, --version         Print version
  -V, --verbose         Print debug info in case of problems
  -S SAVE_DB_TO_DISK_FILENAME, --save-db-to-disk=SAVE_DB_TO_DISK_FILENAME
                        Save database to an sqlite database file
  -C CACHING_MODE, --caching-mode=CACHING_MODE
                        Choose the autocaching mode (none/read/readwrite).
                        Autocaches files to disk db so further queries will be
                        faster. Caching is done to a side-file with the same
                        name of the table, but with an added extension .qsql
  --dump-defaults       Dump all default values for parameters and exit. Can
                        be used in order to make sure .qrc file content is
                        being read properly.
  --max-attached-sqlite-databases=MAX_ATTACHED_SQLITE_DATABASES
                        Set the maximum number of concurrently-attached sqlite
                        dbs. This is a compile time definition of sqlite. q's
                        performance will slow down once this limit is reached
                        for a query, since it will perform table copies in
                        order to avoid that limit.
  --overwrite-qsql=OVERWRITE_QSQL
                        When used, qsql files (both caches and store-to-db)
                        will be overwritten if they already exist. Use with
                        care.

  Input Data Options:
    -H, --skip-header   Skip header row. This has been changed from earlier
                        version - Only one header row is supported, and the
                        header row is used for column naming
    -d DELIMITER, --delimiter=DELIMITER
                        Field delimiter. If none specified, then space is used
                        as the delimiter.
    -p, --pipe-delimited
                        Same as -d '|'. Added for convenience and readability
    -t, --tab-delimited
                        Same as -d <tab>. Just a shorthand for handling
                        standard tab delimited file You can use $'\t' if you
                        want (this is how Linux expects to provide tabs in the
                        command line
    -e ENCODING, --encoding=ENCODING
                        Input file encoding. Defaults to UTF-8. set to none
                        for not setting any encoding - faster, but at your own
                        risk...
    -z, --gzipped       Data is gzipped. Useful for reading from stdin. For
                        files, .gz means automatic gunzipping
    -A, --analyze-only  Analyze sample input and provide information about
                        data types
    -m MODE, --mode=MODE
                        Data parsing mode. fluffy, relaxed and strict. In
                        strict mode, the -c column-count parameter must be
                        supplied as well
    -c COLUMN_COUNT, --column-count=COLUMN_COUNT
                        Specific column count when using relaxed or strict
                        mode
    -k, --keep-leading-whitespace
                        Keep leading whitespace in values. Default behavior
                        strips leading whitespace off values, in order to
                        provide out-of-the-box usability for simple use cases.
                        If you need to preserve whitespace, use this flag.
    --disable-double-double-quoting
                        Disable support for double double-quoting for escaping
                        the double quote character. By default, you can use ""
                        inside double quoted fields to escape double quotes.
                        Mainly for backward compatibility.
    --disable-escaped-double-quoting
                        Disable support for escaped double-quoting for
                        escaping the double quote character. By default, you
                        can use \" inside double quoted fields to escape
                        double quotes. Mainly for backward compatibility.
    --as-text           Don't detect column types - All columns will be
                        treated as text columns
    -w INPUT_QUOTING_MODE, --input-quoting-mode=INPUT_QUOTING_MODE
                        Input quoting mode. Possible values are all, minimal
                        and none. Note the slightly misleading parameter name,
                        and see the matching -W parameter for output quoting.
    -M MAX_COLUMN_LENGTH_LIMIT, --max-column-length-limit=MAX_COLUMN_LENGTH_LIMIT
                        Sets the maximum column length.
    -U, --with-universal-newlines
                        Expect universal newlines in the data. Limitation: -U
                        works only with regular files for now, stdin or .gz
                        files are not supported yet.

  Output Options:
    -D OUTPUT_DELIMITER, --output-delimiter=OUTPUT_DELIMITER
                        Field delimiter for output. If none specified, then
                        the -d delimiter is used if present, or space if no
                        delimiter is specified
    -P, --pipe-delimited-output
                        Same as -D '|'. Added for convenience and readability.
    -T, --tab-delimited-output
                        Same as -D <tab>. Just a shorthand for outputting tab
                        delimited output. You can use -D $'\t' if you want.
    -O, --output-header
                        Output header line. Output column-names are determined
                        from the query itself. Use column aliases in order to
                        set your column names in the query. For example,
                        'select name FirstName,value1/value2 MyCalculation
                        from ...'. This can be used even if there was no
                        header in the input.
    -b, --beautify      Beautify output according to actual values. Might be
                        slow...
    -f FORMATTING, --formatting=FORMATTING
                        Output-level formatting, in the format X=fmt,Y=fmt
                        etc, where X,Y are output column numbers (e.g. 1 for
                        first SELECT column etc.
    -E OUTPUT_ENCODING, --output-encoding=OUTPUT_ENCODING
                        Output encoding. Defaults to 'none', leading to
                        selecting the system/terminal encoding
    -W OUTPUT_QUOTING_MODE, --output-quoting-mode=OUTPUT_QUOTING_MODE
                        Output quoting mode. Possible values are all, minimal,
                        nonnumeric and none. Note the slightly misleading
                        parameter name, and see the matching -w parameter for
                        input quoting.
    -L, --list-user-functions
                        List all user functions

  Query Related Options:
    -q QUERY_FILENAME, --query-filename=QUERY_FILENAME
                        Read query from the provided filename instead of the
                        command line, possibly using the provided query
                        encoding (using -Q).
    -Q QUERY_ENCODING, --query-encoding=QUERY_ENCODING
                        query text encoding. Experimental. Please send your
                        feedback on this
```

### Table names
The table names are the actual file names that you want to read from. Path names are allowed. Use "-" if you want to read from stdin (e.g. `q "SELECT * FROM -"`)

Wildcard matches are supported - For example: `SELECT ... FROM ... mydata*.dat`

Files with .gz extension are considered to be gzipped and decompressed on the fly.

### Parsing Modes
q supports two parsing modes:

* `relaxed` - This is the default mode. It tries to lean towards simplicity of use. When a row doesn't contain enough columns, the missing ones will be filled with nulls, and when there are too many, the extra values will be merged to the last column. Defining the number of expected columns in this mode is done using the `-c` parameter. If it is not provided, then the number of columns is detected automatically (In most use cases, there is no need to specify `-c`)
* `strict` - Strict mode is for hardcore csv/tsv parsing. Whenever a row doesn't contain the proper number of columns, processing will stop. `-c` must be provided when using this mode

### Output formatting option
The format of F is a comma-separated list of X=f entries, where X is a column number and f is a python format:

* X - column number - This is the SELECTed column (or expression) number, not the one from the original table. E.g, 1 is the first SELECTed column, 3 is the third SELECTed column.
* f - A python formatting string such as {} - See https://www.w3schools.com/python/ref_string_format.asp for details if needed.

## EXAMPLES
Example 1: `ls -ltrd * | q "select c1,count(1) from - group by c1"`

	This example would print a count of each unique permission string in the current folder.

Example 2: `seq 1 1000 | q "select avg(c1),sum(c1) from -"`

	This example would provide the average and the sum of the numbers in the range 1 to 1000

Example 3: `sudo find /tmp -ls | q "select c5,c6,sum(c7)/1024.0/1024 as total from - group by c5,c6 order by total desc"`

	This example will output the total size in MB per user+group in the /tmp subtree

Example 4: `ps -ef | q -H "select UID,count(*) cnt from - group by UID order by cnt desc limit 3"`

	This example will show process counts per UID, calculated from ps data. Note that the column names provided by ps are being used as column names in the query (The -H flag activates that option)

## AUTHOR
Harel Ben-Attia (harelba@gmail.com)

[@harelba](https://twitter.com/harelba) on Twitter

Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.

## COPYRIGHT
Copyright (C) 2012--2021 Harel Ben Attia

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA


================================================
FILE: examples/EXAMPLES.markdown
================================================
# q - Treating Text as a Database 

See below for a JOIN example.

## Tutorial
This is a tutorial for beginners. If you're familiar with the concept and just wanna see some full-fledged examples, take a look [here](README.markdown#examples) on the main page.

Tutorial steps:

1.  We'll start with a simple example and work from there. The file `exampledatafile` contains the output of an `ls -l` command, a list of files in some directory. In this example we'll do some calculations on this file list.
  * The following commands will count the lines in the file *exampledatafile*, effectively getting the number of files in the directory. The output will be exactly as if we ran the `wc -l` command.  

            q "SELECT COUNT(1) FROM exampledatafile"    

            cat exampledatafile | q "SELECT COUNT(1) FROM -"   
        
  * Now, let's assume we want to know the number of files per date in the directory. Notice that the date is in column 6.

            q "SELECT c6,COUNT(1) FROM exampledatafile GROUP BY c6"   

  * The results will show the number of files per date. However, there's a lot of "noise" - dates in which there is only one file. Let's leave only the ones which have 3 files or more:  

            q "SELECT c6,COUNT(1) AS cnt FROM exampledatafile GROUP BY c6 HAVING cnt >= 3"   

  * Now, let's see if we can get something more interesting. The following command will provide the **total size** of the files for each date. Notice that the file size is in c5.  

            q "SELECT c6,SUM(c5) AS size FROM exampledatafile GROUP BY c6"   

  * We can see the results. However, the sums are in bytes. Let's show the same results but in KB:  

            q "SELECT c6,SUM(c5)/1024.0 AS size FROM exampledatafile GROUP BY c6"  

  * The last command provided us with a list of results, but there is no order and the list is too long. Let's get the Top 5 dates:  

            q "SELECT c6,SUM(c5)/1024.0 AS size FROM exampledatafile GROUP BY c6 ORDER BY size DESC LIMIT 5"   

  * Now we'll see how we can format the output itself, so it looks better:  

            q -f "2=%4.2f" "SELECT c6,SUM(c5)/1024.0 AS size FROM exampledatafile GROUP BY c6 ORDER BY size DESC LIMIT 5"  
        
  * (An example of using JOIN will be added here - In the meantime, just remember you have to use table aliases for JOINed "tables")
        
2. A more complicated example, showing time manipulation. Let's assume that we have a file with a timestamp as its first column. We'll show how it's possible to get the number of rows per full minute:  

        q "SELECT DATETIME(ROUND(c1/60000)*60000/1000,'unixepoch','-05:00') as min, COUNT(1) FROM datafile*.gz GROUP BY min"  
        
   There are several things to notice here:
   
   * The timestamp value is in the first column, hence c1.
   * The timestamp is assumed to be a unix epoch timestamp, but in ms, and DATETIME accepts seconds, so we need to divide by 1000
   * The full-minute rounding is done by dividing by 60000 (ms), rounding and then multiplying by the same amount. Rounding to an hour, for example, would be the same except for having 3600000 instead of 60000.
   * We use DATETIME's capability in order to output the time in localtime format. In that case, it's converted to New York time (hence the -5 hours)
   * The filename is actually all files matching `datafile*.gz` - Multiple files can be read, and since they have a .gz extension, they are decompressed on the fly.
   * **NOTE:** For non-SQL people, the date manipulation may seem odd at first, but this is standard SQL processing for timestamps and it's easy to get used to.

## JOIN example

__Command 1 (Join data from two files):__

The following command _joins_ an ls output (`exampledatafile`) and a file containing rows of **group-name,email**  (`group-emails-example`) and provides a row of **filename,email** for each of the emails of the group. For brevity of output, there is also a filter for a specific filename called `ppp` which is achieved using a WHERE clause.
```bash
q "select myfiles.c8,emails.c2 from exampledatafile myfiles join group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = 'ppp'"
```

__Output 1: (rows of filename,email):__
```bash
ppp dip.1@otherdomain.com
ppp dip.2@otherdomain.com
```

You can see that the ppp filename appears twice, each time matched to one of the emails of the group `dip` to which it belongs. Take a look at the files [`exampledatafile`](exampledatafile) and [`group-emails-example`](group-emails-example) for the data.

## Writing the data into an sqlite3 database
q now supports writing its data into a disk-based sqlite3 database file. In order to write the data to a database on disk, use the `-S` parameter (`--save-db-to-disk`) with a filename as a parameter. Note that you still need to provide a query as a parameter, even though it will not be executed. The tool will provide the proper sqlite3 query to run after writing the data to the database, allowing you to copy-paste it into the sqlite3 command line. If you don't care about running any query, just use "select 1" as the query.

Here's an example that will write the output into `some.db` for further processing. Note that we've added the `-c 1` parameter to prevent q warning us about having only one column.
```
$ seq 1 100 | ./q "select count(*) from -" -S some.db -c 1
Going to save data into a disk database: some.db
Data has been loaded in 0.002 seconds
Saving data to db file some.db
Data has been saved into some.db . Saving has taken 0.018 seconds
Query to run on the database: select count(*) from `-`;

$ sqlite3 some.db
SQLite version 3.19.3 2017-06-27 16:48:08
Enter ".help" for usage hints.
sqlite> .tables
-
sqlite> .schema
CREATE TABLE IF NOT EXISTS "-" ("c1" INT);
sqlite> select count(*) from `-`;
100
sqlite>
```

Note that table names are explicitly set to the filenames in the original query (e.g. filenames), which means that in many cases you'd need to escape the table names in sqlite3 with backticks. For example, the name of the table above is `-`, and in order to use it in an sqlite3 query, it is backticked, otherwise it won't conform to a proper table name. I've decided to emphasize consistency and simplicity in this case, instead of trying to provide some normalization/sanitization of filenames, since I believe that doing it would cause much confusion and will be less effective. Any ideas and comments on this are most welcome, obviously.

### Choosing the method of writing the sqlite3 database
There's another parameter that controls the method of writing to the sqlite3 database - `--save-db-to-disk-method`. The value can either be `standard` or `fast`. The fast method requires changes in the packaging of q, since it's dependent on another python module (https://github.com/husio/python-sqlite3-backup by @husio - Thanks!). However, there are some complications with seamlessly packaging it without possibly causing some backward compatibility issues (see PR #159 for some details), so it's not the standard method as of yet. If you're an advanced user, and in need of the faster method due to very large files etc., you'd need to manually install this python package for the fast method to work - Run `pip install sqlitebck` on your python installation. Obviously, I'm considering this as a bug that I need to fix.

## Installation
Installation instructions can be found [here](../doc/INSTALL.markdown)

## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.

Harel Ben-Attia, harelba@gmail.com, [@harelba](https://twitter.com/harelba) on Twitter


================================================
FILE: examples/exampledatafile
================================================
-rw-r--r--  1 root root     2064 2006-11-23 21:33 netscsid.conf
-rw-r--r--  1 root root     1343 2007-01-09 20:39 wodim.conf
-rw-r--r--  1 root root      112 2007-06-22 18:08 apg.conf
-rw-r--r--  1 root root    15752 2009-07-25 18:13 ltrace.conf
-rw-r--r--  1 root root      624 2010-05-16 14:18 mtools.conf
-rw-r--r--  1 root root      395 2010-06-20 11:11 anacrontab
-rw-r--r--  1 root root    18673 2010-10-18 06:49 globash.rc
-rw-r--r--  1 root root    23958 2010-11-15 10:07 mime.types
-rw-r--r--  1 root root      449 2010-11-15 10:07 mailcap.order
-rw-r--r--  1 root root     8453 2010-12-03 22:32 nanorc
-rwxr-xr-x  1 root root      268 2010-12-07 12:10 rmt
-rw-r--r--  1 root root     1147 2011-01-04 16:27 rarfiles.lst
-rw-r--r--  1 root root      600 2011-03-09 13:22 deluser.conf
drwxr-xr-x  2 root root     4096 2011-03-15 23:05 ODBCDataSources
-rw-r--r--  1 root root        0 2011-03-15 23:05 odbc.ini
-rw-r--r--  1 root root      801 2011-03-17 20:09 mke2fs.conf
drwxr-xr-x  2 root root     4096 2011-04-30 19:12 insserv.conf.d
-rw-r--r--  1 root root      839 2011-04-30 19:12 insserv.conf
drwxr-xr-x  3 root root     4096 2011-04-30 19:12 insserv
-rw-r--r--  1 root root      373 2011-05-01 02:15 rearj.cfg
-rw-r--r--  1 root root     1260 2011-05-02 15:19 ucf.conf
-rw-r-----  1 root daemon    144 2011-05-16 13:32 at.deny
-rw-r--r--  1 root root     4496 2011-05-17 23:21 wgetrc
drwxr-xr-x  2 root root     4096 2011-05-18 12:01 libpaper.d
-rw-r--r--  1 root root     1975 2011-05-18 13:00 bash.bashrc
-rw-r-----  1 root fuse      216 2011-05-18 13:12 fuse.conf
-rw-r--r--  1 root root    19666 2011-05-24 18:26 services
-rw-r--r--  1 root root      887 2011-05-24 18:26 rpc
-rw-r--r--  1 root root     2859 2011-05-24 18:26 protocols
-rw-r--r--  1 root root     4728 2011-06-07 14:10 hdparm.conf
-rw-r--r--  1 root root     2083 2011-06-10 19:58 sysctl.conf
-rw-r--r--  1 root root     2290 2011-06-14 18:51 libuser.conf
-rw-r--r--  1 root root     1195 2011-06-17 20:13 rsyslog.conf
-rw-r--r--  1 root root     2570 2011-06-22 13:39 locale.alias
-rw-r--r--  1 root root     2969 2011-06-23 10:01 debconf.conf
-rw-r--r--  1 root root     3828 2011-06-24 12:28 securetty
-rw-r--r--  1 root root    10551 2011-06-24 12:28 login.defs
-rw-r--r--  1 root root       91 2011-07-08 20:13 networks
-rw-r--r--  1 root root      267 2011-07-08 20:13 legal
-rw-r--r--  1 root root       92 2011-07-08 20:13 host.conf
-rw-r--r--  1 root root       11 2011-07-08 20:13 debian_version
-rw-r--r--  1 root root    10183 2011-07-18 23:45 sensors3.conf
-rw-r--r--  1 root root     3587 2011-07-27 14:14 lftp.conf
-rw-r--r--  1 root root     5173 2011-07-27 14:32 manpath.config
-rw-r--r--  1 root root      645 2011-07-27 14:36 ts.conf
-rw-r--r--  1 root root     1586 2011-07-27 14:57 request-key.conf
-rw-r--r--  1 root root      111 2011-08-08 23:52 magic.mime
-rw-r--r--  1 root root      111 2011-08-08 23:52 magic
-rw-r--r--  1 root root      321 2011-08-09 19:16 blkid.conf
drwxr-xr-x  2 root root     4096 2011-08-09 19:19 usb_modeswitch.d
-rw-r--r--  1 root root     3279 2011-08-11 15:59 lsb-base-logging.sh
-rw-r--r--  1 root root      326 2011-08-17 16:15 updatedb.conf
-rw-r--r--  1 root root      552 2011-08-19 04:05 pam.conf
-rw-r--r--  1 root root      652 2011-08-25 16:14 zsh_command_not_found
-rw-r--r--  1 root root      592 2011-08-26 11:58 usb_modeswitch.conf
-rw-r--r--  1 root root     1721 2011-09-01 19:49 inputrc
-r--r-----  1 root root      574 2011-09-11 22:09 sudoers
drwxr-xr-x  2 root root     4096 2011-09-19 12:51 lsb-base
-rw-r--r--  1 root root      724 2011-09-20 03:04 crontab
-rw-r--r--  1 root root      643 2011-09-20 08:04 colord.conf
-rw-r--r--  1 root root      599 2011-10-04 18:19 logrotate.conf
-rw-r--r--  1 root root      344 2011-10-04 21:56 bindresvport.blacklist
-rw-r--r--  1 root root     3343 2011-10-04 21:56 gai.conf
-rw-r--r--  1 root root    58753 2011-10-04 22:53 bash_completion
drwxr-xr-x  2 root root     4096 2011-10-05 22:05 update-notifier
-rw-r--r--  1 root root      100 2011-10-08 01:45 lsb-release
-rw-r--r--  1 root root       13 2011-10-09 09:31 issue.net
-rw-r--r--  1 root root       20 2011-10-09 09:31 issue
-rw-r--r--  1 root root     1309 2011-10-09 09:41 kerneloops.conf
drwxr-xr-x  2 root root     4096 2011-10-12 16:26 opt
-rw-r--r--  1 root root       34 2011-10-12 16:26 ld.so.conf
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 terminfo
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 python2.7
-rw-r--r--  1 root root      547 2011-10-12 16:27 profile
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 iproute2
-rw-r--r--  1 root root       79 2011-10-12 16:27 environment
-rw-r--r--  1 root root      165 2011-10-12 16:27 shells
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 depmod.d
-rw-r--r--  1 root root     2981 2011-10-12 16:27 adduser.conf
drwxr-xr-x  3 root root     4096 2011-10-12 16:27 udev
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 sysctl.d
-rwxr-xr-x  1 root root      306 2011-10-12 16:27 rc.local
drwxr-xr-x  6 root root     4096 2011-10-12 16:27 network
drwxr-xr-x  5 root root     4096 2011-10-12 16:27 initramfs-tools
drwxr-xr-x  3 root root     4096 2011-10-12 16:27 systemd
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 sudoers.d
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 vim
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 newt
drwxr-xr-x  4 root root     4096 2011-10-12 16:27 dhcp
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 cron.hourly
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 python
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 kbd
drwxr-xr-x  2 root root     4096 2011-10-12 16:27 console-setup
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 ca-certificates
drwxr-xr-x  4 root root     4096 2011-10-12 16:28 perl
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 pkcs11
drwxr-xr-x  5 root root     4096 2011-10-12 16:28 pm
drwxr-xr-x  6 root root     4096 2011-10-12 16:28 gconf
drwxr-xr-x  6 root root     4096 2011-10-12 16:28 apm
drwxr-xr-x  5 root root     4096 2011-10-12 16:28 polkit-1
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 emacs
drwxr-xr-x  5 root root     4096 2011-10-12 16:28 ConsoleKit
drwxr-xr-x  4 root root     4096 2011-10-12 16:28 ghostscript
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 doc-base
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 gnome-settings-daemon
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 etc
drwxr-xr-x  3 root root     4096 2011-10-12 16:28 sound
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 gnome-vfs-2.0
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 ifplugd
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 dhcp3
drwxr-xr-x  4 root root     4096 2011-10-12 16:29 fonts
drwxr-xr-x  4 root root     4096 2011-10-12 16:29 ssl
-rw-r--r--  1 root root     7014 2011-10-12 16:29 ca-certificates.conf
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 foomatic
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 gtk-3.0
-rw-r--r--  1 root root      880 2011-10-12 16:29 hosts.deny
-rw-r--r--  1 root root      580 2011-10-12 16:29 hosts.allow
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 sensors.d
drwxr-xr-x  4 root root     4096 2011-10-12 16:29 dbus-1
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 groff
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 calendar
drwxr-xr-x  4 root root     4096 2011-10-12 16:29 security
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 apparmor
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 profile.d
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 grub.d
drwxr-s---  2 root dip      4096 2011-10-12 16:29 chatscripts
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 update-manager
drwxr-xr-x  3 root root     4096 2011-10-12 16:29 ufw
drwxr-xr-x  2 root root     4096 2011-10-12 16:29 rsyslog.d
drwxr-xr-x  3 root root     4096 2011-10-12 16:30 acpi
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 gnome-app-install
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 cron.monthly
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 cron.d
drwxr-xr-x  5 root root     4096 2011-10-12 16:30 apport
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 cron.weekly
drwxr-xr-x  3 root root     4096 2011-10-12 16:30 avahi
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 at-spi2
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 bluetooth
drwxr-xr-x  3 root root     4096 2011-10-12 16:30 sgml
drwxr-xr-x  4 root root     4096 2011-10-12 16:30 defoma
drwxr-xr-x  3 root root     4096 2011-10-12 16:30 compizconfig
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 checkbox.d
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 skel
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 gdb
drwxr-xr-x  3 root root     4096 2011-10-12 16:30 firefox
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 obex-data-server
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 UPower
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 snmp
-rw-r--r--  1 root root      513 2011-10-12 16:30 nsswitch.conf
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 wpa_supplicant
drwxr-xr-x  8 root dip      4096 2011-10-12 16:30 ppp
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 pcmcia
drwxr-xr-x  5 root root     4096 2011-10-12 16:30 NetworkManager
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 cupshelpers
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 xml
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 thunderbird
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 update-motd.d
drwxr-xr-x  4 root root     4096 2011-10-12 16:30 speech-dispatcher
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 ginn
drwxr-xr-x  2 root root    12288 2011-10-12 16:30 brltty
-rw-r--r--  1 root root       33 2011-10-12 16:30 brlapi.key
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 gamin
-rw-r--r--  1 root root     7649 2011-10-12 16:30 pnm2ppa.conf
drwxr-xr-x  2 root root     4096 2011-10-12 16:30 hp
drwxr-xr-x  4 root root     4096 2011-10-12 16:30 mono
drwxr-xr-x  2 root root     4096 2011-10-12 16:31 xul-ext
drwxr-xr-x  3 root root     4096 2011-10-12 16:31 sane.d
-rw-r--r--  1 root root       54 2011-10-12 16:31 crypttab
-rw-r--r--  1 root root      227 2011-12-18 11:43 hosts
-rw-r--r--  1 root root       13 2011-12-18 11:43 hostname
-rw-r--r--  1 root root       10 2011-12-18 11:45 adjtime
drwxr-xr-x  2 root root     4096 2011-12-18 11:51 libreoffice
drwxr-xr-x  2 root root     4096 2011-12-18 11:52 dictionaries-common
-rw-r--r--  1 root root      350 2011-12-18 11:52 popularity-contest.conf
-rw-r--r--  1 root root        7 2011-12-18 11:52 papersize
-rw-r--r--  1 root root       91 2011-12-18 11:52 kernel-img.conf
-rw-r--r--  1 root root       15 2011-12-18 12:02 timezone
-rw-r--r--  1 root root     2197 2011-12-18 12:02 localtime
drwxr-xr-x  2 root root     4096 2011-12-18 12:04 ldap
drwxr-xr-x  2 root root     4096 2011-12-18 12:04 pulse
drwxr-xr-x  2 root root     4096 2011-12-18 12:04 timidity
drwxr-xr-x  2 root root     4096 2011-12-18 12:04 wildmidi
drwxr-xr-x  2 root root     4096 2011-12-18 12:04 gtk-2.0
drwxr-xr-x  5 root root     4096 2011-12-18 12:05 java-6-openjdk
drwxr-xr-x  2 root root     4096 2011-12-18 12:05 icedtea-web
drwxr-xr-x  6 root root     4096 2011-12-18 12:08 kernel
drwxr-xr-x  3 root root     4096 2011-12-18 12:09 OpenCL
drwxr-xr-x  3 root root     4096 2011-12-18 12:09 dkms
drwxr-xr-x  2 root root     4096 2011-12-18 12:09 modprobe.d
-rw-------  1 root harel       0 2011-12-18 13:21 mtab.fuselock
drwxr-xr-x  2 root root     4096 2011-12-18 13:30 gnome
drwxr-xr-x  4 root root     4096 2011-12-18 14:44 java-6-sun
drwxr-xr-x  2 root root     4096 2011-12-18 15:06 subversion
drwxr-xr-x  2 root root     4096 2011-12-18 15:37 bonobo-activation
drwxr-xr-x  2 root root     4096 2011-12-19 10:13 purple
drwxr-xr-x  2 root root     4096 2011-12-19 14:27 lightdm
drwxr-xr-x  2 root root     4096 2011-12-19 22:49 ld.so.conf.d
drwxr-xr-x  5 root root     4096 2011-12-19 22:50 xdg
drwxr-xr-x  6 root root     4096 2011-12-19 23:19 resolvconf
drwxr-xr-x  2 root root     4096 2011-12-19 23:19 rcS.d
drwxr-xr-x  2 root root     4096 2011-12-22 18:57 ssh
drwxr-xr-x  2 root root     4096 2011-12-23 12:05 qt3
drwxr-xr-x  2 root root     4096 2011-12-23 16:09 openvpn
drwxr-xr-x  4 root root     4096 2011-12-23 17:02 vlc
drwxr-xr-x  4 root root     4096 2011-12-23 17:17 dconf
drwxr-xr-x  6 root root     4096 2011-12-23 17:17 gdm
drwxr-xr-x  3 root root     4096 2011-12-24 18:47 samba
drwxr-xr-x  2 root root     4096 2011-12-25 10:39 gtags
drwxr-xr-x  2 root root     4096 2012-01-03 16:01 cron.daily
drwxr-xr-x  7 root root     4096 2012-01-03 16:01 apache2
-rw-r--r--  1 root root      664 2012-01-06 11:11 fstab.bak
-rw-r--r--  1 root root      211 2012-01-10 09:40 modules
-rw-------  1 root root      789 2012-01-11 17:49 gshadow-
-rw-------  1 root root      951 2012-01-11 17:49 group-
-rw-------  1 root root     1343 2012-01-11 17:49 shadow-
-rw-------  1 root root     1863 2012-01-11 17:49 passwd-
-rw-r-----  1 root shadow   1343 2012-01-11 17:49 shadow
-rw-r--r--  1 root root     1878 2012-01-11 17:49 passwd
drwxr-xr-x  5 root root     4096 2012-01-11 17:49 logcheck
drwxr-xr-x  8 root root     4096 2012-01-11 17:49 apparmor.d
drwxr-xr-x  2 root root     4096 2012-01-11 17:49 init
drwxr-xr-x  3 root root     4096 2012-01-11 17:49 mysql
drwxr-xr-x  4 root root     4096 2012-01-13 12:47 dpkg
drwxr-xr-x  3 root root     4096 2012-01-13 12:47 bash_completion.d
drwxr-xr-x  2 root root     4096 2012-01-13 12:48 R
drwxr-xr-x 10 root root     4096 2012-01-16 16:08 X11
drwxr-xr-x  2 root root    12288 2012-01-21 19:44 alternatives
-rw-r--r--  1 root root      773 2012-01-22 14:03 fstab
drwxr-xr-x  3 root root     4096 2012-01-27 10:53 java
drwxr-xr-x  3 root root     4096 2012-01-28 17:24 gimp
drwxr-xr-x  6 root root     4096 2012-01-28 17:27 apt
-rw-r--r--  1 root root    23432 2012-01-28 17:35 mailcap
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 logrotate.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 default
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 init.d
-rw-r--r--  1 root root      972 2012-01-28 17:35 group
-rw-r-----  1 root shadow    807 2012-01-28 17:35 gshadow
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 pam.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc6.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc5.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc4.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc3.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc2.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc1.d
drwxr-xr-x  2 root root     4096 2012-01-28 17:35 rc0.d
-rw-r--r--  1 root root   136548 2012-01-28 17:35 ld.so.cache
-rw-r--r--  1 root root      697 2012-01-31 00:40 mtab
drwxr-xr-x  4 root lp       4096 2012-01-31 00:48 cups


================================================
FILE: examples/group-emails-example
================================================
root root.1@mydomain.com
harel harel.1@mydomain.com
root root.2@mydomain.com
root root.3@mydomain.com
daemon daemon.1@otherdomain.com
dip dip.1@otherdomain.com
dip dip.2@otherdomain.com
fuse fuse.A@mydomain.com
fuse fuse.B@mydomain.com
fuse fuse.C@mydomain.com
lpa lpa.1@mydomain.com
shadow forsaken.1@mydomain.com


================================================
FILE: mkdocs/README.md
================================================

# Generate web site

# mkdocs folder under project root
$ `cd mkdocs`

* create a pyenv virtual environment 

$ `pip install -r requirements.txt`

$ `./generate-web-site.sh` (static files will be generated into `./generated-site`)

$ `git checkout gh-pages`

$ `cd ../`   # back to project root

$ `scp -r mkdocs/generated-site/* ./`

$ `git add` all modified files

* commit to git 

$ `git push origin gh-pages`


================================================
FILE: mkdocs/docs/about.md
================================================
# About

### Linkedin: [Harel Ben Attia](https://www.linkedin.com/in/harelba/)

### Twitter [@harelba](https://twitter.com/harelba)

### Email [harelba@gmail.com](mailto:harelba@gmail.com)

### Patreon [harelba](https://www.patreon.com/harelba)
All the money received is donated to the [Center for the Prevention and Treatment of Domestic Violence](https://www.gov.il/he/departments/bureaus/molsa-almab-ramla) in my hometown - Ramla, Israel.

<a href="https://www.patreon.com/bePatron?u=65276930" data-patreon-widget-type="become-patron-button">Become a Patron!</a><script async src="https://c6.patreon.com/becomePatronButton.bundle.js"></script>

### Chinese translation [jinzhencheng@outlook.com](mailto:jinzhencheng@outlook.com)


================================================
FILE: mkdocs/docs/fsg9b9b1.txt
================================================


================================================
FILE: mkdocs/docs/google0efeb4ff0a886e81.html
================================================
google-site-verification: google0efeb4ff0a886e81.html

================================================
FILE: mkdocs/docs/index.md
================================================
# q - Run SQL directly on CSV or TSV files

[![GitHub Stars](https://img.shields.io/github/stars/harelba/q.svg?style=social&label=GitHub%20Stars&maxAge=600)](https://GitHub.com/harelba/q/stargazers/)
[![GitHub forks](https://img.shields.io/github/forks/harelba/q.svg?style=social&label=GitHub%20Forks&maxAge=600)](https://GitHub.com/harelba/q/network/)

## Overview
q's purpose is to bring SQL expressive power to the Linux command line by providing easy access to text as actual data, and allowing direct access to multi-file sqlite3 databases.

```bash
	q <flags> <sql-query>
```

q allows the following:

* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file

```bash
    # Simple query from a file, columns are named c1...cN
    q "select c1,c5 from myfile.csv"

    # -d '|' sets the input delimiter, -H says there's a header
    q -d '|' -H "select my_field from myfile.delimited-file-with-pipes"

    # -C readwrite writes a cache for the csv file
    q -d , -H "select my_field from myfile.csv" -C readwrite

    # -C read tells q to use the cache
    q -d , -H "select my_field from myfile.csv" -C read

    # Setting the default caching mode (`-C`) can be done by writing a `~/.qrc` file
```

* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory

```bash
    q "select * from mydatabase.sqlite:::my_table_name"

        or

    q "select * from mydatabase.sqlite"

        if the database file contains only one table

    # sqlite files are autodetected, no need for any special filename extension
```

The following table shows the impact of using caching:

|    Rows   | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement |
|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:|
| 5,000,000 |   100   |   4.8GB   |    4 minutes, 47 seconds   |       1.92 seconds      |        x149       |
| 1,000,000 |   100   |   983MB   |        50.9 seconds        |      0.461 seconds      |        x110       |
| 1,000,000 |    50   |   477MB   |        27.1 seconds        |      0.272 seconds      |        x99        |
|  100,000  |   100   |    99MB   |         5.2 seconds        |      0.141 seconds      |        x36        |
|  100,000  |    50   |    48MB   |         2.7 seconds        |      0.105 seconds      |        x25        |

Notice that for the current version, caching is **not enabled** by default, since the caches take disk space. Use `-C readwrite` or `-C read` to enable it for a query, or add `caching_mode` to `.qrc` to set a new default.
 
q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and provides full support for multiple character encodings.

The new features - autocaching, direct querying of sqlite databases and the use of a `~/.qrc` file - are described in detail [here](https://github.com/harelba/q/blob/master/QSQL-NOTES.md).

Download the tool using the links in the [installation](#installation) section below and play with it.

### Encodings
|                                        |                                                 |
|:--------------------------------------:|:-----------------------------------------------:|
| 完全支持所有的字符编码                 | すべての文字エンコーディングを完全にサポート    |
| 모든 문자 인코딩이 완벽하게 지원됩니다 | все кодировки символов полностью поддерживаются |

**Non-english users:** q fully supports all types of encoding. Use `-e data-encoding` to set the input data encoding, `-Q query-encoding` to set the query encoding, and use `-E output-encoding` to set the output encoding. Sensible defaults are in place for all three parameters. Please contact me if you encounter any issues and I'd be glad to help.

**Files with BOM:** Files which contain a BOM ([Byte Order Mark](https://en.wikipedia.org/wiki/Byte_order_mark)) are not properly supported inside python's csv module. q contains a workaround that allows reading UTF8 files which contain a BOM - Use `-e utf-8-sig` for this. I plan to separate the BOM handling from the encoding itself, which would allow supporting BOMs for all encodings.

## Installation

| Format | Instructions | Comments |
|:---|:---|:---|
|[OSX](https://github.com/harelba/q/releases/download/v3.1.6/macos-q)|Run `brew install harelba/q/q` in order to install q (moved it to its own tap), or download the standalone executable directly from the link on the left|A man page is available, just run `man q`|
|[RPM Package](https://github.com/harelba/q/releases/download/v3.1.6/q-text-as-data-3.1.6.x86_64.rpm)| run `rpm -ivh <package-filename>` or `rpm -U <package-filename>` if you already have an older version of q.| A man page is available for this release. Just enter `man q`.|
|[DEB Package](https://github.com/harelba/q/releases/download/v3.1.6/q-text-as-data-3.1.6-1.x86_64.deb)| Run `sudo dpkg -i <package-filename>`|A man page is available for this release. Just enter `man q`. Some installations don't install the man page properly for some reason. I'll fix this soon|
|[Windows Installer](https://github.com/harelba/q/releases/download/v3.1.6/q-text-as-data-3.1.6.msi)|Run the installer executable and hit next next next... q.exe will be added to the PATH so you can access it everywhere.|Windows doesn't update the PATH retroactively for open windows, so you'll need to open a new `cmd`/`bash` window after the installation is done.|
|[Source tar.gz](https://github.com/harelba/q/archive/refs/tags/v3.1.6.tar.gz)|Full source file tree for the latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies||
|[Source zip](https://github.com/harelba/q/archive/refs/tags/v3.1.6.zip)|Full source file tree for the latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies||

I will add packages for additional Linux Distributions if there's demand for it. If you're interested in another Linux distribution, please ping me. It's relatively easy to add new ones with the new packaging flow.

The previous version `2.0.19` can be downloaded directly from [here](https://github.com/harelba/q/releases/tag/2.0.19). Please let me know if for some reason the new version is not suitable for your needs, and you're planning on using the previous one.

## Requirements
q is packaged as a compiled standalone-executable that has no dependencies, not even python itself. This was done by using the awesome [pyoxidizer](https://github.com/indygreg/PyOxidizer) project.


## Examples

This section shows example flows that highlight the main features. For more basic examples, see [here](#getting-started-examples).

### Basic Examples:

```bash
# Prepare some data
$ seq 1 1000000 > myfile.csv

# Query it
$ q "select sum(c1),count(*) from myfile.csv where c1 % 3 = 0"
166666833333 333333

# Use q to query from stdin
$ ps -ef | q -b -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
501 288
0   115
270 17
```

### Auto-caching Examples

```bash
# (time command output has been shortened for brevity)

# Prepare some data
$ seq 1 1000000 > myfile.csv

# Read from the resulting file 
$ time q "select sum(c1),count(*) from myfile.csv"
500000500000 1000000
total_time=4.108 seconds

# Running with `-C readwrite` auto-creates a cache file if there is none. The cache filename would be myfile.csv.qsql. The query runs as usual
$ time q "select sum(c1),count(*) from myfile.csv" -C readwrite
500000500000 1000000
total_time=4.057 seconds

# Now run with `-C read`. The query will run from the cache file and not the original. As the file gets bigger, the difference will be much more noticeable
$ time q "select sum(c1),count(*) from myfile.csv" -C read
500000500000 1000000
total_time=0.229 seconds

# Now let's try another query on that file. Notice the short query duration. The cache is being used for any query that uses this file, and queries on multiple files that contain caches will reuse the cache as well.
$ time q "select avg(c1) from myfile.csv" -C read
500000.5
total_time=0.217 seconds

# You can also query the qsql file directly, as it's just a standard sqlite3 DB file (see next section for q's support of reading directly from sqlite DBs)
$ time q "select sum(c1),count(*) from myfile.csv.qsql"
500000500000 1000000
total_time=0.226 seconds

# Now let's delete the original csv file (be careful when deleting original data)
$ rm -vf myfile.csv

# Running another query directly on the qsql file just works
$ time q "select sum(c1),count(*) from myfile.csv.qsql"
500000500000 1000000
total_time=0.226 seconds

# See the `.qrc` section below if you want to set the default `-C` (`--caching-mode`) to something other than `none` (the default)
```

### Direct sqlite Querying Examples

```bash
# Download example sqlite3 database from https://www.sqlitetutorial.net/sqlite-sample-database/ and unzip it. The resulting file will be chinook.db
$ curl -L https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip | tar -xvf -

# Now we can query the database directly, specifying the name of the table in the query (<db_name>:::<table_name>)
$ q "select count(*) from chinook.db:::albums"
347

# Let's take the top 5 longest tracks of album id 34. The -b option just beautifies the output, and -O tells q to output the column names as headers
$ q "select * from chinook.db:::tracks where albumid = '34' order by milliseconds desc limit 5" -b -O
TrackId Name                       AlbumId MediaTypeId GenreId Composer Milliseconds Bytes    UnitPrice
407     "Só Tinha De Ser Com Você" 34      1           7       Vários   389642       13085596 0.99
398     "Only A Dream In Rio"      34      1           7       Vários   371356       12192989 0.99
393     "Tarde Em Itapoã"          34      1           7       Vários   313704       10344491 0.99
401     "Momentos Que Marcam"      34      1           7       Vários   280137       9313740  0.99
391     "Garota De Ipanema"        34      1           7       Vários   279536       9141343  0.99

# Let's now copy the chinook database to another file, as if it's just another different database
$ cp chinook.db another_database.db

# Now we can run a join query between the two databases. They could have been any two different databases, using the copy of chinook is just for simplicity
# Let's get the top-5 longest albums, using albums from the first database and tracks from the second database. The track times are converted from milliseconds to minutes, and rounded to two digits after the decimal point.
$ q -b -O "select a.title,round(sum(t.milliseconds)/1000.0/60,2) total_album_time_minutes from chinook.db:::albums a left join another_database.db:::tracks t on (a.albumid = t.albumid) group by a.albumid order by total_album_time_minutes desc limit 5"
Title                                      total_album_time_minutes
"Lost, Season 3"                           1177.76
"Battlestar Galactica (Classic), Season 1" 1170.23
"Lost, Season 1"                           1080.92
"Lost, Season 2"                           1054.83
"Heroes, Season 1"                         996.34
```

### Analysis Examples

```bash
# Let's create a simple CSV file without a header. Make sure to copy only the three lines, press enter, and
# then press Ctrl-D to exit so the file will be written.
$ cat > some-data-without-header.csv
harel,1,2
ben,3,4
attia,5,6
<Ctrl-D>

# Let's run q on it with -A, to see the detected structure of the file. `-d ,` sets the delimiter to a comma
$ q -d , "select * from some-data-without-header.csv" -A
Table: /Users/harelben-attia/dev/harelba/q/some-data-without-header.csv
  Sources:
    source_type: file source: /Users/harelben-attia/dev/harelba/q/some-data-without-header.csv
  Fields:
    `c1` - text
    `c2` - int
    `c3` - int

# Now let's create another simple CSV file, this time with a header (-H tells q to expect a header in the file)
$ cat > some-data.csv
planet_id,name,diameter_km,length_of_day_hours
1000,Earth,12756,24
2000,Mars,6792,24.7
3000,Jupiter,142984,9.9
<Ctrl-D>

# Let's run q with -A to see the analysis results.
$ q -b -O -H -d , "select * from some-data.csv" -A
Table: /Users/harelben-attia/dev/harelba/q/some-data.csv
  Sources:
    source_type: file source: /Users/harelben-attia/dev/harelba/q/some-data.csv
  Fields:
    `planet_id` - int
    `name` - text
    `diameter_km` - int
    `length_of_day_hours` - real

# Let's run it with `-C readwrite` so a cache will be created
$ q -b -O -H -d , "select * from some-data.csv" -C readwrite
planet_id,name   ,diameter_km,length_of_day_hours
1000     ,Earth  ,12756      ,24.0
2000     ,Mars   ,6792       ,24.7
3000     ,Jupiter,142984     ,9.9

# Running another query that uses some-data.csv with -A will now show that a qsql exists for that file. The source-type 
# will be "file-with-unused-qsql". The qsql cache is not being used, since by default, q does not activate caching
# so backward compatibility is maintained
$ q -b -O -H -d , "select * from some-data.csv" -A
Table: /Users/harelben-attia/dev/harelba/q/some-data.csv
  Sources:
    source_type: file-with-unused-qsql source: /Users/harelben-attia/dev/harelba/q/some-data.csv
  Fields:
    `planet_id` - int
    `name` - text
    `diameter_km` - int
    `length_of_day_hours` - real

# Now let's run another query, this time with `-C read`, telling q to use the qsql caches. This time source-type will 
# be "qsql-file-with-original", and the cache will be used when querying:
$ q -b -O -H -d , "select * from some-data.csv" -A -C read
Table: /Users/harelben-attia/dev/harelba/q/some-data.csv
  Sources:
    source_type: qsql-file-with-original source: /Users/harelben-attia/dev/harelba/q/some-data.csv.qsql
  Fields:
    `planet_id` - int
    `name` - text
    `diameter_km` - int
    `length_of_day_hours` - real

# Let's now read directly from the qsql file. Notice the change in the table name inside the query. `-C read` is not needed
# here. The source-type will be "qsql-file" 
$ q -b -O -H -d , "select * from some-data.csv.qsql" -A
Table: /Users/harelben-attia/dev/harelba/q/some-data.csv.qsql
  Sources:
    source_type: qsql-file source: /Users/harelben-attia/dev/harelba/q/some-data.csv.qsql
  Fields:
    `planet_id` - int
    `name` - text
    `diameter_km` - int
    `length_of_day_hours` - real
```

## Usage
Query should be an SQL-like query which contains filenames instead of table names (or - for stdin). The query itself should be provided as one parameter to the tool (i.e. enclosed in quotes).

All sqlite3 SQL constructs are supported, including joins across files (use an alias for each table). Take a look at the [limitations](#limitations) section below for some rarely-used use cases which are not fully supported.

q gets a full SQL query as a parameter. Remember to double-quote the query.

Historically, q supports multiple queries on the same command-line, loading each data file only once, even if it is used by multiple queries on the same q invocation. This is still supported. However, due to the new automatic-caching capabilities, this is not really required. Activate caching, and a cache file will be automatically created for each file. q will use the cache behind the scenes in order to speed up queries. The speed up is extremely significant, so consider using caching for large files.

The following filename types are supported:

* **Delimited-file filenames** - including relative/absolute paths. E.g. `./my_folder/my_file.csv` or `/var/tmp/my_file.csv`
* **sqlite3 database filenames**
    * **With Multiple Tables** - Add an additional `:::<table_name>` for accessing a specific table. For example `mydatabase.sqlite3:::users_table`.
    * **With One Table Only** - Just specify the database filename, no need for a table name postfix. For example `my_single_table_database.sqlite`.
* **`.qsql` cache files** - q can auto-generate cache files for delimited files, and they can be queried directly as a table, since they contain only one table, as they are essentially standard sqlite databases

Use `-H` to signify that the input contains a header line. Column names will be detected automatically in that case, and can be used in the query. If this option is not provided, columns will be named cX, starting with 1 (e.g. `q "SELECT c3,c8 from ..."`).

Use `-d` to specify the input delimiter.

Column types are auto detected by the tool, no casting is needed. Note that there's a flag `--as-text` which forces all columns to be treated as text columns.

Please note that column names that include spaces need to be used in the query with back-ticks, as per the sqlite standard. Make sure to use single-quotes around the query, so bash/zsh won't interpret the backticks.

Query/Input/Output encodings are fully supported (and q tries to provide out-of-the-box usability in that area). Please use `-e`,`-E` and `-Q` to control encoding if needed.

JOINs are supported and Subqueries are supported in the WHERE clause, but unfortunately not in the FROM clause for now. Use table aliases when performing JOINs.

The SQL syntax itself is sqlite's syntax. For details look at https://www.sqlite.org/lang.html or search the net for examples.

NOTE: When using the `-O` output header option, use column name aliases if you want to control the output column names. For example, `q -O -H "select count(*) cnt,sum(*) as mysum from -"` would output `cnt` and `mysum` as the output header column names.

``` bash
Options:
  -h, --help            show this help message and exit
  -v, --version         Print version
  -V, --verbose         Print debug info in case of problems
  -S SAVE_DB_TO_DISK_FILENAME, --save-db-to-disk=SAVE_DB_TO_DISK_FILENAME
                        Save database to an sqlite database file
  -C CACHING_MODE, --caching-mode=CACHING_MODE
                        Choose the autocaching mode (none/read/readwrite).
                        Autocaches files to disk db so further queries will be
                        faster. Caching is done to a side-file with the same
                        name of the table, but with an added extension .qsql
  --dump-defaults       Dump all default values for parameters and exit. Can
                        be used in order to make sure .qrc file content is
                        being read properly.
  --max-attached-sqlite-databases=MAX_ATTACHED_SQLITE_DATABASES
                        Set the maximum number of concurrently-attached sqlite
                        dbs. This is a compile time definition of sqlite. q's
                        performance will slow down once this limit is reached
                        for a query, since it will perform table copies in
                        order to avoid that limit.
  --overwrite-qsql=OVERWRITE_QSQL
                        When used, qsql files (both caches and store-to-db)
                        will be overwritten if they already exist. Use with
                        care.

  Input Data Options:
    -H, --skip-header   Skip header row. This has been changed from earlier
                        version - Only one header row is supported, and the
                        header row is used for column naming
    -d DELIMITER, --delimiter=DELIMITER
                        Field delimiter. If none specified, then space is used
                        as the delimiter.
    -p, --pipe-delimited
                        Same as -d '|'. Added for convenience and readability
    -t, --tab-delimited
                        Same as -d <tab>. Just a shorthand for handling
                        standard tab delimited file You can use $'\t' if you
                        want (this is how Linux expects to provide tabs in the
                        command line
    -e ENCODING, --encoding=ENCODING
                        Input file encoding. Defaults to UTF-8. set to none
                        for not setting any encoding - faster, but at your own
                        risk...
    -z, --gzipped       Data is gzipped. Useful for reading from stdin. For
                        files, .gz means automatic gunzipping
    -A, --analyze-only  Analyze sample input and provide information about
                        data types
    -m MODE, --mode=MODE
                        Data parsing mode. fluffy, relaxed and strict. In
                        strict mode, the -c column-count parameter must be
                        supplied as well
    -c COLUMN_COUNT, --column-count=COLUMN_COUNT
                        Specific column count when using relaxed or strict
                        mode
    -k, --keep-leading-whitespace
                        Keep leading whitespace in values. Default behavior
                        strips leading whitespace off values, in order to
                        provide out-of-the-box usability for simple use cases.
                        If you need to preserve whitespace, use this flag.
    --disable-double-double-quoting
                        Disable support for double double-quoting for escaping
                        the double quote character. By default, you can use ""
                        inside double quoted fields to escape double quotes.
                        Mainly for backward compatibility.
    --disable-escaped-double-quoting
                        Disable support for escaped double-quoting for
                        escaping the double quote character. By default, you
                        can use \" inside double quoted fields to escape
                        double quotes. Mainly for backward compatibility.
    --as-text           Don't detect column types - All columns will be
                        treated as text columns
    -w INPUT_QUOTING_MODE, --input-quoting-mode=INPUT_QUOTING_MODE
                        Input quoting mode. Possible values are all, minimal
                        and none. Note the slightly misleading parameter name,
                        and see the matching -W parameter for output quoting.
    -M MAX_COLUMN_LENGTH_LIMIT, --max-column-length-limit=MAX_COLUMN_LENGTH_LIMIT
                        Sets the maximum column length.
    -U, --with-universal-newlines
                        Expect universal newlines in the data. Limitation: -U
                        works only with regular files for now, stdin or .gz
                        files are not supported yet.

  Output Options:
    -D OUTPUT_DELIMITER, --output-delimiter=OUTPUT_DELIMITER
                        Field delimiter for output. If none specified, then
                        the -d delimiter is used if present, or space if no
                        delimiter is specified
    -P, --pipe-delimited-output
                        Same as -D '|'. Added for convenience and readability.
    -T, --tab-delimited-output
                        Same as -D <tab>. Just a shorthand for outputting tab
                        delimited output. You can use -D $'\t' if you want.
    -O, --output-header
                        Output header line. Output column-names are determined
                        from the query itself. Use column aliases in order to
                        set your column names in the query. For example,
                        'select name FirstName,value1/value2 MyCalculation
                        from ...'. This can be used even if there was no
                        header in the input.
    -b, --beautify      Beautify output according to actual values. Might be
                        slow...
    -f FORMATTING, --formatting=FORMATTING
                        Output-level formatting, in the format X=fmt,Y=fmt
                        etc, where X,Y are output column numbers (e.g. 1 for
                        first SELECT column etc.
    -E OUTPUT_ENCODING, --output-encoding=OUTPUT_ENCODING
                        Output encoding. Defaults to 'none', leading to
                        selecting the system/terminal encoding
    -W OUTPUT_QUOTING_MODE, --output-quoting-mode=OUTPUT_QUOTING_MODE
                        Output quoting mode. Possible values are all, minimal,
                        nonnumeric and none. Note the slightly misleading
                        parameter name, and see the matching -w parameter for
                        input quoting.
    -L, --list-user-functions
                        List all user functions

  Query Related Options:
    -q QUERY_FILENAME, --query-filename=QUERY_FILENAME
                        Read query from the provided filename instead of the
                        command line, possibly using the provided query
                        encoding (using -Q).
    -Q QUERY_ENCODING, --query-encoding=QUERY_ENCODING
                        query text encoding. Experimental. Please send your
                        feedback on this
```

### Setting the default values for parameters
It's possible to set default values for parameters which are used often by configuring them in the file `~/.qrc`.

The file format is as follows:
```bash
[options]
<setting>=<default-value>
```

It's possible to generate a default `.qrc` file by running `q --dump-defaults` and writing the output into the `.qrc` file.

One valuable use-case for this could be setting the caching-mode to `read`. This will make q automatically use generated `.qsql` cache files if they exist. Whenever you want a cache file to be generated, just use `-C readwrite` and a `.qsql` file will be generated if it doesn't exist.

Here's the content of the `~/.qrc` file for enabling cache reads by default:
```bash
[options]
caching_mode=read
```
  
## Getting Started Examples
This section shows some more basic examples of simple SQL constructs. 

For some more complex use-cases, see the [examples](#examples) at the beginning of the documentation.

NOTES:

* The `-H` flag in the examples below signifies that the file has a header row which is used for naming columns.
* The `-t` flag is just a shortcut for saying that the file is a tab-separated file (any delimiter is supported - Use the `-d` flag).
* Queries are given using upper case for clarity, but actual query keywords such as SELECT and WHERE are not really case sensitive.

Basic Example List:

* [Example 1 - COUNT DISTINCT values of specific field (uuid of clicks data)](#example-1)
* [Example 2 - Filter numeric data, controlling ORDERing and LIMITing output](#example-2)
* [Example 3 - Illustrate GROUP BY](#example-3)
* [Example 4 - More complex GROUP BY (group by time expression)](#example-4)
* [Example 5 - Read input from standard input](#example-5)
* [Example 6 - Use column names from header row](#example-6)
* [Example 7 - JOIN two files](#example-7)

### Example 1
Perform a COUNT DISTINCT values of specific field (uuid of clicks data).

``` bash
q -H -t "SELECT COUNT(DISTINCT(uuid)) FROM ./clicks.csv"
```
Output
``` bash
229
```
### Example 2
Filter numeric data, controlling ORDERing and LIMITing output

Note that q understands that the column is numeric and filters according to its numeric value (real numeric value comparison, not string comparison).

``` bash
q -H -t "SELECT request_id,score FROM ./clicks.csv WHERE score > 0.7 ORDER BY score DESC LIMIT 5"
```
Output:
``` bash
2cfab5ceca922a1a2179dc4687a3b26e    1.0
f6de737b5aa2c46a3db3208413a54d64    0.986665809568
766025d25479b95a224bd614141feee5    0.977105183282
2c09058a1b82c6dbcf9dc463e73eddd2    0.703255121794
```

### Example 3
Illustrate GROUP BY

``` bash
q -t -H "SELECT hashed_source_machine,count(*) FROM ./clicks.csv GROUP BY hashed_source_machine"
```
Output:
``` bash
47d9087db433b9ba.domain.com 400000
```

### Example 4
More complex GROUP BY (group by time expression)

``` bash
q -t -H "SELECT strftime('%H:%M',date_time) hour_and_minute,count(*) FROM ./clicks.csv GROUP BY hour_and_minute"
```
Output:
``` bash
07:00   138148
07:01   140026
07:02   121826
```

### Example 5
Read input from standard input

Calculates the total size per user/group in the /tmp subtree.

``` bash
sudo find /tmp -ls | q "SELECT c5,c6,sum(c7)/1024.0/1024 AS total FROM - GROUP BY c5,c6 ORDER BY total desc"
```
Output:
``` bash
mapred hadoop   304.00390625
root   root     8.0431451797485
smith  smith    4.34389972687
```

### Example 6
Use column names from header row

Calculate the top 3 user ids with the largest number of owned processes, sorted in descending order.

Note the usage of the autodetected column name UID in the query.

``` bash
ps -ef | q -H "SELECT UID,COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
```
Output:
``` bash
root 152
harel 119
avahi 2
```

### Example 7
JOIN two files

The following command joins an ls output (exampledatafile) and a file containing rows of group-name,email (group-emails-example) and provides a row of filename,email for each of the emails of the group. For brevity of output, there is also a filter for a specific filename called ppp which is achieved using a WHERE clause.

``` bash
q "SELECT myfiles.c8,emails.c2 FROM exampledatafile myfiles JOIN group-emails-example emails ON (myfiles.c4 = emails.c1) WHERE myfiles.c8 = 'ppp'"
```
Output:
``` bash
ppp dip.1@otherdomain.com
ppp dip.2@otherdomain.com
```

You can see that the ppp filename appears twice, each time matched to one of the emails of the group dip to which it belongs. Take a look at the files `exampledatafile` and `group-emails-example` for the data.

Column name detection is supported for JOIN scenarios as well. Just specify `-H` in the command line and make sure that the source files contain the header rows.

## Implementation
Behind the scenes q creates a "virtual" sqlite3 database that does not contain data of its own, but attaches to multiple other databases as follows:

* When reading delimited files or data from `stdin`, it will analyze the data and construct an in-memory "adhoc database" that contains it. This adhoc database will be attached to the virtual database
* When a delimited file has a `.qsql` cache, it will attach to that file directly, without having to read it into memory
* When querying a standard sqlite3 file, it will be attached to the virtual database as well, without reading it into memory. sqlite3 files are auto-detected, no need for any special filename extension

The user query will be executed directly on the virtual database, using the attached databases.

sqlite3 itself has a limit on the number of attached databases (usually 10). If a query needs more databases than that limit allows, q will automatically attach databases until the limit is reached, and will load the additional tables into the adhoc database's in-memory database.

Please make sure to read the [limitations](#limitations) section as well.

## Development

### Tests
The code includes a test suite runnable through `run-tests.sh`. By default, it uses the python source code for running the tests. However, it is possible to provide a path to an actual executable to the tests using the `Q_EXECUTABLE` env var. This is actually being used during the build and packaging process, in order to test the resulting binary.  

## Limitations
Here's the list of known limitations. Please contact me if you have a use case that needs any of those missing capabilities.

* Common Table Expressions (CTE) are not supported for now. Will be implemented soon - See [here](https://github.com/harelba/q/issues/67) and [here](https://github.com/harelba/q/issues/124) for details.
* `FROM <subquery>` is not supported
* Spaces in file names are not supported. Use stdin for piping the data into q, or rename the file
* Some rare cases of subqueries are not supported yet.
* Queries with more than 10 different sqlite3 databases will load some data into memory
* Up to 500 tables are supported in a single query

## Rationale
Have you ever stared at a text file on the screen, hoping it would have been a database so you could ask anything you want about it? I had that feeling many times, and I've finally understood that it's not the database that I want. It's the language - SQL.

SQL is a declarative language for data, and as such it allows me to define what I want without caring about how exactly it's done. This is the reason SQL is so powerful, because it treats data as data and not as bits and bytes (and chars).

The goal of this tool is to provide a bridge between the world of text files and of SQL.

### Why aren't other Linux tools enough?
The standard Linux tools are amazing and I use them all the time, but the whole idea of Linux is mixing-and-matching the best tools for each part of the job. This tool adds the declarative power of SQL to the Linux toolset, without losing any of the other tools' benefits. In fact, I often use q together with other Linux tools, the same way I pipe awk/sed and grep together all the time.

One additional thing to note is that many Linux tools treat text as text and not as data. In that sense, you can look at q as a meta-tool which provides access to all the data-related tools that SQL provides (e.g. expressions, ordering, grouping, aggregation etc.).

### Philosophy
This tool has been designed with general Linux/Unix design principles in mind. If you're interested in these general design principles, read this amazing [book](http://catb.org/~esr/writings/taoup/) and specifically [this part](http://catb.org/~esr/writings/taoup/html/ch01s06.html). If you believe that the way this tool works goes strongly against any of the principles, I would love to hear your view about it.

## Future

* Expose q as a python module - Planned as a goal after the new version `3.x` is out


================================================
FILE: mkdocs/docs/index_cn.md
================================================
# q - 直接在CSV或TSV文件上运行SQL

[![GitHub Stars](https://img.shields.io/github/stars/harelba/q.svg?style=social&label=GitHub Stars&maxAge=600)](https://GitHub.com/harelba/q/stargazers/)
[![GitHub forks](https://img.shields.io/github/forks/harelba/q.svg?style=social&label=GitHub Forks&maxAge=600)](https://GitHub.com/harelba/q/network/)


## 概述
q 是一个可以在 CSV / TSV 文件(或其他表格式的文本文件)上运行类SQL命令的命令行工具。

q 将普通文本（如上述）作为数据库表，且支持所有的SQL语法如：WHERE、GROUP BY、各种JOIN等。此外，还拥有自动识别列名和列类型及广泛支持多种编码的特性。

``` bash
q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3"
```

``` bash
ps -ef | q -H "SELECT UID,COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
```

查看[示例](#示例)或[安装](#安装)体验.

|                                        |                                                 |
|:--------------------------------------:|:-----------------------------------------------:|
| 完全支持所有的字符编码                 | すべての文字エンコーディングを完全にサポート    |
| 모든 문자 인코딩이 완벽하게 지원됩니다 | все кодировки символов полностью поддерживаются |


**非英语用户:** q 完全支持所有类型的字符编码。 使用 `-e data-encoding` 设置输入编码; 使用 `-Q query-encoding` 设置查询编码; 使用 `-E output-encoding` 设置输出编码;
如上三个参数均设有合理的默认值。<br/>

> 如果遇到问题请与我联系，期待与你交流。

**含有BOM的文件:** python的csv模块并不能很好的支持含有[Byte Order Mark](https://en.wikipedia.org/wiki/Byte_order_mark) 的文件。针对该种情况，使用 `-e utf-8-sig` 命令参数可读取包含BOM的UTF8编码文件。

> 我们计划将BOM相关处理与编码'解耦', 这样就可以支持所有编码的BOM文件了。

## 安装

| 格式 | 说明 | 备注 |
|:---|:---|:---|
|[OSX](https://github.com/harelba/q/releases/download/2.0.19/q-x86_64-Darwin)|运行 `brew install q`| 该方式暂不支持MAN手册, 可以使用 `q --help` 查看帮助|
|[RPM Package](https://github.com/harelba/q/releases/download/2.0.19/q-text-as-data-2.0.19-1.x86_64.rpm)| 运行 `rpm -ivh <package-filename>` 如果安装过旧版则运行 `rpm -U <package-filename>` | 该方式支持MAN手册，可运行`man q`查看|
|[DEB Package](https://github.com/harelba/q/releases/download/2.0.19/q-text-as-data_2.0.19-2_amd64.deb)| 运行 `sudo dpkg -i <package-filename>`|该方式支持MAN手册，可运行`man q`查看|
|[Windows Installer](https://github.com/harelba/q/releases/download/2.0.19/q-AMD64-Windows-installer.exe)|运行安装可执行文件，一直点击下一步、下一步... q.exe 将被添加至PATH，以便于随处运行|PATH更新后并不会即时生效，重新打开cmd命令窗口便可|
|[tar.gz](https://github.com/harelba/q/archive/2.0.19.tar.gz)|最新稳定版的所有源码文件。提示，q.py 文件不能直接使用，因为它需要python依赖||
|[zip](https://github.com/harelba/q/archive/2.0.19.zip)|最新稳定版的所有源码文件。提示，q.py 文件不能直接使用，因为它需要python依赖||

**旧版本可以在这儿[下载](https://github.com/harelba/packages-for-q) 。按理说不会有人愿意用旧版本，要是你计划使用旧版，希望能与你交流。**

## 须知
从`2.0.9`版本开始，不需要任何外部依赖。Python(3.7)和其他所需的库包含在了安装文件中且与系统隔离。

## 使用

``` bash
q <flags> "<query>"

  最简单的执行语句：q "SELECT * FROM myfile" 该语句会输出文件内容
```

q 支持在表格式的文本上执行类SQL命令。它的初衷是为Linux命令行附加SQL的表达力且实现对文本数据的轻松访问。

类SQL的查询将*文件名(或标准输入流)看作表名*。查询语句会作为命令输入的一个参数（使用引号包裹)，如果将多个文件看作一张表，可以这样写 `文件名1+文件名2....`或者使用通配符（比如：`my_files*.csv`)。

使用 `-H` 表示输入内容中包含表头。该情况下列名会被自动识别，如果没有指定该参数，列名将会被以`cX`命名，`X`从1开始（比如: `q "SELECT c3,c8 from ..."`) 。

使用 `-d` 声明输入的分隔符。

列类型可由工具自动识别，无需强制转换。 提示，使用`--as-text` 可以强制将所有列类型转换为文本类型。

依据sqlite规范，如果列名中含有空格，需要使用反引号 (即：`) 引起来。

完全支持查询/输入/输出的编码设置（q 力争提供一种开箱即用的方法), 可以分别使用`-Q`,`-e` 和 `-E`来指定编码设置类型。

支持所有的sqlite3 SQL方法，包括文件之间的 JOIN（可以为文件设置别名）操作。在下面的[限制](#限制)小节可以看到一些少有使用的、欠支持的说明。

### 查询

q 的每一个参数都是由双引号包裹的一条完整的SQL语句。所有的查询语句会依次执行，最终结果以标准输出流形式输出。 提示，在同一命令行中执行多条查询语句时，仅在执行第一条查询语句时需要耗时载入数据，其他查询语句即时执行。

支持所有标准SQL语法，条件（WHERE 和 HAVING）、GROUP BY、ORDER BY等。

在WHERE条件查询中，支持JOIN操作和子查询，但在FROM子句中并不支持。JOIN操作时，可以为文件起别名。

SQL语法同sqlite的语法，详情见 https://www.sqlite.org/lang.html 或上网找一些示例。

**注意**：

* 支持所有类型的自动识别，无需强制转换或其他操作。
  
* 如果重命名输出列，则需要为列指定别名并使用 `-O` 声明。如: `q -O -H "select count(*) cnt,sum(*) as mysum from -"` 便会将`cnt`和`mysum`作为列名输出。

### 指令

``` bash
使用:
        q 支持在表格式的文本数据上执行类SQL查询。

        它的初衷是为Linux命令行附加SQL的表达力且实现对文本数据的轻松访问。

        基本操作是 q "SQL查询语句" 表名便是文件名（使用 - 从标准输入中读取数据）。若输入内容包含表头时，可以使用 -H 指定列名。若无表头，则列将会自动命名为 c1...cN。

        列类型可被自动识别。可以使用 -A 命令查看每列的名称及其类型。

        可以使用 -d (或 -t) 指定分隔符，使用 -D 指定输出分割符。

        支持所有的sqlite3 SQL方法。

        示例:
            
          例子1: ls -ltrd * | q "select c1,count(1) from - group by c1" 
          上例将会输出当前目录下，所有文件的权限表达式分组及每组数量。

          例子2: seq 1 1000 | q "select avg(c1),sum(c1) from -" 
          上例将会输出1到1000的平均数与和数。
          
          例子3: sudo find /tmp -ls | q "select c5,c6,sum(c7)/1024.0/1024 as total from - group by c5,c6 order by total desc" 
          上例将会输出在/tmp目录下，相同'用户+组'的文件所占用的MB磁盘空间。

          更多详情见 https://github.com/harelba/q/ 或查看帮助
    
选项：
  -h, --help            显示此帮助信息并退出 
  -v, --version         显示版本号
  -V, --verbose         出现问题时显示调试信息
  -S SAVE_DB_TO_DISK_FILENAME, --save-db-to-disk=SAVE_DB_TO_DISK_FILENAME
                        将数据库保存为一个 sqlite 数据库文件
  --save-db-to-disk-method=SAVE_DB_TO_DISK_METHOD
                        保存数据库到磁盘的方法
                        'standard' 不需要任何设置
                        'fast'需要手动在python的安装目录下执行`pip install sqlitebck`
                        打包的问题解决后，'fast'即被作为默认方式
  数据相关的选项:
  
    -H, --skip-header   忽略表头，在早期的版本中已修改为：仅支持用于标明列名的一行表头
    -d DELIMITER, --delimiter=DELIMITER
                        列分隔符，若无特别指定，默认为空格符
    -p, --pipe-delimited
                        作用同 -d '|'，为了方便和可读性提供该参数
    -t, --tab-delimited
                        作用同 -d <tab>，这仅是一种简写，也可以在Linux命令行中使用$'\t'
    -e ENCODING, --encoding=ENCODING
                        输入文件的编码，默认是UTF-8
    -z, --gzipped       数据已经过 gzip 压缩。适用于从标准输入读取压缩数据；对于文件，.gz 扩展名会自动触发解压
    -A, --analyze-only  简单分析：各列的数据类型
    -m MODE, --mode=MODE
                        数据解析模式: 松散, 宽松和严格。在严格模式下必须指定 -c 
                        --column-count 参数。
    -c COLUMN_COUNT, --column-count=COLUMN_COUNT
                        当使用宽松或严格模式时，用于指定列的数量
    -k, --keep-leading-whitespace
                        保留每列前的空格。为了使其开箱即用，默认去除了列前的空格
                        如果有需要，可以指定该参数
    --disable-double-double-quoting
                        禁止一对双引号的转义。默认可以使用 "" 转义双引号
                        主要为了向后兼容
    --disable-escaped-double-quoting
                        禁止转义双引号
                        默认可以在双引号字段中使用 \" 进行转义
                        主要为了向后兼容 
    --as-text           不识别列类型（所有列被当作文本类型）
    -w INPUT_QUOTING_MODE, --input-quoting-mode=INPUT_QUOTING_MODE
                        输入内容的转义模式，可选值 all、minimal、none
                        该参数稍有误导性，-W 指定输出内容的转义模式 
    -M MAX_COLUMN_LENGTH_LIMIT, --max-column-length-limit=MAX_COLUMN_LENGTH_LIMIT
                        设置列的最大长度
    -U, --with-universal-newlines
                        设置通用换行符
                        -U 参数当前仅适用于常规文件，输入流或.gz类文件暂不支持

  输出相关的选项:
    -D OUTPUT_DELIMITER, --output-delimiter=OUTPUT_DELIMITER
                        输出列间的分隔符
                        若未指定，则与 -d 指定的分隔符相同；若均未指定，则默认为空格符
    -P, --pipe-delimited-output
                        同 -D '|' 为了方便和可读性提供该参数
    -T, --tab-delimited-output
                        同 -D <tab> 这仅是一种简写，也可以在Linux命令行中使用$'\t' 
    -O, --output-header
                        输出表头，输出的列名是由查询中指定的别名
                        如: 'select name FirstName, value1/value2 MyCalculation
                        from ...' 即使输入时未指定表头仍可使用该参数。
    -b, --beautify      美化输出结果，可能较慢...
    -f FORMATTING, --formatting=FORMATTING
                        格式化输出列
                        如格式X=fmt，Y=fmt等，上述中的X、Y是指第几列（如：1 表示 SELECT 
                        的第一列)
    -E OUTPUT_ENCODING, --output-encoding=OUTPUT_ENCODING
                        输出内容的编码，默认是 'none'，跟随系统或终端的编码
    -W OUTPUT_QUOTING_MODE, --output-quoting-mode=OUTPUT_QUOTING_MODE
                        输出内容的转义模式，可选值 all、minimal、none
                        该参数稍有误导性，-w 指定输入内容的转义模式 
    -L, --list-user-functions
                        列出所有内置函数

  查询相关的参数:
    -q QUERY_FILENAME, --query-filename=QUERY_FILENAME
                        指定文件名，由文件中读取查询语句。
                        该操作常与查询编码（使用 -Q)一同使用
    -Q QUERY_ENCODING, --query-encoding=QUERY_ENCODING
                        查询编码(包含查询语句的文件编码)
                        实验性参数，对该参数的意见可反馈
```

## 示例
下述 `-H` 参数的例子，表示文件中含有表头时使用该参数。

`-t` 参数是指定文件以 tab 作为分隔符的缩写（可以使用 `-d` 参数指定任意分隔符）。

为了清楚起见，查询关键字均使用大写，实际上关键字(如 SELECT、WHERE等)对大小写并不敏感。

示例目录:

* [例1 - 统计指定列唯一值的数量](#1)
* [例2 - 数值条件过滤、排序并限制输出数](#2)
* [例3 - GROUP BY简单示例](#3)
* [例4 - GROUP BY进阶示例 (以时间格式分组)](#4)
* [例5 - 标准输入流作为输入](#5)
* [例6 - 使用表头中列名](#6)
* [例7 - JOIN 两个文件](#7)

### 例1
对指定字段（点击数据中的uuid）执行 COUNT DISTINCT 

``` bash
q -H -t "SELECT COUNT(DISTINCT(uuid)) FROM ./clicks.csv"
```
输出:
``` bash
229
```

### 例2
过滤数值数据、排序并限制输出数量

注意：q 将其看作数值类型并对其进行数值过滤(数值比较而不是字符串比较)

``` bash
q -H -t "SELECT request_id,score FROM ./clicks.csv WHERE score > 0.7 ORDER BY score DESC LIMIT 5"
```
输出:
``` bash
2cfab5ceca922a1a2179dc4687a3b26e    1.0
f6de737b5aa2c46a3db3208413a54d64    0.986665809568
766025d25479b95a224bd614141feee5    0.977105183282
2c09058a1b82c6dbcf9dc463e73eddd2    0.703255121794
```

### 例3
GROUP BY 简单示例

``` bash
q -t -H "SELECT hashed_source_machine,count(*) FROM ./clicks.csv GROUP BY hashed_source_machine"
```
输出:
``` bash
47d9087db433b9ba.domain.com 400000
```

### 例4
GROUP BY进阶示例 (以时间格式分组)

``` bash
q -t -H "SELECT strftime('%H:%M',date_time) hour_and_minute,count(*) FROM ./clicks.csv GROUP BY hour_and_minute"
```
输出:
``` bash
07:00   138148
07:01   140026
07:02   121826
```

### 例5
标准输入流作为输入

计算 /tmp 目录下各 user/group 的占用空间大小

``` bash
sudo find /tmp -ls | q "SELECT c5,c6,sum(c7)/1024.0/1024 AS total FROM - GROUP BY c5,c6 ORDER BY total desc"
```
输出:
``` bash
mapred hadoop   304.00390625
root   root     8.0431451797485
smith  smith    4.34389972687
```

### 例6
使用表头中列名

计算拥有进程数最多的前3位用户名及其数量

注意: 该查询中自动识别了列名

``` bash
ps -ef | q -H "SELECT UID,COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
```
输出:
``` bash
root 152
harel 119
avahi 2
```

### 例7
JOIN 两个文件

如下命令中JOIN一个ls命令输出内容文件（exampledatafile) 和一个包含group_name、email两列字段的文件（group-emails-example)，每一邮件组均包含filename、email列, 为了输出简便，使用WHERE条件过滤出名为 ppp 的文件

``` bash
q "SELECT myfiles.c8,emails.c2 FROM exampledatafile myfiles JOIN group-emails-example emails ON (myfiles.c4 = emails.c1) WHERE myfiles.c8 = 'ppp'"
```
输出:
``` bash
ppp dip.1@otherdomain.com
ppp dip.2@otherdomain.com
```
可以看出 ppp 文件出现了两次，每次都匹配到了它所属的dip邮件组（如例中 dip.1@... /  dip.2@...)，可以在 `exampledatafile` 和 `group-emails-example` 文件中查看数据。

JOIN 的应用场景中也支持列名识别，在查询包含表头的文件时，只需指定 `-H` 参数即可。

## 声明
为了避免引用外部依赖，当前是使用由Python编写的内存数据库实现的。当前是支持 SELECT 语句及 各种JOIN （ 目前仅在 WHERE 语句中支持子查询)。
若想对数据进一步分析，可以使用 `--save-db-to-disk` 参数，以将结果输出为 sqlite 数据库文件，然后使用 `sqlite3` 语句来执行查询操作。

需要提示的是，当前并没有对数据量的大小进行检测和限制 - 也就是说，需要用户自己掌控文件大小。

请务必阅读[限制](#限制)小节。

## 开发

### 测试
源码中包含了测试用例，可以通过 `./run-tests.sh` 来执行。若想要提交 PR的话，一定先确保其均执行成功。

## 限制
如下罗列了一些已知的限制，若你的使用场景中需要用到以下标明的限制，请联系我。

* 不支持 `FROM <subquery>` 
* 不支持公用表表达式(CTE)
* 不支持文件名中包含空格 (可以将文件以标准输入流的方式输入 q 或重命名文件)
* 不支持较少用到的子查询

## 原理
你是否曾经盯着屏幕上的文本文件发呆，希望它要是数据库就好了，这样就可以找出自己想要的内容？我曾有过很多次，最终顿悟。我想要的不是数据库，而是 SQL。

SQL 是一种面向数据声明的语言，它允许自定义数据内容而无需关心其执行过程。这也正是SQL强大之处，因为它对于数据'所见即所得'，而不是将数据看作字节码。

本工具的目的是：在文本文件和SQL之间搭建一座桥梁。

### 为什么其他Linux工具不能满足需求？
传统的Linux工具库也很酷，我也经常使用它们， 但Linux的整体理念是为任一部分搭配最好的工具。本工具为传统Linux工具集新添了 SQL 族类工具，其他工具并不会失去本来优势。
事实上，我也经常将 q 和其他Linux工具搭配使用，就如同使用管道将 awk/sed 和 grep 搭配使用一样。

另外需要注意的是,许多Linux工具就将文本看作文本，而不是数据。从这个意义上来讲，可以将 q 看作提供了 SQL 功能（如：表达式、排序、分组、聚合等）的元工具。

### 理念

本工具的设计遵从了 Linux/Unix 的传统设计原则。若你对这些设计原则感兴趣，可以阅读 [这本书](http://catb.org/~esr/writings/taoup/) ，尤其是书中 [这部分](http://catb.org/~esr/writings/taoup/html/ch01s06.html)
若你认为本工具工作方式与之背道而驰，愿洗耳恭听你的建议。

## 展望

* 主要方向：将其作为python的模块公开。 在公开之前，需要对处理标准输入流做一些内部API的完善。
* 支持分布式以提高算力。


================================================
FILE: mkdocs/docs/js/google-analytics.js
================================================
// Monitor all download links in GA

// Number of download links that were given a GA click handler (incremented by GAizeDownloadLink).
var dlCnt = 0;
// Number of table-of-contents links that were given a GA click handler (incremented by GAizeTOCLink).
var tocCnt = 0;

/**
 * Turn an anchor into a GA-tracked download link.
 *
 * The anchor's href (query string removed) is matched against URLs that
 * contain "archive/" or "releases/". Anchors that do not match are left
 * untouched. For a match, the part after that path segment is stored on the
 * anchor as `event_action`, the global download-link counter is incremented,
 * and an onclick handler is installed that reports a "perform download"
 * event and then navigates to the link after 500ms.
 *
 * @param {HTMLAnchorElement} a - the anchor to inspect and possibly instrument.
 */
function GAizeDownloadLink(a) {
    var href = a.href;
    var queryStart = href.indexOf("?");
    // Strip the query string, if any, before matching.
    var url = (queryStart != -1) ? href.substr(0, queryStart) : href;

    var match = url.match(/^http.*(archive\/|releases\/)(?<path>.*)/);
    if (!match) {
        return;
    }

    a.event_action = match.groups.path;
    console.log("Converting download link to be GA aware: " + url + " . download path is " + a.event_action);
    dlCnt = dlCnt + 1;

    a.onclick = function() {
        console.log("Sending GA event for link" + url);
        var link = this;
        gtag('event','perform download', { 'event_category': 'Downloads', 'event_label': 'Download ' + this.event_action  , 'value': 1 });
        // Delay the navigation so the event has a chance to be sent first.
        setTimeout(function() {
            location.href = link.href;
        }, 500);
        // Suppress the default navigation; the timeout above performs it.
        return false;
    };
}

/**
 * Turn a table-of-contents link into a GA-tracked navigation link.
 *
 * Increments the global TOC-link counter and installs an onclick handler.
 * On click, the handler extracts the "#fragment" part of the link, reports a
 * "navigate" event labelled with it, and navigates to the link after 250ms.
 * Links without a fragment are not reported and navigate normally.
 *
 * @param {HTMLAnchorElement} l - the TOC anchor to instrument.
 */
function GAizeTOCLink(l) {
    tocCnt = tocCnt + 1;
    l.onclick = function() {
        // Declared with `var` so these no longer leak into the global scope.
        var url_test = l.href.match(/^https?:\/\/.+(#.*)$/i);
        if (!url_test) {
            // No "#fragment" in the href: there is nothing to report, so let
            // the browser perform its default navigation instead of throwing
            // on a null match.
            return true;
        }
        var toc_name = url_test[1];
        var that = this;
        console.log("Sending GA event for toc link " + this.href);

        gtag('event','navigate', { 'event_category': 'Navigation', 'event_label': 'go to ' + toc_name, 'value': 1 });
        // Delay the navigation so the event has a chance to be sent first.
        setTimeout(function() {
            location.href = that.href;
        }, 250);
        // Suppress the default navigation; the timeout above performs it.
        return false;
    };
}

// Once the page has loaded, instrument every anchor that looks like a
// download link and every link of the table-of-contents sidebar.
window.onload = function() {
    // Declared locally; the loops previously created an implicit global `i`.
    var i;
    var anchors = document.getElementsByTagName('a');
    for (i = 0; i < anchors.length; i++) {
      GAizeDownloadLink(anchors[i]);
    }
    var toc_links = document.querySelectorAll('div.md-sidebar[data-md-component=toc] a.md-nav__link');
    for (i = 0; i < toc_links.length; i++) {
      GAizeTOCLink(toc_links[i]);
    }
    console.log("Converted " + dlCnt + " download links and " + tocCnt + " TOC links to be GA aware");
}


================================================
FILE: mkdocs/docs/stylesheets/extra.css
================================================

/* Terminal-like look (green on black) for preformatted blocks. */
div.md-content pre {
  background-color: black;
  color: #41FF00;
}

.md-typeset code pre {
  background-color: black;
  color: #41FF00;
}

/* Inline code inside paragraphs uses a dark text colour. */
.md-typeset p code {
  color: rgba(0,0,0,.87);
}

.md-typeset code.bash {
  color: #41FF00;
}

.md-typeset__scrollwrap {
  text-align: center;
}

/* Numeric form of 50%: percentage opacity values are not accepted by older browsers. */
.md-typeset .headerlink {
  opacity: 0.5;
}

article.md-content__inner.md-typeset>p {
  text-align: left;
}

/*
 * The colour channels were previously written as 0.3, 0.5 and 0.4. rgba()
 * channels are on a 0-255 scale, so those fractions rendered as (almost)
 * black where accepted and invalidated the declaration where only integers
 * are allowed. Written here as explicit black at 40% alpha, with the missing
 * semicolon added.
 */
.md-nav__link[data-md-state=blur] {
    color: rgba(0,0,0,.4);
}

.md-nav__link[data-md-state=current] {
    font-weight: 700;
}


================================================
FILE: mkdocs/generate-web-site.sh
================================================
#!/bin/bash

# Builds the documentation web site with mkdocs into ./generated-site.
#   -c  clean: remove stale files from the output directory before building
#   -s  strict: abort the build on warnings
#   -d  output directory for the generated site
mkdocs build -c -s -d ./generated-site


================================================
FILE: mkdocs/mkdocs.yml
================================================
site_name: q - Text as Data
site_url: https://harelba.github.io/q/
repo_url: https://github.com/harelba/q
edit_uri: ""
site_description: Text as Data - q is a command line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files).
site_author: Harel Ben-Attia
copyright: 'Copyright &copy; 2012-2019 Harel Ben-Attia'
# Read by theme/main.html, which uses the first entry as the tracking id.
google_analytics:
  - "UA-48316355-1"
  - "auto"
nav:
  - Home: index.md
  - 首页: index_cn.md
  - About: about.md
theme:
  name: material
  language: 'en'
  palette:
    primary: purple
    accent: amber
  # The Material theme reads the singular `font` key; the previous `fonts`
  # key was ignored (the values below match the theme defaults).
  font:
    text: 'Roboto'
    code: 'Roboto Mono'
  favicon: 'img/q-logo1.ico'
  logo: 'img/q-logo1.ico'
  # Local overrides of theme templates (see theme/main.html).
  custom_dir: 'theme'
extra:
  social:
    - type: 'github'
      link: 'https://github.com/harelba'
    - type: 'twitter'
      link: 'https://twitter.com/harelba'
    - type: 'linkedin'
      link: 'https://www.linkedin.com/in/harelba'
extra_css:
  - 'stylesheets/extra.css'
extra_javascript:
  - 'js/google-analytics.js'
markdown_extensions:
  - meta
  - toc:
      permalink: true
  - tables
  - fenced_code
  - admonition
  # - codehilite


================================================
FILE: mkdocs/requirements.txt
================================================
Click==7.0
Deprecated==1.2.7
Jinja2==2.10.3
Markdown==3.1.1
MarkupSafe==1.1.1
PyGithub==1.45
PyJWT==1.7.1
PyYAML==5.3
Pygments==2.5.2
certifi==2019.11.28
chardet==3.0.4
htmlmin==0.1.12
idna==2.8
jsmin==2.2.2
livereload==2.6.1
mkdocs-bootstrap4==0.1.2
mkdocs-bootswatch==1.0
mkdocs-git-committers-plugin==0.1.8
mkdocs-material==4.6.0
mkdocs-minify-plugin==0.2.1
mkdocs==1.0.4
pep562==1.0
pymdown-extensions==6.2.1
requests==2.22.0
six==1.14.0
tornado==6.0.3
urllib3==1.25.8
wrapt==1.11.2


================================================
FILE: mkdocs/theme/main.html
================================================
{% extends "base.html" %}

{# Overrides the theme's analytics block so that Google Analytics is loaded through gtag.js. #}
{% block analytics %}
<!-- Global site tag (gtag.js) - Google Analytics -->
{% set analytics = config.google_analytics %}
<script async src="https://www.googletagmanager.com/gtag/js?id={{ analytics[0] }}"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  /* 'config' also records the initial page view. The former gtag('send','pageview')
     call was dropped: 'send' belongs to the analytics.js ga() API and is not a gtag command. */
  gtag('config', '{{ analytics[0] }}');

  /* Register handler to log search on blur */
  document.addEventListener("DOMContentLoaded", () => {
    if (document.forms.search) {
      var query = document.forms.search.query
      query.addEventListener("blur", function() {
        if (this.value) {
          var path = document.location.pathname;
          /* Only gtag.js is loaded on this page, so the analytics.js ga() function does not
             exist and calling it raised a ReferenceError. Report the search as a virtual
             page view through gtag instead; the query is URL-encoded so that special
             characters do not corrupt the reported path. */
          gtag("config", "{{ analytics[0] }}", { "page_path": path + "?q=" + encodeURIComponent(this.value) });
        }
      })
    }
  });
</script>
{% endblock %}


================================================
FILE: prepare-benchmark-env
================================================
#!/bin/bash

# Prepares the benchmark environment: downloads and extracts the benchmark
# data if needed, then creates one pyenv virtualenv named q-benchmark-<version>
# per python version and installs ./requirements.txt into it.

set -e

# Load pyenv's shell integration so `pyenv activate` / `pyenv deactivate` work.
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"

# Expected to define the BENCHMARK_PYTHON_VERSIONS array used below.
source benchmark-config.sh

if [ ! -f ./benchmark_data.tar.gz ];
then
  echo benchmark data not found. downloading it
  # --fail makes curl exit non-zero on HTTP errors instead of saving the error
  # page as the archive. The download goes to a temporary name first so that an
  # interrupted transfer is not mistaken for a complete archive on the next run.
  curl --fail "https://s3.amazonaws.com/harelba-q-public/benchmark_data.tar.gz" -o ./benchmark_data.tar.gz.part
  mv ./benchmark_data.tar.gz.part ./benchmark_data.tar.gz
else
  echo no need to download benchmark data
fi

if [ ! -d ./_benchmark_data ];
then
  echo extracting benchmark data
  tar xvfz benchmark_data.tar.gz
  echo benchmark data is ready
else
  echo no need to extract benchmark data
fi

for ver in "${BENCHMARK_PYTHON_VERSIONS[@]}"
do
  echo installing $ver
  # -s: skip the installation if this version is already installed
  pyenv install -s "$ver"

  venv_name=q-benchmark-$ver
  echo create venv $venv_name
  # -f: do not prompt if the virtualenv already exists
  pyenv virtualenv -f "$ver" "$venv_name"
  echo activate venv $venv_name
  pyenv activate "$venv_name"
  pyenv version
  echo installing requirements $venv_name
  pip install -r ./requirements.txt
  echo deactivating $venv_name
  pyenv deactivate
done


================================================
FILE: pyoxidizer.bzl
================================================
# This file defines how PyOxidizer application building and packaging is
# performed. See PyOxidizer's documentation at
# https://pyoxidizer.readthedocs.io/en/stable/ for details of this
# configuration file format.

# Build variables read from VARS, each with a fallback used when the variable
# is not provided.
# Python version of the distribution that gets embedded into the executable.
PYTHON_VERSION = VARS.get("PYTHON_VERSION","3.8")
# Version passed to the MSI installer builder in make_msi.
Q_VERSION = VARS.get("Q_VERSION","0.0.1")

# Configuration files consist of functions which define build "targets."
# This target produces the `q` Python executable: it picks the Python
# distribution, configures how resources are packaged, points the embedded
# interpreter at the `bin.q` module, and adds the project's dependencies and
# the `bin` package as resources.
def make_exe():
    distribution = default_python_distribution(python_version=PYTHON_VERSION)

    # Resource packaging rules: resources are placed in memory, with files
    # under "Lib" relative to the executable as the fallback location;
    # loading shared libraries from memory is turned off.
    packaging_policy = distribution.make_python_packaging_policy()
    packaging_policy.set_resource_handling_mode("classify")
    packaging_policy.resources_location = "in-memory"
    packaging_policy.resources_location_fallback = "filesystem-relative:Lib"
    packaging_policy.allow_in_memory_shared_library_loading = False

    # The interpreter runs the bin.q module on startup.
    interpreter_config = distribution.make_python_interpreter_config()
    interpreter_config.run_module = "bin.q"

    q_exe = distribution.to_python_executable(
        name="q",
        packaging_policy=packaging_policy,
        config=interpreter_config,
    )

    # The result of this call is intentionally not added as a resource.
    q_exe.pip_install(["wheel"])

    # Third-party requirements first, then the project itself.
    requirement_resources = q_exe.pip_install(["-r", "requirements.txt"])
    q_exe.add_python_resources(requirement_resources)
    project_resources = q_exe.pip_install(["-e", "."])
    q_exe.add_python_resources(project_resources)

    # The `bin` package, read directly from the source tree.
    bin_package_resources = q_exe.read_package_root(
        path="./",
        packages=["bin"],
    )
    q_exe.add_python_resources(bin_package_resources)

    return q_exe

# Build target returning the embedded-resources form of the executable
# produced by make_exe (registered below as the "resources" target).
def make_embedded_resources(exe):
    return exe.to_embedded_resources()

# Build target describing the installed file layout: a manifest holding the
# generated executable in its root directory.
def make_install(exe):
    install_layout = FileManifest()
    # "." places the executable at the top of the install layout.
    install_layout.add_python_resource(".", exe)
    return install_layout

# Build target converting the Python executable into a `WiXMSIBuilder`
# Starlark type, which becomes a Windows .msi installer when it is built.
# See the full PyOxidizer docs for more.
def make_msi(exe):
    return exe.to_wix_msi_builder(
        "q",                # Simple identifier of the app.
        "q-text-as-data",   # The name of the application.
        Q_VERSION,          # The version of the application.
        "Harel Ben-Attia"   # The author/manufacturer of the application.
    )


# Dynamically enable automatic code signing.
# NOTE: every signer setup below is commented out, so this function currently
# activates no signer even when ENABLE_CODE_SIGNING is set. To enable signing,
# uncomment exactly one of the `signer = ...` alternatives together with the
# final `signer.activate()` call.
def register_code_signers():
    # You will need to run with `pyoxidizer build --var ENABLE_CODE_SIGNING 1` for
    # this if block to be evaluated.
    if not VARS.get("ENABLE_CODE_SIGNING"):
        return

    # Use a code signing certificate in a .pfx/.p12 file, prompting the
    # user for its path and password to open.
    # pfx_path = prompt_input("path to code signing certificate file")
    # pfx_password = prompt_password(
    #     "password for code signing certificate file",
    #     confirm = True
    # )
    # signer = code_signer_from_pfx_file(pfx_path, pfx_password)

    # Use a code signing certificate in the Windows certificate store, specified
    # by its SHA-1 thumbprint. (This allows you to use YubiKeys and other
    # hardware tokens if they speak to the Windows certificate APIs.)
    # sha1_thumbprint = prompt_input(
    #     "SHA-1 thumbprint of code signing certificate in Windows store"
    # )
    # signer = code_signer_from_windows_store_sha1_thumbprint(sha1_thumbprint)

    # Choose a code signing certificate automatically from the Windows
    # certificate store.
    # signer = code_signer_from_windows_store_auto()

    # Activate your signer so it gets called automatically.
    # signer.activate()


# Call our function to set up automatic code signers.
register_code_signers()

# Tell PyOxidizer about the build targets defined above.
# "exe" is the base target; the other three declare it as a dependency.
register_target("exe", make_exe)
# Flagged as the default build-script target (default_build_script=True).
register_target("resources", make_embedded_resources, depends=["exe"], default_build_script=True)
# Flagged as the default target (default=True).
register_target("install", make_install, depends=["exe"], default=True)
register_target("msi_installer", make_msi, depends=["exe"])

# Resolve whatever targets the invoker of this configuration file is requesting
# be resolved.
resolve_targets()


================================================
FILE: pytest.ini
================================================
[pytest]
# Custom markers. The run-tests.sh and run-coverage.sh scripts deselect
# "benchmark" tests (-m "not benchmark"); run-benchmark selects only them.
markers =
  benchmark: Benchmark tests


================================================
FILE: requirements.txt
================================================
six==1.11.0
flake8==3.6.0
setuptools<45.0.0


================================================
FILE: run-benchmark
================================================
#!/bin/bash

# Usage: ./run-benchmark <benchmark-id> [<q-executable> [<pytest-options>...]]
set -e

# Prints the absolute path of a file given by a (possibly relative) filename.
#   $1 : relative filename
# The directory part is resolved by cd-ing into it, so it must exist; the
# file itself does not have to.
get_abs_filename() {
  local target="$1"
  printf '%s/%s\n' "$(cd "$(dirname "$target")" && pwd)" "$(basename "$target")"
}

# Load pyenv's shell integration so `pyenv activate` / `pyenv deactivate` work below.
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"

# First argument (mandatory): the benchmark id, used as the name of the
# results sub-folder.
if [ "x$1" == "x" ];
then
  echo Benchmark id must be provided as a parameter
  exit 1
fi
Q_BENCHMARK_ID=$1
shift

# Second argument (optional): path to a q executable to benchmark. When it is
# missing or empty, the results are filed under the current git commit instead.
if [ "x$1" == "x" ];
then
  EFFECTIVE_Q_EXECUTABLE="source-files-$(git rev-parse HEAD)"
  # An explicitly empty executable argument is consumed here so that it is not
  # forwarded to pytest as an (empty) option.
  if [ $# -gt 0 ]
  then
    shift
  fi
else
  # Expansions are quoted so that paths containing whitespace survive intact.
  ABS_Q_EXECUTABLE="$(get_abs_filename "$1")"
  export Q_EXECUTABLE="$ABS_Q_EXECUTABLE"
  if [ ! -f "$ABS_Q_EXECUTABLE" ]
  then
    echo "q executable must exist ($ABS_Q_EXECUTABLE)"
    exit 1
  fi
  # Flatten the absolute path into a single folder name by replacing every "/" with "__".
  EFFECTIVE_Q_EXECUTABLE="${ABS_Q_EXECUTABLE//\//__}"
  shift
fi

echo "Q executable to use is $EFFECTIVE_Q_EXECUTABLE"

# All remaining arguments are pytest options. PYTEST_OPTIONS keeps the
# flattened string form (used for display and by the section after `exit 0`),
# while PYTEST_ARGS preserves each argument as-is, so options containing
# spaces (e.g. -k "a or b") reach pytest unsplit.
PYTEST_OPTIONS="$@"
PYTEST_ARGS=("$@")
echo "pytest options are $PYTEST_OPTIONS"

mkdir -p ./test/benchmark-results

# Must be provided to the benchmark code so it knows where to write the results to
export Q_BENCHMARK_RESULTS_FOLDER="./test/benchmark-results/${EFFECTIVE_Q_EXECUTABLE}/${Q_BENCHMARK_ID}/"
echo Benchmark results folder is $Q_BENCHMARK_RESULTS_FOLDER
mkdir -p "$Q_BENCHMARK_RESULTS_FOLDER"

# Expected to define the BENCHMARK_PYTHON_VERSIONS array used below.
source benchmark-config.sh
# Last entry of BENCHMARK_PYTHON_VERSIONS.
LATEST_PYTHON_VERSION=${BENCHMARK_PYTHON_VERSIONS[${#BENCHMARK_PYTHON_VERSIONS[@]}-1]}

ALL_FILES=()

for ver in "${BENCHMARK_PYTHON_VERSIONS[@]}"
do
  venv_name=q-benchmark-$ver
  RESULT_FILE="${Q_BENCHMARK_RESULTS_FOLDER}/$venv_name.benchmark-results"

  # Checked before activating the virtualenv: skipping used to happen after
  # `pyenv activate`, leaving the environment activated without a matching
  # `pyenv deactivate`.
  if [[ -f "$RESULT_FILE" ]]
  then
    echo "Results files for version $ver already exists skipping benchmark for this version"
    continue
  fi

  echo activating $venv_name
  pyenv activate "$venv_name"
  echo "==== testing inside $venv_name ==="

  export Q_BENCHMARK_NAME=${venv_name}
  export Q_BENCHMARK_ADDITIONAL_PARAMS="-C read"

  # Two runs per python version: one against the data folder that has qsql
  # caches, one against the plain data folder.
  Q_BENCHMARK_NAME=${venv_name}-with-caching Q_BENCHMARK_DATA_DIR=./_benchmark_data_with_qsql_caches pytest -m benchmark -k test_q_matrix -v -s "${PYTEST_ARGS[@]}"
  Q_BENCHMARK_NAME=${venv_name} Q_BENCHMARK_DATA_DIR=./_benchmark_data pytest -m benchmark -k test_q_matrix -v -s "${PYTEST_ARGS[@]}"

  echo "==== Done. Results are in $RESULT_FILE"
  ALL_FILES[${#ALL_FILES[@]}]="$RESULT_FILE"
  echo "Deactivating"
  pyenv deactivate
done

# The script ends here; nothing after this line is executed.
exit 0

# NOTE: this whole section is currently unreachable, because the script runs
# an unconditional `exit 0` right before it. It benchmarks textql and octosql
# inside the virtualenv of the latest python version and then pastes all
# result files side by side into a summary file.
pyenv activate q-benchmark-${LATEST_PYTHON_VERSION}
echo "==== testing textql ==="
if [[ -f `ls $Q_BENCHMARK_RESULTS_FOLDER/textql*.benchmark-results` ]]
then
	echo "Results files for textql already exist. Skipping benchmark for textql"
else
	pytest -m benchmark -k test_textql_matrix -v -s $PYTEST_OPTIONS
	RESULT_FILE="textql*.benchmark-results"
	ALL_FILES[${#ALL_FILES[@]}]="${Q_BENCHMARK_RESULTS_FOLDER}/$RESULT_FILE"
	echo "Done. Results are in textql.benchmark-results"
fi

echo "==== testing octosql ==="
if [[ -f $Q_BENCHMARK_RESULTS_FOLDER/octosql.benchmark-results ]]
then
	echo "Results files for octosql aready exist. Skipping benchmark for octosql"
else
	pytest -m benchmark -k test_octosql_matrix -v -s $PYTEST_OPTIONS
	RESULT_FILE="octosql*.benchmark-results"
	ALL_FILES[${#ALL_FILES[@]}]="${Q_BENCHMARK_RESULTS_FOLDER}/$RESULT_FILE"
	echo "Done. Results are in octosql.benchmark-results"
fi

summary_file="$Q_BENCHMARK_RESULTS_FOLDER/summary.benchmark-results"

# Start from a fresh summary file.
rm -vf $summary_file

# ALL_FILES is expanded unquoted on purpose: the textql/octosql entries hold
# glob patterns that the shell has to expand here.
paste ${ALL_FILES[*]} > $summary_file
echo "Done. final results file is $summary_file"
pyenv deactivate


================================================
FILE: run-coverage.sh
================================================
#!/bin/bash

# Runs the non-benchmark tests under coverage and produces an HTML report.
# Any arguments given to this script are forwarded to pytest.
set -e

# Drop report files left over from a previous run.
rm -f -v ./htmlcov/*

pytest -m "not benchmark" --cov --cov-report=html "$@"

# Kills background job %1. Nothing in this script calls it at the moment; it
# belongs to the disabled report-serving step below.
cleanup() {
  kill %1
}

# TODO Fix

# python -m http.server 8000 
# open http://localhost:8000/htmlcov/


================================================
FILE: run-tests.sh
================================================
#!/bin/bash

# Runs every test except those marked "benchmark"; any arguments given to
# this script are forwarded to pytest.
pytest -m 'not benchmark' "$@"


================================================
FILE: setup.py
================================================
#!/usr/bin/env python

from setuptools import setup
import setuptools

q_version = '3.1.6'

# The project README is reused verbatim as the long description in the package metadata.
with open("README.markdown", mode="r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Package metadata and build configuration, grouped by concern.
setup(
    # Identity
    name='q',
    version=q_version,
    url='https://github.com/harelba/q',
    # NOTE(review): this looks like a file name rather than a license name - confirm intent.
    license='LICENSE',
    author='Harel Ben-Attia',
    author_email='harelba@gmail.com',
    # Descriptions
    description="Run SQL directly on CSV or TSV files",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # Runtime dependencies
    install_requires=['six==1.11.0'],
    # Code layout: packages are looked up under the bin/ directory
    package_dir={"": "bin"},
    packages=setuptools.find_packages(where="bin"),
    # Installs a `q` console command bound to bin.q:run_standalone
    entry_points={'console_scripts': ['q = bin.q:run_standalone']},
)


================================================
FILE: test/BENCHMARK.md
================================================


NOTE: *Please don't use or publish this benchmark data yet. See below for details*

# Update
q now provides inherent automatic caching capabilities, writing the CSV/TSV file to a `.qsql` file that sits beside the original file. After the cache exists (created as part of an initial query on a file), q knows to use it behind the scenes without changing the query itself, speeding up performance significantly.

The following table shows the impact of using caching in q:

|    Rows   | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement |
|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:|
| 5,000,000 |   100   |   4.8GB   |    4 minutes, 47 seconds   |       1.92 seconds      |        x149       |
| 1,000,000 |   100   |   983MB   |        50.9 seconds        |      0.461 seconds      |        x110       |
| 1,000,000 |    50   |   477MB   |        27.1 seconds        |      0.272 seconds      |        x99        |
|  100,000  |   100   |    99MB   |         5.2 seconds        |      0.141 seconds      |        x36        |
|  100,000  |    50   |    48MB   |         2.7 seconds        |      0.105 seconds      |        x25        |


Effectively, `.qsql` files are just standard sqlite3 files, with an additional metadata table that is used for detecting changes in the original delimited file. This means that any tool that can read sqlite3 files can use these files directly. The tradeoff is of course the additional disk usage that the cache files take.

A good side effect of this addition is that q now knows how to directly query multi-file sqlite3 databases. This means that the user can query any sqlite3 database file, or the `.qsql` file itself, even when the original file doesn't exist anymore. For example:

```bash
q "select a.*,b.* from my_file.csv.qsql a left join some-sqlite3-database:::some_table_name b on (a.id = b.id)"
```

NOTE: In the current version, caching is not enabled by default - Use `-C readwrite` to enable reading+writing cache files, or `-C read` to just read any existing cache files. A `~/.qrc` file can be added in order to make these options the default if you want.

The benchmark results below reflect the performance without the caching, i.e. directly reading the delimited files, parsing them and performing the query.

I'll update benchmark results later on to provide cached results as well.

# Overview
This is just a preliminary benchmark, originally created for validating performance optimizations and suggestions from users, and analyzing q's move to python3. After writing it, I thought it might be interesting to test its speed against textql and octosql as well.

The results I'm getting are somewhat surprising, to the point of me questioning them a bit, so it would be great to validate them further before finalizing the benchmark results.

The most surprising results are as follows:
* python3 vs python2 - A huge improvement (for large files, execution times with python 3 are around 40% of the times for python 2)
* python3 vs textql (written in golang) - Seems that textql becomes slower than the python3 q version as the data size grows (both rows and columns)

I would love to validate these results by having other people run the benchmark as well and send me their results. 

If you're interested, follow the instructions and run the benchmark on your machine. After the benchmark is finished, send me the final results file, along with some details about your hardware, and I'll add it to the spreadsheet. <harelba@gmail.com>

I've tried to make running the benchmark as seamless as possible, but there obviously might be errors/issues. Please contact me if you encounter any issue, or just open a ticket.

# Benchmark
This is an initial version of the benchmark, along with some results. The following is compared:
* q running on multiple python versions
* textql 2.0.3
* octosql v0.3.0

The specific python versions which are being tested are specified in `benchmark-config.sh`.

This is by no means a scientific benchmark, and it only focuses on the data loading time which is the only significant factor for comparison (e.g. the query itself is a very simple count query). Also, it does not try to provide any usability comparison between q and textql/octosql, an interesting topic on its own.

## Methodology
The idea was to compare the time sensitivity of row and column count. 

* Row counts: 1,10,100,1000,10000,100000,1000000
* Column counts: 1,5,10,20,50,100
* Iterations for each combination: 10

File sizes:
* 1M rows by 100 columns - 976MB (~1GB) - Largest file
* 1M rows by 50 columns - 477MB

The benchmark executes simple `select count(*) from <file>` queries for each combination, calculating the mean and stddev of each set of iterations. The stddev is used in order to measure the validity of the results.

The graphs below only compare the means of the results, the standard deviations are written into the google sheet itself, and can be viewed there if needed.

Instructions on how to run the benchmark are at the bottom section of this document, after the results section.

## Hardware
OSX Catalina on a 15" Macbook Pro from Mid 2015, with 16GB of RAM, and an internal Flash Drive of 256GB.

## Results
(Results are automatically updated from the baseline tab in the google spreadsheet).

Detailed results below.

Summary:
* All python 3 versions (3.6/3.7/3.8) provide similar results across all scales.
* python 3.x provides significantly better results than python2. Improvement grows as the file size grows (20% improvement for small files, up to ~70% improvement for the largest file)
* textql seems to provide faster results than q (py3) for smaller files, up to around 30MB of data. As the size grows further, it becomes slower than q, up to 80% (74 seconds vs 41 seconds) for the largest file
* The larger the files, textql becomes slower than q-py3 (up to 80% more time than q for the largest file)
* octosql is significantly slower than both q and textql, even for small files with a low number of rows and columns

### Data for 1M rows

#### Run time durations for 1M rows and different column counts:
|   rows  	| columns 	| File Size 	| python 2.7 	| python 3.6 	| python 3.7 	| python 3.8 	| textql 	| octosql 	|
|:-------:	|:-------:	|:---------:	|:----------:	|:----------:	|:----------:	|:----------:	|:------:	|:-------:	|
| 1000000 	|    1    	|    17M    	|    5.15    	|    4.24    	|    4.08    	|    3.98    	|  2.90  	|  49.95  	|
| 1000000 	|    5    	|    37M    	|    10.68   	|    5.37    	|    5.26    	|    5.14    	|  5.88  	|  54.69  	|
| 1000000 	|    10   	|    89M    	|    17.56   	|    7.25    	|    7.15    	|    7.01    	|  9.69  	|  65.32  	|
| 1000000 	|    20   	|    192M   	|    30.28   	|    10.96   	|    10.78   	|    10.64   	|  17.34 	|  83.94  	|
| 1000000 	|    50   	|    477M   	|    71.56   	|    21.98   	|    21.59   	|    21.70   	|  38.57 	|  158.26 	|
| 1000000 	|   100   	|    986M   	|   131.86   	|    41.71   	|    40.82   	|    41.02   	|  74.62 	|  289.58 	|

#### Comparison between python 3.x and python 2 run times (1M rows):
(>100% is slower than q-py2, <100% is faster than q-py2)

|   rows    | columns 	| file size 	| q-py2 runtime 	| q-py3.6 vs q-py2 runtime 	| q-py3.7 vs q-py2 runtime 	| q-py3.8 vs q-py2 runtime 	|
|:-------:	|:-------:	|:---------:	|:-------------:	|:------------------------:	|:------------------------:	|:------------------------:	|
| 1000000 	|    1    	|    17M    	|    100.00%    	|          82.34%          	|          79.34%          	|          77.36%          	|
| 1000000 	|    5    	|    37M    	|    100.00%    	|          50.25%          	|          49.22%          	|          48.08%          	|
| 1000000 	|    10   	|    89M    	|    100.00%    	|          41.30%          	|          40.69%          	|          39.93%          	|
| 1000000 	|    20   	|    192M   	|    100.00%    	|          36.18%          	|          35.59%          	|          35.14%          	|
| 1000000 	|    50   	|    477M   	|    100.00%    	|          30.71%          	|          30.17%          	|          30.32%          	|
| 1000000 	|   100   	|    986M   	|    100.00%    	|          31.63%          	|          30.96%          	|          31.11%          	|

#### textql and octosql comparison against q-py3 run time (1M rows):
(>100% is slower than q-py3, <100% is faster than q-py3)

|   rows  	| columns 	| file size 	| avg q-py3 runtime 	| textql vs q-py3 runtime 	| octosql vs q-py3 runtime 	|
|:-------:	|:-------:	|:---------:	|:-----------------:	|:-----------------------:	|:------------------------:	|
| 1000000 	|    1    	|    17M    	|      100.00%      	|          70.67%         	|         1217.76%         	|
| 1000000 	|    5    	|    37M    	|      100.00%      	|         111.86%         	|         1040.70%         	|
| 1000000 	|    10   	|    89M    	|      100.00%      	|         135.80%         	|          915.28%         	|
| 1000000 	|    20   	|    192M   	|      100.00%      	|         160.67%         	|          777.92%         	|
| 1000000 	|    50   	|    477M   	|      100.00%      	|         177.26%         	|          727.40%         	|
| 1000000 	|   100   	|    986M   	|      100.00%      	|         181.19%         	|          703.15%         	|

### Sensitivity to column count 
Based on the largest file size of 1,000,000 rows.

![Sensitivity to column count](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=1585602598&format=image)

### Sensitivity to line count (per column count)

#### 1 Column Table
![1 column table](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=1119350798&format=image)

#### 5 Column Table
![5 column table](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=599223098&format=image)

#### 10 Column Table
![10 column table](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=82695414&format=image)

#### 20 Column Table
![20 column table](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=1573199483&format=image)

#### 50 Column Table
![50 column table](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=448568670&format=image)

#### 100 Column Table
![100 column table](https://docs.google.com/spreadsheets/d/e/2PACX-1vQy9Zm4I322Tdf5uoiFFJx6Oi3Z4AMq7He3fUUtsEQVQIdTGfWgjxFD6k8PAy9wBjvFkqaG26oBgNTP/pubchart?oid=2101488258&format=image)

## Running the benchmark
Please note that the initial run generates large files, so you'd need more than 3GB of free space available. All the generated files reside in the `_benchmark_data/` folder.

Part of the preparation flow will download the benchmark data as needed.

### Preparations
* Prerequisites:
  * pyenv installed
  * pyenv-virtualenv installed
  * [`textql`](https://github.com/dinedal/textql#install)
  * [`octosql`](https://github.com/cube2222/octosql#installation)

Run `./prepare-benchmark-env`

### Execution
Run `./run-benchmark <benchmark-id>`.

Benchmark output files will be written to `./benchmark-results/<q-executable>/<benchmark-id>/`.

* `benchmark-id` is the id you wanna give the benchmark.
* `q-executable` is the name of the q executable being used for the benchmark. If none has been provided through Q_EXECUTABLE, then the value will be the last commit hash. Note that there is no checking of whether the working tree is clean. 

The summary of the benchmark will be written to `./benchmark-results/<benchmark-id>/summary.benchmark-results`.

By default, the benchmark will use the source python files inside the project. If you want to run it on one of the standalone binary executables, then set Q_EXECUTABLE to the full path of the q binary.

For anyone helping with running the benchmark, don't use this parameter for now, just test against a clean checkout of the code using `./run-benchmark <benchmark-id>`.

## Benchmark Development info
### Running against the standalone binary
* `./run-benchmark` can accept a second parameter with the q executable. If it gets this parameter, it will use this path for running q. This provides a way to test the standalone q binaries in the new packaging format. When this parameter does not exist, the benchmark is executed directly from the source code.

### Updating the benchmark markdown document file
The results should reside in the following [google sheet](https://docs.google.com/spreadsheets/d/1Ljr8YIJwUQ5F4wr6ATga5Aajpu1CvQp1pe52KGrLkbY/edit?usp=sharing). 

Add a new tab to the google sheet, and paste the content of `summary.benchmark-results` to the new sheet.


================================================
FILE: test/__init__.py
================================================


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/octosql_v0.3.0.benchmark-results
================================================
lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	1	0.582091641426	0.0235290239617
10	1	0.596219730377	0.0320124029461
100	1	0.575977492332	0.0199296245316
1000	1	0.56785056591	0.00846389017466
10000	1	1.1466334343	0.00760108698846
100000	1	5.49565172195	0.131791932977
1000000	1	49.9513648033	0.443430523063
lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	5	0.582160949707	0.0274409391571
10	5	0.57046456337	0.0199413000359
100	5	0.585747480392	0.0372543971623
1000	5	0.572268772125	0.00384300349763
10000	5	1.15530762672	0.0117990775856
100000	5	6.10629923344	0.146711842919
1000000	5	54.6851765394	0.315486399525
lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	10	0.586222410202	0.0232479065914
10	10	0.59000480175	0.0186508192447
100	10	0.581873703003	0.0331332482772
1000	10	0.569027900696	0.0103675493106
10000	10	1.40067322254	0.00583352224401
100000	10	7.30705575943	0.0165839217599
1000000	10	65.3242264032	0.512552576414
lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	20	0.571048212051	0.0166919396871
10	20	0.594776701927	0.0368900941023
100	20	0.561370825768	0.00907051791451
1000	20	0.577527880669	0.00983965108957
10000	20	1.90710241795	0.00757011452155
100000	20	9.8267291069	0.127844155326
1000000	20	83.9448960066	0.46121344046
lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	50	0.572030115128	0.0253648479103
10	50	0.56993534565	0.0230474303306
100	50	0.563336873055	0.00964411866903
1000	50	0.826378440857	0.00941629472813
10000	50	3.27872717381	0.126592845956
100000	50	17.890055728	0.116794666005
1000000	50	158.262442636	0.826290454446
lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	100	0.569358110428	0.0279801762531
10	100	0.580981063843	0.0272341107532
100	100	0.559471726418	0.00668155858429
1000	100	1.08161640167	0.00698594638512
10000	100	5.67823712826	0.0123398407167
100000	100	32.2797194242	0.315508270241
1000000	100	289.582628798	0.929455236817


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/q-benchmark-2.7.18.benchmark-results
================================================
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev
1	1	0.106449890137	0.002010027753
10	1	0.106737875938	0.00224112203891
100	1	0.107839012146	0.00102954061006
1000	1	0.113026666641	0.00147361890226
10000	1	0.160376381874	0.00569766179806
100000	1	0.608236479759	0.00604026519608
1000000	1	5.14807910919	0.0584474028762
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev
1	5	0.106719517708	0.00236752032369
10	5	0.107823801041	0.00238873169438
100	5	0.109785079956	0.0013047675259
1000	5	0.120395207405	0.00207224422629
10000	5	0.21783041954	0.00522254475716
100000	5	1.17115747929	0.0221394865225
1000000	5	10.6830974817	0.339822977934
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev
1	10	0.104981088638	0.00166552032929
10	10	0.108320140839	0.00204034349199
100	10	0.112528729439	0.00168376477305
1000	10	0.13019015789	0.00253773120965
10000	10	0.284891676903	0.00384009140782
100000	10	1.84725661278	0.00860738744089
1000000	10	17.5610994339	0.228322442172
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev
1	20	0.106477689743	0.00254429925697
10	20	0.108580899239	0.00173704653824
100	20	0.118750286102	0.00247623639866
1000	20	0.146431708336	0.00249685551944
10000	20	0.419492387772	0.00248210434668
100000	20	3.15847921371	0.0550301268026
1000000	20	30.279082489	0.124978814506
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev
1	50	0.105411934853	0.00171651054128
10	50	0.109102797508	0.00111620290512
100	50	0.135682177544	0.00196166766665
1000	50	0.198261427879	0.00396172489054
10000	50	0.821499919891	0.0111642692132
100000	50	7.05980975628	0.121182371277
1000000	50	71.5645889759	5.02009516291
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev
1	100	0.10662381649	0.00193146624495
10	100	0.110662698746	0.00171461379583
100	100	0.163547992706	0.00166570196628
1000	100	0.280023741722	0.00337543024145
10000	100	1.46053376198	0.0221691284465
100000	100	13.2369835854	0.309375896258
1000000	100	131.864977288	1.22415449691


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/q-benchmark-3.6.4.benchmark-results
================================================
lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev
1	1	0.10342762470245362	0.0017673875851759295
10	1	0.10239293575286865	0.0012505611685910795
100	1	0.10317318439483643	0.0010581783881541751
1000	1	0.10687050819396973	0.0014050135772919004
10000	1	0.1447664737701416	0.001841256227287192
100000	1	0.5162809371948243	0.006962985088492867
1000000	1	4.238853335380554	0.04834401143632507
lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev
1	5	0.10211825370788574	0.0022568191323651568
10	5	0.1025341272354126	0.0016446470901070106
100	5	0.1053577184677124	0.0015298114223855884
1000	5	0.10980842113494874	0.002536098780902228
10000	5	0.1590113162994385	0.003123074098301634
100000	5	0.6348223447799682	0.0082691507829872
1000000	5	5.368562030792236	0.11628913334105236
lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev
1	10	0.10251858234405517	0.0015963869535345293
10	10	0.10278875827789306	0.0009920577082124496
100	10	0.10715732574462891	0.002033320000941064
1000	10	0.11389360427856446	0.0023603847702423973
10000	10	0.17806434631347656	0.001114054252191835
100000	10	0.8252989768981933	0.0037080843359275904
1000000	10	7.252838873863221	0.029052130546213153
lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev
1	20	0.10367965698242188	0.003661761341842434
10	20	0.10489590167999267	0.001977141196109372
100	20	0.11108210086822509	0.0014801173497056886
1000	20	0.12110791206359864	0.001648524669420912
10000	20	0.2178968906402588	0.0019298316207276716
100000	20	1.1962245225906372	0.010541407803235559
1000000	20	10.956057572364807	0.12677108174061705
lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev
1	50	0.10458300113677979	0.0016367630302744722
10	50	0.10616152286529541	0.002345135740908088
100	50	0.12375867366790771	0.00238414904864133
1000	50	0.14462883472442628	0.0022428030896492978
10000	50	0.34488487243652344	0.004867441221052092
100000	50	2.3394312858581543	0.02263239858944125
1000000	50	21.979821610450745	0.09080404939303836
lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev
1	100	0.10372309684753418	0.0010299126833031144
10	100	0.10784556865692138	0.0016557634029464607
100	100	0.14526791572570802	0.0028194506905186724
1000	100	0.18315494060516357	0.0023585311962114673
10000	100	0.5586131334304809	0.004808492789681402
100000	100	4.287398314476013	0.00957500108409644
1000000	100	41.706851434707644	0.4161526076289425


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/q-benchmark-3.7.9.benchmark-results
================================================
lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev
1	1	0.08099310398101807	0.001417385651688644
10	1	0.0822291374206543	0.0014809900020001858
100	1	0.08169686794281006	0.002108157069167563
1000	1	0.08690853118896484	0.0012595326919263487
10000	1	0.12215542793273926	0.0020152625320395434
100000	1	0.4825761795043945	0.0050418000028856335
1000000	1	4.084399747848511	0.027731958079814215
lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev
1	5	0.0817826271057129	0.002665533758836163
10	5	0.08261749744415284	0.0019205430658525572
100	5	0.08472237586975098	0.002571239449841039
1000	5	0.08973510265350342	0.002323797583077552
10000	5	0.13746986389160157	0.001964971666036654
100000	5	0.60649254322052	0.007131635266871318
1000000	5	5.2585612535476685	0.05661789407928516
lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev
1	10	0.08112843036651611	0.002251300165899426
10	10	0.08175232410430908	0.0014557171018568637
100	10	0.08572309017181397	0.0019643550214810675
1000	10	0.09268453121185302	0.001816414236580489
10000	10	0.15538835525512695	0.0024978076091814994
100000	10	0.7879442930221557	0.009412516078916211
1000000	10	7.146207928657532	0.06659760176757985
lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev
1	20	0.08142082691192627	0.001304584466639188
10	20	0.08197519779205323	0.0014842098503865223
100	20	0.08949971199035645	0.0009937446141285785
1000	20	0.09955930709838867	0.0013978961740806384
10000	20	0.1966566801071167	0.0028489273218240147
100000	20	1.1518636226654053	0.006410720031542237
1000000	20	10.776052689552307	0.04739925571001746
lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev
1	50	0.08237688541412354	0.0016494314799953837
10	50	0.08519520759582519	0.002610550182895596
100	50	0.10423583984375	0.0018808335751867933
1000	50	0.12195603847503662	0.0023611894043373983
10000	50	0.3163540124893188	0.002761333651520998
100000	50	2.237372374534607	0.009955353920396077
1000000	50	21.59097549915314	0.081188190530421
lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev
1	100	0.08336784839630126	0.0013840724401561887
10	100	0.0864112138748169	0.0017946939354350697
100	100	0.12199611663818359	0.0013003743156634682
1000	100	0.15871686935424806	0.0035993681064501234
10000	100	0.5243751525878906	0.004370273273595629
100000	100	4.175828623771667	0.016127303710583043
1000000	100	40.82292411327362	0.12328165162380703


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/q-benchmark-3.8.5.benchmark-results
================================================
lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev
1	1	0.10138180255889892	0.0017947074090971444
10	1	0.10056869983673096	0.003442371291904885
100	1	0.10126984119415283	0.0016392348107127808
1000	1	0.10484635829925537	0.0019743937339163262
10000	1	0.1400548219680786	0.0024523366133394117
100000	1	0.4901275157928467	0.003970374711691596
1000000	1	3.982502889633179	0.045292138461945054
lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev
1	5	0.09946837425231933	0.0018876161478998787
10	5	0.099178147315979	0.0014194733014858227
100	5	0.10171806812286377	0.0017580984705406846
1000	5	0.10602672100067138	0.002000261880840017
10000	5	0.15207929611206056	0.0015802680033212048
100000	5	0.609218978881836	0.006150144273259608
1000000	5	5.13688440322876	0.03649575898109647
lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev
1	10	0.09925477504730225	0.002168389758635997
10	10	0.09943633079528809	0.0016154501074880502
100	10	0.10376312732696533	0.0017275485891005433
1000	10	0.11087138652801513	0.0016934328033239559
10000	10	0.17246220111846924	0.0023824485659318527
100000	10	0.7999232530593872	0.003442975393506892
1000000	10	7.012071299552917	0.059217904448851263
lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev
1	20	0.10027089118957519	0.0020291529595204906
10	20	0.10038816928863525	0.001957086760826999
100	20	0.10723590850830078	0.0013833918448622436
1000	20	0.11735000610351562	0.0020318895390750882
10000	20	0.21264209747314453	0.00482341642419078
100000	20	1.1567201137542724	0.002987096441878969
1000000	20	10.640758633613586	0.06116581724028616
lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev
1	50	0.10066506862640381	0.002051307639276982
10	50	0.10588631629943848	0.0035835389655972105
100	50	0.11841504573822022	0.001608174845404568
1000	50	0.14032282829284667	0.002640027148889162
10000	50	0.33160474300384524	0.0027796660009712947
100000	50	2.258401036262512	0.011041280982383895
1000000	50	21.70080256462097	0.15897944629180621
lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev
1	100	0.10147004127502442	0.0021285682695135768
10	100	0.10471885204315186	0.001248479289219899
100	100	0.13894760608673096	0.002307980025026551
1000	100	0.17586205005645753	0.0023822296091426
10000	100	0.5414002418518067	0.0036291866664635458
100000	100	4.222555088996887	0.08562968951916528
1000000	100	41.021552324295044	0.16033566363076862


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/summary.benchmark-results
================================================
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev	lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev	lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev	lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev	lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev	lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	1	0.106449890137	0.002010027753	1	1	0.10342762470245362	0.0017673875851759295	1	1	0.08099310398101807	0.001417385651688644	1	1	0.10138180255889892	0.0017947074090971444	1	1	0.0196103572845	0.00207355214257	1	1	0.582091641426	0.0235290239617
10	1	0.106737875938	0.00224112203891	10	1	0.10239293575286865	0.0012505611685910795	10	1	0.0822291374206543	0.0014809900020001858	10	1	0.10056869983673096	0.003442371291904885	10	1	0.0186784029007	0.000970810220668	10	1	0.596219730377	0.0320124029461
100	1	0.107839012146	0.00102954061006	100	1	0.10317318439483643	0.0010581783881541751	100	1	0.08169686794281006	0.002108157069167563	100	1	0.10126984119415283	0.0016392348107127808	100	1	0.019472026825	0.00181951524514	100	1	0.575977492332	0.0199296245316
1000	1	0.113026666641	0.00147361890226	1000	1	0.10687050819396973	0.0014050135772919004	1000	1	0.08690853118896484	0.0012595326919263487	1000	1	0.10484635829925537	0.0019743937339163262	1000	1	0.022180891037	0.00116649968967	1000	1	0.56785056591	0.00846389017466
10000	1	0.160376381874	0.00569766179806	10000	1	0.1447664737701416	0.001841256227287192	10000	1	0.12215542793273926	0.0020152625320395434	10000	1	0.1400548219680786	0.0024523366133394117	10000	1	0.051066827774	0.0018168767618	10000	1	1.1466334343	0.00760108698846
100000	1	0.608236479759	0.00604026519608	100000	1	0.5162809371948243	0.006962985088492867	100000	1	0.4825761795043945	0.0050418000028856335	100000	1	0.4901275157928467	0.003970374711691596	100000	1	0.307463979721	0.00246268029188	100000	1	5.49565172195	0.131791932977
1000000	1	5.14807910919	0.0584474028762	1000000	1	4.238853335380554	0.04834401143632507	1000000	1	4.084399747848511	0.027731958079814215	1000000	1	3.982502889633179	0.045292138461945054	1000000	1	2.89862303734	0.022182722976	1000000	1	49.9513648033	0.443430523063
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev	lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev	lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev	lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev	lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev	lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	5	0.106719517708	0.00236752032369	1	5	0.10211825370788574	0.0022568191323651568	1	5	0.0817826271057129	0.002665533758836163	1	5	0.09946837425231933	0.0018876161478998787	1	5	0.0195286750793	0.0017840569109	1	5	0.582160949707	0.0274409391571
10	5	0.107823801041	0.00238873169438	10	5	0.1025341272354126	0.0016446470901070106	10	5	0.08261749744415284	0.0019205430658525572	10	5	0.099178147315979	0.0014194733014858227	10	5	0.0183676958084	0.000925251595491	10	5	0.57046456337	0.0199413000359
100	5	0.109785079956	0.0013047675259	100	5	0.1053577184677124	0.0015298114223855884	100	5	0.08472237586975098	0.002571239449841039	100	5	0.10171806812286377	0.0017580984705406846	100	5	0.0199447393417	0.000907007099218	100	5	0.585747480392	0.0372543971623
1000	5	0.120395207405	0.00207224422629	1000	5	0.10980842113494874	0.002536098780902228	1000	5	0.08973510265350342	0.002323797583077552	1000	5	0.10602672100067138	0.002000261880840017	1000	5	0.0263328790665	0.00165486505938	1000	5	0.572268772125	0.00384300349763
10000	5	0.21783041954	0.00522254475716	10000	5	0.1590113162994385	0.003123074098301634	10000	5	0.13746986389160157	0.001964971666036654	10000	5	0.15207929611206056	0.0015802680033212048	10000	5	0.0826982736588	0.00152451583229	10000	5	1.15530762672	0.0117990775856
100000	5	1.17115747929	0.0221394865225	100000	5	0.6348223447799682	0.0082691507829872	100000	5	0.60649254322052	0.007131635266871318	100000	5	0.609218978881836	0.006150144273259608	100000	5	0.60660867691	0.00395761320274	100000	5	6.10629923344	0.146711842919
1000000	5	10.6830974817	0.339822977934	1000000	5	5.368562030792236	0.11628913334105236	1000000	5	5.2585612535476685	0.05661789407928516	1000000	5	5.13688440322876	0.03649575898109647	1000000	5	5.87811236382	0.0304332294491	1000000	5	54.6851765394	0.315486399525
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev	lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev	lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev	lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev	lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev	lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	10	0.104981088638	0.00166552032929	1	10	0.10251858234405517	0.0015963869535345293	1	10	0.08112843036651611	0.002251300165899426	1	10	0.09925477504730225	0.002168389758635997	1	10	0.0191783189774	0.00107718516178	1	10	0.586222410202	0.0232479065914
10	10	0.108320140839	0.00204034349199	10	10	0.10278875827789306	0.0009920577082124496	10	10	0.08175232410430908	0.0014557171018568637	10	10	0.09943633079528809	0.0016154501074880502	10	10	0.0185215950012	0.000840353961363	10	10	0.59000480175	0.0186508192447
100	10	0.112528729439	0.00168376477305	100	10	0.10715732574462891	0.002033320000941064	100	10	0.08572309017181397	0.0019643550214810675	100	10	0.10376312732696533	0.0017275485891005433	100	10	0.0209223031998	0.00164494657684	100	10	0.581873703003	0.0331332482772
1000	10	0.13019015789	0.00253773120965	1000	10	0.11389360427856446	0.0023603847702423973	1000	10	0.09268453121185302	0.001816414236580489	1000	10	0.11087138652801513	0.0016934328033239559	1000	10	0.0309282779694	0.00110848590345	1000	10	0.569027900696	0.0103675493106
10000	10	0.284891676903	0.00384009140782	10000	10	0.17806434631347656	0.001114054252191835	10000	10	0.15538835525512695	0.0024978076091814994	10000	10	0.17246220111846924	0.0023824485659318527	10000	10	0.121016025543	0.00105071105139	10000	10	1.40067322254	0.00583352224401
100000	10	1.84725661278	0.00860738744089	100000	10	0.8252989768981933	0.0037080843359275904	100000	10	0.7879442930221557	0.009412516078916211	100000	10	0.7999232530593872	0.003442975393506892	100000	10	0.987622976303	0.00699348302979	100000	10	7.30705575943	0.0165839217599
1000000	10	17.5610994339	0.228322442172	1000000	10	7.252838873863221	0.029052130546213153	1000000	10	7.146207928657532	0.06659760176757985	1000000	10	7.012071299552917	0.059217904448851263	1000000	10	9.69240145683	0.0354453778052	1000000	10	65.3242264032	0.512552576414
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev	lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev	lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev	lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev	lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev	lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	20	0.106477689743	0.00254429925697	1	20	0.10367965698242188	0.003661761341842434	1	20	0.08142082691192627	0.001304584466639188	1	20	0.10027089118957519	0.0020291529595204906	1	20	0.0202306985855	0.00159619251952	1	20	0.571048212051	0.0166919396871
10	20	0.108580899239	0.00173704653824	10	20	0.10489590167999267	0.001977141196109372	10	20	0.08197519779205323	0.0014842098503865223	10	20	0.10038816928863525	0.001957086760826999	10	20	0.0187650680542	0.000845692486156	10	20	0.594776701927	0.0368900941023
100	20	0.118750286102	0.00247623639866	100	20	0.11108210086822509	0.0014801173497056886	100	20	0.08949971199035645	0.0009937446141285785	100	20	0.10723590850830078	0.0013833918448622436	100	20	0.0211876153946	0.000993808448942	100	20	0.561370825768	0.00907051791451
1000	20	0.146431708336	0.00249685551944	1000	20	0.12110791206359864	0.001648524669420912	1000	20	0.09955930709838867	0.0013978961740806384	1000	20	0.11735000610351562	0.0020318895390750882	1000	20	0.0404737234116	0.00122415059261	1000	20	0.577527880669	0.00983965108957
10000	20	0.419492387772	0.00248210434668	10000	20	0.2178968906402588	0.0019298316207276716	10000	20	0.1966566801071167	0.0028489273218240147	10000	20	0.21264209747314453	0.00482341642419078	10000	20	0.197762489319	0.00198188642677	10000	20	1.90710241795	0.00757011452155
100000	20	3.15847921371	0.0550301268026	100000	20	1.1962245225906372	0.010541407803235559	100000	20	1.1518636226654053	0.006410720031542237	100000	20	1.1567201137542724	0.002987096441878969	100000	20	1.75432097912	0.00692372147543	100000	20	9.8267291069	0.127844155326
1000000	20	30.279082489	0.124978814506	1000000	20	10.956057572364807	0.12677108174061705	1000000	20	10.776052689552307	0.04739925571001746	1000000	20	10.640758633613586	0.06116581724028616	1000000	20	17.3383012295	0.0410164637448	1000000	20	83.9448960066	0.46121344046
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev	lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev	lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev	lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev	lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev	lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	50	0.105411934853	0.00171651054128	1	50	0.10458300113677979	0.0016367630302744722	1	50	0.08237688541412354	0.0016494314799953837	1	50	0.10066506862640381	0.002051307639276982	1	50	0.0205577373505	0.00133922342068	1	50	0.572030115128	0.0253648479103
10	50	0.109102797508	0.00111620290512	10	50	0.10616152286529541	0.002345135740908088	10	50	0.08519520759582519	0.002610550182895596	10	50	0.10588631629943848	0.0035835389655972105	10	50	0.0195438146591	0.000791630611893	10	50	0.56993534565	0.0230474303306
100	50	0.135682177544	0.00196166766665	100	50	0.12375867366790771	0.00238414904864133	100	50	0.10423583984375	0.0018808335751867933	100	50	0.11841504573822022	0.001608174845404568	100	50	0.0246078014374	0.00108949795701	100	50	0.563336873055	0.00964411866903
1000	50	0.198261427879	0.00396172489054	1000	50	0.14462883472442628	0.0022428030896492978	1000	50	0.12195603847503662	0.0023611894043373983	1000	50	0.14032282829284667	0.002640027148889162	1000	50	0.063302564621	0.00058195987294	1000	50	0.826378440857	0.00941629472813
10000	50	0.821499919891	0.0111642692132	10000	50	0.34488487243652344	0.004867441221052092	10000	50	0.3163540124893188	0.002761333651520998	10000	50	0.33160474300384524	0.0027796660009712947	10000	50	0.410061001778	0.00294901155085	10000	50	3.27872717381	0.126592845956
100000	50	7.05980975628	0.121182371277	100000	50	2.3394312858581543	0.02263239858944125	100000	50	2.237372374534607	0.009955353920396077	100000	50	2.258401036262512	0.011041280982383895	100000	50	3.87797718048	0.0123467913678	100000	50	17.890055728	0.116794666005
1000000	50	71.5645889759	5.02009516291	1000000	50	21.979821610450745	0.09080404939303836	1000000	50	21.59097549915314	0.081188190530421	1000000	50	21.70080256462097	0.15897944629180621	1000000	50	38.5674883366	0.0602820291386	1000000	50	158.262442636	0.826290454446
lines	columns	q-benchmark-2.7.18_mean	q-benchmark-2.7.18_stddev	lines	columns	q-benchmark-3.6.4_mean	q-benchmark-3.6.4_stddev	lines	columns	q-benchmark-3.7.9_mean	q-benchmark-3.7.9_stddev	lines	columns	q-benchmark-3.8.5_mean	q-benchmark-3.8.5_stddev	lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev	lines	columns	octosql_v0.3.0_mean	octosql_v0.3.0_stddev
1	100	0.10662381649	0.00193146624495	1	100	0.10372309684753418	0.0010299126833031144	1	100	0.08336784839630126	0.0013840724401561887	1	100	0.10147004127502442	0.0021285682695135768	1	100	0.0216581106186	0.00103280947157	1	100	0.569358110428	0.0279801762531
10	100	0.110662698746	0.00171461379583	10	100	0.10784556865692138	0.0016557634029464607	10	100	0.0864112138748169	0.0017946939354350697	10	100	0.10471885204315186	0.001248479289219899	10	100	0.021723818779	0.000920429257416	10	100	0.580981063843	0.0272341107532
100	100	0.163547992706	0.00166570196628	100	100	0.14526791572570802	0.0028194506905186724	100	100	0.12199611663818359	0.0013003743156634682	100	100	0.13894760608673096	0.002307980025026551	100	100	0.0299471855164	0.00130217326679	100	100	0.559471726418	0.00668155858429
1000	100	0.280023741722	0.00337543024145	1000	100	0.18315494060516357	0.0023585311962114673	1000	100	0.15871686935424806	0.0035993681064501234	1000	100	0.17586205005645753	0.0023822296091426	1000	100	0.0996923923492	0.00155352212734	1000	100	1.08161640167	0.00698594638512
10000	100	1.46053376198	0.0221691284465	10000	100	0.5586131334304809	0.004808492789681402	10000	100	0.5243751525878906	0.004370273273595629	10000	100	0.5414002418518067	0.0036291866664635458	10000	100	0.767001605034	0.00328944029633	10000	100	5.67823712826	0.0123398407167
100000	100	13.2369835854	0.309375896258	100000	100	4.287398314476013	0.00957500108409644	100000	100	4.175828623771667	0.016127303710583043	100000	100	4.222555088996887	0.08562968951916528	100000	100	7.46734063625	0.0262039846119	100000	100	32.2797194242	0.315508270241
1000000	100	131.864977288	1.22415449691	1000000	100	41.706851434707644	0.4161526076289425	1000000	100	40.82292411327362	0.12328165162380703	1000000	100	41.021552324295044	0.16033566363076862	1000000	100	74.6216712952	0.0994037504394	1000000	100	289.582628798	0.929455236817


================================================
FILE: test/benchmark-results/source-files-1443b7418b46594ad256abd9db4a7671cb251e6a/2020-09-17-v2.0.17/textql_2.0.3.benchmark-results
================================================
lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev
1	1	0.0196103572845	0.00207355214257
10	1	0.0186784029007	0.000970810220668
100	1	0.019472026825	0.00181951524514
1000	1	0.022180891037	0.00116649968967
10000	1	0.051066827774	0.0018168767618
100000	1	0.307463979721	0.00246268029188
1000000	1	2.89862303734	0.022182722976
lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev
1	5	0.0195286750793	0.0017840569109
10	5	0.0183676958084	0.000925251595491
100	5	0.0199447393417	0.000907007099218
1000	5	0.0263328790665	0.00165486505938
10000	5	0.0826982736588	0.00152451583229
100000	5	0.60660867691	0.00395761320274
1000000	5	5.87811236382	0.0304332294491
lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev
1	10	0.0191783189774	0.00107718516178
10	10	0.0185215950012	0.000840353961363
100	10	0.0209223031998	0.00164494657684
1000	10	0.0309282779694	0.00110848590345
10000	10	0.121016025543	0.00105071105139
100000	10	0.987622976303	0.00699348302979
1000000	10	9.69240145683	0.0354453778052
lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev
1	20	0.0202306985855	0.00159619251952
10	20	0.0187650680542	0.000845692486156
100	20	0.0211876153946	0.000993808448942
1000	20	0.0404737234116	0.00122415059261
10000	20	0.197762489319	0.00198188642677
100000	20	1.75432097912	0.00692372147543
1000000	20	17.3383012295	0.0410164637448
lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev
1	50	0.0205577373505	0.00133922342068
10	50	0.0195438146591	0.000791630611893
100	50	0.0246078014374	0.00108949795701
1000	50	0.063302564621	0.00058195987294
10000	50	0.410061001778	0.00294901155085
100000	50	3.87797718048	0.0123467913678
1000000	50	38.5674883366	0.0602820291386
lines	columns	textql_2.0.3_mean	textql_2.0.3_stddev
1	100	0.0216581106186	0.00103280947157
10	100	0.021723818779	0.000920429257416
100	100	0.0299471855164	0.00130217326679
1000	100	0.0996923923492	0.00155352212734
10000	100	0.767001605034	0.00328944029633
100000	100	7.46734063625	0.0262039846119
1000000	100	74.6216712952	0.0994037504394


================================================
FILE: test/test_suite.py
================================================
#!/usr/bin/env python3

#
# test suite for q.
# 
# Prefer end-to-end tests, running the actual q command and testing stdout/stderr, and the return code.
# Some utilities are provided for making that easy, see other tests for examples.
#
# Q_EXECUTABLE env var can be used to inject the path of q. This allows full e2e testing of the resulting executable
# instead of just testing the python code.
#
# Tests are compatible with Linux and OSX (path separators, tmp folder, etc.).

from __future__ import print_function

import collections
import functools
import tempfile
import unittest
import random
import json
import uuid
from collections import OrderedDict
from json import JSONEncoder
from subprocess import PIPE, Popen, STDOUT
import sys
import os
import time
from tempfile import NamedTemporaryFile
import locale
import pprint
import six
from six.moves import range
import codecs
import itertools
from gzip import GzipFile
import pytest
import uuid
import sqlite3
import re
import collections

sys.path.append(os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])),'..','bin'))
from bin.q import QTextAsData, QOutput, QOutputPrinter, QInputParams, DataStream, Sqlite3DB

# q uses this encoding as the default output encoding. Some of the tests use it in order to 
# make sure that the output is correctly encoded
SYSTEM_ENCODING = locale.getpreferredencoding()

EXAMPLES = os.path.abspath(os.path.join(os.getcwd(), 'examples'))

# Q_SKIP_EXECUTABLE_VALIDATION=true disables the existence check below, so that
# Q_EXECUTABLE may be a bare command name resolved through the PATH.
Q_SKIP_EXECUTABLE_VALIDATION = os.getenv('Q_SKIP_EXECUTABLE_VALIDATION','false')

if Q_SKIP_EXECUTABLE_VALIDATION == 'true':
    # Skip checking of executable (useful for testing that q is in the path).
    # The value is taken verbatim from the environment (None when unset).
    Q_EXECUTABLE = os.getenv('Q_EXECUTABLE')
else:
    # Default to the in-repo script, and insist that the resolved path exists.
    Q_EXECUTABLE = os.path.abspath(os.getenv('Q_EXECUTABLE', os.path.abspath('./bin/q.py')))
    if not os.path.exists(Q_EXECUTABLE):
        raise Exception("q executable must reside in {}".format(Q_EXECUTABLE))

# Debug output is enabled either by a -v command line flag or by a non-empty Q_DEBUG env var
DEBUG = bool(os.environ.get('Q_DEBUG')) or '-v' in sys.argv

def batch(iterable, n=1):
    """Split a sliceable sequence into consecutive chunks of at most `n` items.

    Returns a list of slices of `iterable`; the last chunk may be shorter.
    An empty input yields an empty list.
    """
    total = len(iterable)
    return [iterable[start:min(start + n, total)] for start in range(0, total, n)]

def partition(pred, iterable):
    """Split `iterable` by `pred` into a pair of lists.

    Returns (items for which pred is false, items for which pred is true),
    each in original order. The input is consumed through two tee'd iterators,
    so `pred` is evaluated twice per item.
    """
    rejected_source, accepted_source = itertools.tee(iterable)
    rejected = list(itertools.filterfalse(pred, rejected_source))
    accepted = list(filter(pred, accepted_source))
    return rejected, accepted

def run_command(cmd_to_run,env_to_inject=None):
    """Run `cmd_to_run` through the shell and capture its result.

    `env_to_inject` is the environment passed to the child process; when None,
    the current process environment (os.environ) is used.

    Returns a tuple (returncode, stdout_lines, stderr_lines) where both line
    collections are lists of bytes split on os.linesep. Trailing whitespace is
    stripped from stdout and surrounding whitespace from stderr before the
    split; an empty stream yields an empty list.
    """
    if DEBUG:
        print("CMD: {}".format(cmd_to_run))

    env = os.environ if env_to_inject is None else env_to_inject

    proc = Popen(cmd_to_run, stdout=PIPE, stderr=PIPE, shell=True,env=env)
    raw_out, raw_err = proc.communicate()

    empty = six.b('')
    line_separator = six.b(os.linesep)

    # stdout keeps its leading whitespace, stderr is stripped on both sides
    stdout_bytes = raw_out.rstrip()
    stderr_bytes = raw_err.strip()

    out_lines = stdout_bytes.split(line_separator) if stdout_bytes != empty else []
    err_lines = stderr_bytes.split(line_separator) if stderr_bytes != empty else []

    res = (proc.returncode, out_lines, err_lines)
    if DEBUG:
        print("RESULT:{}".format(res))
    return res


# Fixture: `ls -l`-style listing in which rows have a differing number of
# whitespace-separated fields (the symlink rows carry extra "-> target" tokens).
uneven_ls_output = six.b("""drwxr-xr-x   2 root     root      4096 Jun 11  2012 /selinux
drwxr-xr-x   2 root     root      4096 Apr 19  2013 /mnt
drwxr-xr-x   2 root     root      4096 Apr 24  2013 /srv
drwx------   2 root     root     16384 Jun 21  2013 /lost+found
lrwxrwxrwx   1 root     root        33 Jun 21  2013 /initrd.img.old -> /boot/initrd.img-3.8.0-19-generic
drwxr-xr-x   2 root     root      4096 Jun 21  2013 /cdrom
drwxr-xr-x   3 root     root      4096 Jun 21  2013 /home
lrwxrwxrwx   1 root     root        29 Jun 21  2013 /vmlinuz -> boot/vmlinuz-3.8.0-19-generic
lrwxrwxrwx   1 root     root        32 Jun 21  2013 /initrd.img -> boot/initrd.img-3.8.0-19-generic
""")


# Fixture: whitespace-aligned file listing (looks like `find -ls` output - TODO confirm),
# with 11 whitespace-separated fields per row, the last one being the path.
find_output = six.b("""8257537   32 drwxrwxrwt 218 root     root        28672 Mar  1 11:00 /tmp
8299123    4 drwxrwxr-x   2 harel    harel        4096 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576
8263229  964 -rw-rw-r--   1 mapred   mapred      984569 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576/stormcode.ser
8263230    4 -rw-rw-r--   1 harel    harel        1223 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576/stormconf.ser
8299113    4 drwxrwxr-x   2 harel    harel        4096 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate
8263406    4 -rw-rw-r--   1 harel    harel        2002 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514168746
8263476    0 -rw-rw-r--   1 harel    harel           0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514168746.version
8263607    0 -rw-rw-r--   1 harel    harel           0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514169735.version
8263533    0 -rw-rw-r--   1 harel    harel           0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514172733.version
8263604    0 -rw-rw-r--   1 harel    harel           0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514175754.version
""")


# Comma-separated fixtures (all values are bytes).
# Three column names matching the three columns of the sample rows.
header_row = six.b('name,value1,value2')
# Three rows; the last row has an empty second column.
sample_data_rows = [six.b('a,1,0'), six.b('b,2,0'), six.b('c,,0')]
# Same layout, but the second column holds text values (and one empty value).
sample_data_rows_with_empty_string = [six.b('a,aaa,0'), six.b('b,bbb,0'), six.b('c,,0')]
# Newline-joined versions of the rows above, each ending with a trailing newline.
sample_data_no_header = six.b("\n").join(sample_data_rows) + six.b("\n")
sample_data_with_empty_string_no_header = six.b("\n").join(
    sample_data_rows_with_empty_string) + six.b("\n")
sample_data_with_header = header_row + six.b("\n") + sample_data_no_header
# Header names only two columns while the data rows contain three.
sample_data_with_missing_header_names = six.b("name,value1\n") + sample_data_no_header

def generate_sample_data_with_header(header):
    """Return the standard sample rows preceded by the given header line (bytes)."""
    return six.b("\n").join([header, sample_data_no_header])

# Space-delimited fixture whose header names the quoting style used by each column.
# A row of quoted values (two of them spanning lines) sits between two identical
# control rows. Note that \\" in these literals produces a backslash followed by a
# double quote in the resulting data.
sample_quoted_data = six.b('''non_quoted regular_double_quoted double_double_quoted escaped_double_quoted multiline_double_double_quoted multiline_escaped_double_quoted
control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
non-quoted-value "this is a quoted value" "this is a ""double double"" quoted value" "this is an escaped \\"quoted value\\"" "this is a double double quoted ""multiline
  value""." "this is an escaped \\"multiline
  value\\"."
control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
''')

# Quotes inside a quoted value are written as two consecutive double quotes.
double_double_quoted_data = six.b('''regular_double_quoted double_double_quoted
"this is a quoted value" "this is a quoted value with ""double double quotes"""
''')

# Quotes inside a quoted value are written as backslash + double quote.
escaped_double_quoted_data = six.b('''regular_double_quoted escaped_double_quoted
"this is a quoted value" "this is a quoted value with \\"escaped double quotes\\""
''')

# Both quote-embedding styles side by side in a single row.
combined_quoted_data = six.b('''regular_double_quoted double_double_quoted escaped_double_quoted
"this is a quoted value" "this is a quoted value with ""double double quotes""" "this is a quoted value with \\"escaped double quotes\\""
''')

# Two space-delimited rows, the first with a quoted value that contains a space.
sample_quoted_data2 = six.b('"quoted data" 23\nunquoted-data 54')

# Two colon-delimited rows, the first with a quoted value that contains a newline.
sample_quoted_data2_with_newline = six.b('"quoted data with\na new line inside it":23\nunquoted-data:54')

# Lines that contain no commas at all.
one_column_data = six.b('''data without commas 1
data without commas 2
''')

# Values with leading whitespace (second row: both the name and the first value)
sample_data_rows_with_spaces = [six.b('a,1,0'), six.b('   b,   2,0'), six.b('c,,0')]
sample_data_with_spaces_no_header = six.b("\n").join(
    sample_data_rows_with_spaces) + six.b("\n")

# Header in which the second column name contains a space ('value 1')
header_row_with_spaces = six.b('name,value 1,value2')
sample_data_with_spaces_with_header = header_row_with_spaces + \
    six.b("\n") + sample_data_with_spaces_no_header

# 23-digit number, which does not fit into a signed 64-bit integer
long_value1 = "23683289372328372328373"
# 13-digit number, which does fit into a signed 64-bit integer
int_value = "2328372328373"
# Three lines without a trailing newline. Unlike the fixtures above this is a
# text string, not bytes.
sample_data_with_long_values = "%s\n%s\n%s" % (long_value1,int_value,int_value)


def one_column_warning(e):
    """Tell whether the first stderr line in `e` is q's single-column warning."""
    expected_prefix = six.b('Warning: column count is one')
    first_line = e[0]
    return first_line.startswith(expected_prefix)

def sqlite_dict_factory(cursor, row):
    """sqlite3 row factory which returns each row as a {column name: value} dict.

    Column names are taken from the first element of each cursor.description entry.
    """
    return {column[0]: row[position] for position, column in enumerate(cursor.description)}

class AbstractQTestCase(unittest.TestCase):
    """Base test case providing temporary file/folder helpers for the q tests."""

    def create_file_with_data(self, data, encoding=None,prefix=None,suffix=None,use_real_path=True):
        """Write `data` (bytes) to a new temporary file which is kept on disk.

        `encoding` is deprecated and must be None. Returns the closed temp file
        object; its `name` is replaced by the real (symlink-resolved) path unless
        `use_real_path` is False. Use `cleanup()` to delete the file.
        """
        if encoding is not None:
            raise Exception('Deprecated: Encoding must be none')
        tmpfile = NamedTemporaryFile(delete=False,prefix=prefix,suffix=suffix)
        try:
            tmpfile.write(data)
        finally:
            # Never leave the handle open, even if the write fails
            tmpfile.close()
        if use_real_path:
            tmpfile.name = os.path.realpath(tmpfile.name)
        return tmpfile

    def generate_tmpfile_name(self,prefix=None,suffix=None):
        """Return the real path of a temporary filename that does not exist on disk."""
        tmpfile = NamedTemporaryFile(delete=False,prefix=prefix,suffix=suffix)
        # Close the handle before deleting, so no open descriptor to the file is leaked
        tmpfile.close()
        os.remove(tmpfile.name)
        return os.path.realpath(tmpfile.name)

    def arrays_to_csv_file_content(self,delimiter,header_row_list,cell_list):
        """Join a header row and data rows (lists of bytes) into delimited content.

        Rows are separated by newlines, without a trailing newline.
        """
        all_rows = [delimiter.join(row) for row in [header_row_list] + cell_list]
        return six.b("\n").join(all_rows)

    def create_qsql_file_with_content_and_return_filename(self, header_row,cell_list):
        """Create a csv file from the rows and let q write a qsql cache file for it.

        q is run over the csv file with `-C readwrite`. Asserts that q succeeded
        and that `<csv filename>.qsql` exists, and returns that filename.
        """
        csv_content = self.arrays_to_csv_file_content(six.b(','),header_row,cell_list)
        tmpfile = self.create_file_with_data(csv_content)

        cmd = '%s -d , -H "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile.name)
        r, o, e = run_command(cmd)
        self.assertEqual(r, 0)

        created_qsql_filename = '%s.qsql' % tmpfile.name
        self.assertTrue(os.path.exists(created_qsql_filename))

        return created_qsql_filename

    def arrays_to_qsql_file_content(self, header_row,cell_list):
        """Return the bytes of the qsql cache file which q generates for the given rows."""
        qsql_filename = self.create_qsql_file_with_content_and_return_filename(header_row,cell_list)
        with open(qsql_filename,'rb') as f:
            return f.read()

    def write_file(self,filename,data):
        """Write `data` (bytes) into `filename`, replacing any existing content."""
        with open(filename,'wb') as f:
            f.write(data)

    def create_folder_with_files(self,filename_to_content_dict,prefix, suffix):
        """Create a new folder under /var/tmp, populated with the given files.

        `filename_to_content_dict` maps relative filenames to bytes content.
        Filenames may contain sub-folders, which are created on demand.
        Returns the path of the created folder.
        """
        name = self.random_tmp_filename(prefix,suffix)
        os.makedirs(name)
        for filename,content in filename_to_content_dict.items():
            if os.path.sep in filename:
                subfolder = '%s/%s' % (name,os.path.split(filename)[0])
                # Several files may share a sub-folder, so create it only once
                if not os.path.isdir(subfolder):
                    os.makedirs(subfolder)
            with open(os.path.join(name,filename),'wb') as f:
                f.write(content)
        return name

    def cleanup_folder(self,tmpfolder):
        """Placeholder for temporary folder cleanup; currently removes nothing.

        Raises an Exception if `tmpfolder` is not located under /var/tmp.
        """
        if not tmpfolder.startswith(os.path.realpath('/var/tmp')):
            raise Exception('Guard against accidental folder deletions: %s' % tmpfolder)
        if not DEBUG:
            # TODO re-add the actual removal. Note that os.remove() cannot delete
            # a folder, so this would need something like shutil.rmtree().
            print("should have removed tmpfolder %s. Not doing it for the sake of safety. # TODO re-add" % tmpfolder)

    def cleanup(self, tmpfile):
        """Delete a file created by `create_file_with_data`, unless in debug mode."""
        if not DEBUG:
            os.remove(tmpfile.name)

    def random_tmp_filename(self,prefix,postfix):
        """Return a random `<prefix>-<number>.<postfix>` path under /var/tmp.

        Nothing is created on disk, and uniqueness is not guaranteed.
        """
        # TODO Use more robust method for this
        path = '/var/tmp'
        return os.path.realpath('%s/%s-%s.%s' % (path,prefix,random.randint(0,1000000000),postfix))


def get_sqlite_table_list(c,exclude_qcatalog=True):
    """List the table names of the sqlite database behind connection/cursor `c`.

    Returns the fetched rows (one 1-tuple per table). The `_qcatalog` table is
    left out unless `exclude_qcatalog` is false.
    """
    if not exclude_qcatalog:
        query = "select tbl_name from sqlite_master where type='table'"
    else:
        query = "select tbl_name from sqlite_master where type='table' and tbl_name != '_qcatalog'"

    return c.execute(query).fetchall()

class SaveToSqliteTests(AbstractQTestCase):

    # Returns a folder with files and a header in each, one column named 'a'
    def generate_files_in_folder(self,batch_size, file_count):
        """Create a temp folder with `file_count` files named file-0, file-1, ...

        Each file has the header line 'a' followed by `batch_size` consecutive
        integers, one per line; the numbering continues from one file to the
        next, starting at 1. Returns (folder path, list of the filenames).
        """
        total_rows = batch_size * file_count
        chunks = batch([str(value) for value in range(1, 1 + total_rows)], n=batch_size)
        file_contents = [six.b('a\n' + "\n".join(chunk) + '\n') for chunk in chunks]

        filename_list = ['file-%s' % index for index in range(file_count)]

        files = collections.OrderedDict()
        for filename, content in zip(filename_list, file_contents):
            files[filename] = content

        tmpfolder = self.create_folder_with_files(files, 'split-files', 'sqlite-stuff')
        return (tmpfolder,filename_list)

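    # NOTE (3.8.2021 10:53): the following invocation fails because it takes the qsql files as well
    # (presumably the trailing xxxx/* glob also matches the .qsql files written by -C readwrite):
    #   bin/q.py "select count(*) from xxxx/file-95 left join xxxx/file-96 left join xxxx/file-97 left join xxxx/file-97 left join xxxx/file-98 left join xxxx/*" -c 1 -C readwrite -A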

    def test_save_glob_files_to_sqlite(self):
        # Querying a glob with -S is expected to store all the matching files as one
        # table (file_dash_0) in the output sqlite file.
        rows_per_file = 50
        file_count = 5
        total_rows = rows_per_file * file_count

        tmpfolder, filename_list = self.generate_files_in_folder(rows_per_file, file_count)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        cmd = '%s -H "select count(*) from %s/*" -c 1 -S %s' % (Q_EXECUTABLE,tmpfolder,output_sqlite_file)
        retcode, o, e = run_command(cmd)

        # Nothing on stdout when saving to disk, and 4 lines on stderr
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        conn = sqlite3.connect(output_sqlite_file)
        rows = conn.execute('select a from file_dash_0').fetchall()
        self.assertEqual(len(rows), total_rows)
        self.assertEqual(sum(row[0] for row in rows), sum(range(1, total_rows + 1)))
        self.assertEqual(len(get_sqlite_table_list(conn)), 1)
        conn.close()

        self.cleanup_folder(tmpfolder)

    def test_save_multiple_files_to_sqlite(self):
        # Left-joining several separate files with -S is expected to store one
        # table per file (file_dash_<n>) in the output sqlite file.
        rows_per_file = 50
        file_count = 5

        tmpfolder, filename_list = self.generate_files_in_folder(rows_per_file, file_count)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        joined_tables = " left join ".join("%s/%s" % (tmpfolder, filename) for filename in filename_list)
        cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,joined_tables,output_sqlite_file)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        conn = sqlite3.connect(output_sqlite_file)

        self.assertEqual(len(get_sqlite_table_list(conn)), file_count)

        for file_index in range(file_count):
            # Each file holds the next `rows_per_file` consecutive integers
            first_value = 1 + file_index * rows_per_file
            rows = conn.execute('select a from file_dash_%s' % file_index).fetchall()
            self.assertEqual(len(rows), rows_per_file)
            self.assertEqual(sum(row[0] for row in rows), sum(range(first_value, first_value + rows_per_file)))

        conn.close()

        self.cleanup_folder(tmpfolder)

    def test_save_multiple_files_to_sqlite_without_duplicates(self):
        # Mentioning every file twice in the query must not create duplicate
        # tables in the output sqlite file.
        rows_per_file = 50
        file_count = 5

        tmpfolder, filename_list = self.generate_files_in_folder(rows_per_file, file_count)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        single_pass = " left join ".join("%s/%s" % (tmpfolder, filename) for filename in filename_list)

        # duplicate the left-joins for all the files, so the query will contain each filename twice
        joined_tables = " left join ".join([single_pass, single_pass])

        cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,joined_tables,output_sqlite_file)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        conn = sqlite3.connect(output_sqlite_file)

        # total table count should still be file_count, even with the duplications
        self.assertEqual(len(get_sqlite_table_list(conn)), file_count)

        for file_index in range(file_count):
            first_value = 1 + file_index * rows_per_file
            rows = conn.execute('select a from file_dash_%s' % file_index).fetchall()
            self.assertEqual(len(rows), rows_per_file)
            self.assertEqual(sum(row[0] for row in rows), sum(range(first_value, first_value + rows_per_file)))

        conn.close()

        self.cleanup_folder(tmpfolder)

    def test_sqlite_file_is_not_created_if_some_table_does_not_exist(self):
        # When one of the joined files does not exist, q is expected to fail with
        # return code 30 and not leave an output sqlite file behind.
        rows_per_file = 50
        file_count = 5

        tmpfolder, filename_list = self.generate_files_in_folder(rows_per_file, file_count)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        existing_tables = " left join ".join("%s/%s" % (tmpfolder, filename) for filename in filename_list)
        joined_tables = existing_tables + ' left join %s/non_existent_table' % (tmpfolder)

        cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,joined_tables,output_sqlite_file)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 30)
        self.assertEqual(len(e), 2)
        expected_save_notice = six.b("Going to save data into a disk database: %s" % output_sqlite_file)
        expected_error = six.b("No files matching '%s/non_existent_table' have been found" % tmpfolder)
        self.assertEqual(e[0], expected_save_notice)
        self.assertEqual(e[1], expected_error)

        self.assertFalse(os.path.exists(output_sqlite_file))

        self.cleanup_folder(tmpfolder)

    def test_recurring_glob_and_separate_files_in_same_query_when_writing_to_sqlite(self):
        # The query references every file separately and, in addition, all of them
        # through a glob. The output sqlite file is expected to contain one table per
        # file plus one more table which holds the data of the whole glob.
        rows_per_file = 50
        file_count = 5
        total_rows = file_count * rows_per_file

        tmpfolder, filename_list = self.generate_files_in_folder(rows_per_file, file_count)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        joined_tables = " left join ".join("%s/%s" % (tmpfolder, filename) for filename in filename_list)
        # The same files are left-joined in the query as an additional "left join <folder>/*". This should create an additional table
        # in the sqlite file, with all the data in it
        cmd = '%s -H "select count(*) from %s left join %s/*" -c 1 -S %s' % (Q_EXECUTABLE,joined_tables,tmpfolder,output_sqlite_file)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        conn = sqlite3.connect(output_sqlite_file)

        # plus the additional table from the glob
        self.assertEqual(len(get_sqlite_table_list(conn)), file_count + 1)

        # check all the per-file tables
        for file_index in range(file_count):
            first_value = 1 + file_index * rows_per_file
            rows = conn.execute('select a from file_dash_%s' % file_index).fetchall()
            self.assertEqual(len(rows), rows_per_file)
            self.assertEqual(sum(row[0] for row in rows), sum(range(first_value, first_value + rows_per_file)))

        # ensure the glob-based table exists, with an _2 added to the name, as the original "file_dash_0" already exists in the sqlite db
        glob_rows = conn.execute('select a from file_dash_0_2').fetchall()
        self.assertEqual(len(glob_rows), total_rows)
        self.assertEqual(sum(row[0] for row in glob_rows), sum(range(1, 1 + total_rows)))
        conn.close()

        self.cleanup_folder(tmpfolder)

    def test_empty_sqlite_handling(self):
        # Querying a sqlite file which contains no tables is expected to fail with
        # return code 88 and a single error line.
        sqlite_filename = self.generate_tmpfile_name("empty",".sqlite")

        # Creating and then dropping a table leaves a sqlite file without any tables
        conn = sqlite3.connect(sqlite_filename)
        for statement in ('create table x (a int)', 'drop table x'):
            conn.execute(statement).fetchall()
        conn.close()

        retcode, o, e = run_command('%s "select * from %s"' % (Q_EXECUTABLE,sqlite_filename))

        self.assertEqual(retcode,88)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)
        self.assertEqual(e[0],six.b('sqlite file %s has no tables' % sqlite_filename))

    def test_storing_to_disk_too_many_qsql_files(self):
        # Creates more qsql files than the allowed number of attached databases,
        # unions all of them in one query which is saved to disk using -S, and then
        # verifies that the output sqlite file contains one table per source file,
        # each with the expected content.
        BATCH_SIZE = 10
        MAX_ATTACHED_DBS = 5
        FILE_COUNT = MAX_ATTACHED_DBS + 4

        numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)

        content_list = [six.b("\n".join(x)) for x in numbers_as_text]

        filename_list = ['file-%s' % x for x in range(FILE_COUNT)]
        d = collections.OrderedDict(zip(filename_list, content_list))

        tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')

        # Run q once over each file with -C readwrite, so a qsql file is written next to it
        for fn in filename_list:
            cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)

        output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")

        table_refs = ['select * from %s/%s.qsql' % (tmpfolder,x) for x in filename_list]
        table_refs_str = " UNION ALL ".join(table_refs)
        # Limit max attached dbs according to the parameter (must be below the hardcoded sqlite limit, which is 10 when having a standard version compiled)
        cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
        retcode, o, e = run_command(cmd)
        self.assertEqual(retcode,0)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),4)

        c = sqlite3.connect(output_sqlite_file)
        try:
            tables_results = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
            table_names = sorted([x[0] for x in tables_results])
            self.assertEqual(len(table_names),FILE_COUNT)

            for i,tn in enumerate(table_names):
                self.assertEqual(tn,'file_dash_%s' % i)

                table_content = c.execute('select * from %s' % tn).fetchall()
                self.assertEqual(len(table_content),BATCH_SIZE)

                # The stored table must also be readable through q itself
                cmd = '%s "select * from %s:::%s"' % (Q_EXECUTABLE,output_sqlite_file,tn)
                retcode, o, e = run_command(cmd)
                self.assertEqual(retcode, 0)
                self.assertEqual(len(e),0)
                self.assertEqual(len(o),BATCH_SIZE)
                self.assertEqual(o,[six.b(str(x)) for x in range(1 + i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)])
        finally:
            # The connection used to be left open; close it even when an assertion fails
            c.close()

        self.cleanup_folder(tmpfolder)

    def test_storing_to_disk_too_many_sqlite_files(self):
        # A variation of test_storing_to_disk_too_many_qsql_files: the _qcatalog table is dropped from
        # every generated cache file and the file is renamed to .sqlite, so they are just regular
        # sqlite files. The number of files is larger than --max-attached-sqlite-databases.

        BATCH_SIZE = 10
        MAX_ATTACHED_DBS = 5
        FILE_COUNT = MAX_ATTACHED_DBS + 4

        numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)

        content_list = map(six.b, ["\n".join(x) for x in numbers_as_text])

        filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
        d = collections.OrderedDict(zip(filename_list, content_list))

        tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')

        for fn in filename_list:
            cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)

            # Strip the catalog table from the generated .qsql file and rename it to .sqlite.
            # The connection is closed even if the drop fails, so no handle is left on the file.
            qsql_conn = sqlite3.connect('%s/%s.qsql' % (tmpfolder,fn))
            try:
                qsql_conn.execute('drop table _qcatalog').fetchall()
            finally:
                qsql_conn.close()
            os.rename('%s/%s.qsql' % (tmpfolder,fn),'%s/%s.sqlite' % (tmpfolder,fn))

        output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")

        table_refs = list(['select * from %s/%s.sqlite' % (tmpfolder,x) for x in filename_list])
        table_refs_str = " UNION ALL ".join(table_refs)
        # Limit max attached dbs according to the parameter (must be below the hardcoded sqlite limit, which is 10 when having a standard version compiled)
        cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
        retcode, o, e = run_command(cmd)
        self.assertEqual(retcode,0)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),4)

        c = sqlite3.connect(output_sqlite_file)
        try:
            tables_results = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
            table_names = list(sorted([x[0] for x in tables_results]))
            self.assertEqual(len(table_names),FILE_COUNT)

            for i,tn in enumerate(table_names):
                self.assertEqual(tn,'file_dash_%s' % i)

                # Validate the stored table natively through sqlite3
                table_content = c.execute('select * from %s' % tn).fetchall()
                self.assertEqual(len(table_content),BATCH_SIZE)

                # Validate the same table through q, using the <file>:::<table> syntax
                cmd = '%s "select * from %s:::%s"' % (Q_EXECUTABLE,output_sqlite_file,tn)
                retcode, o, e = run_command(cmd)
                self.assertEqual(retcode, 0)
                self.assertEqual(len(e),0)
                self.assertEqual(len(o),BATCH_SIZE)
                self.assertEqual(o,list([six.b(str(x)) for x in range(1 + i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)]))
        finally:
            # Release the saved database even when one of the assertions above fails
            c.close()

        self.cleanup_folder(tmpfolder)

    def test_storing_to_disk_too_many_sqlite_files__over_the_sqlite_limit(self):
        # Same flow as test_storing_to_disk_too_many_sqlite_files, except that the requested
        # attach limit is above the standard sqlite hardcoded limit (10), so q should fail.
        BATCH_SIZE = 10
        MAX_ATTACHED_DBS = 20
        FILE_COUNT = MAX_ATTACHED_DBS + 4

        all_numbers = [str(n) for n in range(1, 1 + BATCH_SIZE * FILE_COUNT)]
        per_file_lines = batch(all_numbers, n=BATCH_SIZE)
        content_list = map(six.b, ["\n".join(lines) for lines in per_file_lines])

        filename_list = ['file-%s' % idx for idx in range(FILE_COUNT)]
        files_to_create = collections.OrderedDict(zip(filename_list, content_list))

        tmpfolder = self.create_folder_with_files(files_to_create, 'split-files', 'attach-limit')

        for name in filename_list:
            cache_cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, name)
            retcode, o, e = run_command(cache_cmd)
            self.assertEqual(retcode, 0)

            # Turn the generated .qsql file into a plain sqlite file: no catalog, .sqlite suffix
            qsql_path = '%s/%s.qsql' % (tmpfolder,name)
            conn = sqlite3.connect(qsql_path)
            conn.execute('drop table _qcatalog').fetchall()
            conn.close()
            os.rename(qsql_path,'%s/%s.sqlite' % (tmpfolder,name))

        output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")

        union_of_selects = " UNION ALL ".join(
            ['select * from %s/%s.sqlite' % (tmpfolder,name) for name in filename_list])

        # The requested limit is passed as-is; since it is above what sqlite supports, q should report an error
        cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,union_of_selects,output_sqlite_file,MAX_ATTACHED_DBS)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,89)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),2)

        expected_prefixes = [
            six.b('Going to save data into'),
            six.b('There are too many attached databases. Use a proper --max-attached-sqlite-databases parameter which is below the maximum'),
        ]
        for stderr_line, prefix in zip(e, expected_prefixes):
            self.assertTrue(stderr_line.startswith(prefix))

        self.cleanup_folder(tmpfolder)

    def test_qtable_name_normalization__starting_with_a_digit(self):
        # A file whose name starts with a digit is expected to be stored under a table
        # name prefixed with 't_' when saving to a sqlite database.
        base_filename_with_digits = '010'

        rows = []
        for i in range(1, 101):
            cell = six.b(str(i))
            rows.append([cell, cell, cell])

        header = [six.b(name) for name in ('aa', 'bb', 'cc')]

        folder_content = {
            base_filename_with_digits : self.arrays_to_csv_file_content(six.b(','),header,rows)
        }
        new_tmp_folder = self.create_folder_with_files(folder_content,prefix='xx',suffix='digits')

        effective_filename = '%s/010' % new_tmp_folder
        output_sqlite_filename = self.generate_tmpfile_name("starting-with-digit",".sqlite")

        cmd = '%s -d , -H "select count(aa),count(bb),count(cc) from %s" -S %s' % (Q_EXECUTABLE,effective_filename,output_sqlite_filename)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),4)

        # Read the saved table natively and make sure all 100 rows are there
        conn = sqlite3.connect(output_sqlite_filename)
        stored_rows = conn.execute('select aa,bb,cc from t_%s' % base_filename_with_digits).fetchall()
        self.assertEqual(stored_rows,[(n,n,n) for n in range(1,101)])
        conn.close()

        self.cleanup_folder(new_tmp_folder)

    def test_qtable_name_normalization(self):
        # Joins a file with itself, referencing it once by its bare name and once as ./<name>.
        # The command runs from inside the file's folder so both references resolve to the same file.
        numbers = [six.b(a) for a in map(str, range(1, 101))]
        large_file_data = six.b("val\n") + six.b("\n").join(numbers)
        tmpfile = self.create_file_with_data(large_file_data)

        tmpfile_folder = os.path.dirname(tmpfile.name)
        tmpfile_basename = os.path.basename(tmpfile.name)

        cmd = 'cd %s && %s -c 1 -H -D , -O "select a.val,b.val from %s a cross join ./%s b on (a.val = b.val * 2)"' % (tmpfile_folder,Q_EXECUTABLE,tmpfile_basename,tmpfile_basename)
        retcode, o, e = run_command(cmd)
        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 51)

        # One header row followed by a row for every even value and its half
        evens = [n for n in range(1, 101) if n % 2 == 0]
        # Floor division keeps the half an int on both python 2 and 3, instead of relying
        # on %d truncating the float that true division produces on python 3
        expected_result_rows = [six.b('val,val')] + [six.b('%d,%d' % (n,n // 2)) for n in evens]
        self.assertEqual(o,expected_result_rows)

        # Remove the temporary data file, as the other tests that call create_file_with_data do
        self.cleanup(tmpfile)

    def test_qtable_name_normalization2(self):
        # A query that ends right after the FROM keyword should be rejected
        retcode, o, e = run_command('%s "select * from"' % Q_EXECUTABLE)

        expected_error = six.b('FROM/JOIN is missing a table name after it')

        self.assertEqual(retcode, 118)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],expected_error)

    def test_qtable_name_normalization3(self):
        # Same as test_qtable_name_normalization2, but the query has a trailing space after the FROM
        retcode, o, e = run_command('%s "select * from "' % Q_EXECUTABLE)

        expected_error = six.b('FROM/JOIN is missing a table name after it')

        self.assertEqual(retcode, 118)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],expected_error)

    def test_save_multiple_files_to_sqlite_while_caching_them(self):
        # Saves a join over several generated files into one sqlite database (-S) while also
        # caching each file (-C readwrite). Then validates the saved database, each source file,
        # each resulting qsql file, and finally the content of the folder itself.
        import glob

        BATCH_SIZE = 50
        FILE_COUNT = 5

        tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        joined_tables = " left join ".join(["%s/%s" % (tmpfolder,name) for name in filename_list])
        cmd = '%s -H "select count(*) from %s" -c 1 -S %s -C readwrite' % (Q_EXECUTABLE,joined_tables,output_sqlite_file)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        conn = sqlite3.connect(output_sqlite_file)

        stored_tables = get_sqlite_table_list(conn)
        self.assertEqual(len(stored_tables), FILE_COUNT)

        for i,filename in enumerate(filename_list):
            # Every file holds one consecutive batch of numbers, so its sum identifies it
            expected_sum = sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE))

            # the table that was saved for this file inside the output database
            matching_table_name = 'file_dash_%s' % i
            stored_rows = conn.execute('select a from %s' % matching_table_name).fetchall()
            self.assertEqual(len(stored_rows),BATCH_SIZE)
            self.assertEqual(sum(row[0] for row in stored_rows),expected_sum)

            # the original file is still readable through q
            cmd = '%s -c 1 -H "select a from %s/%s"' % (Q_EXECUTABLE,tmpfolder,filename)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), BATCH_SIZE)
            self.assertEqual(sum(int(v) for v in o),expected_sum)
            self.assertEqual(len(e), 0)

            # analysis reports file-with-unused-qsql, since by default `-C none` is used,
            # which means the cache is not read even if it exists
            cmd = '%s -c 1 -H "select a from %s/%s" -A' % (Q_EXECUTABLE,tmpfolder,filename)
            retcode, o, e = run_command(cmd)

            expected_analysis = [
                six.b('Table: %s/file-%s' % (tmpfolder,i)),
                six.b('  Sources:'),
                six.b('    source_type: file-with-unused-qsql source: %s/file-%s' % (tmpfolder,i)),
                six.b('  Fields:'),
                six.b('    `a` - int')
            ]
            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 5)
            self.assertEqual(o,expected_analysis)

            # with `-C read`, analysis reports that the qsql file is used instead of the original
            cmd = '%s -c 1 -H "select a from %s/%s" -A -C read' % (Q_EXECUTABLE,tmpfolder,filename)
            retcode, o, e = run_command(cmd)

            expected_analysis = [
                six.b('Table: %s/file-%s' % (tmpfolder,i)),
                six.b('  Sources:'),
                six.b('    source_type: qsql-file-with-original source: %s/file-%s.qsql' % (tmpfolder,i)),
                six.b('  Fields:'),
                six.b('    `a` - int')
            ]
            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 5)
            self.assertEqual(o,expected_analysis)

            # the qsql file is readable directly through q
            cmd = '%s -c 1 -H "select a from %s/%s.qsql"' % (Q_EXECUTABLE,tmpfolder,filename)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), BATCH_SIZE)
            self.assertEqual(sum(int(v) for v in o),expected_sum)
            self.assertEqual(len(e), 0)

            # analysis directly against the qsql file reports a plain qsql-file source
            cmd = '%s -c 1 -H "select a from %s/%s.qsql" -A' % (Q_EXECUTABLE,tmpfolder,filename)
            retcode, o, e = run_command(cmd)

            expected_analysis = [
                six.b('Table: %s/file-%s.qsql' % (tmpfolder,i)),
                six.b('  Sources:'),
                six.b('    source_type: qsql-file source: %s/file-%s.qsql' % (tmpfolder,i)),
                six.b('  Fields:'),
                six.b('    `a` - int')
            ]
            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 5)
            self.assertEqual(o,expected_analysis)
        conn.close()

        # The folder should now contain every original file plus one .qsql file per original
        filename_list_with_qsql = [name + '.qsql' for name in filename_list]

        files_in_folder = glob.glob('%s/*' % tmpfolder)
        regular_files,qsql_files = partition(lambda x: x.endswith('.qsql'),files_in_folder)

        self.assertEqual(len(files_in_folder),2*FILE_COUNT)
        self.assertEqual(sorted([os.path.basename(p) for p in regular_files]),sorted([os.path.basename(p) for p in filename_list]))
        self.assertEqual(sorted([os.path.basename(p) for p in qsql_files]),sorted([os.path.basename(p) for p in filename_list_with_qsql]))

        self.cleanup_folder(tmpfolder)

    def test_globs_ignore_matching_qsql_files(self):
        # After caching every file with -C readwrite, a glob over the whole folder should
        # still return each row exactly once.
        BATCH_SIZE = 10
        FILE_COUNT = 5

        tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)

        joined_tables = " left join ".join(["%s/%s" % (tmpfolder,name) for name in filename_list])
        caching_cmd = '%s -H "select count(*) from %s" -c 1 -C readwrite' % (Q_EXECUTABLE,joined_tables)
        retcode, o, e = run_command(caching_cmd)

        # A join without a condition yields BATCH_SIZE^FILE_COUNT rows
        expected_join_count = six.b(str(BATCH_SIZE ** FILE_COUNT))
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],expected_join_count)

        glob_cmd = '%s -H "select a from %s/*" -c 1 -C read' % (Q_EXECUTABLE,tmpfolder)
        retcode, o, e = run_command(glob_cmd)

        total_rows = BATCH_SIZE*FILE_COUNT
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), total_rows)
        self.assertEqual(len(e), 0)
        self.assertEqual(sum(int(v) for v in o),sum(range(1,1+total_rows)))

        self.cleanup_folder(tmpfolder)

    def test_error_on_reading_from_multi_table_sqlite_without_explicit_table_name(self):
        # Saves several files into one sqlite database and then verifies that querying that
        # database without naming a table fails with a message listing the existing tables.
        BATCH_SIZE = 50
        FILE_COUNT = 5

        tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
        output_sqlite_file = self.random_tmp_filename("x","sqlite")

        joined_tables = " left join ".join(["%s/%s" % (tmpfolder,name) for name in filename_list])
        save_cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,joined_tables,output_sqlite_file)
        retcode, o, e = run_command(save_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        query_cmd = '%s -H "select count(*) from %s"' % (Q_EXECUTABLE,output_sqlite_file)
        retcode, o, e = run_command(query_cmd)

        expected_error = six.b("Could not autodetect table name in sqlite file %s . Existing tables: file_dash_0,file_dash_1,file_dash_2,file_dash_3,file_dash_4" % output_sqlite_file)
        self.assertEqual(retcode, 87)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],expected_error)

        self.cleanup_folder(tmpfolder)

    def test_error_on_trying_to_specify_an_explicit_non_existent_qsql_file(self):
        # Using the <file>:::<table> syntax on a file that does not exist should fail cleanly
        retcode, o, e = run_command(
            '%s -H "select count(*) from /non-existent-folder/non-existent.qsql:::mytable"' % (Q_EXECUTABLE))

        expected_error = six.b("Could not find file /non-existent-folder/non-existent.qsql")

        self.assertEqual(retcode, 30)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],expected_error)

    def test_error_on_providing_a_non_qsql_file_when_specifying_an_explicit_table(self):
        # Writes a fixed binary blob which is not a qsql/sqlite file, and verifies that q fails
        # when it is queried using the <file>:::<table> syntax.
        # NOTE(review): the blob starts with the \x1f\x8b gzip magic bytes - presumably gzipped content
        data = six.b("\x1f\x8b\x08\x00\tZ\x0ea\x00\x03\xed\x93\xdd\n\xc20\x0cF\xf3(}\x01ij\x93\xf6y:\xd9P\x10)\xb3\xbe\xbf\x9d\x1d\xbbQ\xc6\x06F\x10rn\xbe\x9b\xd0\xfc\x1c\x9a-\x88\x83\x88\x91\xd9\xbc2\xb4\xc4#\xb5\x9c1\x8e\x1czb\x8a\xd1\x19t\xdeS\x00\xc3\xf2\xa3\x01<\xee%\x8du\x94s\x1a\xfbk\xd7\xdf\x0e\xa9\x94Kz\xaf\xabe\xc3\xb0\xf2\xce\xbc\xc7\x92\x7fB\xb6\x1fv\xfd2\xf5\x1e\x81h\xa3\xff\x10'\xff\x8c\x04\x06\xc5'\x03\xf5oO\xe2=v\xf9o\xff\x9f\xd1\xa9\xff_\x90m'\xdec\x9f\x7f\x9c\xfc\xd7T\xff\x8a\xa2(\x92<\x01WY\x0c\x06\x00\x0c\x00\x00")
        tmpfilename = self.random_tmp_filename('xx','yy')
        # The context manager closes the file even if the write fails
        with open(tmpfilename,'wb') as f:
            f.write(data)

        cmd = '%s -H "select count(*) from %s:::mytable1"' % (Q_EXECUTABLE,tmpfilename)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 95)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],six.b("Cannot detect the type of table %s:::mytable1" % tmpfilename))

    def test_error_on_providing_a_non_qsql_file_when_not_specifying_an_explicit_table(self):
        # Writes a fixed binary blob which is not a qsql/sqlite file, and verifies that q fails
        # with an input-parsing error when the file is queried without an explicit table name.
        # NOTE(review): the blob starts with the \x1f\x8b gzip magic bytes - presumably gzipped content
        data = six.b("\x1f\x8b\x08\x00\tZ\x0ea\x00\x03\xed\x93\xdd\n\xc20\x0cF\xf3(}\x01ij\x93\xf6y:\xd9P\x10)\xb3\xbe\xbf\x9d\x1d\xbbQ\xc6\x06F\x10rn\xbe\x9b\xd0\xfc\x1c\x9a-\x88\x83\x88\x91\xd9\xbc2\xb4\xc4#\xb5\x9c1\x8e\x1czb\x8a\xd1\x19t\xdeS\x00\xc3\xf2\xa3\x01<\xee%\x8du\x94s\x1a\xfbk\xd7\xdf\x0e\xa9\x94Kz\xaf\xabe\xc3\xb0\xf2\xce\xbc\xc7\x92\x7fB\xb6\x1fv\xfd2\xf5\x1e\x81h\xa3\xff\x10'\xff\x8c\x04\x06\xc5'\x03\xf5oO\xe2=v\xf9o\xff\x9f\xd1\xa9\xff_\x90m'\xdec\x9f\x7f\x9c\xfc\xd7T\xff\x8a\xa2(\x92<\x01WY\x0c\x06\x00\x0c\x00\x00")
        tmpfilename = self.random_tmp_filename('xx','yy')
        # The context manager closes the file even if the write fails
        with open(tmpfilename,'wb') as f:
            f.write(data)

        cmd = '%s -H "select count(*) from %s"' % (Q_EXECUTABLE,tmpfilename)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 59)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertTrue(e[0].startswith(six.b("Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error:")))

class OldSaveDbToDiskTests(AbstractQTestCase):

    def test_join_with_stdin_and_save(self):
        # Joins data piped through stdin with a file, saves everything into a disk database
        # using -S, and then validates both q's messages and the content of the saved database.
        x = [six.b(a) for a in map(str,range(1,101))]
        large_file_data = six.b("val\n") + six.b("\n").join(x)
        tmpfile = self.create_file_with_data(large_file_data)
        tmpfile_expected_table_name = os.path.basename(tmpfile.name)

        disk_db_filename = self.random_tmp_filename('save-to-db','sqlite')

        cmd = '(echo id ; seq 1 2 10) | ' + Q_EXECUTABLE + ' -c 1 -H -O "select stdin.*,f.* from - stdin left join %s f on (stdin.id * 10 = f.val)" -S %s' % \
            (tmpfile.name,disk_db_filename)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        self.assertEqual(e[0],six.b('Going to save data into a disk database: %s' % disk_db_filename))
        self.assertTrue(e[1].startswith(six.b('Data has been saved into %s . Saving has taken ' % disk_db_filename)))
        self.assertEqual(e[2],six.b('Query to run on the database: select stdin.*,f.* from data_stream_stdin stdin left join %s f on (stdin.id * 10 = f.val);' % \
                         tmpfile_expected_table_name))
        self.assertEqual(e[3],six.b('You can run the query directly from the command line using the following command: echo "select stdin.*,f.* from data_stream_stdin stdin left join %s f on (stdin.id * 10 = f.val)" | sqlite3 %s' %
                                    (tmpfile_expected_table_name,disk_db_filename)))

        # Extract the query that q reported, so it can be executed natively below
        P = re.compile(six.b("^Query to run on the database: (?P<query_to_run_on_db>.*)$"))
        m = P.search(e[2])
        query_to_run_on_db = m.groupdict()['query_to_run_on_db']

        self.assertTrue(os.path.exists(disk_db_filename))

        # validate disk db content natively
        c = sqlite3.connect(disk_db_filename)
        try:
            c.row_factory = sqlite_dict_factory
            t0_results = c.execute('select * from data_stream_stdin').fetchall()
            self.assertEqual(len(t0_results),5)
            self.assertEqual(sorted(list(t0_results[0].keys())), ['id'])
            self.assertEqual(list(map(lambda x:x['id'],t0_results)),[1,3,5,7,9])
            t1_results = c.execute('select * from %s' % tmpfile_expected_table_name).fetchall()
            self.assertEqual(len(t1_results),100)
            self.assertEqual(sorted(list(t1_results[0].keys())), ['val'])
            self.assertEqual("\n".join(list(map(lambda x:str(x['val']),t1_results))),"\n".join(map(str,range(1,101))))

            query_results = c.execute(query_to_run_on_db.decode('utf-8')).fetchall()

            self.assertEqual(query_results[0],{ 'id': 1 , 'val': 10})
            self.assertEqual(query_results[1],{ 'id': 3 , 'val': 30})
            self.assertEqual(query_results[2],{ 'id': 5 , 'val': 50})
            self.assertEqual(query_results[3],{ 'id': 7 , 'val': 70})
            self.assertEqual(query_results[4],{ 'id': 9 , 'val': 90})
        finally:
            # Release the saved database even when one of the assertions above fails
            c.close()

        self.cleanup(tmpfile)

    def test_join_with_qsql_file(self):
        # Joins a large csv file with a small qsql database residing in the same folder
        header = [six.b(name) for name in ('aa', 'bb', 'cc')]
        numbers1 = [[six.b(str(i))] * 3 for i in range(1, 10001)]
        numbers2 = [[six.b(str(i))] * 3 for i in range(1, 11)]

        folder_content = {
            'some_csv_file': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
            'some_qsql_database.qsql' : self.arrays_to_qsql_file_content(header,numbers2)
        }
        new_tmp_folder = self.create_folder_with_files(folder_content,prefix='xx',suffix='yy')

        effective_filename1 = '%s/some_csv_file' % new_tmp_folder
        effective_filename2 = '%s/some_qsql_database.qsql' % new_tmp_folder

        q_arguments = ' -d , -H "select sum(large_file.aa),sum(small_file.aa) from %s large_file left join %s small_file on (small_file.aa == large_file.bb)"' % \
              (effective_filename1,effective_filename2)
        retcode, o, e = run_command(Q_EXECUTABLE + q_arguments)

        self.assertEqual(retcode,0)
        self.assertEqual(len(o),1)
        self.assertEqual(len(e),0)
        # sum(1..10000) for the large file, and sum(1..10) for the rows matched in the small one
        self.assertEqual(o[0],six.b('50005000,55'))

    # TODO RLRL: check whether the commented-out test below is still needed

    # def test_creation_of_qsql_database(self):
    #     numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
    #     header = [six.b('aa'), six.b('bb'), six.b('cc')]
    #
    #     qsql_filename = self.create_qsql_file_with_content_and_return_filename(header,numbers)
    #
    #     conn = sqlite3.connect(qsql_filename)
    #     qcatalog = conn.execute('select temp_table_name,source_type,source from _qcatalog').fetchall()
    #     print(qcatalog)
    #
    #     cmd = '%s "select count(*) from %s" -A' % (Q_EXECUTABLE,qsql_filename)
    #     retcode, o, e = run_command(cmd)
    #     print(o)

    def test_join_with_qsql_file_and_save(self):
        # Creates a qsql file out of a csv file (through -C readwrite), removes the original csv,
        # and then joins another csv file with that qsql file while saving the result using -S.
        numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
        numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]

        header = [six.b('aa'), six.b('bb'), six.b('cc')]

        saved_qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')

        new_tmp_folder = self.create_folder_with_files({
            'some_csv_file': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
            'some_qsql_database' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
        },prefix='xx',suffix='yy')
        cmd = '%s -d , -H "select count(*) from %s/some_qsql_database" -C readwrite' % (Q_EXECUTABLE,new_tmp_folder)
        retcode, o, e = run_command(cmd)
        self.assertEqual(retcode,0)
        # Only the generated some_qsql_database.qsql file should remain available for the join
        os.remove('%s/some_qsql_database' % new_tmp_folder)

        effective_filename1 = '%s/some_csv_file' % new_tmp_folder
        effective_filename2 = '%s/some_qsql_database.qsql' % new_tmp_folder

        cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(small_file.aa) from %s large_file left join %s small_file on (small_file.aa == large_file.bb)" -S %s' % \
              (effective_filename1,effective_filename2,saved_qsql_with_multiple_tables)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)

        # Validate natively that both tables were stored in the saved file
        conn = sqlite3.connect(saved_qsql_with_multiple_tables)
        try:
            c1 = conn.execute('select count(*) from some_csv_file').fetchall()
            c2 = conn.execute('select count(*) from some_qsql_database').fetchall()
        finally:
            # Do not leave the saved database open once the counts have been fetched
            conn.close()

        self.assertEqual(c1[0][0],10000)
        self.assertEqual(c2[0][0],10)


    def test_saving_to_db_with_same_basename_files(self):
        # Saves two files which share the same basename (in different folders) into one database.
        # The second one is expected to be stored under a table name with a _2 suffix.
        numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
        numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]

        header = [six.b('aa'), six.b('bb'), six.b('cc')]

        qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')

        new_tmp_folder = self.create_folder_with_files({
            'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
            'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
        },prefix='xx',suffix='yy')

        effective_filename1 = '%s/filename1' % new_tmp_folder
        effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder

        expected_stored_table_name1 = 'filename1'
        expected_stored_table_name2 = 'filename1_2'

        cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
              (effective_filename1,effective_filename2,qsql_with_multiple_tables)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)
        self.assertEqual(e[0], six.b('Going to save data into a disk database: %s' % qsql_with_multiple_tables))
        self.assertTrue(e[1].startswith(six.b('Data has been saved into %s . Saving has taken' % qsql_with_multiple_tables)))
        self.assertEqual(e[2],six.b('Query to run on the database: select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb);' % \
                                    (expected_stored_table_name1,expected_stored_table_name2)))
        self.assertEqual(e[3],six.b('You can run the query directly from the command line using the following command: echo "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" | sqlite3 %s' % \
                                    (expected_stored_table_name1,expected_stored_table_name2,qsql_with_multiple_tables)))

        # Read the saved db file natively and validate the row count of both stored tables
        conn = sqlite3.connect(qsql_with_multiple_tables)
        try:
            c1 = conn.execute('select count(*) from filename1').fetchall()
            c2 = conn.execute('select count(*) from filename1_2').fetchall()
        finally:
            # Do not leave the saved database open once the counts have been fetched
            conn.close()

        self.assertEqual(c1[0][0],10000)
        self.assertEqual(c2[0][0],10)


    def test_error_when_not_specifying_table_name_in_multi_table_qsql(self):
        # Saves two same-basename files into one multi-table qsql file, and then verifies that
        # querying it without an explicit table name fails while listing the stored tables.
        header = [six.b(name) for name in ('aa', 'bb', 'cc')]
        numbers1 = [[six.b(str(i))] * 3 for i in range(1, 10001)]
        numbers2 = [[six.b(str(i))] * 3 for i in range(1, 11)]

        qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')

        folder_content = {
            'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
            'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
        }
        new_tmp_folder = self.create_folder_with_files(folder_content,prefix='xx',suffix='yy')

        effective_filename1 = '%s/filename1' % new_tmp_folder
        effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder

        expected_stored_table_name1 = 'filename1'
        expected_stored_table_name2 = 'filename1_2'

        # Preparation - create the multi-table qsql file
        save_arguments = ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
              (effective_filename1,effective_filename2,qsql_with_multiple_tables)
        retcode, o, e = run_command(Q_EXECUTABLE + save_arguments)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        # Actual tests
        retcode, o, e = run_command('%s "select count(*) from %s"' % (Q_EXECUTABLE,qsql_with_multiple_tables))

        expected_error = six.b('Could not autodetect table name in sqlite file %s . Existing tables: %s,%s' % (qsql_with_multiple_tables,expected_stored_table_name1,expected_stored_table_name2))
        self.assertEqual(retcode, 87)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)
        self.assertEqual(e[0],expected_error)

    def test_error_when_not_specifying_table_name_in_multi_table_sqlite(self):
        # Creates a plain sqlite file with two tables, and verifies that querying it without an
        # explicit table name fails with a message listing the existing tables.
        sqlite_with_multiple_tables = self.generate_tmpfile_name(suffix='.sqlite')

        c = sqlite3.connect(sqlite_with_multiple_tables)
        try:
            c.execute('create table my_table_1 (x int, y int)').fetchall()
            c.execute('create table my_table_2 (x int, y int)').fetchall()
        finally:
            # Close the file before q reads it, even if creating a table fails
            c.close()

        cmd = '%s "select count(*) from %s"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 87)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        # assertEqual already shows the actual value on failure, so no debug print is needed here
        self.assertEqual(e[0],six.b('Could not autodetect table name in sqlite file %s . Existing tables: my_table_1,my_table_2' % sqlite_with_multiple_tables))

    def test_querying_from_multi_table_sqlite_using_explicit_table_name(self):
        # Creates a plain sqlite file with two tables, and queries one of them through q
        # using the <file>:::<table> syntax, with two different output delimiters.
        sqlite_with_multiple_tables = self.generate_tmpfile_name(suffix='.sqlite')

        setup_statements = [
            'create table my_table_1 (x int, y int)',
            'insert into my_table_1 (x,y) values (100,200),(300,400)',
            'commit',
            'create table my_table_2 (x int, y int)',
        ]
        conn = sqlite3.connect(sqlite_with_multiple_tables)
        for statement in setup_statements:
            conn.execute(statement).fetchall()
        conn.close()

        # Comma delimiter, selecting all columns
        retcode, o, e = run_command('%s -d , "select * from %s:::my_table_1"' % (Q_EXECUTABLE,sqlite_with_multiple_tables))

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('100,200'))
        self.assertEqual(o[1],six.b('300,400'))

        # Check again, this time with a different output delimiter and with explicit column names
        retcode, o, e = run_command('%s -t "select x,y from %s:::my_table_1"' % (Q_EXECUTABLE,sqlite_with_multiple_tables))

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('100\t200'))
        self.assertEqual(o[1],six.b('300\t400'))


    def test_error_when_specifying_nonexistent_table_name_in_multi_table_qsql(self):
        # Saves two same-basename files into one multi-table qsql file, and then verifies that
        # asking for a table which does not exist in it fails while listing the stored tables.
        header = [six.b(name) for name in ('aa', 'bb', 'cc')]
        numbers1 = [[six.b(str(i))] * 3 for i in range(1, 10001)]
        numbers2 = [[six.b(str(i))] * 3 for i in range(1, 11)]

        qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')

        folder_content = {
            'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
            'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
        }
        new_tmp_folder = self.create_folder_with_files(folder_content,prefix='xx',suffix='yy')

        effective_filename1 = '%s/filename1' % new_tmp_folder
        effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder

        expected_stored_table_name1 = 'filename1'
        expected_stored_table_name2 = 'filename1_2'

        # Preparation - create the multi-table qsql file
        save_arguments = ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
              (effective_filename1,effective_filename2,qsql_with_multiple_tables)
        retcode, o, e = run_command(Q_EXECUTABLE + save_arguments)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        # Actual tests
        retcode, o, e = run_command('%s "select count(*) from %s:::non_existent_table"' % (Q_EXECUTABLE,qsql_with_multiple_tables))

        expected_error = six.b('Table non_existent_table could not be found in sqlite file %s . Existing table names: %s,%s' % \
                                    (qsql_with_multiple_tables,expected_stored_table_name1,expected_stored_table_name2))
        self.assertEqual(retcode, 85)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)
        self.assertEqual(e[0],expected_error)

    def test_querying_multi_table_qsql_file(self):
        # Saves two same-basename files into one multi-table qsql file, and then queries each
        # of the stored tables separately using the <file>:::<table> syntax.
        header = [six.b(name) for name in ('aa', 'bb', 'cc')]
        numbers1 = [[six.b(str(i))] * 3 for i in range(1, 10001)]
        numbers2 = [[six.b(str(i))] * 3 for i in range(1, 11)]

        qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')

        folder_content = {
            'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
            'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
        }
        new_tmp_folder = self.create_folder_with_files(folder_content,prefix='xx',suffix='yy')

        effective_filename1 = '%s/filename1' % new_tmp_folder
        effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder

        expected_stored_table_name1 = 'filename1'
        expected_stored_table_name2 = 'filename1_2'

        # Preparation - create the multi-table qsql file
        save_arguments = ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
              (effective_filename1,effective_filename2,qsql_with_multiple_tables)
        retcode, o, e = run_command(Q_EXECUTABLE + save_arguments)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        # Actual tests - each stored table should return the row count of its source file
        expected_counts = [
            (expected_stored_table_name1, six.b('10000')),
            (expected_stored_table_name2, six.b('10')),
        ]
        for table_name, expected_count in expected_counts:
            cmd = '%s "select count(*) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,table_name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o),1)
            self.assertEqual(len(e),0)
            self.assertEqual(o[0],expected_count)

    def test_preventing_db_overwrite(self):
        # Saving to a disk database twice: the first run creates the file, and the second
        # run should fail since the file already exists.
        db_filename = self.random_tmp_filename('store-to-disk', 'db')
        self.assertFalse(os.path.exists(db_filename))

        save_cmd = 'seq 1 1000 | ' + Q_EXECUTABLE + ' "select count(*) from -" -c 1 -S %s' % db_filename

        retcode, o, e = run_command(save_cmd)

        self.assertEqual(retcode, 0)
        self.assertTrue(os.path.exists(db_filename))

        # Exactly the same command again - now the target file is already there
        retcode2, o2, e2 = run_command(save_cmd)

        expected_error = six.b('Disk database file {} already exists.'.format(db_filename))
        self.assertNotEqual(retcode2, 0)
        self.assertTrue(e2[0].startswith(six.b('Going to save data into a disk database')))
        self.assertEqual(e2[1], expected_error)

        os.remove(db_filename)


class BasicTests(AbstractQTestCase):
    # End-to-end tests of basic q behaviour. Each test shells out to Q_EXECUTABLE
    # through run_command() and checks the return code together with the lines
    # written to stdout (o) and stderr (e).

    def test_basic_aggregation(self):
        # sum() and avg() over the numbers 1..10 piped through stdin
        retcode, o, e = run_command(
            'seq 1 10 | ' + Q_EXECUTABLE + ' "select sum(c1),avg(c1) from -"')
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        s = sum(range(1, 11))
        self.assertEqual(o[0], six.b('%s %s' % (s, s / 10.0)))

    def test_select_one_column(self):
        # Select only the first (auto-named) column of comma-delimited data
        tmpfile = self.create_file_with_data(sample_data_no_header)

        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0)

        self.assertEqual(six.b(" ").join(o), six.b('a b c'))

        self.cleanup(tmpfile)

    def test_column_separation(self):
        # Selecting all three columns must reproduce the input rows as-is
        tmpfile = self.create_file_with_data(sample_data_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select c1,c2,c3 from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], sample_data_rows[0])
        self.assertEqual(o[1], sample_data_rows[1])
        self.assertEqual(o[2], sample_data_rows[2])

        self.cleanup(tmpfile)

    def test_header_exception_on_numeric_header_data(self):
        # -H on data that has no real header row: the first row is then used as
        # the header and must be rejected with a "Bad header row" error
        tmpfile = self.create_file_with_data(sample_data_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select * from %s" -A -H' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 3)
        self.assertTrue(
            six.b('Bad header row: Header must contain only strings') in e[0])
        self.assertTrue(six.b("Column name must be a string") in e[1])
        self.assertTrue(six.b("Column name must be a string") in e[2])

        self.cleanup(tmpfile)

    def test_different_header_in_second_file(self):
        # Two files matched by one glob carry different headers (a,b vs c,d);
        # q is expected to fail with exit code 35 and name both files
        folder_name = self.create_folder_with_files({
            'file1': self.arrays_to_csv_file_content(six.b(','),[six.b('a'),six.b('b')],[[six.b(str(x)),six.b(str(x))] for x in range(1,6)]),
            'file2': self.arrays_to_csv_file_content(six.b(','),[six.b('c'),six.b('d')],[[six.b(str(x)),six.b(str(x))] for x in range(1,6)])
        },prefix="xx",suffix="aa")

        cmd = Q_EXECUTABLE + ' -d , "select * from %s/*" -H' % (folder_name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 35)
        self.assertEqual(len(e),1)
        self.assertEqual(e[0],six.b("Bad header row: Extra header 'c,d' in file '%s/file2' mismatches original header 'a,b' from file '%s/file1'. Table name is '%s/*'" % (folder_name,folder_name,folder_name)))

        # Same cleanup call the other folder-based tests use for their temp folders
        self.cleanup_folder(folder_name)

    def test_data_with_header(self):
        # With -H, columns are addressed by the names in the header row
        tmpfile = self.create_file_with_data(sample_data_with_header)
        cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(six.b(" ").join(o), six.b("a b c"))

        self.cleanup(tmpfile)

    def test_output_header_when_input_header_exists(self):
        # -O adds a header line to the output, in front of the three data rows
        tmpfile = self.create_file_with_data(sample_data_with_header)
        cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H -O' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 4)
        self.assertEqual(o[0],six.b('name'))
        self.assertEqual(o[1],six.b('a'))
        self.assertEqual(o[2],six.b('b'))
        self.assertEqual(o[3],six.b('c'))

        self.cleanup(tmpfile)

    def test_generated_column_name_warning_when_header_line_exists(self):
        # Using a generated name (c3) together with -H fails, and q adds a hint
        # about using the header's column names instead
        tmpfile = self.create_file_with_data(sample_data_with_header)
        cmd = Q_EXECUTABLE + ' -d , "select c3 from %s" -H' % tmpfile.name

        retcode, o, e = run_command(cmd)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 2)
        self.assertTrue(six.b('no such column: c3') in e[0])
        self.assertTrue(
            e[1].startswith(six.b('Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names')))

        self.cleanup(tmpfile)

    def test_empty_data(self):
        # An empty input file is not an error - only a warning on stderr
        tmpfile = self.create_file_with_data(six.b(''))
        cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        self.assertTrue(six.b('Warning - data is empty') in e[0])

        self.cleanup(tmpfile)

    def test_empty_data_with_header_param(self):
        # An empty input file combined with -H is an error: the header is missing
        tmpfile = self.create_file_with_data(six.b(''))
        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -H' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        m = six.b("Header line is expected but missing in file %s" % tmpfile.name)
        self.assertTrue(m in e[0])

        self.cleanup(tmpfile)

    def test_one_row_of_data_without_header_param(self):
        # Without -H, a file holding only the header row is treated as one data row
        tmpfile = self.create_file_with_data(header_row)
        cmd = Q_EXECUTABLE + ' -d , "select c2 from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], six.b('value1'))

        self.cleanup(tmpfile)

    def test_one_row_of_data_with_header_param(self):
        # With -H, a file holding only the header row has no data at all
        tmpfile = self.create_file_with_data(header_row)
        cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        self.assertTrue(six.b('Warning - data is empty') in e[0])

        self.cleanup(tmpfile)

    def test_dont_leading_keep_whitespace_in_values(self):
        # Default behaviour: leading whitespace of values is dropped
        tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 3)

        self.assertEqual(o[0], six.b('a'))
        self.assertEqual(o[1], six.b('b'))
        self.assertEqual(o[2], six.b('c'))

        self.cleanup(tmpfile)

    def test_keep_leading_whitespace_in_values(self):
        # -k keeps the leading whitespace (see the expected '   b' value)
        tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -k' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 3)

        self.assertEqual(o[0], six.b('a'))
        self.assertEqual(o[1], six.b('   b'))
        self.assertEqual(o[2], six.b('c'))

        self.cleanup(tmpfile)

    def test_no_impact_of_keeping_leading_whitespace_on_integers(self):
        # -A prints the detected table structure; even with -k the c2 and c3
        # columns must still be detected as int
        tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select c2 from %s" -k -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 7)

        self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
        self.assertEqual(o[1], six.b('  Sources:'))
        self.assertEqual(o[2], six.b('    source_type: file source: %s' % tmpfile.name))
        self.assertEqual(o[3], six.b('  Fields:'))
        self.assertEqual(o[4], six.b('    `c1` - text'))
        self.assertEqual(o[5], six.b('    `c2` - int'))
        self.assertEqual(o[6], six.b('    `c3` - int'))

        self.cleanup(tmpfile)

    def test_spaces_in_header_row(self):
        # A header name containing a space is addressed using backticks (escaped
        # here so the shell does not treat them as command substitution)
        tmpfile = self.create_file_with_data(
            header_row_with_spaces + six.b("\n") + sample_data_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select name,\\`value 1\\` from %s" -H' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 3)

        self.assertEqual(o[0], six.b('a,1'))
        self.assertEqual(o[1], six.b('b,2'))
        self.assertEqual(o[2], six.b('c,'))

        self.cleanup(tmpfile)

    def test_no_query_in_command_line(self):
        # A zero-length query string is rejected
        cmd = Q_EXECUTABLE + ' -d , ""'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        self.assertEqual(e[0],six.b('Query cannot be empty (query number 1)'))

    def test_empty_query_in_command_line(self):
        # A whitespace-only query string is rejected the same way as an empty one
        cmd = Q_EXECUTABLE + ' -d , "  "'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        self.assertEqual(e[0],six.b('Query cannot be empty (query number 1)'))

    def test_failure_in_query_stops_processing_queries(self):
        # The third query is broken: output of the first two is emitted, the
        # fourth query is never executed
        cmd = Q_EXECUTABLE + ' -d , "select 500" "select 300" "wrong-query" "select 8000"'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 2)
        self.assertEqual(o[0],six.b('500'))
        self.assertEqual(o[1],six.b('300'))

    def test_multiple_queries_in_command_line(self):
        # Several queries on one command line are executed in order
        cmd = Q_EXECUTABLE + ' -d , "select 500" "select 300+100" "select 300" "select 200"'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        self.assertEqual(o[0],six.b('500'))
        self.assertEqual(o[1],six.b('400'))
        self.assertEqual(o[2],six.b('300'))
        self.assertEqual(o[3],six.b('200'))

    def test_literal_calculation_query(self):
        # Query without any table; 40/6 is an integer division, hence 7
        cmd = Q_EXECUTABLE + ' -d , "select 1+40/6"'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 1)

        self.assertEqual(o[0],six.b('7'))

    def test_literal_calculation_query_float_result(self):
        # Same calculation with a float operand yields a float result
        cmd = Q_EXECUTABLE + ' -d , "select 1+40/6.0"'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 1)

        self.assertEqual(o[0],six.b('7.666666666666667'))

    def test_use_query_file(self):
        # -q reads the query text from a file instead of the command line
        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select name from %s" % tmp_data_file.name))

        cmd = Q_EXECUTABLE + ' -d , -q %s -H' % tmp_query_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 3)

        self.assertEqual(o[0], six.b('a'))
        self.assertEqual(o[1], six.b('b'))
        self.assertEqual(o[2], six.b('c'))

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_use_query_file_with_incorrect_query_encoding(self):
        # The query file holds non-ascii bytes, so decoding it with -Q ascii fails
        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select name,'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)

        cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q ascii' % tmp_query_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,3)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)

        self.assertTrue(e[0].startswith(six.b('Could not decode query number 1 using the provided query encoding (ascii)')))

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_output_header_with_non_ascii_names(self):
        # A non-ascii column alias must show up in the -O output header
        OUTPUT_ENCODING = 'utf-8'

        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select name,'Hr\xc3\xa1\xc4\x8d' Hr\xc3\xa1\xc4\x8d from %s" % tmp_data_file.name),encoding=None)

        cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -O -E %s' % (tmp_query_file.name,OUTPUT_ENCODING)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(o),4)
        self.assertEqual(len(e),0)

        self.assertEqual(o[0].decode(OUTPUT_ENCODING), u'name,Hr\xe1\u010d')
        self.assertEqual(o[1].decode(OUTPUT_ENCODING), u'a,Hr\xe1\u010d')
        self.assertEqual(o[2].decode(OUTPUT_ENCODING), u'b,Hr\xe1\u010d')
        self.assertEqual(o[3].decode(OUTPUT_ENCODING), u'c,Hr\xe1\u010d')

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_use_query_file_with_query_encoding(self):
        # A utf-8 encoded query file is decoded with -Q utf-8 and the literal
        # comes back intact in the -E utf-8 output
        OUTPUT_ENCODING = 'utf-8'

        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select name,'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)

        cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E %s' % (tmp_query_file.name,OUTPUT_ENCODING)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 3)

        self.assertEqual(o[0].decode(OUTPUT_ENCODING), u'a,Hr\xe1\u010d')
        self.assertEqual(o[1].decode(OUTPUT_ENCODING), u'b,Hr\xe1\u010d')
        self.assertEqual(o[2].decode(OUTPUT_ENCODING), u'c,Hr\xe1\u010d')

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_use_query_file_and_command_line(self):
        # Providing both -q and a command line query is rejected
        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select name from %s" % tmp_data_file.name))

        cmd = Q_EXECUTABLE + ' -d , -q %s -H "select * from ppp"' % tmp_query_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        self.assertTrue(e[0].startswith(six.b("Can't provide both a query file and a query on the command line")))

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_select_output_encoding(self):
        # The same literal must round-trip through each output encoding given by -E
        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select 'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)

        for target_encoding in ['utf-8','ibm852']:
            cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E %s' % (tmp_query_file.name,target_encoding)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(e), 0)
            self.assertEqual(len(o), 3)

            self.assertEqual(o[0].decode(target_encoding), u'Hr\xe1\u010d')
            self.assertEqual(o[1].decode(target_encoding), u'Hr\xe1\u010d')
            self.assertEqual(o[2].decode(target_encoding), u'Hr\xe1\u010d')

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_select_failed_output_encoding(self):
        # The non-ascii literal cannot be written using -E ascii
        tmp_data_file = self.create_file_with_data(sample_data_with_header)
        tmp_query_file = self.create_file_with_data(six.b("select 'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)

        cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E ascii' % tmp_query_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 3)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        self.assertTrue(e[0].startswith(six.b('Cannot encode data')))

        self.cleanup(tmp_data_file)
        self.cleanup(tmp_query_file)

    def test_use_query_file_with_empty_query(self):
        # A query file containing only whitespace is rejected
        tmp_query_file = self.create_file_with_data(six.b("   "))

        cmd = Q_EXECUTABLE + ' -d , -q %s -H' % tmp_query_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        self.assertTrue(e[0].startswith(six.b("Query cannot be empty")))

        self.cleanup(tmp_query_file)

    def test_use_non_existent_query_file(self):
        # -q pointing to a file that does not exist
        cmd = Q_EXECUTABLE + ' -d , -q non-existent-query-file -H'
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        self.assertTrue(e[0].startswith(six.b("Could not read query from file")))

    def test_nonexistent_file(self):
        # Querying a missing data file; the error message carries the absolute
        # path, built from the current working directory
        cmd = Q_EXECUTABLE + ' "select * from non-existent-file"'

        retcode, o, e = run_command(cmd)

        self.assertNotEqual(retcode,0)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)

        self.assertEqual(e[0],six.b("No files matching '%s/non-existent-file' have been found" % os.getcwd()))

    def test_default_column_max_length_parameter__short_enough(self):
        # A 131000 character value is expected to fit the default column length limit
        huge_text = six.b("x" * 131000)

        # Concatenate bytes directly: formatting a bytes value into a str template
        # would embed its b'...' repr into the data on Python 3
        file_data = six.b("a,b,c\n1,") + huge_text + six.b(",3\n")

        tmpfile = self.create_file_with_data(file_data)

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0],six.b('1'))

        self.cleanup(tmpfile)

    def test_default_column_max_length_parameter__too_long(self):
        # A 132000 character value is expected to exceed the default column length limit
        huge_text = six.b("x") * 132000

        # Concatenate bytes directly: formatting a bytes value into a str template
        # would embed its b'...' repr into the data on Python 3
        file_data = six.b("a,b,c\n1,") + huge_text + six.b(",3\n")

        tmpfile = self.create_file_with_data(file_data)

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 31)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        self.assertTrue(e[0].startswith(six.b("Column length is larger than the maximum")))
        self.assertTrue(six.b("Offending file is '{}'".format(tmpfile.name)) in e[0])
        self.assertTrue(six.b('Line is 2') in e[0])

        self.cleanup(tmpfile)

    def test_column_max_length_parameter(self):
        # -M sets the column length limit explicitly: 3 is too short for the
        # data, 300 is enough
        file_data = six.b("a,b,c\nvery-long-text,2,3\n")
        tmpfile = self.create_file_with_data(file_data)

        cmd = Q_EXECUTABLE + ' -H -d , -M 3 "select a from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 31)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        self.assertTrue(e[0].startswith(six.b("Column length is larger than the maximum")))
        self.assertTrue((six.b("Offending file is '%s'" % tmpfile.name)) in e[0])
        self.assertTrue(six.b('Line is 2') in e[0])

        cmd2 = Q_EXECUTABLE + ' -H -d , -M 300 -H "select a from %s"' % tmpfile.name
        retcode2, o2, e2 = run_command(cmd2)

        self.assertEqual(retcode2, 0)
        self.assertEqual(len(o2), 1)
        self.assertEqual(len(e2), 0)

        self.assertEqual(o2[0],six.b('very-long-text'))

        self.cleanup(tmpfile)

    def test_invalid_column_max_length_parameter(self):
        # A non-numeric -M value is rejected
        file_data = six.b("a,b,c\nvery-long-text,2,3\n")
        tmpfile = self.create_file_with_data(file_data)

        cmd = Q_EXECUTABLE + ' -H -d , -M xx "select a from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 31)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        self.assertEqual(e[0],six.b('Max column length limit must be an integer larger than 2 (xx)'))

        self.cleanup(tmpfile)

    def test_duplicate_column_name_detection(self):
        # The header row contains the column name 'a' twice
        file_data = six.b("a,b,a\n10,20,30\n30,40,50")
        tmpfile = self.create_file_with_data(file_data)

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 35)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 2)

        self.assertTrue(e[0].startswith(six.b('Bad header row:')))
        self.assertEqual(e[1],six.b("'a': Column name is duplicated"))

        self.cleanup(tmpfile)

    def test_join_with_stdin(self):
        # stdin ('-', aliased as "stdin") is left-joined with a file holding the
        # values 1..100
        x = [six.b(a) for a in map(str,range(1,101))]
        large_file_data = six.b("val\n") + six.b("\n").join(x)
        tmpfile = self.create_file_with_data(large_file_data)

        cmd = '(echo id ; seq 1 2 10) | %s -c 1 -H -O "select stdin.*,f.* from - stdin left join %s f on (stdin.id * 10 = f.val)"' % (Q_EXECUTABLE,tmpfile.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 6)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0],six.b('id val'))
        self.assertEqual(o[1],six.b('1 10'))
        self.assertEqual(o[2],six.b('3 30'))
        self.assertEqual(o[3],six.b('5 50'))
        self.assertEqual(o[4],six.b('7 70'))
        self.assertEqual(o[5],six.b('9 90'))

        self.cleanup(tmpfile)

    def test_concatenated_files(self):
        # UNION ALL over two files sharing the same header, with caching
        # disabled (-C none)
        file_data1 = six.b("a,b,c\n10,11,12\n20,21,22")
        tmpfile1 = self.create_file_with_data(file_data1)

        file_data2 = six.b("a,b,c\n30,31,32\n40,41,42")
        tmpfile2 = self.create_file_with_data(file_data2)

        cmd = Q_EXECUTABLE + ' -O -H -d , "select * from %s UNION ALL select * from %s" -C none' % (tmpfile1.name,tmpfile2.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 5)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('a,b,c'))
        self.assertEqual(o[1],six.b('10,11,12'))
        self.assertEqual(o[2],six.b('20,21,22'))
        self.assertEqual(o[3],six.b('30,31,32'))
        self.assertEqual(o[4],six.b('40,41,42'))

        self.cleanup(tmpfile1)
        self.cleanup(tmpfile2)

    def test_out_of_range_expected_column_count(self):
        # A negative -c value is rejected before any table is accessed
        cmd = '%s "select count(*) from some_table" -c -1' % Q_EXECUTABLE
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 90)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0], six.b('Column count must be between 1 and 131072'))

    def test_out_of_range_expected_column_count__with_explicit_limit(self):
        # With -M 100, the reported upper bound for -c becomes 100
        cmd = '%s "select count(*) from some_table" -c -1 -M 100' % Q_EXECUTABLE
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 90)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0], six.b('Column count must be between 1 and 100'))

    def test_other_out_of_range_expected_column_count__with_explicit_limit(self):
        # -c above the explicit -M limit is rejected as well
        cmd = '%s "select count(*) from some_table" -c 101 -M 100' % Q_EXECUTABLE
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 90)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0], six.b('Column count must be between 1 and 100'))

    def test_explicit_limit_of_columns__data_is_ok(self):
        # -c 1 with -M 3 and three-character values: the data fits the limits
        file_data1 = six.b("191\n192\n")
        tmpfile1 = self.create_file_with_data(file_data1)

        cmd = '%s "select count(*) from %s" -c 1 -M 3' % (Q_EXECUTABLE,tmpfile1.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('2'))

        self.cleanup(tmpfile1)

class ManyOpenFilesTests(AbstractQTestCase):


    def test_multi_file_header_skipping(self):
        # Every generated file starts with its own header line ('a'). When the
        # files are loaded through one glob with -H, the count and sum must
        # cover the numeric rows only.
        rows_per_file = 50
        file_count = 5
        total_rows = rows_per_file * file_count

        all_values = list(range(1, total_rows + 1))
        chunks = batch([str(v) for v in all_values], n=rows_per_file)

        # file name -> file content, in creation order
        files = collections.OrderedDict()
        for idx, chunk in zip(range(file_count), chunks):
            files['file-%s' % idx] = six.b('a\n' + "\n".join(chunk) + '\n')

        folder = self.create_folder_with_files(files, 'split-files', 'multi-header')

        query = 'select count(a),sum(a) from %s/*' % folder
        cmd = '%s -d , -H -c 1 "%s" -C none' % (Q_EXECUTABLE, query)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        expected_line = six.b("%s,%s" % (total_rows, sum(all_values)))
        self.assertEqual(o[0], expected_line)

        self.cleanup_folder(folder)

    def test_that_globs_dont_max_out_sqlite_attached_database_limits(self):
        # One glob table spanning 40 files is queried while only 10 attached
        # sqlite databases are allowed; the count must still cover every row.
        rows_per_file = 50
        file_count = 40
        total_rows = rows_per_file * file_count

        chunks = batch([str(v) for v in range(1, total_rows + 1)], n=rows_per_file)
        names = ['file-%s' % idx for idx in range(file_count)]
        bodies = [six.b("\n".join(chunk) + '\n') for chunk in chunks]
        files = collections.OrderedDict(zip(names, bodies))

        folder = self.create_folder_with_files(files, 'split-files', 'attach-limit')

        # Run from inside the folder, so a bare '*' matches all of the files
        cmd = 'cd %s && %s -c 1 "select count(*) from *" -C none --max-attached-sqlite-databases=10' % (folder, Q_EXECUTABLE)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], six.b(str(total_rows)))

        self.cleanup_folder(folder)

    def test_maxing_out_max_attached_database_limits__regular_files(self):
        # 40 files are referenced explicitly (one sub-select per file) while only
        # 10 attached sqlite databases are allowed and caching is off (-C none).
        rows_per_file = 50
        file_count = 40
        total_rows = rows_per_file * file_count

        chunks = batch([str(v) for v in range(1, total_rows + 1)], n=rows_per_file)
        names = ['file-%s' % idx for idx in range(file_count)]
        files = collections.OrderedDict(
            zip(names, (six.b("\n".join(chunk) + '\n') for chunk in chunks)))

        folder = self.create_folder_with_files(files, 'split-files', 'attach-limit')

        # One "select *" per file, glued together into a single sub-query
        selects = []
        for name in names:
            selects.append("select * from %s/%s" % (folder, name))
        union_sql = " UNION ALL ".join(selects)

        cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C none --max-attached-sqlite-databases=10' % (folder, Q_EXECUTABLE, union_sql)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], six.b(str(total_rows)))

        self.cleanup_folder(folder)

    def test_maxing_out_max_attached_database_limits__with_qsql_files_below_attached_limit(self):
        # The number of files stays one below the attached-database limit, and
        # the same UNION ALL query is executed twice with -C readwrite.
        attach_limit = 10
        rows_per_file = 50
        file_count = attach_limit - 1
        total_rows = rows_per_file * file_count

        chunks = batch([str(v) for v in range(1, total_rows + 1)], n=rows_per_file)
        names = ['file-%s' % idx for idx in range(file_count)]
        bodies = (six.b("\n".join(chunk) + '\n') for chunk in chunks)
        files = collections.OrderedDict(zip(names, bodies))

        folder = self.create_folder_with_files(files, 'split-files', 'attach-limit')

        union_sql = " UNION ALL ".join("select * from %s/%s" % (folder, name) for name in names)
        readwrite_cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (folder, Q_EXECUTABLE, union_sql)

        commands = [
            # First pass - readwrite caching with the explicit limit, so all qsql files will be created
            readwrite_cmd + ' --max-attached-sqlite-databases=%s' % attach_limit,
            # Second pass - same query, so all files will be read directly from the qsql files
            readwrite_cmd,
        ]

        for cmd in commands:
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 1)
            self.assertEqual(len(e), 0)
            self.assertEqual(o[0], six.b(str(total_rows)))

        self.cleanup_folder(folder)

    def test_maxing_out_max_attached_database_limits__with_qsql_files_above_attached_limit(self):
        # Unlike the "below_attached_limit" variant, twice as many files as the
        # attached-database limit are queried here. Expectation is that only a
        # part of the files ends up being cached as qsql files.
        from glob import glob

        attach_limit = 10
        rows_per_file = 50
        file_count = attach_limit * 2
        total_rows = rows_per_file * file_count

        chunks = batch([str(v) for v in range(1, total_rows + 1)], n=rows_per_file)
        names = ['file-%s' % idx for idx in range(file_count)]
        bodies = (six.b("\n".join(chunk) + '\n') for chunk in chunks)
        files = collections.OrderedDict(zip(names, bodies))

        folder = self.create_folder_with_files(files, 'split-files', 'attach-limit')

        union_sql = " UNION ALL ".join("select * from %s/%s" % (folder, name) for name in names)
        readwrite_cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (folder, Q_EXECUTABLE, union_sql)

        commands = [
            # First pass - readwrite caching with the explicit limit, creating the qsql files
            readwrite_cmd + ' --max-attached-sqlite-databases=%s' % attach_limit,
            # Second pass - same query again, now with qsql files present in the folder
            readwrite_cmd,
        ]

        for cmd in commands:
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 1)
            self.assertEqual(len(e), 0)
            self.assertEqual(o[0], six.b(str(total_rows)))

        # The folder must hold all original files, plus qsql files only for
        # the first (limit - 2) of them
        actual_names = [os.path.basename(path) for path in glob('%s/*' % (folder))]
        expected_names = names + ['file-%s.qsql' % idx for idx in range(attach_limit - 2)]

        self.assertEqual(sorted(actual_names), sorted(expected_names))

        self.cleanup_folder(folder)

    def test_maxing_out_max_attached_database_limits__with_directly_using_qsql_files(self):
        # A qsql file is prepared for each of the 20 files separately, and then
        # one big query references all of the .qsql files directly.
        attach_limit = 10
        rows_per_file = 50
        file_count = attach_limit * 2
        total_rows = rows_per_file * file_count

        chunks = batch([str(v) for v in range(1, total_rows + 1)], n=rows_per_file)
        names = ['file-%s' % idx for idx in range(file_count)]
        bodies = [six.b("\n".join(chunk) + '\n') for chunk in chunks]
        files = collections.OrderedDict(zip(names, bodies))

        folder = self.create_folder_with_files(files, 'split-files', 'attach-limit')

        # Prepare qsql for each of the files (one q execution per file, for simplicity)
        for name in names:
            prepare_cmd = 'cd %s && %s -c 1 "select count(*) from %s" -C readwrite' % (folder, Q_EXECUTABLE, name)
            retcode, o, e = run_command(prepare_cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 1)
            self.assertEqual(len(e), 0)

        # Now the big query, which uses the created qsql files
        selects = ["select * from %s/%s.qsql" % (folder, name) for name in names]
        union_sql = " UNION ALL ".join(selects)

        cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (folder, Q_EXECUTABLE, union_sql)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b(str(total_rows)))

        self.cleanup_folder(folder)

    def test_too_many_open_files_for_one_table(self):
        """Verify that q rejects a table that expands to more source files than allowed.

        A folder holding one file more than MAX_ALLOWED_FILES is queried through the ``*``
        glob. q is expected to exit with return code 82, write nothing to stdout and emit a
        single error line naming the limit, the table and the actual number of files.
        """
        # NOTE(review): presumably mirrors the per-table source file limit inside q - confirm
        MAX_ALLOWED_FILES = 500

        BATCH_SIZE = 2
        FILE_COUNT = MAX_ALLOWED_FILES + 1

        numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)

        content_list = map(six.b,["\n".join(x) for x in numbers_as_text])

        filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
        d = collections.OrderedDict(zip(filename_list, content_list))

        tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')

        cmd = 'cd %s && %s -c 1 "select count(*) from * where 1 = 1 or c1 != 2" -C none' % (tmpfolder,Q_EXECUTABLE)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 82)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        # The message text (including its wording) has to match q's output byte for byte.
        # The table name is reported with the resolved (real) path of the folder.
        expected_error = six.b('Maximum source files for table must be %s. Table is name is %s/* Number of actual files is %s' % (MAX_ALLOWED_FILES,os.path.realpath(tmpfolder),FILE_COUNT))
        self.assertEqual(e[0],expected_error)

        self.cleanup_folder(tmpfolder)

    def test_many_open_files_for_one_table(self):
        """Count the rows of a single table made of 500 two-line files read through ``*``.

        Original author's note: file opening used to be parallel, which caused
        too-many-open-files failures.
        """
        rows_per_file = 2
        file_count = 500
        total_rows = rows_per_file * file_count

        row_values = [str(n) for n in range(1, 1 + total_rows)]
        file_contents = [six.b("\n".join(chunk)) for chunk in batch(row_values, n=rows_per_file)]

        filename_list = ['file-%s' % index for index in range(file_count)]
        files = collections.OrderedDict(zip(filename_list, file_contents))

        tmpfolder = self.create_folder_with_files(files, 'split-files', 'attach-limit')

        # The where clause is always true, so every row of every file is counted
        count_cmd = 'cd %s && %s -c 1 "select count(*) from * where 1 = 1 or c1 != 2" -C none' % (tmpfolder, Q_EXECUTABLE)
        retcode, o, e = run_command(count_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], six.b(str(total_rows)))

        self.cleanup_folder(tmpfolder)

    def test_many_open_files_for_two_tables(self):
        """Left-join two tables that each consist of 500 two-line files.

        Both folders are populated with the same content, every value is unique, so the join
        on c1 is expected to return exactly one row per input row.
        """
        BATCH_SIZE = 2
        FILE_COUNT = 500

        numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)

        content_list = map(six.b, ["\n".join(x) for x in numbers_as_text])

        filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
        # Materialized once into a dict, so the same content can populate both folders
        d = collections.OrderedDict(zip(filename_list, content_list))

        # Distinct prefixes so the two folders can be told apart when debugging a failure
        tmpfolder1 = self.create_folder_with_files(d, 'split-files1', 'blah')
        tmpfolder2 = self.create_folder_with_files(d, 'split-files2', 'blah')

        cmd = '%s -c 1 "select count(*) from %s/* a left join %s/* b on (a.c1 = b.c1)" -C none' % (
            Q_EXECUTABLE,
            tmpfolder1,
            tmpfolder2)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], six.b(str(BATCH_SIZE * FILE_COUNT)))

        self.cleanup_folder(tmpfolder1)
        self.cleanup_folder(tmpfolder2)


class GzippingTests(AbstractQTestCase):
    """Tests for reading gzip-compressed input files with the -z flag."""

    def test_gzipped_file(self):
        """Run sum/avg over a gzipped file and compare against the values for 1..10."""
        # Raw gzip stream; the expectations below treat it as the numbers 1..10, one per line
        tmpfile = self.create_file_with_data(
            six.b('\x1f\x8b\x08\x08\xf2\x18\x12S\x00\x03xxxxxx\x003\xe42\xe22\xe62\xe12\xe52\xe32\xe7\xb2\xe0\xb2\xe424\xe0\x02\x00\xeb\xbf\x8a\x13\x15\x00\x00\x00'))

        cmd = Q_EXECUTABLE + ' -z "select sum(c1),avg(c1) from %s"' % tmpfile.name

        retcode, o, e = run_command(cmd)
        # assertEqual (rather than assertTrue(a == b)) reports the actual values on failure
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)

        s = sum(range(1, 11))
        self.assertEqual(o[0], six.b('%s %s' % (s, s / 10.0)))

        self.cleanup(tmpfile)


class DelimiterTests(AbstractQTestCase):
    """Tests for the input (-d/-t/-p) and output (-D/-T/-P) delimiter flags."""

    def _check_sample_rows(self, flags, input_delimiter, output_delimiter, expected_warning=None):
        """Run ``select c1,c2,c3`` over the sample data and verify the three output rows.

        flags            -- q command line flags placed before the query.
        input_delimiter  -- delimiter written to the temp file in place of the comma.
        output_delimiter -- delimiter expected between the fields of every output row.
        expected_warning -- the single expected stderr line, or None if stderr must be empty.
        """
        file_data = sample_data_no_header
        if input_delimiter != ",":
            file_data = file_data.replace(six.b(","), six.b(input_delimiter))
        tmpfile = self.create_file_with_data(file_data)

        cmd = Q_EXECUTABLE + ' %s "select c1,c2,c3 from %s"' % (flags, tmpfile.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0 if expected_warning is None else 1)

        for row_index in range(3):
            expected_row = sample_data_rows[row_index].replace(six.b(","), six.b(output_delimiter))
            self.assertEqual(o[row_index], expected_row)

        if expected_warning is not None:
            self.assertEqual(e[0], six.b(expected_warning))

        self.cleanup(tmpfile)

    def test_delimition_mistake_with_header(self):
        """Reading comma separated data with a space delimiter and -H must fail on the header."""
        data_file = self.create_file_with_data(sample_data_no_header)

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " "select * from %s" -H' % data_file.name)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 2)

        first_error, second_error = e[0], e[1]
        self.assertTrue(first_error.startswith(six.b("Bad header row")))
        self.assertTrue(six.b("Column name cannot contain commas") in second_error)

        self.cleanup(data_file)

    def test_tab_delimition_parameter(self):
        """-t reads tab delimited input; the output rows are tab delimited as well."""
        self._check_sample_rows('-t', "\t", "\t")

    def test_pipe_delimition_parameter(self):
        """-p reads pipe delimited input; the output rows are pipe delimited as well."""
        self._check_sample_rows('-p', "|", "|")

    def test_tab_delimition_parameter__with_manual_override_attempt(self):
        """-t wins over an explicit -d and a warning is written to stderr."""
        self._check_sample_rows(
            '-t -d ,', "\t", "\t",
            expected_warning='Warning: -t parameter overrides -d parameter (,)')

    def test_pipe_delimition_parameter__with_manual_override_attempt(self):
        """-p wins over an explicit -d and a warning is written to stderr."""
        self._check_sample_rows(
            '-p -d ,', "|", "|",
            expected_warning='Warning: -p parameter overrides -d parameter (,)')

    def test_output_delimiter(self):
        """-D sets the output delimiter independently of the input delimiter."""
        self._check_sample_rows('-d , -D "|"', ",", "|")

    def test_output_delimiter_tab_parameter(self):
        """-T produces tab delimited output."""
        self._check_sample_rows('-d , -T', ",", "\t")

    def test_output_delimiter_pipe_parameter(self):
        """-P produces pipe delimited output."""
        self._check_sample_rows('-d , -P', ",", "|")

    def test_output_delimiter_tab_parameter__with_manual_override_attempt(self):
        """-T wins over an explicit -D and a warning is written to stderr."""
        self._check_sample_rows(
            '-d , -T -D "|"', ",", "\t",
            expected_warning='Warning: -T parameter overrides -D parameter (|)')

    def test_output_delimiter_pipe_parameter__with_manual_override_attempt(self):
        """-P wins over an explicit -D and a warning is written to stderr."""
        self._check_sample_rows(
            '-d , -P -D ":"', ",", "|",
            expected_warning='Warning: -P parameter overrides -D parameter (:)')


class AnalysisTests(AbstractQTestCase):
    """Tests for the -A flag, which prints the detected table structure."""

    def _assert_analysis_output(self, o, table_name, source_description, fields):
        """Check the leading lines of an -A report, one assertion per line.

        o                  -- stdout lines (bytes) returned by run_command.
        table_name         -- name expected on the ``Table:`` line.
        source_description -- text expected after ``source_type: `` on the source line.
        fields             -- list of (column name, detected type) pairs, in output order.
        """
        expected_lines = [
            'Table: %s' % table_name,
            '  Sources:',
            '    source_type: %s' % source_description,
            '  Fields:',
        ]
        expected_lines += ['    `%s` - %s' % (field_name, field_type) for field_name, field_type in fields]

        for line_index, expected_line in enumerate(expected_lines):
            self.assertEqual(o[line_index], six.b(expected_line))

    def test_analyze_result(self):
        """With ``-c 1`` only a single (text) column is reported for a file source."""
        d = "\n".join(['%s\t%s\t%s' % (x+1,x+1,x+1) for x in range(100)])
        tmpfile = self.create_file_with_data(six.b(d))

        cmd = Q_EXECUTABLE + ' -c 1 "select count(*) from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 5)
        self.assertEqual(len(e), 0)

        self._assert_analysis_output(o, tmpfile.name, 'file source: %s' % tmpfile.name, [('c1', 'text')])

        self.cleanup(tmpfile)

    def test_analyze_result_with_data_stream(self):
        """Same analysis as above, but the data is piped through stdin (table ``-``)."""
        d = "\n".join(['%s\t%s\t%s' % (x+1,x+1,x+1) for x in range(100)])
        tmpfile = self.create_file_with_data(six.b(d))

        cmd = 'cat %s | %s  -c 1 "select count(*) from -" -A' % (tmpfile.name,Q_EXECUTABLE)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 5)
        self.assertEqual(len(e), 0)

        self._assert_analysis_output(o, '-', 'data-stream source: stdin', [('c1', 'text')])

        self.cleanup(tmpfile)

    def test_column_analysis(self):
        """Column types of the headerless sample data are detected as text/int/int."""
        tmpfile = self.create_file_with_data(sample_data_no_header)

        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('c1', 'text'), ('c2', 'int'), ('c3', 'int')])

        self.cleanup(tmpfile)

    def test_column_analysis_with_mixed_ints_and_floats(self):
        """A column mixing integers and floats is reported as real."""
        tmpfile = self.create_file_with_data(six.b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n3000,Jupiter,142984,9.9"""))

        cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),8)
        self.assertEqual(len(e),0)
        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('planet_id', 'int'), ('name', 'text'), ('diameter_km', 'int'), ('length_of_day_hours', 'real')])

        self.cleanup(tmpfile)

    def test_column_analysis_with_mixed_ints_and_floats_and_nulls(self):
        """Empty values in numeric columns do not change the detected int/real types."""
        tmpfile = self.create_file_with_data(six.b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n2500,Venus,,\n3000,Jupiter,142984,9.9"""))

        cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),8)
        self.assertEqual(len(e),0)
        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('planet_id', 'int'), ('name', 'text'), ('diameter_km', 'int'), ('length_of_day_hours', 'real')])

        self.cleanup(tmpfile)

    def test_column_analysis_no_header(self):
        """Headerless sample data gets generated column names c1..c3 with detected types."""
        tmpfile = self.create_file_with_data(sample_data_no_header)

        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('c1', 'text'), ('c2', 'int'), ('c3', 'int')])

        # The temp file was previously left behind by this test
        self.cleanup(tmpfile)

    def test_column_analysis_with_unexpected_header(self):
        """A header line without -H turns every column into text and triggers a warning."""
        tmpfile = self.create_file_with_data(sample_data_with_header)
        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 7)
        self.assertEqual(len(e), 1)

        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('c1', 'text'), ('c2', 'text'), ('c3', 'text')])

        self.assertEqual(
            e[0], six.b('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data'))

        self.cleanup(tmpfile)

    def test_column_analysis_for_spaces_in_header_row(self):
        """Header names containing spaces are reported verbatim inside backticks."""
        tmpfile = self.create_file_with_data(
            header_row_with_spaces + six.b("\n") + sample_data_no_header)
        cmd = Q_EXECUTABLE + ' -d , "select name,\\`value 1\\` from %s" -H -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 7)

        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('name', 'text'), ('value 1', 'int'), ('value2', 'int')])

        self.cleanup(tmpfile)

    def test_column_analysis_with_header(self):
        """With -H the real column names are reported; selecting c1 is then a query error."""
        tmpfile = self.create_file_with_data(sample_data_with_header)
        cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A -H' % tmpfile.name
        retcode, o, e = run_command(cmd)

        # The analysis is still printed even though the query itself fails
        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(o),7)
        self.assertEqual(len(e),2)
        self._assert_analysis_output(
            o, tmpfile.name, 'file source: %s' % tmpfile.name,
            [('name', 'text'), ('value1', 'int'), ('value2', 'int')])

        self.assertEqual(e[0],six.b('query error: no such column: c1'))
        self.assertTrue(e[1].startswith(six.b('Warning - There seems to be a ')))

        self.cleanup(tmpfile)


class StdInTests(AbstractQTestCase):
    """Tests for reading the table data from standard input (table name ``-``)."""

    def test_stdin_input(self):
        """Pipe the headerless sample data into q and expect the same three rows back."""
        # The command is built as bytes because the sample data itself is bytes
        cmd = six.b('printf "%s" | ' + Q_EXECUTABLE + ' -d , "select c1,c2,c3 from -"') % sample_data_no_header
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0], sample_data_rows[0])
        self.assertEqual(o[1], sample_data_rows[1])
        self.assertEqual(o[2], sample_data_rows[2])

    def test_attempt_to_unzip_stdin(self):
        """-z combined with stdin input must fail with an explanatory error message."""
        tmpfile = self.create_file_with_data(
            six.b('\x1f\x8b\x08\x08\xf2\x18\x12S\x00\x03xxxxxx\x003\xe42\xe22\xe62\xe12\xe52\xe32\xe7\xb2\xe0\xb2\xe424\xe0\x02\x00\xeb\xbf\x8a\x13\x15\x00\x00\x00'))

        cmd = 'cat %s | ' % tmpfile.name + Q_EXECUTABLE + ' -z "select sum(c1),avg(c1) from -"'

        retcode, o, e = run_command(cmd)
        # assertNotEqual/assertEqual report the actual values when the check fails
        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)

        self.assertEqual(e[0],six.b('Cannot decompress standard input. Pipe the input through zcat in order to decompress.'))

        self.cleanup(tmpfile)

class QuotingTests(AbstractQTestCase):
    def test_non_quoted_values_in_quoted_data(self):
        """Select the first column of the quoted sample data and compare all four rows."""
        tmp_data_file = self.create_file_with_data(sample_quoted_data)

        cmd = Q_EXECUTABLE + ' -d " " "select c1 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),4)

        # assertTrue(value, text) uses the text as the failure message and only checks that
        # the value is truthy, so the rows are compared explicitly (output lines are bytes).
        self.assertEqual(o[0],six.b('non_quoted'))
        self.assertEqual(o[1],six.b('control-value-1'))
        self.assertEqual(o[2],six.b('non-quoted-value'))
        self.assertEqual(o[3],six.b('control-value-1'))

        self.cleanup(tmp_data_file)

    def test_regular_quoted_values_in_quoted_data(self):
        """Select the second column (a regular double-quoted value) of the quoted sample data."""
        tmp_data_file = self.create_file_with_data(sample_quoted_data)

        cmd = Q_EXECUTABLE + ' -d " " "select c2 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),4)

        # assertTrue(value, text) uses the text as the failure message and only checks that
        # the value is truthy, so the rows are compared explicitly (output lines are bytes).
        self.assertEqual(o[0],six.b('regular_double_quoted'))
        self.assertEqual(o[1],six.b('control-value-2'))
        # NOTE(review): the value contains the delimiter (space), so q may wrap it in double
        # quotes on output - only the payload is checked until the exact form is confirmed.
        self.assertIn(six.b('this is a quoted value'),o[2])
        self.assertEqual(o[3],six.b('control-value-2'))

        self.cleanup(tmp_data_file)

    def test_double_double_quoted_values_in_quoted_data(self):
        """Select the third column (a value with doubled double-quotes) of the quoted sample data."""
        tmp_data_file = self.create_file_with_data(sample_quoted_data)

        cmd = Q_EXECUTABLE + ' -d " " "select c3 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),4)

        # assertTrue(value, text) uses the text as the failure message and only checks that
        # the value is truthy, so the rows are compared explicitly (output lines are bytes).
        self.assertEqual(o[0],six.b('double_double_quoted'))
        self.assertEqual(o[1],six.b('control-value-3'))
        # NOTE(review): the value contains spaces and double quotes, so its exact output
        # quoting/escaping is not asserted here - only a fragment of the payload is checked.
        self.assertIn(six.b('double double'),o[2])
        self.assertEqual(o[3],six.b('control-value-3'))

        self.cleanup(tmp_data_file)

    def test_escaped_double_quoted_values_in_quoted_data(self):
        """Select the fourth column (a value with escaped double-quotes) of the quoted sample data."""
        tmp_data_file = self.create_file_with_data(sample_quoted_data)

        cmd = Q_EXECUTABLE + ' -d " " "select c4 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),4)

        # assertTrue(value, text) uses the text as the failure message and only checks that
        # the value is truthy, so the rows are compared explicitly (output lines are bytes).
        self.assertEqual(o[0],six.b('escaped_double_quoted'))
        self.assertEqual(o[1],six.b('control-value-4'))
        # NOTE(review): the value contains spaces and escaped quotes, so its exact output
        # quoting/escaping is not asserted here - only the start of the payload is checked.
        self.assertIn(six.b('this is an escaped'),o[2])
        self.assertEqual(o[3],six.b('control-value-4'))

        self.cleanup(tmp_data_file)

    def test_none_input_quoting_mode_in_relaxed_mode(self):
        """With -w none in relaxed mode the double quotes stay part of the data itself."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('"quoted,data",23'), six.b('unquoted-data,54,')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -m relaxed -D , -w none -W none "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_none_input_quoting_mode_in_strict_mode(self):
        """With -w none in strict mode the run fails with a column count error."""
        data_file = self.create_file_with_data(sample_quoted_data2)

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -m strict -D , -w none "select * from %s"' % data_file.name)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        error_line = e[0]
        self.assertTrue(error_line.startswith(six.b('Strict mode. Column Count is expected to identical')))

        self.cleanup(data_file)

    def test_minimal_input_quoting_mode(self):
        """-w minimal strips the input quotes, so the comma delimited output has no quotes."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('quoted data,23'), six.b('unquoted-data,54')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w minimal "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_all_input_quoting_mode(self):
        """-w all gives the same unquoted, comma delimited output for the sample data."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('quoted data,23'), six.b('unquoted-data,54')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w all "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_incorrect_input_quoting_mode(self):
        """An unknown -w value is rejected with an error that names the offending value."""
        data_file = self.create_file_with_data(sample_quoted_data2)

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w unknown_wrapping_mode "select * from %s"' % data_file.name)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(len(o), 0)

        error_line = e[0]
        self.assertTrue(error_line.startswith(six.b('Input quoting mode can only be one of all,minimal,none')))
        self.assertTrue(six.b('unknown_wrapping_mode') in error_line)

        self.cleanup(data_file)

    def test_none_output_quoting_mode(self):
        """-W none writes every output value without quotes."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('quoted data,23'), six.b('unquoted-data,54')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w all -W none "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_minimal_output_quoting_mode__without_need_to_quote_in_output(self):
        """-W minimal leaves values unquoted when they do not contain the output delimiter."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('quoted data,23'), six.b('unquoted-data,54')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w all -W minimal "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_minimal_output_quoting_mode__with_need_to_quote_in_output_due_to_delimiter(self):
        """-W minimal quotes a value that contains the output delimiter."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('"quoted data" 23'), six.b('unquoted-data 54')]

        # The output delimiter is a space, which also appears inside the first value
        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D " " -w all -W minimal "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_minimal_output_quoting_mode__with_need_to_quote_in_output_due_to_newline(self):
        """-W minimal quotes a value that contains a newline, and only that value."""
        data_file = self.create_file_with_data(sample_quoted_data2_with_newline)

        # A colon delimiter never appears inside the values, so any quoting seen in the
        # output can only be caused by the embedded newline
        query_cmd = Q_EXECUTABLE + " -d ':' -w all -W minimal \"select c1,c2,replace(c1,'with' || x'0a' || 'a new line inside it','NEWLINE-REMOVED') from %s\"" % data_file.name
        retcode, o, e = run_command(query_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 3)

        expected_lines = [
            six.b('"quoted data with'),
            # The third column is not quoted because its newline was replaced by other text
            six.b('a new line inside it":23:quoted data NEWLINE-REMOVED'),
            six.b('unquoted-data:54:unquoted-data'),
        ]
        for position, expected_line in enumerate(expected_lines):
            self.assertEqual(o[position], expected_line)

        self.cleanup(data_file)

    def test_nonnumeric_output_quoting_mode(self):
        """-W nonnumeric quotes the text values and leaves the numeric ones bare."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('"quoted data",23'), six.b('"unquoted-data",54')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w all -W nonnumeric "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def test_all_output_quoting_mode(self):
        """-W all quotes every output value, numeric ones included."""
        data_file = self.create_file_with_data(sample_quoted_data2)
        expected_rows = [six.b('"quoted data","23"'), six.b('"unquoted-data","54"')]

        retcode, o, e = run_command(
            Q_EXECUTABLE + ' -d " " -D , -w all -W all "select * from %s"' % data_file.name)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        for position, expected_row in enumerate(expected_rows):
            self.assertEqual(o[position], expected_row)

        self.cleanup(data_file)

    def _internal_test_consistency_of_chaining_output_to_input(self,input_data,input_wrapping_mode,output_wrapping_mode):
        """Pipe the data through three chained q runs and expect it to come out unchanged.

        input_data           -- bytes written to the temp file; two lines are expected back.
        input_wrapping_mode  -- value passed to -w on every q invocation.
        output_wrapping_mode -- value passed to -W on every q invocation.
        """
        data_file = self.create_file_with_data(input_data)

        q_stage = Q_EXECUTABLE + ' -w %s -W %s "select * from -"' % (input_wrapping_mode, output_wrapping_mode)
        # cat feeds the first stage, every following stage reads the previous one's output
        pipeline = ' | '.join(['cat %s' % data_file.name] + [q_stage] * 3)

        retcode, o, e = run_command(pipeline)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 2)

        round_tripped = six.b("\n").join(o)
        self.assertEqual(round_tripped, input_data)

        self.cleanup(data_file)

    def test_consistency_of_chaining_minimal_wrapping_to_minimal_wrapping(self):
        """Minimally quoted data survives chained runs using minimal input and output wrapping."""
        minimally_quoted = six.b('"quoted data" 23\nunquoted-data 54')
        self._internal_test_consistency_of_chaining_output_to_input(
            minimally_quoted, 'minimal', 'minimal')

    def test_consistency_of_chaining_all_wrapping_to_all_wrapping(self):
        """Fully quoted data survives chained runs using 'all' input and output wrapping."""
        fully_quoted = six.b('"quoted data" "23"\n"unquoted-data" "54"')
        self._internal_test_consistency_of_chaining_output_to_input(
            fully_quoted, 'all', 'all')

    def test_input_field_quoting_and_data_types_with_encoding(self):
        """Quoted input with non-ASCII characters: verify both the content and the column types.

        The same file is first queried with ``-E utf-8`` to check the decoded row, and then
        analyzed with ``-A`` to check the detected type of each of the six columns.
        """
        output_encoding = 'utf-8'

        raw_row = six.b('111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n')
        data_file = self.create_file_with_data(raw_row)

        # Step 1 - content check
        select_cmd = Q_EXECUTABLE + ' -d , "select * from %s" -E %s' % (data_file.name, output_encoding)
        retcode, o, e = run_command(select_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 1)

        decoded_row = o[0].decode('utf-8')
        self.assertEqual(decoded_row, u'111,22.22,testing text with special characters - citt\xe0 ,http://somekindofurl.com,12.13.14.15,12.1')

        # Step 2 - data type check
        analyze_cmd = Q_EXECUTABLE + ' -d , "select * from %s" -A' % data_file.name
        retcode, o, e = run_command(analyze_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 10)

        expected_lines = [
            'Table: %s' % data_file.name,
            '  Sources:',
            '    source_type: file source: %s' % data_file.name,
            '  Fields:',
            '    `c1` - int',
            '    `c2` - real',
            '    `c3` - text',
            '    `c4` - text',
            '    `c5` - text',
            '    `c6` - real',
        ]
        for position, expected_line in enumerate(expected_lines):
            self.assertEqual(o[position], six.b(expected_line))

        self.cleanup(data_file)

    def test_multiline_double_double_quoted_values_in_quoted_data(self):
        # Reads column c5 of sample_quoted_data, replacing the newline that is embedded
        # inside the double-double-quoted value (X'0A') with '::', so that every record is
        # emitted as exactly one output line.
        tmp_data_file = self.create_file_with_data(sample_quoted_data)

        # FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
        cmd = Q_EXECUTABLE + ' -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),4)

        # assertTrue(value,expected) uses `expected` as the failure message only and passes
        # for any non-empty value, so the content has to be compared with assertEqual
        self.assertEqual(o[0],six.b('multiline_double_double_quoted'))
        self.assertEqual(o[1],six.b('control-value-5'))
        # The multiline value contains spaces and double quotes, so its exact output form
        # depends on the output wrapping/escaping rules. Only the parts which do not depend
        # on them are checked, including the '::' which replaced the newline.
        # NOTE(review): assumes sample_quoted_data holds the text named here - confirm by running the suite
        self.assertIn(six.b('this is a double double quoted '),o[2])
        self.assertIn(six.b('multiline:: value'),o[2])
        self.assertEqual(o[3],six.b('control-value-5'))

        self.cleanup(tmp_data_file)

    def test_multiline_escaped_double_quoted_values_in_quoted_data(self):
        # Reads column c6 of sample_quoted_data, replacing the newline that is embedded
        # inside the escaped-double-quoted value (X'0A') with '::', so that every record is
        # emitted as exactly one output line.
        tmp_data_file = self.create_file_with_data(sample_quoted_data)

        # FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
        cmd = Q_EXECUTABLE + ' -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),4)

        # assertTrue(value,expected) uses `expected` as the failure message only and passes
        # for any non-empty value, so the content has to be compared with assertEqual.
        # Expected values are bytes (six.b), like the output lines they are compared to.
        self.assertEqual(o[0],six.b('multiline_escaped_double_quoted'))
        self.assertEqual(o[1],six.b('control-value-6'))
        # The multiline value contains spaces and double quotes, so its exact output form
        # depends on the output wrapping/escaping rules. Only the parts which do not depend
        # on them are checked, including the '::' which replaced the newline.
        # NOTE(review): assumes sample_quoted_data holds the text named here - confirm by running the suite
        self.assertIn(six.b('this is an escaped '),o[2])
        self.assertIn(six.b('multiline:: value'),o[2])
        self.assertEqual(o[3],six.b('control-value-6'))

        self.cleanup(tmp_data_file)

    def test_disable_double_double_quoted_data_flag__values(self):
        # Backward compatibility check only. The flag being tested here might be removed
        # completely in the future

        tmp_data_file = self.create_file_with_data(double_double_quoted_data)

        # Expected output rows for each selected column when double-double quoting is disabled
        expected_rows_by_column = [
            ('c2',('double_double_quoted','this is a quoted value with "double')),
            ('c3',('','double')),
            ('c4',('','quotes"""')),
        ]

        for column,(first_row,second_row) in expected_rows_by_column:
            cmd = '%s -d " " --disable-double-double-quoting "select %s from %s" -W none' % (Q_EXECUTABLE,column,tmp_data_file.name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode,0)
            self.assertEqual(len(e),0)
            self.assertEqual(len(o),2)

            self.assertEqual(o[0],six.b(first_row))
            self.assertEqual(o[1],six.b(second_row))

        self.cleanup(tmp_data_file)

    def test_disable_escaped_double_quoted_data_flag__values(self):
        # Backward compatibility check only. The flag being tested here might be removed
        # completely in the future

        tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)

        # Expected output rows for each selected column when escaped double quoting is disabled
        expected_rows_by_column = [
            ('c2',('escaped_double_quoted','this is a quoted value with \\escaped')),
            ('c3',('','double')),
            ('c4',('','quotes\\""')),
        ]

        for column,(first_row,second_row) in expected_rows_by_column:
            cmd = '%s -d " " --disable-escaped-double-quoting "select %s from %s" -W none' % (Q_EXECUTABLE,column,tmp_data_file.name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode,0)
            self.assertEqual(len(e),0)
            self.assertEqual(len(o),2)

            self.assertEqual(o[0],six.b(first_row))
            self.assertEqual(o[1],six.b(second_row))

        self.cleanup(tmp_data_file)

    def test_combined_quoted_data_flags__number_of_columns_detected(self):
        # Backward compatibility check only. The flags being tested here might be removed
        # completely in the future
        tmp_data_file = self.create_file_with_data(combined_quoted_data)

        # (quoting flags passed to q, number of fields the analysis is expected to report)
        flags_and_field_counts = [
            ('--disable-double-double-quoting --disable-escaped-double-quoting ',7),
            ('--disable-escaped-double-quoting ',5),
            ('--disable-double-double-quoting ',5),
            ('',3), # no flag disabled - only 3 fields, which is the correct amount
        ]

        for quoting_flags,expected_field_count in flags_and_field_counts:
            cmd = '%s -d " " %s"select * from %s" -A' % (Q_EXECUTABLE,quoting_flags,tmp_data_file.name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode,0)
            self.assertEqual(len(e),0)

            # Everything after the `Fields:` line is one line per detected field
            field_lines = o[o.index(six.b('  Fields:'))+1:]
            self.assertEqual(len(field_lines),expected_field_count)

        self.cleanup(tmp_data_file)


class EncodingTests(AbstractQTestCase):
    # Tests which read data using an explicitly provided input encoding (-e)

    def test_utf8_with_bom_encoding(self):
        # The data starts with a UTF-8 byte order mark. When read using `-e utf-8-sig`,
        # the first header name in the output must not contain the mark
        utf_8_data_with_bom = six.b(
            '\xef\xbb\xbf"typeid","limit","apcost","date","checkpointId"\n'
            '"1","2","5","1,2,3,4,5,6,7","3000,3001,3002"\n'
            '"2","2","5","1,2,3,4,5,6,7","3003,3004,3005"\n')
        tmp_data_file = self.create_file_with_data(utf_8_data_with_bom,encoding=None)

        retcode, o, e = run_command('%s -d , -H -O -e utf-8-sig "select * from %s"' % (Q_EXECUTABLE,tmp_data_file.name))

        self.assertEqual(retcode,0)
        self.assertEqual(len(e),0)
        self.assertEqual(len(o),3)

        expected_lines = [
            'typeid,limit,apcost,date,checkpointId',
            '1,2,5,"1,2,3,4,5,6,7","3000,3001,3002"',
            '2,2,5,"1,2,3,4,5,6,7","3003,3004,3005"',
        ]
        # The length has already been verified above, so zip() covers every output line
        for actual_line,expected_line in zip(o,expected_lines):
            self.assertEqual(actual_line,six.b(expected_line))

        self.cleanup(tmp_data_file)


class QrcTests(AbstractQTestCase):
    # Tests for providing default option values through a qrc file, the location of
    # which is passed to q using the QRC_FILENAME environment variable

    def test_explicit_qrc_filename_not_found(self):
        # Pointing QRC_FILENAME to a file which does not exist must fail with exit code 244
        # and a single explanatory line on stderr
        non_existent_filename = str(uuid.uuid4())
        env_to_inject = { 'QRC_FILENAME': non_existent_filename}
        cmd = Q_EXECUTABLE + ' "select 1"'
        retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)

        self.assertEqual(retcode, 244)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        # assertEqual (and not assertTrue(a == b)) so that a failure shows the actual message
        self.assertEqual(e[0], six.b('QRC_FILENAME env var exists, but cannot find qrc file at %s' % non_existent_filename))

    def test_explicit_qrc_filename_that_exists(self):
        # An output delimiter that is set only in the qrc file must be used for the output
        tmp_qrc_file = self.create_file_with_data(six.b('''[options]
output_delimiter=|
'''))
        env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
        cmd = Q_EXECUTABLE + ' "select 1,2"'
        retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        # assertEqual (and not assertTrue(a == b)) so that a failure shows the actual output
        self.assertEqual(o[0], six.b('1|2'))

        self.cleanup(tmp_qrc_file)

    def test_all_default_options(self):
        # Create a qrc file that contains all default values inside the qrc file, but with some different values than the regular defaults
        tmp_qrc_file = self.create_file_with_data(six.b('''[options]
analyze_only=True
beautify=True
caching_mode=readwrite
column_count=32
delimiter=,
disable_column_type_detection=True
disable_double_double_quoting=False
disable_escaped_double_quoting=False
encoding=ascii
formatting=xxx
gzipped=True
input_quoting_mode=all
keep_leading_whitespace_in_values=True
list_user_functions=True
max_attached_sqlite_databases=888
max_column_length_limit=8888
mode=strict
output_delimiter=|
output_encoding=utf-8
output_header=True
output_quoting_mode=all
overwrite_qsql=False
pipe_delimited=True
pipe_delimited_output=True
query_encoding=ascii
query_filename=query-filename
save_db_to_disk_filename=save-db-to-disk-filename
skip_header=True
tab_delimited=True
tab_delimited_output=true
verbose=True
with_universal_newlines=True
'''))
        env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
        cmd = Q_EXECUTABLE + ' --dump-defaults'
        retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 34)
        self.assertEqual(len(e), 0)

        self.assertEqual(o[0],six.b('[options]'))

        # Every line after the section header is `key=value`. Split on the first '=' only
        m = dict(r.split(six.b("="),1) for r in o[1:])

        # Values that --dump-defaults is expected to print for each option. Note that
        # tab_delimited_output is written as `true` in the qrc file above, and is expected
        # to be dumped as `True`
        expected_dump = [
            ('analyze_only','True'),
            ('beautify','True'),
            ('caching_mode','readwrite'),
            ('column_count','32'),
            ('delimiter',','),
            ('disable_column_type_detection','True'),
            ('disable_double_double_quoting','False'),
            ('disable_escaped_double_quoting','False'),
            ('encoding','ascii'),
            ('formatting','xxx'),
            ('gzipped','True'),
            ('input_quoting_mode','all'),
            ('keep_leading_whitespace_in_values','True'),
            ('list_user_functions','True'),
            ('max_attached_sqlite_databases','888'),
            ('max_column_length_limit','8888'),
            ('mode','strict'),
            ('output_delimiter','|'),
            ('output_encoding','utf-8'),
            ('output_header','True'),
            ('output_quoting_mode','all'),
            ('overwrite_qsql','False'),
            ('pipe_delimited','True'),
            ('pipe_delimited_output','True'),
            ('query_encoding','ascii'),
            ('query_filename','query-filename'),
            ('save_db_to_disk_filename','save-db-to-disk-filename'),
            ('skip_header','True'),
            ('tab_delimited','True'),
            ('tab_delimited_output','True'),
            ('verbose','True'),
            ('with_universal_newlines','True'),
        ]

        for option_name,expected_value in expected_dump:
            # Comparing (name,value) pairs keeps the option name visible in a failure message
            self.assertEqual((option_name,m[six.b(option_name)]),(option_name,six.b(expected_value)))

        self.cleanup(tmp_qrc_file)

    def test_caching_readwrite_using_qrc_file(self):
        # Runs the same query twice - first without a qrc file, and then with a qrc file
        # which sets caching_mode=readwrite. Only the second run should create a cache file
        tmpfile = self.create_file_with_data(sample_data_no_header)
        tmpfile_folder = os.path.dirname(tmpfile.name)
        tmpfile_filename = os.path.basename(tmpfile.name)
        expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(o),3)
        self.assertEqual(len(e),0)
        self.assertEqual(o[0],six.b('a,1,0'))
        self.assertEqual(o[1],six.b('b,2,0'))
        self.assertEqual(o[2],six.b('c,,0'))

        # Ensure default does not create a cache file
        self.assertFalse(os.path.exists(expected_cache_filename))

        tmp_qrc_file = self.create_file_with_data(six.b('''[options]
caching_mode=readwrite
'''))
        env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
        cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
        retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),3)
        self.assertEqual(len(e),0)
        self.assertEqual(o[0],six.b('a,1,0'))
        self.assertEqual(o[1],six.b('b,2,0'))
        self.assertEqual(o[2],six.b('c,,0'))

        # Ensure that qrc file caching is being used and caching is activated (cache file should exist)
        self.assertTrue(os.path.exists(expected_cache_filename))

        self.cleanup(tmp_qrc_file)
        self.cleanup(tmpfile)


class QsqlUsageTests(AbstractQTestCase):

    def test_concatenate_same_qsql_file_with_single_table(self):
        # Using the same 10000-row qsql file on both sides of a `union all` must
        # provide 20000 rows
        column_names = [six.b(name) for name in ('aa','bb','cc')]
        rows = [[six.b(str(n))] * 3 for n in range(1, 10001)]

        tmpfile = self.create_file_with_data(self.arrays_to_qsql_file_content(column_names, rows),suffix='.qsql')

        cmd = '%s -t "select count(*) from (select * from %s union all select * from %s)"' % (Q_EXECUTABLE,tmpfile.name,tmpfile.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('20000'))

    def test_query_qsql_with_single_table(self):
        # Queries a file containing qsql content directly. Each of the three columns holds
        # the numbers 1..10000, so every sum is 50005000
        column_names = [six.b(name) for name in ('aa','bb','cc')]
        rows = [[six.b(str(n))] * 3 for n in range(1, 10001)]

        tmpfile = self.create_file_with_data(self.arrays_to_qsql_file_content(column_names, rows))

        cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s"' % (Q_EXECUTABLE,tmpfile.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('\t'.join(['50005000'] * 3)))

    def test_query_qsql_with_single_table_with_explicit_non_existent_tablename(self):
        # Asking for a table name which does not exist inside the qsql file (using the
        # `<file>:::<table>` syntax) must fail with exit code 84, and the error message
        # must list the table name which does exist
        column_names = [six.b(name) for name in ('aa','bb','cc')]
        rows = [[six.b(str(n))] * 3 for n in range(1, 10001)]

        tmpfile = self.create_file_with_data(self.arrays_to_qsql_file_content(column_names, rows))

        # Fetch the real table name directly from the catalog of the qsql file
        c = sqlite3.connect(tmpfile.name)
        catalog_rows = c.execute('select temp_table_name from _qcatalog').fetchall()
        actual_table_name = catalog_rows[0][0]
        c.close()

        table_reference = '%s:::non-existent' % tmpfile.name
        cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s"' % (Q_EXECUTABLE,table_reference)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 84)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)

        expected_error = 'Table non-existent could not be found in qsql file %s . Existing table names: %s' % (tmpfile.name,actual_table_name)
        self.assertEqual(e[0],six.b(expected_error))

    def test_query_qsql_with_single_table_with_explicit_table_name(self):
        # Explicitly naming the table stored in the qsql file (using the
        # `<file>:::<table>` syntax) must give the same sums as querying the file itself
        column_names = [six.b(name) for name in ('aa','bb','cc')]
        rows = [[six.b(str(n))] * 3 for n in range(1, 10001)]

        tmpfile = self.create_file_with_data(self.arrays_to_qsql_file_content(column_names, rows))

        # Fetch the real table name directly from the catalog of the qsql file
        c = sqlite3.connect(tmpfile.name)
        catalog_rows = c.execute('select temp_table_name from _qcatalog').fetchall()
        actual_table_name = catalog_rows[0][0]
        c.close()

        table_reference = '%s:::%s' % (tmpfile.name,actual_table_name)
        cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s"' % (Q_EXECUTABLE,table_reference)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),1)
        self.assertEqual(len(e),0)
        self.assertEqual(o[0],six.b('\t'.join(['50005000'] * 3)))

    def test_query_multi_qsql_with_single_table(self):
        # Left-joins a 10-row qsql file to a 10000-row qsql file. Only the values 1..10
        # match, so every sum over the large file's columns is 55
        large_rows = [[six.b(str(n))] * 3 for n in range(1, 10001)]
        large_content = self.arrays_to_qsql_file_content([six.b(name) for name in ('aa','bb','cc')], large_rows)
        tmpfile1 = self.create_file_with_data(large_content,suffix='.qsql')

        small_rows = [[six.b(str(n))] * 3 for n in range(1, 11)]
        small_content = self.arrays_to_qsql_file_content([six.b(name) for name in ('aa','bb','cc')], small_rows)
        tmpfile2 = self.create_file_with_data(small_content,suffix='.qsql')

        query = 'select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)' % (tmpfile2.name,tmpfile1.name)
        retcode, o, e = run_command('%s -t "%s"' % (Q_EXECUTABLE,query))

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('\t'.join(['55'] * 3)))

    def test_query_concatenated_qsqls_each_with_single_table(self):
        # `union all` over two different qsql files (1..10 and 1..10000). Every column
        # sum is 55 + 50005000 = 50005055
        large_rows = [[six.b(str(n))] * 3 for n in range(1, 10001)]
        large_content = self.arrays_to_qsql_file_content([six.b(name) for name in ('aa','bb','cc')], large_rows)
        tmpfile1 = self.create_file_with_data(large_content,suffix='.qsql')

        small_rows = [[six.b(str(n))] * 3 for n in range(1, 11)]
        small_content = self.arrays_to_qsql_file_content([six.b(name) for name in ('aa','bb','cc')], small_rows)
        tmpfile2 = self.create_file_with_data(small_content,suffix='.qsql')

        query = 'select sum(aa),sum(bb),sum(cc) from (select * from %s union all select * from %s)' % (tmpfile2.name,tmpfile1.name)
        retcode, o, e = run_command('%s -t "%s"' % (Q_EXECUTABLE,query))

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('\t'.join(['50005055'] * 3)))

    def test_concatenated_qsql_and_data_stream__column_names_mismatch(self):
        # The cache file holds three columns, while the data piped through stdin is read
        # with `-c 1`. The `UNION ALL` of the two must therefore fail with a query error
        N1 = 10000
        N2 = 100

        numbers1 = [[six.b(str(n))] * 3 for n in range(1, N1 + 1)]
        csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b(name) for name in ('aa','bb','cc')], numbers1)
        tmpfile1 = self.create_file_with_data(csv_file_data1)
        expected_cache_filename1 = tmpfile1.name + '.qsql'

        # First run creates the cache file
        retcode, o, e = run_command('%s -H -t "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile1.name))
        self.assertEqual(retcode, 0)
        self.assertTrue(os.path.exists(expected_cache_filename1))

        producer = 'seq 1 %s' % N2
        consumer = '%s -c 1 "select count(*) from (select * from %s UNION ALL select * from -)"' % (Q_EXECUTABLE,expected_cache_filename1)

        retcode, o, e = run_command(' | '.join([producer,consumer]))
        self.assertEqual(retcode, 1)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)
        self.assertEqual(e[0],six.b('query error: SELECTs to the left and right of UNION ALL do not have the same number of result columns'))

    def test_concatenated_qsql_and_data_stream(self):
        # `UNION ALL` between a single-column cache file (1..N1) and single-column data
        # piped through stdin (1..N2). Both the row count and the sum are verified
        N1 = 10000
        N2 = 100

        numbers1 = [[six.b(str(n))] for n in range(1, N1 + 1)]
        csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('c1')], numbers1)
        tmpfile1 = self.create_file_with_data(csv_file_data1)
        expected_cache_filename1 = tmpfile1.name + '.qsql'

        # First run creates the cache file
        retcode, o, e = run_command('%s -H -t "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile1.name))
        self.assertEqual(retcode, 0)
        self.assertTrue(os.path.exists(expected_cache_filename1))

        producer = 'seq 1 %s' % N2
        consumer = '%s -t -c 1 "select count(*),sum(c1) from (select * from %s UNION ALL select * from -)"' % (Q_EXECUTABLE,expected_cache_filename1)

        retcode, o, e = run_command(' | '.join([producer,consumer]))
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),1)
        self.assertEqual(len(e),0)

        expected_count = N1 + N2
        expected_sum = sum(range(1,N1+1)) + sum(range(1,N2+1))
        self.assertEqual(o[0],six.b('%s\t%s' % (expected_count,expected_sum)))

    def test_concatenated_qsql_and_data_stream__explicit_table_name(self):
        # Same as the plain qsql + stdin concatenation, except that the table inside the
        # cache file is referenced explicitly, using the `<file>:::<table>` syntax. The
        # table name is expected to be the basename of the original data file
        N1 = 10000
        N2 = 100

        numbers1 = [[six.b(str(n))] for n in range(1, N1 + 1)]
        csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('c1')], numbers1)
        tmpfile1 = self.create_file_with_data(csv_file_data1)
        tmpfile1_expected_table_name = os.path.basename(tmpfile1.name)

        expected_cache_filename1 = tmpfile1.name + '.qsql'

        # First run creates the cache file
        retcode, o, e = run_command('%s -H -t "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile1.name))
        self.assertEqual(retcode, 0)
        self.assertTrue(os.path.exists(expected_cache_filename1))

        table_reference = '%s:::%s' % (expected_cache_filename1,tmpfile1_expected_table_name)
        producer = 'seq 1 %s' % N2
        consumer = '%s -t -c 1 "select count(*),sum(c1) from (select * from %s UNION ALL select * from -)"' % (Q_EXECUTABLE,table_reference)

        retcode, o, e = run_command(' | '.join([producer,consumer]))
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),1)
        self.assertEqual(len(e),0)

        expected_count = N1 + N2
        expected_sum = sum(range(1,N1+1)) + sum(range(1,N2+1))
        self.assertEqual(o[0],six.b('%s\t%s' % (expected_count,expected_sum)))

    def test_write_to_qsql__check_chosen_table_name(self):
        # Writing a cache file (-C readwrite) must register exactly one table in the
        # catalog of the qsql file, named after the basename of the original data file
        numbers1 = [[six.b(str(i))] for i in range(1, 10001)]
        csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('c1')], numbers1)
        tmpfile1 = self.create_file_with_data(csv_file_data1)
        expected_cache_filename1 = '%s.qsql' % tmpfile1.name

        cmd = Q_EXECUTABLE + ' -c 1 -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)
        self.assertEqual(retcode, 0)
        self.assertTrue(os.path.exists(expected_cache_filename1))

        c = sqlite3.connect(expected_cache_filename1)
        try:
            qcatalog_entries = c.execute('select temp_table_name from _qcatalog').fetchall()
        finally:
            # Always release the sqlite handle on the cache file, even if the catalog query fails
            c.close()

        self.assertEqual(len(qcatalog_entries),1)
        self.assertEqual(qcatalog_entries[0][0],os.path.basename(tmpfile1.name))

    def test_concatenated_mixes_qsql_with_single_table_and_csv(self):
        # A `union all` over two tables must provide the same result, regardless of
        # whether each side is referenced through its csv file or through the `.qsql`
        # cache file that was generated for it
        numbers1 = [[six.b(str(n))] * 3 for n in range(1, 10001)]
        csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b(name) for name in ('aa','bb','cc')], numbers1)
        tmpfile1 = self.create_file_with_data(csv_file_data1)

        numbers2 = [[six.b(str(n))] * 3 for n in range(1, 11)]
        csv_file_data2 = self.arrays_to_csv_file_content(six.b('\t'),[six.b(name) for name in ('aa','bb','cc')], numbers2)
        tmpfile2 = self.create_file_with_data(csv_file_data2)

        # Generate a cache file for each of the csv files
        for data_file in (tmpfile1,tmpfile2):
            retcode, o, e = run_command('%s -H -t "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,data_file.name))
            self.assertEqual(retcode, 0)
            self.assertTrue(os.path.exists(data_file.name + '.qsql'))

        # csv and qsql files prepared. now test all four combinations - an empty postfix
        # means the csv file itself, `.qsql` means the cache file
        postfix_combinations = [
            ('',''),
            ('','.qsql'),
            ('.qsql',''),
            ('.qsql','.qsql'),
        ]

        for postfix1,postfix2 in postfix_combinations:
            query = 'select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s%s union all select * from %s%s)' % (
                tmpfile1.name,postfix1,tmpfile2.name,postfix2)
            retcode, o, e = run_command('%s -O -H -t "%s"' % (Q_EXECUTABLE,query))

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o),2)
            self.assertEqual(len(e),0)
            self.assertEqual(o[0],six.b('cnt\tsum_aa\tsum_bb\tsum_cc'))
            self.assertEqual(o[1],six.b('10010\t50005055\t50005055\t50005055'))

    def test_analysis_of_concatenated_mixes_qsql_with_single_table_and_csv(self):
        # Verifies the analysis output (-A) of a `UNION ALL` over two tables, for every
        # combination of referencing each side through its csv file or through its `.qsql`
        # cache file, both with `-C read` and with `-C none`
        numbers1 = [[six.b(str(n))] * 3 for n in range(1, 10001)]
        csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b(name) for name in ('aa','bb','cc')], numbers1)
        tmpfile1 = self.create_file_with_data(csv_file_data1)

        numbers2 = [[six.b(str(n))] * 3 for n in range(1, 11)]
        csv_file_data2 = self.arrays_to_csv_file_content(six.b('\t'),[six.b(name) for name in ('aa','bb','cc')], numbers2)
        tmpfile2 = self.create_file_with_data(csv_file_data2)

        # Generate a cache file for each of the csv files
        for data_file in (tmpfile1,tmpfile2):
            retcode, o, e = run_command('%s -H -t "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,data_file.name))
            self.assertEqual(retcode, 0)
            self.assertTrue(os.path.exists(data_file.name + '.qsql'))

        # csv and qsql files prepared

        # A "reference" below is a (source_type,table_postfix,source_postfix) tuple:
        #   source_type    - the source type that the analysis is expected to report
        #   table_postfix  - appended to the file name inside the query itself
        #   source_postfix - appended to the file name in the reported source

        def expected_table_section(path,source_type,table_postfix,source_postfix):
            # The seven analysis lines expected for one table
            return [six.b(line) for line in (
                'Table: %s%s' % (path,table_postfix),
                '  Sources:',
                '    source_type: %s source: %s%s' % (source_type,path,source_postfix),
                '  Fields:',
                '    `aa` - int',
                '    `bb` - int',
                '    `cc` - int')]

        def do_check(caching_mode,file1_reference,file2_reference):
            # Runs the analysis for one combination and compares the complete output
            cmd = '%s -C %s -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s%s UNION ALL select * from %s%s)" -A' % (
                Q_EXECUTABLE,
                caching_mode,
                tmpfile1.name,
                file1_reference[1],
                tmpfile2.name,
                file2_reference[1])

            retcode, o, e = run_command(cmd)
            self.assertEqual(retcode, 0)
            self.assertEqual(len(o),14)
            self.assertEqual(len(e),0)
            self.assertEqual(o,
                expected_table_section(tmpfile1.name,*file1_reference) +
                expected_table_section(tmpfile2.name,*file2_reference))

        # Referencing the `.qsql` file explicitly is expected to be reported in the same
        # way under both caching modes
        explicit_qsql = ('qsql-file','.qsql','.qsql')

        # Referencing the csv file is expected to be reported differently per caching mode:
        # with `-C read` the qsql will be used. With `-C none` the qsql is not used even
        # though the qsql file exists, and -A shows that fact
        csv_reference_per_caching_mode = [
            ('read',('qsql-file-with-original','','.qsql')),
            ('none',('file-with-unused-qsql','','')),
        ]

        # All four combinations for each caching mode
        for caching_mode,csv_reference in csv_reference_per_caching_mode:
            for file1_reference in (csv_reference,explicit_qsql):
                for file2_reference in (csv_reference,explicit_qsql):
                    do_check(caching_mode,file1_reference,file2_reference)

    def test_mixed_qsql_with_single_table_and_csv__missing_header_parameter_for_csv(self):
        # Unions a qsql file with a tab-delimited text file while NOT passing -H.
        # The run is expected to succeed with exactly one warning line on stderr.

        # 10000 rows of three identical numbers, stored as qsql content
        qsql_rows = [[six.b(str(n)) for _ in range(3)] for n in range(1, 10001)]
        qsql_content = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], qsql_rows)
        qsql_tmpfile = self.create_file_with_data(qsql_content, suffix='.qsql')

        # 10 rows with the same column names, stored as tab-delimited text
        delimited_rows = [[six.b(str(n)) for _ in range(3)] for n in range(1, 11)]
        delimited_content = self.arrays_to_csv_file_content(six.b('\t'), [six.b('aa'), six.b('bb'), six.b('cc')], delimited_rows)
        delimited_tmpfile = self.create_file_with_data(delimited_content)

        query_args = ' -t "select sum(aa),sum(bb),sum(cc) from (select * from %s union all select * from %s)"' % (qsql_tmpfile.name, delimited_tmpfile.name)
        retcode, o, e = run_command(Q_EXECUTABLE + query_args)

        expected_warning = six.b('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data')

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0], expected_warning)
        self.assertEqual(o[0], six.b('50005055.0\t50005055.0\t50005055.0'))

    def test_qsql_with_multiple_tables_direct_use(self):
        # Joins two qsql files while saving the data into a single qsql file using -S,
        # then queries one of the stored tables directly through the `file:::table` syntax.

        def build_qsql_table(row_count):
            # Returns (qsql filename, expected stored table name) for a table of
            # `row_count` rows, each holding the same number in columns aa, bb and cc.
            rows = [[six.b(str(n)) for _ in range(3)] for n in range(1, row_count + 1)]
            filename = self.create_qsql_file_with_content_and_return_filename([six.b('aa'), six.b('bb'), six.b('cc')], rows)
            # Drops the last 5 characters of the basename (presumably the '.qsql' suffix)
            return filename, os.path.basename(filename)[:-5]

        large_qsql, large_table_name = build_qsql_table(10000)
        small_qsql, small_table_name = build_qsql_table(10)

        combined_qsql = self.generate_tmpfile_name(suffix='.qsql')

        save_cmd = '%s -t "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb)" -S %s' % \
                   (Q_EXECUTABLE, large_qsql, small_qsql, combined_qsql)
        retcode, o, e = run_command(save_cmd)

        # With -S, nothing is expected on stdout; stderr carries four informational lines
        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 4)

        saving_notice = six.b('Going to save data into a disk database: %s' % combined_qsql)
        self.assertEqual(e[0], saving_notice)

        saved_prefix = six.b('Data has been saved into %s . Saving has taken' % combined_qsql)
        self.assertTrue(e[1].startswith(saved_prefix))

        rewritten_query = six.b('Query to run on the database: select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb);' % \
                                (large_table_name, small_table_name))
        self.assertEqual(e[2], rewritten_query)

        sqlite3_hint = six.b('You can run the query directly from the command line using the following command: echo "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb)" | sqlite3 %s' % \
                             (large_table_name, small_table_name, combined_qsql))
        self.assertEqual(e[3], sqlite3_hint)

        # Second phase: read the large table back out of the combined qsql file
        read_cmd = '%s -d , "select count(*) cnt,sum(aa),sum(bb),sum(cc) from %s:::%s"' % (Q_EXECUTABLE, combined_qsql, large_table_name)
        r, o, e = run_command(read_cmd)

        self.assertEqual(r, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('10000,50005000,50005000,50005000'))

    def test_direct_use_of_sqlite_db_with_one_table(self):
        # Queries a plain sqlite database file holding a single table, once by file name
        # only and once with the explicit `:::mytable` suffix. Both must give the same sums.
        placeholder = self.create_file_with_data(six.b(''), suffix='.sqlite')
        db_path = placeholder.name
        # The placeholder file is removed before sqlite3 opens a database at the same path
        os.remove(db_path)

        conn = sqlite3.connect(db_path)
        for statement in (' create table mytable (x int, y int)',
                          ' insert into mytable (x,y) values (100,200),(300,400)'):
            conn.execute(statement).fetchall()
        conn.commit()
        conn.close()

        for query_args in (' -t "select sum(x),sum(y) from %s"',
                           ' -t "select sum(x),sum(y) from %s:::mytable"'):
            retcode, o, e = run_command(Q_EXECUTABLE + query_args % db_path)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(o), 1)
            self.assertEqual(len(e), 0)
            self.assertEqual(o[0], six.b('400\t600'))

    def test_direct_use_of_sqlite_db_with_one_table__nonexistent_table(self):
        # Asks for a table that does not exist inside a sqlite database file; the run is
        # expected to fail with exit code 85 and a message listing the existing table names.
        placeholder = self.create_file_with_data(six.b(''), suffix='.sqlite')
        db_path = placeholder.name
        # The placeholder file is removed before sqlite3 opens a database at the same path
        os.remove(db_path)

        conn = sqlite3.connect(db_path)
        for statement in (' create table some_numbers (x int, y int)',
                          ' insert into some_numbers (x,y) values (100,200),(300,400)'):
            conn.execute(statement).fetchall()
        conn.commit()
        conn.close()

        query_args = ' -t "select sum(x),sum(y) from %s:::non_existent"' % db_path
        retcode, o, e = run_command(Q_EXECUTABLE + query_args)

        self.assertEqual(retcode, 85)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        expected_error = six.b('Table non_existent could not be found in sqlite file %s . Existing table names: some_numbers' % (db_path))
        self.assertEqual(e[0], expected_error)


    def test_qsql_creation_and_direct_use(self):
        # Creates a qsql cache for a tab-delimited file using `-C readwrite`, then queries
        # the generated qsql file directly, without going through the original file.
        rows = [[six.b(str(n)) for _ in range(3)] for n in range(1, 10001)]
        tsv_content = self.arrays_to_csv_file_content(six.b('\t'), [six.b('aa'), six.b('bb'), six.b('cc')], rows)

        source_file = self.create_file_with_data(tsv_content)
        folder, filename = os.path.split(source_file.name)
        expected_cache_filename = os.path.join(folder, filename + '.qsql')

        caching_args = ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -H -C readwrite' % source_file.name
        retcode, o, e = run_command(Q_EXECUTABLE + caching_args)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('50005000\t50005000\t50005000'))

        self.assertTrue(os.path.exists(expected_cache_filename))

        # NOTE(review): cleanup() presumably removes the original file, leaving only the cache - confirm
        self.cleanup(source_file)

        # Read the data back using a comma as the output delimiter, to make sure that
        # column parsing was done correctly when the qsql file was written
        direct_args = ' -D , "select count(*),sum(aa),sum(bb),sum(cc) from %s"' % expected_cache_filename
        retcode, o, e = run_command(Q_EXECUTABLE + direct_args)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('10000,50005000,50005000,50005000'))

    def test_analysis_of_qsql_direct_usage(self):
        # Creates a qsql cache using `-C readwrite`, then runs `-A` directly against the
        # qsql file and checks the reported source type and the detected column types.
        rows = [[six.b(str(n)) for _ in range(3)] for n in range(1, 10001)]
        tsv_content = self.arrays_to_csv_file_content(six.b('\t'), [six.b('aa'), six.b('bb'), six.b('cc')], rows)

        source_file = self.create_file_with_data(tsv_content)
        folder, filename = os.path.split(source_file.name)
        expected_cache_filename = os.path.join(folder, filename + '.qsql')

        caching_args = ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -H -C readwrite' % source_file.name
        retcode, o, e = run_command(Q_EXECUTABLE + caching_args)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('50005000\t50005000\t50005000'))

        self.assertTrue(os.path.exists(expected_cache_filename))

        self.cleanup(source_file)

        analysis_args = ' "select * from %s" -A' % expected_cache_filename
        retcode, o, e = run_command(Q_EXECUTABLE + analysis_args)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 7)
        self.assertEqual(len(e), 0)

        # The analysis output is compared line by line, in order
        expected_lines = [
            six.b('Table: %s' % expected_cache_filename),
            six.b("  Sources:"),
            six.b('    source_type: qsql-file source: %s' % expected_cache_filename),
            six.b("  Fields:"),
            six.b('    `aa` - int'),
            six.b('    `bb` - int'),
            six.b('    `cc` - int'),
        ]
        for position, expected in enumerate(expected_lines):
            self.assertEqual(o[position], expected)

    def test_analysis_of_qsql_direct_usage2(self):
        # Writes a qsql cache with `-C readwrite` and then analyzes (-A) the qsql file itself.
        # NOTE(review): the steps here look identical to test_analysis_of_qsql_direct_usage -
        # confirm whether this test was meant to cover a different scenario.
        column_names = [six.b('aa'), six.b('bb'), six.b('cc')]
        data_rows = [[six.b(str(value)), six.b(str(value)), six.b(str(value))] for value in range(1, 10001)]
        delimited_data = self.arrays_to_csv_file_content(six.b('\t'), column_names, data_rows)

        original = self.create_file_with_data(delimited_data)
        expected_cache_filename = os.path.join(os.path.dirname(original.name),
                                               os.path.basename(original.name) + '.qsql')

        write_cache_cmd = Q_EXECUTABLE + ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -H -C readwrite' % original.name
        retcode, o, e = run_command(write_cache_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('50005000\t50005000\t50005000'))

        cache_exists = os.path.exists(expected_cache_filename)
        self.assertTrue(cache_exists)

        self.cleanup(original)

        analyze_cmd = Q_EXECUTABLE + ' "select * from %s" -A' % expected_cache_filename
        retcode, o, e = run_command(analyze_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 7)
        self.assertEqual(len(e), 0)

        # Header part of the analysis: table name and its single source
        self.assertEqual(o[0], six.b('Table: %s' % expected_cache_filename))
        self.assertEqual(o[1], six.b("  Sources:"))
        self.assertEqual(o[2], six.b('    source_type: qsql-file source: %s' % expected_cache_filename))

        # Field part of the analysis: three columns, all reported as int
        self.assertEqual(o[3], six.b("  Fields:"))
        self.assertEqual(o[4], six.b('    `aa` - int'))
        self.assertEqual(o[5], six.b('    `bb` - int'))
        self.assertEqual(o[6], six.b('    `cc` - int'))

    def test_direct_qsql_usage_for_single_table_qsql_file(self):
        # Saves data piped through stdin into a qsql file using -S, then reads it back
        # through the `file:::data_stream_stdin` table reference.
        qsql_path = self.random_tmp_filename('save-to-db', 'qsql')

        save_cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE, qsql_path)
        save_result = run_command(save_cmd)

        # Only the exit code of the saving step is checked
        self.assertEqual(save_result[0], 0)

        read_cmd = '%s -D, "select count(*),sum(c1) from %s:::data_stream_stdin"' % (Q_EXECUTABLE, qsql_path)
        retcode, o, e = run_command(read_cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 1)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('10000,50005000'))

    def test_direct_qsql_usage_for_single_table_qsql_file__nonexistent_table(self):
        # Saves stdin data into a qsql file using -S, then asks for a table name that was
        # never stored. Expects exit code 85 and a message listing the existing table.
        qsql_path = self.random_tmp_filename('save-to-db', 'qsql')

        save_cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE, qsql_path)
        save_result = run_command(save_cmd)

        # Only the exit code of the saving step is checked
        self.assertEqual(save_result[0], 0)

        read_cmd = '%s -D, "select count(*),sum(c1) from %s:::unknown_table_name"' % (Q_EXECUTABLE, qsql_path)
        retcode, o, e = run_command(read_cmd)

        self.assertEqual(retcode, 85)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        expected_error = six.b('Table unknown_table_name could not be found in sqlite file %s . Existing table names: data_stream_stdin' % (qsql_path))
        self.assertEqual(e[0], expected_error)

    def test_direct_qsql_usage_from_written_data_stream(self):
        # Writes a stdin data stream into a qsql file (-S) and queries the stored
        # `data_stream_stdin` table directly from that file.
        # NOTE(review): the steps look identical to
        # test_direct_qsql_usage_for_single_table_qsql_file - confirm the intended difference.
        target_db = self.random_tmp_filename('save-to-db', 'qsql')

        writing_cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE, target_db)
        write_status, _, _ = run_command(writing_cmd)

        self.assertEqual(write_status, 0)

        reading_cmd = '%s -D, "select count(*),sum(c1) from %s:::data_stream_stdin"' % (Q_EXECUTABLE, target_db)
        read_status, stdout_lines, stderr_lines = run_command(reading_cmd)

        self.assertEqual(read_status, 0)
        self.assertEqual(len(stdout_lines), 1)
        self.assertEqual(len(stderr_lines), 0)
        self.assertEqual(stdout_lines[0], six.b('10000,50005000'))

    def test_direct_qsql_self_join(self):
        # Saves N sequential numbers from stdin into a qsql file, then joins the stored
        # table with itself (no join condition) and checks the row count and both sums.
        disk_db_filename = self.random_tmp_filename('save-to-db','qsql')

        N = 100
        cmd = 'seq 1 %s | %s -t "select count(*),sum(c1) from -" -S %s' % (N,Q_EXECUTABLE,disk_db_filename)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)

        cmd = '%s -D, "select count(*),sum(a.c1),sum(b.c1) from %s:::data_stream_stdin a left join %s:::data_stream_stdin b"' % (Q_EXECUTABLE,disk_db_filename,disk_db_filename)
        retcode, o, e = run_command(cmd)

        # Each of the N rows is paired with all N rows, so the row count is N*N and every
        # value contributes N times to each sum. Both expectations are derived from N, so
        # that changing N does not silently break the count expectation.
        expected_row_count = N * N
        expected_sum = sum(range(1,N+1))*N

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),1)
        self.assertEqual(len(e),0)
        self.assertEqual(o[0],six.b('%s,%s,%s' % (expected_row_count,expected_sum,expected_sum)))


class CachingTests(AbstractQTestCase):

    def test_cache_empty_file(self):
        # Runs a query over a header-only file, first with `-C none` and then with
        # `-C readwrite`, and checks that the cache file written by the second run is a
        # readable sqlite file whose catalog points at an empty table.
        file_data = six.b("a,b,c")
        tmpfile = self.create_file_with_data(file_data)
        tmpfile_folder = os.path.dirname(tmpfile.name)
        tmpfile_filename = os.path.basename(tmpfile.name)
        tmpfile_expected_table_name = os.path.basename(tmpfile.name)
        expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C none' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],six.b("Warning - data is empty"))

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0],six.b("Warning - data is empty"))

        # After readwrite caching has been activated, the cache file is expected to exist
        self.assertTrue(os.path.exists(expected_cache_filename))

        # Read the cache file directly, to make sure it's a valid sqlite file
        db = sqlite3.connect(expected_cache_filename)
        try:
            table_list = db.execute("select content_signature_key,temp_table_name,content_signature,creation_time,source_type,source from _qcatalog where temp_table_name == '%s'" % (tmpfile_expected_table_name)).fetchall()
            # assertEqual reports the actual count on failure, unlike assertTrue(len(..) == 1)
            self.assertEqual(len(table_list), 1)
            table_metadata = table_list[0]
            # table_metadata[1] is the temp_table_name column of the catalog row
            results = db.execute("select * from %s" % table_metadata[1]).fetchall()
            self.assertEqual(len(results), 0)
        finally:
            # Release the sqlite handle even when an assertion above fails
            db.close()

        self.cleanup(tmpfile)

    def test_reading_the_wrong_cache__original_file_having_different_data(self):
        # Creates a cache with `-C readwrite`, overwrites the original file with an extra
        # row, then queries with `-C read`. The run is expected to fail with exit code 81
        # and a content signature mismatch message.
        file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")

        tmpfile1 = self.create_file_with_data(file_data1)
        tmpfile1_folder = os.path.dirname(tmpfile1.name)
        tmpfile1_filename = os.path.basename(tmpfile1.name)
        expected_cache_filename = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))

        # Ensure cache has been created
        self.assertTrue(os.path.exists(expected_cache_filename))

        # Overwrite the original file
        file_data2 = six.b("a,b,c\n10,20,30\n30,40,50\n50,60,70")
        self.write_file(tmpfile1.name,file_data2)

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 81)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        self.assertEqual(e[0], six.b('%s vs %s.qsql: Content Signatures differ at inferer.rows (actual analysis data differs)' % \
                                     (tmpfile1.name,tmpfile1.name)))


    def test_reading_the_wrong_cache__original_file_having_different_delimiter(self):
        # Creates a cache for a comma-delimited file, rewrites the original file as
        # tab-delimited, then queries it with -t and `-C read`. The run is expected to fail
        # with exit code 80 and a message about the input_delimiter signature difference.
        file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")

        tmpfile1 = self.create_file_with_data(file_data1)
        tmpfile1_folder = os.path.dirname(tmpfile1.name)
        tmpfile1_filename = os.path.basename(tmpfile1.name)
        expected_cache_filename = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))

        # Ensure cache has been created
        self.assertTrue(os.path.exists(expected_cache_filename))

        # Overwrite the original file, using a tab delimiter this time
        file_data2 = six.b("a\tb\tc\n10\t20\t30\n30\t40\t50")
        self.write_file(tmpfile1.name,file_data2)

        cmd = Q_EXECUTABLE + ' -H -t "select a from %s" -C read' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 80)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        x = six.b("%s vs %s.qsql: Content Signatures for table %s differ at input_delimiter (source value '\t' disk signature value ',')" % \
                                     (tmpfile1.name,tmpfile1.name,tmpfile1.name))
        self.assertEqual(e[0], x)

    def test_rename_cache_and_read_from_it(self):
        # Creates a file along with its qsql cache, renames the cache file, and checks
        # that the renamed qsql file can still be queried directly.
        file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")

        tmpfile1 = self.create_file_with_data(file_data1)
        tmpfile1_folder = os.path.dirname(tmpfile1.name)
        tmpfile1_filename = os.path.basename(tmpfile1.name)
        expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))
        # Ensure cache has been created
        self.assertTrue(os.path.exists(expected_cache_filename1))

        tmp_fn = self.generate_tmpfile_name("aa","qsql")
        os.rename(expected_cache_filename1,tmp_fn)

        # Query the renamed qsql file directly
        cmd = '%s "select a from %s"' % (Q_EXECUTABLE,tmp_fn)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))


    def test_reading_the_wrong_cache__qsql_file_not_having_a_matching_content_signature(self):
        # Creates two files with different headers, each with its own qsql cache, then
        # puts the second cache in place of the first one. Querying the first file with
        # `-C read` is expected to fail with exit code 80 and a header_row mismatch message.

        # create a file, along with its qsql
        file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")

        tmpfile1 = self.create_file_with_data(file_data1)
        tmpfile1_folder = os.path.dirname(tmpfile1.name)
        tmpfile1_filename = os.path.basename(tmpfile1.name)
        expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))
        # Ensure cache has been created
        self.assertTrue(os.path.exists(expected_cache_filename1))

        file_data2 = six.b("c,d,e\n10,20,30\n30,40,50")

        # create another file with a different header, along with its qsql
        tmpfile2 = self.create_file_with_data(file_data2)
        tmpfile2_folder = os.path.dirname(tmpfile2.name)
        tmpfile2_filename = os.path.basename(tmpfile2.name)
        expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select c from %s" -C readwrite' % tmpfile2.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))
        # Ensure cache has been created
        self.assertTrue(os.path.exists(expected_cache_filename2))

        # now take the second qsql file as if it was the first. Execution on file 1 should fail, since the qsql file
        # does not really contain the table we're after

        os.remove(expected_cache_filename1)
        os.rename(expected_cache_filename2,expected_cache_filename1)

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 80)
        self.assertEqual(len(o), 0)
        self.assertEqual(len(e), 1)
        x = six.b("%s vs %s.qsql: Content Signatures for table %s differ at inferer.header_row (source value '['a', 'b', 'c']' disk signature value '['c', 'd', 'e']')" % (tmpfile1.name,tmpfile1.name,tmpfile1.name))
        self.assertEqual(e[0], x)

    def test_reading_the_wrong_cache__qsql_file_not_having_any_content_signature(self):
        # Creates a file along with its qsql cache, empties the cache's _qcatalog table,
        # then queries the file with `-C read`. The run is expected to fail with exit
        # code 97, since the qsql file no longer holds a record of any table.

        # create a file, along with its qsql
        file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")

        tmpfile1 = self.create_file_with_data(file_data1)
        tmpfile1_folder = os.path.dirname(tmpfile1.name)
        tmpfile1_filename = os.path.basename(tmpfile1.name)
        expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0], six.b('10'))
        self.assertEqual(o[1], six.b('30'))
        # Ensure cache has been created
        self.assertTrue(os.path.exists(expected_cache_filename1))

        # delete qcatalog content, so no entries will be available
        c = sqlite3.connect(expected_cache_filename1)
        c.execute('delete from _qcatalog').fetchall()
        c.commit()
        c.close()

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 97)
        self.assertEqual(len(o),0)
        self.assertEqual(len(e),1)
        self.assertEqual(e[0],six.b("Could not autodetect table name in qsql file. File contains no record of a table"))


    def test_cache_full_flow(self):
        # Walks through the caching modes on a single file: `none` and `read` must not
        # create a cache file, `readwrite` must create one, and a later `read` run must
        # still return the same results while the cache file stays in place.
        file_data = six.b("a,b,c\n10,20,30\n30,40,50")
        tmpfile = self.create_file_with_data(file_data)
        tmpfile_folder = os.path.dirname(tmpfile.name)
        tmpfile_filename = os.path.basename(tmpfile.name)
        expected_tmpfile_table_name = tmpfile_filename
        expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C none' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0],six.b('10'))
        self.assertEqual(o[1],six.b('30'))

        # Ensure cache has not been created
        self.assertFalse(os.path.exists(expected_cache_filename))

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('10'))
        self.assertEqual(o[1],six.b('30'))

        # Ensure cache has not been created, as cache mode is "read" only
        self.assertFalse(os.path.exists(expected_cache_filename))

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('10'))
        self.assertEqual(o[1],six.b('30'))

        # After readwrite caching has been activated, the cache file is expected to exist
        self.assertTrue(os.path.exists(expected_cache_filename))

        # Read the cache file directly, to make sure it's a valid sqlite file
        db = sqlite3.connect(expected_cache_filename)
        try:
            table_list = db.execute("select content_signature_key,temp_table_name,content_signature,creation_time,source_type,source from _qcatalog where temp_table_name == '%s'" % expected_tmpfile_table_name).fetchall()
            # assertEqual reports the actual count on failure, unlike assertTrue(len(..) == 1)
            self.assertEqual(len(table_list), 1)
            table_metadata = table_list[0]
            # table_metadata[1] is the temp_table_name column of the catalog row
            results = db.execute("select * from %s" % table_metadata[1]).fetchall()
            self.assertEqual(results[0],(10,20,30))
            self.assertEqual(results[1],(30,40,50))
        finally:
            # Release the sqlite handle before the cache is used again by the next run
            db.close()

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('10'))
        self.assertEqual(o[1],six.b('30'))

        # The cache file written by the readwrite run is expected to still exist
        self.assertTrue(os.path.exists(expected_cache_filename))

        self.cleanup(tmpfile)

    def test_cache_full_flow_with_concatenated_files(self):
        # Runs a UNION ALL over two files with `-C readwrite` and checks both the
        # combined output and that a cache file has been written next to each input file.
        first_file = self.create_file_with_data(six.b("a,b,c\n10,11,12\n20,21,22"))
        first_folder, first_name = os.path.split(first_file.name)
        expected_cache_filename1 = os.path.join(first_folder, first_name + '.qsql')

        second_file = self.create_file_with_data(six.b("a,b,c\n30,31,32\n40,41,42"))
        second_folder, second_name = os.path.split(second_file.name)
        expected_cache_filename2 = os.path.join(second_folder, second_name + '.qsql')

        union_args = ' -O -H -d , "select * from (select * from %s UNION ALL select * from %s)" -C readwrite' % (first_file.name, second_file.name)
        retcode, o, e = run_command(Q_EXECUTABLE + union_args)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 5)
        self.assertEqual(len(e), 0)

        # Header line (because of -O), followed by the rows of both files in order
        expected_lines = [
            six.b('a,b,c'),
            six.b('10,11,12'),
            six.b('20,21,22'),
            six.b('30,31,32'),
            six.b('40,41,42'),
        ]
        for position, expected in enumerate(expected_lines):
            self.assertEqual(o[position], expected)

        for cache_filename in (expected_cache_filename1, expected_cache_filename2):
            self.assertTrue(os.path.exists(cache_filename))

        self.cleanup(first_file)
        self.cleanup(second_file)


    def test_analyze_result_with_cache_file(self):
        # Checks the -A output for a file in two situations: when its qsql cache exists
        # (source type `qsql-file-with-original`) and after the cache has been deleted
        # (source type `file`). The detected fields must be the same in both cases.
        file_data = six.b("a,b,c\n10,20,30\n30,40,50")
        tmpfile = self.create_file_with_data(file_data)
        tmpfile_folder = os.path.dirname(tmpfile.name)
        tmpfile_filename = os.path.basename(tmpfile.name)
        expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')

        # Ensure cache has not been created yet
        self.assertFalse(os.path.exists(expected_cache_filename))

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 2)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0],six.b('10'))
        self.assertEqual(o[1],six.b('30'))

        # Ensure cache is now created
        self.assertTrue(os.path.exists(expected_cache_filename))

        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),7)
        self.assertEqual(len(e),0)

        self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
        self.assertEqual(o[1],six.b('  Sources:'))
        self.assertEqual(o[2],six.b('    source_type: qsql-file-with-original source: %s.qsql' % tmpfile.name))
        self.assertEqual(o[3],six.b('  Fields:'))
        self.assertEqual(o[4],six.b('    `a` - int'))
        self.assertEqual(o[5],six.b('    `b` - int'))
        self.assertEqual(o[6],six.b('    `c` - int'))

        # delete the newly created cache
        os.remove(expected_cache_filename)

        # Now rerun the analysis without the cache file
        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o),7)
        self.assertEqual(len(e),0)

        self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
        self.assertEqual(o[1],six.b('  Sources:'))
        self.assertEqual(o[2],six.b('    source_type: file source: %s' % tmpfile.name))
        self.assertEqual(o[3],six.b('  Fields:'))
        self.assertEqual(o[4],six.b('    `a` - int'))
        self.assertEqual(o[5],six.b('    `b` - int'))
        self.assertEqual(o[6],six.b('    `c` - int'))

        self.cleanup(tmpfile)

    def test_partial_caching_exists(self):
        # Joins two files when only the first one has a qsql cache. With `-C read` the
        # query must succeed without creating a cache for the second file; with
        # `-C readwrite` the second file's cache must be written as well.
        file1_data = six.b("a,b,c\n10,20,30\n30,40,50\n60,70,80")
        tmpfile1 = self.create_file_with_data(file1_data)
        tmpfile1_folder = os.path.dirname(tmpfile1.name)
        tmpfile1_filename = os.path.basename(tmpfile1.name)
        expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')

        file2_data = six.b("b,x\n10,linewith10\n20,linewith20\n30,linewith30\n40,linewith40")
        tmpfile2 = self.create_file_with_data(file2_data)
        tmpfile2_folder = os.path.dirname(tmpfile2.name)
        tmpfile2_filename = os.path.basename(tmpfile2.name)
        expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')

        # Use only first file, and cache
        cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0)
        # assertEqual, not assertTrue: assertTrue(o[0], expected) would use the expected
        # value as the failure message and pass for any non-empty line
        self.assertEqual(o[0],six.b('10'))
        self.assertEqual(o[1],six.b('30'))
        # The third row was previously left unchecked
        self.assertEqual(o[2],six.b('60'))

        # Ensure cache has been created for file 1
        self.assertTrue(os.path.exists(expected_cache_filename1))

        # Use both files with read caching, one should be read from cache, the other from the file
        cmd = Q_EXECUTABLE + ' -H -d , "select file1.a,file1.b,file1.c,file2.x from %s file1 left join %s file2 on (file1.b = file2.b)" -C read' % (tmpfile1.name,tmpfile2.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('10,20,30,linewith20'))
        self.assertEqual(o[1],six.b('30,40,50,linewith40'))
        self.assertEqual(o[2],six.b('60,70,80,'))

        # Ensure cache has NOT been created for file 2
        self.assertFalse(os.path.exists(expected_cache_filename2))

        # Now rerun the query, this time with readwrite caching, so the second file cache will be written
        cmd = Q_EXECUTABLE + ' -H -d , "select file1.a,file1.b,file1.c,file2.x from %s file1 left join %s file2 on (file1.b = file2.b)" -C readwrite' % (tmpfile1.name,tmpfile2.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 3)
        self.assertEqual(len(e), 0)
        self.assertEqual(o[0],six.b('10,20,30,linewith20'))
        self.assertEqual(o[1],six.b('30,40,50,linewith40'))
        self.assertEqual(o[2],six.b('60,70,80,'))

        # Ensure cache has now been created for file 2
        self.assertTrue(os.path.exists(expected_cache_filename2))

        self.cleanup(tmpfile1)
        self.cleanup(tmpfile2)


class UserFunctionTests(AbstractQTestCase):
    def test_regexp_int_data_handling(self):
        """Filtering column c2 with regexp('^1', c2) must return exactly one row, '1'."""
        data_file = self.create_file_with_data(sample_data_no_header)

        query_part = ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        # A single output line holding the matched value, and nothing on stderr.
        self.assertEqual(out, [six.b("1")])
        self.assertEqual(len(err), 0)

        self.cleanup(data_file)

    def test_percentile_func(self):
        """percentile() at 0, 0.5 and 1 must give the min, median and max of each group.

        The input is `seq 1000 1999`, grouped by substr(c1,0,3); ten groups are expected.
        """
        cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
        retcode, out, err = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(out), 10)
        self.assertEqual(len(err), 0)

        # Every output line is "<label> <min> <median> <max>", separated by single spaces.
        rows = [line.split(six.b(" ")) for line in out]
        labels = [int(fields[0]) for fields in rows]
        lows = [float(fields[1]) for fields in rows]
        middles = [float(fields[2]) for fields in rows]
        highs = [float(fields[3]) for fields in rows]

        # First value of each expected group: 1000, 1100, ..., 1900.
        group_starts = list(range(1000,2000,100))

        self.assertEqual(labels, list(range(10,20)))
        self.assertEqual(lows, group_starts)
        self.assertEqual(middles, [start + 49.5 for start in group_starts])
        self.assertEqual(highs, [start + 99 for start in group_starts])

    def test_regexp_null_data_handling(self):
        """Counting rows where regexp('^', c2) matches must report 2 for the headerless sample data."""
        data_file = self.create_file_with_data(sample_data_no_header)

        query_part = ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        # One line carrying the count, and a silent stderr.
        self.assertEqual(out, [six.b("2")])
        self.assertEqual(len(err), 0)

        self.cleanup(data_file)

    def test_md5_function(self):
        """md5(c1,'utf-8') must produce the expected hex digest for each of the inputs 1..4."""
        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
        retcode, out, err = run_command(cmd)

        self.assertEqual(retcode,0)
        self.assertEqual(len(out),4)
        self.assertEqual(len(err),0)

        # (input value, expected digest) for every output line, in order.
        expected_pairs = [
            (six.b('1'),six.b('c4ca4238a0b923820dcc509a6f75849b')),
            (six.b('2'),six.b('c81e728d9d4c2f636f067f89cc14862c')),
            (six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3')),
            (six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c')),
        ]
        for line, expected in zip(out, expected_pairs):
            self.assertEqual(tuple(line.split(six.b(','),1)), expected)

    def test_stddev_functions(self):
        """stddev_pop() and stddev_sample(), rounded to 10 digits, must match the precomputed values."""
        samples = [234,354,3234,123,4234,234,634,56,65]
        data_file = self.create_file_with_data(six.b("\n".join(str(v) for v in samples)))

        cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,data_file.name)
        retcode, out, err = run_command(cmd)

        self.assertEqual(retcode,0)
        # One line: "<population stddev>,<sample stddev>".
        self.assertEqual(out, [six.b('1479.7015464838,1569.4604964764')])
        self.assertEqual(len(err),0)

        self.cleanup(data_file)

    def test_sqrt_function(self):
        """round(sqrt(c1),10) must return the expected roots for the inputs 1..5."""
        cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
        retcode, out, err = run_command(cmd)

        expected_roots = [
            six.b('1.0'),
            six.b('1.4142135624'),
            six.b('1.7320508076'),
            six.b('2.0'),
            six.b('2.2360679775'),
        ]

        self.assertEqual(retcode,0)
        self.assertEqual(out, expected_roots)
        self.assertEqual(len(err),0)

    def test_power_function(self):
        """round(power(c1,2.5),10) must return the expected powers for the inputs 1..5."""
        cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
        retcode, out, err = run_command(cmd)

        expected_powers = [
            six.b('1.0'),
            six.b('5.6568542495'),
            six.b('15.5884572681'),
            six.b('32.0'),
            six.b('55.9016994375'),
        ]

        self.assertEqual(retcode,0)
        self.assertEqual(out, expected_powers)
        self.assertEqual(len(err),0)

    def test_file_functions(self):
        """file_folder/file_ext/file_basename/file_basename_no_ext must split each piped path as expected."""
        # Paths fed through stdin, one per line; the last one is an empty string.
        paths = ["file1", "file2.csv", "/var/tmp/file3", "/var/tmp/file4.gz", ""]
        stdin_payload = "\n".join(paths)

        cmd = 'echo "%s" | %s -c 1 -d , "select file_folder(c1),file_ext(c1),file_basename(c1),file_basename_no_ext(c1) from -"' % (stdin_payload,Q_EXECUTABLE)
        retcode, out, err = run_command(cmd)

        # One "folder,ext,basename,basename_no_ext" line per input path.
        expected_lines = [
            b',,file1,file1',
            b',.csv,file2.csv,file2',
            b'/var/tmp,,file3,file3',
            b'/var/tmp,.gz,file4.gz,file4',
            b',,,'
        ]

        self.assertEqual(retcode,0)
        self.assertEqual(len(out),5)
        self.assertEqual(len(err),0)
        self.assertEqual(out, expected_lines)


    def test_sha1_function(self):
        """sha1(c1) must produce the expected hex digest for each of the inputs 1..4."""
        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1) from -"' % Q_EXECUTABLE
        retcode, out, err = run_command(cmd)

        # Full "<input>,<digest>" output lines, in input order.
        expected_lines = [
            six.b('1,356a192b7913b04c54574d18c28d46e6395428ab'),
            six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0'),
            six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb'),
            six.b('4,1b6453892473a467d07372d45eb05abc2031647a'),
        ]

        self.assertEqual(retcode,0)
        self.assertEqual(out, expected_lines)
        self.assertEqual(len(err),0)

    def test_regexp_extract_function(self):
        """regexp_extract() must return the requested capture group, and an empty field when the pattern does not match."""
        query = """
            select 
              regexp_extract('was ([0-9]+) seconds and ([0-9]+) ms',c1,0),
              regexp_extract('was ([0-9]+) seconds and ([0-9]+) ms',c1,1),
              regexp_extract('non-existent-(regexp)',c1,0) 
            from
              -
        """

        cmd = 'echo "Duration was 322 seconds and 240 ms" | %s -c 1 -d , "%s"' % (Q_EXECUTABLE,query)
        retcode, out, err = run_command(cmd)

        self.assertEqual(retcode,0)
        # Groups 0 and 1 of the matching pattern, then an empty third field.
        self.assertEqual(out, [six.b('322,240,')])
        self.assertEqual(len(err),0)

    def test_sha_function(self):
        """sha(c1,<1|224|256>,'utf-8') must produce the expected digests for the inputs 1..4."""
        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
        retcode, out, err = run_command(cmd)

        # Each line is "<input>,<sha1>,<sha224>,<sha256>".
        expected_lines = [
            six.b('1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b'),
            six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35'),
            six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce'),
            six.b('4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a'),
        ]

        self.assertEqual(retcode,0)
        self.assertEqual(out, expected_lines)
        self.assertEqual(len(err),0)


class MultiHeaderTests(AbstractQTestCase):
    def test_output_header_when_multiple_input_headers_exist(self):
        """A UNION ALL over several files with headers must emit one header line only.

        Five copies of the sample data are combined and ordered by name, so each
        sample row is expected five times in a row after the single header line.
        """
        file_count = 5
        data_files = [self.create_file_with_data(sample_data_with_header) for _ in range(file_count)]

        union_query = " UNION ALL ".join(["select * from %s" % f.name for f in data_files])

        cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (%s) order by name" -H -O' % union_query
        retcode, out, err = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(out), file_count*3+1)
        self.assertEqual(out[0], six.b("name,value1,value2"))

        # Line 0 is the header; block k (0..2) of file_count lines repeats sample row k.
        for row_index in range(3):
            block_start = file_count*row_index+1
            for offset in range(file_count):
                self.assertEqual(out[block_start+offset], sample_data_rows[row_index])

        # The header text must not leak into any data line.
        for line in out[1:]:
            self.assertNotIn(six.b('name'), line)

        for data_file in data_files:
            self.cleanup(data_file)

    def test_output_header_when_extra_header_column_names_are_different__concatenation_replacement(self):
        """A UNION ALL of two files whose first header column differs must use the first file's header.

        The second file's header starts with 'othername' instead of 'name'; the
        output is expected to carry the 'name,value1,value2' header followed by
        every sample row twice, ordered by name.
        """
        tmpfile1 = self.create_file_with_data(sample_data_with_header)
        tmpfile2 = self.create_file_with_data(generate_sample_data_with_header(six.b('othername,value1,value2')))

        cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (select * from %s union all select * from %s) order by name" -H -O' % (tmpfile1.name,tmpfile2.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 7)
        self.assertEqual(len(e), 0)
        # assertEqual (not assertTrue): assertTrue would take the expected list as its
        # failure message and pass for any non-empty output, checking nothing.
        self.assertEqual(o, [
            six.b('name,value1,value2'),
            six.b('a,1,0'),
            six.b('a,1,0'),
            six.b('b,2,0'),
            six.b('b,2,0'),
            six.b('c,,0'),
            six.b('c,,0')
        ])

        self.cleanup(tmpfile1)
        self.cleanup(tmpfile2)

    def test_output_header_when_extra_header_has_different_number_of_columns(self):
        """A UNION ALL where the second file's header has fewer columns must still use the first file's header.

        The second file's header is 'name,value1' (two names only); the output is
        expected to carry the 'name,value1,value2' header followed by every sample
        row twice, ordered by name.
        """
        tmpfile1 = self.create_file_with_data(sample_data_with_header)
        tmpfile2 = self.create_file_with_data(generate_sample_data_with_header(six.b('name,value1')))

        cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (select * from %s UNION ALL select * from %s) order by name" -H -O' % (tmpfile1.name,tmpfile2.name)
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(o), 7)
        self.assertEqual(len(e), 0)
        # assertEqual (not assertTrue): assertTrue would take the expected list as its
        # failure message and pass for any non-empty output, checking nothing.
        self.assertEqual(o, [
            six.b('name,value1,value2'),
            six.b('a,1,0'),
            six.b('a,1,0'),
            six.b('b,2,0'),
            six.b('b,2,0'),
            six.b('c,,0'),
            six.b('c,,0')
        ])

        self.cleanup(tmpfile1)
        self.cleanup(tmpfile2)


class ParsingModeTests(AbstractQTestCase):

    def test_strict_mode_column_count_mismatch_error(self):
        """Strict mode on the uneven ls output must fail with a single column-count error line."""
        data_file = self.create_file_with_data(uneven_ls_output)
        query_part = ' -m strict "select count(*) from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(out), 0)
        self.assertEqual(len(err), 1)

        # Only a fragment of the message is pinned, not the whole line.
        self.assertIn(six.b("Column Count is expected to identical"), err[0])

        self.cleanup(data_file)

    def test_strict_mode_too_large_specific_column_count(self):
        """Strict mode with -c 4 must fail, reporting that 4 columns were expected but 3 were found."""
        data_file = self.create_file_with_data(sample_data_no_header)
        query_part = ' -d , -m strict -c 4 "select count(*) from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(out), 0)
        # Exactly one stderr line, carrying the full error message.
        self.assertEqual(err, [six.b("Strict mode. Column count is expected to be 4 but is 3")])

        self.cleanup(data_file)

    def test_strict_mode_too_small_specific_column_count(self):
        """Strict mode with -c 2 must fail, reporting that 2 columns were expected but 3 were found."""
        data_file = self.create_file_with_data(sample_data_no_header)
        query_part = ' -d , -m strict -c 2 "select count(*) from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(out), 0)
        # Exactly one stderr line, carrying the full error message.
        self.assertEqual(err, [six.b("Strict mode. Column count is expected to be 2 but is 3")])

        self.cleanup(data_file)

    def test_relaxed_mode_missing_columns_in_header(self):
        """In relaxed mode, -A must list `name`, `value1` and a generated `c3` column for a short header."""
        data_file = self.create_file_with_data(
            sample_data_with_missing_header_names)
        query_part = ' -d , -m relaxed "select count(*) from %s" -H -A' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        # Full analysis output: table name, its source, then one line per detected field.
        expected_analysis = [
            six.b('Table: %s' % data_file.name),
            six.b('  Sources:'),
            six.b('    source_type: file source: %s') % six.b(data_file.name),
            six.b('  Fields:'),
            six.b('    `name` - text'),
            six.b('    `value1` - int'),
            six.b('    `c3` - int'),
        ]

        self.assertEqual(retcode, 0)
        self.assertEqual(len(out), 7)
        self.assertEqual(len(err), 0)
        self.assertEqual(out, expected_analysis)

        self.cleanup(data_file)

    def test_strict_mode_missing_columns_in_header(self):
        """In strict mode, a header with fewer names than data columns must fail with a single error line."""
        data_file = self.create_file_with_data(
            sample_data_with_missing_header_names)
        query_part = ' -d , -m strict "select count(*) from %s" -H -A' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertNotEqual(retcode, 0)
        self.assertEqual(len(out), 0)
        # Exactly one stderr line, carrying the full error message.
        self.assertEqual(
            err, [six.b('Strict mode. Header row contains less columns than expected column count(2 vs 3)')])

        self.cleanup(data_file)

    def test_output_delimiter_with_missing_fields(self):
        """-D ";" must be used between all output fields, including around an empty one."""
        data_file = self.create_file_with_data(sample_data_no_header)
        query_part = ' -d , "select * from %s" -D ";"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        # The third row has an empty middle field, giving two adjacent delimiters.
        self.assertEqual(out, [six.b('a;1;0'), six.b('b;2;0'), six.b('c;;0')])
        self.assertEqual(len(err), 0)

        self.cleanup(data_file)

    def test_handling_of_null_integers(self):
        """avg(c2) over the headerless sample data must be reported as 1.5."""
        data_file = self.create_file_with_data(sample_data_no_header)
        query_part = ' -d , "select avg(c2) from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        # A single output line with the average, and nothing on stderr.
        self.assertEqual(out, [six.b('1.5')])
        self.assertEqual(len(err), 0)

        self.cleanup(data_file)

    def test_empty_integer_values_converted_to_null(self):
        """Selecting rows where c2 is null must return only the 'c,,0' row."""
        data_file = self.create_file_with_data(sample_data_no_header)
        query_part = ' -d , "select * from %s where c2 is null"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        # Only the row with the empty second field is expected.
        self.assertEqual(out, [six.b('c,,0')])
        self.assertEqual(len(err), 0)

        self.cleanup(data_file)

    def test_empty_string_values_not_converted_to_null(self):
        """Comparing c2 with the empty string literal must return only the 'c,,0' row."""
        data_file = self.create_file_with_data(
            sample_data_with_empty_string_no_header)
        empty_sql_string = "''"
        query_part = ' -d , "select * from %s where c2 == %s"' % (
            data_file.name, empty_sql_string)
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        # Only the row whose second field equals '' is expected.
        self.assertEqual(out, [six.b('c,,0')])
        self.assertEqual(len(err), 0)

        self.cleanup(data_file)

    def test_relaxed_mode_detected_columns(self):
        """In relaxed mode, -A on the uneven ls output must report 11 columns, c1..c11, with the expected types."""
        tmpfile = self.create_file_with_data(uneven_ls_output)
        cmd = Q_EXECUTABLE + ' -m relaxed "select count(*) from %s" -A' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)

        # Everything after the "  Fields:" line is one "`name` - type" entry per column.
        column_rows = o[o.index(six.b('  Fields:'))+1:]

        self.assertEqual(len(column_rows), 11)

        # Splitting "`cN` - type" on spaces gives [name, '-', type].
        column_tuples = [x.strip().split(six.b(" ")) for x in column_rows]
        column_names = [x[0] for x in column_tuples]
        column_types = [x[2] for x in column_tuples]

        expected_types = ['text', 'int', 'text', 'text', 'int', 'text', 'int', 'int', 'text', 'text', 'text']

        self.assertEqual(column_names, [six.b('`c{}`'.format(x)) for x in range(1, 12)])
        self.assertEqual(column_types, [six.b(t) for t in expected_types])

        self.cleanup(tmpfile)

    def test_relaxed_mode_detected_columns_with_specific_column_count(self):
        """In relaxed mode with -c 9, -A on the uneven ls output must report 9 columns, c1..c9, with the expected types."""
        tmpfile = self.create_file_with_data(uneven_ls_output)
        cmd = Q_EXECUTABLE + ' -m relaxed "select count(*) from %s" -A -c 9' % tmpfile.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)

        # Everything after the "  Fields:" line is one "`name` - type" entry per column.
        column_rows = o[o.index(six.b('  Fields:'))+1:]

        self.assertEqual(len(column_rows), 9)

        # Splitting "`cN` - type" on spaces gives [name, '-', type].
        column_tuples = [x.strip().split(six.b(" ")) for x in column_rows]
        column_names = [x[0] for x in column_tuples]
        column_types = [x[2] for x in column_tuples]

        expected_types = ['text', 'int', 'text', 'text', 'int', 'text', 'int', 'int', 'text']

        self.assertEqual(column_names, [six.b('`c{}`'.format(x)) for x in range(1, 10)])
        self.assertEqual(column_types, [six.b(t) for t in expected_types])

        self.cleanup(tmpfile)

    def test_relaxed_mode_last_column_data_with_specific_column_count(self):
        """In relaxed mode with -c 9, c9 must hold the whole remainder of lines that have extra fields."""
        data_file = self.create_file_with_data(uneven_ls_output)
        query_part = ' -m relaxed "select c9 from %s" -c 9' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(out), 9)
        self.assertEqual(len(err), 0)

        # Values containing spaces are expected to come back wrapped in double quotes.
        expected_values = [
            "/selinux", "/mnt", "/srv", "/lost+found",
            '"/initrd.img.old -> /boot/initrd.img-3.8.0-19-generic"',
            "/cdrom", "/home",
            '"/vmlinuz -> boot/vmlinuz-3.8.0-19-generic"',
            '"/initrd.img -> boot/initrd.img-3.8.0-19-generic"',
        ]

        self.assertEqual(out, [six.b(value) for value in expected_values])

        self.cleanup(data_file)

    def test_1_column_warning_in_relaxed_mode(self):
        """Selecting c1 from single-column data in relaxed mode must succeed with nothing on stderr."""
        data_file = self.create_file_with_data(one_column_data)
        query_part = ' -m relaxed "select c1 from %s" -d ,' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(err), 0)
        # Both input lines come back unchanged.
        self.assertEqual(out, [six.b('data without commas 1'), six.b('data without commas 2')])

        self.cleanup(data_file)

    def test_1_column_warning_in_strict_mode(self):
        """Selecting c1 from single-column data must succeed with nothing on stderr.

        The command line passes both '-m relaxed' and a later '-m strict'.
        """
        data_file = self.create_file_with_data(one_column_data)
        query_part = ' -m relaxed "select c1 from %s" -d , -m strict' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(err), 0)
        # Both input lines come back unchanged.
        self.assertEqual(out, [six.b('data without commas 1'), six.b('data without commas 2')])

        self.cleanup(data_file)


    def test_1_column_warning_suppression_in_relaxed_mode_when_column_count_is_specific(self):
        """With an explicit '-c 1' in relaxed mode, single-column data must be read with nothing on stderr."""
        data_file = self.create_file_with_data(one_column_data)
        query_part = ' -m relaxed "select c1 from %s" -d , -m relaxed -c 1' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(err), 0)
        # Both input lines come back unchanged.
        self.assertEqual(out, [six.b('data without commas 1'), six.b('data without commas 2')])

        self.cleanup(data_file)

    def test_1_column_warning_suppression_in_strict_mode_when_column_count_is_specific(self):
        """With an explicit '-c 1' and a trailing '-m strict', single-column data must be read with nothing on stderr."""
        data_file = self.create_file_with_data(one_column_data)
        query_part = ' -m relaxed "select c1 from %s" -d , -m strict -c 1' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(err), 0)
        # Both input lines come back unchanged.
        self.assertEqual(out, [six.b('data without commas 1'), six.b('data without commas 2')])

        self.cleanup(data_file)

    def test_fluffy_mode__as_relaxed_mode(self):
        """In relaxed mode without -c, c9 of the uneven ls output must hold just the path field."""
        data_file = self.create_file_with_data(uneven_ls_output)
        query_part = ' -m relaxed "select c9 from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(out), 9)
        self.assertEqual(len(err), 0)

        expected_paths = [
            "/selinux", "/mnt", "/srv", "/lost+found",
            "/initrd.img.old", "/cdrom", "/home", "/vmlinuz", "/initrd.img",
        ]

        self.assertEqual(out, [six.b(path) for path in expected_paths])

        self.cleanup(data_file)

    def test_relaxed_mode_column_count_mismatch__was_previously_fluffy_mode_test(self):
        """In relaxed mode, a row with one extra field must have the surplus merged into the last column."""
        regular_row = six.b("column1 column2 column3 column4")
        rows = [regular_row] * 1000
        # A single row, far into the file, carries a fifth field.
        rows[950] = six.b("column1 column2 column3 column4 column5")
        data_file = self.create_file_with_data(six.b("\n").join(rows))

        query_part = ' -m relaxed "select * from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode,0)
        self.assertEqual(len(out),1000)
        self.assertEqual(len(err),0)
        # The two trailing fields are expected back as one quoted value.
        self.assertEqual(out[950],six.b('column1 column2 column3 "column4 column5"'))

        self.cleanup(data_file)

    def test_strict_mode_column_count_mismatch__less_columns(self):
        """In strict mode, a row with a missing field must abort with an error naming row 751."""
        regular_row = six.b("column1 column2 column3 column4")
        rows = [regular_row] * 1000
        # Index 750 (reported as row 751) has three fields instead of four.
        rows[750] = six.b("column1 column3 column4")
        data_file = self.create_file_with_data(six.b("\n").join(rows))

        query_part = ' -m strict "select * from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertNotEqual(retcode,0)
        self.assertEqual(len(out),0)
        self.assertEqual(len(err),1)

        error_line = err[0]
        self.assertTrue(error_line.startswith(six.b("Strict mode - Expected 4 columns instead of 3 columns")))
        self.assertIn(six.b(' row 751.'), error_line)

        self.cleanup(data_file)

    def test_strict_mode_column_count_mismatch__more_columns(self):
        """In strict mode, a row with an extra field must abort with an error naming row 751."""
        regular_row = six.b("column1 column2 column3 column4")
        rows = [regular_row] * 1000
        # Index 750 (reported as row 751) has five fields instead of four.
        rows[750] = six.b("column1 column2 column3 column4 column5")
        data_file = self.create_file_with_data(six.b("\n").join(rows))

        query_part = ' -m strict "select * from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertNotEqual(retcode,0)
        self.assertEqual(len(out),0)
        self.assertEqual(len(err),1)

        error_line = err[0]
        self.assertTrue(error_line.startswith(six.b("Strict mode - Expected 4 columns instead of 5 columns")))
        self.assertIn(six.b(' row 751.'), error_line)

        self.cleanup(data_file)


class FormattingTests(AbstractQTestCase):

    def test_column_formatting(self):
        """-f with '{:4.3f}' format specs must render sum and avg of 1..10 as '55.000 5.500'."""
        # TODO Decide if this breaking change is reasonable
        #cmd = 'seq 1 10 | ' + Q_EXECUTABLE + ' -f 1=%4.3f,2=%4.3f "select sum(c1),avg(c1) from -" -c 1'
        pipeline = 'seq 1 10 | ' + Q_EXECUTABLE + ' -f 1={:4.3f},2={:4.3f} "select sum(c1),avg(c1) from -" -c 1'

        retcode, out, err = run_command(pipeline)

        self.assertEqual(retcode, 0)
        # A single formatted line, and nothing on stderr.
        self.assertEqual(out, [six.b('55.000 5.500')])
        self.assertEqual(len(err), 0)

    def test_column_formatting_with_output_header(self):
        """-f formatting combined with -H -O must emit the alias header line followed by the formatted values."""
        # The perl substitution inserts a 'column_name' line ahead of the first '1' line of the seq output.
        perl_regex = "'s/1\n/column_name\n1\n/;'"
        # TODO Decide if this breaking change is reasonable
        #cmd = 'seq 1 10 | perl -pe ' + perl_regex + ' | ' + Q_EXECUTABLE + ' -f 1=%4.3f,2=%4.3f "select sum(column_name) mysum,avg(column_name) myavg from -" -c 1 -H -O'
        pipeline = 'seq 1 10 | LANG=C perl -pe ' + perl_regex + ' | ' + Q_EXECUTABLE + ' -f 1={:4.3f},2={:4.3f} "select sum(column_name) mysum,avg(column_name) myavg from -" -c 1 -H -O'

        retcode, out, err = run_command(pipeline)

        self.assertEqual(retcode, 0)
        # Header line with the aliases, then the formatted data line.
        self.assertEqual(out, [six.b('mysum myavg'), six.b('55.000 5.500')])
        self.assertEqual(len(err), 0)

    def py3_test_successfuly_parse_universal_newlines_without_explicit_flag(self):
        """Data separated by bare carriage returns must be parsed into six rows without passing -U."""
        expected_rows = [
            ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
            ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
            ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
            ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
            ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
            ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a'],
        ]
        # The command output is bytes, so the expectation is converted cell by cell.
        expected_output = [[six.b(cell) for cell in row] for row in expected_rows]

        # Header plus six records, all separated by '\r' only.
        data = six.b('permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a')
        data_file = self.create_file_with_data(data)

        query_part = ' -d , -H "select * from %s"' % data_file.name
        retcode, out, err = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(err), 0)
        self.assertEqual(len(out), 6)

        actual_output = [line.split(six.b(",")) for line in out]

        self.assertEqual(actual_output,expected_output)

        self.cleanup(data_file)

    test_parsing_universal_newlines_without_explicit_flag = py3_test_successfuly_parse_universal_newlines_without_explicit_flag

    def test_universal_newlines_parsing_flag(self):
        """Data separated by bare carriage returns must be parsed into six rows when -U is passed.

        One or two stderr lines are tolerated, provided the first one is the
        'U' mode DeprecationWarning; any other stderr output fails the test.
        """
        def list_as_byte_list(l):
            return [six.b(x) for x in l]

        expected_output = [list_as_byte_list(row) for row in [
            ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
            ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
            ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
            ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
            ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
            ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a'],
        ]]

        # Header plus six records, all separated by '\r' only.
        data = six.b('permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a')
        tmp_data_file = self.create_file_with_data(data)

        cmd = Q_EXECUTABLE + ' -d , -H -U "select permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode,0)

        if len(e) in (1, 2):
            # In python 3.7, there's a deprecation warning for the 'U' file opening mode, which is ok for now
            self.assertIn(b"DeprecationWarning: 'U' mode is deprecated", e[0])
        elif len(e) != 0:
            # Nothing should be output to stderr in other versions
            self.fail('Unidentified output in stderr')

        self.assertEqual(len(o), 6)

        actual_output = [row.split(six.b(",")) for row in o]

        self.assertEqual(actual_output,expected_output)

        self.cleanup(tmp_data_file)


class SqlTests(AbstractQTestCase):

    def test_find_example(self):
        # Groups the `find_output` fixture by columns c5,c6, sums c7 scaled
        # down by 1024*1024, and expects three rows ordered by descending total.
        listing_file = self.create_file_with_data(find_output)
        query_part = ' "select c5,c6,sum(c7)/1024.0/1024 as total from %s group by c5,c6 order by total desc"' % listing_file.name
        retcode, stdout_lines, stderr_lines = run_command(Q_EXECUTABLE + query_part)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stdout_lines), 3)
        self.assertEqual(len(stderr_lines), 0)

        expected_lines = [
            six.b('mapred mapred 0.9389581680297852'),
            six.b('root root 0.02734375'),
            six.b('harel harel 0.010888099670410156'),
        ]
        for position, expected_line in enumerate(expected_lines):
            self.assertEqual(stdout_lines[position], expected_line)

        self.cleanup(listing_file)

    def test_join_example(self):
        # Joins the two example data files on myfiles.c4 = emails.c1 and
        # expects exactly the two rows whose myfiles.c8 value is 'ppp'.
        join_query = ' "select myfiles.c8,emails.c2 from {0}/exampledatafile myfiles join {0}/group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = \'ppp\'"'.format(EXAMPLES)
        retcode, stdout_lines, _stderr_lines = run_command(Q_EXECUTABLE + join_query)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stdout_lines), 2)

        expected_lines = (
            six.b('ppp dip.1@otherdomain.com'),
            six.b('ppp dip.2@otherdomain.com'),
        )
        for position, expected_line in enumerate(expected_lines):
            self.assertEqual(stdout_lines[position], expected_line)

    def test_join_example_with_output_header(self):
        # Same join as test_join_example, but run with -O and with column
        # aliases, so the first output line must be the 'aaa bbb' header.
        join_query = ' -O "select myfiles.c8 aaa,emails.c2 bbb from {0}/exampledatafile myfiles join {0}/group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = \'ppp\'"'.format(EXAMPLES)
        retcode, stdout_lines, _stderr_lines = run_command(Q_EXECUTABLE + join_query)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stdout_lines), 3)

        expected_lines = (
            six.b('aaa bbb'),
            six.b('ppp dip.1@otherdomain.com'),
            six.b('ppp dip.2@otherdomain.com'),
        )
        for position, expected_line in enumerate(expected_lines):
            self.assertEqual(stdout_lines[position], expected_line)

    def test_self_join1(self):
        # Self-join on c1, which holds a distinct value (0..9) on every row,
        # so every row matches only itself and ten rows come back.
        rows = [six.b("{} 9000".format(i)) for i in range(0,10)]
        data_file = self.create_file_with_data(six.b("\n").join(rows))
        file_name = data_file.name
        cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c1 = a2.c1)"' % (file_name,file_name)
        retcode, stdout_lines, stderr_lines = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stderr_lines), 0)
        self.assertEqual(len(stdout_lines), 10)

        self.cleanup(data_file)

    def test_self_join_reuses_table(self):
        # Runs a self-join with -A and expects the analysis output to list
        # the file exactly once, as a single table with two int columns.
        rows = [six.b("{} 9000".format(i)) for i in range(0,10)]
        data_file = self.create_file_with_data(six.b("\n").join(rows))
        file_name = data_file.name
        cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c1 = a2.c1)" -A' % (file_name,file_name)
        retcode, stdout_lines, stderr_lines = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stderr_lines), 0)
        self.assertEqual(len(stdout_lines), 6)

        expected_lines = [
            six.b('Table: %s' % file_name),
            six.b('  Sources:'),
            six.b('    source_type: file source: %s') % six.b(file_name),
            six.b('  Fields:'),
            six.b('    `c1` - int'),
            six.b('    `c2` - int'),
        ]
        for actual_line, expected_line in zip(stdout_lines, expected_lines):
            self.assertEqual(actual_line, expected_line)

        self.cleanup(data_file)

    def test_self_join2(self):
        # Self-joins on c2, which is the constant 9000 on every row, so the
        # result is the full cross product: 10*10 rows for a two-way join and
        # 10*10*10 rows for a three-way join.
        row_count = 10

        two_way_rows = [six.b("{} 9000".format(i)) for i in range(0,10)]
        two_way_file = self.create_file_with_data(six.b("\n").join(two_way_rows))
        two_way_name = two_way_file.name
        cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c2 = a2.c2)"' % (two_way_name,two_way_name)
        retcode, stdout_lines, stderr_lines = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stderr_lines), 0)
        self.assertEqual(len(stdout_lines), row_count*row_count)

        self.cleanup(two_way_file)

        three_way_rows = [six.b("{} 9000".format(i)) for i in range(0,10)]
        three_way_file = self.create_file_with_data(six.b("\n").join(three_way_rows))
        three_way_name = three_way_file.name
        cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c2 = a2.c2) join %s a3 on (a1.c2 = a3.c2)"' % (three_way_name,three_way_name,three_way_name)
        retcode, stdout_lines, stderr_lines = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(stderr_lines), 0)
        self.assertEqual(len(stdout_lines), row_count*row_count*row_count)

        self.cleanup(three_way_file)

    def test_disable_column_type_detection(self):
        tmpfile = self.create_file_with_data(six.b('''regular_text,text_with_digits1,text_with_digits2,float_number
"regular text 1",67,"67",12.3
"regular text 2",067,"067",22.3
"regular text 3",123,"123",33.4
"regular text 4",-123,"-123",0122.2
'''))

        # Check original column type detection
        cmd = Q_EXECUTABLE + ' -A -d , -H "select * from %s"' % (tmpfile.name)

        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 8)

        self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
        self.assertEqual(o[1], six.b('  Sources:'))
        self.assertEqual(o[2], six.b('    source_type: file source: %s') % six.b(tmpfile.name))
        self.assertEqual(o[3], six.b('  Fields:'))
        self.assertEqual(o[4], six.b('    `regular_text` - text'))
        self.assertEqual(o[5], six.b('    `text_with_digits1` - int'))
        self.assertEqual(o[6], six.b('    `text_with_digits2` - int'))
        self.assertEqual(o[7], six.b('    `float_number` - real'))

        # Check column types detected when actual detection is disabled
        cmd = Q_EXECUTABLE + ' -A -d , -H --as-text "select * from %s"' % (tmpfile.name)

        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 8)

        self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
        self.assertEqual(o[1],six.b('  Sources:'))
        self.assertEqual(o[2],six.b('    source_type: file source: %s') % six.b(tmpfile.name))
        self.assertEqual(o[3],six.b('  Fields:'))
        self.assertEqual(o[4],six.b('    `regular_text` - text'))
        self.assertEqual(o[5],six.b('    `text_with_digits1` - text'))
        self.assertEqual(o[6],six.b('    `text_with_digits2` - text'))
        self.assertEqual(o[7],six.b('    `float_number` - text'))

        # Get actual data with regular detection
        cmd = Q_EXECUTABLE + ' -d , -H "select * from %s"' % (tmpfile.name)

        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        self.assertEqual(o[0],six.b("regular text 1,67,67,12.3"))
        self.assertEqual(o[1],six.b("regular text 2,67,67,22.3"))
        self.assertEqual(o[2],six.b("regular text 3,123,123,33.4"))
        self.assertEqual(o[3],six.b("regular text 4,-123,-123,122.2"))

        # Get actual data without detection
        cmd = Q_EXECUTABLE + ' -d , -H --as-text "select * from %s"' % (tmpfile.name)

        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        self.assertEqual(o[0],six.b("regular text 1,67,67,12.3"))
        self.assertEqual(o[1],six.b("regular text 2,067,067,22.3"))
        self.assertEqual(o[2],six.b("regular text 3,123,123,33.4"))
        self.assertEqual(o[3],six.b("regular text 4,-123,-123,0122.2"))

        self.cleanup(tmpfile)


class BasicModuleTests(AbstractQTestCase):

    def test_engine_isolation(self):
        # Uses two separate QTextAsData engines and checks that each one
        # reports a file as newly loaded the first time it queries it,
        # regardless of what the other engine has already done.
        abc_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        def_file = self.create_file_with_data(six.b("d e f\n10 20 30\n40 50 60"))
        abc_name = abc_file.name
        def_name = def_file.name

        # Run file 1 on engine 1
        engine1 = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
        result1 = engine1.execute('select * from %s' % abc_name)
        print("QueryQuery",file=sys.stdout)

        self.assertTrue(result1.status == 'ok')
        self.assertEqual(len(result1.warnings),0)
        self.assertEqual(len(result1.data),2)
        self.assertEqual(result1.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(result1.data,[(1,2,3),(4,5,6)])
        self.assertTrue(abc_name in result1.metadata.table_structures)
        self.assertTrue(abc_name in result1.metadata.new_table_structures)
        structure1 = result1.metadata.table_structures[abc_name]
        self.assertEqual(structure1.atomic_fns,[abc_name])
        self.assertEqual(structure1.source_type,'file')
        self.assertEqual(structure1.source,abc_name)

        # run file 1 on engine 2
        engine2 = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
        result2 = engine2.execute('select * from %s' % abc_name)
        print("QueryQuery",file=sys.stdout)

        self.assertTrue(result2.status == 'ok')
        self.assertEqual(len(result2.warnings),0)
        self.assertEqual(len(result2.data),2)
        self.assertEqual(result2.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(result2.data,[(1,2,3),(4,5,6)])
        self.assertTrue(abc_name in result2.metadata.table_structures)
        self.assertTrue(abc_name in result2.metadata.new_table_structures)
        structure2 = result2.metadata.table_structures[abc_name]
        self.assertEqual(structure2.atomic_fns,[abc_name])
        self.assertEqual(structure2.source_type,'file')
        self.assertEqual(structure2.source,abc_name)

        # run file 2 on engine 1
        result3 = engine1.execute('select * from %s' % def_name)
        print("QueryQuery",file=sys.stdout)

        print(result3)
        self.assertTrue(result3.status == 'ok')
        self.assertEqual(len(result3.warnings),0)
        self.assertEqual(len(result3.data),2)
        self.assertEqual(result3.metadata.output_column_name_list,['d','e','f'])
        self.assertEqual(result3.data,[(10,20,30),(40,50,60)])
        self.assertTrue(def_name in result3.metadata.table_structures)
        self.assertTrue(def_name in result3.metadata.new_table_structures)
        structure3 = result3.metadata.table_structures[def_name]
        self.assertEqual(structure3.atomic_fns,[def_name])
        self.assertEqual(structure3.source,def_name)
        self.assertEqual(structure3.source_type,'file')

        engine1.done()
        engine2.done()

        self.cleanup(abc_file)
        self.cleanup(def_file)

    def test_simple_query(self):
        # Executes a plain select through the module API and checks the
        # returned data, column names and table-structure metadata.
        data_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        file_name = data_file.name

        engine = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
        result = engine.execute('select * from %s' % file_name)

        self.assertTrue(result.status == 'ok')
        self.assertEqual(len(result.warnings),0)
        self.assertEqual(len(result.data),2)
        self.assertEqual(result.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(result.data,[(1,2,3),(4,5,6)])
        self.assertTrue(file_name in result.metadata.table_structures)
        self.assertTrue(file_name in result.metadata.new_table_structures)
        structure = result.metadata.table_structures[file_name]
        self.assertEqual(structure.atomic_fns,[file_name])
        self.assertEqual(structure.source_type,'file')
        self.assertEqual(structure.source,file_name)

        engine.done()
        self.cleanup(data_file)

    def test_loaded_data_reuse(self):
        # Runs the same query twice on one engine. The first run must report
        # the file as a newly loaded table; the second run must reuse it (the
        # file is known but not "new") and return the same data and columns.
        tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))

        q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
        r1 = q.execute('select * from %s' % tmpfile.name)

        r2 = q.execute('select * from %s' % tmpfile.name)

        self.assertTrue(r1.status == 'ok')
        self.assertEqual(len(r1.warnings),0)
        self.assertEqual(len(r1.data),2)
        self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
        self.assertTrue(tmpfile.name in r1.metadata.table_structures)
        self.assertTrue(tmpfile.name in r1.metadata.new_table_structures)
        self.assertEqual(r1.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
        self.assertEqual(r1.metadata.table_structures[tmpfile.name].source_type,'file')
        self.assertEqual(r1.metadata.table_structures[tmpfile.name].source,tmpfile.name)

        self.assertTrue(r2.status == 'ok')
        self.assertTrue(tmpfile.name in r2.metadata.table_structures)
        self.assertTrue(tmpfile.name not in r2.metadata.new_table_structures)
        self.assertEqual(r2.data,r1.data)
        # Compare the second run against the first one; comparing r2 with
        # itself could never fail.
        self.assertEqual(r2.metadata.output_column_name_list,r1.metadata.output_column_name_list)
        self.assertEqual(len(r2.warnings),0)

        q.done()

        self.cleanup(tmpfile)

    def test_stdin_injection(self):
        # Registers a file-backed stream under the table name '-' and checks
        # that querying '-' reads it and reports it as a 'data-stream' source
        # named 'stdin'.
        backing_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))

        stdin_stream = DataStream('stdin','-',codecs.open(backing_file.name,'rb',encoding='utf-8'))
        streams = {'-': stdin_stream}
        engine = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=streams)
        result = engine.execute('select * from -')

        self.assertTrue(result.status == 'ok')
        self.assertEqual(len(result.warnings),0)
        self.assertEqual(len(result.data),2)
        self.assertEqual(result.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(result.data,[(1,2,3),(4,5,6)])
        self.assertEqual(result.metadata.new_table_structures['-'],result.metadata.table_structures['-'])
        structure = result.metadata.table_structures['-']
        self.assertEqual(structure.column_names,['a','b','c'])
        self.assertEqual(structure.python_column_types,[int,int,int])
        self.assertEqual(structure.sqlite_column_types,['int','int','int'])
        self.assertEqual(structure.source_type,'data-stream')
        self.assertEqual(structure.source,'stdin')

        engine.done()
        self.cleanup(backing_file)

    def test_named_stdin_injection(self):
        # Registers a data stream under a custom table name and checks that
        # the name can be queried like any other table.
        backing_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))

        named_stream = DataStream('my_stdin_data','my_stdin_data',codecs.open(backing_file.name,'rb',encoding='utf-8'))
        streams = {'my_stdin_data': named_stream}

        engine = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=streams)
        result = engine.execute('select a from my_stdin_data')

        self.assertTrue(result.status == 'ok')
        self.assertEqual(len(result.warnings),0)
        self.assertEqual(len(result.data),2)
        self.assertEqual(result.metadata.output_column_name_list,['a'])
        self.assertEqual(result.data,[(1,),(4,)])
        self.assertTrue('my_stdin_data' in result.metadata.table_structures)
        self.assertTrue('my_stdin_data' in result.metadata.new_table_structures)
        self.assertEqual(result.metadata.table_structures['my_stdin_data'].qtable_name,'my_stdin_data')

        engine.done()
        self.cleanup(backing_file)

    def test_data_stream_isolation(self):
        # Registers two data streams ('a-' and 'b-') on one engine and queries
        # them one after the other, checking that each query returns only its
        # own stream's rows and that each stream gets its own table structure.
        first_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        second_file = self.create_file_with_data(six.b("d e f\n7 8 9\n10 11 12"))

        streams = {
            'a-': DataStream('a-','a-',codecs.open(first_file.name, 'rb', encoding='utf-8')),
            'b-': DataStream('b-','b-',codecs.open(second_file.name, 'rb', encoding='utf-8'))
        }

        engine = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=streams)
        r1 = engine.execute('select * from a-')

        self.assertTrue(r1.status == 'ok')
        self.assertEqual(len(r1.warnings),0)
        self.assertEqual(len(r1.data),2)
        self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
        self.assertTrue('a-' in r1.metadata.table_structures)
        self.assertEqual(len(r1.metadata.table_structures),1)
        a_structure = r1.metadata.table_structures['a-']
        self.assertEqual(a_structure.source_type, 'data-stream')
        self.assertEqual(a_structure.source, 'a-')
        self.assertEqual(a_structure.column_names, ['a','b','c'])
        self.assertEqual(a_structure.python_column_types, [int,int,int])
        self.assertEqual(a_structure.sqlite_column_types, ['int','int','int'])

        r2 = engine.execute('select * from b-')

        self.assertTrue(r2.status == 'ok')
        self.assertEqual(len(r2.warnings),0)
        self.assertEqual(len(r2.data),2)
        self.assertEqual(r2.metadata.output_column_name_list,['d','e','f'])
        self.assertEqual(r2.data,[(7,8,9),(10,11,12)])

        # NOTE(review): the checks below read the metadata of the FIRST result
        # (r1) after the second query has run. They can only hold if the
        # table-structure mapping is shared between results - confirm whether
        # r2.metadata was intended here.
        self.assertEqual(len(r1.metadata.table_structures),2)
        self.assertTrue('b-' in r1.metadata.table_structures)
        b_structure = r1.metadata.table_structures['b-']
        self.assertEqual(b_structure.source_type, 'data-stream')
        self.assertEqual(b_structure.source, 'b-')
        self.assertEqual(b_structure.column_names, ['d','e','f'])
        self.assertEqual(b_structure.python_column_types, [int,int,int])
        self.assertEqual(b_structure.sqlite_column_types, ['int','int','int'])

        engine.done()
        self.cleanup(first_file)
        self.cleanup(second_file)

    def test_multiple_stdin_injection(self):
        # Registers two named data streams, queries each on its own and then
        # joins them; the join must reuse the already loaded data.
        first_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        second_file = self.create_file_with_data(six.b("d e f\n7 8 9\n10 11 12"))

        streams = {
            'my_stdin_data1': DataStream('my_stdin_data1','my_stdin_data1',codecs.open(first_file.name,'rb',encoding='utf-8')),
            'my_stdin_data2': DataStream('my_stdin_data2','my_stdin_data2',codecs.open(second_file.name,'rb',encoding='utf-8'))
        }
        engine = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=streams)

        first_result = engine.execute('select * from my_stdin_data1')

        self.assertTrue(first_result.status == 'ok')
        self.assertEqual(len(first_result.warnings),0)
        self.assertEqual(len(first_result.data),2)
        self.assertEqual(first_result.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(first_result.data,[(1,2,3),(4,5,6)])
        self.assertTrue('my_stdin_data1' in first_result.metadata.table_structures)
        self.assertTrue('my_stdin_data1' in first_result.metadata.new_table_structures)
        self.assertEqual(first_result.metadata.table_structures['my_stdin_data1'].qtable_name,'my_stdin_data1')

        second_result = engine.execute('select * from my_stdin_data2')

        self.assertTrue(second_result.status == 'ok')
        self.assertEqual(len(second_result.warnings),0)
        self.assertEqual(len(second_result.data),2)
        self.assertEqual(second_result.metadata.output_column_name_list,['d','e','f'])
        self.assertEqual(second_result.data,[(7,8,9),(10,11,12)])
        # There should be another data load, even though it's the same 'filename' as before
        self.assertTrue('my_stdin_data2' in second_result.metadata.table_structures)
        self.assertTrue('my_stdin_data2' in second_result.metadata.new_table_structures)
        self.assertEqual(second_result.metadata.table_structures['my_stdin_data2'].qtable_name,'my_stdin_data2')

        join_result = engine.execute('select aa.*,bb.* from my_stdin_data1 aa join my_stdin_data2 bb')

        self.assertTrue(join_result.status == 'ok')
        self.assertEqual(len(join_result.warnings),0)
        self.assertEqual(len(join_result.data),4)
        self.assertEqual(join_result.metadata.output_column_name_list,['a','b','c','d','e','f'])
        expected_cross_product = [(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)]
        self.assertEqual(join_result.data,expected_cross_product)
        self.assertTrue('my_stdin_data1' in join_result.metadata.table_structures)
        self.assertTrue('my_stdin_data1' not in join_result.metadata.new_table_structures)

        engine.done()
        self.cleanup(first_file)
        self.cleanup(second_file)

    def test_different_input_params_for_different_files(self):
        # Pre-loads two files with different input params (space-delimited
        # with a header vs. tab-delimited without one) and joins them. Both
        # files were loaded up front, so the query must not report them as new.
        spaced_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        tabbed_file = self.create_file_with_data(six.b("7\t8\t9\n10\t11\t12"))
        spaced_name = spaced_file.name
        tabbed_name = tabbed_file.name

        engine = QTextAsData(QInputParams(skip_header=True,delimiter=' '))

        engine.load_data(spaced_name,QInputParams(skip_header=True,delimiter=' '))
        engine.load_data(tabbed_name,QInputParams(skip_header=False,delimiter='\t'))

        result = engine.execute('select aa.*,bb.* from %s aa join %s bb' % (spaced_name,tabbed_name))

        self.assertTrue(result.status == 'ok')
        self.assertEqual(len(result.warnings),0)
        self.assertEqual(len(result.data),4)
        self.assertEqual(result.metadata.output_column_name_list,['a','b','c','c1','c2','c3'])
        expected_cross_product = [(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)]
        self.assertEqual(result.data,expected_cross_product)
        newly_loaded = result.metadata.new_table_structures
        self.assertTrue(spaced_name not in newly_loaded)
        self.assertTrue(tabbed_name not in newly_loaded)

        engine.done()
        self.cleanup(spaced_file)
        self.cleanup(tabbed_file)

    def test_different_input_params_for_different_files_2(self):
        # Same scenario as test_different_input_params_for_different_files,
        # but the engine is built without any default input params, so only
        # the per-file params passed to load_data() are in play.
        spaced_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        tabbed_file = self.create_file_with_data(six.b("7\t8\t9\n10\t11\t12"))
        spaced_name = spaced_file.name
        tabbed_name = tabbed_file.name

        engine = QTextAsData()

        engine.load_data(spaced_name,QInputParams(skip_header=True,delimiter=' '))
        engine.load_data(tabbed_name,QInputParams(skip_header=False,delimiter='\t'))

        result = engine.execute('select aa.*,bb.* from %s aa join %s bb' % (spaced_name,tabbed_name))

        self.assertTrue(result.status == 'ok')
        self.assertEqual(len(result.warnings),0)
        self.assertEqual(len(result.data),4)
        self.assertEqual(result.metadata.output_column_name_list,['a','b','c','c1','c2','c3'])
        expected_cross_product = [(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)]
        self.assertEqual(result.data,expected_cross_product)
        newly_loaded = result.metadata.new_table_structures
        self.assertTrue(spaced_name not in newly_loaded)
        self.assertTrue(tabbed_name not in newly_loaded)

        engine.done()
        self.cleanup(spaced_file)
        self.cleanup(tabbed_file)

    def test_input_params_override(self):
        # Builds an engine whose default input params are all garbage, checks
        # that a query fails with them, and then that per-call input params
        # override the defaults and make the same query succeed.
        data_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        file_name = data_file.name

        garbage_params = QInputParams()

        # Overwrite every attribute of the params object with a junk value
        for attr_name in list(vars(garbage_params)):
            setattr(garbage_params,attr_name,'GARBAGE')

        engine = QTextAsData(garbage_params)

        failed_result = engine.execute('select * from %s' % file_name)

        self.assertTrue(failed_result.status == 'error')

        valid_params = QInputParams(skip_header=True,delimiter=' ')

        result = engine.execute('select * from %s' % file_name,input_params=valid_params)

        self.assertTrue(result.status == 'ok')
        self.assertEqual(len(result.warnings),0)
        self.assertEqual(len(result.data),2)
        self.assertEqual(result.metadata.output_column_name_list,['a','b','c'])
        self.assertEqual(result.data,[(1,2,3),(4,5,6)])
        self.assertTrue(file_name in result.metadata.table_structures)
        self.assertTrue(file_name in result.metadata.new_table_structures)
        structure = result.metadata.table_structures[file_name]
        self.assertEqual(structure.atomic_fns,[file_name])
        self.assertEqual(structure.source,file_name)
        self.assertEqual(structure.source_type,'file')

        engine.done()
        self.cleanup(data_file)

    def test_input_params_merge(self):
        # Fills a params object with junk values, merges it with a fresh
        # QInputParams() and checks that no junk value survives in the result.
        garbage_params = QInputParams()

        for attr_name in list(vars(garbage_params)):
            setattr(garbage_params,attr_name,'GARBAGE')

        merged = garbage_params.merged_with(QInputParams())

        for attr_name in list(vars(merged)):
            self.assertTrue(getattr(merged,attr_name) != 'GARBAGE')

        # NOTE(review): this second pass walks the attribute names of the
        # original object but still inspects the merged one, so it repeats the
        # check above - confirm whether another object was meant to be checked.
        for attr_name in list(vars(garbage_params)):
            self.assertTrue(getattr(merged,attr_name) != 'GARBAGE')

    def test_table_analysis_with_syntax_error(self):
        # analyze() on an unparsable query must come back with an 'error'
        # status whose message starts with 'query error'.
        engine = QTextAsData()

        analysis = engine.analyze("bad syntax")

        engine.done()

        self.assertTrue(analysis.status == 'error')
        error_message = analysis.error.msg
        self.assertTrue(error_message.startswith('query error'))

    def test_execute_response(self):
        # Checks the full shape of an execute() response: status, error,
        # warnings, data, and the metadata describing the newly loaded table.
        data_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        file_name = data_file.name

        engine = QTextAsData()

        response = engine.execute("select a,c from %s" % file_name,QInputParams(skip_header=True))

        self.assertTrue(response.status == 'ok')
        self.assertTrue(response.error is None)
        self.assertEqual(len(response.warnings),0)
        self.assertEqual(len(response.data),2)
        self.assertEqual(response.data,[ (1,3),(4,6) ])
        self.assertTrue(response.metadata is not None)

        response_metadata = response.metadata

        self.assertEqual(response_metadata.output_column_name_list, [ 'a','c'])
        self.assertTrue(file_name in response_metadata.new_table_structures)
        self.assertEqual(len(response_metadata.table_structures),1)

        # This test reads the structure from the "new" mapping
        structure = response_metadata.new_table_structures[file_name]

        self.assertEqual(structure.column_names,[ 'a','b','c'])
        self.assertEqual(structure.python_column_types,[ int,int,int])
        self.assertEqual(structure.sqlite_column_types,[ 'int','int','int'])
        self.assertEqual(structure.qtable_name, file_name)
        self.assertEqual(structure.atomic_fns,[file_name])
        self.assertEqual(structure.source_type,'file')
        self.assertEqual(structure.source,file_name)

        engine.done()
        self.cleanup(data_file)

    def test_analyze_response(self):
        # Checks the full shape of an analyze() response: status, error,
        # warnings, data, and the metadata describing the analyzed table.
        data_file = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
        file_name = data_file.name

        engine = QTextAsData()

        response = engine.analyze("select a,c from %s" % file_name,QInputParams(skip_header=True))

        self.assertTrue(response.status == 'ok')
        self.assertTrue(response.error is None)
        self.assertEqual(len(response.warnings),0)
        self.assertEqual(len(response.data),2)
        self.assertEqual(response.data,[ (1,3),(4,6) ])
        self.assertTrue(response.metadata is not None)

        response_metadata = response.metadata

        self.assertEqual(response_metadata.output_column_name_list, [ 'a','c'])
        self.assertEqual(len(response_metadata.table_structures),1)
        self.assertTrue(file_name in response_metadata.new_table_structures)

        # This test reads the structure from the full mapping
        structure = response_metadata.table_structures[file_name]

        self.assertEqual(structure.column_names,[ 'a','b','c'])
        self.assertEqual(structure.python_column_types,[ int,int,int])
        self.assertEqual(structure.sqlite_column_types,[ 'int','int','int'])
        self.assertEqual(structure.qtable_name, file_name)
        self.assertEqual(structure.atomic_fns,[file_name])
        self.assertEqual(structure.source_type,'file')
        self.assertEqual(structure.source,file_name)

        engine.done()
        self.cleanup(data_file)

    def test_load_data_from_string_without_previous_data_load(self):
        # Queries an in-memory string stream that was NOT loaded explicitly
        # beforehand, so the query itself must load it and report it as a new
        # table structure.
        input_str = six.u('column1,column2,column3\n') + six.u('\n').join([six.u('value1,2.5,value3')] * 1000)


        data_streams_dict = {
            'my_data': DataStream('my_data_stream_id','my_data',six.StringIO(input_str))
        }
        q = QTextAsData(default_input_params=QInputParams(skip_header=True,delimiter=','),data_streams_dict=data_streams_dict)

        q_output = q.execute('select column2,column3 from my_data')

        self.assertTrue(q_output.status == 'ok')
        self.assertTrue(q_output.error is None)
        self.assertEqual(len(q_output.warnings),0)
        # assertEqual, not assertTrue: assertTrue(x, y) treats y as the failure
        # message and passes for any truthy x.
        self.assertEqual(len(q_output.data),1000)
        # All 1000 rows are identical, so they collapse to one distinct tuple
        self.assertEqual(len(set(q_output.data)),1)
        self.assertEqual(list(set(q_output.data))[0],(2.5,'value3'))

        metadata = q_output.metadata

        self.assertEqual(metadata.output_column_name_list,['column2','column3'])
        self.assertTrue('my_data' in metadata.new_table_structures)
        self.assertEqual(len(metadata.table_structures),1)

        table_structure = metadata.table_structures['my_data']

        self.assertEqual(table_structure.column_names,['column1','column2','column3'])
        self.assertEqual(table_structure.sqlite_column_types,['text','real','text'])
        self.assertEqual(table_structure.python_column_types,[str,float,str])
        self.assertEqual(table_structure.qtable_name, 'my_data')
        self.assertEqual(table_structure.source_type, 'data-stream')
        self.assertEqual(table_structure.source, 'my_data_stream_id')

        q.done()

    def test_load_data_from_string_with_previous_data_load(self):
        # Loads an in-memory string stream explicitly with load_data() and
        # then queries it; since the data is already loaded, the query must
        # NOT report it as a new table structure.
        input_str = six.u('column1,column2,column3\n') + six.u('\n').join([six.u('value1,2.5,value3')] * 1000)

        data_streams_dict = {
            'my_data': DataStream('a','my_data',six.StringIO(input_str))
        }
        q = QTextAsData(default_input_params=QInputParams(skip_header=True,delimiter=','),data_streams_dict=data_streams_dict)

        # The return value is not needed; only the pre-loading side effect is
        q.load_data('my_data',QInputParams(skip_header=True,delimiter=','))

        q_output = q.execute('select column2,column3 from my_data')

        self.assertTrue(q_output.status == 'ok')
        self.assertTrue(q_output.error is None)
        self.assertEqual(len(q_output.warnings),0)
        # assertEqual, not assertTrue: assertTrue(x, y) treats y as the failure
        # message and passes for any truthy x.
        self.assertEqual(len(q_output.data),1000)
        # All 1000 rows are identical, so they collapse to one distinct tuple
        self.assertEqual(len(set(q_output.data)),1)
        self.assertEqual(list(set(q_output.data))[0],(2.5,'value3'))

        metadata = q_output.metadata

        self.assertEqual(metadata.output_column_name_list,['column2','column3'])
        self.assertTrue('my_data' not in metadata.new_table_structures)
        self.assertEqual(len(metadata.table_structures),1)

        table_structure = metadata.table_structures['my_data']

        self.assertEqual(table_structure.column_names,['column1','column2','column3'])
        self.assertEqual(table_structure.sqlite_column_types,['text','real','text'])
        self.assertEqual(table_structure.python_column_types,[str,float,str])
        self.assertEqual(table_structure.qtable_name, 'my_data')

        q.done()


class BenchmarkAttemptResults(object):
    """Plain record holding the outcome of a single benchmark attempt."""

    def __init__(self, attempt, lines, columns, duration,return_code):
        # Attribute assignment order is kept stable because it determines the
        # order of the fields in the string form below.
        self.attempt, self.lines, self.columns = attempt, lines, columns
        self.duration, self.return_code = duration, return_code

    def __repr__(self):
        # Both repr() and str() render the instance's attribute dictionary
        return str(vars(self))
    __str__ = __repr__

class BenchmarkResults(object):
    """Plain record holding the per-attempt results of one benchmark
    configuration together with their mean and standard deviation."""

    def __init__(self, lines, columns, attempt_results, mean, stddev):
        # Attribute assignment order is kept stable because it determines the
        # order of the fields in the string form below.
        self.lines, self.columns = lines, columns
        self.attempt_results = attempt_results
        self.mean, self.stddev = mean, stddev

    def __repr__(self):
        # Both repr() and str() render the instance's attribute dictionary
        return str(vars(self))
    __str__ = __repr__

@pytest.mark.benchmark
class BenchmarkTests(AbstractQTestCase):

    BENCHMARK_DIR = os.environ.get('Q_BENCHMARK_DATA_DIR')

    def _ensure_benchmark_data_dir_exists(self):
        """Make sure the benchmark data directory exists, creating it if needed.

        Raises:
            Exception: if BENCHMARK_DIR is empty, i.e. the Q_BENCHMARK_DATA_DIR
                environment variable was not provided.
            OSError: if the directory is missing and cannot be created.
        """
        benchmark_dir = BenchmarkTests.BENCHMARK_DIR
        if not benchmark_dir:
            # Fail with a clear message instead of silently swallowing the
            # TypeError that os.mkdir(None) would raise.
            raise Exception("Q_BENCHMARK_DATA_DIR must be provided as an environment variable")

        # An already existing directory is the only situation that is fine to
        # ignore; any real creation failure is allowed to propagate.
        if not os.path.isdir(benchmark_dir):
            os.makedirs(benchmark_dir)

    def _create_benchmark_file_if_needed(self):
        """Create the shared benchmark-file.csv inside BENCHMARK_DIR unless it already exists.

        The file is built by writing the decompressed content of
        'unit-file.csv.gz' (looked up relative to the current working
        directory) 100 times in a row.
        """
        self._ensure_benchmark_data_dir_exists()

        benchmark_filename = '{}/benchmark-file.csv'.format(BenchmarkTests.BENCHMARK_DIR)

        # Test for the data file itself. Testing only for the directory would
        # skip the creation as soon as the directory exists, leaving the
        # benchmark without its source file.
        if os.path.exists(benchmark_filename):
            return

        # Context managers guarantee both handles are closed even on failure.
        with GzipFile('unit-file.csv.gz') as g:
            d = g.read().decode('utf-8')
        with open(benchmark_filename, 'w') as f:
            for i in range(100):
                f.write(d)

    def _prepare_test_file(self, lines, columns):
        """Return the path of the data file for the given shape, creating it when missing.

        A new file gets a header line (c1..cN) followed by the output of q
        selecting those columns from the first `lines` lines of
        benchmark-file.csv. After creating the file, q is run once on it with
        `-C readwrite` so that the file cache is created as part of preparation.

        Args:
            lines: number of lines taken from the shared benchmark file.
            columns: number of columns to select.

        Returns:
            The path of the prepared data file.
        """

        filename = '{}/_benchmark_data__lines_{}_columns_{}.csv'.format(BenchmarkTests.BENCHMARK_DIR,lines, columns)

        if os.path.exists(filename):
            return filename

        c = ['c{}'.format(x + 1) for x in range(columns)]

        # write a header line
        with open(filename,'w') as ff:
            ff.write(",".join(c))
            ff.write('\n')

        # Build the command from ONE template. Calling .format() on a
        # concatenation of literals only formats the last literal (method call
        # binds tighter than +), which would leave the "head -{} {}/..."
        # placeholders unfilled.
        cmd = 'head -{} {}/benchmark-file.csv | {} -d , "select {} from -" >> {}'.format(
            lines, BenchmarkTests.BENCHMARK_DIR, Q_EXECUTABLE, ','.join(c), filename)
        r, o, e = run_command(cmd)
        self.assertEqual(r, 0)
        # Create file cache as part of preparation
        r, o, e = run_command(Q_EXECUTABLE + ' -C readwrite -d , "select count(*) from %s"' % filename)
        self.assertEqual(r, 0)
        return filename

    def _decide_result(self,attempt_results):
        """Summarize the attempts of one (lines, columns) combination.

        The mean and the population standard deviation of the durations are
        computed only when every attempt has a zero return code; otherwise
        both are None. Lines and columns are taken from the first attempt.
        """
        any_failed = any(a.return_code != 0 for a in attempt_results)

        if any_failed:
            mean = stddev = None
        else:
            count = len(attempt_results)
            durations = [a.duration for a in attempt_results]
            mean = sum(durations) / count
            # Population variance: the divisor is the full attempt count.
            variance = sum([(d - mean) ** 2 for d in durations]) / count
            stddev = variance ** 0.5

        first = attempt_results[0]
        return BenchmarkResults(first.lines, first.columns, attempt_results, mean, stddev)

    def _perform_test_performance_matrix(self,name,generate_cmd_function):
        """Time a command over a matrix of data shapes and write a results file.

        For every combination of column count and line count, the command
        produced by `generate_cmd_function` is run 10 times; each run is timed
        and the attempts are summarized through `_decide_result`. The summary
        is written, tab separated and grouped by column count, to
        '<Q_BENCHMARK_RESULTS_FOLDER>/<name>.benchmark-results'.

        Args:
            name: benchmark name, used for the output filename and as a prefix
                of the value column titles.
            generate_cmd_function: callable receiving (filename, lines, columns)
                and returning the command line to run.

        Raises:
            Exception: if Q_BENCHMARK_RESULTS_FOLDER is not set in the environment.
        """
        benchmark_results_folder = os.environ.get("Q_BENCHMARK_RESULTS_FOLDER",'')
        if benchmark_results_folder == "":
            raise Exception("Q_BENCHMARK_RESULTS_FOLDER must be provided as an environment variable")

        self._create_benchmark_file_if_needed()

        results = []
        for columns in [1, 5, 10, 20, 50, 100]:
            for lines in [1, 10, 100, 1000, 10000, 100000, 1000000]:
                # The data file depends only on (lines, columns), so it is
                # prepared once per combination, outside of the timed section.
                filename = self._prepare_test_file(lines, columns)
                attempt_results = []
                for attempt in range(10):
                    if DEBUG:
                        print("Testing {}".format(filename))
                    t0 = time.time()
                    r, o, e = run_command(generate_cmd_function(filename,lines,columns))
                    duration = time.time() - t0
                    attempt_result = BenchmarkAttemptResults(attempt, lines, columns, duration, r)
                    attempt_results.append(attempt_result)
                    if DEBUG:
                        print("Results: {}".format(attempt_result.__dict__))
                results.append(self._decide_result(attempt_results))

        series_fields = [six.u('lines'),six.u('columns')]
        value_fields = [six.u('mean'),six.u('stddev')]

        all_fields = series_fields + value_fields

        output_filename = '{}/{}.benchmark-results'.format(benchmark_results_folder,name)
        # The context manager closes the results file even if writing fails.
        with open(output_filename,'w') as output_file:
            # groupby only groups adjacent items, hence the sort by columns first.
            for columns,g in itertools.groupby(sorted(results,key=lambda x:x.columns),key=lambda x:x.columns):
                # Every group starts with its own header line.
                x = six.u("\t").join(series_fields + [six.u('{}_{}').format(name, f) for f in value_fields])
                print(x,file = output_file)
                for result in g:
                    print(six.u("\t").join(map(str,[getattr(result,f) for f in all_fields])),file=output_file)

        print("results have been written to : {}".format(output_filename))
        if DEBUG:
            print("RESULTS FOR {}".format(name))
            # Read back through a managed handle instead of leaking it.
            with open(output_filename,'r') as results_file:
                print(results_file.read())

    def test_q_matrix(self):
        """Run the performance matrix for q itself.

        The benchmark name comes from the Q_BENCHMARK_NAME environment
        variable, which is mandatory. Q_BENCHMARK_ADDITIONAL_PARAMS, when set,
        is inserted into the q command line.
        """
        benchmark_name = os.environ.get('Q_BENCHMARK_NAME')
        if benchmark_name is None:
            raise Exception('Q_BENCHMARK_NAME must be provided as an env var')

        def generate_q_cmd(data_filename, line_count, column_count):
            # The environment is consulted on every call; a missing or empty
            # value contributes nothing beyond the separating space.
            extra_params = os.environ.get('Q_BENCHMARK_ADDITIONAL_PARAMS') or ''
            return '{} -d , {} "select count(*) from {}"'.format(Q_EXECUTABLE, ' ' + extra_params, data_filename)

        self._perform_test_performance_matrix(benchmark_name,generate_q_cmd)

    def _get_textql_version(self):
        """Return the first output line of `textql --version` as text.

        Raises:
            Exception: if the command fails, writes anything to stderr, or
                produces no output at all.
        """
        r,o,e = run_command("textql --version")
        if r != 0:
            raise Exception("Could not find textql")
        if len(e) != 0:
            raise Exception("Errors while getting textql version")
        if len(o) == 0:
            # Give a clear error instead of a bare IndexError on o[0].
            raise Exception("textql did not report a version")
        version = o[0]
        # The octosql counterpart decodes the output line as utf-8 bytes, so
        # the same is presumably needed here; without decoding, formatting the
        # value into the benchmark name on Python 3 would yield "b'...'".
        if isinstance(version, bytes):
            version = version.decode('utf-8')
        return version

    def _get_octosql_version(self):
        """Return the 'vX.Y.Z' version found in the first output line of `octosql --version`.

        Raises:
            Exception: if the command fails or writes anything to stderr.
        """
        return_code, stdout_lines, stderr = run_command("octosql --version")
        if return_code != 0:
            raise Exception("Could not find octosql")
        if len(stderr) != 0:
            raise Exception("Errors while getting octosql version")
        # The output line is decoded as utf-8 before the version is searched for.
        first_line = str(stdout_lines[0],encoding='utf-8')
        matches = re.findall('v[0-9]+\\.[0-9]+\\.[0-9]+',first_line)
        return matches[0]

    def test_textql_matrix(self):
        """Run the performance matrix for textql, named after its reported version."""
        benchmark_name = 'textql_%s' % self._get_textql_version()

        def generate_textql_cmd(data_filename,line_count,column_count):
            # Only the data filename is needed to build the command.
            return 'textql -dlm , -sql "select count(*)" {}'.format(data_filename)

        self._perform_test_performance_matrix(benchmark_name,generate_textql_cmd)

    def test_octosql_matrix(self):
        config_fn = self.random_tmp_filename('octosql', 'config')
        def generate_octosql_cmd(data_filename,line_count,column_count):
            j = """
dataSources:
  - name: bmdata
    type: csv
    config:
      path: "{}"
      headerRow: false
      batchSize: 10000
""".format(data_filename)[1:]
            f = open(config_fn,'w')
            f.write(j)
            f.close()
            return 'octosql -c {} -o batch-csv "select count(*) from bmdata a"'.format(config_fn)

        name = 'octosql_%s' % self._get_octosql_version()
        self._perform_test_performance_matrix(name,generate_octosql_cmd)

def suite():
    """Build the default test suite from a fixed list of test case classes."""
    loader = unittest.TestLoader()
    # Listed in the order in which the resulting suite holds them.
    test_case_classes = [
        BasicModuleTests,
        BasicTests,
        ParsingModeTests,
        SqlTests,
        FormattingTests,
        SaveDbToDiskTests,
        MultiHeaderTests,
        UserFunctionTests,
    ]
    return unittest.TestSuite([loader.loadTestsFromTestCase(test_case_class) for test_case_class in test_case_classes])

if __name__ == '__main__':
    # With a command line argument, run either a whole test class
    # ("ClassName") or a single test method ("ClassName.method_name");
    # without one, run the default suite.
    cli_args = sys.argv[1:]
    if not cli_args:
        selected_tests = suite()
    else:
        target = cli_args[0]
        if '.' in target:
            class_name, method_name = target.split(".")
            selected_tests = unittest.TestSuite()
            selected_tests.addTest(globals()[class_name](method_name))
        else:
            test_case_class = globals()[target]
            selected_tests = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(test_case_class)])

    result = unittest.TextTestRunner(verbosity=2).run(selected_tests)
    # Exit status is 0 on success and 1 otherwise.
    sys.exit(not result.wasSuccessful())


================================================
FILE: test-requirements.txt
================================================
pytest==6.2.2
flake8==3.6.0
six==1.11.0