[
  {
    "path": ".gitattributes",
    "content": "internal/gen-go/* linguist-generated=true"
  },
  {
    "path": ".github/workflows/test.yml",
    "content": "name: Test\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    branches:\n      - '*'\n\njobs:\n  test:\n    strategy:\n      matrix:\n        go:\n        - '1.17.x'\n        - '1.18.x'\n        - '1.19.x'\n        tags:\n        - ''\n        - purego\n        label:\n        - [self-hosted, linux, arm64, segment]\n        - ubuntu-latest\n\n    runs-on: ${{ matrix.label }}\n\n    env:\n      PARQUETGODEBUG: tracebuf=1\n\n    steps:\n    - uses: actions/checkout@v3\n\n    - name: Setup Go ${{ matrix.go }}\n      uses: actions/setup-go@v3\n      with:\n        go-version: ${{ matrix.go }}\n\n    - name: Download Dependencies\n      run: go mod download\n\n    - name: Run Tests\n      run: go test -trimpath -race -tags=${{ matrix.tags }} ./...\n\n    - name: Run Benchmarks\n      run: go test -trimpath -short -tags=${{ matrix.tags }} -run '^$' -bench . -benchtime 1x ./...\n\n  format:\n    runs-on: ubuntu-latest\n\n    steps:\n    - uses: actions/checkout@v3\n\n    - name: Setup Go ${{ matrix.go }}\n      uses: actions/setup-go@v3\n      with:\n        go-version: 1.19.x\n\n    - name: Validate formatting\n      run: make format\n\n  # https://github.com/golangci/golangci-lint/issues/2649\n  # lint:\n  #   runs-on: ubuntu-latest\n  #   steps:\n  #     - uses: actions/checkout@v3\n\n  #     - uses: actions/setup-go@v3\n  #       with:\n  #         go-version: 1.18.x\n\n  #     - name: golangci-lint\n  #       uses: golangci/golangci-lint-action@v3\n  #       with:\n  #         version: latest\n"
  },
  {
    "path": ".gitignore",
    "content": "# Binaries for programs and plugins\n*.exe\n*.exe~\n*.dll\n*.so\n*.dylib\n\n# Test binary, built with `go test -c`\n*.test\n\n# Output of the go coverage tool, specifically when used with LiteIDE\n*.out\n\n# Dependency directories (remove the comment below to include it)\n# vendor/\n\n# Emacs\n*~\n#*#\n.#\n"
  },
  {
    "path": ".mailmap",
    "content": "Achille Roussel <achille@segment.com> Achille <achille@segment.com>\nThomas Pelletier <thomas.pelletier@segment.com> Thomas Pelletier <pelletier.thomas@gmail.com>\n"
  },
  {
    "path": ".words",
    "content": "\nRowType\nTwilio\nbottlenecked\ndecompressors\nint96\nmillis\nnanos\nreindexing\nrepositions\nschemas\nColumnPages\nPageIndex\nZstandard\nxxHash\ncardinality\nenums\n32bit\ndic\nBlart\nVersenwald\npurego\nstdlib\nunscaled\ncespare\nbitset\nchecksumming\n"
  },
  {
    "path": "AUTHORS.txt",
    "content": "Achille Roussel <achille@segment.com>\nFrederic Branczyk <fbranczyk@gmail.com>\nJulien Fabre <julien@segment.com>\nKevin Burke <kevin.burke@segment.com>\nThomas Pelletier <thomas.pelletier@segment.com>\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, we as\ncontributors and maintainers pledge to making participation in our project and\nour community a harassment-free experience for everyone, regardless of age, body\nsize, disability, ethnicity, sex characteristics, gender identity and expression,\nlevel of experience, education, socio-economic status, nationality, personal\nappearance, race, religion, or sexual identity and orientation.\n\n## Our Standards\n\nExamples of behavior that contributes to creating a positive environment\ninclude:\n\n- Using welcoming and inclusive language\n- Being respectful of differing viewpoints and experiences\n- Gracefully accepting constructive criticism\n- Focusing on what is best for the community\n- Showing empathy towards other community members\n\nExamples of unacceptable behavior by participants include:\n\n- The use of sexualized language or imagery and unwelcome sexual attention or\n  advances\n- Trolling, insulting/derogatory comments, and personal or political attacks\n- Public or private harassment\n- Publishing others' private information, such as a physical or electronic\n  address, without explicit permission\n- Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Our Responsibilities\n\nProject maintainers are responsible for clarifying the standards of acceptable\nbehavior and are expected to take appropriate and fair corrective action in\nresponse to any instances of unacceptable behavior.\n\nProject maintainers have the right and responsibility to remove, edit, or\nreject comments, commits, code, wiki edits, issues, and other contributions\nthat are not aligned to this Code of Conduct, or to ban temporarily or\npermanently any contributor for other behaviors that they deem inappropriate,\nthreatening, offensive, or harmful.\n\n## Scope\n\nThis Code of Conduct applies both 
within project spaces and in public spaces\nwhen an individual is representing the project or its community. Examples of\nrepresenting a project or community include using an official project e-mail\naddress, posting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event. Representation of a project may be\nfurther defined and clarified by project maintainers.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported by contacting the project team at open-source@twilio.com. All\ncomplaints will be reviewed and investigated and will result in a response that\nis deemed necessary and appropriate to the circumstances. The project team is\nobligated to maintain confidentiality with regard to the reporter of an incident.\nFurther details of specific enforcement policies may be posted separately.\n\nProject maintainers who do not follow or enforce the Code of Conduct in good\nfaith may face temporary or permanent repercussions as determined by other\nmembers of the project's leadership.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,\navailable at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html\n\n[homepage]: https://www.contributor-covenant.org\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to segmentio/parquet\n\n## Code of Conduct\n\nHelp us keep the project open and inclusive. Please be kind to and\nconsiderate of other developers, as we all have the same goal: make\nthe project as good as it can be.\n\n* [Code of Conduct](./CODE_OF_CONDUCT.md)\n\n## Licensing\n\nAll third party contributors acknowledge that any contributions they provide\nwill be made under the same open source license that the open source project\nis provided under.\n\n## Contributing\n\n* Open an Issue to report bugs or discuss non-trivial changes.\n* Open a Pull Request to submit a code change for review.\n\n### Coding Rules\n\nTo ensure consistency throughout the source code, keep these rules in mind\nwhen submitting contributions:\n\n* All features or bug fixes must be tested by one or more tests.\n* All exported types, functions, and symbols must be documented.\n* All code must be formatted with `go fmt`.\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n--------------------------------------------------------------------------------\n\nThis product includes code from Apache Parquet.\n\n* deprecated/parquet.go is based on Apache Parquet's thrift file\n* format/parquet.go is based on Apache Parquet's thrift file\n\nCopyright: 2014 The Apache Software Foundation.\nHome page: https://github.com/apache/parquet-format\nLicense: http://www.apache.org/licenses/LICENSE-2.0\n"
  },
  {
    "path": "Makefile",
    "content": ".PHONY: format\n\nAUTHORS.txt: .mailmap\n\tgo install github.com/kevinburke/write_mailmap@latest\n\twrite_mailmap > AUTHORS.txt\n\nformat:\n\tgo install github.com/kevinburke/differ@latest\n\tdiffer gofmt -w .\n\ntest:\n\tgo test -v -trimpath -race -tags= ./...\n"
  },
  {
    "path": "README.md",
    "content": "# Project has been Archived\n\nDevelopment has moved to https://github.com/parquet-go/parquet-go. No API's have\nchanged, we just decided to create a new organization for this library. Thank\nyou to all of the contributors for your hard work.\n\n# segmentio/parquet-go\n\nHigh-performance Go library to manipulate parquet files.\n"
  },
  {
    "path": "allocator.go",
    "content": "package parquet\n\nimport \"github.com/segmentio/parquet-go/internal/unsafecast\"\n\ntype allocator struct{ buffer []byte }\n\nfunc (a *allocator) makeBytes(n int) []byte {\n\tif free := cap(a.buffer) - len(a.buffer); free < n {\n\t\tnewCap := 2 * cap(a.buffer)\n\t\tif newCap == 0 {\n\t\t\tnewCap = 4096\n\t\t}\n\t\tfor newCap < n {\n\t\t\tnewCap *= 2\n\t\t}\n\t\ta.buffer = make([]byte, 0, newCap)\n\t}\n\n\ti := len(a.buffer)\n\tj := len(a.buffer) + n\n\ta.buffer = a.buffer[:j]\n\treturn a.buffer[i:j:j]\n}\n\nfunc (a *allocator) copyBytes(v []byte) []byte {\n\tb := a.makeBytes(len(v))\n\tcopy(b, v)\n\treturn b\n}\n\nfunc (a *allocator) copyString(v string) string {\n\tb := a.makeBytes(len(v))\n\tcopy(b, v)\n\treturn unsafecast.BytesToString(b)\n}\n\nfunc (a *allocator) reset() {\n\ta.buffer = a.buffer[:0]\n}\n\n// rowAllocator is a memory allocator used to make a copy of rows referencing\n// memory buffers that parquet-go does not have ownership of.\n//\n// This type is used in the implementation of various readers and writers that\n// need to capture rows passed to the ReadRows/WriteRows methods. Copies to a\n// local buffer is necessary in those cases to repect the reader/writer\n// contracts that do not allow the implementations to retain the rows they\n// are passed as arguments.\n//\n// See: RowBuffer, DedupeRowReader, DedupeRowWriter\ntype rowAllocator struct{ allocator }\n\nfunc (a *rowAllocator) capture(row Row) {\n\tfor i, v := range row {\n\t\tswitch v.Kind() {\n\t\tcase ByteArray, FixedLenByteArray:\n\t\t\trow[i].ptr = unsafecast.AddressOfBytes(a.copyBytes(v.byteArray()))\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "array.go",
    "content": "package parquet\n\nimport (\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nfunc makeArrayValue(values []Value, offset uintptr) sparse.Array {\n\tptr := *(*unsafe.Pointer)(unsafe.Pointer(&values))\n\treturn sparse.UnsafeArray(unsafe.Add(ptr, offset), len(values), unsafe.Sizeof(Value{}))\n}\n\nfunc makeArrayString(values []string) sparse.Array {\n\tstr := \"\"\n\tptr := *(*unsafe.Pointer)(unsafe.Pointer(&values))\n\treturn sparse.UnsafeArray(ptr, len(values), unsafe.Sizeof(str))\n}\n\nfunc makeArrayBE128(values []*[16]byte) sparse.Array {\n\tptr := *(*unsafe.Pointer)(unsafe.Pointer(&values))\n\treturn sparse.UnsafeArray(ptr, len(values), unsafe.Sizeof((*[16]byte)(nil)))\n}\n"
  },
  {
    "path": "array_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nfunc makeArray(base unsafe.Pointer, length int, offset uintptr) sparse.Array {\n\treturn sparse.UnsafeArray(base, length, offset)\n}\n\nfunc makeArrayOf[T any](s []T) sparse.Array {\n\tvar model T\n\treturn makeArray(unsafecast.PointerOf(s), len(s), unsafe.Sizeof(model))\n}\n\nfunc makeSlice[T any](a sparse.Array) []T {\n\treturn slice[T](a.Index(0), a.Len())\n}\n\nfunc slice[T any](p unsafe.Pointer, n int) []T {\n\treturn unsafe.Slice((*T)(p), n)\n}\n\ntype sliceHeader struct {\n\tbase unsafe.Pointer\n\tlen  int\n\tcap  int\n}\n"
  },
  {
    "path": "bitmap.go",
    "content": "package parquet\n\nimport \"sync\"\n\ntype bitmap struct {\n\tbits []uint64\n}\n\nfunc (m *bitmap) reset(size int) {\n\tsize = (size + 63) / 64\n\tif cap(m.bits) < size {\n\t\tm.bits = make([]uint64, size, 2*size)\n\t} else {\n\t\tm.bits = m.bits[:size]\n\t\tm.clear()\n\t}\n}\n\nfunc (m *bitmap) clear() {\n\tfor i := range m.bits {\n\t\tm.bits[i] = 0\n\t}\n}\n\nvar (\n\tbitmapPool sync.Pool // *bitmap\n)\n\nfunc acquireBitmap(n int) *bitmap {\n\tb, _ := bitmapPool.Get().(*bitmap)\n\tif b == nil {\n\t\tb = &bitmap{bits: make([]uint64, n, 2*n)}\n\t} else {\n\t\tb.reset(n)\n\t}\n\treturn b\n}\n\nfunc releaseBitmap(b *bitmap) {\n\tif b != nil {\n\t\tbitmapPool.Put(b)\n\t}\n}\n"
  },
  {
    "path": "bloom/block.go",
    "content": "package bloom\n\nimport \"unsafe\"\n\n// Word represents 32 bits words of bloom filter blocks.\ntype Word uint32\n\n// Block represents bloom filter blocks which contain eight 32 bits words.\ntype Block [8]Word\n\n// Bytes returns b as a byte slice.\nfunc (b *Block) Bytes() []byte {\n\treturn unsafe.Slice((*byte)(unsafe.Pointer(b)), BlockSize)\n}\n\nconst (\n\t// BlockSize is the size of bloom filter blocks in bytes.\n\tBlockSize = 32\n\n\tsalt0 = 0x47b6137b\n\tsalt1 = 0x44974d91\n\tsalt2 = 0x8824ad5b\n\tsalt3 = 0xa2b7289d\n\tsalt4 = 0x705495c7\n\tsalt5 = 0x2df1424b\n\tsalt6 = 0x9efc4947\n\tsalt7 = 0x5c6bfb31\n)\n"
  },
  {
    "path": "bloom/block_amd64.go",
    "content": "//go:build !purego\n\npackage bloom\n\nimport \"golang.org/x/sys/cpu\"\n\n// The functions in this file are SIMD-optimized versions of the functions\n// declared in block_optimized.go for x86 targets.\n//\n// The optimization yields measurable improvements over the pure Go versions:\n//\n// goos: darwin\n// goarch: amd64\n// pkg: github.com/segmentio/parquet-go/bloom\n// cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz\n//\n// name         old time/op    new time/op     delta\n// BlockInsert    11.6ns ± 4%      2.0ns ± 3%   -82.37%  (p=0.000 n=8+8)\n// BlockCheck     12.6ns ±28%      2.1ns ± 4%   -83.12%  (p=0.000 n=10+8)\n//\n// name         old speed      new speed       delta\n// BlockInsert  2.73GB/s ±13%  15.70GB/s ± 3%  +475.96%  (p=0.000 n=9+8)\n// BlockCheck   2.59GB/s ±23%  15.06GB/s ± 4%  +482.25%  (p=0.000 n=10+8)\n//\n// Note that the numbers above are a comparison to the routines implemented in\n// block_optimized.go; the delta comparing to functions in block_default.go is\n// significantly larger but not very interesting since those functions have no\n// practical use cases.\nvar hasAVX2 = cpu.X86.HasAVX2\n\n//go:noescape\nfunc blockInsert(b *Block, x uint32)\n\n//go:noescape\nfunc blockCheck(b *Block, x uint32) bool\n\nfunc (b *Block) Insert(x uint32) { blockInsert(b, x) }\n\nfunc (b *Block) Check(x uint32) bool { return blockCheck(b, x) }\n"
  },
  {
    "path": "bloom/block_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define salt0 0x47b6137b\n#define salt1 0x44974d91\n#define salt2 0x8824ad5b\n#define salt3 0xa2b7289d\n#define salt4 0x705495c7\n#define salt5 0x2df1424b\n#define salt6 0x9efc4947\n#define salt7 0x5c6bfb31\n\nDATA ones+0(SB)/4, $1\nDATA ones+4(SB)/4, $1\nDATA ones+8(SB)/4, $1\nDATA ones+12(SB)/4, $1\nDATA ones+16(SB)/4, $1\nDATA ones+20(SB)/4, $1\nDATA ones+24(SB)/4, $1\nDATA ones+28(SB)/4, $1\nGLOBL ones(SB), RODATA|NOPTR, $32\n\nDATA salt+0(SB)/4, $salt0\nDATA salt+4(SB)/4, $salt1\nDATA salt+8(SB)/4, $salt2\nDATA salt+12(SB)/4, $salt3\nDATA salt+16(SB)/4, $salt4\nDATA salt+20(SB)/4, $salt5\nDATA salt+24(SB)/4, $salt6\nDATA salt+28(SB)/4, $salt7\nGLOBL salt(SB), RODATA|NOPTR, $32\n\n// This initial block is a SIMD implementation of the mask function declared in\n// block_default.go and block_optimized.go. For each of the 8 x 32 bits words of\n// the bloom filter block, the operation performed is:\n//\n//      block[i] = 1 << ((x * salt[i]) >> 27)\n//\n// Arguments\n// ---------\n//\n// * src is a memory location where the value to use when computing the mask is\n//   located. 
The memory location is not modified.\n//\n// * tmp is a YMM register used as scratch space to hold intermediary results in\n//   the algorithm.\n//\n// * dst is a YMM register where the final mask is written.\n//\n#define generateMask(src, tmp, dst) \\\n    VMOVDQA ones(SB), dst \\\n    VPBROADCASTD src, tmp \\\n    VPMULLD salt(SB), tmp, tmp \\\n    VPSRLD $27, tmp, tmp \\\n    VPSLLVD tmp, dst, dst\n\n#define insert(salt, src, dst) \\\n    MOVL src, CX \\\n    IMULL salt, CX \\\n    SHRL $27, CX \\\n    MOVL $1, DX \\\n    SHLL CX, DX \\\n    ORL DX, dst\n\n#define check(salt, b, x) \\\n    MOVL b, CX \\\n    MOVL x, DX \\\n    IMULL salt, DX \\\n    SHRL $27, DX \\\n    BTL DX, CX \\\n    JAE notfound\n\n// func blockInsert(b *Block, x uint32)\nTEXT ·blockInsert(SB), NOSPLIT, $0-16\n    MOVQ b+0(FP), AX\n    CMPB ·hasAVX2(SB), $0\n    JE fallback\navx2:\n    generateMask(x+8(FP), Y1, Y0)\n    // Set all 1 bits of the mask in the bloom filter block.\n    VPOR (AX), Y0, Y0\n    VMOVDQU Y0, (AX)\n    VZEROUPPER\n    RET\nfallback:\n    MOVL x+8(FP), BX\n    insert($salt0, BX, 0(AX))\n    insert($salt1, BX, 4(AX))\n    insert($salt2, BX, 8(AX))\n    insert($salt3, BX, 12(AX))\n    insert($salt4, BX, 16(AX))\n    insert($salt5, BX, 20(AX))\n    insert($salt6, BX, 24(AX))\n    insert($salt7, BX, 28(AX))\n    RET\n\n// func blockCheck(b *Block, x uint32) bool\nTEXT ·blockCheck(SB), NOSPLIT, $0-17\n    MOVQ b+0(FP), AX\n    CMPB ·hasAVX2(SB), $0\n    JE fallback\navx2:\n    generateMask(x+8(FP), Y1, Y0)\n    // Compare the 1 bits of the mask with the bloom filter block, then compare\n    // the result with the mask, expecting equality if the value `x` was present\n    // in the block.\n    VPAND (AX), Y0, Y1 // Y0 = block & mask\n    VPTEST Y0, Y1      // if (Y0 & ^Y1) != 0 { CF = 1 }\n    SETCS ret+16(FP)   // return CF == 1\n    VZEROUPPER\n    RET\nfallback:\n    MOVL x+8(FP), BX\n    check($salt0, 0(AX), BX)\n    check($salt1, 4(AX), BX)\n    check($salt2, 8(AX), 
BX)\n    check($salt3, 12(AX), BX)\n    check($salt4, 16(AX), BX)\n    check($salt5, 20(AX), BX)\n    check($salt6, 24(AX), BX)\n    check($salt7, 28(AX), BX)\n    MOVB $1, CX\n    JMP done\nnotfound:\n    XORB CX, CX\ndone:\n    MOVB CX, ret+16(FP)\n    RET\n"
  },
  {
    "path": "bloom/block_default.go",
    "content": "//go:build purego && parquet.bloom.no_unroll\n\npackage bloom\n\n// This file contains direct translation of the algorithms described in the\n// parquet bloom filter spec:\n// https://github.com/apache/parquet-format/blob/master/BloomFilter.md\n//\n// There are no practical reasons to eable the parquet.bloom.no_unroll build\n// tag, the code is left here as a reference to ensure that the optimized\n// implementations of block operations behave the same as the functions in this\n// file.\n\nvar salt = [8]uint32{\n\t0: salt0,\n\t1: salt1,\n\t2: salt2,\n\t3: salt3,\n\t4: salt4,\n\t5: salt5,\n\t6: salt6,\n\t7: salt7,\n}\n\nfunc (w *Word) set(i uint) {\n\t*w |= Word(1 << i)\n}\n\nfunc (w Word) has(i uint) bool {\n\treturn ((w >> Word(i)) & 1) != 0\n}\n\nfunc mask(x uint32) Block {\n\tvar b Block\n\tfor i := uint(0); i < 8; i++ {\n\t\ty := x * salt[i]\n\t\tb[i].set(uint(y) >> 27)\n\t}\n\treturn b\n}\n\nfunc (b *Block) Insert(x uint32) {\n\tmasked := mask(x)\n\tfor i := uint(0); i < 8; i++ {\n\t\tfor j := uint(0); j < 32; j++ {\n\t\t\tif masked[i].has(j) {\n\t\t\t\tb[i].set(j)\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (b *Block) Check(x uint32) bool {\n\tmasked := mask(x)\n\tfor i := uint(0); i < 8; i++ {\n\t\tfor j := uint(0); j < 32; j++ {\n\t\t\tif masked[i].has(j) {\n\t\t\t\tif !b[i].has(j) {\n\t\t\t\t\treturn false\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\treturn true\n}\n"
  },
  {
    "path": "bloom/block_optimized.go",
    "content": "//go:build (!amd64 || purego) && !parquet.bloom.no_unroll\n\npackage bloom\n\n// The functions in this file are optimized versions of the algorithms described\n// in https://github.com/apache/parquet-format/blob/master/BloomFilter.md\n//\n// The functions are manual unrolling of the loops, which yield significant\n// performance improvements:\n//\n// goos: darwin\n// goarch: amd64\n// pkg: github.com/segmentio/parquet-go/bloom\n// cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz\n//\n// name         old time/op    new time/op      delta\n// BlockInsert     327ns ± 1%        12ns ± 4%    -96.47%  (p=0.000 n=9+8)\n// BlockCheck      240ns ± 4%        13ns ±28%    -94.75%  (p=0.000 n=8+10)\n//\n// name         old speed      new speed        delta\n// BlockInsert  97.8MB/s ± 1%  2725.0MB/s ±13%  +2686.59%  (p=0.000 n=9+9)\n// BlockCheck    133MB/s ± 4%    2587MB/s ±23%  +1838.46%  (p=0.000 n=8+10)\n//\n// The benchmarks measure throughput based on the byte size of a bloom filter\n// block.\n\nfunc (b *Block) Insert(x uint32) {\n\tb[0] |= 1 << ((x * salt0) >> 27)\n\tb[1] |= 1 << ((x * salt1) >> 27)\n\tb[2] |= 1 << ((x * salt2) >> 27)\n\tb[3] |= 1 << ((x * salt3) >> 27)\n\tb[4] |= 1 << ((x * salt4) >> 27)\n\tb[5] |= 1 << ((x * salt5) >> 27)\n\tb[6] |= 1 << ((x * salt6) >> 27)\n\tb[7] |= 1 << ((x * salt7) >> 27)\n}\n\nfunc (b *Block) Check(x uint32) bool {\n\treturn ((b[0] & (1 << ((x * salt0) >> 27))) != 0) &&\n\t\t((b[1] & (1 << ((x * salt1) >> 27))) != 0) &&\n\t\t((b[2] & (1 << ((x * salt2) >> 27))) != 0) &&\n\t\t((b[3] & (1 << ((x * salt3) >> 27))) != 0) &&\n\t\t((b[4] & (1 << ((x * salt4) >> 27))) != 0) &&\n\t\t((b[5] & (1 << ((x * salt5) >> 27))) != 0) &&\n\t\t((b[6] & (1 << ((x * salt6) >> 27))) != 0) &&\n\t\t((b[7] & (1 << ((x * salt7) >> 27))) != 0)\n}\n\nfunc (f SplitBlockFilter) insertBulk(x []uint64) {\n\tfor i := range x {\n\t\tf.Insert(x[i])\n\t}\n}\n"
  },
  {
    "path": "bloom/block_test.go",
    "content": "package bloom_test\n\nimport (\n\t\"math\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/bloom\"\n)\n\nfunc TestBlock(t *testing.T) {\n\tfor i := uint64(0); i < math.MaxUint32; i = (i * 2) + 1 {\n\t\tx := uint32(i)\n\t\tb := bloom.Block{}\n\t\tb.Insert(x)\n\t\tif !b.Check(x) {\n\t\t\tt.Fatalf(\"bloom filter block does not contain the value that was inserted: %d\", x)\n\t\t}\n\t\tif b.Check(x - 1) {\n\t\t\tt.Fatalf(\"bloom filter block contains value that was not inserted: %d\", ^x)\n\t\t}\n\t\tif b.Check(x + 1) {\n\t\t\tt.Fatalf(\"bloom filter block contains value that was not inserted: %d\", ^x)\n\t\t}\n\t\tif b.Check(^x) {\n\t\t\tt.Fatalf(\"bloom filter block contains value that was not inserted: %d\", ^x)\n\t\t}\n\t}\n}\n\nfunc BenchmarkBlockInsert(b *testing.B) {\n\tx := bloom.Block{}\n\tfor i := 0; i < b.N; i++ {\n\t\tx.Insert(uint32(i))\n\t}\n\tb.SetBytes(bloom.BlockSize)\n}\n\nfunc BenchmarkBlockCheck(b *testing.B) {\n\tx := bloom.Block{}\n\tx.Insert(42)\n\tfor i := 0; i < b.N; i++ {\n\t\tx.Check(42)\n\t}\n\tb.SetBytes(bloom.BlockSize)\n}\n"
  },
  {
    "path": "bloom/bloom.go",
    "content": "// Package bloom implements parquet bloom filters.\npackage bloom\n\nfunc fasthash1x64(value uint64, scale int32) uint64 {\n\treturn ((value >> 32) * uint64(scale)) >> 32\n}\n\nfunc fasthash4x64(dst, src *[4]uint64, scale int32) {\n\tdst[0] = ((src[0] >> 32) * uint64(scale)) >> 32\n\tdst[1] = ((src[1] >> 32) * uint64(scale)) >> 32\n\tdst[2] = ((src[2] >> 32) * uint64(scale)) >> 32\n\tdst[3] = ((src[3] >> 32) * uint64(scale)) >> 32\n}\n"
  },
  {
    "path": "bloom/bloom_test.go",
    "content": "package bloom\n\nimport (\n\t\"math/rand\"\n\t\"testing\"\n)\n\n// Test file for internal functions of the bloom package.\nvar global4x64 [4]uint64\n\nfunc TestFasthash(t *testing.T) {\n\tr := rand.NewSource(0).(rand.Source64)\n\n\tsrc := [4]uint64{r.Uint64(), r.Uint64(), r.Uint64(), r.Uint64()}\n\tdst := [4]uint64{0, 0, 0, 0}\n\texp := [4]uint64{483, 125, 335, 539}\n\tmod := int32(1024)\n\n\tfasthash4x64(&dst, &src, mod)\n\n\tif dst != exp {\n\t\tt.Errorf(\"got=%v want=%v\", dst, exp)\n\t}\n}\n\nfunc BenchmarkFasthash(b *testing.B) {\n\tsrc := [4]uint64{}\n\tdst := [4]uint64{}\n\tmod := int32(1024)\n\n\tfor i := 0; i < b.N; i++ {\n\t\tfasthash4x64(&dst, &src, mod)\n\t}\n\n\tb.SetBytes(32)\n\tglobal4x64 = dst // use it so the loop isn't optimized away\n}\n"
  },
  {
    "path": "bloom/filter.go",
    "content": "package bloom\n\nimport (\n\t\"io\"\n\t\"sync\"\n\t\"unsafe\"\n)\n\n// Filter is an interface representing read-only bloom filters where programs\n// can probe for the possible presence of a hash key.\ntype Filter interface {\n\tCheck(uint64) bool\n}\n\n// SplitBlockFilter is an in-memory implementation of the parquet bloom filters.\n//\n// This type is useful to construct bloom filters that are later serialized\n// to a storage medium.\ntype SplitBlockFilter []Block\n\n// MakeSplitBlockFilter constructs a SplitBlockFilter value from the data byte\n// slice.\nfunc MakeSplitBlockFilter(data []byte) SplitBlockFilter {\n\tp := *(*unsafe.Pointer)(unsafe.Pointer(&data))\n\tn := len(data) / BlockSize\n\treturn unsafe.Slice((*Block)(p), n)\n}\n\n// NumSplitBlocksOf returns the number of blocks in a filter intended to hold\n// the given number of values and bits of filter per value.\n//\n// This function is useful to determine the number of blocks when creating bloom\n// filters in memory, for example:\n//\n//\tf := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(n, 10))\nfunc NumSplitBlocksOf(numValues int64, bitsPerValue uint) int {\n\tnumBytes := ((uint(numValues) * bitsPerValue) + 7) / 8\n\tnumBlocks := (numBytes + (BlockSize - 1)) / BlockSize\n\treturn int(numBlocks)\n}\n\n// Reset clears the content of the filter f.\nfunc (f SplitBlockFilter) Reset() {\n\tfor i := range f {\n\t\tf[i] = Block{}\n\t}\n}\n\n// Block returns a pointer to the block that the given value hashes to in the\n// bloom filter.\nfunc (f SplitBlockFilter) Block(x uint64) *Block { return &f[fasthash1x64(x, int32(len(f)))] }\n\n// InsertBulk adds all values from x into f.\nfunc (f SplitBlockFilter) InsertBulk(x []uint64) { filterInsertBulk(f, x) }\n\n// Insert adds x to f.\nfunc (f SplitBlockFilter) Insert(x uint64) { filterInsert(f, x) }\n\n// Check tests whether x is in f.\nfunc (f SplitBlockFilter) Check(x uint64) bool { return filterCheck(f, x) }\n\n// Bytes converts f to a 
byte slice.\n//\n// The returned slice shares the memory of f. The method is intended to be used\n// to serialize the bloom filter to a storage medium.\nfunc (f SplitBlockFilter) Bytes() []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&f)), len(f)*BlockSize)\n}\n\n// CheckSplitBlock is similar to bloom.SplitBlockFilter.Check but reads the\n// bloom filter of n bytes from r.\n//\n// The size n of the bloom filter is assumed to be a multiple of the block size.\nfunc CheckSplitBlock(r io.ReaderAt, n int64, x uint64) (bool, error) {\n\tblock := acquireBlock()\n\tdefer releaseBlock(block)\n\toffset := BlockSize * fasthash1x64(x, int32(n/BlockSize))\n\t_, err := r.ReadAt(block.Bytes(), int64(offset))\n\treturn block.Check(uint32(x)), err\n}\n\nvar (\n\tblockPool sync.Pool\n)\n\nfunc acquireBlock() *Block {\n\tb, _ := blockPool.Get().(*Block)\n\tif b == nil {\n\t\tb = new(Block)\n\t}\n\treturn b\n}\n\nfunc releaseBlock(b *Block) {\n\tif b != nil {\n\t\tblockPool.Put(b)\n\t}\n}\n"
  },
  {
    "path": "bloom/filter_amd64.go",
    "content": "//go:build !purego\n\npackage bloom\n\n// This file contains the signatures for bloom filter algorithms implemented in\n// filter_amd64.s.\n//\n// The assembly code provides significant speedups on filter inserts and checks,\n// with the greatest gains seen on the bulk insert operation where the use of\n// vectorized code yields great results.\n//\n// The following sections record the kind of performance improvements we were\n// able to measure, comparing with performing the filter block lookups in Go\n// and calling to the block insert and check routines:\n//\n// name              old time/op    new time/op     delta\n// FilterInsertBulk    45.1ns ± 2%    17.8ns ± 3%   -60.41%  (p=0.000 n=10+10)\n// FilterInsert        3.48ns ± 2%     2.55ns ± 1%  -26.86%  (p=0.000 n=10+8)\n// FilterCheck         3.64ns ± 3%     2.66ns ± 2%  -26.82%  (p=0.000 n=10+9)\n//\n// name              old speed      new speed       delta\n// FilterInsertBulk  11.4GB/s ± 2%  28.7GB/s ± 3%  +152.61%  (p=0.000 n=10+10)\n// FilterInsert      9.19GB/s ± 2%  12.56GB/s ± 1%  +36.71%  (p=0.000 n=10+8)\n// FilterCheck       8.80GB/s ± 3%  12.03GB/s ± 2%  +36.61%  (p=0.000 n=10+9)\n\n//go:noescape\nfunc filterInsertBulk(f []Block, x []uint64)\n\n//go:noescape\nfunc filterInsert(f []Block, x uint64)\n\n//go:noescape\nfunc filterCheck(f []Block, x uint64) bool\n"
  },
  {
    "path": "bloom/filter_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define salt0 0x47b6137b\n#define salt1 0x44974d91\n#define salt2 0x8824ad5b\n#define salt3 0xa2b7289d\n#define salt4 0x705495c7\n#define salt5 0x2df1424b\n#define salt6 0x9efc4947\n#define salt7 0x5c6bfb31\n\n// See block_amd64.s for a description of this algorithm.\n#define generateMask(src, dst) \\\n    VMOVDQA ones(SB), dst \\\n    VPMULLD salt(SB), src, src \\\n    VPSRLD $27, src, src \\\n    VPSLLVD src, dst, dst\n\n#define applyMask(src, dst) \\\n    VPOR dst, src, src \\\n    VMOVDQU src, dst\n\n#define fasthash1x64(scale, value) \\\n    SHRQ $32, value \\\n    IMULQ scale, value \\\n    SHRQ $32, value \\\n    SHLQ $5, value\n\n#define fasthash4x64(scale, value) \\\n    VPSRLQ $32, value, value \\\n    VPMULUDQ scale, value, value \\\n    VPSRLQ $32, value, value \\\n    VPSLLQ $5, value, value\n\n#define extract4x64(srcYMM, srcXMM, tmpXMM, r0, r1, r2, r3) \\\n    VEXTRACTI128 $1, srcYMM, tmpXMM \\\n    MOVQ srcXMM, r0 \\\n    VPEXTRQ $1, srcXMM, r1 \\\n    MOVQ tmpXMM, r2 \\\n    VPEXTRQ $1, tmpXMM, r3\n\n#define insert(salt, src, dst) \\\n    MOVL src, CX \\\n    IMULL salt, CX \\\n    SHRL $27, CX \\\n    MOVL $1, DX \\\n    SHLL CX, DX \\\n    ORL DX, dst\n\n#define check(salt, b, x) \\\n    MOVL b, CX \\\n    MOVL x, DX \\\n    IMULL salt, DX \\\n    SHRL $27, DX \\\n    BTL DX, CX \\\n    JAE notfound\n\n// func filterInsertBulk(f []Block, x []uint64)\nTEXT ·filterInsertBulk(SB), NOSPLIT, $0-48\n    MOVQ f_base+0(FP), AX\n    MOVQ f_len+8(FP), CX\n    MOVQ x_base+24(FP), BX\n    MOVQ x_len+32(FP), DX\n    CMPB ·hasAVX2(SB), $0\n    JE fallback\navx2:\n    VPBROADCASTQ f_base+8(FP), Y0\n    // Loop initialization, SI holds the current index in `x`, DI is the number\n    // of elements in `x` rounded down to the nearest multiple of 4.\n    XORQ SI, SI\n    MOVQ DX, DI\n    SHRQ $2, DI\n    SHLQ $2, DI\navx2loop4x64:\n    CMPQ SI, DI\n    JAE avx2loop1x64\n\n    // The masks and indexes 
for 4 input hashes are computed in each loop\n    // iteration. The hashes are loaded in Y1 so we can use vector instructions\n    // to compute all 4 indexes in parallel. The lower 32 bits of the hashes are\n    // also broadcasted in 4 YMM registers to compute the 4 masks that will then\n    // be applied to the filter.\n    VMOVDQU (BX)(SI*8), Y1\n    VPBROADCASTD 0(BX)(SI*8), Y2\n    VPBROADCASTD 8(BX)(SI*8), Y3\n    VPBROADCASTD 16(BX)(SI*8), Y4\n    VPBROADCASTD 24(BX)(SI*8), Y5\n\n    fasthash4x64(Y0, Y1)\n    generateMask(Y2, Y6)\n    generateMask(Y3, Y7)\n    generateMask(Y4, Y8)\n    generateMask(Y5, Y9)\n\n    // The next block of instructions move indexes from the vector to general\n    // purpose registers in order to use them as offsets when applying the mask\n    // to the filter.\n    extract4x64(Y1, X1, X10, R8, R9, R10, R11)\n\n    // Apply masks to the filter; this operation is sensitive to aliasing, when\n    // blocks overlap, the CPU has to serialize the reads and writes, which has\n    // a measurable impact on throughput. 
This would be frequent for small bloom\n    // filters which may have only a few blocks, the probability of seeing\n    // overlapping blocks on large filters should be small enough to make this\n    // a non-issue though.\n    applyMask(Y6, (AX)(R8*1))\n    applyMask(Y7, (AX)(R9*1))\n    applyMask(Y8, (AX)(R10*1))\n    applyMask(Y9, (AX)(R11*1))\n\n    ADDQ $4, SI\n    JMP avx2loop4x64\navx2loop1x64:\n    // Compute trailing elements in `x` if the length was not a multiple of 4.\n    // This is the same algorithm as the one in the loop4x64 section, working\n    // on a single mask/block pair at a time.\n    CMPQ SI, DX\n    JE avx2done\n    MOVQ (BX)(SI*8), R8\n    VPBROADCASTD (BX)(SI*8), Y0\n    fasthash1x64(CX, R8)\n    generateMask(Y0, Y1)\n    applyMask(Y1, (AX)(R8*1))\n    INCQ SI\n    JMP avx2loop1x64\navx2done:\n    VZEROUPPER\n    JMP done\nfallback:\n    XORQ SI, SI\n    MOVQ DX, DI\n    MOVQ CX, R10\nloop:\n    CMPQ SI, DI\n    JE done\n    MOVLQZX (BX)(SI*8), R8\n    MOVQ (BX)(SI*8), R9\n    fasthash1x64(R10, R9)\n    insert($salt0, R8, 0(AX)(R9*1))\n    insert($salt1, R8, 4(AX)(R9*1))\n    insert($salt2, R8, 8(AX)(R9*1))\n    insert($salt3, R8, 12(AX)(R9*1))\n    insert($salt4, R8, 16(AX)(R9*1))\n    insert($salt5, R8, 20(AX)(R9*1))\n    insert($salt6, R8, 24(AX)(R9*1))\n    insert($salt7, R8, 28(AX)(R9*1))\n    INCQ SI\n    JMP loop\ndone:\n    RET\n\n// func filterInsert(f []Block, x uint64)\nTEXT ·filterInsert(SB), NOSPLIT, $0-32\n    MOVQ f_base+0(FP), AX\n    MOVQ f_len+8(FP), BX\n    MOVQ x+24(FP), CX\n    fasthash1x64(BX, CX)\n    CMPB ·hasAVX2(SB), $0\n    JE fallback\navx2:\n    VPBROADCASTD x+24(FP), Y1\n    generateMask(Y1, Y0)\n    applyMask(Y0, (AX)(CX*1))\n    VZEROUPPER\n    RET\nfallback:\n    ADDQ CX, AX\n    MOVL x+24(FP), BX\n    insert($salt0, BX, 0(AX))\n    insert($salt1, BX, 4(AX))\n    insert($salt2, BX, 8(AX))\n    insert($salt3, BX, 12(AX))\n    insert($salt4, BX, 16(AX))\n    insert($salt5, BX, 20(AX))\n    insert($salt6, 
BX, 24(AX))\n    insert($salt7, BX, 28(AX))\n    RET\n\n// func filterCheck(f []Block, x uint64) bool\nTEXT ·filterCheck(SB), NOSPLIT, $0-33\n    MOVQ f_base+0(FP), AX\n    MOVQ f_len+8(FP), BX\n    MOVQ x+24(FP), CX\n    fasthash1x64(BX, CX)\n    CMPB ·hasAVX2(SB), $0\n    JE fallback\navx2:\n    VPBROADCASTD x+24(FP), Y1\n    generateMask(Y1, Y0)\n    VPAND (AX)(CX*1), Y0, Y1\n    VPTEST Y0, Y1\n    SETCS ret+32(FP)\n    VZEROUPPER\n    RET\nfallback:\n    ADDQ CX, AX\n    MOVL x+24(FP), BX\n    check($salt0, 0(AX), BX)\n    check($salt1, 4(AX), BX)\n    check($salt2, 8(AX), BX)\n    check($salt3, 12(AX), BX)\n    check($salt4, 16(AX), BX)\n    check($salt5, 20(AX), BX)\n    check($salt6, 24(AX), BX)\n    check($salt7, 28(AX), BX)\n    MOVB $1, CX\n    JMP done\nnotfound:\n    XORB CX, CX\ndone:\n    MOVB CX, ret+32(FP)\n    RET\n"
  },
  {
    "path": "bloom/filter_default.go",
    "content": "//go:build purego || !amd64\n\npackage bloom\n\nfunc filterInsertBulk(f []Block, x []uint64) {\n\tfor i := range x {\n\t\tfilterInsert(f, x[i])\n\t}\n}\n\nfunc filterInsert(f []Block, x uint64) {\n\tf[fasthash1x64(x, int32(len(f)))].Insert(uint32(x))\n}\n\nfunc filterCheck(f []Block, x uint64) bool {\n\treturn f[fasthash1x64(x, int32(len(f)))].Check(uint32(x))\n}\n"
  },
  {
    "path": "bloom/filter_test.go",
    "content": "package bloom_test\n\nimport (\n\t\"bytes\"\n\t\"math/rand\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/bloom\"\n)\n\nfunc TestSplitBlockFilter(t *testing.T) {\n\tconst N = 1000\n\tconst S = 3\n\tf := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10))\n\tp := rand.New(rand.NewSource(S))\n\n\t// Half of the values are inserted individually.\n\tfor i := 0; i < N/2; i++ {\n\t\tf.Insert(p.Uint64())\n\t}\n\t// The other half is inserted as a bulk operation.\n\tb := make([]uint64, N/2)\n\tfor i := range b {\n\t\tb[i] = p.Uint64()\n\t}\n\tf.InsertBulk(b)\n\n\tif f.Block(0) == nil {\n\t\tt.Fatal(\"looking up filter block returned impossible nil value\")\n\t}\n\n\tfor _, test := range []struct {\n\t\tscenario string\n\t\tfilter   bloom.Filter\n\t}{\n\t\t{scenario: \"filter\", filter: f},\n\t\t{scenario: \"reader\", filter: newSerializedFilter(f.Bytes())},\n\t} {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tp.Seed(S)\n\t\t\tfalsePositives := 0\n\n\t\t\tfor i := 0; i < N; i++ {\n\t\t\t\tx := p.Uint64()\n\n\t\t\t\tif !test.filter.Check(x) {\n\t\t\t\t\tt.Fatalf(\"bloom filter block does not contain the value #%d that was inserted: %d\", i, x)\n\t\t\t\t}\n\t\t\t\tif test.filter.Check(^x) {\n\t\t\t\t\tfalsePositives++\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif r := (float64(falsePositives) / N); r > 0.01 {\n\t\t\t\tt.Fatalf(\"bloom filter triggered too many false positives: %g%%\", r*100)\n\t\t\t}\n\t\t})\n\t}\n\n\tt.Run(\"Reset\", func(t *testing.T) {\n\t\tallZeros := true\n\t\tfor _, b := range f.Bytes() {\n\t\t\tif b != 0 {\n\t\t\t\tallZeros = false\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t\tif allZeros {\n\t\t\tt.Fatal(\"bloom filter bytes were all zero after inserting keys\")\n\t\t}\n\t\tf.Reset()\n\t\tfor i, b := range f.Bytes() {\n\t\t\tif b != 0 {\n\t\t\t\tt.Fatalf(\"bloom filter byte at index %d was not zero after resetting the filter: %02X\", i, b)\n\t\t\t}\n\t\t}\n\t})\n}\n\nfunc TestSplitBlockFilterBug1(t *testing.T) {\n\t// This test 
exercises the case where we bulk insert a single key in the\n\t// filter, which skips the core of the optimized assembly routines and runs\n\t// through the loop handling tails of remaining keys after consuming groups\n\t// of two or more.\n\t//\n\t// The use of quick.Check in bloom filter tests of the parquet package had\n\t// uncovered a bug which was reproduced here in isolation when debugging.\n\th := [1]uint64{0b1000101001000001001001111000000100011011001000011110011100110000}\n\tf := make(bloom.SplitBlockFilter, 1)\n\tf.InsertBulk(h[:])\n\tif !f.Check(h[0]) {\n\t\tt.Error(\"value inserted in the filter was not found\")\n\t}\n}\n\ntype serializedFilter struct {\n\tbytes.Reader\n}\n\nfunc (f *serializedFilter) Check(x uint64) bool {\n\tok, _ := bloom.CheckSplitBlock(&f.Reader, f.Size(), x)\n\treturn ok\n}\n\nfunc newSerializedFilter(b []byte) *serializedFilter {\n\tf := new(serializedFilter)\n\tf.Reset(b)\n\treturn f\n}\n\nfunc BenchmarkFilterInsertBulk(b *testing.B) {\n\tf := make(bloom.SplitBlockFilter, 99)\n\tx := make([]uint64, 16)\n\tr := rand.NewSource(0).(rand.Source64)\n\n\tfor i := range x {\n\t\tx[i] = r.Uint64()\n\t}\n\n\tfor i := 0; i < b.N; i++ {\n\t\tf.InsertBulk(x)\n\t}\n\n\tb.SetBytes(bloom.BlockSize * int64(len(x)))\n}\n\nfunc BenchmarkFilterInsert(b *testing.B) {\n\tf := make(bloom.SplitBlockFilter, 1)\n\tfor i := 0; i < b.N; i++ {\n\t\tf.Insert(uint64(i))\n\t}\n\tb.SetBytes(bloom.BlockSize)\n}\n\nfunc BenchmarkFilterCheck(b *testing.B) {\n\tf := make(bloom.SplitBlockFilter, 1)\n\tf.Insert(42)\n\tfor i := 0; i < b.N; i++ {\n\t\tf.Check(42)\n\t}\n\tb.SetBytes(bloom.BlockSize)\n}\n"
  },
  {
    "path": "bloom/hash.go",
    "content": "package bloom\n\nimport \"github.com/segmentio/parquet-go/bloom/xxhash\"\n\n// Hash is an interface abstracting the hashing algorithm used in bloom filters.\n//\n// Hash instances must be safe to use concurrently from multiple goroutines.\ntype Hash interface {\n\t// Returns the 64 bit hash of the value passed as argument.\n\tSum64(value []byte) uint64\n\n\t// Compute hashes of individual values of primitive types.\n\tSum64Uint8(value uint8) uint64\n\tSum64Uint16(value uint16) uint64\n\tSum64Uint32(value uint32) uint64\n\tSum64Uint64(value uint64) uint64\n\tSum64Uint128(value [16]byte) uint64\n\n\t// Compute hashes of the array of fixed size values passed as arguments,\n\t// returning the number of hashes written to the destination buffer.\n\tMultiSum64Uint8(dst []uint64, src []uint8) int\n\tMultiSum64Uint16(dst []uint64, src []uint16) int\n\tMultiSum64Uint32(dst []uint64, src []uint32) int\n\tMultiSum64Uint64(dst []uint64, src []uint64) int\n\tMultiSum64Uint128(dst []uint64, src [][16]byte) int\n}\n\n// XXH64 is an implementation of the Hash interface using the XXH64 algorithm.\ntype XXH64 struct{}\n\nfunc (XXH64) Sum64(b []byte) uint64 {\n\treturn xxhash.Sum64(b)\n}\n\nfunc (XXH64) Sum64Uint8(v uint8) uint64 {\n\treturn xxhash.Sum64Uint8(v)\n}\n\nfunc (XXH64) Sum64Uint16(v uint16) uint64 {\n\treturn xxhash.Sum64Uint16(v)\n}\n\nfunc (XXH64) Sum64Uint32(v uint32) uint64 {\n\treturn xxhash.Sum64Uint32(v)\n}\n\nfunc (XXH64) Sum64Uint64(v uint64) uint64 {\n\treturn xxhash.Sum64Uint64(v)\n}\n\nfunc (XXH64) Sum64Uint128(v [16]byte) uint64 {\n\treturn xxhash.Sum64Uint128(v)\n}\n\nfunc (XXH64) MultiSum64Uint8(h []uint64, v []uint8) int {\n\treturn xxhash.MultiSum64Uint8(h, v)\n}\n\nfunc (XXH64) MultiSum64Uint16(h []uint64, v []uint16) int {\n\treturn xxhash.MultiSum64Uint16(h, v)\n}\n\nfunc (XXH64) MultiSum64Uint32(h []uint64, v []uint32) int {\n\treturn xxhash.MultiSum64Uint32(h, v)\n}\n\nfunc (XXH64) MultiSum64Uint64(h []uint64, v []uint64) int 
{\n\treturn xxhash.MultiSum64Uint64(h, v)\n}\n\nfunc (XXH64) MultiSum64Uint128(h []uint64, v [][16]byte) int {\n\treturn xxhash.MultiSum64Uint128(h, v)\n}\n\nvar (\n\t_ Hash = XXH64{}\n)\n"
  },
  {
    "path": "bloom/xxhash/LICENSE",
    "content": "The following files in this directory were derived from the open-source\nproject at https://github.com/cespare/xxhash. A copy of the original\nlicense is provided below.\n------------------------------------------------------------------------\n\nCopyright (c) 2016 Caleb Spare\n\nMIT License\n\nPermission is hereby granted, free of charge, to any person obtaining\na copy of this software and associated documentation files (the\n\"Software\"), to deal in the Software without restriction, including\nwithout limitation the rights to use, copy, modify, merge, publish,\ndistribute, sublicense, and/or sell copies of the Software, and to\npermit persons to whom the Software is furnished to do so, subject to\nthe following conditions:\n\nThe above copyright notice and this permission notice shall be\nincluded in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\nEXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\nNONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE\nLIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION\nOF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION\nWITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "bloom/xxhash/sum64uint.go",
    "content": "package xxhash\n\nfunc Sum64Uint8(v uint8) uint64 {\n\th := prime5 + 1\n\th ^= uint64(v) * prime5\n\treturn avalanche(rol11(h) * prime1)\n}\n\nfunc Sum64Uint16(v uint16) uint64 {\n\th := prime5 + 2\n\th ^= uint64(v&0xFF) * prime5\n\th = rol11(h) * prime1\n\th ^= uint64(v>>8) * prime5\n\th = rol11(h) * prime1\n\treturn avalanche(h)\n}\n\nfunc Sum64Uint32(v uint32) uint64 {\n\th := prime5 + 4\n\th ^= uint64(v) * prime1\n\treturn avalanche(rol23(h)*prime2 + prime3)\n}\n\nfunc Sum64Uint64(v uint64) uint64 {\n\th := prime5 + 8\n\th ^= round(0, v)\n\treturn avalanche(rol27(h)*prime1 + prime4)\n}\n\nfunc Sum64Uint128(v [16]byte) uint64 {\n\th := prime5 + 16\n\th ^= round(0, u64(v[:8]))\n\th = rol27(h)*prime1 + prime4\n\th ^= round(0, u64(v[8:]))\n\th = rol27(h)*prime1 + prime4\n\treturn avalanche(h)\n}\n"
  },
  {
    "path": "bloom/xxhash/sum64uint_amd64.go",
    "content": "//go:build !purego\n\npackage xxhash\n\nimport \"golang.org/x/sys/cpu\"\n\n// This file contains the declaration of signatures for the multi hashing\n// functions implemented in sum64uint_amd64.s, which provides vectorized\n// versions of the algorithms written in sum64uint_purego.go.\n//\n// The use of SIMD optimization yields measurable throughput increases when\n// computing multiple hash values in parallel compared to hashing values\n// individually in loops:\n//\n// name                   old speed      new speed      delta\n// MultiSum64Uint8/4KB    4.94GB/s ± 2%  6.82GB/s ± 5%  +38.00%  (p=0.000 n=10+10)\n// MultiSum64Uint16/4KB   3.44GB/s ± 2%  4.63GB/s ± 4%  +34.56%  (p=0.000 n=10+10)\n// MultiSum64Uint32/4KB   4.84GB/s ± 2%  6.39GB/s ± 4%  +31.94%  (p=0.000 n=10+10)\n// MultiSum64Uint64/4KB   3.77GB/s ± 2%  4.95GB/s ± 2%  +31.14%  (p=0.000 n=9+10)\n// MultiSum64Uint128/4KB  1.84GB/s ± 2%  3.11GB/s ± 4%  +68.70%  (p=0.000 n=9+10)\n//\n// name                   old hash/s     new hash/s     delta\n// MultiSum64Uint8/4KB        617M ± 2%      852M ± 5%  +38.00%  (p=0.000 n=10+10)\n// MultiSum64Uint16/4KB       431M ± 2%      579M ± 4%  +34.56%  (p=0.000 n=10+10)\n// MultiSum64Uint32/4KB       605M ± 2%      799M ± 4%  +31.94%  (p=0.000 n=10+10)\n// MultiSum64Uint64/4KB       471M ± 2%      618M ± 2%  +31.14%  (p=0.000 n=9+10)\n// MultiSum64Uint128/4KB      231M ± 2%      389M ± 4%  +68.70%  (p=0.000 n=9+10)\n//\n// The benchmarks measure the throughput of hashes produced, as a rate of values\n// and bytes.\n\nvar hasAVX512 = cpu.X86.HasAVX512 &&\n\tcpu.X86.HasAVX512F &&\n\tcpu.X86.HasAVX512CD\n\n//go:noescape\nfunc MultiSum64Uint8(h []uint64, v []uint8) int\n\n//go:noescape\nfunc MultiSum64Uint16(h []uint64, v []uint16) int\n\n//go:noescape\nfunc MultiSum64Uint32(h []uint64, v []uint32) int\n\n//go:noescape\nfunc MultiSum64Uint64(h []uint64, v []uint64) int\n\n//go:noescape\nfunc MultiSum64Uint128(h []uint64, v [][16]byte) int\n"
  },
  {
    "path": "bloom/xxhash/sum64uint_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n/*\nThe algorithms in this file are assembly versions of the Go functions in the\nsum64uint_default.go file.\n\nThe implementations are mostly direct translations of the Go code to assembly,\nleveraging SIMD instructions to process chunks of the input variables in\nparallel at each loop iteration. To maximize utilization of the CPU capacity,\nsome of the functions unroll two steps of the vectorized loop per iteration,\nwhich yields further throughput because the CPU is able to process some of the\ninstruction from the two steps in parallel due to having no data dependencies\nbetween the inputs and outputs.\n\nThe use of AVX-512 yields a significant increase in throughput on all the\nalgorithms, in most part thanks to the VPMULLQ instructions which compute\n8 x 64 bits multiplication. There were no equivalent instruction in AVX2, which\nrequired emulating vector multiplication with a combination of 32 bits multiply,\nadditions, shifts, and masks: the amount of instructions and data dependencies\nresulted in the AVX2 code yielding equivalent performance characteristics for a\nmuch higher complexity.\n\nThe benchmark results below showcase the improvements that the AVX-512 code\nyields on the XXH64 algorithms:\n\nname                   old speed      new speed       delta\nMultiSum64Uint8/4KB    4.97GB/s ± 0%  14.59GB/s ± 1%  +193.73%  (p=0.000 n=10+10)\nMultiSum64Uint16/4KB   3.55GB/s ± 0%   9.46GB/s ± 0%  +166.20%  (p=0.000 n=10+9)\nMultiSum64Uint32/4KB   4.48GB/s ± 0%  13.93GB/s ± 1%  +210.93%  (p=0.000 n=10+10)\nMultiSum64Uint64/4KB   3.57GB/s ± 0%  11.12GB/s ± 1%  +211.73%  (p=0.000 n=9+10)\nMultiSum64Uint128/4KB  2.54GB/s ± 0%   6.49GB/s ± 1%  +155.69%  (p=0.000 n=10+10)\n\nname                   old hash/s     new hash/s      delta\nMultiSum64Uint8/4KB        621M ± 0%      1823M ± 1%  +193.73%  (p=0.000 n=10+10)\nMultiSum64Uint16/4KB       444M ± 0%      1182M ± 0%  +166.20%  (p=0.000 
n=10+9)\nMultiSum64Uint32/4KB       560M ± 0%      1742M ± 1%  +210.93%  (p=0.000 n=10+10)\nMultiSum64Uint64/4KB       446M ± 0%      1391M ± 1%  +211.73%  (p=0.000 n=9+10)\nMultiSum64Uint128/4KB      317M ± 0%       811M ± 1%  +155.69%  (p=0.000 n=10+10)\n\nThe functions perform runtime detection of AVX-512 support by testing the value\nof the xxhash.hasAVX512 variable declared and initialized in sum64uint_amd64.go.\nBranch mispredictions on those tests are very unlikely since the value is never\nmodified by the application. The cost of the comparisons are also amortized by\nthe bulk APIs of the MultiSum64* functions (a single test is required per call).\n\nIf a bug is suspected in the vectorized code, compiling the program or running\nthe tests with -tags=purego can help verify whether the behavior changes when\nthe program does not use the assembly versions.\n\nMaintenance of these functions can be complex; however, the XXH64 algorithm is\nunlikely to evolve, and the implementations unlikely to change. 
The tests in\nsum64uint_test.go compare the outputs of MultiSum64* functions with the\nreference xxhash.Sum64 function, future maintainers can rely on those tests\npassing as a guarantee that they have not introduced regressions.\n*/\n\n#define PRIME1 0x9E3779B185EBCA87\n#define PRIME2 0xC2B2AE3D27D4EB4F\n#define PRIME3 0x165667B19E3779F9\n#define PRIME4 0x85EBCA77C2B2AE63\n#define PRIME5 0x27D4EB2F165667C5\n\n#define prime1 R12\n#define prime2 R13\n#define prime3 R14\n#define prime4 R15\n#define prime5 R15 // same as prime4 because they are not used together\n\n#define prime1ZMM Z12\n#define prime2ZMM Z13\n#define prime3ZMM Z14\n#define prime4ZMM Z15\n#define prime5ZMM Z15\n\nDATA prime1vec<>+0(SB)/8, $PRIME1\nDATA prime1vec<>+8(SB)/8, $PRIME1\nDATA prime1vec<>+16(SB)/8, $PRIME1\nDATA prime1vec<>+24(SB)/8, $PRIME1\nDATA prime1vec<>+32(SB)/8, $PRIME1\nDATA prime1vec<>+40(SB)/8, $PRIME1\nDATA prime1vec<>+48(SB)/8, $PRIME1\nDATA prime1vec<>+56(SB)/8, $PRIME1\nGLOBL prime1vec<>(SB), RODATA|NOPTR, $64\n\nDATA prime2vec<>+0(SB)/8, $PRIME2\nDATA prime2vec<>+8(SB)/8, $PRIME2\nDATA prime2vec<>+16(SB)/8, $PRIME2\nDATA prime2vec<>+24(SB)/8, $PRIME2\nDATA prime2vec<>+32(SB)/8, $PRIME2\nDATA prime2vec<>+40(SB)/8, $PRIME2\nDATA prime2vec<>+48(SB)/8, $PRIME2\nDATA prime2vec<>+56(SB)/8, $PRIME2\nGLOBL prime2vec<>(SB), RODATA|NOPTR, $64\n\nDATA prime3vec<>+0(SB)/8, $PRIME3\nDATA prime3vec<>+8(SB)/8, $PRIME3\nDATA prime3vec<>+16(SB)/8, $PRIME3\nDATA prime3vec<>+24(SB)/8, $PRIME3\nDATA prime3vec<>+32(SB)/8, $PRIME3\nDATA prime3vec<>+40(SB)/8, $PRIME3\nDATA prime3vec<>+48(SB)/8, $PRIME3\nDATA prime3vec<>+56(SB)/8, $PRIME3\nGLOBL prime3vec<>(SB), RODATA|NOPTR, $64\n\nDATA prime4vec<>+0(SB)/8, $PRIME4\nDATA prime4vec<>+8(SB)/8, $PRIME4\nDATA prime4vec<>+16(SB)/8, $PRIME4\nDATA prime4vec<>+24(SB)/8, $PRIME4\nDATA prime4vec<>+32(SB)/8, $PRIME4\nDATA prime4vec<>+40(SB)/8, $PRIME4\nDATA prime4vec<>+48(SB)/8, $PRIME4\nDATA prime4vec<>+56(SB)/8, $PRIME4\nGLOBL prime4vec<>(SB), RODATA|NOPTR, 
$64\n\nDATA prime5vec<>+0(SB)/8, $PRIME5\nDATA prime5vec<>+8(SB)/8, $PRIME5\nDATA prime5vec<>+16(SB)/8, $PRIME5\nDATA prime5vec<>+24(SB)/8, $PRIME5\nDATA prime5vec<>+32(SB)/8, $PRIME5\nDATA prime5vec<>+40(SB)/8, $PRIME5\nDATA prime5vec<>+48(SB)/8, $PRIME5\nDATA prime5vec<>+56(SB)/8, $PRIME5\nGLOBL prime5vec<>(SB), RODATA|NOPTR, $64\n\nDATA prime5vec1<>+0(SB)/8, $PRIME5+1\nDATA prime5vec1<>+8(SB)/8, $PRIME5+1\nDATA prime5vec1<>+16(SB)/8, $PRIME5+1\nDATA prime5vec1<>+24(SB)/8, $PRIME5+1\nDATA prime5vec1<>+32(SB)/8, $PRIME5+1\nDATA prime5vec1<>+40(SB)/8, $PRIME5+1\nDATA prime5vec1<>+48(SB)/8, $PRIME5+1\nDATA prime5vec1<>+56(SB)/8, $PRIME5+1\nGLOBL prime5vec1<>(SB), RODATA|NOPTR, $64\n\nDATA prime5vec2<>+0(SB)/8, $PRIME5+2\nDATA prime5vec2<>+8(SB)/8, $PRIME5+2\nDATA prime5vec2<>+16(SB)/8, $PRIME5+2\nDATA prime5vec2<>+24(SB)/8, $PRIME5+2\nDATA prime5vec2<>+32(SB)/8, $PRIME5+2\nDATA prime5vec2<>+40(SB)/8, $PRIME5+2\nDATA prime5vec2<>+48(SB)/8, $PRIME5+2\nDATA prime5vec2<>+56(SB)/8, $PRIME5+2\nGLOBL prime5vec2<>(SB), RODATA|NOPTR, $64\n\nDATA prime5vec4<>+0(SB)/8, $PRIME5+4\nDATA prime5vec4<>+8(SB)/8, $PRIME5+4\nDATA prime5vec4<>+16(SB)/8, $PRIME5+4\nDATA prime5vec4<>+24(SB)/8, $PRIME5+4\nDATA prime5vec4<>+32(SB)/8, $PRIME5+4\nDATA prime5vec4<>+40(SB)/8, $PRIME5+4\nDATA prime5vec4<>+48(SB)/8, $PRIME5+4\nDATA prime5vec4<>+56(SB)/8, $PRIME5+4\nGLOBL prime5vec4<>(SB), RODATA|NOPTR, $64\n\nDATA prime5vec8<>+0(SB)/8, $PRIME5+8\nDATA prime5vec8<>+8(SB)/8, $PRIME5+8\nDATA prime5vec8<>+16(SB)/8, $PRIME5+8\nDATA prime5vec8<>+24(SB)/8, $PRIME5+8\nDATA prime5vec8<>+32(SB)/8, $PRIME5+8\nDATA prime5vec8<>+40(SB)/8, $PRIME5+8\nDATA prime5vec8<>+48(SB)/8, $PRIME5+8\nDATA prime5vec8<>+56(SB)/8, $PRIME5+8\nGLOBL prime5vec8<>(SB), RODATA|NOPTR, $64\n\nDATA prime5vec16<>+0(SB)/8, $PRIME5+16\nDATA prime5vec16<>+8(SB)/8, $PRIME5+16\nDATA prime5vec16<>+16(SB)/8, $PRIME5+16\nDATA prime5vec16<>+24(SB)/8, $PRIME5+16\nDATA prime5vec16<>+32(SB)/8, $PRIME5+16\nDATA prime5vec16<>+40(SB)/8, 
$PRIME5+16\nDATA prime5vec16<>+48(SB)/8, $PRIME5+16\nDATA prime5vec16<>+56(SB)/8, $PRIME5+16\nGLOBL prime5vec16<>(SB), RODATA|NOPTR, $64\n\nDATA lowbytemask<>+0(SB)/8, $0xFF\nDATA lowbytemask<>+8(SB)/8, $0xFF\nDATA lowbytemask<>+16(SB)/8, $0xFF\nDATA lowbytemask<>+24(SB)/8, $0xFF\nDATA lowbytemask<>+32(SB)/8, $0xFF\nDATA lowbytemask<>+40(SB)/8, $0xFF\nDATA lowbytemask<>+48(SB)/8, $0xFF\nDATA lowbytemask<>+56(SB)/8, $0xFF\nGLOBL lowbytemask<>(SB), RODATA|NOPTR, $64\n\nDATA vpermi2qeven<>+0(SB)/8, $0\nDATA vpermi2qeven<>+8(SB)/8, $2\nDATA vpermi2qeven<>+16(SB)/8, $4\nDATA vpermi2qeven<>+24(SB)/8, $6\nDATA vpermi2qeven<>+32(SB)/8, $(1<<3)|0\nDATA vpermi2qeven<>+40(SB)/8, $(1<<3)|2\nDATA vpermi2qeven<>+48(SB)/8, $(1<<3)|4\nDATA vpermi2qeven<>+56(SB)/8, $(1<<3)|6\nGLOBL vpermi2qeven<>(SB), RODATA|NOPTR, $64\n\nDATA vpermi2qodd<>+0(SB)/8, $1\nDATA vpermi2qodd<>+8(SB)/8, $3\nDATA vpermi2qodd<>+16(SB)/8, $5\nDATA vpermi2qodd<>+24(SB)/8, $7\nDATA vpermi2qodd<>+32(SB)/8, $(1<<3)|1\nDATA vpermi2qodd<>+40(SB)/8, $(1<<3)|3\nDATA vpermi2qodd<>+48(SB)/8, $(1<<3)|5\nDATA vpermi2qodd<>+56(SB)/8, $(1<<3)|7\nGLOBL vpermi2qodd<>(SB), RODATA|NOPTR, $64\n\n#define round(input, acc) \\\n\tIMULQ prime2, input \\\n\tADDQ  input, acc \\\n\tROLQ  $31, acc \\\n\tIMULQ prime1, acc\n\n#define avalanche(tmp, acc) \\\n    MOVQ acc, tmp \\\n    SHRQ $33, tmp \\\n    XORQ tmp, acc \\\n    IMULQ prime2, acc \\\n    MOVQ acc, tmp \\\n    SHRQ $29, tmp \\\n    XORQ tmp, acc \\\n    IMULQ prime3, acc \\\n    MOVQ acc, tmp \\\n    SHRQ $32, tmp \\\n    XORQ tmp, acc\n\n#define round8x64(input, acc) \\\n    VPMULLQ prime2ZMM, input, input \\\n    VPADDQ input, acc, acc \\\n    VPROLQ $31, acc, acc \\\n    VPMULLQ prime1ZMM, acc, acc\n\n#define avalanche8x64(tmp, acc) \\\n    VPSRLQ $33, acc, tmp \\\n    VPXORQ tmp, acc, acc \\\n    VPMULLQ prime2ZMM, acc, acc \\\n    VPSRLQ $29, acc, tmp \\\n    VPXORQ tmp, acc, acc \\\n    VPMULLQ prime3ZMM, acc, acc \\\n    VPSRLQ $32, acc, tmp \\\n    VPXORQ tmp, acc, 
acc\n\n// func MultiSum64Uint8(h []uint64, v []uint8) int\nTEXT ·MultiSum64Uint8(SB), NOSPLIT, $0-54\n    MOVQ $PRIME1, prime1\n    MOVQ $PRIME2, prime2\n    MOVQ $PRIME3, prime3\n    MOVQ $PRIME5, prime5\n\n    MOVQ h_base+0(FP), AX\n    MOVQ h_len+8(FP), CX\n    MOVQ v_base+24(FP), BX\n    MOVQ v_len+32(FP), DX\n\n    CMPQ CX, DX\n    CMOVQGT DX, CX\n    MOVQ CX, ret+48(FP)\n\n    XORQ SI, SI\n    CMPQ CX, $32\n    JB loop\n    CMPB ·hasAVX512(SB), $0\n    JE loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n\n    VMOVDQU64 prime1vec<>(SB), prime1ZMM\n    VMOVDQU64 prime2vec<>(SB), prime2ZMM\n    VMOVDQU64 prime3vec<>(SB), prime3ZMM\n    VMOVDQU64 prime5vec<>(SB), prime5ZMM\n    VMOVDQU64 prime5vec1<>(SB), Z6\nloop32x64:\n    VMOVDQA64 Z6, Z0\n    VMOVDQA64 Z6, Z3\n    VMOVDQA64 Z6, Z20\n    VMOVDQA64 Z6, Z23\n    VPMOVZXBQ (BX)(SI*1), Z1\n    VPMOVZXBQ 8(BX)(SI*1), Z4\n    VPMOVZXBQ 16(BX)(SI*1), Z21\n    VPMOVZXBQ 24(BX)(SI*1), Z24\n\n    VPMULLQ prime5ZMM, Z1, Z1\n    VPMULLQ prime5ZMM, Z4, Z4\n    VPMULLQ prime5ZMM, Z21, Z21\n    VPMULLQ prime5ZMM, Z24, Z24\n    VPXORQ Z1, Z0, Z0\n    VPXORQ Z4, Z3, Z3\n    VPXORQ Z21, Z20, Z20\n    VPXORQ Z24, Z23, Z23\n    VPROLQ $11, Z0, Z0\n    VPROLQ $11, Z3, Z3\n    VPROLQ $11, Z20, Z20\n    VPROLQ $11, Z23, Z23\n    VPMULLQ prime1ZMM, Z0, Z0\n    VPMULLQ prime1ZMM, Z3, Z3\n    VPMULLQ prime1ZMM, Z20, Z20\n    VPMULLQ prime1ZMM, Z23, Z23\n\n    avalanche8x64(Z1, Z0)\n    avalanche8x64(Z4, Z3)\n    avalanche8x64(Z21, Z20)\n    avalanche8x64(Z24, Z23)\n\n    VMOVDQU64 Z0, (AX)(SI*8)\n    VMOVDQU64 Z3, 64(AX)(SI*8)\n    VMOVDQU64 Z20, 128(AX)(SI*8)\n    VMOVDQU64 Z23, 192(AX)(SI*8)\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JB loop32x64\n    VZEROUPPER\nloop:\n    CMPQ SI, CX\n    JE done\n    MOVQ $PRIME5+1, R8\n    MOVBQZX (BX)(SI*1), R9\n\n    IMULQ prime5, R9\n    XORQ R9, R8\n    ROLQ $11, R8\n    IMULQ prime1, R8\n    avalanche(R9, R8)\n\n    MOVQ R8, (AX)(SI*8)\n    INCQ SI\n    JMP loop\ndone:\n    RET\n\n// 
func MultiSum64Uint16(h []uint64, v []uint16) int\nTEXT ·MultiSum64Uint16(SB), NOSPLIT, $0-56\n    MOVQ $PRIME1, prime1\n    MOVQ $PRIME2, prime2\n    MOVQ $PRIME3, prime3\n    MOVQ $PRIME5, prime5\n\n    MOVQ h_base+0(FP), AX\n    MOVQ h_len+8(FP), CX\n    MOVQ v_base+24(FP), BX\n    MOVQ v_len+32(FP), DX\n\n    CMPQ CX, DX\n    CMOVQGT DX, CX\n    MOVQ CX, ret+48(FP)\n\n    XORQ SI, SI\n    CMPQ CX, $16\n    JB loop\n    CMPB ·hasAVX512(SB), $0\n    JE loop\n\n    MOVQ CX, DI\n    SHRQ $4, DI\n    SHLQ $4, DI\n\n    VMOVDQU64 prime1vec<>(SB), prime1ZMM\n    VMOVDQU64 prime2vec<>(SB), prime2ZMM\n    VMOVDQU64 prime3vec<>(SB), prime3ZMM\n    VMOVDQU64 prime5vec<>(SB), prime5ZMM\n    VMOVDQU64 prime5vec2<>(SB), Z6\n    VMOVDQU64 lowbytemask<>(SB), Z7\nloop16x64:\n    VMOVDQA64 Z6, Z0\n    VMOVDQA64 Z6, Z3\n    VPMOVZXWQ (BX)(SI*2), Z1\n    VPMOVZXWQ 16(BX)(SI*2), Z4\n\n    VMOVDQA64 Z1, Z8\n    VMOVDQA64 Z4, Z9\n    VPSRLQ $8, Z8, Z8\n    VPSRLQ $8, Z9, Z9\n    VPANDQ Z7, Z1, Z1\n    VPANDQ Z7, Z4, Z4\n\n    VPMULLQ prime5ZMM, Z1, Z1\n    VPMULLQ prime5ZMM, Z4, Z4\n    VPXORQ Z1, Z0, Z0\n    VPXORQ Z4, Z3, Z3\n    VPROLQ $11, Z0, Z0\n    VPROLQ $11, Z3, Z3\n    VPMULLQ prime1ZMM, Z0, Z0\n    VPMULLQ prime1ZMM, Z3, Z3\n\n    VPMULLQ prime5ZMM, Z8, Z8\n    VPMULLQ prime5ZMM, Z9, Z9\n    VPXORQ Z8, Z0, Z0\n    VPXORQ Z9, Z3, Z3\n    VPROLQ $11, Z0, Z0\n    VPROLQ $11, Z3, Z3\n    VPMULLQ prime1ZMM, Z0, Z0\n    VPMULLQ prime1ZMM, Z3, Z3\n\n    avalanche8x64(Z1, Z0)\n    avalanche8x64(Z4, Z3)\n\n    VMOVDQU64 Z0, (AX)(SI*8)\n    VMOVDQU64 Z3, 64(AX)(SI*8)\n    ADDQ $16, SI\n    CMPQ SI, DI\n    JB loop16x64\n    VZEROUPPER\nloop:\n    CMPQ SI, CX\n    JE done\n    MOVQ $PRIME5+2, R8\n    MOVWQZX (BX)(SI*2), R9\n\n    MOVQ R9, R10\n    SHRQ $8, R10\n    ANDQ $0xFF, R9\n\n    IMULQ prime5, R9\n    XORQ R9, R8\n    ROLQ $11, R8\n    IMULQ prime1, R8\n\n    IMULQ prime5, R10\n    XORQ R10, R8\n    ROLQ $11, R8\n    IMULQ prime1, R8\n\n    avalanche(R9, R8)\n\n    MOVQ R8, 
(AX)(SI*8)\n    INCQ SI\n    JMP loop\ndone:\n    RET\n\n// func MultiSum64Uint32(h []uint64, v []uint32) int\nTEXT ·MultiSum64Uint32(SB), NOSPLIT, $0-56\n    MOVQ $PRIME1, prime1\n    MOVQ $PRIME2, prime2\n    MOVQ $PRIME3, prime3\n\n    MOVQ h_base+0(FP), AX\n    MOVQ h_len+8(FP), CX\n    MOVQ v_base+24(FP), BX\n    MOVQ v_len+32(FP), DX\n\n    CMPQ CX, DX\n    CMOVQGT DX, CX\n    MOVQ CX, ret+48(FP)\n\n    XORQ SI, SI\n    CMPQ CX, $32\n    JB loop\n    CMPB ·hasAVX512(SB), $0\n    JE loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n\n    VMOVDQU64 prime1vec<>(SB), prime1ZMM\n    VMOVDQU64 prime2vec<>(SB), prime2ZMM\n    VMOVDQU64 prime3vec<>(SB), prime3ZMM\n    VMOVDQU64 prime5vec4<>(SB), Z6\nloop32x64:\n    VMOVDQA64 Z6, Z0\n    VMOVDQA64 Z6, Z3\n    VMOVDQA64 Z6, Z20\n    VMOVDQA64 Z6, Z23\n    VPMOVZXDQ (BX)(SI*4), Z1\n    VPMOVZXDQ 32(BX)(SI*4), Z4\n    VPMOVZXDQ 64(BX)(SI*4), Z21\n    VPMOVZXDQ 96(BX)(SI*4), Z24\n\n    VPMULLQ prime1ZMM, Z1, Z1\n    VPMULLQ prime1ZMM, Z4, Z4\n    VPMULLQ prime1ZMM, Z21, Z21\n    VPMULLQ prime1ZMM, Z24, Z24\n    VPXORQ Z1, Z0, Z0\n    VPXORQ Z4, Z3, Z3\n    VPXORQ Z21, Z20, Z20\n    VPXORQ Z24, Z23, Z23\n    VPROLQ $23, Z0, Z0\n    VPROLQ $23, Z3, Z3\n    VPROLQ $23, Z20, Z20\n    VPROLQ $23, Z23, Z23\n    VPMULLQ prime2ZMM, Z0, Z0\n    VPMULLQ prime2ZMM, Z3, Z3\n    VPMULLQ prime2ZMM, Z20, Z20\n    VPMULLQ prime2ZMM, Z23, Z23\n    VPADDQ prime3ZMM, Z0, Z0\n    VPADDQ prime3ZMM, Z3, Z3\n    VPADDQ prime3ZMM, Z20, Z20\n    VPADDQ prime3ZMM, Z23, Z23\n\n    avalanche8x64(Z1, Z0)\n    avalanche8x64(Z4, Z3)\n    avalanche8x64(Z21, Z20)\n    avalanche8x64(Z24, Z23)\n\n    VMOVDQU64 Z0, (AX)(SI*8)\n    VMOVDQU64 Z3, 64(AX)(SI*8)\n    VMOVDQU64 Z20, 128(AX)(SI*8)\n    VMOVDQU64 Z23, 192(AX)(SI*8)\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JB loop32x64\n    VZEROUPPER\nloop:\n    CMPQ SI, CX\n    JE done\n    MOVQ $PRIME5+4, R8\n    MOVLQZX (BX)(SI*4), R9\n\n    IMULQ prime1, R9\n    XORQ R9, R8\n    ROLQ $23, R8\n    IMULQ 
prime2, R8\n    ADDQ prime3, R8\n    avalanche(R9, R8)\n\n    MOVQ R8, (AX)(SI*8)\n    INCQ SI\n    JMP loop\ndone:\n    RET\n\n// func MultiSum64Uint64(h []uint64, v []uint64) int\nTEXT ·MultiSum64Uint64(SB), NOSPLIT, $0-56\n    MOVQ $PRIME1, prime1\n    MOVQ $PRIME2, prime2\n    MOVQ $PRIME3, prime3\n    MOVQ $PRIME4, prime4\n\n    MOVQ h_base+0(FP), AX\n    MOVQ h_len+8(FP), CX\n    MOVQ v_base+24(FP), BX\n    MOVQ v_len+32(FP), DX\n\n    CMPQ CX, DX\n    CMOVQGT DX, CX\n    MOVQ CX, ret+48(FP)\n\n    XORQ SI, SI\n    CMPQ CX, $32\n    JB loop\n    CMPB ·hasAVX512(SB), $0\n    JE loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n\n    VMOVDQU64 prime1vec<>(SB), prime1ZMM\n    VMOVDQU64 prime2vec<>(SB), prime2ZMM\n    VMOVDQU64 prime3vec<>(SB), prime3ZMM\n    VMOVDQU64 prime4vec<>(SB), prime4ZMM\n    VMOVDQU64 prime5vec8<>(SB), Z6\nloop32x64:\n    VMOVDQA64 Z6, Z0\n    VMOVDQA64 Z6, Z3\n    VMOVDQA64 Z6, Z20\n    VMOVDQA64 Z6, Z23\n    VMOVDQU64 (BX)(SI*8), Z1\n    VMOVDQU64 64(BX)(SI*8), Z4\n    VMOVDQU64 128(BX)(SI*8), Z21\n    VMOVDQU64 192(BX)(SI*8), Z24\n\n    VPXORQ Z2, Z2, Z2\n    VPXORQ Z5, Z5, Z5\n    VPXORQ Z22, Z22, Z22\n    VPXORQ Z25, Z25, Z25\n    round8x64(Z1, Z2)\n    round8x64(Z4, Z5)\n    round8x64(Z21, Z22)\n    round8x64(Z24, Z25)\n\n    VPXORQ Z2, Z0, Z0\n    VPXORQ Z5, Z3, Z3\n    VPXORQ Z22, Z20, Z20\n    VPXORQ Z25, Z23, Z23\n    VPROLQ $27, Z0, Z0\n    VPROLQ $27, Z3, Z3\n    VPROLQ $27, Z20, Z20\n    VPROLQ $27, Z23, Z23\n    VPMULLQ prime1ZMM, Z0, Z0\n    VPMULLQ prime1ZMM, Z3, Z3\n    VPMULLQ prime1ZMM, Z20, Z20\n    VPMULLQ prime1ZMM, Z23, Z23\n    VPADDQ prime4ZMM, Z0, Z0\n    VPADDQ prime4ZMM, Z3, Z3\n    VPADDQ prime4ZMM, Z20, Z20\n    VPADDQ prime4ZMM, Z23, Z23\n\n    avalanche8x64(Z1, Z0)\n    avalanche8x64(Z4, Z3)\n    avalanche8x64(Z21, Z20)\n    avalanche8x64(Z24, Z23)\n\n    VMOVDQU64 Z0, (AX)(SI*8)\n    VMOVDQU64 Z3, 64(AX)(SI*8)\n    VMOVDQU64 Z20, 128(AX)(SI*8)\n    VMOVDQU64 Z23, 192(AX)(SI*8)\n    ADDQ $32, SI\n 
   CMPQ SI, DI\n    JB loop32x64\n    VZEROUPPER\nloop:\n    CMPQ SI, CX\n    JE done\n    MOVQ $PRIME5+8, R8\n    MOVQ (BX)(SI*8), R9\n\n    XORQ R10, R10\n    round(R9, R10)\n    XORQ R10, R8\n    ROLQ $27, R8\n    IMULQ prime1, R8\n    ADDQ prime4, R8\n    avalanche(R9, R8)\n\n    MOVQ R8, (AX)(SI*8)\n    INCQ SI\n    JMP loop\ndone:\n    RET\n\n// func MultiSum64Uint128(h []uint64, v [][16]byte) int\nTEXT ·MultiSum64Uint128(SB), NOSPLIT, $0-56\n    MOVQ $PRIME1, prime1\n    MOVQ $PRIME2, prime2\n    MOVQ $PRIME3, prime3\n    MOVQ $PRIME4, prime4\n\n    MOVQ h_base+0(FP), AX\n    MOVQ h_len+8(FP), CX\n    MOVQ v_base+24(FP), BX\n    MOVQ v_len+32(FP), DX\n\n    CMPQ CX, DX\n    CMOVQGT DX, CX\n    MOVQ CX, ret+48(FP)\n\n    XORQ SI, SI\n    CMPQ CX, $16\n    JB loop\n    CMPB ·hasAVX512(SB), $0\n    JE loop\n\n    MOVQ CX, DI\n    SHRQ $4, DI\n    SHLQ $4, DI\n\n    VMOVDQU64 prime1vec<>(SB), prime1ZMM\n    VMOVDQU64 prime2vec<>(SB), prime2ZMM\n    VMOVDQU64 prime3vec<>(SB), prime3ZMM\n    VMOVDQU64 prime4vec<>(SB), prime4ZMM\n    VMOVDQU64 prime5vec16<>(SB), Z6\n    VMOVDQU64 vpermi2qeven<>(SB), Z7\n    VMOVDQU64 vpermi2qodd<>(SB), Z8\nloop16x64:\n    // This algorithm is slightly different from the other ones, because it is\n    // the only case where the input values are larger than the output (128 bits\n    // vs 64 bits).\n    //\n    // Computing the XXH64 of 128 bits values requires doing two passes over the\n    // lower and upper 64 bits. 
The lower and upper quad words are split in\n    // separate vectors, the first pass is applied on the vector holding the\n    // lower bits of 8 input values, then the second pass is applied with the\n    // vector holding the upper bits.\n    //\n    // Following the model used in the other functions, we unroll the work of\n    // two consecutive groups of 8 values per loop iteration in order to\n    // maximize utilization of CPU resources.\n    CMPQ SI, DI\n    JE loop\n    VMOVDQA64 Z6, Z0\n    VMOVDQA64 Z6, Z20\n    VMOVDQU64 (BX), Z1\n    VMOVDQU64 64(BX), Z9\n    VMOVDQU64 128(BX), Z21\n    VMOVDQU64 192(BX), Z29\n\n    VMOVDQA64 Z7, Z2\n    VMOVDQA64 Z8, Z3\n    VMOVDQA64 Z7, Z22\n    VMOVDQA64 Z8, Z23\n\n    VPERMI2Q Z9, Z1, Z2\n    VPERMI2Q Z9, Z1, Z3\n    VPERMI2Q Z29, Z21, Z22\n    VPERMI2Q Z29, Z21, Z23\n\n    // Compute the rounds on inputs.\n    VPXORQ Z4, Z4, Z4\n    VPXORQ Z5, Z5, Z5\n    VPXORQ Z24, Z24, Z24\n    VPXORQ Z25, Z25, Z25\n    round8x64(Z2, Z4)\n    round8x64(Z3, Z5)\n    round8x64(Z22, Z24)\n    round8x64(Z23, Z25)\n\n    // Lower 64 bits.\n    VPXORQ Z4, Z0, Z0\n    VPXORQ Z24, Z20, Z20\n    VPROLQ $27, Z0, Z0\n    VPROLQ $27, Z20, Z20\n    VPMULLQ prime1ZMM, Z0, Z0\n    VPMULLQ prime1ZMM, Z20, Z20\n    VPADDQ prime4ZMM, Z0, Z0\n    VPADDQ prime4ZMM, Z20, Z20\n\n    // Upper 64 bits.\n    VPXORQ Z5, Z0, Z0\n    VPXORQ Z25, Z20, Z20\n    VPROLQ $27, Z0, Z0\n    VPROLQ $27, Z20, Z20\n    VPMULLQ prime1ZMM, Z0, Z0\n    VPMULLQ prime1ZMM, Z20, Z20\n    VPADDQ prime4ZMM, Z0, Z0\n    VPADDQ prime4ZMM, Z20, Z20\n\n    avalanche8x64(Z1, Z0)\n    avalanche8x64(Z21, Z20)\n    VMOVDQU64 Z0, (AX)(SI*8)\n    VMOVDQU64 Z20, 64(AX)(SI*8)\n    ADDQ $256, BX\n    ADDQ $16, SI\n    JMP loop16x64\n    VZEROUPPER\nloop:\n    CMPQ SI, CX\n    JE done\n    MOVQ $PRIME5+16, R8\n    MOVQ (BX), DX\n    MOVQ 8(BX), DI\n\n    XORQ R9, R9\n    XORQ R10, R10\n    round(DX, R9)\n    round(DI, R10)\n\n    XORQ R9, R8\n    ROLQ $27, R8\n    IMULQ prime1, R8\n    
ADDQ prime4, R8\n\n    XORQ R10, R8\n    ROLQ $27, R8\n    IMULQ prime1, R8\n    ADDQ prime4, R8\n\n    avalanche(R9, R8)\n    MOVQ R8, (AX)(SI*8)\n    ADDQ $16, BX\n    INCQ SI\n    JMP loop\ndone:\n    RET\n"
  },
  {
    "path": "bloom/xxhash/sum64uint_purego.go",
    "content": "//go:build purego || !amd64\n\npackage xxhash\n\nfunc MultiSum64Uint8(h []uint64, v []uint8) int {\n\tn := min(len(h), len(v))\n\th = h[:n]\n\tv = v[:n]\n\tfor i := range v {\n\t\th[i] = Sum64Uint8(v[i])\n\t}\n\treturn n\n}\n\nfunc MultiSum64Uint16(h []uint64, v []uint16) int {\n\tn := min(len(h), len(v))\n\th = h[:n]\n\tv = v[:n]\n\tfor i := range v {\n\t\th[i] = Sum64Uint16(v[i])\n\t}\n\treturn n\n}\n\nfunc MultiSum64Uint32(h []uint64, v []uint32) int {\n\tn := min(len(h), len(v))\n\th = h[:n]\n\tv = v[:n]\n\tfor i := range v {\n\t\th[i] = Sum64Uint32(v[i])\n\t}\n\treturn n\n}\n\nfunc MultiSum64Uint64(h []uint64, v []uint64) int {\n\tn := min(len(h), len(v))\n\th = h[:n]\n\tv = v[:n]\n\tfor i := range v {\n\t\th[i] = Sum64Uint64(v[i])\n\t}\n\treturn n\n}\n\nfunc MultiSum64Uint128(h []uint64, v [][16]byte) int {\n\tn := min(len(h), len(v))\n\th = h[:n]\n\tv = v[:n]\n\tfor i := range v {\n\t\th[i] = Sum64Uint128(v[i])\n\t}\n\treturn n\n}\n\nfunc min(a, b int) int {\n\tif a < b {\n\t\treturn a\n\t}\n\treturn b\n}\n"
  },
  {
    "path": "bloom/xxhash/sum64uint_test.go",
    "content": "package xxhash_test\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"testing\"\n\t\"testing/quick\"\n\t\"time\"\n\n\t\"github.com/segmentio/parquet-go/bloom/xxhash\"\n)\n\nfunc TestSumUint8(t *testing.T) {\n\tb := [1]byte{0: 42}\n\th := xxhash.Sum64Uint8(42)\n\tx := xxhash.Sum64(b[:])\n\tif h != x {\n\t\tt.Errorf(\"got %064b; want %064b\", h, x)\n\t}\n}\n\nfunc TestSumUint16(t *testing.T) {\n\tb := [2]byte{0: 42}\n\th := xxhash.Sum64Uint16(42)\n\tx := xxhash.Sum64(b[:])\n\tif h != x {\n\t\tt.Errorf(\"got %064b; want %064b\", h, x)\n\t}\n}\n\nfunc TestSumUint32(t *testing.T) {\n\tb := [4]byte{0: 42}\n\th := xxhash.Sum64Uint32(42)\n\tx := xxhash.Sum64(b[:])\n\tif h != x {\n\t\tt.Errorf(\"got %064b; want %064b\", h, x)\n\t}\n}\n\nfunc TestSumUint64(t *testing.T) {\n\tb := [8]byte{0: 42}\n\th := xxhash.Sum64Uint64(42)\n\tx := xxhash.Sum64(b[:])\n\tif h != x {\n\t\tt.Errorf(\"got %064b; want %064b\", h, x)\n\t}\n}\n\nfunc TestSumUint128(t *testing.T) {\n\tb := [16]byte{0: 42}\n\th := xxhash.Sum64Uint128(b)\n\tx := xxhash.Sum64(b[:])\n\tif h != x {\n\t\tt.Errorf(\"got %064b; want %064b\", h, x)\n\t}\n}\n\nfunc TestMultiSum64Uint8(t *testing.T) {\n\tf := func(v []uint8) bool {\n\t\th := make([]uint64, len(v))\n\t\tn := xxhash.MultiSum64Uint8(h, v)\n\t\tif n != len(v) {\n\t\t\tt.Errorf(\"return value mismatch: got %d; want %d\", n, len(v))\n\t\t\treturn false\n\t\t}\n\t\tfor i := range h {\n\t\t\tx := xxhash.Sum64(v[i : i+1])\n\t\t\tif h[i] != x {\n\t\t\t\tt.Errorf(\"sum at index %d mismatch: got %064b; want %064b\", i, h[i], x)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t}\n\tif err := quick.Check(f, nil); err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMultiSum64Uint16(t *testing.T) {\n\tf := func(v []uint16) bool {\n\t\th := make([]uint64, len(v))\n\t\tn := xxhash.MultiSum64Uint16(h, v)\n\t\tif n != len(v) {\n\t\t\tt.Errorf(\"return value mismatch: got %d; want %d\", n, len(v))\n\t\t\treturn false\n\t\t}\n\t\tfor i := range h 
{\n\t\t\tb := [2]byte{}\n\t\t\tbinary.LittleEndian.PutUint16(b[:], v[i])\n\t\t\tx := xxhash.Sum64(b[:])\n\t\t\tif h[i] != x {\n\t\t\t\tt.Errorf(\"sum at index %d mismatch: got %064b; want %064b\", i, h[i], x)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t}\n\tif err := quick.Check(f, nil); err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMultiSum64Uint32(t *testing.T) {\n\tf := func(v []uint32) bool {\n\t\th := make([]uint64, len(v))\n\t\tn := xxhash.MultiSum64Uint32(h, v)\n\t\tif n != len(v) {\n\t\t\tt.Errorf(\"return value mismatch: got %d; want %d\", n, len(v))\n\t\t\treturn false\n\t\t}\n\t\tfor i := range h {\n\t\t\tb := [4]byte{}\n\t\t\tbinary.LittleEndian.PutUint32(b[:], v[i])\n\t\t\tx := xxhash.Sum64(b[:])\n\t\t\tif h[i] != x {\n\t\t\t\tt.Errorf(\"sum at index %d mismatch: got %064b; want %064b\", i, h[i], x)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t}\n\tif err := quick.Check(f, nil); err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMultiSum64Uint64(t *testing.T) {\n\tf := func(v []uint64) bool {\n\t\th := make([]uint64, len(v))\n\t\tn := xxhash.MultiSum64Uint64(h, v)\n\t\tif n != len(v) {\n\t\t\tt.Errorf(\"return value mismatch: got %d; want %d\", n, len(v))\n\t\t\treturn false\n\t\t}\n\t\tfor i := range h {\n\t\t\tb := [8]byte{}\n\t\t\tbinary.LittleEndian.PutUint64(b[:], v[i])\n\t\t\tx := xxhash.Sum64(b[:])\n\t\t\tif h[i] != x {\n\t\t\t\tt.Errorf(\"sum at index %d mismatch: got %064b; want %064b\", i, h[i], x)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t}\n\tif err := quick.Check(f, nil); err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMultiSum64Uint128(t *testing.T) {\n\tf := func(v [][16]byte) bool {\n\t\th := make([]uint64, len(v))\n\t\tn := xxhash.MultiSum64Uint128(h, v)\n\t\tif n != len(v) {\n\t\t\tt.Errorf(\"return value mismatch: got %d; want %d\", n, len(v))\n\t\t\treturn false\n\t\t}\n\t\tfor i := range h {\n\t\t\tx := xxhash.Sum64(v[i][:])\n\t\t\tif h[i] != x {\n\t\t\t\tt.Errorf(\"sum at index %d 
mismatch: got %064b; want %064b\", i, h[i], x)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t}\n\tif err := quick.Check(f, nil); err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc reportThroughput(b *testing.B, loops, count int, start time.Time) {\n\tthroughput := float64(loops*count) / time.Since(start).Seconds()\n\t// Measure the throughput of writes to the output buffer;\n\t// it makes the results comparable across benchmarks that\n\t// have inputs of different sizes.\n\tb.SetBytes(8 * int64(count))\n\tb.ReportMetric(0, \"ns/op\")\n\tb.ReportMetric(throughput, \"hash/s\")\n}\n\nconst benchmarkBufferSize = 4096\n\nfunc BenchmarkMultiSum64Uint8(b *testing.B) {\n\tin := make([]uint8, benchmarkBufferSize)\n\tfor i := range in {\n\t\tin[i] = uint8(i)\n\t}\n\tb.Run(fmt.Sprintf(\"%dKB\", benchmarkBufferSize/1024), func(b *testing.B) {\n\t\tout := make([]uint64, len(in))\n\t\tstart := time.Now()\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_ = xxhash.MultiSum64Uint8(out, in)\n\t\t}\n\t\treportThroughput(b, b.N, len(out), start)\n\t})\n}\n\nfunc BenchmarkMultiSum64Uint16(b *testing.B) {\n\tin := make([]uint16, benchmarkBufferSize/2)\n\tfor i := range in {\n\t\tin[i] = uint16(i)\n\t}\n\tb.Run(fmt.Sprintf(\"%dKB\", benchmarkBufferSize/1024), func(b *testing.B) {\n\t\tout := make([]uint64, len(in))\n\t\tstart := time.Now()\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_ = xxhash.MultiSum64Uint16(out, in)\n\t\t}\n\t\treportThroughput(b, b.N, len(out), start)\n\t})\n}\n\nfunc BenchmarkMultiSum64Uint32(b *testing.B) {\n\tin := make([]uint32, benchmarkBufferSize/4)\n\tfor i := range in {\n\t\tin[i] = uint32(i)\n\t}\n\tb.Run(fmt.Sprintf(\"%dKB\", benchmarkBufferSize/1024), func(b *testing.B) {\n\t\tout := make([]uint64, len(in))\n\t\tstart := time.Now()\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_ = xxhash.MultiSum64Uint32(out, in)\n\t\t}\n\t\treportThroughput(b, b.N, len(out), start)\n\t})\n}\n\nfunc BenchmarkMultiSum64Uint64(b *testing.B) {\n\tin := make([]uint64, 
benchmarkBufferSize/8)\n\tfor i := range in {\n\t\tin[i] = uint64(i)\n\t}\n\tb.Run(fmt.Sprintf(\"%dKB\", benchmarkBufferSize/1024), func(b *testing.B) {\n\t\tout := make([]uint64, len(in))\n\t\tstart := time.Now()\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_ = xxhash.MultiSum64Uint64(out, in)\n\t\t}\n\t\treportThroughput(b, b.N, len(out), start)\n\t})\n}\n\nfunc BenchmarkMultiSum64Uint128(b *testing.B) {\n\tin := make([][16]byte, benchmarkBufferSize/16)\n\tfor i := range in {\n\t\tbinary.LittleEndian.PutUint64(in[i][:8], uint64(i))\n\t\tbinary.LittleEndian.PutUint64(in[i][8:], uint64(i))\n\t}\n\tb.Run(fmt.Sprintf(\"%dKB\", benchmarkBufferSize/1024), func(b *testing.B) {\n\t\tout := make([]uint64, len(in))\n\t\tstart := time.Now()\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_ = xxhash.MultiSum64Uint128(out, in)\n\t\t}\n\t\treportThroughput(b, b.N, len(out), start)\n\t})\n}\n"
  },
  {
    "path": "bloom/xxhash/xxhash.go",
    "content": "// Package xxhash is an extension of github.com/cespare/xxhash which adds\n// routines optimized to hash arrays of fixed size elements.\npackage xxhash\n\nimport (\n\t\"encoding/binary\"\n\t\"math/bits\"\n)\n\nconst (\n\tprime1 uint64 = 0x9E3779B185EBCA87\n\tprime2 uint64 = 0xC2B2AE3D27D4EB4F\n\tprime3 uint64 = 0x165667B19E3779F9\n\tprime4 uint64 = 0x85EBCA77C2B2AE63\n\tprime5 uint64 = 0x27D4EB2F165667C5\n\t// Pre-computed operations because the compiler otherwise complains that the\n\t// results overflow 64 bit integers.\n\tprime1plus2 uint64 = 0x60EA27EEADC0B5D6 // prime1 + prime2\n\tnegprime1   uint64 = 0x61C8864E7A143579 // -prime1\n)\n\nfunc avalanche(h uint64) uint64 {\n\th ^= h >> 33\n\th *= prime2\n\th ^= h >> 29\n\th *= prime3\n\th ^= h >> 32\n\treturn h\n}\n\nfunc round(acc, input uint64) uint64 {\n\tacc += input * prime2\n\tacc = rol31(acc)\n\tacc *= prime1\n\treturn acc\n}\n\nfunc mergeRound(acc, val uint64) uint64 {\n\tval = round(0, val)\n\tacc ^= val\n\tacc = acc*prime1 + prime4\n\treturn acc\n}\n\nfunc u64(b []byte) uint64 { return binary.LittleEndian.Uint64(b) }\nfunc u32(b []byte) uint32 { return binary.LittleEndian.Uint32(b) }\n\nfunc rol1(x uint64) uint64  { return bits.RotateLeft64(x, 1) }\nfunc rol7(x uint64) uint64  { return bits.RotateLeft64(x, 7) }\nfunc rol11(x uint64) uint64 { return bits.RotateLeft64(x, 11) }\nfunc rol12(x uint64) uint64 { return bits.RotateLeft64(x, 12) }\nfunc rol18(x uint64) uint64 { return bits.RotateLeft64(x, 18) }\nfunc rol23(x uint64) uint64 { return bits.RotateLeft64(x, 23) }\nfunc rol27(x uint64) uint64 { return bits.RotateLeft64(x, 27) }\nfunc rol31(x uint64) uint64 { return bits.RotateLeft64(x, 31) }\n"
  },
  {
    "path": "bloom/xxhash/xxhash_amd64.go",
    "content": "//go:build !purego\n\npackage xxhash\n\n// Sum64 computes the 64-bit xxHash digest of b.\nfunc Sum64(b []byte) uint64\n"
  },
  {
    "path": "bloom/xxhash/xxhash_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define PRIME1 0x9E3779B185EBCA87\n#define PRIME2 0xC2B2AE3D27D4EB4F\n#define PRIME3 0x165667B19E3779F9\n#define PRIME4 0x85EBCA77C2B2AE63\n#define PRIME5 0x27D4EB2F165667C5\n\nDATA prime3<>+0(SB)/8, $PRIME3\nGLOBL prime3<>(SB), RODATA|NOPTR, $8\n\nDATA prime5<>+0(SB)/8, $PRIME5\nGLOBL prime5<>(SB), RODATA|NOPTR, $8\n\n// Register allocation:\n// AX\th\n// SI\tpointer to advance through b\n// DX\tn\n// BX\tloop end\n// R8\tv1, k1\n// R9\tv2\n// R10\tv3\n// R11\tv4\n// R12\ttmp\n// R13\tPRIME1\n// R14\tPRIME2\n// DI\tPRIME4\n\n// round reads from and advances the buffer pointer in SI.\n// It assumes that R13 has PRIME1 and R14 has PRIME2.\n#define round(r) \\\n\tMOVQ  (SI), R12 \\\n\tADDQ  $8, SI    \\\n\tIMULQ R14, R12  \\\n\tADDQ  R12, r    \\\n\tROLQ  $31, r    \\\n\tIMULQ R13, r\n\n// mergeRound applies a merge round on the two registers acc and val.\n// It assumes that R13 has PRIME1, R14 has PRIME2, and DI has PRIME4.\n#define mergeRound(acc, val) \\\n\tIMULQ R14, val \\\n\tROLQ  $31, val \\\n\tIMULQ R13, val \\\n\tXORQ  val, acc \\\n\tIMULQ R13, acc \\\n\tADDQ  DI, acc\n\n// func Sum64(b []byte) uint64\nTEXT ·Sum64(SB), NOSPLIT, $0-32\n\t// Load fixed primes.\n\tMOVQ $PRIME1, R13\n\tMOVQ $PRIME2, R14\n\tMOVQ $PRIME4, DI\n\n\t// Load slice.\n\tMOVQ b_base+0(FP), SI\n\tMOVQ b_len+8(FP), DX\n\tLEAQ (SI)(DX*1), BX\n\n\t// The first loop limit will be len(b)-32.\n\tSUBQ $32, BX\n\n\t// Check whether we have at least one block.\n\tCMPQ DX, $32\n\tJLT  noBlocks\n\n\t// Set up initial state (v1, v2, v3, v4).\n\tMOVQ R13, R8\n\tADDQ R14, R8\n\tMOVQ R14, R9\n\tXORQ R10, R10\n\tXORQ R11, R11\n\tSUBQ R13, R11\n\n\t// Loop until SI > BX.\nblockLoop:\n\tround(R8)\n\tround(R9)\n\tround(R10)\n\tround(R11)\n\n\tCMPQ SI, BX\n\tJLE  blockLoop\n\n\tMOVQ R8, AX\n\tROLQ $1, AX\n\tMOVQ R9, R12\n\tROLQ $7, R12\n\tADDQ R12, AX\n\tMOVQ R10, R12\n\tROLQ $12, R12\n\tADDQ R12, AX\n\tMOVQ R11, R12\n\tROLQ $18, R12\n\tADDQ 
R12, AX\n\n\tmergeRound(AX, R8)\n\tmergeRound(AX, R9)\n\tmergeRound(AX, R10)\n\tmergeRound(AX, R11)\n\n\tJMP afterBlocks\n\nnoBlocks:\n\tMOVQ $PRIME5, AX\n\nafterBlocks:\n\tADDQ DX, AX\n\n\t// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.\n\tADDQ $24, BX\n\n\tCMPQ SI, BX\n\tJG   fourByte\n\nwordLoop:\n\t// Calculate k1.\n\tMOVQ  (SI), R8\n\tADDQ  $8, SI\n\tIMULQ R14, R8\n\tROLQ  $31, R8\n\tIMULQ R13, R8\n\n\tXORQ  R8, AX\n\tROLQ  $27, AX\n\tIMULQ R13, AX\n\tADDQ  DI, AX\n\n\tCMPQ SI, BX\n\tJLE  wordLoop\n\nfourByte:\n\tADDQ $4, BX\n\tCMPQ SI, BX\n\tJG   singles\n\n\tMOVL  (SI), R8\n\tADDQ  $4, SI\n\tIMULQ R13, R8\n\tXORQ  R8, AX\n\n\tROLQ  $23, AX\n\tIMULQ R14, AX\n\tADDQ  prime3<>(SB), AX\n\nsingles:\n\tADDQ $4, BX\n\tCMPQ SI, BX\n\tJGE  finalize\n\nsinglesLoop:\n\tMOVBQZX (SI), R12\n\tADDQ    $1, SI\n\tIMULQ   prime5<>(SB), R12\n\tXORQ    R12, AX\n\n\tROLQ  $11, AX\n\tIMULQ R13, AX\n\n\tCMPQ SI, BX\n\tJL   singlesLoop\n\nfinalize:\n\tMOVQ  AX, R12\n\tSHRQ  $33, R12\n\tXORQ  R12, AX\n\tIMULQ R14, AX\n\tMOVQ  AX, R12\n\tSHRQ  $29, R12\n\tXORQ  R12, AX\n\tIMULQ prime3<>(SB), AX\n\tMOVQ  AX, R12\n\tSHRQ  $32, R12\n\tXORQ  R12, AX\n\n\tMOVQ AX, ret+24(FP)\n\tRET\n"
  },
  {
    "path": "bloom/xxhash/xxhash_purego.go",
    "content": "//go:build purego || !amd64\n\npackage xxhash\n\n// Sum64 computes the 64-bit xxHash digest of b.\nfunc Sum64(b []byte) uint64 {\n\tvar n = len(b)\n\tvar h uint64\n\n\tif n >= 32 {\n\t\tv1 := prime1plus2\n\t\tv2 := prime2\n\t\tv3 := uint64(0)\n\t\tv4 := negprime1\n\t\tfor len(b) >= 32 {\n\t\t\tv1 = round(v1, u64(b[0:8:len(b)]))\n\t\t\tv2 = round(v2, u64(b[8:16:len(b)]))\n\t\t\tv3 = round(v3, u64(b[16:24:len(b)]))\n\t\t\tv4 = round(v4, u64(b[24:32:len(b)]))\n\t\t\tb = b[32:len(b):len(b)]\n\t\t}\n\t\th = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)\n\t\th = mergeRound(h, v1)\n\t\th = mergeRound(h, v2)\n\t\th = mergeRound(h, v3)\n\t\th = mergeRound(h, v4)\n\t} else {\n\t\th = prime5\n\t}\n\n\th += uint64(n)\n\n\ti, end := 0, len(b)\n\tfor ; i+8 <= end; i += 8 {\n\t\tk1 := round(0, u64(b[i:i+8:len(b)]))\n\t\th ^= k1\n\t\th = rol27(h)*prime1 + prime4\n\t}\n\tif i+4 <= end {\n\t\th ^= uint64(u32(b[i:i+4:len(b)])) * prime1\n\t\th = rol23(h)*prime2 + prime3\n\t\ti += 4\n\t}\n\tfor ; i < end; i++ {\n\t\th ^= uint64(b[i]) * prime5\n\t\th = rol11(h) * prime1\n\t}\n\n\treturn avalanche(h)\n}\n"
  },
  {
    "path": "bloom/xxhash/xxhash_test.go",
    "content": "package xxhash_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/bloom/xxhash\"\n)\n\nfunc TestSum64(t *testing.T) {\n\tfor _, tt := range []struct {\n\t\tname  string\n\t\tinput string\n\t\twant  uint64\n\t}{\n\t\t{\"empty\", \"\", 0xef46db3751d8e999},\n\t\t{\"a\", \"a\", 0xd24ec4f1a98c6e5b},\n\t\t{\"as\", \"as\", 0x1c330fb2d66be179},\n\t\t{\"asd\", \"asd\", 0x631c37ce72a97393},\n\t\t{\"asdf\", \"asdf\", 0x415872f599cea71e},\n\t\t{\n\t\t\t\"len=63\",\n\t\t\t// Exactly 63 characters, which exercises all code paths.\n\t\t\t\"Call me Ishmael. Some years ago--never mind how long precisely-\",\n\t\t\t0x02a2e85470d6fd96,\n\t\t},\n\t} {\n\t\tt.Run(tt.name, func(t *testing.T) {\n\t\t\tif got := xxhash.Sum64([]byte(tt.input)); got != tt.want {\n\t\t\t\tt.Fatalf(\"Sum64: got 0x%x; want 0x%x\", got, tt.want)\n\t\t\t}\n\t\t})\n\t}\n}\n\nvar benchmarks = []struct {\n\tname string\n\tn    int64\n}{\n\t{\"4B\", 4},\n\t{\"16B\", 16},\n\t{\"100B\", 100},\n\t{\"4KB\", 4e3},\n\t{\"10MB\", 10e6},\n}\n\nfunc BenchmarkSum64(b *testing.B) {\n\tfor _, bb := range benchmarks {\n\t\tin := make([]byte, bb.n)\n\t\tfor i := range in {\n\t\t\tin[i] = byte(i)\n\t\t}\n\t\tb.Run(bb.name, func(b *testing.B) {\n\t\t\tb.SetBytes(bb.n)\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\t_ = xxhash.Sum64(in)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "bloom.go",
    "content": "package parquet\n\nimport (\n\t\"io\"\n\n\t\"github.com/segmentio/parquet-go/bloom\"\n\t\"github.com/segmentio/parquet-go/bloom/xxhash\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\n// BloomFilter is an interface allowing applications to test whether a key\n// exists in a bloom filter.\ntype BloomFilter interface {\n\t// Implement the io.ReaderAt interface as a mechanism to allow reading the\n\t// raw bits of the filter.\n\tio.ReaderAt\n\n\t// Returns the size of the bloom filter (in bytes).\n\tSize() int64\n\n\t// Tests whether the given value is present in the filter.\n\t//\n\t// A non-nil error may be returned if reading the filter failed. This may\n\t// happen if the filter was lazily loaded from a storage medium during the\n\t// call to Check for example. Applications that can guarantee that the\n\t// filter was in memory at the time Check was called can safely ignore the\n\t// error, which would always be nil in this case.\n\tCheck(value Value) (bool, error)\n}\n\ntype bloomFilter struct {\n\tio.SectionReader\n\thash  bloom.Hash\n\tcheck func(io.ReaderAt, int64, uint64) (bool, error)\n}\n\nfunc (f *bloomFilter) Check(v Value) (bool, error) {\n\treturn f.check(&f.SectionReader, f.Size(), v.hash(f.hash))\n}\n\nfunc (v Value) hash(h bloom.Hash) uint64 {\n\tswitch v.Kind() {\n\tcase Boolean:\n\t\treturn h.Sum64Uint8(v.byte())\n\tcase Int32, Float:\n\t\treturn h.Sum64Uint32(v.uint32())\n\tcase Int64, Double:\n\t\treturn h.Sum64Uint64(v.uint64())\n\tdefault: // Int96, ByteArray, FixedLenByteArray, or null\n\t\treturn h.Sum64(v.byteArray())\n\t}\n}\n\nfunc newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *bloomFilter {\n\tif header.Algorithm.Block != nil {\n\t\tif header.Hash.XxHash != nil {\n\t\t\tif header.Compression.Uncompressed != nil {\n\t\t\t\treturn 
&bloomFilter{\n\t\t\t\t\tSectionReader: *io.NewSectionReader(file, offset, int64(header.NumBytes)),\n\t\t\t\t\thash:          bloom.XXH64{},\n\t\t\t\t\tcheck:         bloom.CheckSplitBlock,\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\treturn nil\n}\n\n// The BloomFilterColumn interface is a declarative representation of bloom filters\n// used when configuring filters on a parquet writer.\ntype BloomFilterColumn interface {\n\t// Returns the path of the column that the filter applies to.\n\tPath() []string\n\n\t// Returns the hashing algorithm used when inserting values into a bloom\n\t// filter.\n\tHash() bloom.Hash\n\n\t// Returns an encoding which can be used to write columns of values to the\n\t// filter.\n\tEncoding() encoding.Encoding\n\n\t// Returns the size of the filter needed to encode values in the filter,\n\t// assuming each value will be encoded with the given number of bits.\n\tSize(numValues int64) int\n}\n\n// SplitBlockFilter constructs a split block bloom filter object for the column\n// at the given path, with the given bitsPerValue.\n//\n// If you are unsure what number of bitsPerValue to use, 10 is a reasonable\n// tradeoff between size and error rate for common datasets.\n//\n// For more information on the tradeoff between size and error rate, consult\n// this website: https://hur.st/bloomfilter/?n=4000&p=0.1&m=&k=1\nfunc SplitBlockFilter(bitsPerValue uint, path ...string) BloomFilterColumn {\n\treturn splitBlockFilter{\n\t\tbitsPerValue: bitsPerValue,\n\t\tpath:         path,\n\t}\n}\n\ntype splitBlockFilter struct {\n\tbitsPerValue uint\n\tpath         []string\n}\n\nfunc (f splitBlockFilter) Path() []string              { return f.path }\nfunc (f splitBlockFilter) Hash() bloom.Hash            { return bloom.XXH64{} }\nfunc (f splitBlockFilter) Encoding() encoding.Encoding { return splitBlockEncoding{} }\n\nfunc (f splitBlockFilter) Size(numValues int64) int {\n\treturn bloom.BlockSize * bloom.NumSplitBlocksOf(numValues, f.bitsPerValue)\n}\n\n// 
Creates a header from the given bloom filter.\n//\n// For now there is only one type of filter supported, but we provide this\n// function to suggest a model for extending the implementation if new filters\n// are added to the parquet specs.\nfunc bloomFilterHeader(filter BloomFilterColumn) (header format.BloomFilterHeader) {\n\tswitch filter.(type) {\n\tcase splitBlockFilter:\n\t\theader.Algorithm.Block = &format.SplitBlockAlgorithm{}\n\t}\n\tswitch filter.Hash().(type) {\n\tcase bloom.XXH64:\n\t\theader.Hash.XxHash = &format.XxHash{}\n\t}\n\theader.Compression.Uncompressed = &format.BloomFilterUncompressed{}\n\treturn header\n}\n\nfunc searchBloomFilterColumn(filters []BloomFilterColumn, path columnPath) BloomFilterColumn {\n\tfor _, f := range filters {\n\t\tif path.equal(f.Path()) {\n\t\t\treturn f\n\t\t}\n\t}\n\treturn nil\n}\n\nconst (\n\t// Size of the stack buffer used to perform bulk operations on bloom filters.\n\t//\n\t// This value was determined as being a good default empirically,\n\t// 128 x uint64 makes a 1KiB buffer which amortizes the cost of calling\n\t// methods of bloom filters while not causing too much stack growth either.\n\tfilterEncodeBufferSize = 128\n)\n\ntype splitBlockEncoding struct {\n\tencoding.NotSupported\n}\n\nfunc (splitBlockEncoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\tsplitBlockEncodeUint8(bloom.MakeSplitBlockFilter(dst), src)\n\treturn dst, nil\n}\n\nfunc (splitBlockEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\tsplitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.Int32ToUint32(src))\n\treturn dst, nil\n}\n\nfunc (splitBlockEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) {\n\tsplitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.Int64ToUint64(src))\n\treturn dst, nil\n}\n\nfunc (e splitBlockEncoding) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) {\n\tsplitBlockEncodeFixedLenByteArray(bloom.MakeSplitBlockFilter(dst), 
deprecated.Int96ToBytes(src), 12)\n\treturn dst, nil\n}\n\nfunc (splitBlockEncoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) {\n\tsplitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.Float32ToUint32(src))\n\treturn dst, nil\n}\n\nfunc (splitBlockEncoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) {\n\tsplitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.Float64ToUint64(src))\n\treturn dst, nil\n}\n\nfunc (splitBlockEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {\n\tfilter := bloom.MakeSplitBlockFilter(dst)\n\tbuffer := make([]uint64, 0, filterEncodeBufferSize)\n\tbaseOffset := offsets[0]\n\n\tfor _, endOffset := range offsets[1:] {\n\t\tvalue := src[baseOffset:endOffset:endOffset]\n\t\tbaseOffset = endOffset\n\n\t\tif len(buffer) == cap(buffer) {\n\t\t\tfilter.InsertBulk(buffer)\n\t\t\tbuffer = buffer[:0]\n\t\t}\n\n\t\tbuffer = append(buffer, xxhash.Sum64(value))\n\t}\n\n\tfilter.InsertBulk(buffer)\n\treturn dst, nil\n}\n\nfunc (splitBlockEncoding) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {\n\tfilter := bloom.MakeSplitBlockFilter(dst)\n\tif size == 16 {\n\t\tsplitBlockEncodeUint128(filter, unsafecast.BytesToUint128(src))\n\t} else {\n\t\tsplitBlockEncodeFixedLenByteArray(filter, src, size)\n\t}\n\treturn dst, nil\n}\n\nfunc splitBlockEncodeFixedLenByteArray(filter bloom.SplitBlockFilter, data []byte, size int) {\n\tbuffer := make([]uint64, 0, filterEncodeBufferSize)\n\n\tfor i, j := 0, size; j <= len(data); {\n\t\tif len(buffer) == cap(buffer) {\n\t\t\tfilter.InsertBulk(buffer)\n\t\t\tbuffer = buffer[:0]\n\t\t}\n\t\tbuffer = append(buffer, xxhash.Sum64(data[i:j]))\n\t\ti += size\n\t\tj += size\n\t}\n\n\tfilter.InsertBulk(buffer)\n}\n\nfunc splitBlockEncodeUint8(filter bloom.SplitBlockFilter, values []uint8) {\n\tbuffer := make([]uint64, filterEncodeBufferSize)\n\n\tfor i := 0; i < len(values); {\n\t\tn := xxhash.MultiSum64Uint8(buffer, 
values[i:])\n\t\tfilter.InsertBulk(buffer[:n])\n\t\ti += n\n\t}\n}\n\nfunc splitBlockEncodeUint32(filter bloom.SplitBlockFilter, values []uint32) {\n\tbuffer := make([]uint64, filterEncodeBufferSize)\n\n\tfor i := 0; i < len(values); {\n\t\tn := xxhash.MultiSum64Uint32(buffer, values[i:])\n\t\tfilter.InsertBulk(buffer[:n])\n\t\ti += n\n\t}\n}\n\nfunc splitBlockEncodeUint64(filter bloom.SplitBlockFilter, values []uint64) {\n\tbuffer := make([]uint64, filterEncodeBufferSize)\n\n\tfor i := 0; i < len(values); {\n\t\tn := xxhash.MultiSum64Uint64(buffer, values[i:])\n\t\tfilter.InsertBulk(buffer[:n])\n\t\ti += n\n\t}\n}\n\nfunc splitBlockEncodeUint128(filter bloom.SplitBlockFilter, values [][16]byte) {\n\tbuffer := make([]uint64, filterEncodeBufferSize)\n\n\tfor i := 0; i < len(values); {\n\t\tn := xxhash.MultiSum64Uint128(buffer, values[i:])\n\t\tfilter.InsertBulk(buffer[:n])\n\t\ti += n\n\t}\n}\n"
  },
  {
    "path": "bloom_test.go",
    "content": "package parquet\n\nimport (\n\t\"math/rand\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/bloom\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc TestSplitBlockFilter(t *testing.T) {\n\tnewFilter := func(numValues int) bloom.SplitBlockFilter {\n\t\treturn make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(int64(numValues), 11))\n\t}\n\n\tenc := SplitBlockFilter(10, \"$\").Encoding()\n\n\tcheck := func(filter bloom.SplitBlockFilter, value Value) bool {\n\t\treturn filter.Check(value.hash(&bloom.XXH64{}))\n\t}\n\n\ttests := []struct {\n\t\tscenario string\n\t\tfunction interface{}\n\t}{\n\t\t{\n\t\t\tscenario: \"BOOLEAN\",\n\t\t\tfunction: func(values []bool) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeBoolean(filter.Bytes(), unsafecast.BoolToBytes(values))\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"INT32\",\n\t\t\tfunction: func(values []int32) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeInt32(filter.Bytes(), values)\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"INT64\",\n\t\t\tfunction: func(values []int64) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeInt64(filter.Bytes(), values)\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"INT96\",\n\t\t\tfunction: func(values []deprecated.Int96) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeInt96(filter.Bytes(), values)\n\t\t\t\tfor _, v := 
range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"FLOAT\",\n\t\t\tfunction: func(values []float32) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeFloat(filter.Bytes(), values)\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"DOUBLE\",\n\t\t\tfunction: func(values []float64) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeDouble(filter.Bytes(), values)\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"BYTE_ARRAY\",\n\t\t\tfunction: func(values [][]byte) bool {\n\t\t\t\tcontent := make([]byte, 0, 512)\n\t\t\t\toffsets := make([]uint32, len(values))\n\t\t\t\tfor _, value := range values {\n\t\t\t\t\toffsets = append(offsets, uint32(len(content)))\n\t\t\t\t\tcontent = append(content, value...)\n\t\t\t\t}\n\t\t\t\toffsets = append(offsets, uint32(len(content)))\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeByteArray(filter.Bytes(), content, offsets)\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf(v)) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"FIXED_LEN_BYTE_ARRAY\",\n\t\t\tfunction: func(values []byte) bool {\n\t\t\t\tfilter := newFilter(len(values))\n\t\t\t\tenc.EncodeFixedLenByteArray(filter.Bytes(), values, 1)\n\t\t\t\tfor _, v := range values {\n\t\t\t\t\tif !check(filter, ValueOf([1]byte{v})) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t},\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tif err := 
quick.Check(test.function); err != nil {\n\t\t\t\tt.Error(err)\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkSplitBlockFilter(b *testing.B) {\n\tconst N = 1000\n\tf := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10)).Bytes()\n\te := SplitBlockFilter(10, \"$\").Encoding()\n\n\tv := make([]int64, N)\n\tr := rand.NewSource(10)\n\tfor i := range v {\n\t\tv[i] = r.Int63()\n\t}\n\n\tfor i := 0; i < b.N; i++ {\n\t\te.EncodeInt64(f, v)\n\t}\n\n\tb.SetBytes(8 * N)\n}\n"
  },
  {
    "path": "buffer.go",
    "content": "package parquet\n\nimport (\n\t\"log\"\n\t\"runtime\"\n\t\"sort\"\n\t\"sync\"\n\t\"sync/atomic\"\n\n\t\"github.com/segmentio/parquet-go/internal/debug\"\n)\n\n// Buffer represents an in-memory group of parquet rows.\n//\n// The main purpose of the Buffer type is to provide a way to sort rows before\n// writing them to a parquet file. Buffer implements sort.Interface as a way\n// to support reordering the rows that have been written to it.\ntype Buffer struct {\n\tconfig  *RowGroupConfig\n\tschema  *Schema\n\trowbuf  []Row\n\tcolbuf  [][]Value\n\tchunks  []ColumnChunk\n\tcolumns []ColumnBuffer\n\tsorted  []ColumnBuffer\n}\n\n// NewBuffer constructs a new buffer, using the given list of buffer options\n// to configure the buffer returned by the function.\n//\n// The function panics if the buffer configuration is invalid. Programs that\n// cannot guarantee the validity of the options passed to NewBuffer should\n// construct the buffer configuration independently prior to calling this\n// function:\n//\n//\tconfig, err := parquet.NewRowGroupConfig(options...)\n//\tif err != nil {\n//\t\t// handle the configuration error\n//\t\t...\n//\t} else {\n//\t\t// this call to create a buffer is guaranteed not to panic\n//\t\tbuffer := parquet.NewBuffer(config)\n//\t\t...\n//\t}\nfunc NewBuffer(options ...RowGroupOption) *Buffer {\n\tconfig, err := NewRowGroupConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\tbuf := &Buffer{\n\t\tconfig: config,\n\t}\n\tif config.Schema != nil {\n\t\tbuf.configure(config.Schema)\n\t}\n\treturn buf\n}\n\nfunc (buf *Buffer) configure(schema *Schema) {\n\tif schema == nil {\n\t\treturn\n\t}\n\tsortingColumns := buf.config.Sorting.SortingColumns\n\tbuf.sorted = make([]ColumnBuffer, len(sortingColumns))\n\n\tforEachLeafColumnOf(schema, func(leaf leafColumn) {\n\t\tnullOrdering := nullsGoLast\n\t\tcolumnIndex := int(leaf.columnIndex)\n\t\tcolumnType := leaf.node.Type()\n\t\tbufferCap := 
buf.config.ColumnBufferCapacity\n\t\tdictionary := (Dictionary)(nil)\n\t\tencoding := encodingOf(leaf.node)\n\n\t\tif isDictionaryEncoding(encoding) {\n\t\t\testimatedDictBufferSize := columnType.EstimateSize(bufferCap)\n\t\t\tdictBuffer := columnType.NewValues(\n\t\t\t\tmake([]byte, 0, estimatedDictBufferSize),\n\t\t\t\tnil,\n\t\t\t)\n\t\t\tdictionary = columnType.NewDictionary(columnIndex, 0, dictBuffer)\n\t\t\tcolumnType = dictionary.Type()\n\t\t}\n\n\t\tsortingIndex := searchSortingColumn(sortingColumns, leaf.path)\n\t\tif sortingIndex < len(sortingColumns) && sortingColumns[sortingIndex].NullsFirst() {\n\t\t\tnullOrdering = nullsGoFirst\n\t\t}\n\n\t\tcolumn := columnType.NewColumnBuffer(columnIndex, bufferCap)\n\t\tswitch {\n\t\tcase leaf.maxRepetitionLevel > 0:\n\t\t\tcolumn = newRepeatedColumnBuffer(column, leaf.maxRepetitionLevel, leaf.maxDefinitionLevel, nullOrdering)\n\t\tcase leaf.maxDefinitionLevel > 0:\n\t\t\tcolumn = newOptionalColumnBuffer(column, leaf.maxDefinitionLevel, nullOrdering)\n\t\t}\n\t\tbuf.columns = append(buf.columns, column)\n\n\t\tif sortingIndex < len(sortingColumns) {\n\t\t\tif sortingColumns[sortingIndex].Descending() {\n\t\t\t\tcolumn = &reversedColumnBuffer{column}\n\t\t\t}\n\t\t\tbuf.sorted[sortingIndex] = column\n\t\t}\n\t})\n\n\tbuf.schema = schema\n\tbuf.rowbuf = make([]Row, 0, 1)\n\tbuf.colbuf = make([][]Value, len(buf.columns))\n\tbuf.chunks = make([]ColumnChunk, len(buf.columns))\n\n\tfor i, column := range buf.columns {\n\t\tbuf.chunks[i] = column\n\t}\n}\n\n// Size returns the estimated size of the buffer in memory (in bytes).\nfunc (buf *Buffer) Size() int64 {\n\tsize := int64(0)\n\tfor _, col := range buf.columns {\n\t\tsize += col.Size()\n\t}\n\treturn size\n}\n\n// NumRows returns the number of rows written to the buffer.\nfunc (buf *Buffer) NumRows() int64 { return int64(buf.Len()) }\n\n// ColumnChunks returns the buffer columns.\nfunc (buf *Buffer) ColumnChunks() []ColumnChunk { return buf.chunks }\n\n// 
ColumnBuffer returns the buffer columns.\n//\n// This method is similar to ColumnChunks, but returns a list of ColumnBuffer\n// instead of a ColumnChunk values (the latter being read-only); calling\n// ColumnBuffers or ColumnChunks with the same index returns the same underlying\n// objects, but with different types, which removes the need for making a type\n// assertion if the program needed to write directly to the column buffers.\n// The presence of the ColumnChunks method is still required to satisfy the\n// RowGroup interface.\nfunc (buf *Buffer) ColumnBuffers() []ColumnBuffer { return buf.columns }\n\n// Schema returns the schema of the buffer.\n//\n// The schema is either configured by passing a Schema in the option list when\n// constructing the buffer, or lazily discovered when the first row is written.\nfunc (buf *Buffer) Schema() *Schema { return buf.schema }\n\n// SortingColumns returns the list of columns by which the buffer will be\n// sorted.\n//\n// The sorting order is configured by passing a SortingColumns option when\n// constructing the buffer.\nfunc (buf *Buffer) SortingColumns() []SortingColumn { return buf.config.Sorting.SortingColumns }\n\n// Len returns the number of rows written to the buffer.\nfunc (buf *Buffer) Len() int {\n\tif len(buf.columns) == 0 {\n\t\treturn 0\n\t} else {\n\t\t// All columns have the same number of rows.\n\t\treturn buf.columns[0].Len()\n\t}\n}\n\n// Less returns true if row[i] < row[j] in the buffer.\nfunc (buf *Buffer) Less(i, j int) bool {\n\tfor _, col := range buf.sorted {\n\t\tswitch {\n\t\tcase col.Less(i, j):\n\t\t\treturn true\n\t\tcase col.Less(j, i):\n\t\t\treturn false\n\t\t}\n\t}\n\treturn false\n}\n\n// Swap exchanges the rows at indexes i and j.\nfunc (buf *Buffer) Swap(i, j int) {\n\tfor _, col := range buf.columns {\n\t\tcol.Swap(i, j)\n\t}\n}\n\n// Reset clears the content of the buffer, allowing it to be reused.\nfunc (buf *Buffer) Reset() {\n\tfor _, col := range buf.columns 
{\n\t\tcol.Reset()\n\t}\n}\n\n// Write writes a row held in a Go value to the buffer.\nfunc (buf *Buffer) Write(row interface{}) error {\n\tif buf.schema == nil {\n\t\tbuf.configure(SchemaOf(row))\n\t}\n\n\tbuf.rowbuf = buf.rowbuf[:1]\n\tdefer clearRows(buf.rowbuf)\n\n\tbuf.rowbuf[0] = buf.schema.Deconstruct(buf.rowbuf[0], row)\n\t_, err := buf.WriteRows(buf.rowbuf)\n\treturn err\n}\n\n// WriteRows writes parquet rows to the buffer.\nfunc (buf *Buffer) WriteRows(rows []Row) (int, error) {\n\tdefer func() {\n\t\tfor i, colbuf := range buf.colbuf {\n\t\t\tclearValues(colbuf)\n\t\t\tbuf.colbuf[i] = colbuf[:0]\n\t\t}\n\t}()\n\n\tif buf.schema == nil {\n\t\treturn 0, ErrRowGroupSchemaMissing\n\t}\n\n\tfor _, row := range rows {\n\t\tfor _, value := range row {\n\t\t\tcolumnIndex := value.Column()\n\t\t\tbuf.colbuf[columnIndex] = append(buf.colbuf[columnIndex], value)\n\t\t}\n\t}\n\n\tfor columnIndex, values := range buf.colbuf {\n\t\tif _, err := buf.columns[columnIndex].WriteValues(values); err != nil {\n\t\t\t// TODO: an error at this stage will leave the buffer in an invalid\n\t\t\t// state since the row was partially written. 
Applications are not\n\t\t\t// expected to continue using the buffer after getting an error,\n\t\t\t// maybe we can enforce it?\n\t\t\treturn 0, err\n\t\t}\n\t}\n\n\treturn len(rows), nil\n}\n\n// WriteRowGroup satisfies the RowGroupWriter interface.\nfunc (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error) {\n\trowGroupSchema := rowGroup.Schema()\n\tswitch {\n\tcase rowGroupSchema == nil:\n\t\treturn 0, ErrRowGroupSchemaMissing\n\tcase buf.schema == nil:\n\t\tbuf.configure(rowGroupSchema)\n\tcase !nodesAreEqual(buf.schema, rowGroupSchema):\n\t\treturn 0, ErrRowGroupSchemaMismatch\n\t}\n\tif !sortingColumnsHavePrefix(rowGroup.SortingColumns(), buf.SortingColumns()) {\n\t\treturn 0, ErrRowGroupSortingColumnsMismatch\n\t}\n\tn := buf.NumRows()\n\tr := rowGroup.Rows()\n\tdefer r.Close()\n\t_, err := CopyRows(bufferWriter{buf}, r)\n\treturn buf.NumRows() - n, err\n}\n\n// Rows returns a reader exposing the current content of the buffer.\n//\n// The buffer and the returned reader share memory. 
Mutating the buffer\n// concurrently to reading rows may result in non-deterministic behavior.\nfunc (buf *Buffer) Rows() Rows { return newRowGroupRows(buf, ReadModeSync) }\n\n// bufferWriter is an adapter for Buffer which implements both RowWriter and\n// PageWriter to enable optimizations in CopyRows for types that support writing\n// rows by copying whole pages instead of calling WriteRow repeatedly.\ntype bufferWriter struct{ buf *Buffer }\n\nfunc (w bufferWriter) WriteRows(rows []Row) (int, error) {\n\treturn w.buf.WriteRows(rows)\n}\n\nfunc (w bufferWriter) WriteValues(values []Value) (int, error) {\n\treturn w.buf.columns[values[0].Column()].WriteValues(values)\n}\n\nfunc (w bufferWriter) WritePage(page Page) (int64, error) {\n\treturn CopyValues(w.buf.columns[page.Column()], page.Values())\n}\n\nvar (\n\t_ RowGroup       = (*Buffer)(nil)\n\t_ RowGroupWriter = (*Buffer)(nil)\n\t_ sort.Interface = (*Buffer)(nil)\n\n\t_ RowWriter   = (*bufferWriter)(nil)\n\t_ PageWriter  = (*bufferWriter)(nil)\n\t_ ValueWriter = (*bufferWriter)(nil)\n)\n\ntype buffer struct {\n\tdata  []byte\n\trefc  uintptr\n\tpool  *bufferPool\n\tstack []byte\n}\n\nfunc (b *buffer) refCount() int {\n\treturn int(atomic.LoadUintptr(&b.refc))\n}\n\nfunc (b *buffer) ref() {\n\tatomic.AddUintptr(&b.refc, +1)\n}\n\nfunc (b *buffer) unref() {\n\tif atomic.AddUintptr(&b.refc, ^uintptr(0)) == 0 {\n\t\tif b.pool != nil {\n\t\t\tb.pool.put(b)\n\t\t}\n\t}\n}\n\nfunc monitorBufferRelease(b *buffer) {\n\tif rc := b.refCount(); rc != 0 {\n\t\tlog.Printf(\"PARQUETGODEBUG: buffer garbage collected with non-zero reference count\\n%s\", string(b.stack))\n\t}\n}\n\ntype bufferPool struct {\n\t// Buckets are split in two groups for short and large buffers. In the short\n\t// buffer group (below 256KB), the growth rate between each bucket is 2. 
The\n\t// growth rate changes to 1.5 in the larger buffer group.\n\t//\n\t// Short buffer buckets:\n\t// ---------------------\n\t//   4K, 8K, 16K, 32K, 64K, 128K, 256K\n\t//\n\t// Large buffer buckets:\n\t// ---------------------\n\t//   364K, 546K, 819K ...\n\t//\n\tbuckets [bufferPoolBucketCount]sync.Pool\n}\n\nfunc (p *bufferPool) newBuffer(bufferSize, bucketSize int) *buffer {\n\tb := &buffer{\n\t\tdata: make([]byte, bufferSize, bucketSize),\n\t\trefc: 1,\n\t\tpool: p,\n\t}\n\tif debug.TRACEBUF > 0 {\n\t\tb.stack = make([]byte, 4096)\n\t\truntime.SetFinalizer(b, monitorBufferRelease)\n\t}\n\treturn b\n}\n\n// get returns a buffer from the levelled buffer pool. size is used to choose\n// the appropriate pool.\nfunc (p *bufferPool) get(bufferSize int) *buffer {\n\tbucketIndex, bucketSize := bufferPoolBucketIndexAndSizeOfGet(bufferSize)\n\n\tb := (*buffer)(nil)\n\tif bucketIndex >= 0 {\n\t\tb, _ = p.buckets[bucketIndex].Get().(*buffer)\n\t}\n\n\tif b == nil {\n\t\tb = p.newBuffer(bufferSize, bucketSize)\n\t} else {\n\t\tb.data = b.data[:bufferSize]\n\t\tb.ref()\n\t}\n\n\tif debug.TRACEBUF > 0 {\n\t\tb.stack = b.stack[:runtime.Stack(b.stack[:cap(b.stack)], false)]\n\t}\n\treturn b\n}\n\nfunc (p *bufferPool) put(b *buffer) {\n\tif b.pool != p {\n\t\tpanic(\"BUG: buffer returned to a different pool than the one it was allocated from\")\n\t}\n\tif b.refCount() != 0 {\n\t\tpanic(\"BUG: buffer returned to pool with a non-zero reference count\")\n\t}\n\tif bucketIndex, _ := bufferPoolBucketIndexAndSizeOfPut(cap(b.data)); bucketIndex >= 0 {\n\t\tp.buckets[bucketIndex].Put(b)\n\t}\n}\n\nconst (\n\tbufferPoolBucketCount         = 32\n\tbufferPoolMinSize             = 4096\n\tbufferPoolLastShortBucketSize = 262144\n)\n\nfunc bufferPoolNextSize(size int) int {\n\tif size < bufferPoolLastShortBucketSize {\n\t\treturn size * 2\n\t} else {\n\t\treturn size + (size / 2)\n\t}\n}\n\nfunc bufferPoolBucketIndexAndSizeOfGet(size int) (int, int) {\n\tlimit := 
bufferPoolMinSize\n\n\tfor i := 0; i < bufferPoolBucketCount; i++ {\n\t\tif size <= limit {\n\t\t\treturn i, limit\n\t\t}\n\t\tlimit = bufferPoolNextSize(limit)\n\t}\n\n\treturn -1, size\n}\n\nfunc bufferPoolBucketIndexAndSizeOfPut(size int) (int, int) {\n\t// When releasing buffers, some may have a capacity that is not one of the\n\t// bucket sizes (due to the use of append for example). In this case, we\n\t// have to put the buffer is the highest bucket with a size less or equal\n\t// to the buffer capacity.\n\tif limit := bufferPoolMinSize; size >= limit {\n\t\tfor i := 0; i < bufferPoolBucketCount; i++ {\n\t\t\tn := bufferPoolNextSize(limit)\n\t\t\tif size < n {\n\t\t\t\treturn i, limit\n\t\t\t}\n\t\t\tlimit = n\n\t\t}\n\t}\n\treturn -1, size\n}\n\nvar (\n\tbuffers bufferPool\n)\n\ntype bufferedPage struct {\n\tPage\n\tvalues           *buffer\n\toffsets          *buffer\n\trepetitionLevels *buffer\n\tdefinitionLevels *buffer\n}\n\nfunc newBufferedPage(page Page, values, offsets, definitionLevels, repetitionLevels *buffer) *bufferedPage {\n\tp := &bufferedPage{\n\t\tPage:             page,\n\t\tvalues:           values,\n\t\toffsets:          offsets,\n\t\tdefinitionLevels: definitionLevels,\n\t\trepetitionLevels: repetitionLevels,\n\t}\n\tbufferRef(values)\n\tbufferRef(offsets)\n\tbufferRef(definitionLevels)\n\tbufferRef(repetitionLevels)\n\treturn p\n}\n\nfunc (p *bufferedPage) Slice(i, j int64) Page {\n\treturn newBufferedPage(\n\t\tp.Page.Slice(i, j),\n\t\tp.values,\n\t\tp.offsets,\n\t\tp.definitionLevels,\n\t\tp.repetitionLevels,\n\t)\n}\n\nfunc (p *bufferedPage) Retain() {\n\tbufferRef(p.values)\n\tbufferRef(p.offsets)\n\tbufferRef(p.definitionLevels)\n\tbufferRef(p.repetitionLevels)\n}\n\nfunc (p *bufferedPage) Release() {\n\tbufferUnref(p.values)\n\tbufferUnref(p.offsets)\n\tbufferUnref(p.definitionLevels)\n\tbufferUnref(p.repetitionLevels)\n}\n\nfunc bufferRef(buf *buffer) {\n\tif buf != nil {\n\t\tbuf.ref()\n\t}\n}\n\nfunc bufferUnref(buf *buffer) 
{\n\tif buf != nil {\n\t\tbuf.unref()\n\t}\n}\n\n// Retain is a helper function to increment the reference counter of pages\n// backed by memory which can be granularly managed by the application.\n//\n// Usage of this function is optional and with Release, is intended to allow\n// finer grain memory management in the application. Most programs should be\n// able to rely on automated memory management provided by the Go garbage\n// collector instead.\n//\n// The function should be called when a page lifetime is about to be shared\n// between multiple goroutines or layers of an application, and the program\n// wants to express \"sharing ownership\" of the page.\n//\n// Calling this function on pages that do not embed a reference counter does\n// nothing.\nfunc Retain(page Page) {\n\tif p, _ := page.(retainable); p != nil {\n\t\tp.Retain()\n\t}\n}\n\n// Release is a helper function to decrement the reference counter of pages\n// backed by memory which can be granularly managed by the application.\n//\n// Usage of this is optional and with Retain, is intended to allow finer grained\n// memory management in the application, at the expense of potentially causing\n// panics if the page is used after its reference count has reached zero. Most\n// programs should be able to rely on automated memory management provided by\n// the Go garbage collector instead.\n//\n// The function should be called to return a page to the internal buffer pool,\n// when a goroutine \"releases ownership\" it acquired either by being the single\n// owner (e.g. 
capturing the return value from a ReadPage call) or having gotten\n// shared ownership by calling Retain.\n//\n// Calling this function on pages that do not embed a reference counter does\n// nothing.\nfunc Release(page Page) {\n\tif p, _ := page.(releasable); p != nil {\n\t\tp.Release()\n\t}\n}\n\ntype retainable interface {\n\tRetain()\n}\n\ntype releasable interface {\n\tRelease()\n}\n\nvar (\n\t_ retainable = (*bufferedPage)(nil)\n\t_ releasable = (*bufferedPage)(nil)\n)\n"
  },
  {
    "path": "buffer_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"reflect\"\n\t\"sort\"\n)\n\n// GenericBuffer is similar to a Buffer but uses a type parameter to define the\n// Go type representing the schema of rows in the buffer.\n//\n// See GenericWriter for details about the benefits over the classic Buffer API.\ntype GenericBuffer[T any] struct {\n\tbase  Buffer\n\twrite bufferFunc[T]\n}\n\n// NewGenericBuffer is like NewBuffer but returns a GenericBuffer[T] suited to write\n// rows of Go type T.\n//\n// The type parameter T should be a map, struct, or any. Any other types will\n// cause a panic at runtime. Type checking is a lot more effective when the\n// generic parameter is a struct type, using map and interface types is somewhat\n// similar to using a Writer.  If using an interface type for the type parameter,\n// then providing a schema at instantiation is required.\n//\n// If the option list may explicitly declare a schema, it must be compatible\n// with the schema generated from T.\nfunc NewGenericBuffer[T any](options ...RowGroupOption) *GenericBuffer[T] {\n\tconfig, err := NewRowGroupConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tt := typeOf[T]()\n\tif config.Schema == nil && t != nil {\n\t\tconfig.Schema = schemaOf(dereference(t))\n\t}\n\n\tif config.Schema == nil {\n\t\tpanic(\"generic buffer must be instantiated with schema or concrete type.\")\n\t}\n\n\tbuf := &GenericBuffer[T]{\n\t\tbase: Buffer{config: config},\n\t}\n\tbuf.base.configure(config.Schema)\n\tbuf.write = bufferFuncOf[T](t, config.Schema)\n\treturn buf\n}\n\nfunc typeOf[T any]() reflect.Type {\n\tvar v T\n\treturn reflect.TypeOf(v)\n}\n\ntype bufferFunc[T any] func(*GenericBuffer[T], []T) (int, error)\n\nfunc bufferFuncOf[T any](t reflect.Type, schema *Schema) bufferFunc[T] {\n\tif t == nil {\n\t\treturn (*GenericBuffer[T]).writeRows\n\t}\n\tswitch t.Kind() {\n\tcase reflect.Interface, reflect.Map:\n\t\treturn (*GenericBuffer[T]).writeRows\n\n\tcase 
reflect.Struct:\n\t\treturn makeBufferFunc[T](t, schema)\n\n\tcase reflect.Pointer:\n\t\tif e := t.Elem(); e.Kind() == reflect.Struct {\n\t\t\treturn makeBufferFunc[T](t, schema)\n\t\t}\n\t}\n\tpanic(\"cannot create buffer for values of type \" + t.String())\n}\n\nfunc makeBufferFunc[T any](t reflect.Type, schema *Schema) bufferFunc[T] {\n\twriteRows := writeRowsFuncOf(t, schema, nil)\n\treturn func(buf *GenericBuffer[T], rows []T) (n int, err error) {\n\t\terr = writeRows(buf.base.columns, makeArrayOf(rows), columnLevels{})\n\t\tif err == nil {\n\t\t\tn = len(rows)\n\t\t}\n\t\treturn n, err\n\t}\n}\n\nfunc (buf *GenericBuffer[T]) Size() int64 {\n\treturn buf.base.Size()\n}\n\nfunc (buf *GenericBuffer[T]) NumRows() int64 {\n\treturn buf.base.NumRows()\n}\n\nfunc (buf *GenericBuffer[T]) ColumnChunks() []ColumnChunk {\n\treturn buf.base.ColumnChunks()\n}\n\nfunc (buf *GenericBuffer[T]) ColumnBuffers() []ColumnBuffer {\n\treturn buf.base.ColumnBuffers()\n}\n\nfunc (buf *GenericBuffer[T]) SortingColumns() []SortingColumn {\n\treturn buf.base.SortingColumns()\n}\n\nfunc (buf *GenericBuffer[T]) Len() int {\n\treturn buf.base.Len()\n}\n\nfunc (buf *GenericBuffer[T]) Less(i, j int) bool {\n\treturn buf.base.Less(i, j)\n}\n\nfunc (buf *GenericBuffer[T]) Swap(i, j int) {\n\tbuf.base.Swap(i, j)\n}\n\nfunc (buf *GenericBuffer[T]) Reset() {\n\tbuf.base.Reset()\n}\n\nfunc (buf *GenericBuffer[T]) Write(rows []T) (int, error) {\n\tif len(rows) == 0 {\n\t\treturn 0, nil\n\t}\n\treturn buf.write(buf, rows)\n}\n\nfunc (buf *GenericBuffer[T]) WriteRows(rows []Row) (int, error) {\n\treturn buf.base.WriteRows(rows)\n}\n\nfunc (buf *GenericBuffer[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) {\n\treturn buf.base.WriteRowGroup(rowGroup)\n}\n\nfunc (buf *GenericBuffer[T]) Rows() Rows {\n\treturn buf.base.Rows()\n}\n\nfunc (buf *GenericBuffer[T]) Schema() *Schema {\n\treturn buf.base.Schema()\n}\n\nfunc (buf *GenericBuffer[T]) writeRows(rows []T) (int, error) {\n\tif 
cap(buf.base.rowbuf) < len(rows) {\n\t\tbuf.base.rowbuf = make([]Row, len(rows))\n\t} else {\n\t\tbuf.base.rowbuf = buf.base.rowbuf[:len(rows)]\n\t}\n\tdefer clearRows(buf.base.rowbuf)\n\n\tschema := buf.base.Schema()\n\tfor i := range rows {\n\t\tbuf.base.rowbuf[i] = schema.Deconstruct(buf.base.rowbuf[i], &rows[i])\n\t}\n\n\treturn buf.base.WriteRows(buf.base.rowbuf)\n}\n\nvar (\n\t_ RowGroup       = (*GenericBuffer[any])(nil)\n\t_ RowGroupWriter = (*GenericBuffer[any])(nil)\n\t_ sort.Interface = (*GenericBuffer[any])(nil)\n\n\t_ RowGroup       = (*GenericBuffer[struct{}])(nil)\n\t_ RowGroupWriter = (*GenericBuffer[struct{}])(nil)\n\t_ sort.Interface = (*GenericBuffer[struct{}])(nil)\n\n\t_ RowGroup       = (*GenericBuffer[map[struct{}]struct{}])(nil)\n\t_ RowGroupWriter = (*GenericBuffer[map[struct{}]struct{}])(nil)\n\t_ sort.Interface = (*GenericBuffer[map[struct{}]struct{}])(nil)\n)\n"
  },
  {
    "path": "buffer_go18_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"encoding/binary\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestGenericBuffer(t *testing.T) {\n\ttestGenericBuffer[booleanColumn](t)\n\ttestGenericBuffer[int32Column](t)\n\ttestGenericBuffer[int64Column](t)\n\ttestGenericBuffer[int96Column](t)\n\ttestGenericBuffer[floatColumn](t)\n\ttestGenericBuffer[doubleColumn](t)\n\ttestGenericBuffer[byteArrayColumn](t)\n\ttestGenericBuffer[fixedLenByteArrayColumn](t)\n\ttestGenericBuffer[stringColumn](t)\n\ttestGenericBuffer[indexedStringColumn](t)\n\ttestGenericBuffer[uuidColumn](t)\n\ttestGenericBuffer[timeColumn](t)\n\ttestGenericBuffer[timeInMillisColumn](t)\n\ttestGenericBuffer[mapColumn](t)\n\ttestGenericBuffer[decimalColumn](t)\n\ttestGenericBuffer[addressBook](t)\n\ttestGenericBuffer[contact](t)\n\ttestGenericBuffer[listColumn2](t)\n\ttestGenericBuffer[listColumn1](t)\n\ttestGenericBuffer[listColumn0](t)\n\ttestGenericBuffer[nestedListColumn1](t)\n\ttestGenericBuffer[nestedListColumn](t)\n\ttestGenericBuffer[*contact](t)\n\ttestGenericBuffer[paddedBooleanColumn](t)\n\ttestGenericBuffer[optionalInt32Column](t)\n\ttestGenericBuffer[repeatedInt32Column](t)\n}\n\nfunc testGenericBuffer[Row any](t *testing.T) {\n\tvar model Row\n\tt.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {\n\t\terr := quickCheck(func(rows []Row) bool {\n\t\t\tif len(rows) == 0 {\n\t\t\t\treturn true // TODO: fix support for parquet files with zero rows\n\t\t\t}\n\t\t\tif err := testGenericBufferRows(rows); err != nil {\n\t\t\t\tt.Error(err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\treturn true\n\t\t})\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t}\n\t})\n}\n\nfunc testGenericBufferRows[Row any](rows []Row) error {\n\tsetNullPointers(rows)\n\tbuffer := parquet.NewGenericBuffer[Row]()\n\t_, err := buffer.Write(rows)\n\tif err != nil {\n\t\treturn err\n\t}\n\treader := 
parquet.NewGenericRowGroupReader[Row](buffer)\n\tresult := make([]Row, len(rows))\n\tn, err := reader.Read(result)\n\tif err != nil && !errors.Is(err, io.EOF) {\n\t\treturn err\n\t}\n\tif n < len(rows) {\n\t\treturn fmt.Errorf(\"not enough values were read: want=%d got=%d\", len(rows), n)\n\t}\n\tif !reflect.DeepEqual(rows, result) {\n\t\treturn fmt.Errorf(\"rows mismatch:\\nwant: %#v\\ngot:  %#v\", rows, result)\n\t}\n\treturn nil\n}\n\nfunc setNullPointers[Row any](rows []Row) {\n\tif len(rows) > 0 && reflect.TypeOf(rows[0]).Kind() == reflect.Pointer {\n\t\tfor i := range rows {\n\t\t\tv := reflect.ValueOf(&rows[i]).Elem()\n\t\t\tif v.IsNil() {\n\t\t\t\tv.Set(reflect.New(v.Type().Elem()))\n\t\t\t}\n\t\t}\n\t}\n}\n\ntype generator[T any] interface {\n\tgenerate(*rand.Rand) T\n}\n\nfunc BenchmarkGenericBuffer(b *testing.B) {\n\tbenchmarkGenericBuffer[benchmarkRowType](b)\n\tbenchmarkGenericBuffer[booleanColumn](b)\n\tbenchmarkGenericBuffer[int32Column](b)\n\tbenchmarkGenericBuffer[int64Column](b)\n\tbenchmarkGenericBuffer[floatColumn](b)\n\tbenchmarkGenericBuffer[doubleColumn](b)\n\tbenchmarkGenericBuffer[byteArrayColumn](b)\n\tbenchmarkGenericBuffer[fixedLenByteArrayColumn](b)\n\tbenchmarkGenericBuffer[stringColumn](b)\n\tbenchmarkGenericBuffer[indexedStringColumn](b)\n\tbenchmarkGenericBuffer[uuidColumn](b)\n\tbenchmarkGenericBuffer[timeColumn](b)\n\tbenchmarkGenericBuffer[timeInMillisColumn](b)\n\tbenchmarkGenericBuffer[mapColumn](b)\n\tbenchmarkGenericBuffer[decimalColumn](b)\n\tbenchmarkGenericBuffer[contact](b)\n\tbenchmarkGenericBuffer[paddedBooleanColumn](b)\n\tbenchmarkGenericBuffer[optionalInt32Column](b)\n\tbenchmarkGenericBuffer[repeatedInt32Column](b)\n}\n\nfunc benchmarkGenericBuffer[Row generator[Row]](b *testing.B) {\n\tvar model Row\n\tb.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {\n\t\tprng := rand.New(rand.NewSource(0))\n\t\trows := make([]Row, benchmarkNumRows)\n\t\tfor i := range rows {\n\t\t\trows[i] = 
rows[i].generate(prng)\n\t\t}\n\n\t\tb.Run(\"go1.17\", func(b *testing.B) {\n\t\t\tbuffer := parquet.NewBuffer(parquet.SchemaOf(rows[0]))\n\t\t\ti := 0\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tfor j := 0; j < benchmarkRowsPerStep; j++ {\n\t\t\t\t\tif err := buffer.Write(&rows[i]); err != nil {\n\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\ti += benchmarkRowsPerStep\n\t\t\t\ti %= benchmarkNumRows\n\n\t\t\t\tif i == 0 {\n\t\t\t\t\tbuffer.Reset()\n\t\t\t\t}\n\t\t\t\treturn benchmarkRowsPerStep\n\t\t\t})\n\t\t})\n\n\t\tb.Run(\"go1.18\", func(b *testing.B) {\n\t\t\tbuffer := parquet.NewGenericBuffer[Row]()\n\t\t\ti := 0\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tn, err := buffer.Write(rows[i : i+benchmarkRowsPerStep])\n\t\t\t\tif err != nil {\n\t\t\t\t\tb.Fatal(err)\n\t\t\t\t}\n\n\t\t\t\ti += benchmarkRowsPerStep\n\t\t\t\ti %= benchmarkNumRows\n\n\t\t\t\tif i == 0 {\n\t\t\t\t\tbuffer.Reset()\n\t\t\t\t}\n\t\t\t\treturn n\n\t\t\t})\n\t\t})\n\t})\n}\n\nfunc TestIssue327(t *testing.T) {\n\tt.Run(\"untagged nested lists should panic\", func(t *testing.T) {\n\t\ttype testType struct {\n\t\t\tListOfLists [][]int\n\t\t}\n\n\t\tdefer func() {\n\t\t\tif r := recover(); r == nil {\n\t\t\t\tt.Errorf(\"Nested lists without the list tag should panic\")\n\t\t\t}\n\t\t}()\n\n\t\t_ = parquet.NewGenericBuffer[testType]()\n\t})\n}\n\nfunc TestIssue346(t *testing.T) {\n\ttype TestType struct {\n\t\tKey int\n\t}\n\n\tschema := parquet.SchemaOf(TestType{})\n\tbuffer := parquet.NewGenericBuffer[any](schema)\n\n\tdata := make([]any, 1)\n\tdata[0] = TestType{Key: 0}\n\t_, _ = buffer.Write(data)\n}\n\nfunc TestIssue347(t *testing.T) {\n\ttype TestType struct {\n\t\tKey int\n\t}\n\n\t// instantiating with concrete type shouldn't panic\n\t_ = parquet.NewGenericBuffer[TestType]()\n\n\t// instantiating with schema and interface type parameter shouldn't panic\n\tschema := parquet.SchemaOf(TestType{})\n\t_ = parquet.NewGenericBuffer[any](schema)\n\n\tdefer 
func() {\n\t\tif r := recover(); r == nil {\n\t\t\tt.Errorf(\"instantiating generic buffer without schema and with interface \" +\n\t\t\t\t\"type parameter should panic\")\n\t\t}\n\t}()\n\t_ = parquet.NewGenericBuffer[any]()\n}\n\nfunc BenchmarkSortGenericBuffer(b *testing.B) {\n\ttype Row struct {\n\t\tI0 int64\n\t\tI1 int64\n\t\tI2 int64\n\t\tI3 int64\n\t\tI4 int64\n\t\tI5 int64\n\t\tI6 int64\n\t\tI7 int64\n\t\tI8 int64\n\t\tI9 int64\n\t\tID [16]byte\n\t}\n\n\tbuf := parquet.NewGenericBuffer[Row](\n\t\tparquet.SortingRowGroupConfig(\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(\"ID\"),\n\t\t\t),\n\t\t),\n\t)\n\n\trows := make([]Row, 10e3)\n\tprng := rand.New(rand.NewSource(0))\n\n\tfor i := range rows {\n\t\tbinary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i))\n\t\tbinary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i))\n\t}\n\n\tbuf.Write(rows)\n\tb.ResetTimer()\n\n\tfor i := 0; i < b.N; i++ {\n\t\tfor j := 0; j < 10; j++ {\n\t\t\tbuf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows)))\n\t\t}\n\n\t\tsort.Sort(buf)\n\t}\n}\n"
  },
  {
    "path": "buffer_internal_test.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"testing\"\n)\n\nfunc TestBufferAlwaysCorrectSize(t *testing.T) {\n\tvar p bufferPool\n\tfor i := 0; i < 1000; i++ {\n\t\tn := rand.Intn(1024 * 1024)\n\t\tb := p.get(n)\n\t\tif len(b.data) != n {\n\t\t\tt.Fatalf(\"Expected buffer of size %d, got %d\", n, len(b.data))\n\t\t}\n\t\tb.unref()\n\t}\n}\n\nfunc TestBufferPoolBucketIndexAndSizeOf(t *testing.T) {\n\ttests := []struct {\n\t\tsize        int\n\t\tbucketIndex int\n\t\tbucketSize  int\n\t}{\n\t\t{size: 0, bucketIndex: 0, bucketSize: 4096},\n\t\t{size: 1, bucketIndex: 0, bucketSize: 4096},\n\t\t{size: 2049, bucketIndex: 0, bucketSize: 4096},\n\t\t{size: 4096, bucketIndex: 0, bucketSize: 4096},\n\t\t{size: 4097, bucketIndex: 1, bucketSize: 8192},\n\t\t{size: 8192, bucketIndex: 1, bucketSize: 8192},\n\t\t{size: 8193, bucketIndex: 2, bucketSize: 16384},\n\t\t{size: 16384, bucketIndex: 2, bucketSize: 16384},\n\t\t{size: 16385, bucketIndex: 3, bucketSize: 32768},\n\t\t{size: 32768, bucketIndex: 3, bucketSize: 32768},\n\t\t{size: 32769, bucketIndex: 4, bucketSize: 65536},\n\t\t{size: 262143, bucketIndex: 6, bucketSize: 262144},\n\t\t{size: 262144, bucketIndex: 6, bucketSize: 262144},\n\t\t{size: 262145, bucketIndex: 7, bucketSize: 393216},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(fmt.Sprintf(\"size=%d\", test.size), func(t *testing.T) {\n\t\t\tbucketIndex, bucketSize := bufferPoolBucketIndexAndSizeOfGet(test.size)\n\n\t\t\tif bucketIndex != test.bucketIndex {\n\t\t\t\tt.Errorf(\"wrong bucket index, want %d but got %d\", test.bucketIndex, bucketIndex)\n\t\t\t}\n\n\t\t\tif bucketSize != test.bucketSize {\n\t\t\t\tt.Errorf(\"wrong bucket size, want %d but got %d\", test.bucketSize, bucketSize)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "buffer_pool.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"sync\"\n)\n\n// BufferPool is an interface abstracting the underlying implementation of\n// page buffer pools.\n//\n// The parquet-go package provides two implementations of this interface, one\n// backed by in-memory buffers (on the Go heap), and the other using temporary\n// files on disk.\n//\n// Applications which need finer grain control over the allocation and retention\n// of page buffers may choose to provide their own implementation and install it\n// via the parquet.ColumnPageBuffers writer option.\n//\n// BufferPool implementations must be safe to use concurrently from multiple\n// goroutines.\ntype BufferPool interface {\n\t// GetBuffer is called when a parquet writer needs to acquire a new\n\t// page buffer from the pool.\n\tGetBuffer() io.ReadWriteSeeker\n\n\t// PutBuffer is called when a parquet writer releases a page buffer to\n\t// the pool.\n\t//\n\t// The parquet.Writer type guarantees that the buffers it calls this method\n\t// with were previously acquired by a call to GetBuffer on the same\n\t// pool, and that it will not use them anymore after the call.\n\tPutBuffer(io.ReadWriteSeeker)\n}\n\n// NewBufferPool creates a new in-memory page buffer pool.\n//\n// The implementation is backed by sync.Pool and allocates memory buffers on the\n// Go heap.\nfunc NewBufferPool() BufferPool { return new(memoryBufferPool) }\n\ntype memoryBuffer struct {\n\tdata []byte\n\toff  int\n}\n\nfunc (p *memoryBuffer) Reset() {\n\tp.data, p.off = p.data[:0], 0\n}\n\nfunc (p *memoryBuffer) Read(b []byte) (n int, err error) {\n\tn = copy(b, p.data[p.off:])\n\tp.off += n\n\tif p.off == len(p.data) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (p *memoryBuffer) Write(b []byte) (int, error) {\n\tn := copy(p.data[p.off:cap(p.data)], b)\n\tp.data = p.data[:p.off+n]\n\n\tif n < len(b) {\n\t\tp.data = append(p.data, b[n:]...)\n\t}\n\n\tp.off += len(b)\n\treturn 
len(b), nil\n}\n\nfunc (p *memoryBuffer) WriteTo(w io.Writer) (int64, error) {\n\tn, err := w.Write(p.data[p.off:])\n\tp.off += n\n\treturn int64(n), err\n}\n\nfunc (p *memoryBuffer) Seek(offset int64, whence int) (int64, error) {\n\tswitch whence {\n\tcase io.SeekCurrent:\n\t\toffset += int64(p.off)\n\tcase io.SeekEnd:\n\t\toffset += int64(len(p.data))\n\t}\n\tif offset < 0 {\n\t\treturn 0, fmt.Errorf(\"seek: negative offset: %d<0\", offset)\n\t}\n\tif offset > int64(len(p.data)) {\n\t\toffset = int64(len(p.data))\n\t}\n\tp.off = int(offset)\n\treturn offset, nil\n}\n\ntype memoryBufferPool struct{ sync.Pool }\n\nfunc (pool *memoryBufferPool) GetBuffer() io.ReadWriteSeeker {\n\tb, _ := pool.Get().(*memoryBuffer)\n\tif b == nil {\n\t\tb = new(memoryBuffer)\n\t} else {\n\t\tb.Reset()\n\t}\n\treturn b\n}\n\nfunc (pool *memoryBufferPool) PutBuffer(buf io.ReadWriteSeeker) {\n\tif b, _ := buf.(*memoryBuffer); b != nil {\n\t\tpool.Put(b)\n\t}\n}\n\ntype fileBufferPool struct {\n\terr     error\n\ttempdir string\n\tpattern string\n}\n\n// NewFileBufferPool creates a new on-disk page buffer pool.\nfunc NewFileBufferPool(tempdir, pattern string) BufferPool {\n\tpool := &fileBufferPool{\n\t\ttempdir: tempdir,\n\t\tpattern: pattern,\n\t}\n\tpool.tempdir, pool.err = filepath.Abs(pool.tempdir)\n\treturn pool\n}\n\nfunc (pool *fileBufferPool) GetBuffer() io.ReadWriteSeeker {\n\tif pool.err != nil {\n\t\treturn &errorBuffer{err: pool.err}\n\t}\n\tf, err := os.CreateTemp(pool.tempdir, pool.pattern)\n\tif err != nil {\n\t\treturn &errorBuffer{err: err}\n\t}\n\treturn f\n}\n\nfunc (pool *fileBufferPool) PutBuffer(buf io.ReadWriteSeeker) {\n\tif f, _ := buf.(*os.File); f != nil {\n\t\tdefer f.Close()\n\t\tos.Remove(f.Name())\n\t}\n}\n\ntype errorBuffer struct{ err error }\n\nfunc (buf *errorBuffer) Read([]byte) (int, error)          { return 0, buf.err }\nfunc (buf *errorBuffer) Write([]byte) (int, error)         { return 0, buf.err }\nfunc (buf *errorBuffer) ReadFrom(io.Reader) 
(int64, error) { return 0, buf.err }\nfunc (buf *errorBuffer) WriteTo(io.Writer) (int64, error)  { return 0, buf.err }\nfunc (buf *errorBuffer) Seek(int64, int) (int64, error)    { return 0, buf.err }\n\nvar (\n\tdefaultColumnBufferPool  memoryBufferPool\n\tdefaultSortingBufferPool memoryBufferPool\n\n\t_ io.ReaderFrom = (*errorBuffer)(nil)\n\t_ io.WriterTo   = (*errorBuffer)(nil)\n\t_ io.WriterTo   = (*memoryBuffer)(nil)\n)\n\ntype readerAt struct {\n\treader io.ReadSeeker\n\toffset int64\n}\n\nfunc (r *readerAt) ReadAt(b []byte, off int64) (int, error) {\n\tif r.offset < 0 || off != r.offset {\n\t\toff, err := r.reader.Seek(off, io.SeekStart)\n\t\tif err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\tr.offset = off\n\t}\n\tn, err := r.reader.Read(b)\n\tr.offset += int64(n)\n\treturn n, err\n}\n\nfunc newReaderAt(r io.ReadSeeker) io.ReaderAt {\n\tif rr, ok := r.(io.ReaderAt); ok {\n\t\treturn rr\n\t}\n\treturn &readerAt{reader: r, offset: -1}\n}\n"
  },
  {
    "path": "buffer_pool_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"strings\"\n\t\"testing\"\n\t\"testing/iotest\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestBufferPool(t *testing.T) {\n\ttestBufferPool(t, parquet.NewBufferPool())\n}\n\nfunc TestFileBufferPool(t *testing.T) {\n\ttestBufferPool(t, parquet.NewFileBufferPool(\"/tmp\", \"buffers.*\"))\n}\n\nfunc testBufferPool(t *testing.T, pool parquet.BufferPool) {\n\ttests := []struct {\n\t\tscenario string\n\t\tfunction func(*testing.T, parquet.BufferPool)\n\t}{\n\t\t{\n\t\t\tscenario: \"write bytes\",\n\t\t\tfunction: testBufferPoolWriteBytes,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"write string\",\n\t\t\tfunction: testBufferPoolWriteString,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"copy to buffer\",\n\t\t\tfunction: testBufferPoolCopyToBuffer,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"copy from buffer\",\n\t\t\tfunction: testBufferPoolCopyFromBuffer,\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(test.scenario, func(t *testing.T) { test.function(t, pool) })\n\t}\n}\n\nfunc testBufferPoolWriteBytes(t *testing.T, pool parquet.BufferPool) {\n\tconst content = \"Hello World!\"\n\n\tbuffer := pool.GetBuffer()\n\tdefer pool.PutBuffer(buffer)\n\n\t_, err := buffer.Write([]byte(content))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tassertBufferContent(t, buffer, content)\n}\n\nfunc testBufferPoolWriteString(t *testing.T, pool parquet.BufferPool) {\n\tconst content = \"Hello World!\"\n\n\tbuffer := pool.GetBuffer()\n\tdefer pool.PutBuffer(buffer)\n\n\t_, err := io.WriteString(buffer, content)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertBufferContent(t, buffer, content)\n}\n\nfunc testBufferPoolCopyToBuffer(t *testing.T, pool parquet.BufferPool) {\n\tconst content = \"ABC\"\n\n\tbuffer := pool.GetBuffer()\n\tdefer pool.PutBuffer(buffer)\n\n\treader := strings.NewReader(content)\n\t_, err := io.Copy(buffer, struct{ io.Reader }{reader})\n\tif err != nil 
{\n\t\tt.Fatal(err)\n\t}\n\n\tassertBufferContent(t, buffer, content)\n}\n\nfunc testBufferPoolCopyFromBuffer(t *testing.T, pool parquet.BufferPool) {\n\tconst content = \"0123456789\"\n\n\tbuffer := pool.GetBuffer()\n\tdefer pool.PutBuffer(buffer)\n\n\tif _, err := io.WriteString(buffer, content); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif _, err := buffer.Seek(0, io.SeekStart); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\twriter := new(bytes.Buffer)\n\t_, err := io.Copy(struct{ io.Writer }{writer}, buffer)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertBufferContent(t, bytes.NewReader(writer.Bytes()), content)\n}\n\nfunc assertBufferContent(t *testing.T, b io.ReadSeeker, s string) {\n\tt.Helper()\n\n\toffset, err := b.Seek(0, io.SeekStart)\n\tif err != nil {\n\t\tt.Error(\"seek:\", err)\n\t}\n\tif offset != 0 {\n\t\tt.Errorf(\"seek: invalid offset returned: want=0 got=%d\", offset)\n\t}\n\tif err := iotest.TestReader(b, []byte(s)); err != nil {\n\t\tt.Error(\"iotest:\", err)\n\t}\n}\n"
  },
  {
    "path": "buffer_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"errors\"\n\t\"io\"\n\t\"math\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"sort\"\n\t\"strconv\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n)\n\nvar bufferTests = [...]struct {\n\tscenario string\n\ttyp      parquet.Type\n\tvalues   [][]interface{}\n}{\n\t{\n\t\tscenario: \"boolean\",\n\t\ttyp:      parquet.BooleanType,\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{false},\n\t\t\t{true},\n\t\t\t{\n\t\t\t\tfalse, true, false, false, true, true,\n\t\t\t\tfalse, false, false, true, false, true,\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"int32\",\n\t\ttyp:      parquet.Int32Type,\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{int32(0)},\n\t\t\t{int32(1)},\n\t\t\t{\n\t\t\t\tint32(1), int32(2), int32(3), int32(4), int32(5), int32(6),\n\t\t\t\tint32(math.MaxInt8), int32(math.MaxInt16), int32(math.MaxInt32),\n\t\t\t\tint32(7), int32(9), int32(9), int32(0),\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"int64\",\n\t\ttyp:      parquet.Int64Type,\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{int64(0)},\n\t\t\t{int64(1)},\n\t\t\t{\n\t\t\t\tint64(1), int64(2), int64(3), int64(4), int64(5), int64(6),\n\t\t\t\tint64(math.MaxInt8), int64(math.MaxInt16), int64(math.MaxInt64), int64(7),\n\t\t\t\tint64(9), int64(9), int64(0),\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"float\",\n\t\ttyp:      parquet.FloatType,\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{float32(0)},\n\t\t\t{float32(1)},\n\t\t\t{\n\t\t\t\tfloat32(1), float32(2), float32(3), float32(4), float32(5), float32(6),\n\t\t\t\tfloat32(0.5), float32(math.SmallestNonzeroFloat32), float32(math.MaxFloat32), float32(7),\n\t\t\t\tfloat32(9), float32(9), float32(0),\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"double\",\n\t\ttyp:      parquet.DoubleType,\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{float64(0)},\n\t\t\t{float64(1)},\n\t\t\t{\n\t\t\t\tfloat64(1), float64(2), 
float64(3), float64(4), float64(5), float64(6),\n\t\t\t\tfloat64(0.5), float64(math.SmallestNonzeroFloat64), float64(math.MaxFloat64), float64(7),\n\t\t\t\tfloat64(9), float64(9), float64(0),\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"string\",\n\t\ttyp:      parquet.ByteArrayType,\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{\"\"},\n\t\t\t{\"Hello World!\"},\n\t\t\t{\n\t\t\t\t\"ABCDEFG\", \"HIJKLMN\", \"OPQRSTU\", \"VWXZY01\", \"2345678\",\n\t\t\t\t\"90!@#$%\", \"^&*()_+\", \"Hello World!\", \"Answer=42\", \"ABCEDFG\",\n\t\t\t\t\"HIJKLMN\", \"OPQRSTU\", \"VWXYZ\",\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"fixed length byte array\",\n\t\ttyp:      parquet.FixedLenByteArrayType(10),\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{[10]byte{}},\n\t\t\t{[10]byte{0: 1}},\n\t\t\t{\n\t\t\t\t[10]byte{0: 0}, [10]byte{0: 2}, [10]byte{0: 1}, [10]byte{0: 4}, [10]byte{0: 3},\n\t\t\t\t[10]byte{0: 6}, [10]byte{0: 5}, [10]byte{0: 8}, [10]byte{0: 7}, [10]byte{0: 10},\n\t\t\t\t[10]byte{0: 11}, [10]byte{0: 12}, [10]byte{9: 0xFF},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"uuid\",\n\t\ttyp:      parquet.UUID().Type(),\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{[16]byte{}},\n\t\t\t{[16]byte{0: 1}},\n\t\t\t{\n\t\t\t\t[16]byte{0: 0}, [16]byte{0: 2}, [16]byte{0: 1}, [16]byte{0: 4}, [16]byte{0: 3},\n\t\t\t\t[16]byte{0: 6}, [16]byte{0: 5}, [16]byte{0: 8}, [16]byte{0: 7}, [16]byte{0: 10},\n\t\t\t\t[16]byte{0: 11}, [16]byte{0: 12}, [16]byte{15: 0xFF},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"uint32\",\n\t\ttyp:      parquet.Uint(32).Type(),\n\t\tvalues: [][]interface{}{\n\t\t\t{},\n\t\t\t{uint32(0)},\n\t\t\t{uint32(1)},\n\t\t\t{\n\t\t\t\tuint32(1), uint32(2), uint32(3), uint32(4), uint32(5), uint32(6),\n\t\t\t\tuint32(math.MaxInt8), uint32(math.MaxInt16), uint32(math.MaxUint32), uint32(7),\n\t\t\t\tuint32(9), uint32(9), uint32(0),\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"uint64\",\n\t\ttyp:      parquet.Uint(64).Type(),\n\t\tvalues: 
[][]interface{}{\n\t\t\t{},\n\t\t\t{uint64(0)},\n\t\t\t{uint64(1)},\n\t\t\t{\n\t\t\t\tuint64(1), uint64(2), uint64(3), uint64(4), uint64(5), uint64(6),\n\t\t\t\tuint64(math.MaxInt8), uint64(math.MaxInt16), uint64(math.MaxUint64),\n\t\t\t\tuint64(7), uint64(9), uint64(9), uint64(0),\n\t\t\t},\n\t\t},\n\t},\n}\n\nfunc TestBuffer(t *testing.T) {\n\tfor _, test := range bufferTests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tfor _, config := range [...]struct {\n\t\t\t\tscenario string\n\t\t\t\ttyp      parquet.Type\n\t\t\t}{\n\t\t\t\t{scenario: \"plain\", typ: test.typ},\n\t\t\t\t{scenario: \"indexed\", typ: test.typ.NewDictionary(0, 0, test.typ.NewValues(nil, nil)).Type()},\n\t\t\t} {\n\t\t\t\tt.Run(config.scenario, func(t *testing.T) {\n\t\t\t\t\tfor _, mod := range [...]struct {\n\t\t\t\t\t\tscenario string\n\t\t\t\t\t\tfunction func(parquet.Node) parquet.Node\n\t\t\t\t\t}{\n\t\t\t\t\t\t{scenario: \"optional\", function: parquet.Optional},\n\t\t\t\t\t\t{scenario: \"repeated\", function: parquet.Repeated},\n\t\t\t\t\t\t{scenario: \"required\", function: parquet.Required},\n\t\t\t\t\t} {\n\t\t\t\t\t\tt.Run(mod.scenario, func(t *testing.T) {\n\t\t\t\t\t\t\tfor _, ordering := range [...]struct {\n\t\t\t\t\t\t\t\tscenario string\n\t\t\t\t\t\t\t\tsorting  parquet.SortingColumn\n\t\t\t\t\t\t\t\tsortFunc func(parquet.Type, []parquet.Value)\n\t\t\t\t\t\t\t}{\n\t\t\t\t\t\t\t\t{scenario: \"unordered\", sorting: nil, sortFunc: unordered},\n\t\t\t\t\t\t\t\t{scenario: \"ascending\", sorting: parquet.Ascending(\"data\"), sortFunc: ascending},\n\t\t\t\t\t\t\t\t{scenario: \"descending\", sorting: parquet.Descending(\"data\"), sortFunc: descending},\n\t\t\t\t\t\t\t} {\n\t\t\t\t\t\t\t\tt.Run(ordering.scenario, func(t *testing.T) {\n\t\t\t\t\t\t\t\t\tschema := parquet.NewSchema(\"test\", parquet.Group{\n\t\t\t\t\t\t\t\t\t\t\"data\": mod.function(parquet.Leaf(config.typ)),\n\t\t\t\t\t\t\t\t\t})\n\n\t\t\t\t\t\t\t\t\toptions := 
[]parquet.RowGroupOption{\n\t\t\t\t\t\t\t\t\t\tschema,\n\t\t\t\t\t\t\t\t\t\tparquet.ColumnBufferCapacity(100),\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t\tif ordering.sorting != nil {\n\t\t\t\t\t\t\t\t\t\toptions = append(options,\n\t\t\t\t\t\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\t\t\t\t\t\tparquet.SortingColumns(ordering.sorting),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\t\t\tcontent := new(bytes.Buffer)\n\t\t\t\t\t\t\t\t\tbuffer := parquet.NewBuffer(options...)\n\n\t\t\t\t\t\t\t\t\tfor _, values := range test.values {\n\t\t\t\t\t\t\t\t\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\t\t\t\t\t\t\t\t\tdefer content.Reset()\n\t\t\t\t\t\t\t\t\t\t\tdefer buffer.Reset()\n\t\t\t\t\t\t\t\t\t\t\tfields := schema.Fields()\n\t\t\t\t\t\t\t\t\t\t\ttestBuffer(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc)\n\t\t\t\t\t\t\t\t\t\t})\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t})\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t})\n\t\t\t\t\t}\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n\ntype sortFunc func(parquet.Type, []parquet.Value)\n\nfunc unordered(typ parquet.Type, values []parquet.Value) {}\n\nfunc ascending(typ parquet.Type, values []parquet.Value) {\n\tsort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) < 0 })\n}\n\nfunc descending(typ parquet.Type, values []parquet.Value) {\n\tsort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) > 0 })\n}\n\nfunc testBuffer(t *testing.T, node parquet.Node, buffer *parquet.Buffer, encoding encoding.Encoding, values []interface{}, sortFunc sortFunc) {\n\trepetitionLevel := 0\n\tdefinitionLevel := 0\n\tif !node.Required() {\n\t\tdefinitionLevel = 1\n\t}\n\n\tminValue := parquet.Value{}\n\tmaxValue := parquet.Value{}\n\tbatch := make([]parquet.Value, len(values))\n\tfor i := range values {\n\t\tbatch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0)\n\t}\n\n\tfor i := range batch {\n\t\t_, err := 
buffer.WriteRows([]parquet.Row{batch[i : i+1]})\n\t\tif err != nil {\n\t\t\tt.Fatalf(\"writing value to row group: %v\", err)\n\t\t}\n\t}\n\n\tnumRows := buffer.NumRows()\n\tif numRows != int64(len(batch)) {\n\t\tt.Fatalf(\"number of rows mismatch: want=%d got=%d\", len(batch), numRows)\n\t}\n\n\ttyp := node.Type()\n\tfor _, value := range batch {\n\t\tif minValue.IsNull() || typ.Compare(value, minValue) < 0 {\n\t\t\tminValue = value\n\t\t}\n\t\tif maxValue.IsNull() || typ.Compare(value, maxValue) > 0 {\n\t\t\tmaxValue = value\n\t\t}\n\t}\n\n\tsortFunc(typ, batch)\n\tsort.Sort(buffer)\n\n\tpage := buffer.ColumnBuffers()[0].Page()\n\tnumValues := page.NumValues()\n\tif numValues != int64(len(batch)) {\n\t\tt.Fatalf(\"number of values mistmatch: want=%d got=%d\", len(batch), numValues)\n\t}\n\n\tnumNulls := page.NumNulls()\n\tif numNulls != 0 {\n\t\tt.Fatalf(\"number of nulls mismatch: want=0 got=%d\", numNulls)\n\t}\n\n\tmin, max, hasBounds := page.Bounds()\n\tif !hasBounds && numRows > 0 {\n\t\tt.Fatal(\"page bounds are missing\")\n\t}\n\tif !parquet.Equal(min, minValue) {\n\t\tt.Fatalf(\"min value mismatch: want=%v got=%v\", minValue, min)\n\t}\n\tif !parquet.Equal(max, maxValue) {\n\t\tt.Fatalf(\"max value mismatch: want=%v got=%v\", maxValue, max)\n\t}\n\n\t// We write a single value per row, so num values = num rows for all pages\n\t// including repeated ones, which makes it OK to slice the pages using the\n\t// number of values as a proxy for the row indexes.\n\thalfValues := numValues / 2\n\n\tfor _, test := range [...]struct {\n\t\tscenario string\n\t\tvalues   []parquet.Value\n\t\treader   parquet.ValueReader\n\t}{\n\t\t{\"page\", batch, page.Values()},\n\t\t{\"head\", batch[:halfValues], page.Slice(0, halfValues).Values()},\n\t\t{\"tail\", batch[halfValues:], page.Slice(halfValues, numValues).Values()},\n\t} {\n\t\tv := [1]parquet.Value{}\n\t\ti := 0\n\n\t\tfor {\n\t\t\tn, err := test.reader.ReadValues(v[:])\n\t\t\tif n > 0 {\n\t\t\t\tif n != 1 
{\n\t\t\t\t\tt.Fatalf(\"reading value from %q reader returned the wrong count: want=1 got=%d\", test.scenario, n)\n\t\t\t\t}\n\t\t\t\tif i < len(test.values) {\n\t\t\t\t\tif !parquet.Equal(v[0], test.values[i]) {\n\t\t\t\t\t\tt.Fatalf(\"%q value at index %d mismatches: want=%v got=%v\", test.scenario, i, test.values[i], v[0])\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\ti++\n\t\t\t}\n\t\t\tif err != nil {\n\t\t\t\tif err == io.EOF {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t\tt.Fatalf(\"reading value from %q reader: %v\", test.scenario, err)\n\t\t\t}\n\t\t}\n\n\t\tif i != len(test.values) {\n\t\t\tt.Errorf(\"wrong number of values read from %q reader: want=%d got=%d\", test.scenario, len(test.values), i)\n\t\t}\n\t}\n}\n\nfunc TestBufferGenerateBloomFilters(t *testing.T) {\n\ttype Point3D struct {\n\t\tX float64\n\t\tY float64\n\t\tZ float64\n\t}\n\n\tf := func(rows []Point3D) bool {\n\t\tif len(rows) == 0 { // TODO: support writing files with no rows\n\t\t\treturn true\n\t\t}\n\n\t\toutput := new(bytes.Buffer)\n\t\tbuffer := parquet.NewBuffer()\n\t\twriter := parquet.NewWriter(output,\n\t\t\tparquet.BloomFilters(\n\t\t\t\tparquet.SplitBlockFilter(10, \"X\"),\n\t\t\t\tparquet.SplitBlockFilter(10, \"Y\"),\n\t\t\t\tparquet.SplitBlockFilter(10, \"Z\"),\n\t\t\t),\n\t\t)\n\t\tfor i := range rows {\n\t\t\tbuffer.Write(&rows[i])\n\t\t}\n\t\t_, err := copyRowsAndClose(writer, buffer.Rows())\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\tif err := writer.Close(); err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\n\t\treader := bytes.NewReader(output.Bytes())\n\t\tf, err := parquet.OpenFile(reader, reader.Size())\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\trowGroup := f.RowGroups()[0]\n\t\tcolumns := rowGroup.ColumnChunks()\n\t\tx := columns[0]\n\t\ty := columns[1]\n\t\tz := columns[2]\n\n\t\tfor i, col := range []parquet.ColumnChunk{x, y, z} {\n\t\t\tif col.BloomFilter() == nil {\n\t\t\t\tt.Errorf(\"column %d has no bloom 
filter despite being configured to have one\", i)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\n\t\tfx := x.BloomFilter()\n\t\tfy := y.BloomFilter()\n\t\tfz := z.BloomFilter()\n\n\t\ttest := func(f parquet.BloomFilter, v float64) bool {\n\t\t\tif ok, err := f.Check(parquet.ValueOf(v)); err != nil {\n\t\t\t\tt.Errorf(\"unexpected error checking bloom filter: %v\", err)\n\t\t\t\treturn false\n\t\t\t} else if !ok {\n\t\t\t\tt.Errorf(\"bloom filter does not contain value %g\", v)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\treturn true\n\t\t}\n\n\t\tfor _, row := range rows {\n\t\t\tif !test(fx, row.X) || !test(fy, row.Y) || !test(fz, row.Z) {\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\n\t\treturn true\n\t}\n\n\tif err := quickCheck(f); err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBufferRoundtripNestedRepeated(t *testing.T) {\n\ttype C struct {\n\t\tD int\n\t}\n\ttype B struct {\n\t\tC []C\n\t}\n\ttype A struct {\n\t\tB []B\n\t}\n\n\t// Write enough objects to exceed first page\n\tbuffer := parquet.NewBuffer()\n\tvar objs []A\n\tfor i := 0; i < 6; i++ {\n\t\to := A{[]B{{[]C{\n\t\t\t{i},\n\t\t\t{i},\n\t\t}}}}\n\t\tbuffer.Write(&o)\n\t\tobjs = append(objs, o)\n\t}\n\n\tbuf := new(bytes.Buffer)\n\tw := parquet.NewWriter(buf, parquet.PageBufferSize(100))\n\tw.WriteRowGroup(buffer)\n\tw.Flush()\n\tw.Close()\n\n\tfile := bytes.NewReader(buf.Bytes())\n\tr := parquet.NewReader(file)\n\tfor i := 0; ; i++ {\n\t\to := new(A)\n\t\terr := r.Read(o)\n\t\tif errors.Is(err, io.EOF) {\n\t\t\tif i < len(objs) {\n\t\t\t\tt.Errorf(\"too few rows were read: %d<%d\", i, len(objs))\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\t\tif !reflect.DeepEqual(*o, objs[i]) {\n\t\t\tt.Errorf(\"points mismatch at row index %d: want=%v got=%v\", i, objs[i], o)\n\t\t}\n\t}\n}\n\nfunc TestBufferRoundtripNestedRepeatedPointer(t *testing.T) {\n\ttype C struct {\n\t\tD *int\n\t}\n\ttype B struct {\n\t\tC []C\n\t}\n\ttype A struct {\n\t\tB []B\n\t}\n\n\t// Write enough objects to exceed first page\n\tbuffer := 
parquet.NewBuffer()\n\tvar objs []A\n\tfor i := 0; i < 6; i++ {\n\t\tj := i\n\t\to := A{[]B{{[]C{\n\t\t\t{&j},\n\t\t\t{nil},\n\t\t}}}}\n\t\tbuffer.Write(&o)\n\t\tobjs = append(objs, o)\n\t}\n\n\tbuf := new(bytes.Buffer)\n\tw := parquet.NewWriter(buf, parquet.PageBufferSize(100))\n\tw.WriteRowGroup(buffer)\n\tw.Flush()\n\tw.Close()\n\n\tfile := bytes.NewReader(buf.Bytes())\n\tr := parquet.NewReader(file)\n\tfor i := 0; ; i++ {\n\t\to := new(A)\n\t\terr := r.Read(o)\n\t\tif err == io.EOF {\n\t\t\tbreak\n\t\t}\n\t\tif !reflect.DeepEqual(*o, objs[i]) {\n\t\t\tt.Errorf(\"points mismatch at row index %d: want=%v got=%v\", i, objs[i], o)\n\t\t}\n\t}\n}\n\nfunc TestRoundtripNestedRepeatedBytes(t *testing.T) {\n\ttype B struct {\n\t\tC []byte\n\t}\n\ttype A struct {\n\t\tA string\n\t\tB []B\n\t}\n\n\tvar objs []A\n\tfor i := 0; i < 2; i++ {\n\t\to := A{\n\t\t\t\"test\" + strconv.Itoa(i),\n\t\t\t[]B{\n\t\t\t\t{[]byte{byte(i)}},\n\t\t\t},\n\t\t}\n\t\tobjs = append(objs, o)\n\t}\n\n\tbuf := new(bytes.Buffer)\n\tw := parquet.NewWriter(buf, parquet.PageBufferSize(100))\n\tfor _, o := range objs {\n\t\tw.Write(&o)\n\t}\n\tw.Close()\n\n\tfile := bytes.NewReader(buf.Bytes())\n\n\tr := parquet.NewReader(file)\n\tfor i := 0; ; i++ {\n\t\to := new(A)\n\t\terr := r.Read(o)\n\t\tif errors.Is(err, io.EOF) {\n\t\t\tif i < len(objs) {\n\t\t\t\tt.Errorf(\"too few rows were read: %d<%d\", i, len(objs))\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\t\tif !reflect.DeepEqual(*o, objs[i]) {\n\t\t\tt.Errorf(\"points mismatch at row index %d: want=%v got=%v\", i, objs[i], o)\n\t\t}\n\t}\n}\n\nfunc TestBufferSeekToRow(t *testing.T) {\n\ttype B struct {\n\t\tI int\n\t\tC []string\n\t}\n\ttype A struct {\n\t\tB []B\n\t}\n\n\tbuffer := parquet.NewBuffer()\n\tvar objs []A\n\tfor i := 0; i < 2; i++ {\n\t\to := A{\n\t\t\tB: []B{\n\t\t\t\t{I: i, C: []string{\"foo\", strconv.Itoa(i)}},\n\t\t\t\t{I: i + 1, C: []string{\"bar\", strconv.Itoa(i + 1)}},\n\t\t\t},\n\t\t}\n\t\tbuffer.Write(&o)\n\t\tobjs = append(objs, 
o)\n\t}\n\n\tbuf := new(bytes.Buffer)\n\tw := parquet.NewWriter(buf)\n\tw.WriteRowGroup(buffer)\n\tw.Flush()\n\tw.Close()\n\n\tfile := bytes.NewReader(buf.Bytes())\n\tr := parquet.NewReader(file)\n\n\ti := 1\n\to := new(A)\n\tif err := r.SeekToRow(int64(i)); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif err := r.Read(o); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif !reflect.DeepEqual(*o, objs[i]) {\n\t\tt.Errorf(\"points mismatch at row index %d: want=%v got=%v\", i, objs[i], o)\n\t}\n}\n\ntype TestStruct struct {\n\tA *string `parquet:\"a,optional,dict\"`\n}\n\nfunc TestOptionalDictWriteRowGroup(t *testing.T) {\n\ts := parquet.SchemaOf(&TestStruct{})\n\n\tstr1 := \"test1\"\n\tstr2 := \"test2\"\n\trecords := []*TestStruct{\n\t\t{A: nil},\n\t\t{A: &str1},\n\t\t{A: nil},\n\t\t{A: &str2},\n\t\t{A: nil},\n\t}\n\n\tbuf := parquet.NewBuffer(s)\n\tfor _, rec := range records {\n\t\trow := s.Deconstruct(nil, rec)\n\t\t_, err := buf.WriteRows([]parquet.Row{row})\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t}\n\t}\n\n\tb := bytes.NewBuffer(nil)\n\tw := parquet.NewWriter(b)\n\t_, err := w.WriteRowGroup(buf)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n}\n\nfunc TestNullsSortFirst(t *testing.T) {\n\ts := parquet.SchemaOf(&TestStruct{})\n\n\tstr1 := \"test1\"\n\tstr2 := \"test2\"\n\trecords := []*TestStruct{\n\t\t{A: &str1},\n\t\t{A: nil},\n\t\t{A: &str2},\n\t}\n\tbuf := parquet.NewBuffer(\n\t\ts,\n\t\tparquet.SortingRowGroupConfig(parquet.SortingColumns(parquet.NullsFirst(parquet.Ascending(s.Columns()[0][0])))),\n\t)\n\tfor _, rec := range records {\n\t\trow := s.Deconstruct(nil, rec)\n\t\t_, err := buf.WriteRows([]parquet.Row{row})\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t}\n\t}\n\n\tsort.Sort(buf)\n\n\trows := buf.Rows()\n\tdefer rows.Close()\n\trowBuf := make([]parquet.Row, len(records))\n\tif _, err := rows.ReadRows(rowBuf); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tresultRecords := make([]TestStruct, len(records))\n\tfor i, r := range rowBuf {\n\t\tif err := 
s.Reconstruct(&resultRecords[i], r); err != nil {\n\t\t\tt.Fatal(err)\n\t\t}\n\t}\n\n\tif resultRecords[0].A != nil {\n\t\tt.Fatal(\"expected null to sort first, but found\", resultRecords)\n\t}\n}\n\nfunc generateBenchmarkBufferRows(n int) (*parquet.Schema, []parquet.Row) {\n\tmodel := new(benchmarkRowType)\n\tschema := parquet.SchemaOf(model)\n\tprng := rand.New(rand.NewSource(0))\n\trows := make([]parquet.Row, n)\n\n\tfor i := range rows {\n\t\tio.ReadFull(prng, model.ID[:])\n\t\tmodel.Value = prng.Float64()\n\t\trows[i] = make(parquet.Row, 0, 2)\n\t\trows[i] = schema.Deconstruct(rows[i], model)\n\t}\n\n\treturn schema, rows\n}\n\nfunc BenchmarkBufferReadRows100x(b *testing.B) {\n\tschema, rows := generateBenchmarkBufferRows(benchmarkNumRows)\n\tbuffer := parquet.NewBuffer(schema)\n\n\tfor i := 0; i < len(rows); i += benchmarkRowsPerStep {\n\t\tj := i + benchmarkRowsPerStep\n\t\tif _, err := buffer.WriteRows(rows[i:j]); err != nil {\n\t\t\tb.Fatal(err)\n\t\t}\n\t}\n\n\tbufferRows := buffer.Rows()\n\tdefer bufferRows.Close()\n\n\tbenchmarkRowsPerSecond(b, func() int {\n\t\tn, err := bufferRows.ReadRows(rows[:benchmarkRowsPerStep])\n\t\tif err != nil {\n\t\t\tif errors.Is(err, io.EOF) {\n\t\t\t\terr = bufferRows.SeekToRow(0)\n\t\t\t}\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t}\n\t\t}\n\t\treturn n\n\t})\n}\n\nfunc BenchmarkBufferWriteRows100x(b *testing.B) {\n\tschema, rows := generateBenchmarkBufferRows(benchmarkNumRows)\n\tbuffer := parquet.NewBuffer(schema)\n\n\ti := 0\n\tbenchmarkRowsPerSecond(b, func() int {\n\t\tn, err := buffer.WriteRows(rows[i : i+benchmarkRowsPerStep])\n\t\tif err != nil {\n\t\t\tb.Fatal(err)\n\t\t}\n\n\t\ti += benchmarkRowsPerStep\n\t\ti %= benchmarkNumRows\n\n\t\tif i == 0 {\n\t\t\tbuffer.Reset()\n\t\t}\n\t\treturn n\n\t})\n}\n"
  },
  {
    "path": "column.go",
    "content": "package parquet\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"io\"\n\t\"reflect\"\n\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\n// Column represents a column in a parquet file.\n//\n// Methods of Column values are safe to call concurrently from multiple\n// goroutines.\n//\n// Column instances satisfy the Node interface.\ntype Column struct {\n\ttyp         Type\n\tfile        *File\n\tschema      *format.SchemaElement\n\torder       *format.ColumnOrder\n\tpath        columnPath\n\tcolumns     []*Column\n\tchunks      []*format.ColumnChunk\n\tcolumnIndex []*format.ColumnIndex\n\toffsetIndex []*format.OffsetIndex\n\tencoding    encoding.Encoding\n\tcompression compress.Codec\n\n\tdepth              int8\n\tmaxRepetitionLevel byte\n\tmaxDefinitionLevel byte\n\tindex              int16\n}\n\n// Type returns the type of the column.\n//\n// The returned value is unspecified if c is not a leaf column.\nfunc (c *Column) Type() Type { return c.typ }\n\n// Optional returns true if the column is optional.\nfunc (c *Column) Optional() bool { return schemaRepetitionTypeOf(c.schema) == format.Optional }\n\n// Repeated returns true if the column may repeat.\nfunc (c *Column) Repeated() bool { return schemaRepetitionTypeOf(c.schema) == format.Repeated }\n\n// Required returns true if the column is required.\nfunc (c *Column) Required() bool { return schemaRepetitionTypeOf(c.schema) == format.Required }\n\n// Leaf returns true if c is a leaf column.\nfunc (c *Column) Leaf() bool { return c.index >= 0 }\n\n// Fields returns the list of fields on the column.\nfunc (c *Column) Fields() []Field {\n\tfields := make([]Field, len(c.columns))\n\tfor i, column := range c.columns {\n\t\tfields[i] = column\n\t}\n\treturn fields\n}\n\n// Encoding returns the 
encodings used by this column.\nfunc (c *Column) Encoding() encoding.Encoding { return c.encoding }\n\n// Compression returns the compression codecs used by this column.\nfunc (c *Column) Compression() compress.Codec { return c.compression }\n\n// Path of the column in the parquet schema.\nfunc (c *Column) Path() []string { return c.path[1:] }\n\n// Name returns the column name.\nfunc (c *Column) Name() string { return c.schema.Name }\n\n// Columns returns the list of child columns.\n//\n// The method returns the same slice across multiple calls, the program must\n// treat it as a read-only value.\nfunc (c *Column) Columns() []*Column { return c.columns }\n\n// Column returns the child column matching the given name.\nfunc (c *Column) Column(name string) *Column {\n\tfor _, child := range c.columns {\n\t\tif child.Name() == name {\n\t\t\treturn child\n\t\t}\n\t}\n\treturn nil\n}\n\n// Pages returns a reader exposing all pages in this column, across row groups.\nfunc (c *Column) Pages() Pages {\n\tif c.index < 0 {\n\t\treturn emptyPages{}\n\t}\n\tr := &columnPages{\n\t\tpages: make([]filePages, len(c.file.rowGroups)),\n\t}\n\tfor i := range r.pages {\n\t\tr.pages[i].init(c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk))\n\t}\n\treturn r\n}\n\ntype columnPages struct {\n\tpages []filePages\n\tindex int\n}\n\nfunc (c *columnPages) ReadPage() (Page, error) {\n\tfor {\n\t\tif c.index >= len(c.pages) {\n\t\t\treturn nil, io.EOF\n\t\t}\n\t\tp, err := c.pages[c.index].ReadPage()\n\t\tif err == nil || err != io.EOF {\n\t\t\treturn p, err\n\t\t}\n\t\tc.index++\n\t}\n}\n\nfunc (c *columnPages) SeekToRow(rowIndex int64) error {\n\tc.index = 0\n\n\tfor c.index < len(c.pages) && c.pages[c.index].chunk.rowGroup.NumRows >= rowIndex {\n\t\trowIndex -= c.pages[c.index].chunk.rowGroup.NumRows\n\t\tc.index++\n\t}\n\n\tif c.index < len(c.pages) {\n\t\tif err := c.pages[c.index].SeekToRow(rowIndex); err != nil {\n\t\t\treturn err\n\t\t}\n\t\tfor i := range 
c.pages[c.index:] {\n\t\t\tp := &c.pages[c.index+i]\n\t\t\tif err := p.SeekToRow(0); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t}\n\treturn nil\n}\n\nfunc (c *columnPages) Close() error {\n\tvar lastErr error\n\n\tfor i := range c.pages {\n\t\tif err := c.pages[i].Close(); err != nil {\n\t\t\tlastErr = err\n\t\t}\n\t}\n\n\tc.pages = nil\n\tc.index = 0\n\treturn lastErr\n}\n\n// Depth returns the position of the column relative to the root.\nfunc (c *Column) Depth() int { return int(c.depth) }\n\n// MaxRepetitionLevel returns the maximum value of repetition levels on this\n// column.\nfunc (c *Column) MaxRepetitionLevel() int { return int(c.maxRepetitionLevel) }\n\n// MaxDefinitionLevel returns the maximum value of definition levels on this\n// column.\nfunc (c *Column) MaxDefinitionLevel() int { return int(c.maxDefinitionLevel) }\n\n// Index returns the position of the column in a row. Only leaf columns have a\n// column index, the method returns -1 when called on non-leaf columns.\nfunc (c *Column) Index() int { return int(c.index) }\n\n// GoType returns the Go type that best represents the parquet column.\nfunc (c *Column) GoType() reflect.Type { return goTypeOf(c) }\n\n// Value returns the sub-value in base for the child column at the given\n// index.\nfunc (c *Column) Value(base reflect.Value) reflect.Value {\n\treturn base.MapIndex(reflect.ValueOf(&c.schema.Name).Elem())\n}\n\n// String returns a human-readable string representation of the column.\nfunc (c *Column) String() string { return c.path.String() + \": \" + sprint(c.Name(), c) }\n\nfunc (c *Column) forEachLeaf(do func(*Column)) {\n\tif len(c.columns) == 0 {\n\t\tdo(c)\n\t} else {\n\t\tfor _, child := range c.columns {\n\t\t\tchild.forEachLeaf(do)\n\t\t}\n\t}\n}\n\nfunc openColumns(file *File) (*Column, error) {\n\tcl := columnLoader{}\n\n\tc, err := cl.open(file, nil)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\n\t// Validate that there aren't extra entries in the row group columns,\n\t// 
which would otherwise indicate that there are dangling data pages\n\t// in the file.\n\tfor index, rowGroup := range file.metadata.RowGroups {\n\t\tif cl.rowGroupColumnIndex != len(rowGroup.Columns) {\n\t\t\treturn nil, fmt.Errorf(\"row group at index %d contains %d columns but %d were referenced by the column schemas\",\n\t\t\t\tindex, len(rowGroup.Columns), cl.rowGroupColumnIndex)\n\t\t}\n\t}\n\n\t_, err = c.setLevels(0, 0, 0, 0)\n\treturn c, err\n}\n\nfunc (c *Column) setLevels(depth, repetition, definition, index int) (int, error) {\n\tif depth > MaxColumnDepth {\n\t\treturn -1, fmt.Errorf(\"cannot represent parquet columns with more than %d nested levels: %s\", MaxColumnDepth, c.path)\n\t}\n\tif index > MaxColumnIndex {\n\t\treturn -1, fmt.Errorf(\"cannot represent parquet rows with more than %d columns: %s\", MaxColumnIndex, c.path)\n\t}\n\tif repetition > MaxRepetitionLevel {\n\t\treturn -1, fmt.Errorf(\"cannot represent parquet columns with more than %d repetition levels: %s\", MaxRepetitionLevel, c.path)\n\t}\n\tif definition > MaxDefinitionLevel {\n\t\treturn -1, fmt.Errorf(\"cannot represent parquet columns with more than %d definition levels: %s\", MaxDefinitionLevel, c.path)\n\t}\n\n\tswitch schemaRepetitionTypeOf(c.schema) {\n\tcase format.Optional:\n\t\tdefinition++\n\tcase format.Repeated:\n\t\trepetition++\n\t\tdefinition++\n\t}\n\n\tc.depth = int8(depth)\n\tc.maxRepetitionLevel = byte(repetition)\n\tc.maxDefinitionLevel = byte(definition)\n\tdepth++\n\n\tif len(c.columns) > 0 {\n\t\tc.index = -1\n\t} else {\n\t\tc.index = int16(index)\n\t\tindex++\n\t}\n\n\tvar err error\n\tfor _, child := range c.columns {\n\t\tif index, err = child.setLevels(depth, repetition, definition, index); err != nil {\n\t\t\treturn -1, err\n\t\t}\n\t}\n\treturn index, nil\n}\n\ntype columnLoader struct {\n\tschemaIndex         int\n\tcolumnOrderIndex    int\n\trowGroupColumnIndex int\n}\n\nfunc (cl *columnLoader) open(file *File, path []string) (*Column, error) {\n\tc := 
&Column{\n\t\tfile:   file,\n\t\tschema: &file.metadata.Schema[cl.schemaIndex],\n\t}\n\tc.path = columnPath(path).append(c.schema.Name)\n\n\tcl.schemaIndex++\n\tnumChildren := int(c.schema.NumChildren)\n\n\tif numChildren == 0 {\n\t\tc.typ = schemaElementTypeOf(c.schema)\n\n\t\tif cl.columnOrderIndex < len(file.metadata.ColumnOrders) {\n\t\t\tc.order = &file.metadata.ColumnOrders[cl.columnOrderIndex]\n\t\t\tcl.columnOrderIndex++\n\t\t}\n\n\t\trowGroups := file.metadata.RowGroups\n\t\trowGroupColumnIndex := cl.rowGroupColumnIndex\n\t\tcl.rowGroupColumnIndex++\n\n\t\tc.chunks = make([]*format.ColumnChunk, 0, len(rowGroups))\n\t\tc.columnIndex = make([]*format.ColumnIndex, 0, len(rowGroups))\n\t\tc.offsetIndex = make([]*format.OffsetIndex, 0, len(rowGroups))\n\n\t\tfor i, rowGroup := range rowGroups {\n\t\t\tif rowGroupColumnIndex >= len(rowGroup.Columns) {\n\t\t\t\treturn nil, fmt.Errorf(\"row group at index %d does not have enough columns\", i)\n\t\t\t}\n\t\t\tc.chunks = append(c.chunks, &rowGroup.Columns[rowGroupColumnIndex])\n\t\t}\n\n\t\tif len(file.columnIndexes) > 0 {\n\t\t\tfor i := range rowGroups {\n\t\t\t\tif rowGroupColumnIndex >= len(file.columnIndexes) {\n\t\t\t\t\treturn nil, fmt.Errorf(\"row group at index %d does not have enough column index pages\", i)\n\t\t\t\t}\n\t\t\t\tc.columnIndex = append(c.columnIndex, &file.columnIndexes[rowGroupColumnIndex])\n\t\t\t}\n\t\t}\n\n\t\tif len(file.offsetIndexes) > 0 {\n\t\t\tfor i := range rowGroups {\n\t\t\t\tif rowGroupColumnIndex >= len(file.offsetIndexes) {\n\t\t\t\t\treturn nil, fmt.Errorf(\"row group at index %d does not have enough offset index pages\", i)\n\t\t\t\t}\n\t\t\t\tc.offsetIndex = append(c.offsetIndex, &file.offsetIndexes[rowGroupColumnIndex])\n\t\t\t}\n\t\t}\n\n\t\tif len(c.chunks) > 0 {\n\t\t\t// Pick the encoding and compression codec of the first chunk.\n\t\t\t//\n\t\t\t// Technically each column chunk may use a different compression\n\t\t\t// codec, and each page of the column chunk might 
have a different\n\t\t\t// encoding. Exposing these details does not provide a lot of value\n\t\t\t// to the end user.\n\t\t\t//\n\t\t\t// Programs that wish to determine the encoding and compression of\n\t\t\t// each page of the column should iterate through the pages and read\n\t\t\t// the page headers to determine which compression and encodings are\n\t\t\t// applied.\n\t\t\tfor _, encoding := range c.chunks[0].MetaData.Encoding {\n\t\t\t\tif c.encoding == nil {\n\t\t\t\t\tc.encoding = LookupEncoding(encoding)\n\t\t\t\t}\n\t\t\t\tif encoding != format.Plain && encoding != format.RLE {\n\t\t\t\t\tc.encoding = LookupEncoding(encoding)\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t}\n\t\t\tc.compression = LookupCompressionCodec(c.chunks[0].MetaData.Codec)\n\t\t}\n\n\t\treturn c, nil\n\t}\n\n\tc.typ = &groupType{}\n\tc.columns = make([]*Column, numChildren)\n\n\tfor i := range c.columns {\n\t\tif cl.schemaIndex >= len(file.metadata.Schema) {\n\t\t\treturn nil, fmt.Errorf(\"column %q has more children than there are schemas in the file: %d > %d\",\n\t\t\t\tc.schema.Name, cl.schemaIndex+1, len(file.metadata.Schema))\n\t\t}\n\n\t\tvar err error\n\t\tc.columns[i], err = cl.open(file, c.path)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"%s: %w\", c.schema.Name, err)\n\t\t}\n\t}\n\n\treturn c, nil\n}\n\nfunc schemaElementTypeOf(s *format.SchemaElement) Type {\n\tif lt := s.LogicalType; lt != nil {\n\t\t// A logical type exists, the Type interface implementations in this\n\t\t// package are all based on the logical parquet types declared in the\n\t\t// format sub-package so we can return them directly via a pointer type\n\t\t// conversion.\n\t\tswitch {\n\t\tcase lt.UTF8 != nil:\n\t\t\treturn (*stringType)(lt.UTF8)\n\t\tcase lt.Map != nil:\n\t\t\treturn (*mapType)(lt.Map)\n\t\tcase lt.List != nil:\n\t\t\treturn (*listType)(lt.List)\n\t\tcase lt.Enum != nil:\n\t\t\treturn (*enumType)(lt.Enum)\n\t\tcase lt.Decimal != nil:\n\t\t\t// A parquet decimal can be one of several 
different physical types.\n\t\t\tif t := s.Type; t != nil {\n\t\t\t\tvar typ Type\n\t\t\t\tswitch kind := Kind(*s.Type); kind {\n\t\t\t\tcase Int32:\n\t\t\t\t\ttyp = Int32Type\n\t\t\t\tcase Int64:\n\t\t\t\t\ttyp = Int64Type\n\t\t\t\tcase FixedLenByteArray:\n\t\t\t\t\tif s.TypeLength == nil {\n\t\t\t\t\t\tpanic(\"DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length\")\n\t\t\t\t\t}\n\t\t\t\t\ttyp = FixedLenByteArrayType(int(*s.TypeLength))\n\t\t\t\tdefault:\n\t\t\t\t\tpanic(\"DECIMAL must be of type INT32, INT64, or FIXED_LEN_BYTE_ARRAY but got \" + kind.String())\n\t\t\t\t}\n\t\t\t\treturn &decimalType{\n\t\t\t\t\tdecimal: *lt.Decimal,\n\t\t\t\t\tType:    typ,\n\t\t\t\t}\n\t\t\t}\n\t\tcase lt.Date != nil:\n\t\t\treturn (*dateType)(lt.Date)\n\t\tcase lt.Time != nil:\n\t\t\treturn (*timeType)(lt.Time)\n\t\tcase lt.Timestamp != nil:\n\t\t\treturn (*timestampType)(lt.Timestamp)\n\t\tcase lt.Integer != nil:\n\t\t\treturn (*intType)(lt.Integer)\n\t\tcase lt.Unknown != nil:\n\t\t\treturn (*nullType)(lt.Unknown)\n\t\tcase lt.Json != nil:\n\t\t\treturn (*jsonType)(lt.Json)\n\t\tcase lt.Bson != nil:\n\t\t\treturn (*bsonType)(lt.Bson)\n\t\tcase lt.UUID != nil:\n\t\t\treturn (*uuidType)(lt.UUID)\n\t\t}\n\t}\n\n\tif ct := s.ConvertedType; ct != nil {\n\t\t// This column contains no logical type but has a converted type, it\n\t\t// was likely created by an older parquet writer. 
Convert the legacy\n\t\t// type representation to the equivalent logical parquet type.\n\t\tswitch *ct {\n\t\tcase deprecated.UTF8:\n\t\t\treturn &stringType{}\n\t\tcase deprecated.Map:\n\t\t\treturn &mapType{}\n\t\tcase deprecated.MapKeyValue:\n\t\t\treturn &groupType{}\n\t\tcase deprecated.List:\n\t\t\treturn &listType{}\n\t\tcase deprecated.Enum:\n\t\t\treturn &enumType{}\n\t\tcase deprecated.Decimal:\n\t\t\tif s.Scale != nil && s.Precision != nil {\n\t\t\t\t// A parquet decimal can be one of several different physical types.\n\t\t\t\tif t := s.Type; t != nil {\n\t\t\t\t\tvar typ Type\n\t\t\t\t\tswitch kind := Kind(*s.Type); kind {\n\t\t\t\t\tcase Int32:\n\t\t\t\t\t\ttyp = Int32Type\n\t\t\t\t\tcase Int64:\n\t\t\t\t\t\ttyp = Int64Type\n\t\t\t\t\tcase FixedLenByteArray:\n\t\t\t\t\t\tif s.TypeLength == nil {\n\t\t\t\t\t\t\tpanic(\"DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length\")\n\t\t\t\t\t\t}\n\t\t\t\t\t\ttyp = FixedLenByteArrayType(int(*s.TypeLength))\n\t\t\t\t\tcase ByteArray:\n\t\t\t\t\t\ttyp = ByteArrayType\n\t\t\t\t\tdefault:\n\t\t\t\t\t\tpanic(\"DECIMAL must be of type INT32, INT64, BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY but got \" + kind.String())\n\t\t\t\t\t}\n\t\t\t\t\treturn &decimalType{\n\t\t\t\t\t\tdecimal: format.DecimalType{\n\t\t\t\t\t\t\tScale:     *s.Scale,\n\t\t\t\t\t\t\tPrecision: *s.Precision,\n\t\t\t\t\t\t},\n\t\t\t\t\t\tType: typ,\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\tcase deprecated.Date:\n\t\t\treturn &dateType{}\n\t\tcase deprecated.TimeMillis:\n\t\t\treturn &timeType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()}\n\t\tcase deprecated.TimeMicros:\n\t\t\treturn &timeType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()}\n\t\tcase deprecated.TimestampMillis:\n\t\t\treturn &timestampType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()}\n\t\tcase deprecated.TimestampMicros:\n\t\t\treturn &timestampType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()}\n\t\tcase deprecated.Uint8:\n\t\t\treturn 
&unsignedIntTypes[0]\n\t\tcase deprecated.Uint16:\n\t\t\treturn &unsignedIntTypes[1]\n\t\tcase deprecated.Uint32:\n\t\t\treturn &unsignedIntTypes[2]\n\t\tcase deprecated.Uint64:\n\t\t\treturn &unsignedIntTypes[3]\n\t\tcase deprecated.Int8:\n\t\t\treturn &signedIntTypes[0]\n\t\tcase deprecated.Int16:\n\t\t\treturn &signedIntTypes[1]\n\t\tcase deprecated.Int32:\n\t\t\treturn &signedIntTypes[2]\n\t\tcase deprecated.Int64:\n\t\t\treturn &signedIntTypes[3]\n\t\tcase deprecated.Json:\n\t\t\treturn &jsonType{}\n\t\tcase deprecated.Bson:\n\t\t\treturn &bsonType{}\n\t\tcase deprecated.Interval:\n\t\t\t// TODO\n\t\t}\n\t}\n\n\tif t := s.Type; t != nil {\n\t\t// The column only has a physical type, convert it to one of the\n\t\t// primitive types supported by this package.\n\t\tswitch kind := Kind(*t); kind {\n\t\tcase Boolean:\n\t\t\treturn BooleanType\n\t\tcase Int32:\n\t\t\treturn Int32Type\n\t\tcase Int64:\n\t\t\treturn Int64Type\n\t\tcase Int96:\n\t\t\treturn Int96Type\n\t\tcase Float:\n\t\t\treturn FloatType\n\t\tcase Double:\n\t\t\treturn DoubleType\n\t\tcase ByteArray:\n\t\t\treturn ByteArrayType\n\t\tcase FixedLenByteArray:\n\t\t\tif s.TypeLength != nil {\n\t\t\t\treturn FixedLenByteArrayType(int(*s.TypeLength))\n\t\t\t}\n\t\t}\n\t}\n\n\t// If we reach this point, we are likely reading a parquet column that was\n\t// written with a non-standard type or is in a newer version of the format\n\t// than this package supports.\n\treturn &nullType{}\n}\n\nfunc schemaRepetitionTypeOf(s *format.SchemaElement) format.FieldRepetitionType {\n\tif s.RepetitionType != nil {\n\t\treturn *s.RepetitionType\n\t}\n\treturn format.Required\n}\n\nfunc (c *Column) decompress(compressedPageData []byte, uncompressedPageSize int32) (page *buffer, err error) {\n\tpage = buffers.get(int(uncompressedPageSize))\n\tpage.data, err = c.compression.Decode(page.data, compressedPageData)\n\tif err != nil {\n\t\tpage.unref()\n\t\tpage = nil\n\t}\n\treturn page, err\n}\n\n// DecodeDataPageV1 decodes a 
data page from the header, compressed data, and\n// optional dictionary passed as arguments.\nfunc (c *Column) DecodeDataPageV1(header DataPageHeaderV1, page []byte, dict Dictionary) (Page, error) {\n\treturn c.decodeDataPageV1(header, &buffer{data: page}, dict, -1)\n}\n\nfunc (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer, dict Dictionary, size int32) (Page, error) {\n\tvar pageData = page.data\n\tvar err error\n\n\tif isCompressed(c.compression) {\n\t\tif page, err = c.decompress(pageData, size); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decompressing data page v1: %w\", err)\n\t\t}\n\t\tdefer page.unref()\n\t\tpageData = page.data\n\t}\n\n\tvar numValues = int(header.NumValues())\n\tvar repetitionLevels *buffer\n\tvar definitionLevels *buffer\n\n\tif c.maxRepetitionLevel > 0 {\n\t\tencoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel)\n\t\trepetitionLevels, pageData, err = decodeLevelsV1(encoding, numValues, pageData)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decoding repetition levels of data page v1: %w\", err)\n\t\t}\n\t\tdefer repetitionLevels.unref()\n\t}\n\n\tif c.maxDefinitionLevel > 0 {\n\t\tencoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel)\n\t\tdefinitionLevels, pageData, err = decodeLevelsV1(encoding, numValues, pageData)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decoding definition levels of data page v1: %w\", err)\n\t\t}\n\t\tdefer definitionLevels.unref()\n\n\t\t// Data pages v1 did not embed the number of null values,\n\t\t// so we have to compute it from the definition levels.\n\t\tnumValues -= countLevelsNotEqual(definitionLevels.data, c.maxDefinitionLevel)\n\t}\n\n\treturn c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict)\n}\n\n// DecodeDataPageV2 decodes a data page from the header, compressed data, and\n// optional dictionary passed as arguments.\nfunc (c *Column) 
DecodeDataPageV2(header DataPageHeaderV2, page []byte, dict Dictionary) (Page, error) {\n\treturn c.decodeDataPageV2(header, &buffer{data: page}, dict, -1)\n}\n\nfunc (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer, dict Dictionary, size int32) (Page, error) {\n\tvar numValues = int(header.NumValues())\n\tvar pageData = page.data\n\tvar err error\n\tvar repetitionLevels *buffer\n\tvar definitionLevels *buffer\n\n\tif length := header.RepetitionLevelsByteLength(); length > 0 {\n\t\tif c.maxRepetitionLevel == 0 {\n\t\t\t// In some cases we've observed files which have a non-zero\n\t\t\t// repetition level despite the column not being repeated\n\t\t\t// (nor nested within a repeated column).\n\t\t\t//\n\t\t\t// See https://github.com/apache/parquet-testing/pull/24\n\t\t\tpageData, err = skipLevelsV2(pageData, length)\n\t\t} else {\n\t\t\tencoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel)\n\t\t\trepetitionLevels, pageData, err = decodeLevelsV2(encoding, numValues, pageData, length)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decoding repetition levels of data page v2: %w\", io.ErrUnexpectedEOF)\n\t\t}\n\t\tif repetitionLevels != nil {\n\t\t\tdefer repetitionLevels.unref()\n\t\t}\n\t}\n\n\tif length := header.DefinitionLevelsByteLength(); length > 0 {\n\t\tif c.maxDefinitionLevel == 0 {\n\t\t\tpageData, err = skipLevelsV2(pageData, length)\n\t\t} else {\n\t\t\tencoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel)\n\t\t\tdefinitionLevels, pageData, err = decodeLevelsV2(encoding, numValues, pageData, length)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decoding definition levels of data page v2: %w\", io.ErrUnexpectedEOF)\n\t\t}\n\t\tif definitionLevels != nil {\n\t\t\tdefer definitionLevels.unref()\n\t\t}\n\t}\n\n\tif isCompressed(c.compression) && header.IsCompressed() {\n\t\tif page, err = c.decompress(pageData, size); err != nil {\n\t\t\treturn nil, 
fmt.Errorf(\"decompressing data page v2: %w\", err)\n\t\t}\n\t\tdefer page.unref()\n\t\tpageData = page.data\n\t}\n\n\tnumValues -= int(header.NumNulls())\n\treturn c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict)\n}\n\nfunc (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetitionLevels, definitionLevels, page *buffer, data []byte, dict Dictionary) (Page, error) {\n\tpageEncoding := LookupEncoding(header.Encoding())\n\tpageType := c.Type()\n\n\tif isDictionaryEncoding(pageEncoding) {\n\t\t// In some legacy configurations, the PLAIN_DICTIONARY encoding is used\n\t\t// on data page headers to indicate that the page contains indexes into\n\t\t// the dictionary page, but the page is still encoded using the RLE\n\t\t// encoding in this case, so we convert it to RLE_DICTIONARY.\n\t\tpageEncoding = &RLEDictionary\n\t\tpageType = indexedPageType{newIndexedType(pageType, dict)}\n\t}\n\n\tvar vbuf, obuf *buffer\n\tvar pageValues []byte\n\tvar pageOffsets []uint32\n\n\tif pageEncoding.CanDecodeInPlace() {\n\t\tvbuf = page\n\t\tpageValues = data\n\t} else {\n\t\tvbuf = buffers.get(pageType.EstimateDecodeSize(numValues, data, pageEncoding))\n\t\tdefer vbuf.unref()\n\t\tpageValues = vbuf.data\n\t}\n\n\t// Page offsets not needed when dictionary-encoded\n\tif pageType.Kind() == ByteArray && !isDictionaryEncoding(pageEncoding) {\n\t\tobuf = buffers.get(4 * (numValues + 1))\n\t\tdefer obuf.unref()\n\t\tpageOffsets = unsafecast.BytesToUint32(obuf.data)\n\t}\n\n\tvalues := pageType.NewValues(pageValues, pageOffsets)\n\tvalues, err := pageType.Decode(values, data, pageEncoding)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\n\tnewPage := pageType.NewPage(c.Index(), numValues, values)\n\tswitch {\n\tcase c.maxRepetitionLevel > 0:\n\t\tnewPage = newRepeatedPage(\n\t\t\tnewPage,\n\t\t\tc.maxRepetitionLevel,\n\t\t\tc.maxDefinitionLevel,\n\t\t\trepetitionLevels.data,\n\t\t\tdefinitionLevels.data,\n\t\t)\n\tcase 
c.maxDefinitionLevel > 0:\n\t\tnewPage = newOptionalPage(\n\t\t\tnewPage,\n\t\t\tc.maxDefinitionLevel,\n\t\t\tdefinitionLevels.data,\n\t\t)\n\t}\n\n\treturn newBufferedPage(newPage, vbuf, obuf, repetitionLevels, definitionLevels), nil\n}\n\nfunc decodeLevelsV1(enc encoding.Encoding, numValues int, data []byte) (*buffer, []byte, error) {\n\tif len(data) < 4 {\n\t\treturn nil, data, io.ErrUnexpectedEOF\n\t}\n\ti := 4\n\tj := 4 + int(binary.LittleEndian.Uint32(data))\n\tif j > len(data) {\n\t\treturn nil, data, io.ErrUnexpectedEOF\n\t}\n\tlevels, err := decodeLevels(enc, numValues, data[i:j])\n\treturn levels, data[j:], err\n}\n\nfunc decodeLevelsV2(enc encoding.Encoding, numValues int, data []byte, length int64) (*buffer, []byte, error) {\n\tlevels, err := decodeLevels(enc, numValues, data[:length])\n\treturn levels, data[length:], err\n}\n\nfunc decodeLevels(enc encoding.Encoding, numValues int, data []byte) (levels *buffer, err error) {\n\tlevels = buffers.get(numValues)\n\tlevels.data, err = enc.DecodeLevels(levels.data, data)\n\tif err != nil {\n\t\tlevels.unref()\n\t\tlevels = nil\n\t} else {\n\t\tswitch {\n\t\tcase len(levels.data) < numValues:\n\t\t\terr = fmt.Errorf(\"decoding level expected %d values but got only %d\", numValues, len(levels.data))\n\t\tcase len(levels.data) > numValues:\n\t\t\tlevels.data = levels.data[:numValues]\n\t\t}\n\t}\n\treturn levels, err\n}\n\nfunc skipLevelsV2(data []byte, length int64) ([]byte, error) {\n\tif length >= int64(len(data)) {\n\t\treturn data, io.ErrUnexpectedEOF\n\t}\n\treturn data[length:], nil\n}\n\n// DecodeDictionary decodes a data page from the header and compressed data\n// passed as arguments.\nfunc (c *Column) DecodeDictionary(header DictionaryPageHeader, page []byte) (Dictionary, error) {\n\treturn c.decodeDictionary(header, &buffer{data: page}, -1)\n}\n\nfunc (c *Column) decodeDictionary(header DictionaryPageHeader, page *buffer, size int32) (Dictionary, error) {\n\tpageData := page.data\n\n\tif 
isCompressed(c.compression) {\n\t\tvar err error\n\t\tif page, err = c.decompress(pageData, size); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decompressing dictionary page: %w\", err)\n\t\t}\n\t\tdefer page.unref()\n\t\tpageData = page.data\n\t}\n\n\tpageType := c.Type()\n\tpageEncoding := header.Encoding()\n\tif pageEncoding == format.PlainDictionary {\n\t\tpageEncoding = format.Plain\n\t}\n\n\tnumValues := int(header.NumValues())\n\tvalues := pageType.NewValues(nil, nil)\n\tvalues, err := pageType.Decode(values, pageData, LookupEncoding(pageEncoding))\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\treturn pageType.NewDictionary(int(c.index), numValues, values), nil\n}\n\nvar (\n\t_ Node = (*Column)(nil)\n)\n"
  },
  {
    "path": "column_buffer.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"io\"\n\t\"sort\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/internal/bitpack\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\n// ColumnBuffer is an interface representing columns of a row group.\n//\n// ColumnBuffer implements sort.Interface as a way to support reordering the\n// rows that have been written to it.\n//\n// The current implementation has a limitation which prevents applications from\n// providing custom versions of this interface because it contains unexported\n// methods. The only way to create ColumnBuffer values is to call the\n// NewColumnBuffer of Type instances. This limitation may be lifted in future\n// releases.\ntype ColumnBuffer interface {\n\t// Exposes a read-only view of the column buffer.\n\tColumnChunk\n\n\t// The column implements ValueReaderAt as a mechanism to read values at\n\t// specific locations within the buffer.\n\tValueReaderAt\n\n\t// The column implements ValueWriter as a mechanism to optimize the copy\n\t// of values into the buffer in contexts where the row information is\n\t// provided by the values because the repetition and definition levels\n\t// are set.\n\tValueWriter\n\n\t// For indexed columns, returns the underlying dictionary holding the column\n\t// values. If the column is not indexed, nil is returned.\n\tDictionary() Dictionary\n\n\t// Returns a copy of the column. 
The returned copy shares no memory with\n\t// the original, mutations of either column will not modify the other.\n\tClone() ColumnBuffer\n\n\t// Returns the column as a Page.\n\tPage() Page\n\n\t// Clears all rows written to the column.\n\tReset()\n\n\t// Returns the current capacity of the column (rows).\n\tCap() int\n\n\t// Returns the number of rows currently written to the column.\n\tLen() int\n\n\t// Compares rows at index i and j and reports whether i < j.\n\tLess(i, j int) bool\n\n\t// Swaps rows at index i and j.\n\tSwap(i, j int)\n\n\t// Returns the size of the column buffer in bytes.\n\tSize() int64\n\n\t// This method is employed to write rows from arrays of Go values into the\n\t// column buffer. The method is currently unexported because it uses unsafe\n\t// APIs which would be difficult for applications to leverage, increasing\n\t// the risk of introducing bugs in the code. As a consequence, applications\n\t// cannot use custom implementations of the ColumnBuffer interface since\n\t// they cannot declare an unexported method that would match this signature.\n\t// It means that in order to create a ColumnBuffer value, programs need to\n\t// go through a call to NewColumnBuffer on a Type instance. 
We make this\n\t// trade off for now as it is preferrable to optimize for safety over\n\t// extensibility in the public APIs, we might revisit in the future if we\n\t// learn about valid use cases for custom column buffer types.\n\twriteValues(rows sparse.Array, levels columnLevels)\n}\n\ntype columnLevels struct {\n\trepetitionDepth byte\n\trepetitionLevel byte\n\tdefinitionLevel byte\n}\n\nfunc columnIndexOfNullable(base ColumnBuffer, maxDefinitionLevel byte, definitionLevels []byte) ColumnIndex {\n\treturn &nullableColumnIndex{\n\t\tColumnIndex:        base.ColumnIndex(),\n\t\tmaxDefinitionLevel: maxDefinitionLevel,\n\t\tdefinitionLevels:   definitionLevels,\n\t}\n}\n\ntype nullableColumnIndex struct {\n\tColumnIndex\n\tmaxDefinitionLevel byte\n\tdefinitionLevels   []byte\n}\n\nfunc (index *nullableColumnIndex) NullPage(i int) bool {\n\treturn index.NullCount(i) == int64(len(index.definitionLevels))\n}\n\nfunc (index *nullableColumnIndex) NullCount(i int) int64 {\n\treturn int64(countLevelsNotEqual(index.definitionLevels, index.maxDefinitionLevel))\n}\n\ntype nullOrdering func(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool\n\nfunc nullsGoFirst(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool {\n\tif definitionLevel1 != maxDefinitionLevel {\n\t\treturn definitionLevel2 == maxDefinitionLevel\n\t} else {\n\t\treturn definitionLevel2 == maxDefinitionLevel && column.Less(i, j)\n\t}\n}\n\nfunc nullsGoLast(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool {\n\treturn definitionLevel1 == maxDefinitionLevel && (definitionLevel2 != maxDefinitionLevel || column.Less(i, j))\n}\n\n// reversedColumnBuffer is an adapter of ColumnBuffer which inverses the order\n// in which rows are ordered when the column gets sorted.\n//\n// This type is used when buffers are constructed with sorting columns ordering\n// values in descending 
order.
type reversedColumnBuffer struct{ ColumnBuffer }

// Less inverts the comparison of the wrapped ColumnBuffer, producing the
// reverse sort order.
func (col *reversedColumnBuffer) Less(i, j int) bool { return col.ColumnBuffer.Less(j, i) }

// optionalColumnBuffer is an implementation of the ColumnBuffer interface used
// as a wrapper to an underlying ColumnBuffer to manage the creation of
// definition levels.
//
// Null values are not written to the underlying column; instead, the buffer
// tracks offsets of row values in the column, null row values are represented
// by the value -1 and a definition level less than the max.
//
// This column buffer type is used for all leaf columns that have a non-zero
// max definition level and a zero repetition level, which may be because the
// column or one of its parent(s) are marked optional.
type optionalColumnBuffer struct {
	base               ColumnBuffer // holds only the non-null values
	reordered          bool         // true after Swap; Page must permute base to match rows
	maxDefinitionLevel byte
	rows               []int32 // index into base for each row, or -1 for null rows
	sortIndex          []int32 // scratch space used by Page for the cyclic sort
	definitionLevels   []byte  // one definition level per logical row
	nullOrdering       nullOrdering
}

// newOptionalColumnBuffer wraps base to track definition levels and null rows
// for an optional leaf column. The rows and definitionLevels slices are sized
// from the capacity of base.
func newOptionalColumnBuffer(base ColumnBuffer, maxDefinitionLevel byte, nullOrdering nullOrdering) *optionalColumnBuffer {
	n := base.Cap()
	return &optionalColumnBuffer{
		base:               base,
		maxDefinitionLevel: maxDefinitionLevel,
		rows:               make([]int32, 0, n),
		definitionLevels:   make([]byte, 0, n),
		nullOrdering:       nullOrdering,
	}
}

// Clone returns a deep copy of the column buffer. The sortIndex scratch space
// is intentionally not copied; Page rebuilds it on demand.
func (col *optionalColumnBuffer) Clone() ColumnBuffer {
	return &optionalColumnBuffer{
		base:               col.base.Clone(),
		reordered:          col.reordered,
		maxDefinitionLevel: col.maxDefinitionLevel,
		rows:               append([]int32{}, col.rows...),
		definitionLevels:   append([]byte{}, col.definitionLevels...),
		nullOrdering:       col.nullOrdering,
	}
}

func (col *optionalColumnBuffer) Type() Type {
	return col.base.Type()
}

// NumValues counts null rows too: there is one definition level per row
// whether or not a value was written to the underlying column.
func (col *optionalColumnBuffer) NumValues() int64 {
	return int64(len(col.definitionLevels))
}

func (col *optionalColumnBuffer) ColumnIndex() ColumnIndex {
	return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels)
}

func (col *optionalColumnBuffer) OffsetIndex() OffsetIndex {
	return col.base.OffsetIndex()
}

func (col *optionalColumnBuffer) BloomFilter() BloomFilter {
	return col.base.BloomFilter()
}

func (col *optionalColumnBuffer) Dictionary() Dictionary {
	return col.base.Dictionary()
}

func (col *optionalColumnBuffer) Column() int {
	return col.base.Column()
}

func (col *optionalColumnBuffer) Pages() Pages {
	return onePage(col.Page())
}

// Page materializes the buffered values into a page view. If rows were
// swapped since the last materialization, the underlying buffer is first
// permuted in place so its physical order matches the logical row order.
func (col *optionalColumnBuffer) Page() Page {
	// No need for any cyclic sorting if the rows have not been reordered.
	// This case is also important because the cyclic sorting modifies the
	// buffer which makes it unsafe to read the buffer concurrently.
	if col.reordered {
		numNulls := countLevelsNotEqual(col.definitionLevels, col.maxDefinitionLevel)
		numValues := len(col.rows) - numNulls

		if numValues > 0 {
			if cap(col.sortIndex) < numValues {
				col.sortIndex = make([]int32, numValues)
			}
			// Build the target permutation: sortIndex[current position in
			// base] = desired position (rank among non-null rows).
			sortIndex := col.sortIndex[:numValues]
			i := 0
			for _, j := range col.rows {
				if j >= 0 {
					sortIndex[j] = int32(i)
					i++
				}
			}

			// Cyclic sort: O(N)
			for i := range sortIndex {
				for j := int(sortIndex[i]); i != j; j = int(sortIndex[i]) {
					col.base.Swap(i, j)
					sortIndex[i], sortIndex[j] = sortIndex[j], sortIndex[i]
				}
			}
		}

		// After the permutation, non-null rows reference consecutive indexes
		// of the underlying buffer; rewrite the row mapping to match.
		i := 0
		for _, r := range col.rows {
			if r >= 0 {
				col.rows[i] = int32(i)
				i++
			}
		}

		col.reordered = false
	}

	return newOptionalPage(col.base.Page(), col.maxDefinitionLevel, col.definitionLevels)
}

func (col *optionalColumnBuffer) Reset() {
	col.base.Reset()
	col.rows = col.rows[:0]
	col.definitionLevels = col.definitionLevels[:0]
}

func (col *optionalColumnBuffer) Size() int64 {
	return int64(4*len(col.rows)+4*len(col.sortIndex)+len(col.definitionLevels)) + col.base.Size()
}

func (col *optionalColumnBuffer) Cap() int { return cap(col.rows) }

func (col *optionalColumnBuffer) Len() int { return len(col.rows) }

// Less delegates to the configured null ordering, which decides how null rows
// compare against non-null values.
func (col *optionalColumnBuffer) Less(i, j int) bool {
	return col.nullOrdering(
		col.base,
		int(col.rows[i]),
		int(col.rows[j]),
		col.maxDefinitionLevel,
		col.definitionLevels[i],
		col.definitionLevels[j],
	)
}

func (col *optionalColumnBuffer) Swap(i, j int) {
	// Because the underlying column does not contain null values, we cannot
	// swap its values at indexes i and j. We swap the row indexes only, then
	// reorder the underlying buffer using a cyclic sort when the buffer is
	// materialized into a page view.
	col.reordered = true
	col.rows[i], col.rows[j] = col.rows[j], col.rows[i]
	col.definitionLevels[i], col.definitionLevels[j] = col.definitionLevels[j], col.definitionLevels[i]
}

// WriteValues appends values to the buffer, routing non-null values (those at
// the max definition level) to the underlying column and recording a -1 row
// index for nulls.
func (col *optionalColumnBuffer) WriteValues(values []Value) (n int, err error) {
	rowIndex := int32(col.base.Len())

	for n < len(values) {
		// Collect index range of contiguous null values, from i to n. If this
		// for loop exhausts the values, all remaining if statements and for
		// loops will be no-ops and the loop will terminate.
		i := n
		for n < len(values) && values[n].definitionLevel != col.maxDefinitionLevel {
			n++
		}

		// Write the contiguous null values up until the first non-null value
		// obtained in the for loop above.
		for _, v := range values[i:n] {
			col.rows = append(col.rows, -1)
			col.definitionLevels = append(col.definitionLevels, v.definitionLevel)
		}

		// Collect index range of contiguous non-null values, from i to n.
		i = n
		for n < len(values) && values[n].definitionLevel == col.maxDefinitionLevel {
			n++
		}

		// As long as i < n we have non-null values still to write. It is
		// possible that we just exhausted the input values in which case i == n
		// and the outer for loop will terminate.
		if i < n {
			count, err := col.base.WriteValues(values[i:n])
			col.definitionLevels = appendLevel(col.definitionLevels, col.maxDefinitionLevel, count)

			for count > 0 {
				col.rows = append(col.rows, rowIndex)
				rowIndex++
				count--
			}

			if err != nil {
				return n, err
			}
		}
	}
	return n, nil
}

// writeValues is the fast path for writing a batch of values that all share
// the same definition level.
func (col *optionalColumnBuffer) writeValues(rows sparse.Array, levels columnLevels) {
	// The row count is zero when writing an null optional value, in which case
	// we still need to output a row to the buffer to record the definition
	// level.
	if rows.Len() == 0 {
		col.definitionLevels = append(col.definitionLevels, levels.definitionLevel)
		col.rows = append(col.rows, -1)
		return
	}

	col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, rows.Len())

	i := len(col.rows)
	j := len(col.rows) + rows.Len()

	if j <= cap(col.rows) {
		col.rows = col.rows[:j]
	} else {
		// Grow with extra headroom (capacity 2*j) to amortize future appends.
		tmp := make([]int32, j, 2*j)
		copy(tmp, 
col.rows)\n\t\tcol.rows = tmp\n\t}\n\n\tif levels.definitionLevel != col.maxDefinitionLevel {\n\t\tbroadcastValueInt32(col.rows[i:], -1)\n\t} else {\n\t\tbroadcastRangeInt32(col.rows[i:], int32(col.base.Len()))\n\t\tcol.base.writeValues(rows, levels)\n\t}\n}\n\nfunc (col *optionalColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) {\n\tlength := int64(len(col.definitionLevels))\n\tif offset < 0 {\n\t\treturn 0, errRowIndexOutOfBounds(offset, length)\n\t}\n\tif offset >= length {\n\t\treturn 0, io.EOF\n\t}\n\tif length -= offset; length < int64(len(values)) {\n\t\tvalues = values[:length]\n\t}\n\n\tnumNulls1 := int64(countLevelsNotEqual(col.definitionLevels[:offset], col.maxDefinitionLevel))\n\tnumNulls2 := int64(countLevelsNotEqual(col.definitionLevels[offset:offset+length], col.maxDefinitionLevel))\n\n\tif numNulls2 < length {\n\t\tn, err := col.base.ReadValuesAt(values[:length-numNulls2], offset-numNulls1)\n\t\tif err != nil {\n\t\t\treturn n, err\n\t\t}\n\t}\n\n\tif numNulls2 > 0 {\n\t\tcolumnIndex := ^int16(col.Column())\n\t\ti := numNulls2 - 1\n\t\tj := length - 1\n\t\tdefinitionLevels := col.definitionLevels[offset : offset+length]\n\t\tmaxDefinitionLevel := col.maxDefinitionLevel\n\n\t\tfor n := len(definitionLevels) - 1; n >= 0 && j > i; n-- {\n\t\t\tif definitionLevels[n] != maxDefinitionLevel {\n\t\t\t\tvalues[j] = Value{definitionLevel: definitionLevels[n], columnIndex: columnIndex}\n\t\t\t} else {\n\t\t\t\tvalues[j] = values[i]\n\t\t\t\ti--\n\t\t\t}\n\t\t\tj--\n\t\t}\n\t}\n\n\treturn int(length), nil\n}\n\n// repeatedColumnBuffer is an implementation of the ColumnBuffer interface used\n// as a wrapper to an underlying ColumnBuffer to manage the creation of\n// repetition levels, definition levels, and map rows to the region of the\n// underlying buffer that contains their sequence of values.\n//\n// Null values are not written to the underlying column; instead, the buffer\n// tracks offsets of row values in the column, null row values 
// are represented
// by the value -1 and a definition level less than the max.
//
// This column buffer type is used for all leaf columns that have a non-zero
// max repetition level, which may be because the column or one of its parent(s)
// are marked repeated.
type repeatedColumnBuffer struct {
	base               ColumnBuffer
	reordered          bool // true after Swap; Page must rebuild base in row order
	maxRepetitionLevel byte
	maxDefinitionLevel byte
	rows               []offsetMapping
	repetitionLevels   []byte
	definitionLevels   []byte
	buffer             []Value               // scratch space used when copying row values
	reordering         *repeatedColumnBuffer // lazily-created twin used by Page to rewrite rows in order
	nullOrdering       nullOrdering
}

// The offsetMapping type maps the logical offset of rows within the repetition
// and definition levels, to the base offsets in the underlying column buffers
// where the non-null values have been written.
type offsetMapping struct {
	offset     uint32
	baseOffset uint32
}

// newRepeatedColumnBuffer wraps base to track repetition and definition levels
// for a repeated leaf column. The rows slice capacity assumes roughly eight
// values per row.
func newRepeatedColumnBuffer(base ColumnBuffer, maxRepetitionLevel, maxDefinitionLevel byte, nullOrdering nullOrdering) *repeatedColumnBuffer {
	n := base.Cap()
	return &repeatedColumnBuffer{
		base:               base,
		maxRepetitionLevel: maxRepetitionLevel,
		maxDefinitionLevel: maxDefinitionLevel,
		rows:               make([]offsetMapping, 0, n/8),
		repetitionLevels:   make([]byte, 0, n),
		definitionLevels:   make([]byte, 0, n),
		nullOrdering:       nullOrdering,
	}
}

// Clone returns a deep copy. The scratch buffer and the reordering twin are
// not copied; they are recreated on demand.
func (col *repeatedColumnBuffer) Clone() ColumnBuffer {
	return &repeatedColumnBuffer{
		base:               col.base.Clone(),
		reordered:          col.reordered,
		maxRepetitionLevel: col.maxRepetitionLevel,
		maxDefinitionLevel: col.maxDefinitionLevel,
		rows:               append([]offsetMapping{}, col.rows...),
		repetitionLevels:   append([]byte{}, col.repetitionLevels...),
		definitionLevels:   append([]byte{}, col.definitionLevels...),
		nullOrdering:       col.nullOrdering,
	}
}

func (col *repeatedColumnBuffer) Type() Type {
	return col.base.Type()
}

func (col *repeatedColumnBuffer) NumValues() int64 {
	return int64(len(col.definitionLevels))
}

func (col *repeatedColumnBuffer) ColumnIndex() ColumnIndex {
	return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels)
}

func (col *repeatedColumnBuffer) OffsetIndex() OffsetIndex {
	return col.base.OffsetIndex()
}

func (col *repeatedColumnBuffer) BloomFilter() BloomFilter {
	return col.base.BloomFilter()
}

func (col *repeatedColumnBuffer) Dictionary() Dictionary {
	return col.base.Dictionary()
}

func (col *repeatedColumnBuffer) Column() int {
	return col.base.Column()
}

func (col *repeatedColumnBuffer) Pages() Pages {
	return onePage(col.Page())
}

// Page materializes the buffer into a page view. If rows were swapped since
// the last materialization, the rows are rewritten in logical order into a
// secondary buffer whose data slices are then swapped back into col.
func (col *repeatedColumnBuffer) Page() Page {
	if col.reordered {
		if col.reordering == nil {
			col.reordering = col.Clone().(*repeatedColumnBuffer)
		}

		column := col.reordering
		column.Reset()
		maxNumValues := 0
		defer func() {
			clearValues(col.buffer[:maxNumValues])
		}()

		baseOffset := 0

		for _, row := range col.rows {
			rowOffset := int(row.offset)
			rowLength := repeatedRowLength(col.repetitionLevels[rowOffset:])
			numNulls := countLevelsNotEqual(col.definitionLevels[rowOffset:rowOffset+rowLength], col.maxDefinitionLevel)
			numValues := rowLength - numNulls

			if numValues > 0 {
				if numValues > cap(col.buffer) {
					col.buffer = make([]Value, numValues)
				} else {
					col.buffer = col.buffer[:numValues]
				}
				// Copy this row's non-null values out of the old base and
				// append them to the reordered base.
				n, err := col.base.ReadValuesAt(col.buffer, int64(row.baseOffset))
				if err != nil && n < numValues {
					return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err)
				}
				if _, err := column.base.WriteValues(col.buffer); err != nil {
					return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err)
				}
				if numValues > maxNumValues {
					maxNumValues = numValues
				}
			}

			column.rows = append(column.rows, offsetMapping{
				offset:     uint32(len(column.repetitionLevels)),
				baseOffset: uint32(baseOffset),
			})

			column.repetitionLevels = append(column.repetitionLevels, col.repetitionLevels[rowOffset:rowOffset+rowLength]...)
			column.definitionLevels = append(column.definitionLevels, col.definitionLevels[rowOffset:rowOffset+rowLength]...)
			baseOffset += numValues
		}

		col.swapReorderingBuffer(column)
		col.reordered = false
	}

	return newRepeatedPage(
		col.base.Page(),
		col.maxRepetitionLevel,
		col.maxDefinitionLevel,
		col.repetitionLevels,
		col.definitionLevels,
	)
}

// swapReorderingBuffer exchanges the data slices of col and buf, keeping the
// freshly reordered content in col and recycling the old slices as the next
// reordering buffer.
func (col *repeatedColumnBuffer) swapReorderingBuffer(buf *repeatedColumnBuffer) {
	col.base, buf.base = buf.base, col.base
	col.rows, buf.rows = buf.rows, col.rows
	col.repetitionLevels, buf.repetitionLevels = buf.repetitionLevels, col.repetitionLevels
	col.definitionLevels, buf.definitionLevels = buf.definitionLevels, col.definitionLevels
}

func (col *repeatedColumnBuffer) Reset() {
	col.base.Reset()
	col.rows = col.rows[:0]
	col.repetitionLevels = col.repetitionLevels[:0]
	col.definitionLevels = col.definitionLevels[:0]
}

func (col *repeatedColumnBuffer) Size() int64 {
	return int64(8*len(col.rows)+len(col.repetitionLevels)+len(col.definitionLevels)) + col.base.Size()
}

func (col *repeatedColumnBuffer) Cap() int { return cap(col.rows) }

func (col *repeatedColumnBuffer) Len() int { return len(col.rows) }

// Less compares two rows element by element via the null ordering, falling
// back to comparing row lengths when one row is a prefix of the other.
//
// NOTE(review): x and y are the base offsets of the first value of each row
// and do not advance with k; whether later elements should be compared at
// advancing base offsets is not determinable from this code — confirm against
// the nullOrdering contract before changing.
func (col *repeatedColumnBuffer) Less(i, j int) bool {
	row1 := col.rows[i]
	row2 := col.rows[j]
	less := col.nullOrdering
	row1Length := repeatedRowLength(col.repetitionLevels[row1.offset:])
	row2Length := repeatedRowLength(col.repetitionLevels[row2.offset:])

	for k := 0; k < row1Length && k < row2Length; k++ {
		x := int(row1.baseOffset)
		y := int(row2.baseOffset)
		definitionLevel1 := col.definitionLevels[int(row1.offset)+k]
		definitionLevel2 := col.definitionLevels[int(row2.offset)+k]
		switch {
		case less(col.base, x, y, col.maxDefinitionLevel, definitionLevel1, definitionLevel2):
			return true
		case less(col.base, y, x, col.maxDefinitionLevel, definitionLevel2, definitionLevel1):
			return false
		}
	}

	return row1Length < row2Length
}

func (col *repeatedColumnBuffer) Swap(i, j int) {
	// Because the underlying column does not contain null values, and may hold
	// an arbitrary number of values per row, we cannot swap its values at
	// indexes i and j. We swap the row indexes only, then reorder the base
	// column buffer when its view is materialized into a page by creating a
	// copy and writing rows back to it following the order of rows in the
	// repeated column buffer.
	col.reordered = true
	col.rows[i], col.rows[j] = col.rows[j], col.rows[i]
}

// WriteValues splits the input into rows at each value carrying repetition
// level zero and writes them one row at a time.
func (col *repeatedColumnBuffer) WriteValues(values []Value) (numValues int, err error) {
	maxRowLen := 0
	defer func() {
		clearValues(col.buffer[:maxRowLen])
	}()

	for i := 0; i < len(values); {
		j := i

		// A repetition level of zero marks the start of a row; skip past it so
		// the scan below stops at the start of the *next* row.
		if values[j].repetitionLevel == 0 {
			j++
		}

		for j < len(values) && values[j].repetitionLevel != 0 {
			j++
		}

		if err := col.writeRow(values[i:j]); err != nil {
			return numValues, err
		}

		if len(col.buffer) > maxRowLen {
			maxRowLen = len(col.buffer)
		}

		numValues += j - i
		i = j
	}

	return numValues, nil
}

// writeRow writes a single row: fully-defined (non-null) values go to the base
// column, and repetition/definition levels are recorded for every element.
func (col *repeatedColumnBuffer) writeRow(row []Value) error {
	col.buffer = col.buffer[:0]

	for _, v := range row {
		if v.definitionLevel == col.maxDefinitionLevel {
			col.buffer = append(col.buffer, v)
		}
	}

	baseOffset := col.base.NumValues()
	if len(col.buffer) > 0 {
		if _, err := col.base.WriteValues(col.buffer); err != nil {
			return err
		}
	}

	// Only record a new row mapping when this slice starts a new row; slices
	// continuing a previous row extend the existing mapping.
	if row[0].repetitionLevel == 0 {
		col.rows = append(col.rows, offsetMapping{
			offset:     uint32(len(col.repetitionLevels)),
			baseOffset: uint32(baseOffset),
		})
	}

	for _, v := range row {
		col.repetitionLevels = append(col.repetitionLevels, v.repetitionLevel)
		col.definitionLevels = append(col.definitionLevels, v.definitionLevel)
	}

	return nil
}

// writeValues is the fast path for writing a batch of values that all share
// the same repetition and definition levels.
func (col *repeatedColumnBuffer) writeValues(row sparse.Array, levels columnLevels) {
	if levels.repetitionLevel == 0 {
		col.rows = append(col.rows, offsetMapping{
			offset:     uint32(len(col.repetitionLevels)),
			baseOffset: uint32(col.base.NumValues()),
		})
	}

	// An empty array still contributes one pair of levels, recording a null or
	// empty element at this position of the row.
	if row.Len() == 0 {
		col.repetitionLevels = append(col.repetitionLevels, levels.repetitionLevel)
		col.definitionLevels = append(col.definitionLevels, levels.definitionLevel)
		return
	}

	col.repetitionLevels = appendLevel(col.repetitionLevels, levels.repetitionLevel, row.Len())
	col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, row.Len())

	// Only fully-defined values are stored in the base column.
	if levels.definitionLevel == col.maxDefinitionLevel {
		col.base.writeValues(row, levels)
	}
}

func (col *repeatedColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) {
	// TODO:
	panic("NOT IMPLEMENTED")
}

// repeatedRowLength gives the length of the repeated row starting at the
// beginning of the repetitionLevels slice.
func repeatedRowLength(repetitionLevels []byte) int {
	// If a repetition level exists, at least one value is required to represent
	// the column.
	if len(repetitionLevels) > 0 {
		// The subsequent levels will represent the start of a new record when
		// they go back to zero.
		if i := bytes.IndexByte(repetitionLevels[1:], 0); i >= 0 {
			return i + 1
		}
	}
	return len(repetitionLevels)
}

// =============================================================================
// 
// The types below are in-memory implementations of the ColumnBuffer interface
// for each parquet type.
//
// These column buffers are created by calling NewColumnBuffer on parquet.Type
// instances; each parquet type manages to construct column buffers of the
// appropriate type, which ensures that we are packing as many values as we
// can in memory.
//
// See Type.NewColumnBuffer for details about how these types get created.
// =============================================================================

type booleanColumnBuffer struct{ booleanPage }

// newBooleanColumnBuffer allocates a bit-packed buffer sized for numValues
// booleans. The column index is stored complemented (^columnIndex), matching
// the encoding used when constructing Value instances.
func newBooleanColumnBuffer(typ Type, columnIndex int16, numValues int32) *booleanColumnBuffer {
	// Boolean values are bit-packed, we can fit up to 8 values per byte.
	bufferSize := (numValues + 7) / 8
	return &booleanColumnBuffer{
		booleanPage: booleanPage{
			typ:         typ,
			bits:        make([]byte, 0, bufferSize),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *booleanColumnBuffer) Clone() ColumnBuffer {
	return &booleanColumnBuffer{
		booleanPage: booleanPage{
			typ:         col.typ,
			bits:        append([]byte{}, col.bits...),
			offset:      col.offset,
			numValues:   col.numValues,
			columnIndex: col.columnIndex,
		},
	}
}

func (col *booleanColumnBuffer) ColumnIndex() ColumnIndex {
	return booleanColumnIndex{&col.booleanPage}
}

func (col *booleanColumnBuffer) OffsetIndex() OffsetIndex {
	return booleanOffsetIndex{&col.booleanPage}
}

func (col *booleanColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *booleanColumnBuffer) Dictionary() Dictionary { return nil }

func (col *booleanColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *booleanColumnBuffer) Page() Page { return &col.booleanPage }

func (col *booleanColumnBuffer) Reset() {
	col.bits = col.bits[:0]
	col.offset = 0
	col.numValues = 0
}

// Cap reports capacity in values, hence 8 values per buffered byte.
func (col *booleanColumnBuffer) Cap() int { return 8 * cap(col.bits) }

func (col *booleanColumnBuffer) Len() int { return int(col.numValues) }

// Less orders false before true.
func (col *booleanColumnBuffer) Less(i, j int) bool {
	a := col.valueAt(i)
	b := col.valueAt(j)
	return a != b && !a
}

// valueAt extracts the i-th bit of the packed buffer.
func (col *booleanColumnBuffer) valueAt(i int) bool {
	j := uint32(i) / 8
	k := uint32(i) % 8
	return ((col.bits[j] >> k) & 1) != 0
}

// setValueAt sets the i-th bit of the packed buffer to v.
func (col *booleanColumnBuffer) setValueAt(i int, v bool) {
	// `offset` is always zero in the page of a column buffer
	j := uint32(i) / 8
	k := uint32(i) % 8
	x := byte(0)
	if v {
		x = 1
	}
	col.bits[j] = (col.bits[j] & ^(1 << k)) | (x << k)
}

func (col *booleanColumnBuffer) Swap(i, j int) {
	a := col.valueAt(i)
	b := col.valueAt(j)
	col.setValueAt(i, b)
	col.setValueAt(j, a)
}

func (col *booleanColumnBuffer) WriteBooleans(values []bool) (int, error) {
	col.writeValues(sparse.MakeBoolArray(values).UnsafeArray(), columnLevels{})
	return len(values), nil
}

func (col *booleanColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues bit-packs the low bit of each input byte into the buffer. It
// first fills the partially-written tail byte, then packs full bytes eight
// values at a time, and finally writes any remaining values bit by bit.
func (col *booleanColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	numBytes := bitpack.ByteCount(uint(col.numValues) + uint(rows.Len()))
	if cap(col.bits) < numBytes {
		col.bits = append(make([]byte, 0, max(numBytes, 2*cap(col.bits))), col.bits...)
	}
	col.bits = col.bits[:numBytes]
	i := 0
	// r is the number of bits needed to reach the next byte boundary
	// (8 when the buffer is already aligned).
	r := 8 - (int(col.numValues) % 8)
	bytes := rows.Uint8Array()

	if r <= bytes.Len() {
		// First we attempt to write enough bits to align the number of values
		// in the column buffer on 8 bytes. After this step the next bit should
		// be written at the zero'th index of a byte of the buffer.
		if r < 8 {
			var b byte
			for i < r {
				v := bytes.Index(i)
				b |= (v & 1) << uint(i)
				i++
			}
			x := uint(col.numValues) / 8
			y := uint(col.numValues) % 8
			col.bits[x] = (b << y) | (col.bits[x] & ^(0xFF << y))
			col.numValues += int32(i)
		}

		if n := ((bytes.Len() - i) / 8) * 8; n > 0 {
			// At this stage, we know that we have at least 8 bits to write
			// and the bits will be aligned on the address of a byte in the
			// output buffer. We can work on 8 values per loop iteration,
			// packing them into a single byte and writing it to the output
			// buffer. This effectively reduces by 87.5% the number of memory
			// stores that the program needs to perform to generate the values.
			i += sparse.GatherBits(col.bits[col.numValues/8:], bytes.Slice(i, i+n))
			col.numValues += int32(n)
		}
	}

	// Slow path: remaining values are written one bit at a time.
	for i < bytes.Len() {
		x := uint(col.numValues) / 8
		y := uint(col.numValues) % 8
		b := bytes.Index(i)
		col.bits[x] = ((b & 1) << y) | (col.bits[x] & ^(1 << y))
		col.numValues++
		i++
	}

	col.bits = col.bits[:bitpack.ByteCount(uint(col.numValues))]
}

// ReadValuesAt copies values starting at row offset into the output slice,
// returning io.EOF when fewer than len(values) values remain.
func (col *booleanColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(col.numValues))
	case i >= int(col.numValues):
		return 0, io.EOF
	default:
		for n < len(values) && i < int(col.numValues) {
			values[n] = col.makeValue(col.valueAt(i))
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int32ColumnBuffer struct{ int32Page }

func newInt32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int32ColumnBuffer {
	return &int32ColumnBuffer{
		int32Page: int32Page{
			typ:         typ,
			values:      make([]int32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *int32ColumnBuffer) Clone() ColumnBuffer {
	return &int32ColumnBuffer{
		int32Page: int32Page{
			typ:         col.typ,
			values:      append([]int32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int32ColumnBuffer) ColumnIndex() ColumnIndex { return int32ColumnIndex{&col.int32Page} }

func (col *int32ColumnBuffer) OffsetIndex() OffsetIndex { return int32OffsetIndex{&col.int32Page} }

func (col *int32ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int32ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int32ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int32ColumnBuffer) Page() Page { return &col.int32Page }

func (col *int32ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int32ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int32ColumnBuffer) Len() int { return len(col.values) }

func (col *int32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *int32ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw little-endian INT32 data; len(b) must be a multiple of 4.
func (col *int32ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToInt32(b)...)
	return len(b), nil
}

func (col *int32ColumnBuffer) WriteInt32s(values []int32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int32ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues gathers int32 values from the sparse array, growing the buffer
// with 2x headroom when needed.
func (col *int32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]int32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherInt32(col.values[n:], rows.Int32Array())
}

func (col *int32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int64ColumnBuffer struct{ int64Page }

func newInt64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int64ColumnBuffer {
	return &int64ColumnBuffer{
		int64Page: int64Page{
			typ:         typ,
			values:      make([]int64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *int64ColumnBuffer) Clone() ColumnBuffer {
	return &int64ColumnBuffer{
		int64Page: int64Page{
			typ:         col.typ,
			values:      append([]int64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int64ColumnBuffer) ColumnIndex() ColumnIndex { return int64ColumnIndex{&col.int64Page} }

func (col *int64ColumnBuffer) OffsetIndex() OffsetIndex { return int64OffsetIndex{&col.int64Page} }

func (col *int64ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int64ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int64ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int64ColumnBuffer) Page() Page { return &col.int64Page }

func (col *int64ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int64ColumnBuffer) Cap() int { return 
cap(col.values) }

func (col *int64ColumnBuffer) Len() int { return len(col.values) }

func (col *int64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *int64ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw little-endian INT64 data; len(b) must be a multiple of 8.
func (col *int64ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToInt64(b)...)
	return len(b), nil
}

func (col *int64ColumnBuffer) WriteInt64s(values []int64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int64ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues gathers int64 values from the sparse array, growing the buffer
// with 2x headroom when needed.
func (col *int64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]int64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherInt64(col.values[n:], rows.Int64Array())
}

func (col *int64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int96ColumnBuffer struct{ int96Page }

func newInt96ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int96ColumnBuffer {
	return &int96ColumnBuffer{
		int96Page: int96Page{
			typ:         typ,
			values:      make([]deprecated.Int96, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *int96ColumnBuffer) Clone() ColumnBuffer {
	return &int96ColumnBuffer{
		int96Page: int96Page{
			typ:         col.typ,
			values:      append([]deprecated.Int96{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int96ColumnBuffer) ColumnIndex() ColumnIndex { return int96ColumnIndex{&col.int96Page} }

func (col *int96ColumnBuffer) OffsetIndex() OffsetIndex { return int96OffsetIndex{&col.int96Page} }

func (col *int96ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int96ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int96ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int96ColumnBuffer) Page() Page { return &col.int96Page }

func (col *int96ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int96ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int96ColumnBuffer) Len() int { return len(col.values) }

func (col *int96ColumnBuffer) Less(i, j int) bool { return col.values[i].Less(col.values[j]) }

func (col *int96ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw INT96 data; len(b) must be a multiple of 12.
func (col *int96ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 12) != 0 {
		return 0, fmt.Errorf("cannot write INT96 values from input of size %d", len(b))
	}
	col.values = append(col.values, deprecated.BytesToInt96(b)...)
	return len(b), nil
}

func (col *int96ColumnBuffer) WriteInt96s(values []deprecated.Int96) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int96ColumnBuffer) WriteValues(values []Value) (int, error) {
	for _, v := range values {
		col.values = append(col.values, v.Int96())
	}
	return len(values), nil
}

// writeValues copies values one at a time; INT96 is wider than the 8-byte
// word the sparse gather helpers operate on, so no vectorized path exists.
func (col *int96ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		col.values = append(col.values, *(*deprecated.Int96)(p))
	}
}

func (col *int96ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type floatColumnBuffer struct{ floatPage }

func newFloatColumnBuffer(typ Type, columnIndex int16, numValues int32) *floatColumnBuffer {
	return &floatColumnBuffer{
		floatPage: floatPage{
			typ:         typ,
			values:      make([]float32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *floatColumnBuffer) Clone() ColumnBuffer {
	return &floatColumnBuffer{
		floatPage: floatPage{
			typ:         col.typ,
			values:      append([]float32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *floatColumnBuffer) ColumnIndex() ColumnIndex { return floatColumnIndex{&col.floatPage} }

func (col *floatColumnBuffer) OffsetIndex() OffsetIndex { return floatOffsetIndex{&col.floatPage} }

func (col *floatColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *floatColumnBuffer) Dictionary() Dictionary { return nil }

func (col *floatColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *floatColumnBuffer) Page() Page { return &col.floatPage }

func (col *floatColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *floatColumnBuffer) Cap() int { return cap(col.values) }

func (col *floatColumnBuffer) Len() int { return len(col.values) }

func (col *floatColumnBuffer) Less(i, j int) bool { return col.values[i] < 
col.values[j] }

func (col *floatColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw little-endian FLOAT data; len(b) must be a multiple of 4.
func (col *floatColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write FLOAT values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToFloat32(b)...)
	return len(b), nil
}

func (col *floatColumnBuffer) WriteFloats(values []float32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *floatColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues gathers float32 values from the sparse array, growing the
// buffer with 2x headroom when needed.
func (col *floatColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]float32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherFloat32(col.values[n:], rows.Float32Array())
}

func (col *floatColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type doubleColumnBuffer struct{ doublePage }

func newDoubleColumnBuffer(typ Type, columnIndex int16, numValues int32) *doubleColumnBuffer {
	return &doubleColumnBuffer{
		doublePage: doublePage{
			typ:         typ,
			values:      make([]float64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *doubleColumnBuffer) Clone() ColumnBuffer {
	return &doubleColumnBuffer{
		doublePage: doublePage{
			typ:         col.typ,
			values:      append([]float64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *doubleColumnBuffer) ColumnIndex() ColumnIndex { return doubleColumnIndex{&col.doublePage} }

func (col *doubleColumnBuffer) OffsetIndex() OffsetIndex { return doubleOffsetIndex{&col.doublePage} }

func (col *doubleColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *doubleColumnBuffer) Dictionary() Dictionary { return nil }

func (col *doubleColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *doubleColumnBuffer) Page() Page { return &col.doublePage }

func (col *doubleColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *doubleColumnBuffer) Cap() int { return cap(col.values) }

func (col *doubleColumnBuffer) Len() int { return len(col.values) }

func (col *doubleColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *doubleColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw little-endian DOUBLE data; len(b) must be a multiple of 8.
func (col *doubleColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write DOUBLE values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToFloat64(b)...)
	return len(b), nil
}

func (col *doubleColumnBuffer) WriteDoubles(values []float64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *doubleColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues gathers float64 values from the sparse array, growing the
// buffer with 2x headroom when needed.
func (col *doubleColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]float64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherFloat64(col.values[n:], rows.Float64Array())
}

func (col *doubleColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type byteArrayColumnBuffer struct {
	byteArrayPage
	lengths []uint32 // per-value lengths; paired with the page's offsets
	scratch []byte   // reusable temporary buffer
}

func newByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *byteArrayColumnBuffer {
	return &byteArrayColumnBuffer{
		byteArrayPage: byteArrayPage{
			typ:         typ,
			values:      make([]byte, 0, typ.EstimateSize(int(numValues))),
			offsets:     make([]uint32, 0, numValues+1),
			columnIndex: ^columnIndex,
		},
		lengths: make([]uint32, 0, numValues),
	}
}

func (col *byteArrayColumnBuffer) Clone() ColumnBuffer {
	return &byteArrayColumnBuffer{
		byteArrayPage: byteArrayPage{
			typ:         col.typ,
			values:      col.cloneValues(),
			offsets:     col.cloneOffsets(),
			columnIndex: col.columnIndex,
		},
		lengths: col.cloneLengths(),
	}
}

func (col *byteArrayColumnBuffer) cloneLengths() []uint32 {
	lengths := make([]uint32, len(col.lengths))
	copy(lengths, col.lengths)
	return lengths
}

func (col *byteArrayColumnBuffer) ColumnIndex() ColumnIndex {
	return byteArrayColumnIndex{&col.byteArrayPage}
}

func (col *byteArrayColumnBuffer) OffsetIndex() OffsetIndex {
	return byteArrayOffsetIndex{&col.byteArrayPage}
}

func (col *byteArrayColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col 
*byteArrayColumnBuffer) Dictionary() Dictionary { return nil }\n\nfunc (col *byteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) }\n\nfunc (col *byteArrayColumnBuffer) Page() Page {\n\tif len(col.lengths) > 0 && orderOfUint32(col.offsets) < 1 { // unordered?\n\t\tif cap(col.scratch) < len(col.values) {\n\t\t\tcol.scratch = make([]byte, 0, cap(col.values))\n\t\t} else {\n\t\t\tcol.scratch = col.scratch[:0]\n\t\t}\n\n\t\tfor i := range col.lengths {\n\t\t\tn := len(col.scratch)\n\t\t\tcol.scratch = append(col.scratch, col.index(i)...)\n\t\t\tcol.offsets[i] = uint32(n)\n\t\t}\n\n\t\tcol.values, col.scratch = col.scratch, col.values\n\t}\n\t// The offsets have the total length as the last item. Since we are about to\n\t// expose the column buffer's internal state as a Page value we ensure that\n\t// the last offset is the total length of all values.\n\tcol.offsets = append(col.offsets[:len(col.lengths)], uint32(len(col.values)))\n\treturn &col.byteArrayPage\n}\n\nfunc (col *byteArrayColumnBuffer) Reset() {\n\tcol.values = col.values[:0]\n\tcol.offsets = col.offsets[:0]\n\tcol.lengths = col.lengths[:0]\n}\n\nfunc (col *byteArrayColumnBuffer) NumRows() int64 { return int64(col.Len()) }\n\nfunc (col *byteArrayColumnBuffer) NumValues() int64 { return int64(col.Len()) }\n\nfunc (col *byteArrayColumnBuffer) Cap() int { return cap(col.lengths) }\n\nfunc (col *byteArrayColumnBuffer) Len() int { return len(col.lengths) }\n\nfunc (col *byteArrayColumnBuffer) Less(i, j int) bool {\n\treturn bytes.Compare(col.index(i), col.index(j)) < 0\n}\n\nfunc (col *byteArrayColumnBuffer) Swap(i, j int) {\n\tcol.offsets[i], col.offsets[j] = col.offsets[j], col.offsets[i]\n\tcol.lengths[i], col.lengths[j] = col.lengths[j], col.lengths[i]\n}\n\nfunc (col *byteArrayColumnBuffer) Write(b []byte) (int, error) {\n\t_, n, err := col.writeByteArrays(b)\n\treturn n, err\n}\n\nfunc (col *byteArrayColumnBuffer) WriteByteArrays(values []byte) (int, error) {\n\tn, _, err := 
col.writeByteArrays(values)\n\treturn n, err\n}\n\nfunc (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (count, bytes int, err error) {\n\tbaseCount := len(col.lengths)\n\tbaseBytes := len(col.values) + (plain.ByteArrayLengthSize * len(col.lengths))\n\n\terr = plain.RangeByteArray(values, func(value []byte) error {\n\t\tcol.append(unsafecast.BytesToString(value))\n\t\treturn nil\n\t})\n\n\tcount = len(col.lengths) - baseCount\n\tbytes = (len(col.values) - baseBytes) + (plain.ByteArrayLengthSize * count)\n\treturn count, bytes, err\n}\n\nfunc (col *byteArrayColumnBuffer) WriteValues(values []Value) (int, error) {\n\tvar model Value\n\tcol.writeValues(makeArrayValue(values, unsafe.Offsetof(model.ptr)), columnLevels{})\n\treturn len(values), nil\n}\n\nfunc (col *byteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {\n\tfor i := 0; i < rows.Len(); i++ {\n\t\tp := rows.Index(i)\n\t\tcol.append(*(*string)(p))\n\t}\n}\n\nfunc (col *byteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {\n\ti := int(offset)\n\tswitch {\n\tcase i < 0:\n\t\treturn 0, errRowIndexOutOfBounds(offset, int64(len(col.lengths)))\n\tcase i >= len(col.lengths):\n\t\treturn 0, io.EOF\n\tdefault:\n\t\tfor n < len(values) && i < len(col.lengths) {\n\t\t\tvalues[n] = col.makeValueBytes(col.index(i))\n\t\t\tn++\n\t\t\ti++\n\t\t}\n\t\tif n < len(values) {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n}\n\nfunc (col *byteArrayColumnBuffer) append(value string) {\n\tcol.offsets = append(col.offsets, uint32(len(col.values)))\n\tcol.lengths = append(col.lengths, uint32(len(value)))\n\tcol.values = append(col.values, value...)\n}\n\nfunc (col *byteArrayColumnBuffer) index(i int) []byte {\n\toffset := col.offsets[i]\n\tlength := col.lengths[i]\n\tend := offset + length\n\treturn col.values[offset:end:end]\n}\n\ntype fixedLenByteArrayColumnBuffer struct {\n\tfixedLenByteArrayPage\n\ttmp []byte\n}\n\nfunc newFixedLenByteArrayColumnBuffer(typ Type, 
columnIndex int16, numValues int32) *fixedLenByteArrayColumnBuffer {\n\tsize := typ.Length()\n\treturn &fixedLenByteArrayColumnBuffer{\n\t\tfixedLenByteArrayPage: fixedLenByteArrayPage{\n\t\t\ttyp:         typ,\n\t\t\tsize:        size,\n\t\t\tdata:        make([]byte, 0, typ.EstimateSize(int(numValues))),\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t\ttmp: make([]byte, size),\n\t}\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) Clone() ColumnBuffer {\n\treturn &fixedLenByteArrayColumnBuffer{\n\t\tfixedLenByteArrayPage: fixedLenByteArrayPage{\n\t\t\ttyp:         col.typ,\n\t\t\tsize:        col.size,\n\t\t\tdata:        append([]byte{}, col.data...),\n\t\t\tcolumnIndex: col.columnIndex,\n\t\t},\n\t\ttmp: make([]byte, col.size),\n\t}\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) ColumnIndex() ColumnIndex {\n\treturn fixedLenByteArrayColumnIndex{&col.fixedLenByteArrayPage}\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) OffsetIndex() OffsetIndex {\n\treturn fixedLenByteArrayOffsetIndex{&col.fixedLenByteArrayPage}\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) BloomFilter() BloomFilter { return nil }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Dictionary() Dictionary { return nil }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Page() Page { return &col.fixedLenByteArrayPage }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Reset() { col.data = col.data[:0] }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Cap() int { return cap(col.data) / col.size }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Len() int { return len(col.data) / col.size }\n\nfunc (col *fixedLenByteArrayColumnBuffer) Less(i, j int) bool {\n\treturn bytes.Compare(col.index(i), col.index(j)) < 0\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) Swap(i, j int) {\n\tt, u, v := col.tmp[:col.size], col.index(i), col.index(j)\n\tcopy(t, u)\n\tcopy(u, v)\n\tcopy(v, t)\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) index(i 
int) []byte {\n\tj := (i + 0) * col.size\n\tk := (i + 1) * col.size\n\treturn col.data[j:k:k]\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) Write(b []byte) (int, error) {\n\tn, err := col.WriteFixedLenByteArrays(b)\n\treturn n * col.size, err\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) (int, error) {\n\td, m := len(values)/col.size, len(values)%col.size\n\tif m != 0 {\n\t\treturn 0, fmt.Errorf(\"cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d\", col.size, len(values))\n\t}\n\tcol.data = append(col.data, values...)\n\treturn d, nil\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) WriteValues(values []Value) (int, error) {\n\tfor _, v := range values {\n\t\tcol.data = append(col.data, v.byteArray()...)\n\t}\n\treturn len(values), nil\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {\n\tn := col.size * rows.Len()\n\ti := len(col.data)\n\tj := len(col.data) + n\n\n\tif cap(col.data) < j {\n\t\tcol.data = append(make([]byte, 0, max(i+n, 2*cap(col.data))), col.data...)\n\t}\n\n\tcol.data = col.data[:j]\n\tnewData := col.data[i:]\n\n\tfor i := 0; i < rows.Len(); i++ {\n\t\tp := rows.Index(i)\n\t\tcopy(newData[i*col.size:], unsafe.Slice((*byte)(p), col.size))\n\t}\n}\n\nfunc (col *fixedLenByteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {\n\ti := int(offset) * col.size\n\tswitch {\n\tcase i < 0:\n\t\treturn 0, errRowIndexOutOfBounds(offset, int64(len(col.data)/col.size))\n\tcase i >= len(col.data):\n\t\treturn 0, io.EOF\n\tdefault:\n\t\tfor n < len(values) && i < len(col.data) {\n\t\t\tvalues[n] = col.makeValueBytes(col.data[i : i+col.size])\n\t\t\tn++\n\t\t\ti += col.size\n\t\t}\n\t\tif n < len(values) {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n}\n\ntype uint32ColumnBuffer struct{ uint32Page }\n\nfunc newUint32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint32ColumnBuffer {\n\treturn 
&uint32ColumnBuffer{\n\t\tuint32Page: uint32Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      make([]uint32, 0, numValues),\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *uint32ColumnBuffer) Clone() ColumnBuffer {\n\treturn &uint32ColumnBuffer{\n\t\tuint32Page: uint32Page{\n\t\t\ttyp:         col.typ,\n\t\t\tvalues:      append([]uint32{}, col.values...),\n\t\t\tcolumnIndex: col.columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *uint32ColumnBuffer) ColumnIndex() ColumnIndex { return uint32ColumnIndex{&col.uint32Page} }\n\nfunc (col *uint32ColumnBuffer) OffsetIndex() OffsetIndex { return uint32OffsetIndex{&col.uint32Page} }\n\nfunc (col *uint32ColumnBuffer) BloomFilter() BloomFilter { return nil }\n\nfunc (col *uint32ColumnBuffer) Dictionary() Dictionary { return nil }\n\nfunc (col *uint32ColumnBuffer) Pages() Pages { return onePage(col.Page()) }\n\nfunc (col *uint32ColumnBuffer) Page() Page { return &col.uint32Page }\n\nfunc (col *uint32ColumnBuffer) Reset() { col.values = col.values[:0] }\n\nfunc (col *uint32ColumnBuffer) Cap() int { return cap(col.values) }\n\nfunc (col *uint32ColumnBuffer) Len() int { return len(col.values) }\n\nfunc (col *uint32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }\n\nfunc (col *uint32ColumnBuffer) Swap(i, j int) {\n\tcol.values[i], col.values[j] = col.values[j], col.values[i]\n}\n\nfunc (col *uint32ColumnBuffer) Write(b []byte) (int, error) {\n\tif (len(b) % 4) != 0 {\n\t\treturn 0, fmt.Errorf(\"cannot write INT32 values from input of size %d\", len(b))\n\t}\n\tcol.values = append(col.values, unsafecast.BytesToUint32(b)...)\n\treturn len(b), nil\n}\n\nfunc (col *uint32ColumnBuffer) WriteUint32s(values []uint32) (int, error) {\n\tcol.values = append(col.values, values...)\n\treturn len(values), nil\n}\n\nfunc (col *uint32ColumnBuffer) WriteValues(values []Value) (int, error) {\n\tvar model Value\n\tcol.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})\n\treturn 
len(values), nil\n}\n\nfunc (col *uint32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {\n\tif n := len(col.values) + rows.Len(); n > cap(col.values) {\n\t\tcol.values = append(make([]uint32, 0, max(n, 2*cap(col.values))), col.values...)\n\t}\n\tn := len(col.values)\n\tcol.values = col.values[:n+rows.Len()]\n\tsparse.GatherUint32(col.values[n:], rows.Uint32Array())\n}\n\nfunc (col *uint32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {\n\ti := int(offset)\n\tswitch {\n\tcase i < 0:\n\t\treturn 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))\n\tcase i >= len(col.values):\n\t\treturn 0, io.EOF\n\tdefault:\n\t\tfor n < len(values) && i < len(col.values) {\n\t\t\tvalues[n] = col.makeValue(col.values[i])\n\t\t\tn++\n\t\t\ti++\n\t\t}\n\t\tif n < len(values) {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n}\n\ntype uint64ColumnBuffer struct{ uint64Page }\n\nfunc newUint64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint64ColumnBuffer {\n\treturn &uint64ColumnBuffer{\n\t\tuint64Page: uint64Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      make([]uint64, 0, numValues),\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *uint64ColumnBuffer) Clone() ColumnBuffer {\n\treturn &uint64ColumnBuffer{\n\t\tuint64Page: uint64Page{\n\t\t\ttyp:         col.typ,\n\t\t\tvalues:      append([]uint64{}, col.values...),\n\t\t\tcolumnIndex: col.columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *uint64ColumnBuffer) ColumnIndex() ColumnIndex { return uint64ColumnIndex{&col.uint64Page} }\n\nfunc (col *uint64ColumnBuffer) OffsetIndex() OffsetIndex { return uint64OffsetIndex{&col.uint64Page} }\n\nfunc (col *uint64ColumnBuffer) BloomFilter() BloomFilter { return nil }\n\nfunc (col *uint64ColumnBuffer) Dictionary() Dictionary { return nil }\n\nfunc (col *uint64ColumnBuffer) Pages() Pages { return onePage(col.Page()) }\n\nfunc (col *uint64ColumnBuffer) Page() Page { return &col.uint64Page }\n\nfunc (col 
*uint64ColumnBuffer) Reset() { col.values = col.values[:0] }\n\nfunc (col *uint64ColumnBuffer) Cap() int { return cap(col.values) }\n\nfunc (col *uint64ColumnBuffer) Len() int { return len(col.values) }\n\nfunc (col *uint64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }\n\nfunc (col *uint64ColumnBuffer) Swap(i, j int) {\n\tcol.values[i], col.values[j] = col.values[j], col.values[i]\n}\n\nfunc (col *uint64ColumnBuffer) Write(b []byte) (int, error) {\n\tif (len(b) % 8) != 0 {\n\t\treturn 0, fmt.Errorf(\"cannot write INT64 values from input of size %d\", len(b))\n\t}\n\tcol.values = append(col.values, unsafecast.BytesToUint64(b)...)\n\treturn len(b), nil\n}\n\nfunc (col *uint64ColumnBuffer) WriteUint64s(values []uint64) (int, error) {\n\tcol.values = append(col.values, values...)\n\treturn len(values), nil\n}\n\nfunc (col *uint64ColumnBuffer) WriteValues(values []Value) (int, error) {\n\tvar model Value\n\tcol.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})\n\treturn len(values), nil\n}\n\nfunc (col *uint64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {\n\tif n := len(col.values) + rows.Len(); n > cap(col.values) {\n\t\tcol.values = append(make([]uint64, 0, max(n, 2*cap(col.values))), col.values...)\n\t}\n\tn := len(col.values)\n\tcol.values = col.values[:n+rows.Len()]\n\tsparse.GatherUint64(col.values[n:], rows.Uint64Array())\n}\n\nfunc (col *uint64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {\n\ti := int(offset)\n\tswitch {\n\tcase i < 0:\n\t\treturn 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))\n\tcase i >= len(col.values):\n\t\treturn 0, io.EOF\n\tdefault:\n\t\tfor n < len(values) && i < len(col.values) {\n\t\t\tvalues[n] = col.makeValue(col.values[i])\n\t\t\tn++\n\t\t\ti++\n\t\t}\n\t\tif n < len(values) {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n}\n\ntype be128ColumnBuffer struct{ be128Page }\n\nfunc newBE128ColumnBuffer(typ Type, 
columnIndex int16, numValues int32) *be128ColumnBuffer {\n\treturn &be128ColumnBuffer{\n\t\tbe128Page: be128Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      make([][16]byte, 0, numValues),\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *be128ColumnBuffer) Clone() ColumnBuffer {\n\treturn &be128ColumnBuffer{\n\t\tbe128Page: be128Page{\n\t\t\ttyp:         col.typ,\n\t\t\tvalues:      append([][16]byte{}, col.values...),\n\t\t\tcolumnIndex: col.columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *be128ColumnBuffer) ColumnIndex() ColumnIndex {\n\treturn be128ColumnIndex{&col.be128Page}\n}\n\nfunc (col *be128ColumnBuffer) OffsetIndex() OffsetIndex {\n\treturn be128OffsetIndex{&col.be128Page}\n}\n\nfunc (col *be128ColumnBuffer) BloomFilter() BloomFilter { return nil }\n\nfunc (col *be128ColumnBuffer) Dictionary() Dictionary { return nil }\n\nfunc (col *be128ColumnBuffer) Pages() Pages { return onePage(col.Page()) }\n\nfunc (col *be128ColumnBuffer) Page() Page { return &col.be128Page }\n\nfunc (col *be128ColumnBuffer) Reset() { col.values = col.values[:0] }\n\nfunc (col *be128ColumnBuffer) Cap() int { return cap(col.values) }\n\nfunc (col *be128ColumnBuffer) Len() int { return len(col.values) }\n\nfunc (col *be128ColumnBuffer) Less(i, j int) bool {\n\treturn lessBE128(&col.values[i], &col.values[j])\n}\n\nfunc (col *be128ColumnBuffer) Swap(i, j int) {\n\tcol.values[i], col.values[j] = col.values[j], col.values[i]\n}\n\nfunc (col *be128ColumnBuffer) WriteValues(values []Value) (int, error) {\n\tif n := len(col.values) + len(values); n > cap(col.values) {\n\t\tcol.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...)\n\t}\n\tn := len(col.values)\n\tcol.values = col.values[:n+len(values)]\n\tnewValues := col.values[n:]\n\tfor i, v := range values {\n\t\tcopy(newValues[i][:], v.byteArray())\n\t}\n\treturn len(values), nil\n}\n\nfunc (col *be128ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {\n\tif n := len(col.values) + 
rows.Len(); n > cap(col.values) {\n\t\tcol.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...)\n\t}\n\tn := len(col.values)\n\tcol.values = col.values[:n+rows.Len()]\n\tsparse.GatherUint128(col.values[n:], rows.Uint128Array())\n}\n\nfunc (col *be128ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {\n\ti := int(offset)\n\tswitch {\n\tcase i < 0:\n\t\treturn 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))\n\tcase i >= len(col.values):\n\t\treturn 0, io.EOF\n\tdefault:\n\t\tfor n < len(values) && i < len(col.values) {\n\t\t\tvalues[n] = col.makeValue(&col.values[i])\n\t\t\tn++\n\t\t\ti++\n\t\t}\n\t\tif n < len(values) {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n}\n\nvar (\n\t_ sort.Interface = (ColumnBuffer)(nil)\n\t_ io.Writer      = (*byteArrayColumnBuffer)(nil)\n\t_ io.Writer      = (*fixedLenByteArrayColumnBuffer)(nil)\n)\n"
  },
  {
    "path": "column_buffer_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\nimport (\n\t\"github.com/segmentio/parquet-go/internal/bytealg\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n\t\"golang.org/x/sys/cpu\"\n)\n\nfunc broadcastValueInt32(dst []int32, src int8) {\n\tbytealg.Broadcast(unsafecast.Int32ToBytes(dst), byte(src))\n}\n\n//go:noescape\nfunc broadcastRangeInt32AVX2(dst []int32, base int32)\n\nfunc broadcastRangeInt32(dst []int32, base int32) {\n\tif len(dst) >= 8 && cpu.X86.HasAVX2 {\n\t\tbroadcastRangeInt32AVX2(dst, base)\n\t} else {\n\t\tfor i := range dst {\n\t\t\tdst[i] = base + int32(i)\n\t\t}\n\t}\n}\n\n//go:noescape\nfunc writePointersBE128(values [][16]byte, rows sparse.Array)\n"
  },
  {
    "path": "column_buffer_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func broadcastRangeInt32AVX2(dst []int32, base int32)\nTEXT ·broadcastRangeInt32AVX2(SB), NOSPLIT, $0-28\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVL base+24(FP), CX\n    XORQ SI, SI\n\n    CMPQ BX, $8\n    JB test1x4\n\n    VMOVDQU ·range0n8(SB), Y0         // [0,1,2,3,4,5,6,7]\n    VPBROADCASTD ·range0n8+32(SB), Y1 // [8,8,8,8,8,8,8,8]\n    VPBROADCASTD base+24(FP), Y2      // [base...]\n    VPADDD Y2, Y0, Y0                 // [base,base+1,...]\n\n    MOVQ BX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n    JMP test8x4\nloop8x4:\n    VMOVDQU Y0, (AX)(SI*4)\n    VPADDD Y1, Y0, Y0\n    ADDQ $8, SI\ntest8x4:\n    CMPQ SI, DI\n    JNE loop8x4\n    VZEROUPPER\n    JMP test1x4\n\nloop1x4:\n    INCQ SI\n    MOVL CX, DX\n    IMULL SI, DX\n    MOVL DX, -4(AX)(SI*4)\ntest1x4:\n    CMPQ SI, BX\n    JNE loop1x4\n    RET\n\n// func writePointersBE128(values [][16]byte, rows sparse.Array)\nTEXT ·writePointersBE128(SB), NOSPLIT, $0-48\n    MOVQ values_base+0(FP), AX\n    MOVQ rows_array_ptr+24(FP), BX\n    MOVQ rows_array_len+32(FP), CX\n    MOVQ rows_array_off+40(FP), DX\n\n    XORQ SI, SI\n    JMP test\nloop:\n    PXOR X0, X0\n    MOVQ (BX), DI // *[16]byte\n    CMPQ DI, $0\n    JE next\n    MOVOU (DI), X0\nnext:\n    MOVOU X0, (AX)\n    ADDQ $16, AX\n    ADDQ DX, BX\n    INCQ SI\ntest:\n    CMPQ SI, CX\n    JNE loop\n    RET\n"
  },
  {
    "path": "column_buffer_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"encoding/json\"\n\t\"math/bits\"\n\t\"reflect\"\n\t\"time\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\n// writeRowsFunc is the type of functions that apply rows to a set of column\n// buffers.\n//\n// - columns is the array of column buffer where the rows are written.\n//\n// - rows is the array of Go values to write to the column buffers.\n//\n//   - levels is used to track the column index, repetition and definition levels\n//     of values when writing optional or repeated columns.\ntype writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error\n\n// writeRowsFuncOf generates a writeRowsFunc function for the given Go type and\n// parquet schema. The column path indicates the column that the function is\n// being generated for in the parquet schema.\nfunc writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\tif leaf, exists := schema.Lookup(path...); exists && leaf.Node.Type().LogicalType() != nil && leaf.Node.Type().LogicalType().Json != nil {\n\t\treturn writeRowsFuncOfJSON(t, schema, path)\n\t}\n\n\tswitch t {\n\tcase reflect.TypeOf(deprecated.Int96{}):\n\t\treturn writeRowsFuncOfRequired(t, schema, path)\n\tcase reflect.TypeOf(time.Time{}):\n\t\treturn writeRowsFuncOfTime(t, schema, path)\n\t}\n\n\tswitch t.Kind() {\n\tcase reflect.Bool,\n\t\treflect.Int,\n\t\treflect.Uint,\n\t\treflect.Int32,\n\t\treflect.Uint32,\n\t\treflect.Int64,\n\t\treflect.Uint64,\n\t\treflect.Float32,\n\t\treflect.Float64,\n\t\treflect.String:\n\t\treturn writeRowsFuncOfRequired(t, schema, path)\n\n\tcase reflect.Slice:\n\t\tif t.Elem().Kind() == reflect.Uint8 {\n\t\t\treturn writeRowsFuncOfRequired(t, schema, path)\n\t\t} else {\n\t\t\treturn writeRowsFuncOfSlice(t, schema, path)\n\t\t}\n\n\tcase reflect.Array:\n\t\tif 
t.Elem().Kind() == reflect.Uint8 {\n\t\t\treturn writeRowsFuncOfRequired(t, schema, path)\n\t\t}\n\n\tcase reflect.Pointer:\n\t\treturn writeRowsFuncOfPointer(t, schema, path)\n\n\tcase reflect.Struct:\n\t\treturn writeRowsFuncOfStruct(t, schema, path)\n\n\tcase reflect.Map:\n\t\treturn writeRowsFuncOfMap(t, schema, path)\n\t}\n\n\tpanic(\"cannot convert Go values of type \" + typeNameOf(t) + \" to parquet value\")\n}\n\nfunc writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\tcolumn := schema.mapping.lookup(path)\n\tcolumnIndex := column.columnIndex\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tcolumns[columnIndex].writeValues(rows, levels)\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc {\n\tnullIndex := nullIndexFuncOf(t)\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 {\n\t\t\treturn writeRows(columns, rows, levels)\n\t\t}\n\n\t\tnulls := acquireBitmap(rows.Len())\n\t\tdefer releaseBitmap(nulls)\n\t\tnullIndex(nulls.bits, rows)\n\n\t\tnullLevels := levels\n\t\tlevels.definitionLevel++\n\t\t// In this function, we are dealing with optional values which are\n\t\t// neither pointers nor slices; for example, a int32 field marked\n\t\t// \"optional\" in its parent struct.\n\t\t//\n\t\t// We need to find zero values, which should be represented as nulls\n\t\t// in the parquet column. 
In order to minimize the calls to writeRows\n\t\t// and maximize throughput, we use the nullIndex and nonNullIndex\n\t\t// functions, which are type-specific implementations of the algorithm.\n\t\t//\n\t\t// Sections of the input that are contiguous nulls or non-nulls can be\n\t\t// sent to a single call to writeRows to be written to the underlying\n\t\t// buffer since they share the same definition level.\n\t\t//\n\t\t// This optimization is defeated by inputs alternating null and non-null\n\t\t// sequences of single values, we do not expect this condition to be a\n\t\t// common case.\n\t\tfor i := 0; i < rows.Len(); {\n\t\t\tj := 0\n\t\t\tx := i / 64\n\t\t\ty := i % 64\n\n\t\t\tif y != 0 {\n\t\t\t\tif b := nulls.bits[x] >> uint(y); b == 0 {\n\t\t\t\t\tx++\n\t\t\t\t\ty = 0\n\t\t\t\t} else {\n\t\t\t\t\ty += bits.TrailingZeros64(b)\n\t\t\t\t\tgoto writeNulls\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor x < len(nulls.bits) && nulls.bits[x] == 0 {\n\t\t\t\tx++\n\t\t\t}\n\n\t\t\tif x < len(nulls.bits) {\n\t\t\t\ty = bits.TrailingZeros64(nulls.bits[x]) % 64\n\t\t\t}\n\n\t\twriteNulls:\n\t\t\tif j = x*64 + y; j > rows.Len() {\n\t\t\t\tj = rows.Len()\n\t\t\t}\n\n\t\t\tif i < j {\n\t\t\t\tif err := writeRows(columns, rows.Slice(i, j), nullLevels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t\ti = j\n\t\t\t}\n\n\t\t\tif y != 0 {\n\t\t\t\tif b := nulls.bits[x] >> uint(y); b == (1<<uint64(y))-1 {\n\t\t\t\t\tx++\n\t\t\t\t\ty = 0\n\t\t\t\t} else {\n\t\t\t\t\ty += bits.TrailingZeros64(^b)\n\t\t\t\t\tgoto writeNonNulls\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tfor x < len(nulls.bits) && nulls.bits[x] == ^uint64(0) {\n\t\t\t\tx++\n\t\t\t}\n\n\t\t\tif x < len(nulls.bits) {\n\t\t\t\ty = bits.TrailingZeros64(^nulls.bits[x]) % 64\n\t\t\t}\n\n\t\twriteNonNulls:\n\t\t\tif j = x*64 + y; j > rows.Len() {\n\t\t\t\tj = rows.Len()\n\t\t\t}\n\n\t\t\tif i < j {\n\t\t\t\tif err := writeRows(columns, rows.Slice(i, j), levels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t\ti = 
j\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\telemType := t.Elem()\n\telemSize := uintptr(elemType.Size())\n\twriteRows := writeRowsFuncOf(elemType, schema, path)\n\n\tif len(path) == 0 {\n\t\t// This code path is taken when generating a writeRowsFunc for a pointer\n\t\t// type. In this case, we do not need to increase the definition level\n\t\t// since we are not deailng with an optional field but a pointer to the\n\t\t// row type.\n\t\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\t\tif rows.Len() == 0 {\n\t\t\t\treturn writeRows(columns, rows, levels)\n\t\t\t}\n\n\t\t\tfor i := 0; i < rows.Len(); i++ {\n\t\t\t\tp := *(*unsafe.Pointer)(rows.Index(i))\n\t\t\t\ta := sparse.Array{}\n\t\t\t\tif p != nil {\n\t\t\t\t\ta = makeArray(p, 1, elemSize)\n\t\t\t\t}\n\t\t\t\tif err := writeRows(columns, a, levels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn nil\n\t\t}\n\t}\n\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 {\n\t\t\treturn writeRows(columns, rows, levels)\n\t\t}\n\n\t\tfor i := 0; i < rows.Len(); i++ {\n\t\t\tp := *(*unsafe.Pointer)(rows.Index(i))\n\t\t\ta := sparse.Array{}\n\t\t\telemLevels := levels\n\t\t\tif p != nil {\n\t\t\t\ta = makeArray(p, 1, elemSize)\n\t\t\t\telemLevels.definitionLevel++\n\t\t\t}\n\t\t\tif err := writeRows(columns, a, elemLevels); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfSlice(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\telemType := t.Elem()\n\telemSize := uintptr(elemType.Size())\n\twriteRows := writeRowsFuncOf(elemType, schema, path)\n\n\t// When the element is a pointer type, the writeRows function will be an\n\t// instance returned by writeRowsFuncOfPointer, which handles incrementing\n\t// the definition level if the pointer 
value is not nil.\n\tdefinitionLevelIncrement := byte(0)\n\tif elemType.Kind() != reflect.Ptr {\n\t\tdefinitionLevelIncrement = 1\n\t}\n\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 {\n\t\t\treturn writeRows(columns, rows, levels)\n\t\t}\n\n\t\tlevels.repetitionDepth++\n\n\t\tfor i := 0; i < rows.Len(); i++ {\n\t\t\tp := (*sliceHeader)(rows.Index(i))\n\t\t\ta := makeArray(p.base, p.len, elemSize)\n\t\t\tb := sparse.Array{}\n\n\t\t\telemLevels := levels\n\t\t\tif a.Len() > 0 {\n\t\t\t\tb = a.Slice(0, 1)\n\t\t\t\telemLevels.definitionLevel += definitionLevelIncrement\n\t\t\t}\n\n\t\t\tif err := writeRows(columns, b, elemLevels); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\n\t\t\tif a.Len() > 1 {\n\t\t\t\telemLevels.repetitionLevel = elemLevels.repetitionDepth\n\n\t\t\t\tif err := writeRows(columns, a.Slice(1, a.Len()), elemLevels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\ttype column struct {\n\t\toffset    uintptr\n\t\twriteRows writeRowsFunc\n\t}\n\n\tfields := structFieldsOf(t)\n\tcolumns := make([]column, len(fields))\n\n\tfor i, f := range fields {\n\t\toptional := false\n\t\tcolumnPath := path.append(f.Name)\n\t\tforEachStructTagOption(f, func(_ reflect.Type, option, _ string) {\n\t\t\tswitch option {\n\t\t\tcase \"list\":\n\t\t\t\tcolumnPath = columnPath.append(\"list\", \"element\")\n\t\t\tcase \"optional\":\n\t\t\t\toptional = true\n\t\t\t}\n\t\t})\n\n\t\twriteRows := writeRowsFuncOf(f.Type, schema, columnPath)\n\t\tif optional {\n\t\t\tswitch f.Type.Kind() {\n\t\t\tcase reflect.Pointer, reflect.Slice:\n\t\t\tdefault:\n\t\t\t\twriteRows = writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows)\n\t\t\t}\n\t\t}\n\n\t\tcolumns[i] = column{\n\t\t\toffset:    f.Offset,\n\t\t\twriteRows: writeRows,\n\t\t}\n\t}\n\n\treturn func(buffers []ColumnBuffer, 
rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 {\n\t\t\tfor _, column := range columns {\n\t\t\t\tif err := column.writeRows(buffers, rows, levels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\t\t} else {\n\t\t\tfor _, column := range columns {\n\t\t\t\tif err := column.writeRows(buffers, rows.Offset(column.offset), levels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\tkeyPath := path.append(\"key_value\", \"key\")\n\tkeyType := t.Key()\n\tkeySize := uintptr(keyType.Size())\n\twriteKeys := writeRowsFuncOf(keyType, schema, keyPath)\n\n\tvaluePath := path.append(\"key_value\", \"value\")\n\tvalueType := t.Elem()\n\tvalueSize := uintptr(valueType.Size())\n\twriteValues := writeRowsFuncOf(valueType, schema, valuePath)\n\n\twriteKeyValues := func(columns []ColumnBuffer, keys, values sparse.Array, levels columnLevels) error {\n\t\tif err := writeKeys(columns, keys, levels); err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif err := writeValues(columns, values, levels); err != nil {\n\t\t\treturn err\n\t\t}\n\t\treturn nil\n\t}\n\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 {\n\t\t\treturn writeKeyValues(columns, rows, rows, levels)\n\t\t}\n\n\t\tlevels.repetitionDepth++\n\t\tmapKey := reflect.New(keyType).Elem()\n\t\tmapValue := reflect.New(valueType).Elem()\n\n\t\tfor i := 0; i < rows.Len(); i++ {\n\t\t\tm := reflect.NewAt(t, rows.Index(i)).Elem()\n\n\t\t\tif m.Len() == 0 {\n\t\t\t\tempty := sparse.Array{}\n\t\t\t\tif err := writeKeyValues(columns, empty, empty, levels); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\telemLevels := levels\n\t\t\t\telemLevels.definitionLevel++\n\n\t\t\t\tfor it := m.MapRange(); it.Next(); {\n\t\t\t\t\tmapKey.SetIterKey(it)\n\t\t\t\t\tmapValue.SetIterValue(it)\n\n\t\t\t\t\tk := 
makeArray(unsafecast.PointerOfValue(mapKey), 1, keySize)\n\t\t\t\t\tv := makeArray(unsafecast.PointerOfValue(mapValue), 1, valueSize)\n\n\t\t\t\t\tif err := writeKeyValues(columns, k, v, elemLevels); err != nil {\n\t\t\t\t\t\treturn err\n\t\t\t\t\t}\n\n\t\t\t\t\telemLevels.repetitionLevel = elemLevels.repetitionDepth\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfJSON(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\t// If this is a string or a byte array write directly.\n\tswitch t.Kind() {\n\tcase reflect.String:\n\t\treturn writeRowsFuncOfRequired(t, schema, path)\n\tcase reflect.Slice:\n\t\tif t.Elem().Kind() == reflect.Uint8 {\n\t\t\treturn writeRowsFuncOfRequired(t, schema, path)\n\t\t}\n\t}\n\n\t// Otherwise handle with a json.Marshal\n\tasStrT := reflect.TypeOf(string(\"\"))\n\twriter := writeRowsFuncOfRequired(asStrT, schema, path)\n\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 {\n\t\t\treturn writer(columns, rows, levels)\n\t\t}\n\t\tfor i := 0; i < rows.Len(); i++ {\n\t\t\tval := reflect.NewAt(t, rows.Index(i))\n\t\t\tasI := val.Interface()\n\n\t\t\tb, err := json.Marshal(asI)\n\t\t\tif err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\n\t\t\tasStr := string(b)\n\t\t\ta := sparse.MakeStringArray([]string{asStr})\n\t\t\tif err := writer(columns, a.UnsafeArray(), levels); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t\treturn nil\n\t}\n}\n\nfunc writeRowsFuncOfTime(_ reflect.Type, schema *Schema, path columnPath) writeRowsFunc {\n\tt := reflect.TypeOf(int64(0))\n\telemSize := uintptr(t.Size())\n\twriteRows := writeRowsFuncOf(t, schema, path)\n\n\tcol, _ := schema.Lookup(path...)\n\tunit := Nanosecond.TimeUnit()\n\tlt := col.Node.Type().LogicalType()\n\tif lt != nil && lt.Timestamp != nil {\n\t\tunit = lt.Timestamp.Unit\n\t}\n\n\treturn func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {\n\t\tif rows.Len() == 0 
{\n\t\t\treturn writeRows(columns, rows, levels)\n\t\t}\n\n\t\ttimes := rows.TimeArray()\n\t\tfor i := 0; i < times.Len(); i++ {\n\t\t\tt := times.Index(i)\n\t\t\tvar val int64\n\t\t\tswitch {\n\t\t\tcase unit.Millis != nil:\n\t\t\t\tval = t.UnixMilli()\n\t\t\tcase unit.Micros != nil:\n\t\t\t\tval = t.UnixMicro()\n\t\t\tdefault:\n\t\t\t\tval = t.UnixNano()\n\t\t\t}\n\n\t\t\ta := makeArray(unsafecast.PointerOfValue(reflect.ValueOf(val)), 1, elemSize)\n\t\t\tif err := writeRows(columns, a, levels); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n}\n"
  },
  {
    "path": "column_buffer_purego.go",
    "content": "//go:build !amd64 || purego\n\npackage parquet\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\nfunc broadcastValueInt32(dst []int32, src int8) {\n\tvalue := 0x01010101 * int32(src)\n\tfor i := range dst {\n\t\tdst[i] = value\n\t}\n}\n\nfunc broadcastRangeInt32(dst []int32, base int32) {\n\tfor i := range dst {\n\t\tdst[i] = base + int32(i)\n\t}\n}\n\nfunc writePointersBE128(values [][16]byte, rows sparse.Array) {\n\tfor i := range values {\n\t\tp := *(**[16]byte)(rows.Index(i))\n\n\t\tif p != nil {\n\t\t\tvalues[i] = *p\n\t\t} else {\n\t\t\tvalues[i] = [16]byte{}\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "column_buffer_test.go",
    "content": "package parquet\n\nimport (\n\t\"testing\"\n)\n\nfunc TestBroadcastValueInt32(t *testing.T) {\n\tbuf := make([]int32, 123)\n\tbroadcastValueInt32(buf, 0x0A)\n\n\tfor i, v := range buf {\n\t\tif v != 0x0A0A0A0A {\n\t\t\tt.Fatalf(\"wrong value at index %d: %v\", i, v)\n\t\t}\n\t}\n}\n\nfunc TestBroadcastRangeInt32(t *testing.T) {\n\tbuf := make([]int32, 123)\n\tbroadcastRangeInt32(buf, 1)\n\n\tfor i, v := range buf {\n\t\tif v != int32(1+i) {\n\t\t\tt.Fatalf(\"wrong value at index %d: %v\", i, v)\n\t\t}\n\t}\n}\n\nfunc BenchmarkBroadcastValueInt32(b *testing.B) {\n\tbuf := make([]int32, 1000)\n\tfor i := 0; i < b.N; i++ {\n\t\tbroadcastValueInt32(buf, -1)\n\t}\n\tb.SetBytes(4 * int64(len(buf)))\n}\n\nfunc BenchmarkBroadcastRangeInt32(b *testing.B) {\n\tbuf := make([]int32, 1000)\n\tfor i := 0; i < b.N; i++ {\n\t\tbroadcastRangeInt32(buf, 0)\n\t}\n\tb.SetBytes(4 * int64(len(buf)))\n}\n\n// https://github.com/segmentio/parquet-go/issues/501\nfunc TestIssue501(t *testing.T) {\n\tcol := newBooleanColumnBuffer(BooleanType, 0, 2055208)\n\n\t// write all trues and then flush the buffer\n\t_, err := col.WriteBooleans([]bool{true, true, true, true, true, true, true, true})\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tcol.Reset()\n\n\t// write a single false, we are trying to trip a certain line of code in WriteBooleans\n\t_, err = col.WriteBooleans([]bool{false})\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\t// now write 7 booleans at once, this will cause WriteBooleans to attempt its \"alignment\" logic\n\t_, err = col.WriteBooleans([]bool{false, false, false, false, false, false, false})\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tfor i := 0; i < 8; i++ {\n\t\tread := make([]Value, 1)\n\t\t_, err = col.ReadValuesAt(read, int64(i))\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t}\n\t\tif read[0].Boolean() {\n\t\t\tt.Fatalf(\"expected false at index %d\", i)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "column_chunk.go",
    "content": "package parquet\n\nimport (\n\t\"io\"\n)\n\n// The ColumnChunk interface represents individual columns of a row group.\ntype ColumnChunk interface {\n\t// Returns the column type.\n\tType() Type\n\n\t// Returns the index of this column in its parent row group.\n\tColumn() int\n\n\t// Returns a reader exposing the pages of the column.\n\tPages() Pages\n\n\t// Returns the components of the page index for this column chunk,\n\t// containing details about the content and location of pages within the\n\t// chunk.\n\t//\n\t// Note that the returned value may be the same across calls to these\n\t// methods, programs must treat those as read-only.\n\t//\n\t// If the column chunk does not have a page index, the methods return nil.\n\tColumnIndex() ColumnIndex\n\tOffsetIndex() OffsetIndex\n\tBloomFilter() BloomFilter\n\n\t// Returns the number of values in the column chunk.\n\t//\n\t// This quantity may differ from the number of rows in the parent row group\n\t// because repeated columns may hold zero or more values per row.\n\tNumValues() int64\n}\n\ntype pageAndValueWriter interface {\n\tPageWriter\n\tValueWriter\n}\n\ntype readRowsFunc func(*rowGroupRows, []Row, byte) (int, error)\n\nfunc readRowsFuncOf(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) {\n\tvar read readRowsFunc\n\n\tif node.Repeated() {\n\t\trepetitionDepth++\n\t}\n\n\tif node.Leaf() {\n\t\tcolumnIndex, read = readRowsFuncOfLeaf(columnIndex, repetitionDepth)\n\t} else {\n\t\tcolumnIndex, read = readRowsFuncOfGroup(node, columnIndex, repetitionDepth)\n\t}\n\n\tif node.Repeated() {\n\t\tread = readRowsFuncOfRepeated(read, repetitionDepth)\n\t}\n\n\treturn columnIndex, read\n}\n\n//go:noinline\nfunc readRowsFuncOfRepeated(read readRowsFunc, repetitionDepth byte) readRowsFunc {\n\treturn func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {\n\t\tfor i := range rows {\n\t\t\t// Repeated columns have variable number of values, we must process\n\t\t\t// them 
one row at a time because we cannot predict how many values\n\t\t\t// need to be consumed in each iteration.\n\t\t\trow := rows[i : i+1]\n\n\t\t\t// The first pass looks for values marking the beginning of a row by\n\t\t\t// having a repetition level equal to the current level.\n\t\t\tn, err := read(r, row, repetitionLevel)\n\t\t\tif err != nil {\n\t\t\t\t// The error here may likely be io.EOF, the read function may\n\t\t\t\t// also have successfully read a row, which is indicated by a\n\t\t\t\t// non-zero count. In this case, we increment the index to\n\t\t\t\t// indicate to the caller than rows up to i+1 have been read.\n\t\t\t\tif n > 0 {\n\t\t\t\t\ti++\n\t\t\t\t}\n\t\t\t\treturn i, err\n\t\t\t}\n\n\t\t\t// The read function may return no errors and also read no rows in\n\t\t\t// case where it had more values to read but none corresponded to\n\t\t\t// the current repetition level. This is an indication that we will\n\t\t\t// not be able to read more rows at this stage, we must return to\n\t\t\t// the caller to let it set the repetition level to its current\n\t\t\t// depth, which may allow us to read more values when called again.\n\t\t\tif n == 0 {\n\t\t\t\treturn i, nil\n\t\t\t}\n\n\t\t\t// When we reach this stage, we have successfully read the first\n\t\t\t// values of a row of repeated columns. 
We continue consuming more\n\t\t\t// repeated values until we get the indication that we consumed\n\t\t\t// them all (the read function returns zero and no errors).\n\t\t\tfor {\n\t\t\t\tn, err := read(r, row, repetitionDepth)\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn i + 1, err\n\t\t\t\t}\n\t\t\t\tif n == 0 {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn len(rows), nil\n\t}\n}\n\n//go:noinline\nfunc readRowsFuncOfGroup(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) {\n\tfields := node.Fields()\n\n\tif len(fields) == 0 {\n\t\treturn columnIndex, func(*rowGroupRows, []Row, byte) (int, error) {\n\t\t\treturn 0, io.EOF\n\t\t}\n\t}\n\n\tif len(fields) == 1 {\n\t\t// Small optimization for a somewhat common case of groups with a single\n\t\t// column (like nested list elements for example); there is no need to\n\t\t// loop over the group of a single element, we can simply skip to calling\n\t\t// the inner read function.\n\t\treturn readRowsFuncOf(fields[0], columnIndex, repetitionDepth)\n\t}\n\n\tgroup := make([]readRowsFunc, len(fields))\n\tfor i := range group {\n\t\tcolumnIndex, group[i] = readRowsFuncOf(fields[i], columnIndex, repetitionDepth)\n\t}\n\n\treturn columnIndex, func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {\n\t\t// When reading a group, we use the first column as an indicator of how\n\t\t// may rows can be read during this call.\n\t\tn, err := group[0](r, rows, repetitionLevel)\n\n\t\tif n > 0 {\n\t\t\t// Read values for all rows that the group is able to consume.\n\t\t\t// Getting io.EOF from calling the read functions indicate that\n\t\t\t// we consumed all values of that particular column, but there may\n\t\t\t// be more to read in other columns, therefore we must always read\n\t\t\t// all columns and cannot stop on the first error.\n\t\t\tfor _, read := range group[1:] {\n\t\t\t\t_, err2 := read(r, rows[:n], repetitionLevel)\n\t\t\t\tif err2 != nil && err2 != io.EOF {\n\t\t\t\t\treturn 0, 
err2\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn n, err\n\t}\n}\n\n//go:noinline\nfunc readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFunc) {\n\tvar read readRowsFunc\n\n\tif repetitionDepth == 0 {\n\t\tread = func(r *rowGroupRows, rows []Row, _ byte) (int, error) {\n\t\t\t// When the repetition depth is zero, we know that there is exactly\n\t\t\t// one value per row for this column, and therefore we can consume\n\t\t\t// as many values as there are rows to fill.\n\t\t\tcol := &r.columns[columnIndex]\n\t\t\tbuf := r.buffer(columnIndex)\n\n\t\t\tfor i := range rows {\n\t\t\t\tif col.offset == col.length {\n\t\t\t\t\tn, err := col.values.ReadValues(buf)\n\t\t\t\t\tcol.offset = 0\n\t\t\t\t\tcol.length = int32(n)\n\t\t\t\t\tif n == 0 && err != nil {\n\t\t\t\t\t\treturn 0, err\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\trows[i] = append(rows[i], buf[col.offset])\n\t\t\t\tcol.offset++\n\t\t\t}\n\n\t\t\treturn len(rows), nil\n\t\t}\n\t} else {\n\t\tread = func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {\n\t\t\t// When the repetition depth is not zero, we know that we will be\n\t\t\t// called with a single row as input. We attempt to read at most one\n\t\t\t// value of a single row and return to the caller.\n\t\t\tcol := &r.columns[columnIndex]\n\t\t\tbuf := r.buffer(columnIndex)\n\n\t\t\tif col.offset == col.length {\n\t\t\t\tn, err := col.values.ReadValues(buf)\n\t\t\t\tcol.offset = 0\n\t\t\t\tcol.length = int32(n)\n\t\t\t\tif n == 0 && err != nil {\n\t\t\t\t\treturn 0, err\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif buf[col.offset].repetitionLevel != repetitionLevel {\n\t\t\t\treturn 0, nil\n\t\t\t}\n\n\t\t\trows[0] = append(rows[0], buf[col.offset])\n\t\t\tcol.offset++\n\t\t\treturn 1, nil\n\t\t}\n\t}\n\n\treturn columnIndex + 1, read\n}\n"
  },
  {
    "path": "column_index.go",
    "content": "package parquet\n\nimport (\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\ntype ColumnIndex interface {\n\t// NumPages returns the number of paged in the column index.\n\tNumPages() int\n\n\t// Returns the number of null values in the page at the given index.\n\tNullCount(int) int64\n\n\t// Tells whether the page at the given index contains null values only.\n\tNullPage(int) bool\n\n\t// PageIndex return min/max bounds for the page at the given index in the\n\t// column.\n\tMinValue(int) Value\n\tMaxValue(int) Value\n\n\t// IsAscending returns true if the column index min/max values are sorted\n\t// in ascending order (based on the ordering rules of the column's logical\n\t// type).\n\tIsAscending() bool\n\n\t// IsDescending returns true if the column index min/max values are sorted\n\t// in descending order (based on the ordering rules of the column's logical\n\t// type).\n\tIsDescending() bool\n}\n\n// NewColumnIndex constructs a ColumnIndex instance from the given parquet\n// format column index. 
The kind argument configures the type of values\nfunc NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex {\n\treturn &formatColumnIndex{\n\t\tkind:  kind,\n\t\tindex: index,\n\t}\n}\n\ntype formatColumnIndex struct {\n\tkind  Kind\n\tindex *format.ColumnIndex\n}\n\nfunc (f *formatColumnIndex) NumPages() int {\n\treturn len(f.index.MinValues)\n}\n\nfunc (f *formatColumnIndex) NullCount(i int) int64 {\n\tif len(f.index.NullCounts) > 0 {\n\t\treturn f.index.NullCounts[i]\n\t}\n\treturn 0\n}\n\nfunc (f *formatColumnIndex) NullPage(i int) bool {\n\treturn len(f.index.NullPages) > 0 && f.index.NullPages[i]\n}\n\nfunc (f *formatColumnIndex) MinValue(i int) Value {\n\tif f.NullPage(i) {\n\t\treturn Value{}\n\t}\n\treturn f.kind.Value(f.index.MinValues[i])\n}\n\nfunc (f *formatColumnIndex) MaxValue(i int) Value {\n\tif f.NullPage(i) {\n\t\treturn Value{}\n\t}\n\treturn f.kind.Value(f.index.MaxValues[i])\n}\n\nfunc (f *formatColumnIndex) IsAscending() bool {\n\treturn f.index.BoundaryOrder == format.Ascending\n}\n\nfunc (f *formatColumnIndex) IsDescending() bool {\n\treturn f.index.BoundaryOrder == format.Descending\n}\n\ntype fileColumnIndex struct{ chunk *fileColumnChunk }\n\nfunc (i fileColumnIndex) NumPages() int {\n\treturn len(i.chunk.columnIndex.NullPages)\n}\n\nfunc (i fileColumnIndex) NullCount(j int) int64 {\n\tif len(i.chunk.columnIndex.NullCounts) > 0 {\n\t\treturn i.chunk.columnIndex.NullCounts[j]\n\t}\n\treturn 0\n}\n\nfunc (i fileColumnIndex) NullPage(j int) bool {\n\treturn len(i.chunk.columnIndex.NullPages) > 0 && i.chunk.columnIndex.NullPages[j]\n}\n\nfunc (i fileColumnIndex) MinValue(j int) Value {\n\tif i.NullPage(j) {\n\t\treturn Value{}\n\t}\n\treturn i.makeValue(i.chunk.columnIndex.MinValues[j])\n}\n\nfunc (i fileColumnIndex) MaxValue(j int) Value {\n\tif i.NullPage(j) {\n\t\treturn Value{}\n\t}\n\treturn i.makeValue(i.chunk.columnIndex.MaxValues[j])\n}\n\nfunc (i fileColumnIndex) IsAscending() bool {\n\treturn 
i.chunk.columnIndex.BoundaryOrder == format.Ascending\n}\n\nfunc (i fileColumnIndex) IsDescending() bool {\n\treturn i.chunk.columnIndex.BoundaryOrder == format.Descending\n}\n\nfunc (i *fileColumnIndex) makeValue(b []byte) Value {\n\treturn i.chunk.column.typ.Kind().Value(b)\n}\n\ntype emptyColumnIndex struct{}\n\nfunc (emptyColumnIndex) NumPages() int       { return 0 }\nfunc (emptyColumnIndex) NullCount(int) int64 { return 0 }\nfunc (emptyColumnIndex) NullPage(int) bool   { return false }\nfunc (emptyColumnIndex) MinValue(int) Value  { return Value{} }\nfunc (emptyColumnIndex) MaxValue(int) Value  { return Value{} }\nfunc (emptyColumnIndex) IsAscending() bool   { return false }\nfunc (emptyColumnIndex) IsDescending() bool  { return false }\n\ntype booleanColumnIndex struct{ page *booleanPage }\n\nfunc (i booleanColumnIndex) NumPages() int       { return 1 }\nfunc (i booleanColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i booleanColumnIndex) NullPage(int) bool   { return false }\nfunc (i booleanColumnIndex) MinValue(int) Value  { return makeValueBoolean(i.page.min()) }\nfunc (i booleanColumnIndex) MaxValue(int) Value  { return makeValueBoolean(i.page.max()) }\nfunc (i booleanColumnIndex) IsAscending() bool   { return false }\nfunc (i booleanColumnIndex) IsDescending() bool  { return false }\n\ntype int32ColumnIndex struct{ page *int32Page }\n\nfunc (i int32ColumnIndex) NumPages() int       { return 1 }\nfunc (i int32ColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i int32ColumnIndex) NullPage(int) bool   { return false }\nfunc (i int32ColumnIndex) MinValue(int) Value  { return makeValueInt32(i.page.min()) }\nfunc (i int32ColumnIndex) MaxValue(int) Value  { return makeValueInt32(i.page.max()) }\nfunc (i int32ColumnIndex) IsAscending() bool   { return false }\nfunc (i int32ColumnIndex) IsDescending() bool  { return false }\n\ntype int64ColumnIndex struct{ page *int64Page }\n\nfunc (i int64ColumnIndex) NumPages() int       { return 1 }\nfunc (i 
int64ColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i int64ColumnIndex) NullPage(int) bool   { return false }\nfunc (i int64ColumnIndex) MinValue(int) Value  { return makeValueInt64(i.page.min()) }\nfunc (i int64ColumnIndex) MaxValue(int) Value  { return makeValueInt64(i.page.max()) }\nfunc (i int64ColumnIndex) IsAscending() bool   { return false }\nfunc (i int64ColumnIndex) IsDescending() bool  { return false }\n\ntype int96ColumnIndex struct{ page *int96Page }\n\nfunc (i int96ColumnIndex) NumPages() int       { return 1 }\nfunc (i int96ColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i int96ColumnIndex) NullPage(int) bool   { return false }\nfunc (i int96ColumnIndex) MinValue(int) Value  { return makeValueInt96(i.page.min()) }\nfunc (i int96ColumnIndex) MaxValue(int) Value  { return makeValueInt96(i.page.max()) }\nfunc (i int96ColumnIndex) IsAscending() bool   { return false }\nfunc (i int96ColumnIndex) IsDescending() bool  { return false }\n\ntype floatColumnIndex struct{ page *floatPage }\n\nfunc (i floatColumnIndex) NumPages() int       { return 1 }\nfunc (i floatColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i floatColumnIndex) NullPage(int) bool   { return false }\nfunc (i floatColumnIndex) MinValue(int) Value  { return makeValueFloat(i.page.min()) }\nfunc (i floatColumnIndex) MaxValue(int) Value  { return makeValueFloat(i.page.max()) }\nfunc (i floatColumnIndex) IsAscending() bool   { return false }\nfunc (i floatColumnIndex) IsDescending() bool  { return false }\n\ntype doubleColumnIndex struct{ page *doublePage }\n\nfunc (i doubleColumnIndex) NumPages() int       { return 1 }\nfunc (i doubleColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i doubleColumnIndex) NullPage(int) bool   { return false }\nfunc (i doubleColumnIndex) MinValue(int) Value  { return makeValueDouble(i.page.min()) }\nfunc (i doubleColumnIndex) MaxValue(int) Value  { return makeValueDouble(i.page.max()) }\nfunc (i doubleColumnIndex) IsAscending() bool   { return 
false }\nfunc (i doubleColumnIndex) IsDescending() bool  { return false }\n\ntype byteArrayColumnIndex struct{ page *byteArrayPage }\n\nfunc (i byteArrayColumnIndex) NumPages() int       { return 1 }\nfunc (i byteArrayColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i byteArrayColumnIndex) NullPage(int) bool   { return false }\nfunc (i byteArrayColumnIndex) MinValue(int) Value  { return makeValueBytes(ByteArray, i.page.min()) }\nfunc (i byteArrayColumnIndex) MaxValue(int) Value  { return makeValueBytes(ByteArray, i.page.max()) }\nfunc (i byteArrayColumnIndex) IsAscending() bool   { return false }\nfunc (i byteArrayColumnIndex) IsDescending() bool  { return false }\n\ntype fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage }\n\nfunc (i fixedLenByteArrayColumnIndex) NumPages() int       { return 1 }\nfunc (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i fixedLenByteArrayColumnIndex) NullPage(int) bool   { return false }\nfunc (i fixedLenByteArrayColumnIndex) MinValue(int) Value {\n\treturn makeValueBytes(FixedLenByteArray, i.page.min())\n}\nfunc (i fixedLenByteArrayColumnIndex) MaxValue(int) Value {\n\treturn makeValueBytes(FixedLenByteArray, i.page.max())\n}\nfunc (i fixedLenByteArrayColumnIndex) IsAscending() bool  { return false }\nfunc (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false }\n\ntype uint32ColumnIndex struct{ page *uint32Page }\n\nfunc (i uint32ColumnIndex) NumPages() int       { return 1 }\nfunc (i uint32ColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i uint32ColumnIndex) NullPage(int) bool   { return false }\nfunc (i uint32ColumnIndex) MinValue(int) Value  { return makeValueUint32(i.page.min()) }\nfunc (i uint32ColumnIndex) MaxValue(int) Value  { return makeValueUint32(i.page.max()) }\nfunc (i uint32ColumnIndex) IsAscending() bool   { return false }\nfunc (i uint32ColumnIndex) IsDescending() bool  { return false }\n\ntype uint64ColumnIndex struct{ page *uint64Page }\n\nfunc (i 
uint64ColumnIndex) NumPages() int       { return 1 }\nfunc (i uint64ColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i uint64ColumnIndex) NullPage(int) bool   { return false }\nfunc (i uint64ColumnIndex) MinValue(int) Value  { return makeValueUint64(i.page.min()) }\nfunc (i uint64ColumnIndex) MaxValue(int) Value  { return makeValueUint64(i.page.max()) }\nfunc (i uint64ColumnIndex) IsAscending() bool   { return false }\nfunc (i uint64ColumnIndex) IsDescending() bool  { return false }\n\ntype be128ColumnIndex struct{ page *be128Page }\n\nfunc (i be128ColumnIndex) NumPages() int       { return 1 }\nfunc (i be128ColumnIndex) NullCount(int) int64 { return 0 }\nfunc (i be128ColumnIndex) NullPage(int) bool   { return false }\nfunc (i be128ColumnIndex) MinValue(int) Value  { return makeValueBytes(FixedLenByteArray, i.page.min()) }\nfunc (i be128ColumnIndex) MaxValue(int) Value  { return makeValueBytes(FixedLenByteArray, i.page.max()) }\nfunc (i be128ColumnIndex) IsAscending() bool   { return false }\nfunc (i be128ColumnIndex) IsDescending() bool  { return false }\n\n// The ColumnIndexer interface is implemented by types that support generating\n// parquet column indexes.\n//\n// The package does not export any types that implement this interface, programs\n// must call NewColumnIndexer on a Type instance to construct column indexers.\ntype ColumnIndexer interface {\n\t// Resets the column indexer state.\n\tReset()\n\n\t// Add a page to the column indexer.\n\tIndexPage(numValues, numNulls int64, min, max Value)\n\n\t// Generates a format.ColumnIndex value from the current state of the\n\t// column indexer.\n\t//\n\t// The returned value may reference internal buffers, in which case the\n\t// values remain valid until the next call to IndexPage or Reset on the\n\t// column indexer.\n\tColumnIndex() format.ColumnIndex\n}\n\ntype baseColumnIndexer struct {\n\tnullPages  []bool\n\tnullCounts []int64\n}\n\nfunc (i *baseColumnIndexer) reset() {\n\ti.nullPages = 
i.nullPages[:0]\n\ti.nullCounts = i.nullCounts[:0]\n}\n\nfunc (i *baseColumnIndexer) observe(numValues, numNulls int64) {\n\ti.nullPages = append(i.nullPages, numValues == numNulls)\n\ti.nullCounts = append(i.nullCounts, numNulls)\n}\n\nfunc (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex {\n\treturn format.ColumnIndex{\n\t\tNullPages:     i.nullPages,\n\t\tNullCounts:    i.nullCounts,\n\t\tMinValues:     minValues,\n\t\tMaxValues:     maxValues,\n\t\tBoundaryOrder: boundaryOrderOf(minOrder, maxOrder),\n\t}\n}\n\ntype booleanColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []bool\n\tmaxValues []bool\n}\n\nfunc newBooleanColumnIndexer() *booleanColumnIndexer {\n\treturn new(booleanColumnIndexer)\n}\n\nfunc (i *booleanColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.boolean())\n\ti.maxValues = append(i.maxValues, max.boolean())\n}\n\nfunc (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.BoolToBytes(i.minValues), 1),\n\t\tsplitFixedLenByteArrays(unsafecast.BoolToBytes(i.maxValues), 1),\n\t\torderOfBool(i.minValues),\n\t\torderOfBool(i.maxValues),\n\t)\n}\n\ntype int32ColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []int32\n\tmaxValues []int32\n}\n\nfunc newInt32ColumnIndexer() *int32ColumnIndexer {\n\treturn new(int32ColumnIndexer)\n}\n\nfunc (i *int32ColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.int32())\n\ti.maxValues = append(i.maxValues, max.int32())\n}\n\nfunc (i 
*int32ColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.Int32ToBytes(i.minValues), 4),\n\t\tsplitFixedLenByteArrays(unsafecast.Int32ToBytes(i.maxValues), 4),\n\t\torderOfInt32(i.minValues),\n\t\torderOfInt32(i.maxValues),\n\t)\n}\n\ntype int64ColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []int64\n\tmaxValues []int64\n}\n\nfunc newInt64ColumnIndexer() *int64ColumnIndexer {\n\treturn new(int64ColumnIndexer)\n}\n\nfunc (i *int64ColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.int64())\n\ti.maxValues = append(i.maxValues, max.int64())\n}\n\nfunc (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.Int64ToBytes(i.minValues), 8),\n\t\tsplitFixedLenByteArrays(unsafecast.Int64ToBytes(i.maxValues), 8),\n\t\torderOfInt64(i.minValues),\n\t\torderOfInt64(i.maxValues),\n\t)\n}\n\ntype int96ColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []deprecated.Int96\n\tmaxValues []deprecated.Int96\n}\n\nfunc newInt96ColumnIndexer() *int96ColumnIndexer {\n\treturn new(int96ColumnIndexer)\n}\n\nfunc (i *int96ColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.Int96())\n\ti.maxValues = append(i.maxValues, max.Int96())\n}\n\nfunc (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(deprecated.Int96ToBytes(i.minValues), 12),\n\t\tsplitFixedLenByteArrays(deprecated.Int96ToBytes(i.maxValues), 
12),\n\t\tdeprecated.OrderOfInt96(i.minValues),\n\t\tdeprecated.OrderOfInt96(i.maxValues),\n\t)\n}\n\ntype floatColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []float32\n\tmaxValues []float32\n}\n\nfunc newFloatColumnIndexer() *floatColumnIndexer {\n\treturn new(floatColumnIndexer)\n}\n\nfunc (i *floatColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.float())\n\ti.maxValues = append(i.maxValues, max.float())\n}\n\nfunc (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.Float32ToBytes(i.minValues), 4),\n\t\tsplitFixedLenByteArrays(unsafecast.Float32ToBytes(i.maxValues), 4),\n\t\torderOfFloat32(i.minValues),\n\t\torderOfFloat32(i.maxValues),\n\t)\n}\n\ntype doubleColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []float64\n\tmaxValues []float64\n}\n\nfunc newDoubleColumnIndexer() *doubleColumnIndexer {\n\treturn new(doubleColumnIndexer)\n}\n\nfunc (i *doubleColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.double())\n\ti.maxValues = append(i.maxValues, max.double())\n}\n\nfunc (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.Float64ToBytes(i.minValues), 8),\n\t\tsplitFixedLenByteArrays(unsafecast.Float64ToBytes(i.maxValues), 8),\n\t\torderOfFloat64(i.minValues),\n\t\torderOfFloat64(i.maxValues),\n\t)\n}\n\ntype byteArrayColumnIndexer struct {\n\tbaseColumnIndexer\n\tsizeLimit int\n\tminValues []byte\n\tmaxValues []byte\n}\n\nfunc newByteArrayColumnIndexer(sizeLimit int) 
*byteArrayColumnIndexer {\n\treturn &byteArrayColumnIndexer{sizeLimit: sizeLimit}\n}\n\nfunc (i *byteArrayColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = plain.AppendByteArray(i.minValues, min.byteArray())\n\ti.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray())\n}\n\nfunc (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex {\n\tminValues := splitByteArrays(i.minValues)\n\tmaxValues := splitByteArrays(i.maxValues)\n\tif sizeLimit := i.sizeLimit; sizeLimit > 0 {\n\t\tfor i, v := range minValues {\n\t\t\tminValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit)\n\t\t}\n\t\tfor i, v := range maxValues {\n\t\t\tmaxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit)\n\t\t}\n\t}\n\treturn i.columnIndex(\n\t\tminValues,\n\t\tmaxValues,\n\t\torderOfBytes(minValues),\n\t\torderOfBytes(maxValues),\n\t)\n}\n\ntype fixedLenByteArrayColumnIndexer struct {\n\tbaseColumnIndexer\n\tsize      int\n\tsizeLimit int\n\tminValues []byte\n\tmaxValues []byte\n}\n\nfunc newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer {\n\treturn &fixedLenByteArrayColumnIndexer{\n\t\tsize:      size,\n\t\tsizeLimit: sizeLimit,\n\t}\n}\n\nfunc (i *fixedLenByteArrayColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.byteArray()...)\n\ti.maxValues = append(i.maxValues, max.byteArray()...)\n}\n\nfunc (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex {\n\tminValues := splitFixedLenByteArrays(i.minValues, i.size)\n\tmaxValues := splitFixedLenByteArrays(i.maxValues, i.size)\n\tif sizeLimit := i.sizeLimit; 
sizeLimit > 0 {\n\t\tfor i, v := range minValues {\n\t\t\tminValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit)\n\t\t}\n\t\tfor i, v := range maxValues {\n\t\t\tmaxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit)\n\t\t}\n\t}\n\treturn i.columnIndex(\n\t\tminValues,\n\t\tmaxValues,\n\t\torderOfBytes(minValues),\n\t\torderOfBytes(maxValues),\n\t)\n}\n\ntype uint32ColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []uint32\n\tmaxValues []uint32\n}\n\nfunc newUint32ColumnIndexer() *uint32ColumnIndexer {\n\treturn new(uint32ColumnIndexer)\n}\n\nfunc (i *uint32ColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.uint32())\n\ti.maxValues = append(i.maxValues, max.uint32())\n}\n\nfunc (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.minValues), 4),\n\t\tsplitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.maxValues), 4),\n\t\torderOfUint32(i.minValues),\n\t\torderOfUint32(i.maxValues),\n\t)\n}\n\ntype uint64ColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues []uint64\n\tmaxValues []uint64\n}\n\nfunc newUint64ColumnIndexer() *uint64ColumnIndexer {\n\treturn new(uint64ColumnIndexer)\n}\n\nfunc (i *uint64ColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\ti.minValues = append(i.minValues, min.uint64())\n\ti.maxValues = append(i.maxValues, max.uint64())\n}\n\nfunc (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex {\n\treturn i.columnIndex(\n\t\tsplitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.minValues), 
8),\n\t\tsplitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.maxValues), 8),\n\t\torderOfUint64(i.minValues),\n\t\torderOfUint64(i.maxValues),\n\t)\n}\n\ntype be128ColumnIndexer struct {\n\tbaseColumnIndexer\n\tminValues [][16]byte\n\tmaxValues [][16]byte\n}\n\nfunc newBE128ColumnIndexer() *be128ColumnIndexer {\n\treturn new(be128ColumnIndexer)\n}\n\nfunc (i *be128ColumnIndexer) Reset() {\n\ti.reset()\n\ti.minValues = i.minValues[:0]\n\ti.maxValues = i.maxValues[:0]\n}\n\nfunc (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {\n\ti.observe(numValues, numNulls)\n\tif !min.IsNull() {\n\t\ti.minValues = append(i.minValues, *(*[16]byte)(min.byteArray()))\n\t}\n\tif !max.IsNull() {\n\t\ti.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray()))\n\t}\n}\n\nfunc (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex {\n\tminValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.minValues), 16)\n\tmaxValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.maxValues), 16)\n\treturn i.columnIndex(\n\t\tminValues,\n\t\tmaxValues,\n\t\torderOfBytes(minValues),\n\t\torderOfBytes(maxValues),\n\t)\n}\n\nfunc truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte {\n\tif len(value) > sizeLimit {\n\t\tvalue = value[:sizeLimit]\n\t}\n\treturn value\n}\n\n// truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit.\n// If the given byte array is truncated, it is incremented by 1 in place.\nfunc truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte {\n\tif len(value) > sizeLimit {\n\t\tvalue = value[:sizeLimit]\n\t\tincrementByteArrayInplace(value)\n\t}\n\treturn value\n}\n\n// incrementByteArray increments the given byte array by 1.\n// Reference: https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124\nfunc incrementByteArrayInplace(value []byte) {\n\tfor i := len(value) - 1; i >= 0; i-- 
{\n\t\tvalue[i]++\n\t\tif value[i] != 0 { // Did not overflow: 0xFF -> 0x00\n\t\t\treturn\n\t\t}\n\t}\n\t// Fully overflowed, so restore all to 0xFF\n\tfor i := range value {\n\t\tvalue[i] = 0xFF\n\t}\n}\n\nfunc splitByteArrays(data []byte) [][]byte {\n\tlength := 0\n\tplain.RangeByteArray(data, func([]byte) error {\n\t\tlength++\n\t\treturn nil\n\t})\n\tbuffer := make([]byte, 0, len(data)-(4*length))\n\tvalues := make([][]byte, 0, length)\n\tplain.RangeByteArray(data, func(value []byte) error {\n\t\toffset := len(buffer)\n\t\tbuffer = append(buffer, value...)\n\t\tvalues = append(values, buffer[offset:])\n\t\treturn nil\n\t})\n\treturn values\n}\n\nfunc splitFixedLenByteArrays(data []byte, size int) [][]byte {\n\tdata = copyBytes(data)\n\tvalues := make([][]byte, len(data)/size)\n\tfor i := range values {\n\t\tj := (i + 0) * size\n\t\tk := (i + 1) * size\n\t\tvalues[i] = data[j:k:k]\n\t}\n\treturn values\n}\n\nfunc boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder {\n\tif minOrder == maxOrder {\n\t\tswitch {\n\t\tcase minOrder > 0:\n\t\t\treturn format.Ascending\n\t\tcase minOrder < 0:\n\t\t\treturn format.Descending\n\t\t}\n\t}\n\treturn format.Unordered\n}\n"
  },
  {
    "path": "column_index_internal_test.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"testing\"\n)\n\nfunc TestIncrementByteArrayInplace(t *testing.T) {\n\ttestCases := [][]byte{\n\t\t{0x00, 0x01, 0x02, 0x03}, {0x00, 0x01, 0x02, 0x04},\n\t\t{0x00, 0x01, 0x02, 0xFF}, {0x00, 0x01, 0x03, 0x00},\n\t\t{0x00, 0x01, 0xFF, 0xFF}, {0x00, 0x02, 0x00, 0x00},\n\t\t{0xFF, 0xFF, 0xFF, 0xFF}, {0xFF, 0xFF, 0xFF, 0xFF},\n\t}\n\n\tfor i := 0; i < len(testCases); i += 2 {\n\t\tinput := testCases[i]\n\t\texpected := testCases[i+1]\n\t\tactual := copyBytes(input)\n\t\tincrementByteArrayInplace(actual)\n\t\tif !bytes.Equal(actual, expected) {\n\t\t\tt.Errorf(\"incrementByteArrayInplace(%v) = %v, want %v\", input, actual, expected)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "column_index_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestBinaryColumnIndexMinMax(t *testing.T) {\n\ttestCases := [][]interface{}{\n\t\t// kind, type, page min, page max, size limit, [value to search, expected result]...\n\t\t{parquet.ByteArray, parquet.ByteArrayType,\n\t\t\t[]byte{0, 0, 0, 0, 0, 0}, []byte{1, 2, 3, 4, 5, 6}, 4,\n\t\t\t[]byte{0, 0, 0, 0, 0, 0}, true,\n\t\t\t[]byte{0, 1, 2, 3, 4, 5}, true,\n\t\t\t[]byte{1, 2, 3, 4}, true,\n\t\t\t[]byte{1, 2, 3, 4, 5, 6}, true, // the page max value should be a hit\n\t\t\t[]byte{1, 2, 3, 4, 5, 7}, true, // false positive due to size limit\n\t\t\t[]byte{1, 2, 3, 5}, true, // false positive due to size limit\n\t\t\t[]byte{1, 2, 3, 5, 6, 7}, false, // should be no hit since it definitely exceeds page max\n\t\t\t[]byte{2, 3, 4, 5}, false, // should be no hit since it definitely exceeds page max\n\t\t},\n\t\t{parquet.FixedLenByteArray, parquet.FixedLenByteArrayType(6),\n\t\t\t[]byte{0, 0, 0, 0, 0, 0}, []byte{1, 2, 3, 4, 5, 6}, 4,\n\t\t\t[]byte{0, 0, 0, 0, 0, 0}, true,\n\t\t\t[]byte{0, 1, 2, 3, 4, 5}, true,\n\t\t\t[]byte{1, 2, 3, 4, 0, 0}, true,\n\t\t\t[]byte{1, 2, 3, 4, 5, 6}, true, // the page max value should be a hit\n\t\t\t[]byte{1, 2, 3, 4, 5, 7}, true, // false positive due to size limit\n\t\t\t[]byte{1, 2, 3, 4, 0xFF, 0xFF}, true, // false positive due to size limit\n\t\t\t[]byte{1, 2, 3, 5, 0, 0}, false, // should be no hit since it definitely exceeds page max\n\t\t\t[]byte{1, 2, 3, 5, 6, 7}, false, // should be no hit since it definitely exceeds page max\n\t\t\t[]byte{2, 3, 4, 5, 0, 0}, false, // should be no hit since it definitely exceeds page max\n\t\t},\n\t}\n\tfor _, testCase := range testCases {\n\t\tkind := testCase[0].(parquet.Kind)\n\t\ttyp := testCase[1].(parquet.Type)\n\t\tmin := testCase[2].([]byte)\n\t\tmax := testCase[3].([]byte)\n\t\tsizeLimit := testCase[4].(int)\n\t\tindexer := 
typ.NewColumnIndexer(sizeLimit)\n\t\tindexer.IndexPage(100, 0,\n\t\t\tparquet.ValueOf(min),\n\t\t\tparquet.ValueOf(max),\n\t\t)\n\t\tformatIndex := indexer.ColumnIndex()\n\t\tcolumnIndex := parquet.NewColumnIndex(kind, &formatIndex)\n\t\tfor i := 5; i < len(testCase); i += 2 {\n\t\t\tvalue := testCase[i].([]byte)\n\t\t\texpected := testCase[i+1].(bool)\n\n\t\t\tv := parquet.ValueOf(value)\n\t\t\tactual := parquet.Search(columnIndex, v, typ) == 0\n\t\t\tif actual != expected {\n\t\t\t\tt.Errorf(\"checkByteArrayMinMax(%v, %v, %v, %v) = %v, want %v\", min, max, value, sizeLimit, actual, expected)\n\t\t\t}\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "column_mapping.go",
    "content": "package parquet\n\n// LeafColumn is a struct type representing leaf columns of a parquet schema.\ntype LeafColumn struct {\n\tNode               Node\n\tPath               []string\n\tColumnIndex        int\n\tMaxRepetitionLevel int\n\tMaxDefinitionLevel int\n}\n\nfunc columnMappingOf(schema Node) (mapping columnMappingGroup, columns [][]string) {\n\tmapping = make(columnMappingGroup)\n\tcolumns = make([][]string, 0, 16)\n\n\tforEachLeafColumnOf(schema, func(leaf leafColumn) {\n\t\tpath := make(columnPath, len(leaf.path))\n\t\tcopy(path, leaf.path)\n\t\tcolumns = append(columns, path)\n\n\t\tgroup := mapping\n\t\tfor len(path) > 1 {\n\t\t\tcolumnName := path[0]\n\t\t\tg, ok := group[columnName].(columnMappingGroup)\n\t\t\tif !ok {\n\t\t\t\tg = make(columnMappingGroup)\n\t\t\t\tgroup[columnName] = g\n\t\t\t}\n\t\t\tgroup, path = g, path[1:]\n\t\t}\n\n\t\tleaf.path = path // use the copy\n\t\tgroup[path[0]] = &columnMappingLeaf{column: leaf}\n\t})\n\n\treturn mapping, columns\n}\n\ntype columnMapping interface {\n\tlookup(path columnPath) leafColumn\n}\n\ntype columnMappingGroup map[string]columnMapping\n\nfunc (group columnMappingGroup) lookup(path columnPath) leafColumn {\n\tif len(path) > 0 {\n\t\tc, ok := group[path[0]]\n\t\tif ok {\n\t\t\treturn c.lookup(path[1:])\n\t\t}\n\t}\n\treturn leafColumn{columnIndex: -1}\n}\n\nfunc (group columnMappingGroup) lookupClosest(path columnPath) leafColumn {\n\tfor len(path) > 0 {\n\t\tg, ok := group[path[0]].(columnMappingGroup)\n\t\tif ok {\n\t\t\tgroup, path = g, path[1:]\n\t\t} else {\n\t\t\tfirstName := \"\"\n\t\t\tfirstLeaf := (*columnMappingLeaf)(nil)\n\t\t\tfor name, child := range group {\n\t\t\t\tif leaf, ok := child.(*columnMappingLeaf); ok {\n\t\t\t\t\tif firstLeaf == nil || name < firstName {\n\t\t\t\t\t\tfirstName, firstLeaf = name, leaf\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\tif firstLeaf != nil {\n\t\t\t\treturn firstLeaf.column\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\t}\n\treturn leafColumn{columnIndex: 
-1}\n}\n\ntype columnMappingLeaf struct {\n\tcolumn leafColumn\n}\n\nfunc (leaf *columnMappingLeaf) lookup(path columnPath) leafColumn {\n\tif len(path) == 0 {\n\t\treturn leaf.column\n\t}\n\treturn leafColumn{columnIndex: -1}\n}\n"
  },
  {
    "path": "column_mapping_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"fmt\"\n\t\"strings\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc ExampleSchema_Lookup() {\n\tschema := parquet.SchemaOf(struct {\n\t\tFirstName  string `parquet:\"first_name\"`\n\t\tLastName   string `parquet:\"last_name\"`\n\t\tAttributes []struct {\n\t\t\tName  string `parquet:\"name\"`\n\t\t\tValue string `parquet:\"value\"`\n\t\t} `parquet:\"attributes\"`\n\t}{})\n\n\tfor _, path := range schema.Columns() {\n\t\tleaf, _ := schema.Lookup(path...)\n\t\tfmt.Printf(\"%d => %q\\n\", leaf.ColumnIndex, strings.Join(path, \".\"))\n\t}\n\n\t// Output:\n\t// 0 => \"first_name\"\n\t// 1 => \"last_name\"\n\t// 2 => \"attributes.name\"\n\t// 3 => \"attributes.value\"\n}\n"
  },
  {
    "path": "column_path.go",
    "content": "package parquet\n\nimport (\n\t\"strings\"\n)\n\ntype columnPath []string\n\nfunc (path columnPath) append(names ...string) columnPath {\n\treturn append(path[:len(path):len(path)], names...)\n}\n\nfunc (path columnPath) equal(other columnPath) bool {\n\treturn stringsAreEqual(path, other)\n}\n\nfunc (path columnPath) less(other columnPath) bool {\n\treturn stringsAreOrdered(path, other)\n}\n\nfunc (path columnPath) String() string {\n\treturn strings.Join(path, \".\")\n}\n\nfunc stringsAreEqual(strings1, strings2 []string) bool {\n\tif len(strings1) != len(strings2) {\n\t\treturn false\n\t}\n\n\tfor i := range strings1 {\n\t\tif strings1[i] != strings2[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\n\treturn true\n}\n\nfunc stringsAreOrdered(strings1, strings2 []string) bool {\n\tn := len(strings1)\n\n\tif n > len(strings2) {\n\t\tn = len(strings2)\n\t}\n\n\tfor i := 0; i < n; i++ {\n\t\tif strings1[i] >= strings2[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\n\treturn len(strings1) <= len(strings2)\n}\n\ntype leafColumn struct {\n\tnode               Node\n\tpath               columnPath\n\tmaxRepetitionLevel byte\n\tmaxDefinitionLevel byte\n\tcolumnIndex        int16\n}\n\nfunc forEachLeafColumnOf(node Node, do func(leafColumn)) {\n\tforEachLeafColumn(node, nil, 0, 0, 0, do)\n}\n\nfunc forEachLeafColumn(node Node, path columnPath, columnIndex, maxRepetitionLevel, maxDefinitionLevel int, do func(leafColumn)) int {\n\tswitch {\n\tcase node.Optional():\n\t\tmaxDefinitionLevel++\n\tcase node.Repeated():\n\t\tmaxRepetitionLevel++\n\t\tmaxDefinitionLevel++\n\t}\n\n\tif node.Leaf() {\n\t\tdo(leafColumn{\n\t\t\tnode:               node,\n\t\t\tpath:               path,\n\t\t\tmaxRepetitionLevel: makeRepetitionLevel(maxRepetitionLevel),\n\t\t\tmaxDefinitionLevel: makeDefinitionLevel(maxDefinitionLevel),\n\t\t\tcolumnIndex:        makeColumnIndex(columnIndex),\n\t\t})\n\t\treturn columnIndex + 1\n\t}\n\n\tfor _, field := range node.Fields() {\n\t\tcolumnIndex = 
forEachLeafColumn(\n\t\t\tfield,\n\t\t\tpath.append(field.Name()),\n\t\t\tcolumnIndex,\n\t\t\tmaxRepetitionLevel,\n\t\t\tmaxDefinitionLevel,\n\t\t\tdo,\n\t\t)\n\t}\n\n\treturn columnIndex\n}\n\nfunc lookupColumnPath(node Node, path columnPath) Node {\n\tfor node != nil && len(path) > 0 {\n\t\tnode = fieldByName(node, path[0])\n\t\tpath = path[1:]\n\t}\n\treturn node\n}\n\nfunc hasColumnPath(node Node, path columnPath) bool {\n\treturn lookupColumnPath(node, path) != nil\n}\n"
  },
  {
    "path": "column_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"testing\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nfunc TestColumnPageIndex(t *testing.T) {\n\tfor _, config := range [...]struct {\n\t\tname string\n\t\ttest func(*testing.T, rows) bool\n\t}{\n\t\t{\n\t\t\tname: \"buffer\",\n\t\t\ttest: testColumnPageIndexWithBuffer,\n\t\t},\n\t\t{\n\t\t\tname: \"file\",\n\t\t\ttest: testColumnPageIndexWithFile,\n\t\t},\n\t} {\n\t\tt.Run(config.name, func(t *testing.T) {\n\t\t\tfor _, test := range [...]struct {\n\t\t\t\tscenario string\n\t\t\t\tfunction func(*testing.T) interface{}\n\t\t\t}{\n\t\t\t\t{\n\t\t\t\t\tscenario: \"boolean\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value bool }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"int32\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value int32 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"int64\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value int64 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"int96\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value deprecated.Int96 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"uint32\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value uint32 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"uint64\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} 
{\n\t\t\t\t\t\treturn func(rows []struct{ Value uint64 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"float32\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value float32 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"float64\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value float64 }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"string\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value string }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\n\t\t\t\t{\n\t\t\t\t\tscenario: \"uuid\",\n\t\t\t\t\tfunction: func(t *testing.T) interface{} {\n\t\t\t\t\t\treturn func(rows []struct{ Value uuid.UUID }) bool { return config.test(t, makeRows(rows)) }\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t} {\n\t\t\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\t\t\tif err := quickCheck(test.function(t)); err != nil {\n\t\t\t\t\t\tt.Error(err)\n\t\t\t\t\t}\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc testColumnPageIndexWithBuffer(t *testing.T, rows rows) bool {\n\tif len(rows) > 0 {\n\t\tb := parquet.NewBuffer()\n\t\tfor _, row := range rows {\n\t\t\tb.Write(row)\n\t\t}\n\t\tif err := checkRowGroupColumnIndex(b); err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\tif err := checkRowGroupOffsetIndex(b); err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc checkRowGroupColumnIndex(rowGroup parquet.RowGroup) error {\n\tfor i, column := range rowGroup.ColumnChunks() {\n\t\tif err := checkColumnChunkColumnIndex(column); err != nil {\n\t\t\treturn fmt.Errorf(\"column chunk @i=%d: %w\", i, err)\n\t\t}\n\t}\n\treturn nil\n}\n\nfunc checkColumnChunkColumnIndex(columnChunk 
parquet.ColumnChunk) error {\n\tcolumnType := columnChunk.Type()\n\tcolumnIndex := columnChunk.ColumnIndex()\n\tnumPages := columnIndex.NumPages()\n\tpagesRead := 0\n\tstats := newColumnStats(columnType)\n\tpages := columnChunk.Pages()\n\tdefer pages.Close()\n\n\terr := forEachPage(pages, func(page parquet.Page) error {\n\t\tpageMin, pageMax, hasBounds := page.Bounds()\n\t\tif !hasBounds {\n\t\t\treturn fmt.Errorf(\"page bounds are missing\")\n\t\t}\n\t\tindexMin := columnIndex.MinValue(pagesRead)\n\t\tindexMax := columnIndex.MaxValue(pagesRead)\n\n\t\tif !parquet.Equal(pageMin, indexMin) {\n\t\t\treturn fmt.Errorf(\"max page value mismatch: index=%q page=%q\", indexMin, pageMin)\n\t\t}\n\t\tif !parquet.Equal(pageMax, indexMax) {\n\t\t\treturn fmt.Errorf(\"max page value mismatch: index=%q page=%q\", indexMax, pageMax)\n\t\t}\n\n\t\tnumNulls := int64(0)\n\t\tnumValues := int64(0)\n\t\terr := forEachValue(page.Values(), func(value parquet.Value) error {\n\t\t\tstats.observe(value)\n\t\t\tif value.IsNull() {\n\t\t\t\tnumNulls++\n\t\t\t}\n\t\t\tnumValues++\n\t\t\treturn nil\n\t\t})\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\n\t\tnullCount := columnIndex.NullCount(pagesRead)\n\t\tif numNulls != nullCount {\n\t\t\treturn fmt.Errorf(\"number of null values mimatch: index=%d page=%d\", nullCount, numNulls)\n\t\t}\n\n\t\tnullPage := columnIndex.NullPage(pagesRead)\n\t\tif numNulls > 0 && numNulls == numValues && !nullPage {\n\t\t\treturn fmt.Errorf(\"page only contained null values but the index did not categorize it as a null page: nulls=%d\", numNulls)\n\t\t}\n\n\t\tstats.pageRead()\n\t\tpagesRead++\n\t\treturn nil\n\t})\n\tif err != nil {\n\t\treturn fmt.Errorf(\"page @i=%d: %w\", pagesRead, err)\n\t}\n\tif pagesRead != numPages {\n\t\treturn fmt.Errorf(\"number of pages found in column index differs from the number of pages read: index=%d read=%d\", numPages, pagesRead)\n\t}\n\n\tactualOrder := columnIndexOrder(columnIndex)\n\tobservedOrder := 
observedIndexOrder(columnType, stats.minValues, stats.maxValues)\n\txorAscending := (columnIndex.IsAscending() || observedOrder == ascendingIndexOrder) &&\n\t\t!(columnIndex.IsAscending() && observedOrder == ascendingIndexOrder)\n\txorDescending := (columnIndex.IsDescending() || observedOrder == descendingIndexOrder) &&\n\t\t!(columnIndex.IsDescending() && observedOrder == descendingIndexOrder)\n\n\tif xorAscending || xorDescending {\n\t\treturn fmt.Errorf(\"column index is declared to be %s while observed values %s (min values %s, max values %s)\",\n\t\t\tactualOrder,\n\t\t\tobservedOrder,\n\t\t\tvalueOrder(columnType, stats.minValues),\n\t\t\tvalueOrder(columnType, stats.maxValues),\n\t\t)\n\t}\n\n\treturn nil\n}\n\nfunc checkRowGroupOffsetIndex(rowGroup parquet.RowGroup) error {\n\tfor i, column := range rowGroup.ColumnChunks() {\n\t\tif err := checkColumnChunkOffsetIndex(column); err != nil {\n\t\t\treturn fmt.Errorf(\"column chunk @i=%d: %w\", i, err)\n\t\t}\n\t}\n\treturn nil\n}\n\nfunc checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error {\n\toffsetIndex := columnChunk.OffsetIndex()\n\tnumPages := offsetIndex.NumPages()\n\tpagesRead := 0\n\trowIndex := int64(0)\n\n\tpages := columnChunk.Pages()\n\tdefer pages.Close()\n\n\terr := forEachPage(pages, func(page parquet.Page) error {\n\t\tif firstRowIndex := offsetIndex.FirstRowIndex(pagesRead); firstRowIndex != rowIndex {\n\t\t\treturn fmt.Errorf(\"row number mismatch: index=%d page=%d\", firstRowIndex, rowIndex)\n\t\t}\n\t\trowIndex += int64(page.NumRows())\n\t\tpagesRead++\n\t\treturn nil\n\t})\n\tif err != nil {\n\t\treturn fmt.Errorf(\"page @i=%d: %w\", pagesRead, err)\n\t}\n\n\tif pagesRead != numPages {\n\t\treturn fmt.Errorf(\"number of pages found in offset index differs from the number of pages read: index=%d read=%d\", numPages, pagesRead)\n\t}\n\n\treturn nil\n}\n\nfunc testColumnPageIndexWithFile(t *testing.T, rows rows) bool {\n\tif len(rows) > 0 {\n\t\tr := 
rand.New(rand.NewSource(5))\n\t\tf, err := createParquetFile(rows,\n\t\t\tparquet.PageBufferSize(r.Intn(49)+1),\n\t\t\tparquet.ColumnIndexSizeLimit(4096),\n\t\t)\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\tif err := checkFileColumnIndex(f); err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\tif err := checkFileOffsetIndex(f); err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\tfor i, rowGroup := range f.RowGroups() {\n\t\t\tif err := checkRowGroupColumnIndex(rowGroup); err != nil {\n\t\t\t\tt.Errorf(\"checking column index of row group @i=%d: %v\", i, err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\tif err := checkRowGroupOffsetIndex(rowGroup); err != nil {\n\t\t\t\tt.Errorf(\"checking offset index of row group @i=%d: %v\", i, err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t}\n\treturn true\n}\n\nfunc checkFileColumnIndex(f *parquet.File) error {\n\tcolumnIndexes := f.ColumnIndexes()\n\ti := 0\n\treturn forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {\n\t\tcolumnIndex := chunk.ColumnIndex()\n\t\tif n := columnIndex.NumPages(); n <= 0 {\n\t\t\treturn fmt.Errorf(\"invalid number of pages found in the column index: %d\", n)\n\t\t}\n\t\tif i >= len(columnIndexes) {\n\t\t\treturn fmt.Errorf(\"more column indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)\", i, len(columnIndexes))\n\t\t}\n\n\t\tindex1 := columnIndex\n\t\tindex2 := &fileColumnIndex{\n\t\t\tkind:        col.Type().Kind(),\n\t\t\tColumnIndex: columnIndexes[i],\n\t\t}\n\n\t\tnumPages1 := index1.NumPages()\n\t\tnumPages2 := index2.NumPages()\n\t\tif numPages1 != numPages2 {\n\t\t\treturn fmt.Errorf(\"number of pages mismatch: got=%d want=%d\", numPages1, numPages2)\n\t\t}\n\n\t\tfor j := 0; j < numPages1; j++ {\n\t\t\tnullCount1 := index1.NullCount(j)\n\t\t\tnullCount2 := index2.NullCount(j)\n\t\t\tif nullCount1 != nullCount2 {\n\t\t\t\treturn fmt.Errorf(\"null count of page %d/%d 
mismatch: got=%d want=%d\", i, numPages1, nullCount1, nullCount2)\n\t\t\t}\n\n\t\t\tnullPage1 := index1.NullPage(j)\n\t\t\tnullPage2 := index2.NullPage(j)\n\t\t\tif nullPage1 != nullPage2 {\n\t\t\t\treturn fmt.Errorf(\"null page of page %d/%d mismatch: got=%t want=%t\", i, numPages1, nullPage1, nullPage2)\n\t\t\t}\n\n\t\t\tminValue1 := index1.MinValue(j)\n\t\t\tminValue2 := index2.MinValue(j)\n\t\t\tif !parquet.Equal(minValue1, minValue2) {\n\t\t\t\treturn fmt.Errorf(\"min value of page %d/%d mismatch: got=%v want=%v\", i, numPages1, minValue1, minValue2)\n\t\t\t}\n\n\t\t\tmaxValue1 := index1.MaxValue(j)\n\t\t\tmaxValue2 := index2.MaxValue(j)\n\t\t\tif !parquet.Equal(maxValue1, maxValue2) {\n\t\t\t\treturn fmt.Errorf(\"max value of page %d/%d mismatch: got=%v want=%v\", i, numPages1, maxValue1, maxValue2)\n\t\t\t}\n\n\t\t\tisAscending1 := index1.IsAscending()\n\t\t\tisAscending2 := index2.IsAscending()\n\t\t\tif isAscending1 != isAscending2 {\n\t\t\t\treturn fmt.Errorf(\"ascending state of page %d/%d mismatch: got=%t want=%t\", i, numPages1, isAscending1, isAscending2)\n\t\t\t}\n\n\t\t\tisDescending1 := index1.IsDescending()\n\t\t\tisDescending2 := index2.IsDescending()\n\t\t\tif isDescending1 != isDescending2 {\n\t\t\t\treturn fmt.Errorf(\"descending state of page %d/%d mismatch: got=%t want=%t\", i, numPages1, isDescending1, isDescending2)\n\t\t\t}\n\t\t}\n\n\t\ti++\n\t\treturn nil\n\t})\n}\n\nfunc checkFileOffsetIndex(f *parquet.File) error {\n\toffsetIndexes := f.OffsetIndexes()\n\ti := 0\n\treturn forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {\n\t\toffsetIndex := chunk.OffsetIndex()\n\t\tif n := offsetIndex.NumPages(); n <= 0 {\n\t\t\treturn fmt.Errorf(\"invalid number of pages found in the offset index: %d\", n)\n\t\t}\n\t\tif i >= len(offsetIndexes) {\n\t\t\treturn fmt.Errorf(\"more offset indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)\", i, 
len(offsetIndexes))\n\t\t}\n\n\t\tindex1 := offsetIndex\n\t\tindex2 := (*fileOffsetIndex)(&offsetIndexes[i])\n\n\t\tnumPages1 := index1.NumPages()\n\t\tnumPages2 := index2.NumPages()\n\t\tif numPages1 != numPages2 {\n\t\t\treturn fmt.Errorf(\"number of pages mismatch: got=%d want=%d\", numPages1, numPages2)\n\t\t}\n\n\t\tfor j := 0; j < numPages1; j++ {\n\t\t\toffset1 := index1.Offset(j)\n\t\t\toffset2 := index2.Offset(j)\n\t\t\tif offset1 != offset2 {\n\t\t\t\treturn fmt.Errorf(\"offsets of page %d/%d mismatch: got=%d want=%d\", i, numPages1, offset1, offset2)\n\t\t\t}\n\n\t\t\tcompressedPageSize1 := index1.CompressedPageSize(j)\n\t\t\tcompressedPageSize2 := index2.CompressedPageSize(j)\n\t\t\tif compressedPageSize1 != compressedPageSize2 {\n\t\t\t\treturn fmt.Errorf(\"compressed page size of page %d/%d mismatch: got=%d want=%d\", i, numPages1, compressedPageSize1, compressedPageSize2)\n\t\t\t}\n\n\t\t\tfirstRowIndex1 := index1.FirstRowIndex(j)\n\t\t\tfirstRowIndex2 := index2.FirstRowIndex(j)\n\t\t\tif firstRowIndex1 != firstRowIndex2 {\n\t\t\t\treturn fmt.Errorf(\"first row index of page %d/%d mismatch: got=%d want=%d\", i, numPages1, firstRowIndex1, firstRowIndex2)\n\t\t\t}\n\t\t}\n\n\t\ti++\n\t\treturn nil\n\t})\n}\n\ntype fileColumnIndex struct {\n\tkind parquet.Kind\n\tformat.ColumnIndex\n}\n\nfunc (i *fileColumnIndex) NumPages() int                { return len(i.NullPages) }\nfunc (i *fileColumnIndex) NullCount(j int) int64        { return i.NullCounts[j] }\nfunc (i *fileColumnIndex) NullPage(j int) bool          { return i.NullPages[j] }\nfunc (i *fileColumnIndex) MinValue(j int) parquet.Value { return i.kind.Value(i.MinValues[j]) }\nfunc (i *fileColumnIndex) MaxValue(j int) parquet.Value { return i.kind.Value(i.MaxValues[j]) }\nfunc (i *fileColumnIndex) IsAscending() bool            { return i.BoundaryOrder == format.Ascending }\nfunc (i *fileColumnIndex) IsDescending() bool           { return i.BoundaryOrder == format.Descending }\n\ntype fileOffsetIndex 
format.OffsetIndex\n\nfunc (i *fileOffsetIndex) NumPages() int      { return len(i.PageLocations) }\nfunc (i *fileOffsetIndex) Offset(j int) int64 { return i.PageLocations[j].Offset }\nfunc (i *fileOffsetIndex) CompressedPageSize(j int) int64 {\n\treturn int64(i.PageLocations[j].CompressedPageSize)\n}\nfunc (i *fileOffsetIndex) FirstRowIndex(j int) int64 { return i.PageLocations[j].FirstRowIndex }\n\ntype columnStats struct {\n\tpage       int\n\tcolumnType parquet.Type\n\tminValues  []parquet.Value\n\tmaxValues  []parquet.Value\n}\n\nfunc newColumnStats(columnType parquet.Type) *columnStats {\n\treturn &columnStats{columnType: columnType}\n}\n\nfunc (c *columnStats) observe(value parquet.Value) {\n\tif c.page >= len(c.minValues) {\n\t\tc.minValues = append(c.minValues, value.Clone())\n\t} else if c.columnType.Compare(c.minValues[c.page], value) > 0 {\n\t\tc.minValues[c.page] = value.Clone()\n\t}\n\n\tif c.page >= len(c.maxValues) {\n\t\tc.maxValues = append(c.maxValues, value.Clone())\n\t} else if c.columnType.Compare(c.maxValues[c.page], value) < 0 {\n\t\tc.maxValues[c.page] = value.Clone()\n\t}\n}\n\nfunc (c *columnStats) pageRead() {\n\tc.page++\n}\n\ntype indexOrder int\n\nconst (\n\tinvalidIndexOrder indexOrder = iota\n\tunorderedIndexOrder\n\tascendingIndexOrder\n\tdescendingIndexOrder\n)\n\nfunc (o indexOrder) String() string {\n\tswitch o {\n\tcase unorderedIndexOrder:\n\t\treturn \"unordered\"\n\tcase ascendingIndexOrder:\n\t\treturn \"ascending\"\n\tcase descendingIndexOrder:\n\t\treturn \"descending\"\n\tdefault:\n\t\treturn \"invalid\"\n\t}\n}\n\nfunc columnIndexOrder(index parquet.ColumnIndex) indexOrder {\n\tswitch {\n\tcase index.IsAscending() && index.IsDescending():\n\t\treturn invalidIndexOrder\n\tcase index.IsAscending():\n\t\treturn ascendingIndexOrder\n\tcase index.IsDescending():\n\t\treturn descendingIndexOrder\n\tdefault:\n\t\treturn unorderedIndexOrder\n\t}\n}\n\nfunc observedIndexOrder(columnType parquet.Type, minValues []parquet.Value, 
maxValues []parquet.Value) indexOrder {\n\ta := valueOrder(columnType, minValues)\n\tb := valueOrder(columnType, maxValues)\n\n\tswitch {\n\tcase a == ascendingIndexOrder && b == ascendingIndexOrder:\n\t\treturn ascendingIndexOrder\n\tcase a == descendingIndexOrder && b == descendingIndexOrder:\n\t\treturn descendingIndexOrder\n\tdefault:\n\t\treturn unorderedIndexOrder\n\t}\n}\n\nfunc valueOrder(columnType parquet.Type, values []parquet.Value) indexOrder {\n\tswitch len(values) {\n\tcase 0, 1:\n\t\treturn unorderedIndexOrder\n\t}\n\n\tvar order int\n\tfor i := 1; i < len(values); i++ {\n\t\tnext := columnType.Compare(values[i-1], values[i])\n\t\tif next == 0 {\n\t\t\tcontinue\n\t\t}\n\t\tif order == 0 {\n\t\t\torder = next\n\t\t\tcontinue\n\t\t}\n\t\tif order != next {\n\t\t\treturn unorderedIndexOrder\n\t\t}\n\t}\n\n\tif order > 0 {\n\t\treturn descendingIndexOrder\n\t}\n\n\treturn ascendingIndexOrder\n}\n"
  },
  {
    "path": "compare.go",
    "content": "package parquet\n\nimport (\n\t\"encoding/binary\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n)\n\n// CompareDescending constructs a comparison function which inverses the order\n// of values.\n//\n//go:noinline\nfunc CompareDescending(cmp func(Value, Value) int) func(Value, Value) int {\n\treturn func(a, b Value) int { return -cmp(a, b) }\n}\n\n// CompareNullsFirst constructs a comparison function which assumes that null\n// values are smaller than all other values.\n//\n//go:noinline\nfunc CompareNullsFirst(cmp func(Value, Value) int) func(Value, Value) int {\n\treturn func(a, b Value) int {\n\t\tswitch {\n\t\tcase a.IsNull():\n\t\t\tif b.IsNull() {\n\t\t\t\treturn 0\n\t\t\t}\n\t\t\treturn -1\n\t\tcase b.IsNull():\n\t\t\treturn +1\n\t\tdefault:\n\t\t\treturn cmp(a, b)\n\t\t}\n\t}\n}\n\n// CompareNullsLast constructs a comparison function which assumes that null\n// values are greater than all other values.\n//\n//go:noinline\nfunc CompareNullsLast(cmp func(Value, Value) int) func(Value, Value) int {\n\treturn func(a, b Value) int {\n\t\tswitch {\n\t\tcase a.IsNull():\n\t\t\tif b.IsNull() {\n\t\t\t\treturn 0\n\t\t\t}\n\t\t\treturn +1\n\t\tcase b.IsNull():\n\t\t\treturn -1\n\t\tdefault:\n\t\t\treturn cmp(a, b)\n\t\t}\n\t}\n}\n\nfunc compareBool(v1, v2 bool) int {\n\tswitch {\n\tcase !v1 && v2:\n\t\treturn -1\n\tcase v1 && !v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareInt32(v1, v2 int32) int {\n\tswitch {\n\tcase v1 < v2:\n\t\treturn -1\n\tcase v1 > v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareInt64(v1, v2 int64) int {\n\tswitch {\n\tcase v1 < v2:\n\t\treturn -1\n\tcase v1 > v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareInt96(v1, v2 deprecated.Int96) int {\n\tswitch {\n\tcase v1.Less(v2):\n\t\treturn -1\n\tcase v2.Less(v1):\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareFloat32(v1, v2 float32) int {\n\tswitch {\n\tcase v1 < v2:\n\t\treturn -1\n\tcase v1 > 
v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareFloat64(v1, v2 float64) int {\n\tswitch {\n\tcase v1 < v2:\n\t\treturn -1\n\tcase v1 > v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareUint32(v1, v2 uint32) int {\n\tswitch {\n\tcase v1 < v2:\n\t\treturn -1\n\tcase v1 > v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareUint64(v1, v2 uint64) int {\n\tswitch {\n\tcase v1 < v2:\n\t\treturn -1\n\tcase v1 > v2:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc compareBE128(v1, v2 *[16]byte) int {\n\tx := binary.BigEndian.Uint64(v1[:8])\n\ty := binary.BigEndian.Uint64(v2[:8])\n\tswitch {\n\tcase x < y:\n\t\treturn -1\n\tcase x > y:\n\t\treturn +1\n\t}\n\tx = binary.BigEndian.Uint64(v1[8:])\n\ty = binary.BigEndian.Uint64(v2[8:])\n\tswitch {\n\tcase x < y:\n\t\treturn -1\n\tcase x > y:\n\t\treturn +1\n\tdefault:\n\t\treturn 0\n\t}\n}\n\nfunc lessBE128(v1, v2 *[16]byte) bool {\n\tx := binary.BigEndian.Uint64(v1[:8])\n\ty := binary.BigEndian.Uint64(v2[:8])\n\tswitch {\n\tcase x < y:\n\t\treturn true\n\tcase x > y:\n\t\treturn false\n\t}\n\tx = binary.BigEndian.Uint64(v1[8:])\n\ty = binary.BigEndian.Uint64(v2[8:])\n\treturn x < y\n}\n\nfunc compareRowsFuncOf(schema *Schema, sortingColumns []SortingColumn) func(Row, Row) int {\n\tleafColumns := make([]leafColumn, len(sortingColumns))\n\tcanCompareRows := true\n\n\tforEachLeafColumnOf(schema, func(leaf leafColumn) {\n\t\tif leaf.maxRepetitionLevel > 0 {\n\t\t\tcanCompareRows = false\n\t\t}\n\n\t\tif sortingIndex := searchSortingColumn(sortingColumns, leaf.path); sortingIndex < len(sortingColumns) {\n\t\t\tleafColumns[sortingIndex] = leaf\n\n\t\t\tif leaf.maxDefinitionLevel > 0 {\n\t\t\t\tcanCompareRows = false\n\t\t\t}\n\t\t}\n\t})\n\n\t// This is an optimization for the common case where rows\n\t// are sorted by non-optional, non-repeated columns.\n\t//\n\t// The sort function can make the assumption that it will\n\t// find the column value at the current column 
index, and\n\t// does not need to scan the rows looking for values with\n\t// a matching column index.\n\tif canCompareRows {\n\t\treturn compareRowsFuncOfColumnIndexes(leafColumns, sortingColumns)\n\t}\n\n\treturn compareRowsFuncOfColumnValues(leafColumns, sortingColumns)\n}\n\nfunc compareRowsUnordered(Row, Row) int { return 0 }\n\n//go:noinline\nfunc compareRowsFuncOfIndexColumns(compareFuncs []func(Row, Row) int) func(Row, Row) int {\n\treturn func(row1, row2 Row) int {\n\t\tfor _, compare := range compareFuncs {\n\t\t\tif cmp := compare(row1, row2); cmp != 0 {\n\t\t\t\treturn cmp\n\t\t\t}\n\t\t}\n\t\treturn 0\n\t}\n}\n\n//go:noinline\nfunc compareRowsFuncOfIndexAscending(columnIndex int16, typ Type) func(Row, Row) int {\n\treturn func(row1, row2 Row) int { return typ.Compare(row1[columnIndex], row2[columnIndex]) }\n}\n\n//go:noinline\nfunc compareRowsFuncOfIndexDescending(columnIndex int16, typ Type) func(Row, Row) int {\n\treturn func(row1, row2 Row) int { return -typ.Compare(row1[columnIndex], row2[columnIndex]) }\n}\n\n//go:noinline\nfunc compareRowsFuncOfColumnIndexes(leafColumns []leafColumn, sortingColumns []SortingColumn) func(Row, Row) int {\n\tcompareFuncs := make([]func(Row, Row) int, len(sortingColumns))\n\n\tfor sortingIndex, sortingColumn := range sortingColumns {\n\t\tleaf := leafColumns[sortingIndex]\n\t\ttyp := leaf.node.Type()\n\n\t\tif sortingColumn.Descending() {\n\t\t\tcompareFuncs[sortingIndex] = compareRowsFuncOfIndexDescending(leaf.columnIndex, typ)\n\t\t} else {\n\t\t\tcompareFuncs[sortingIndex] = compareRowsFuncOfIndexAscending(leaf.columnIndex, typ)\n\t\t}\n\t}\n\n\tswitch len(compareFuncs) {\n\tcase 0:\n\t\treturn compareRowsUnordered\n\tcase 1:\n\t\treturn compareFuncs[0]\n\tdefault:\n\t\treturn compareRowsFuncOfIndexColumns(compareFuncs)\n\t}\n}\n\n//go:noinline\nfunc compareRowsFuncOfColumnValues(leafColumns []leafColumn, sortingColumns []SortingColumn) func(Row, Row) int {\n\thighestColumnIndex := int16(0)\n\tcolumnIndexes := 
make([]int16, len(sortingColumns))\n\tcompareFuncs := make([]func(Value, Value) int, len(sortingColumns))\n\n\tfor sortingIndex, sortingColumn := range sortingColumns {\n\t\tleaf := leafColumns[sortingIndex]\n\t\tcompare := leaf.node.Type().Compare\n\n\t\tif sortingColumn.Descending() {\n\t\t\tcompare = CompareDescending(compare)\n\t\t}\n\n\t\tif leaf.maxDefinitionLevel > 0 {\n\t\t\tif sortingColumn.NullsFirst() {\n\t\t\t\tcompare = CompareNullsFirst(compare)\n\t\t\t} else {\n\t\t\t\tcompare = CompareNullsLast(compare)\n\t\t\t}\n\t\t}\n\n\t\tcolumnIndexes[sortingIndex] = leaf.columnIndex\n\t\tcompareFuncs[sortingIndex] = compare\n\n\t\tif leaf.columnIndex > highestColumnIndex {\n\t\t\thighestColumnIndex = leaf.columnIndex\n\t\t}\n\t}\n\n\treturn func(row1, row2 Row) int {\n\t\tcolumns1 := make([][2]int32, 0, 64)\n\t\tcolumns2 := make([][2]int32, 0, 64)\n\n\t\ti1 := 0\n\t\ti2 := 0\n\n\t\tfor columnIndex := int16(0); columnIndex <= highestColumnIndex; columnIndex++ {\n\t\t\tj1 := i1 + 1\n\t\t\tj2 := i2 + 1\n\n\t\t\tfor j1 < len(row1) && row1[j1].columnIndex == ^columnIndex {\n\t\t\t\tj1++\n\t\t\t}\n\n\t\t\tfor j2 < len(row2) && row2[j2].columnIndex == ^columnIndex {\n\t\t\t\tj2++\n\t\t\t}\n\n\t\t\tcolumns1 = append(columns1, [2]int32{int32(i1), int32(j1)})\n\t\t\tcolumns2 = append(columns2, [2]int32{int32(i2), int32(j2)})\n\t\t\ti1 = j1\n\t\t\ti2 = j2\n\t\t}\n\n\t\tfor i, compare := range compareFuncs {\n\t\t\tcolumnIndex := columnIndexes[i]\n\t\t\toffsets1 := columns1[columnIndex]\n\t\t\toffsets2 := columns2[columnIndex]\n\t\t\tvalues1 := row1[offsets1[0]:offsets1[1]:offsets1[1]]\n\t\t\tvalues2 := row2[offsets2[0]:offsets2[1]:offsets2[1]]\n\t\t\ti1 := 0\n\t\t\ti2 := 0\n\n\t\t\tfor i1 < len(values1) && i2 < len(values2) {\n\t\t\t\tif cmp := compare(values1[i1], values2[i2]); cmp != 0 {\n\t\t\t\t\treturn cmp\n\t\t\t\t}\n\t\t\t\ti1++\n\t\t\t\ti2++\n\t\t\t}\n\n\t\t\tif i1 < len(values1) {\n\t\t\t\treturn +1\n\t\t\t}\n\t\t\tif i2 < len(values2) {\n\t\t\t\treturn 
-1\n\t\t\t}\n\t\t}\n\t\treturn 0\n\t}\n}\n"
  },
  {
    "path": "compare_test.go",
    "content": "package parquet\n\nimport \"testing\"\n\nfunc assertCompare(t *testing.T, a, b Value, cmp func(Value, Value) int, want int) {\n\tif got := cmp(a, b); got != want {\n\t\tt.Errorf(\"compare(%v, %v): got=%d want=%d\", a, b, got, want)\n\t}\n}\n\nfunc TestCompareNullsFirst(t *testing.T) {\n\tcmp := CompareNullsFirst(Int32Type.Compare)\n\tassertCompare(t, Value{}, Value{}, cmp, 0)\n\tassertCompare(t, Value{}, ValueOf(int32(0)), cmp, -1)\n\tassertCompare(t, ValueOf(int32(0)), Value{}, cmp, +1)\n\tassertCompare(t, ValueOf(int32(0)), ValueOf(int32(1)), cmp, -1)\n}\n\nfunc TestCompareNullsLast(t *testing.T) {\n\tcmp := CompareNullsLast(Int32Type.Compare)\n\tassertCompare(t, Value{}, Value{}, cmp, 0)\n\tassertCompare(t, Value{}, ValueOf(int32(0)), cmp, +1)\n\tassertCompare(t, ValueOf(int32(0)), Value{}, cmp, -1)\n\tassertCompare(t, ValueOf(int32(0)), ValueOf(int32(1)), cmp, -1)\n}\n\nfunc BenchmarkCompareBE128(b *testing.B) {\n\tv1 := [16]byte{}\n\tv2 := [16]byte{}\n\n\tfor i := 0; i < b.N; i++ {\n\t\tcompareBE128(&v1, &v2)\n\t}\n}\n\nfunc BenchmarkLessBE128(b *testing.B) {\n\tv1 := [16]byte{}\n\tv2 := [16]byte{}\n\n\tfor i := 0; i < b.N; i++ {\n\t\tlessBE128(&v1, &v2)\n\t}\n}\n"
  },
  {
    "path": "compress/brotli/brotli.go",
    "content": "// Package brotli implements the BROTLI parquet compression codec.\npackage brotli\n\nimport (\n\t\"io\"\n\n\t\"github.com/andybalholm/brotli\"\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nconst (\n\tDefaultQuality = 0\n\tDefaultLGWin   = 0\n)\n\ntype Codec struct {\n\t// Quality controls the compression-speed vs compression-density trade-offs.\n\t// The higher the quality, the slower the compression. Range is 0 to 11.\n\tQuality int\n\t// LGWin is the base 2 logarithm of the sliding window size.\n\t// Range is 10 to 24. 0 indicates automatic configuration based on Quality.\n\tLGWin int\n\n\tr compress.Decompressor\n\tw compress.Compressor\n}\n\nfunc (c *Codec) String() string {\n\treturn \"BROTLI\"\n}\n\nfunc (c *Codec) CompressionCodec() format.CompressionCodec {\n\treturn format.Brotli\n}\n\nfunc (c *Codec) Encode(dst, src []byte) ([]byte, error) {\n\treturn c.w.Encode(dst, src, func(w io.Writer) (compress.Writer, error) {\n\t\treturn brotli.NewWriterOptions(w, brotli.WriterOptions{\n\t\t\tQuality: c.Quality,\n\t\t\tLGWin:   c.LGWin,\n\t\t}), nil\n\t})\n}\n\nfunc (c *Codec) Decode(dst, src []byte) ([]byte, error) {\n\treturn c.r.Decode(dst, src, func(r io.Reader) (compress.Reader, error) {\n\t\treturn reader{brotli.NewReader(r)}, nil\n\t})\n}\n\ntype reader struct{ *brotli.Reader }\n\nfunc (reader) Close() error { return nil }\n"
  },
  {
    "path": "compress/compress.go",
    "content": "// Package compress provides the generic APIs implemented by parquet compression\n// codecs.\n//\n// https://github.com/apache/parquet-format/blob/master/Compression.md\npackage compress\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"sync\"\n\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\n// The Codec interface represents parquet compression codecs implemented by the\n// compress sub-packages.\n//\n// Codec instances must be safe to use concurrently from multiple goroutines.\ntype Codec interface {\n\t// Returns a human-readable name for the codec.\n\tString() string\n\n\t// Returns the code of the compression codec in the parquet format.\n\tCompressionCodec() format.CompressionCodec\n\n\t// Writes the compressed version of src to dst and returns it.\n\t//\n\t// The method automatically reallocates the output buffer if its capacity\n\t// was too small to hold the compressed data.\n\tEncode(dst, src []byte) ([]byte, error)\n\n\t// Writes the uncompressed version of src to dst and returns it.\n\t//\n\t// The method automatically reallocates the output buffer if its capacity\n\t// was too small to hold the uncompressed data.\n\tDecode(dst, src []byte) ([]byte, error)\n}\n\ntype Reader interface {\n\tio.ReadCloser\n\tReset(io.Reader) error\n}\n\ntype Writer interface {\n\tio.WriteCloser\n\tReset(io.Writer)\n}\n\ntype Compressor struct {\n\twriters sync.Pool // *writer\n}\n\ntype writer struct {\n\toutput bytes.Buffer\n\twriter Writer\n}\n\nfunc (c *Compressor) Encode(dst, src []byte, newWriter func(io.Writer) (Writer, error)) ([]byte, error) {\n\tw, _ := c.writers.Get().(*writer)\n\tif w != nil {\n\t\tw.output = *bytes.NewBuffer(dst[:0])\n\t\tw.writer.Reset(&w.output)\n\t} else {\n\t\tw = new(writer)\n\t\tw.output = *bytes.NewBuffer(dst[:0])\n\t\tvar err error\n\t\tif w.writer, err = newWriter(&w.output); err != nil {\n\t\t\treturn dst, err\n\t\t}\n\t}\n\n\tdefer func() {\n\t\tw.output = 
*bytes.NewBuffer(nil)\n\t\tw.writer.Reset(io.Discard)\n\t\tc.writers.Put(w)\n\t}()\n\n\tif _, err := w.writer.Write(src); err != nil {\n\t\treturn w.output.Bytes(), err\n\t}\n\tif err := w.writer.Close(); err != nil {\n\t\treturn w.output.Bytes(), err\n\t}\n\treturn w.output.Bytes(), nil\n}\n\ntype Decompressor struct {\n\treaders sync.Pool // *reader\n}\n\ntype reader struct {\n\tinput  bytes.Reader\n\treader Reader\n}\n\nfunc (d *Decompressor) Decode(dst, src []byte, newReader func(io.Reader) (Reader, error)) ([]byte, error) {\n\tr, _ := d.readers.Get().(*reader)\n\tif r != nil {\n\t\tr.input.Reset(src)\n\t\tif err := r.reader.Reset(&r.input); err != nil {\n\t\t\treturn dst, err\n\t\t}\n\t} else {\n\t\tr = new(reader)\n\t\tr.input.Reset(src)\n\t\tvar err error\n\t\tif r.reader, err = newReader(&r.input); err != nil {\n\t\t\treturn dst, err\n\t\t}\n\t}\n\n\tdefer func() {\n\t\tr.input.Reset(nil)\n\t\tif err := r.reader.Reset(nil); err == nil {\n\t\t\td.readers.Put(r)\n\t\t}\n\t}()\n\n\tif cap(dst) == 0 {\n\t\tdst = make([]byte, 0, 2*len(src))\n\t} else {\n\t\tdst = dst[:0]\n\t}\n\n\tfor {\n\t\tn, err := r.reader.Read(dst[len(dst):cap(dst)])\n\t\tdst = dst[:len(dst)+n]\n\n\t\tif err != nil {\n\t\t\tif err == io.EOF {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn dst, err\n\t\t}\n\n\t\tif len(dst) == cap(dst) {\n\t\t\ttmp := make([]byte, len(dst), 2*len(dst))\n\t\t\tcopy(tmp, dst)\n\t\t\tdst = tmp\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "compress/compress_test.go",
    "content": "package compress_test\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/compress/brotli\"\n\t\"github.com/segmentio/parquet-go/compress/gzip\"\n\t\"github.com/segmentio/parquet-go/compress/lz4\"\n\t\"github.com/segmentio/parquet-go/compress/snappy\"\n\t\"github.com/segmentio/parquet-go/compress/uncompressed\"\n\t\"github.com/segmentio/parquet-go/compress/zstd\"\n)\n\nvar tests = [...]struct {\n\tscenario string\n\tcodec    compress.Codec\n}{\n\t{\n\t\tscenario: \"uncompressed\",\n\t\tcodec:    new(uncompressed.Codec),\n\t},\n\n\t{\n\t\tscenario: \"snappy\",\n\t\tcodec:    new(snappy.Codec),\n\t},\n\n\t{\n\t\tscenario: \"gzip\",\n\t\tcodec:    new(gzip.Codec),\n\t},\n\n\t{\n\t\tscenario: \"brotli\",\n\t\tcodec:    new(brotli.Codec),\n\t},\n\n\t{\n\t\tscenario: \"zstd\",\n\t\tcodec:    new(zstd.Codec),\n\t},\n\n\t{\n\t\tscenario: \"lz4\",\n\t\tcodec:    new(lz4.Codec),\n\t},\n}\n\nvar testdata = bytes.Repeat([]byte(\"1234567890qwertyuiopasdfghjklzxcvbnm\"), 10e3)\n\nfunc TestCompressionCodec(t *testing.T) {\n\tbuffer := make([]byte, 0, len(testdata))\n\toutput := make([]byte, 0, len(testdata))\n\n\tfor _, test := range tests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tconst N = 10\n\t\t\t// Run the test multiple times to exercise codecs that maintain\n\t\t\t// state across compression/decompression.\n\t\t\tfor i := 0; i < N; i++ {\n\t\t\t\tvar err error\n\n\t\t\t\tbuffer, err = test.codec.Encode(buffer[:0], testdata)\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\n\t\t\t\toutput, err = test.codec.Decode(output[:0], buffer)\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\n\t\t\t\tif !bytes.Equal(testdata, output) {\n\t\t\t\t\tt.Errorf(\"content mismatch after compressing and decompressing (attempt %d/%d)\", i+1, N)\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkEncode(b *testing.B) {\n\tbuffer := make([]byte, 0, 
len(testdata))\n\n\tfor _, test := range tests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tb.SetBytes(int64(len(testdata)))\n\t\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\t\tbuffer, _ = test.codec.Encode(buffer[:0], testdata)\n\t\t\t})\n\t\t})\n\t}\n}\n\nfunc BenchmarkDecode(b *testing.B) {\n\tbuffer := make([]byte, 0, len(testdata))\n\toutput := make([]byte, 0, len(testdata))\n\n\tfor _, test := range tests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tbuffer, _ = test.codec.Encode(buffer[:0], testdata)\n\t\t\tb.SetBytes(int64(len(testdata)))\n\t\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\t\toutput, _ = test.codec.Decode(output[:0], buffer)\n\t\t\t})\n\t\t})\n\t}\n}\n\ntype simpleReader struct{ io.Reader }\n\nfunc (s *simpleReader) Close() error            { return nil }\nfunc (s *simpleReader) Reset(r io.Reader) error { s.Reader = r; return nil }\n\ntype simpleWriter struct{ io.Writer }\n\nfunc (s *simpleWriter) Close() error      { return nil }\nfunc (s *simpleWriter) Reset(w io.Writer) { s.Writer = w }\n\nfunc BenchmarkCompressor(b *testing.B) {\n\tcompressor := compress.Compressor{}\n\tsrc := make([]byte, 1000)\n\tdst := make([]byte, 1000)\n\n\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\tdst, _ = compressor.Encode(dst, src, func(w io.Writer) (compress.Writer, error) {\n\t\t\treturn &simpleWriter{Writer: w}, nil\n\t\t})\n\t})\n}\n\nfunc BenchmarkDecompressor(b *testing.B) {\n\tdecompressor := compress.Decompressor{}\n\tsrc := make([]byte, 1000)\n\tdst := make([]byte, 1000)\n\n\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\tdst, _ = decompressor.Decode(dst, src, func(r io.Reader) (compress.Reader, error) {\n\t\t\treturn &simpleReader{Reader: r}, nil\n\t\t})\n\t})\n}\n\nfunc benchmarkZeroAllocsPerRun(b *testing.B, f func()) {\n\tif allocs := testing.AllocsPerRun(b.N, f); allocs != 0 && !testing.Short() {\n\t\tb.Errorf(\"too many memory allocations: %g > 0\", allocs)\n\t}\n}\n"
  },
  {
    "path": "compress/gzip/gzip.go",
    "content": "// Package gzip implements the GZIP parquet compression codec.\npackage gzip\n\nimport (\n\t\"io\"\n\t\"strings\"\n\n\t\"github.com/klauspost/compress/gzip\"\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nconst (\n\temptyGzip = \"\\x1f\\x8b\\b\\x00\\x00\\x00\\x00\\x00\\x02\\xff\\x01\\x00\\x00\\xff\\xff\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\"\n)\n\nconst (\n\tNoCompression      = gzip.NoCompression\n\tBestSpeed          = gzip.BestSpeed\n\tBestCompression    = gzip.BestCompression\n\tDefaultCompression = gzip.DefaultCompression\n\tHuffmanOnly        = gzip.HuffmanOnly\n)\n\ntype Codec struct {\n\tLevel int\n\n\tr compress.Decompressor\n\tw compress.Compressor\n}\n\nfunc (c *Codec) String() string {\n\treturn \"GZIP\"\n}\n\nfunc (c *Codec) CompressionCodec() format.CompressionCodec {\n\treturn format.Gzip\n}\n\nfunc (c *Codec) Encode(dst, src []byte) ([]byte, error) {\n\treturn c.w.Encode(dst, src, func(w io.Writer) (compress.Writer, error) {\n\t\treturn gzip.NewWriterLevel(w, c.Level)\n\t})\n}\n\nfunc (c *Codec) Decode(dst, src []byte) ([]byte, error) {\n\treturn c.r.Decode(dst, src, func(r io.Reader) (compress.Reader, error) {\n\t\tz, err := gzip.NewReader(r)\n\t\tif err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\treturn &reader{Reader: z}, nil\n\t})\n}\n\ntype reader struct {\n\t*gzip.Reader\n\temptyGzip strings.Reader\n}\n\nfunc (r *reader) Reset(rr io.Reader) error {\n\tif rr == nil {\n\t\tr.emptyGzip.Reset(emptyGzip)\n\t\trr = &r.emptyGzip\n\t}\n\treturn r.Reader.Reset(rr)\n}\n"
  },
  {
    "path": "compress/lz4/lz4.go",
    "content": "// Package lz4 implements the LZ4_RAW parquet compression codec.\npackage lz4\n\nimport (\n\t\"github.com/pierrec/lz4/v4\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype Level = lz4.CompressionLevel\n\nconst (\n\tFast   = lz4.Fast\n\tLevel1 = lz4.Level1\n\tLevel2 = lz4.Level2\n\tLevel3 = lz4.Level3\n\tLevel4 = lz4.Level4\n\tLevel5 = lz4.Level5\n\tLevel6 = lz4.Level6\n\tLevel7 = lz4.Level7\n\tLevel8 = lz4.Level8\n\tLevel9 = lz4.Level9\n)\n\nconst (\n\tDefaultLevel = Fast\n)\n\ntype Codec struct {\n\tLevel Level\n}\n\nfunc (c *Codec) String() string {\n\treturn \"LZ4_RAW\"\n}\n\nfunc (c *Codec) CompressionCodec() format.CompressionCodec {\n\treturn format.Lz4Raw\n}\n\nfunc (c *Codec) Encode(dst, src []byte) ([]byte, error) {\n\tdst = reserveAtLeast(dst, len(src)/4)\n\n\tcompressor := lz4.CompressorHC{Level: c.Level}\n\tfor {\n\t\tn, err := compressor.CompressBlock(src, dst)\n\t\tif err != nil { // see Decode for details about error handling\n\t\t\tdst = make([]byte, 2*len(dst))\n\t\t} else if n == 0 {\n\t\t\tdst = reserveAtLeast(dst, lz4.CompressBlockBound(len(src)))\n\t\t} else {\n\t\t\treturn dst[:n], nil\n\t\t}\n\t}\n}\n\nfunc (c *Codec) Decode(dst, src []byte) ([]byte, error) {\n\t// 3x seems like a common compression ratio, so we optimistically size the\n\t// output buffer to that size. Feel free to change the value if you observe\n\t// different behaviors.\n\tdst = reserveAtLeast(dst, 3*len(src))\n\n\tfor {\n\t\tn, err := lz4.UncompressBlock(src, dst)\n\t\t// The lz4 package does not expose the error values, they are declared\n\t\t// in internal/lz4errors. 
Based on what I read of the implementation,\n\t\t// the only condition where this function errors is if the output buffer\n\t\t// was too short.\n\t\t//\n\t\t// https://github.com/pierrec/lz4/blob/a5532e5996ee86d17f8ce2694c08fb5bf3c6b471/internal/lz4block/block.go#L45-L53\n\t\tif err != nil {\n\t\t\tdst = make([]byte, 2*len(dst))\n\t\t} else {\n\t\t\treturn dst[:n], nil\n\t\t}\n\t}\n}\n\nfunc reserveAtLeast(b []byte, n int) []byte {\n\tif cap(b) < n {\n\t\tb = make([]byte, n)\n\t} else {\n\t\tb = b[:cap(b)]\n\t}\n\treturn b\n}\n"
  },
  {
    "path": "compress/snappy/snappy.go",
    "content": "// Package snappy implements the SNAPPY parquet compression codec.\npackage snappy\n\nimport (\n\t\"github.com/klauspost/compress/snappy\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype Codec struct {\n}\n\n// The snappy.Reader and snappy.Writer implement snappy encoding/decoding with\n// a framing protocol, but snappy requires the implementation to use the raw\n// snappy block encoding. This is why we need to use snappy.Encode/snappy.Decode\n// and have to ship custom implementations of the compressed reader and writer.\n\nfunc (c *Codec) String() string {\n\treturn \"SNAPPY\"\n}\n\nfunc (c *Codec) CompressionCodec() format.CompressionCodec {\n\treturn format.Snappy\n}\n\nfunc (c *Codec) Encode(dst, src []byte) ([]byte, error) {\n\treturn snappy.Encode(dst, src), nil\n}\n\nfunc (c *Codec) Decode(dst, src []byte) ([]byte, error) {\n\treturn snappy.Decode(dst, src)\n}\n"
  },
  {
    "path": "compress/uncompressed/uncompressed.go",
    "content": "// Package uncompressed provides implementations of the compression codec\n// interfaces as pass-through without applying any compression nor\n// decompression.\npackage uncompressed\n\nimport (\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype Codec struct {\n}\n\nfunc (c *Codec) String() string {\n\treturn \"UNCOMPRESSED\"\n}\n\nfunc (c *Codec) CompressionCodec() format.CompressionCodec {\n\treturn format.Uncompressed\n}\n\nfunc (c *Codec) Encode(dst, src []byte) ([]byte, error) {\n\treturn append(dst[:0], src...), nil\n}\n\nfunc (c *Codec) Decode(dst, src []byte) ([]byte, error) {\n\treturn append(dst[:0], src...), nil\n}\n"
  },
  {
    "path": "compress/zstd/zstd.go",
    "content": "// Package zstd implements the ZSTD parquet compression codec.\npackage zstd\n\nimport (\n\t\"sync\"\n\n\t\"github.com/klauspost/compress/zstd\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype Level = zstd.EncoderLevel\n\nconst (\n\t// SpeedFastest will choose the fastest reasonable compression.\n\t// This is roughly equivalent to the fastest Zstandard mode.\n\tSpeedFastest = zstd.SpeedFastest\n\n\t// SpeedDefault is the default \"pretty fast\" compression option.\n\t// This is roughly equivalent to the default Zstandard mode (level 3).\n\tSpeedDefault = zstd.SpeedDefault\n\n\t// SpeedBetterCompression will yield better compression than the default.\n\t// Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage.\n\t// By using this, notice that CPU usage may go up in the future.\n\tSpeedBetterCompression = zstd.SpeedBetterCompression\n\n\t// SpeedBestCompression will choose the best available compression option.\n\t// This will offer the best compression no matter the CPU cost.\n\tSpeedBestCompression = zstd.SpeedBestCompression\n)\n\nconst (\n\tDefaultLevel = SpeedDefault\n)\n\ntype Codec struct {\n\tLevel Level\n\n\tencoders sync.Pool // *zstd.Encoder\n\tdecoders sync.Pool // *zstd.Decoder\n}\n\nfunc (c *Codec) String() string {\n\treturn \"ZSTD\"\n}\n\nfunc (c *Codec) CompressionCodec() format.CompressionCodec {\n\treturn format.Zstd\n}\n\nfunc (c *Codec) Encode(dst, src []byte) ([]byte, error) {\n\te, _ := c.encoders.Get().(*zstd.Encoder)\n\tif e == nil {\n\t\tvar err error\n\t\te, err = zstd.NewWriter(nil,\n\t\t\tzstd.WithEncoderConcurrency(1),\n\t\t\tzstd.WithEncoderLevel(c.level()),\n\t\t\tzstd.WithZeroFrames(true),\n\t\t\tzstd.WithEncoderCRC(false),\n\t\t)\n\t\tif err != nil {\n\t\t\treturn dst[:0], err\n\t\t}\n\t}\n\tdefer c.encoders.Put(e)\n\treturn e.EncodeAll(src, dst[:0]), nil\n}\n\nfunc (c *Codec) Decode(dst, src []byte) ([]byte, error) {\n\td, _ := c.decoders.Get().(*zstd.Decoder)\n\tif d == nil {\n\t\tvar err 
error\n\t\td, err = zstd.NewReader(nil,\n\t\t\tzstd.WithDecoderConcurrency(1),\n\t\t)\n\t\tif err != nil {\n\t\t\treturn dst[:0], err\n\t\t}\n\t}\n\tdefer c.decoders.Put(d)\n\treturn d.DecodeAll(src, dst[:0])\n}\n\nfunc (c *Codec) level() Level {\n\tif c.Level != 0 {\n\t\treturn c.Level\n\t}\n\treturn DefaultLevel\n}\n"
  },
  {
    "path": "compress.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/compress/brotli\"\n\t\"github.com/segmentio/parquet-go/compress/gzip\"\n\t\"github.com/segmentio/parquet-go/compress/lz4\"\n\t\"github.com/segmentio/parquet-go/compress/snappy\"\n\t\"github.com/segmentio/parquet-go/compress/uncompressed\"\n\t\"github.com/segmentio/parquet-go/compress/zstd\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nvar (\n\t// Uncompressed is a parquet compression codec representing uncompressed\n\t// pages.\n\tUncompressed uncompressed.Codec\n\n\t// Snappy is the SNAPPY parquet compression codec.\n\tSnappy snappy.Codec\n\n\t// Gzip is the GZIP parquet compression codec.\n\tGzip = gzip.Codec{\n\t\tLevel: gzip.DefaultCompression,\n\t}\n\n\t// Brotli is the BROTLI parquet compression codec.\n\tBrotli = brotli.Codec{\n\t\tQuality: brotli.DefaultQuality,\n\t\tLGWin:   brotli.DefaultLGWin,\n\t}\n\n\t// Zstd is the ZSTD parquet compression codec.\n\tZstd = zstd.Codec{\n\t\tLevel: zstd.DefaultLevel,\n\t}\n\n\t// Lz4Raw is the LZ4_RAW parquet compression codec.\n\tLz4Raw = lz4.Codec{\n\t\tLevel: lz4.DefaultLevel,\n\t}\n\n\t// Table of compression codecs indexed by their code in the parquet format.\n\tcompressionCodecs = [...]compress.Codec{\n\t\tformat.Uncompressed: &Uncompressed,\n\t\tformat.Snappy:       &Snappy,\n\t\tformat.Gzip:         &Gzip,\n\t\tformat.Brotli:       &Brotli,\n\t\tformat.Zstd:         &Zstd,\n\t\tformat.Lz4Raw:       &Lz4Raw,\n\t}\n)\n\n// LookupCompressionCodec returns the compression codec associated with the\n// given code.\n//\n// The function never returns nil. 
If the encoding is not supported,\n// an \"unsupported\" codec is returned.\nfunc LookupCompressionCodec(codec format.CompressionCodec) compress.Codec {\n\tif codec >= 0 && int(codec) < len(compressionCodecs) {\n\t\tif c := compressionCodecs[codec]; c != nil {\n\t\t\treturn c\n\t\t}\n\t}\n\treturn &unsupported{codec}\n}\n\ntype unsupported struct {\n\tcodec format.CompressionCodec\n}\n\nfunc (u *unsupported) String() string {\n\treturn \"UNSUPPORTED\"\n}\n\nfunc (u *unsupported) CompressionCodec() format.CompressionCodec {\n\treturn u.codec\n}\n\nfunc (u *unsupported) Encode(dst, src []byte) ([]byte, error) {\n\treturn dst[:0], u.error()\n}\n\nfunc (u *unsupported) Decode(dst, src []byte) ([]byte, error) {\n\treturn dst[:0], u.error()\n}\n\nfunc (u *unsupported) error() error {\n\treturn fmt.Errorf(\"unsupported compression codec: %s\", u.codec)\n}\n\nfunc isCompressed(c compress.Codec) bool {\n\treturn c != nil && c.CompressionCodec() != format.Uncompressed\n}\n"
  },
  {
    "path": "config.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\t\"math\"\n\t\"runtime/debug\"\n\t\"strings\"\n\t\"sync\"\n\n\t\"github.com/segmentio/parquet-go/compress\"\n)\n\n// ReadMode is an enum that is used to configure the way that a File reads pages.\ntype ReadMode int\n\nconst (\n\tReadModeSync  ReadMode = iota // ReadModeSync reads pages synchronously on demand (Default).\n\tReadModeAsync                 // ReadModeAsync reads pages asynchronously in the background.\n)\n\nconst (\n\tDefaultColumnIndexSizeLimit = 16\n\tDefaultColumnBufferCapacity = 16 * 1024\n\tDefaultPageBufferSize       = 256 * 1024\n\tDefaultWriteBufferSize      = 32 * 1024\n\tDefaultDataPageVersion      = 2\n\tDefaultDataPageStatistics   = false\n\tDefaultSkipPageIndex        = false\n\tDefaultSkipBloomFilters     = false\n\tDefaultMaxRowsPerRowGroup   = math.MaxInt64\n\tDefaultReadMode             = ReadModeSync\n)\n\nconst (\n\tparquetGoModulePath = \"github.com/segmentio/parquet-go\"\n)\n\nvar (\n\tdefaultCreatedByInfo string\n\tdefaultCreatedByOnce sync.Once\n)\n\nfunc defaultCreatedBy() string {\n\tdefaultCreatedByOnce.Do(func() {\n\t\tcreatedBy := parquetGoModulePath\n\t\tbuild, ok := debug.ReadBuildInfo()\n\t\tif ok {\n\t\t\tfor _, mod := range build.Deps {\n\t\t\t\tif mod.Replace == nil && mod.Path == parquetGoModulePath {\n\t\t\t\t\tsemver, _, buildsha := parseModuleVersion(mod.Version)\n\t\t\t\t\tcreatedBy = formatCreatedBy(createdBy, semver, buildsha)\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tdefaultCreatedByInfo = createdBy\n\t})\n\treturn defaultCreatedByInfo\n}\n\nfunc parseModuleVersion(version string) (semver, datetime, buildsha string) {\n\tsemver, version = splitModuleVersion(version)\n\tdatetime, version = splitModuleVersion(version)\n\tbuildsha, _ = splitModuleVersion(version)\n\tsemver = strings.TrimPrefix(semver, \"v\")\n\treturn\n}\n\nfunc splitModuleVersion(s string) (head, tail string) {\n\tif i := strings.IndexByte(s, '-'); i < 0 {\n\t\thead = s\n\t} else 
{\n\t\thead, tail = s[:i], s[i+1:]\n\t}\n\treturn\n}\n\nfunc formatCreatedBy(application, version, build string) string {\n\treturn application + \" version \" + version + \"(build \" + build + \")\"\n}\n\n// The FileConfig type carries configuration options for parquet files.\n//\n// FileConfig implements the FileOption interface so it can be used directly\n// as argument to the OpenFile function when needed, for example:\n//\n//\tf, err := parquet.OpenFile(reader, size, &parquet.FileConfig{\n//\t\tSkipPageIndex:    true,\n//\t\tSkipBloomFilters: true,\n//\t\tReadMode:         ReadModeAsync,\n//\t})\ntype FileConfig struct {\n\tSkipPageIndex    bool\n\tSkipBloomFilters bool\n\tReadBufferSize   int\n\tReadMode         ReadMode\n\tSchema           *Schema\n}\n\n// DefaultFileConfig returns a new FileConfig value initialized with the\n// default file configuration.\nfunc DefaultFileConfig() *FileConfig {\n\treturn &FileConfig{\n\t\tSkipPageIndex:    DefaultSkipPageIndex,\n\t\tSkipBloomFilters: DefaultSkipBloomFilters,\n\t\tReadBufferSize:   defaultReadBufferSize,\n\t\tReadMode:         DefaultReadMode,\n\t\tSchema:           nil,\n\t}\n}\n\n// NewFileConfig constructs a new file configuration applying the options passed\n// as arguments.\n//\n// The function returns an non-nil error if some of the options carried invalid\n// configuration values.\nfunc NewFileConfig(options ...FileOption) (*FileConfig, error) {\n\tconfig := DefaultFileConfig()\n\tconfig.Apply(options...)\n\treturn config, config.Validate()\n}\n\n// Apply applies the given list of options to c.\nfunc (c *FileConfig) Apply(options ...FileOption) {\n\tfor _, opt := range options {\n\t\topt.ConfigureFile(c)\n\t}\n}\n\n// ConfigureFile applies configuration options from c to config.\nfunc (c *FileConfig) ConfigureFile(config *FileConfig) {\n\t*config = FileConfig{\n\t\tSkipPageIndex:    c.SkipPageIndex,\n\t\tSkipBloomFilters: c.SkipBloomFilters,\n\t\tReadBufferSize:   coalesceInt(c.ReadBufferSize, 
config.ReadBufferSize),\n\t\tReadMode:         ReadMode(coalesceInt(int(c.ReadMode), int(config.ReadMode))),\n\t\tSchema:           coalesceSchema(c.Schema, config.Schema),\n\t}\n}\n\n// Validate returns a non-nil error if the configuration of c is invalid.\nfunc (c *FileConfig) Validate() error {\n\treturn nil\n}\n\n// The ReaderConfig type carries configuration options for parquet readers.\n//\n// ReaderConfig implements the ReaderOption interface so it can be used directly\n// as argument to the NewReader function when needed, for example:\n//\n//\treader := parquet.NewReader(output, schema, &parquet.ReaderConfig{\n//\t\t// ...\n//\t})\ntype ReaderConfig struct {\n\tSchema *Schema\n}\n\n// DefaultReaderConfig returns a new ReaderConfig value initialized with the\n// default reader configuration.\nfunc DefaultReaderConfig() *ReaderConfig {\n\treturn &ReaderConfig{}\n}\n\n// NewReaderConfig constructs a new reader configuration applying the options\n// passed as arguments.\n//\n// The function returns an non-nil error if some of the options carried invalid\n// configuration values.\nfunc NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error) {\n\tconfig := DefaultReaderConfig()\n\tconfig.Apply(options...)\n\treturn config, config.Validate()\n}\n\n// Apply applies the given list of options to c.\nfunc (c *ReaderConfig) Apply(options ...ReaderOption) {\n\tfor _, opt := range options {\n\t\topt.ConfigureReader(c)\n\t}\n}\n\n// ConfigureReader applies configuration options from c to config.\nfunc (c *ReaderConfig) ConfigureReader(config *ReaderConfig) {\n\t*config = ReaderConfig{\n\t\tSchema: coalesceSchema(c.Schema, config.Schema),\n\t}\n}\n\n// Validate returns a non-nil error if the configuration of c is invalid.\nfunc (c *ReaderConfig) Validate() error {\n\treturn nil\n}\n\n// The WriterConfig type carries configuration options for parquet writers.\n//\n// WriterConfig implements the WriterOption interface so it can be used directly\n// as argument to the 
NewWriter function when needed, for example:\n//\n//\twriter := parquet.NewWriter(output, schema, &parquet.WriterConfig{\n//\t\tCreatedBy: \"my test program\",\n//\t})\ntype WriterConfig struct {\n\tCreatedBy            string\n\tColumnPageBuffers    BufferPool\n\tColumnIndexSizeLimit int\n\tPageBufferSize       int\n\tWriteBufferSize      int\n\tDataPageVersion      int\n\tDataPageStatistics   bool\n\tMaxRowsPerRowGroup   int64\n\tKeyValueMetadata     map[string]string\n\tSchema               *Schema\n\tBloomFilters         []BloomFilterColumn\n\tCompression          compress.Codec\n\tSorting              SortingConfig\n}\n\n// DefaultWriterConfig returns a new WriterConfig value initialized with the\n// default writer configuration.\nfunc DefaultWriterConfig() *WriterConfig {\n\treturn &WriterConfig{\n\t\tCreatedBy:            defaultCreatedBy(),\n\t\tColumnPageBuffers:    &defaultColumnBufferPool,\n\t\tColumnIndexSizeLimit: DefaultColumnIndexSizeLimit,\n\t\tPageBufferSize:       DefaultPageBufferSize,\n\t\tWriteBufferSize:      DefaultWriteBufferSize,\n\t\tDataPageVersion:      DefaultDataPageVersion,\n\t\tDataPageStatistics:   DefaultDataPageStatistics,\n\t\tMaxRowsPerRowGroup:   DefaultMaxRowsPerRowGroup,\n\t\tSorting: SortingConfig{\n\t\t\tSortingBuffers: &defaultSortingBufferPool,\n\t\t},\n\t}\n}\n\n// NewWriterConfig constructs a new writer configuration applying the options\n// passed as arguments.\n//\n// The function returns an non-nil error if some of the options carried invalid\n// configuration values.\nfunc NewWriterConfig(options ...WriterOption) (*WriterConfig, error) {\n\tconfig := DefaultWriterConfig()\n\tconfig.Apply(options...)\n\treturn config, config.Validate()\n}\n\n// Apply applies the given list of options to c.\nfunc (c *WriterConfig) Apply(options ...WriterOption) {\n\tfor _, opt := range options {\n\t\topt.ConfigureWriter(c)\n\t}\n}\n\n// ConfigureWriter applies configuration options from c to config.\nfunc (c *WriterConfig) 
ConfigureWriter(config *WriterConfig) {\n\tkeyValueMetadata := config.KeyValueMetadata\n\tif len(c.KeyValueMetadata) > 0 {\n\t\tif keyValueMetadata == nil {\n\t\t\tkeyValueMetadata = make(map[string]string, len(c.KeyValueMetadata))\n\t\t}\n\t\tfor k, v := range c.KeyValueMetadata {\n\t\t\tkeyValueMetadata[k] = v\n\t\t}\n\t}\n\n\t*config = WriterConfig{\n\t\tCreatedBy:            coalesceString(c.CreatedBy, config.CreatedBy),\n\t\tColumnPageBuffers:    coalesceBufferPool(c.ColumnPageBuffers, config.ColumnPageBuffers),\n\t\tColumnIndexSizeLimit: coalesceInt(c.ColumnIndexSizeLimit, config.ColumnIndexSizeLimit),\n\t\tPageBufferSize:       coalesceInt(c.PageBufferSize, config.PageBufferSize),\n\t\tWriteBufferSize:      coalesceInt(c.WriteBufferSize, config.WriteBufferSize),\n\t\tDataPageVersion:      coalesceInt(c.DataPageVersion, config.DataPageVersion),\n\t\tDataPageStatistics:   config.DataPageStatistics,\n\t\tMaxRowsPerRowGroup:   config.MaxRowsPerRowGroup,\n\t\tKeyValueMetadata:     keyValueMetadata,\n\t\tSchema:               coalesceSchema(c.Schema, config.Schema),\n\t\tBloomFilters:         coalesceBloomFilters(c.BloomFilters, config.BloomFilters),\n\t\tCompression:          coalesceCompression(c.Compression, config.Compression),\n\t\tSorting:              coalesceSortingConfig(c.Sorting, config.Sorting),\n\t}\n}\n\n// Validate returns a non-nil error if the configuration of c is invalid.\nfunc (c *WriterConfig) Validate() error {\n\tconst baseName = \"parquet.(*WriterConfig).\"\n\treturn errorInvalidConfiguration(\n\t\tvalidateNotNil(baseName+\"ColumnPageBuffers\", c.ColumnPageBuffers),\n\t\tvalidatePositiveInt(baseName+\"ColumnIndexSizeLimit\", c.ColumnIndexSizeLimit),\n\t\tvalidatePositiveInt(baseName+\"PageBufferSize\", c.PageBufferSize),\n\t\tvalidateOneOfInt(baseName+\"DataPageVersion\", c.DataPageVersion, 1, 2),\n\t\tc.Sorting.Validate(),\n\t)\n}\n\n// The RowGroupConfig type carries configuration options for parquet row groups.\n//\n// RowGroupConfig 
implements the RowGroupOption interface so it can be used\n// directly as argument to the NewBuffer function when needed, for example:\n//\n//\tbuffer := parquet.NewBuffer(&parquet.RowGroupConfig{\n//\t\tColumnBufferCapacity: 10_000,\n//\t})\ntype RowGroupConfig struct {\n\tColumnBufferCapacity int\n\tSchema               *Schema\n\tSorting              SortingConfig\n}\n\n// DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the\n// default row group configuration.\nfunc DefaultRowGroupConfig() *RowGroupConfig {\n\treturn &RowGroupConfig{\n\t\tColumnBufferCapacity: DefaultColumnBufferCapacity,\n\t\tSorting: SortingConfig{\n\t\t\tSortingBuffers: &defaultSortingBufferPool,\n\t\t},\n\t}\n}\n\n// NewRowGroupConfig constructs a new row group configuration applying the\n// options passed as arguments.\n//\n// The function returns an non-nil error if some of the options carried invalid\n// configuration values.\nfunc NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error) {\n\tconfig := DefaultRowGroupConfig()\n\tconfig.Apply(options...)\n\treturn config, config.Validate()\n}\n\n// Validate returns a non-nil error if the configuration of c is invalid.\nfunc (c *RowGroupConfig) Validate() error {\n\tconst baseName = \"parquet.(*RowGroupConfig).\"\n\treturn errorInvalidConfiguration(\n\t\tvalidatePositiveInt(baseName+\"ColumnBufferCapacity\", c.ColumnBufferCapacity),\n\t\tc.Sorting.Validate(),\n\t)\n}\n\nfunc (c *RowGroupConfig) Apply(options ...RowGroupOption) {\n\tfor _, opt := range options {\n\t\topt.ConfigureRowGroup(c)\n\t}\n}\n\nfunc (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig) {\n\t*config = RowGroupConfig{\n\t\tColumnBufferCapacity: coalesceInt(c.ColumnBufferCapacity, config.ColumnBufferCapacity),\n\t\tSchema:               coalesceSchema(c.Schema, config.Schema),\n\t\tSorting:              coalesceSortingConfig(c.Sorting, config.Sorting),\n\t}\n}\n\n// The SortingConfig type carries configuration options for 
parquet row groups.\n//\n// SortingConfig implements the SortingOption interface so it can be used\n// directly as argument to the NewSortingWriter function when needed,\n// for example:\n//\n//\tbuffer := parquet.NewSortingWriter[Row](\n//\t\tparquet.SortingWriterConfig(\n//\t\t\tparquet.DropDuplicatedRows(true),\n//\t\t),\n//\t})\ntype SortingConfig struct {\n\tSortingBuffers     BufferPool\n\tSortingColumns     []SortingColumn\n\tDropDuplicatedRows bool\n}\n\n// DefaultSortingConfig returns a new SortingConfig value initialized with the\n// default row group configuration.\nfunc DefaultSortingConfig() *SortingConfig {\n\treturn &SortingConfig{\n\t\tSortingBuffers: &defaultSortingBufferPool,\n\t}\n}\n\n// NewSortingConfig constructs a new sorting configuration applying the\n// options passed as arguments.\n//\n// The function returns an non-nil error if some of the options carried invalid\n// configuration values.\nfunc NewSortingConfig(options ...SortingOption) (*SortingConfig, error) {\n\tconfig := DefaultSortingConfig()\n\tconfig.Apply(options...)\n\treturn config, config.Validate()\n}\n\nfunc (c *SortingConfig) Validate() error {\n\tconst baseName = \"parquet.(*SortingConfig).\"\n\treturn errorInvalidConfiguration(\n\t\tvalidateNotNil(baseName+\"SortingBuffers\", c.SortingBuffers),\n\t)\n}\n\nfunc (c *SortingConfig) Apply(options ...SortingOption) {\n\tfor _, opt := range options {\n\t\topt.ConfigureSorting(c)\n\t}\n}\n\nfunc (c *SortingConfig) ConfigureSorting(config *SortingConfig) {\n\t*config = coalesceSortingConfig(*c, *config)\n}\n\n// FileOption is an interface implemented by types that carry configuration\n// options for parquet files.\ntype FileOption interface {\n\tConfigureFile(*FileConfig)\n}\n\n// ReaderOption is an interface implemented by types that carry configuration\n// options for parquet readers.\ntype ReaderOption interface {\n\tConfigureReader(*ReaderConfig)\n}\n\n// WriterOption is an interface implemented by types that carry 
configuration\n// options for parquet writers.\ntype WriterOption interface {\n\tConfigureWriter(*WriterConfig)\n}\n\n// RowGroupOption is an interface implemented by types that carry configuration\n// options for parquet row groups.\ntype RowGroupOption interface {\n\tConfigureRowGroup(*RowGroupConfig)\n}\n\n// SortingOption is an interface implemented by types that carry configuration\n// options for parquet sorting writers.\ntype SortingOption interface {\n\tConfigureSorting(*SortingConfig)\n}\n\n// SkipPageIndex is a file configuration option which prevents automatically\n// reading the page index when opening a parquet file, when set to true. This is\n// useful as an optimization when programs know that they will not need to\n// consume the page index.\n//\n// Defaults to false.\nfunc SkipPageIndex(skip bool) FileOption {\n\treturn fileOption(func(config *FileConfig) { config.SkipPageIndex = skip })\n}\n\n// SkipBloomFilters is a file configuration option which prevents automatically\n// reading the bloom filters when opening a parquet file, when set to true.\n// This is useful as an optimization when programs know that they will not need\n// to consume the bloom filters.\n//\n// Defaults to false.\nfunc SkipBloomFilters(skip bool) FileOption {\n\treturn fileOption(func(config *FileConfig) { config.SkipBloomFilters = skip })\n}\n\n// FileReadMode is a file configuration option which controls the way pages\n// are read. Currently the only two options are ReadModeAsync and ReadModeSync\n// which control whether or not pages are loaded asynchronously. It can be\n// advantageous to use ReadModeAsync if your reader is backed by network\n// storage.\n//\n// Defaults to ReadModeSync.\nfunc FileReadMode(mode ReadMode) FileOption {\n\treturn fileOption(func(config *FileConfig) { config.ReadMode = mode })\n}\n\n// ReadBufferSize is a file configuration option which controls the default\n// buffer sizes for reads made to the provided io.Reader. 
The default of 4096\n// is appropriate for disk based access but if your reader is backed by network\n// storage it can be advantageous to increase this value to something more like\n// 4 MiB.\n//\n// Defaults to 4096.\nfunc ReadBufferSize(size int) FileOption {\n\treturn fileOption(func(config *FileConfig) { config.ReadBufferSize = size })\n}\n\n// FileSchema is used to pass a known schema in while opening a Parquet file.\n// This optimization is only useful if your application is currently opening\n// an extremely large number of parquet files with the same, known schema.\n//\n// Defaults to nil.\nfunc FileSchema(schema *Schema) FileOption {\n\treturn fileOption(func(config *FileConfig) { config.Schema = schema })\n}\n\n// PageBufferSize configures the size of column page buffers on parquet writers.\n//\n// Note that the page buffer size refers to the in-memory buffers where pages\n// are generated, not the size of pages after encoding and compression.\n// This design choice was made to help control the amount of memory needed to\n// read and write pages rather than controlling the space used by the encoded\n// representation on disk.\n//\n// Defaults to 256KiB.\nfunc PageBufferSize(size int) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.PageBufferSize = size })\n}\n\n// WriteBufferSize configures the size of the write buffer.\n//\n// Setting the writer buffer size to zero deactivates buffering, all writes are\n// immediately sent to the output io.Writer.\n//\n// Defaults to 32KiB.\nfunc WriteBufferSize(size int) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.WriteBufferSize = size })\n}\n\n// MaxRowsPerRowGroup configures the maximum number of rows that a writer will\n// produce in each row group.\n//\n// This limit is useful to control size of row groups in both number of rows and\n// byte size. 
While controlling the byte size of a row group is difficult to\n// achieve with parquet due to column encoding and compression, the number of\n// rows remains a useful proxy.\n//\n// Defaults to unlimited.\nfunc MaxRowsPerRowGroup(numRows int64) WriterOption {\n\tif numRows <= 0 {\n\t\tnumRows = DefaultMaxRowsPerRowGroup\n\t}\n\treturn writerOption(func(config *WriterConfig) { config.MaxRowsPerRowGroup = numRows })\n}\n\n// CreatedBy creates a configuration option which sets the name of the\n// application that created a parquet file.\n//\n// The option formats the \"CreatedBy\" file metadata according to the convention\n// described by the parquet spec:\n//\n//\t\"<application> version <version> (build <build>)\"\n//\n// By default, the option is set to the parquet-go module name, version, and\n// build hash.\nfunc CreatedBy(application, version, build string) WriterOption {\n\tcreatedBy := formatCreatedBy(application, version, build)\n\treturn writerOption(func(config *WriterConfig) { config.CreatedBy = createdBy })\n}\n\n// ColumnPageBuffers creates a configuration option to customize the buffer pool\n// used when constructing row groups. 
This can be used to provide on-disk buffers\n// as swap space to ensure that the parquet file creation will no be bottlenecked\n// on the amount of memory available.\n//\n// Defaults to using in-memory buffers.\nfunc ColumnPageBuffers(buffers BufferPool) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.ColumnPageBuffers = buffers })\n}\n\n// ColumnIndexSizeLimit creates a configuration option to customize the size\n// limit of page boundaries recorded in column indexes.\n//\n// Defaults to 16.\nfunc ColumnIndexSizeLimit(sizeLimit int) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.ColumnIndexSizeLimit = sizeLimit })\n}\n\n// DataPageVersion creates a configuration option which configures the version of\n// data pages used when creating a parquet file.\n//\n// Defaults to version 2.\nfunc DataPageVersion(version int) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.DataPageVersion = version })\n}\n\n// DataPageStatistics creates a configuration option which defines whether data\n// page statistics are emitted. This option is useful when generating parquet\n// files that intend to be backward compatible with older readers which may not\n// have the ability to load page statistics from the column index.\n//\n// Defaults to false.\nfunc DataPageStatistics(enabled bool) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.DataPageStatistics = enabled })\n}\n\n// KeyValueMetadata creates a configuration option which adds key/value metadata\n// to add to the metadata of parquet files.\n//\n// This option is additive, it may be used multiple times to add more than one\n// key/value pair.\n//\n// Keys are assumed to be unique, if the same key is repeated multiple times the\n// last value is retained. 
While the parquet format does not require unique keys,\n// this design decision was made to optimize for the most common use case where\n// applications leverage this extension mechanism to associate single values to\n// keys. This may create incompatibilities with other parquet libraries, or may\n// cause some key/value pairs to be lost when open parquet files written with\n// repeated keys. We can revisit this decision if it ever becomes a blocker.\nfunc KeyValueMetadata(key, value string) WriterOption {\n\treturn writerOption(func(config *WriterConfig) {\n\t\tif config.KeyValueMetadata == nil {\n\t\t\tconfig.KeyValueMetadata = map[string]string{key: value}\n\t\t} else {\n\t\t\tconfig.KeyValueMetadata[key] = value\n\t\t}\n\t})\n}\n\n// BloomFilters creates a configuration option which defines the bloom filters\n// that parquet writers should generate.\n//\n// The compute and memory footprint of generating bloom filters for all columns\n// of a parquet schema can be significant, so by default no filters are created\n// and applications need to explicitly declare the columns that they want to\n// create filters for.\nfunc BloomFilters(filters ...BloomFilterColumn) WriterOption {\n\tfilters = append([]BloomFilterColumn{}, filters...)\n\treturn writerOption(func(config *WriterConfig) { config.BloomFilters = filters })\n}\n\n// Compression creates a configuration option which sets the default compression\n// codec used by a writer for columns where none were defined.\nfunc Compression(codec compress.Codec) WriterOption {\n\treturn writerOption(func(config *WriterConfig) { config.Compression = codec })\n}\n\n// SortingWriterConfig is a writer option which applies configuration specific\n// to sorting writers.\nfunc SortingWriterConfig(options ...SortingOption) WriterOption {\n\toptions = append([]SortingOption{}, options...)\n\treturn writerOption(func(config *WriterConfig) { config.Sorting.Apply(options...) 
})\n}\n\n// ColumnBufferCapacity creates a configuration option which defines the size of\n// row group column buffers.\n//\n// Defaults to 16384.\nfunc ColumnBufferCapacity(size int) RowGroupOption {\n\treturn rowGroupOption(func(config *RowGroupConfig) { config.ColumnBufferCapacity = size })\n}\n\n// SortingRowGroupConfig is a row group option which applies configuration\n// specific sorting row groups.\nfunc SortingRowGroupConfig(options ...SortingOption) RowGroupOption {\n\toptions = append([]SortingOption{}, options...)\n\treturn rowGroupOption(func(config *RowGroupConfig) { config.Sorting.Apply(options...) })\n}\n\n// SortingColumns creates a configuration option which defines the sorting order\n// of columns in a row group.\n//\n// The order of sorting columns passed as argument defines the ordering\n// hierarchy; when elements are equal in the first column, the second column is\n// used to order rows, etc...\nfunc SortingColumns(columns ...SortingColumn) SortingOption {\n\t// Make a copy so that we do not retain the input slice generated implicitly\n\t// for the variable argument list, and also avoid having a nil slice when\n\t// the option is passed with no sorting columns, so we can differentiate it\n\t// from it not being passed.\n\tcolumns = append([]SortingColumn{}, columns...)\n\treturn sortingOption(func(config *SortingConfig) { config.SortingColumns = columns })\n}\n\n// SortingBuffers creates a configuration option which sets the pool of buffers\n// used to hold intermediary state when sorting parquet rows.\n//\n// Defaults to using in-memory buffers.\nfunc SortingBuffers(buffers BufferPool) SortingOption {\n\treturn sortingOption(func(config *SortingConfig) { config.SortingBuffers = buffers })\n}\n\n// DropDuplicatedRows configures whether a sorting writer will keep or remove\n// duplicated rows.\n//\n// Two rows are considered duplicates if the values of their all their sorting\n// columns are equal.\n//\n// Defaults to false\nfunc 
DropDuplicatedRows(drop bool) SortingOption {\n\treturn sortingOption(func(config *SortingConfig) { config.DropDuplicatedRows = drop })\n}\n\ntype fileOption func(*FileConfig)\n\nfunc (opt fileOption) ConfigureFile(config *FileConfig) { opt(config) }\n\ntype readerOption func(*ReaderConfig)\n\nfunc (opt readerOption) ConfigureReader(config *ReaderConfig) { opt(config) }\n\ntype writerOption func(*WriterConfig)\n\nfunc (opt writerOption) ConfigureWriter(config *WriterConfig) { opt(config) }\n\ntype rowGroupOption func(*RowGroupConfig)\n\nfunc (opt rowGroupOption) ConfigureRowGroup(config *RowGroupConfig) { opt(config) }\n\ntype sortingOption func(*SortingConfig)\n\nfunc (opt sortingOption) ConfigureSorting(config *SortingConfig) { opt(config) }\n\nfunc coalesceInt(i1, i2 int) int {\n\tif i1 != 0 {\n\t\treturn i1\n\t}\n\treturn i2\n}\n\nfunc coalesceInt64(i1, i2 int64) int64 {\n\tif i1 != 0 {\n\t\treturn i1\n\t}\n\treturn i2\n}\n\nfunc coalesceString(s1, s2 string) string {\n\tif s1 != \"\" {\n\t\treturn s1\n\t}\n\treturn s2\n}\n\nfunc coalesceBytes(b1, b2 []byte) []byte {\n\tif b1 != nil {\n\t\treturn b1\n\t}\n\treturn b2\n}\n\nfunc coalesceBufferPool(p1, p2 BufferPool) BufferPool {\n\tif p1 != nil {\n\t\treturn p1\n\t}\n\treturn p2\n}\n\nfunc coalesceSchema(s1, s2 *Schema) *Schema {\n\tif s1 != nil {\n\t\treturn s1\n\t}\n\treturn s2\n}\n\nfunc coalesceSortingColumns(s1, s2 []SortingColumn) []SortingColumn {\n\tif s1 != nil {\n\t\treturn s1\n\t}\n\treturn s2\n}\n\nfunc coalesceSortingConfig(c1, c2 SortingConfig) SortingConfig {\n\treturn SortingConfig{\n\t\tSortingBuffers:     coalesceBufferPool(c1.SortingBuffers, c2.SortingBuffers),\n\t\tSortingColumns:     coalesceSortingColumns(c1.SortingColumns, c2.SortingColumns),\n\t\tDropDuplicatedRows: c1.DropDuplicatedRows,\n\t}\n}\n\nfunc coalesceBloomFilters(f1, f2 []BloomFilterColumn) []BloomFilterColumn {\n\tif f1 != nil {\n\t\treturn f1\n\t}\n\treturn f2\n}\n\nfunc coalesceCompression(c1, c2 compress.Codec) 
compress.Codec {\n\tif c1 != nil {\n\t\treturn c1\n\t}\n\treturn c2\n}\n\nfunc validatePositiveInt(optionName string, optionValue int) error {\n\tif optionValue > 0 {\n\t\treturn nil\n\t}\n\treturn errorInvalidOptionValue(optionName, optionValue)\n}\n\nfunc validatePositiveInt64(optionName string, optionValue int64) error {\n\tif optionValue > 0 {\n\t\treturn nil\n\t}\n\treturn errorInvalidOptionValue(optionName, optionValue)\n}\n\nfunc validateOneOfInt(optionName string, optionValue int, supportedValues ...int) error {\n\tfor _, value := range supportedValues {\n\t\tif value == optionValue {\n\t\t\treturn nil\n\t\t}\n\t}\n\treturn errorInvalidOptionValue(optionName, optionValue)\n}\n\nfunc validateNotNil(optionName string, optionValue interface{}) error {\n\tif optionValue != nil {\n\t\treturn nil\n\t}\n\treturn errorInvalidOptionValue(optionName, optionValue)\n}\n\nfunc errorInvalidOptionValue(optionName string, optionValue interface{}) error {\n\treturn fmt.Errorf(\"invalid option value: %s: %v\", optionName, optionValue)\n}\n\nfunc errorInvalidConfiguration(reasons ...error) error {\n\tvar err *invalidConfiguration\n\n\tfor _, reason := range reasons {\n\t\tif reason != nil {\n\t\t\tif err == nil {\n\t\t\t\terr = new(invalidConfiguration)\n\t\t\t}\n\t\t\terr.reasons = append(err.reasons, reason)\n\t\t}\n\t}\n\n\tif err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n}\n\ntype invalidConfiguration struct {\n\treasons []error\n}\n\nfunc (err *invalidConfiguration) Error() string {\n\terrorMessage := new(strings.Builder)\n\tfor _, reason := range err.reasons {\n\t\terrorMessage.WriteString(reason.Error())\n\t\terrorMessage.WriteString(\"\\n\")\n\t}\n\terrorString := errorMessage.String()\n\tif errorString != \"\" {\n\t\terrorString = errorString[:len(errorString)-1]\n\t}\n\treturn errorString\n}\n\nvar (\n\t_ FileOption     = (*FileConfig)(nil)\n\t_ ReaderOption   = (*ReaderConfig)(nil)\n\t_ WriterOption   = (*WriterConfig)(nil)\n\t_ RowGroupOption = 
(*RowGroupConfig)(nil)\n\t_ SortingOption  = (*SortingConfig)(nil)\n)\n"
  },
  {
    "path": "convert.go",
    "content": "package parquet\n\nimport (\n\t\"encoding/binary\"\n\t\"encoding/hex\"\n\t\"fmt\"\n\t\"io\"\n\t\"math\"\n\t\"math/big\"\n\t\"strconv\"\n\t\"sync\"\n\t\"time\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\n// ConvertError is an error type returned by calls to Convert when the conversion\n// of parquet schemas is impossible or the input row for the conversion is\n// malformed.\ntype ConvertError struct {\n\tPath []string\n\tFrom Node\n\tTo   Node\n}\n\n// Error satisfies the error interface.\nfunc (e *ConvertError) Error() string {\n\tsourceType := e.From.Type()\n\ttargetType := e.To.Type()\n\n\tsourceRepetition := fieldRepetitionTypeOf(e.From)\n\ttargetRepetition := fieldRepetitionTypeOf(e.To)\n\n\treturn fmt.Sprintf(\"cannot convert parquet column %q from %s %s to %s %s\",\n\t\tcolumnPath(e.Path),\n\t\tsourceRepetition,\n\t\tsourceType,\n\t\ttargetRepetition,\n\t\ttargetType,\n\t)\n}\n\n// Conversion is an interface implemented by types that provide conversion of\n// parquet rows from one schema to another.\n//\n// Conversion instances must be safe to use concurrently from multiple goroutines.\ntype Conversion interface {\n\t// Applies the conversion logic on the src row, returning the result\n\t// appended to dst.\n\tConvert(rows []Row) (int, error)\n\t// Converts the given column index in the target schema to the original\n\t// column index in the source schema of the conversion.\n\tColumn(int) int\n\t// Returns the target schema of the conversion.\n\tSchema() *Schema\n}\n\ntype conversion struct {\n\tcolumns []conversionColumn\n\tschema  *Schema\n\tbuffers sync.Pool\n\t// This field is used to size the column buffers held in the sync.Pool since\n\t// they are intended to store the source rows being converted from.\n\tnumberOfSourceColumns int\n}\n\ntype conversionBuffer struct {\n\tcolumns [][]Value\n}\n\ntype conversionColumn struct 
{\n\tsourceIndex   int\n\tconvertValues conversionFunc\n}\n\ntype conversionFunc func([]Value) error\n\nfunc convertToSelf(column []Value) error { return nil }\n\n//go:noinline\nfunc convertToType(targetType, sourceType Type) conversionFunc {\n\treturn func(column []Value) error {\n\t\tfor i, v := range column {\n\t\t\tv, err := sourceType.ConvertValue(v, targetType)\n\t\t\tif err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t\tcolumn[i].ptr = v.ptr\n\t\t\tcolumn[i].u64 = v.u64\n\t\t\tcolumn[i].kind = v.kind\n\t\t}\n\t\treturn nil\n\t}\n}\n\n//go:noinline\nfunc convertToValue(value Value) conversionFunc {\n\treturn func(column []Value) error {\n\t\tfor i := range column {\n\t\t\tcolumn[i] = value\n\t\t}\n\t\treturn nil\n\t}\n}\n\n//go:noinline\nfunc convertToZero(kind Kind) conversionFunc {\n\treturn func(column []Value) error {\n\t\tfor i := range column {\n\t\t\tcolumn[i].ptr = nil\n\t\t\tcolumn[i].u64 = 0\n\t\t\tcolumn[i].kind = ^int8(kind)\n\t\t}\n\t\treturn nil\n\t}\n}\n\n//go:noinline\nfunc convertToLevels(repetitionLevels, definitionLevels []byte) conversionFunc {\n\treturn func(column []Value) error {\n\t\tfor i := range column {\n\t\t\tr := column[i].repetitionLevel\n\t\t\td := column[i].definitionLevel\n\t\t\tcolumn[i].repetitionLevel = repetitionLevels[r]\n\t\t\tcolumn[i].definitionLevel = definitionLevels[d]\n\t\t}\n\t\treturn nil\n\t}\n}\n\n//go:noinline\nfunc multiConversionFunc(conversions []conversionFunc) conversionFunc {\n\tswitch len(conversions) {\n\tcase 0:\n\t\treturn convertToSelf\n\tcase 1:\n\t\treturn conversions[0]\n\tdefault:\n\t\treturn func(column []Value) error {\n\t\t\tfor _, conv := range conversions {\n\t\t\t\tif err := conv(column); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn nil\n\t\t}\n\t}\n}\n\nfunc (c *conversion) getBuffer() *conversionBuffer {\n\tb, _ := c.buffers.Get().(*conversionBuffer)\n\tif b == nil {\n\t\tb = &conversionBuffer{\n\t\t\tcolumns: make([][]Value, 
c.numberOfSourceColumns),\n\t\t}\n\t\tvalues := make([]Value, c.numberOfSourceColumns)\n\t\tfor i := range b.columns {\n\t\t\tb.columns[i] = values[i : i : i+1]\n\t\t}\n\t}\n\treturn b\n}\n\nfunc (c *conversion) putBuffer(b *conversionBuffer) {\n\tc.buffers.Put(b)\n}\n\n// Convert here satisfies the Conversion interface, and does the actual work\n// to convert between the source and target Rows.\nfunc (c *conversion) Convert(rows []Row) (int, error) {\n\tsource := c.getBuffer()\n\tdefer c.putBuffer(source)\n\n\tfor n, row := range rows {\n\t\tfor i, values := range source.columns {\n\t\t\tsource.columns[i] = values[:0]\n\t\t}\n\t\trow.Range(func(columnIndex int, columnValues []Value) bool {\n\t\t\tsource.columns[columnIndex] = append(source.columns[columnIndex], columnValues...)\n\t\t\treturn true\n\t\t})\n\t\trow = row[:0]\n\n\t\tfor columnIndex, conv := range c.columns {\n\t\t\tcolumnOffset := len(row)\n\t\t\tif conv.sourceIndex < 0 {\n\t\t\t\t// When there is no source column, we put a single value as\n\t\t\t\t// placeholder in the column. 
This is a condition where the\n\t\t\t\t// target contained a column which did not exist at had not\n\t\t\t\t// other columns existing at that same level.\n\t\t\t\trow = append(row, Value{})\n\t\t\t} else {\n\t\t\t\t// We must copy to the output row first and not mutate the\n\t\t\t\t// source columns because multiple target columns may map to\n\t\t\t\t// the same source column.\n\t\t\t\trow = append(row, source.columns[conv.sourceIndex]...)\n\t\t\t}\n\t\t\tcolumnValues := row[columnOffset:]\n\n\t\t\tif err := conv.convertValues(columnValues); err != nil {\n\t\t\t\treturn n, err\n\t\t\t}\n\n\t\t\t// Since the column index may have changed between the source and\n\t\t\t// taget columns we ensure that the right value is always written\n\t\t\t// to the output row.\n\t\t\tfor i := range columnValues {\n\t\t\t\tcolumnValues[i].columnIndex = ^int16(columnIndex)\n\t\t\t}\n\t\t}\n\n\t\trows[n] = row\n\t}\n\n\treturn len(rows), nil\n}\n\nfunc (c *conversion) Column(i int) int {\n\treturn c.columns[i].sourceIndex\n}\n\nfunc (c *conversion) Schema() *Schema {\n\treturn c.schema\n}\n\ntype identity struct{ schema *Schema }\n\nfunc (id identity) Convert(rows []Row) (int, error) { return len(rows), nil }\nfunc (id identity) Column(i int) int                { return i }\nfunc (id identity) Schema() *Schema                 { return id.schema }\n\n// Convert constructs a conversion function from one parquet schema to another.\n//\n// The function supports converting between schemas where the source or target\n// have extra columns; if there are more columns in the source, they will be\n// stripped out of the rows. 
Extra columns in the target schema will be set to\n// null or zero values.\n//\n// The returned function is intended to be used to append the converted source\n// row to the destination buffer.\nfunc Convert(to, from Node) (conv Conversion, err error) {\n\tschema, _ := to.(*Schema)\n\tif schema == nil {\n\t\tschema = NewSchema(\"\", to)\n\t}\n\n\tif nodesAreEqual(to, from) {\n\t\treturn identity{schema}, nil\n\t}\n\n\ttargetMapping, targetColumns := columnMappingOf(to)\n\tsourceMapping, sourceColumns := columnMappingOf(from)\n\tcolumns := make([]conversionColumn, len(targetColumns))\n\n\tfor i, path := range targetColumns {\n\t\ttargetColumn := targetMapping.lookup(path)\n\t\tsourceColumn := sourceMapping.lookup(path)\n\n\t\tconversions := []conversionFunc{}\n\t\tif sourceColumn.node != nil {\n\t\t\ttargetType := targetColumn.node.Type()\n\t\t\tsourceType := sourceColumn.node.Type()\n\t\t\tif !typesAreEqual(targetType, sourceType) {\n\t\t\t\tconversions = append(conversions,\n\t\t\t\t\tconvertToType(targetType, sourceType),\n\t\t\t\t)\n\t\t\t}\n\n\t\t\trepetitionLevels := make([]byte, len(path)+1)\n\t\t\tdefinitionLevels := make([]byte, len(path)+1)\n\t\t\ttargetRepetitionLevel := byte(0)\n\t\t\ttargetDefinitionLevel := byte(0)\n\t\t\tsourceRepetitionLevel := byte(0)\n\t\t\tsourceDefinitionLevel := byte(0)\n\t\t\ttargetNode := to\n\t\t\tsourceNode := from\n\n\t\t\tfor j := 0; j < len(path); j++ {\n\t\t\t\ttargetNode = fieldByName(targetNode, path[j])\n\t\t\t\tsourceNode = fieldByName(sourceNode, path[j])\n\n\t\t\t\ttargetRepetitionLevel, targetDefinitionLevel = applyFieldRepetitionType(\n\t\t\t\t\tfieldRepetitionTypeOf(targetNode),\n\t\t\t\t\ttargetRepetitionLevel,\n\t\t\t\t\ttargetDefinitionLevel,\n\t\t\t\t)\n\t\t\t\tsourceRepetitionLevel, sourceDefinitionLevel = applyFieldRepetitionType(\n\t\t\t\t\tfieldRepetitionTypeOf(sourceNode),\n\t\t\t\t\tsourceRepetitionLevel,\n\t\t\t\t\tsourceDefinitionLevel,\n\t\t\t\t)\n\n\t\t\t\trepetitionLevels[sourceRepetitionLevel] = 
targetRepetitionLevel\n\t\t\t\tdefinitionLevels[sourceDefinitionLevel] = targetDefinitionLevel\n\t\t\t}\n\n\t\t\trepetitionLevels = repetitionLevels[:sourceRepetitionLevel+1]\n\t\t\tdefinitionLevels = definitionLevels[:sourceDefinitionLevel+1]\n\n\t\t\tif !isDirectLevelMapping(repetitionLevels) || !isDirectLevelMapping(definitionLevels) {\n\t\t\t\tconversions = append(conversions,\n\t\t\t\t\tconvertToLevels(repetitionLevels, definitionLevels),\n\t\t\t\t)\n\t\t\t}\n\n\t\t} else {\n\t\t\ttargetType := targetColumn.node.Type()\n\t\t\ttargetKind := targetType.Kind()\n\t\t\tsourceColumn = sourceMapping.lookupClosest(path)\n\t\t\tif sourceColumn.node != nil {\n\t\t\t\tconversions = append(conversions,\n\t\t\t\t\tconvertToZero(targetKind),\n\t\t\t\t)\n\t\t\t} else {\n\t\t\t\tconversions = append(conversions,\n\t\t\t\t\tconvertToValue(ZeroValue(targetKind)),\n\t\t\t\t)\n\t\t\t}\n\t\t}\n\n\t\tcolumns[i] = conversionColumn{\n\t\t\tsourceIndex:   int(sourceColumn.columnIndex),\n\t\t\tconvertValues: multiConversionFunc(conversions),\n\t\t}\n\t}\n\n\tc := &conversion{\n\t\tcolumns:               columns,\n\t\tschema:                schema,\n\t\tnumberOfSourceColumns: len(sourceColumns),\n\t}\n\treturn c, nil\n}\n\nfunc isDirectLevelMapping(levels []byte) bool {\n\tfor i, level := range levels {\n\t\tif level != byte(i) {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\n// ConvertRowGroup constructs a wrapper of the given row group which applies\n// the given schema conversion to its rows.\nfunc ConvertRowGroup(rowGroup RowGroup, conv Conversion) RowGroup {\n\tschema := conv.Schema()\n\tnumRows := rowGroup.NumRows()\n\trowGroupColumns := rowGroup.ColumnChunks()\n\n\tcolumns := make([]ColumnChunk, numLeafColumnsOf(schema))\n\tforEachLeafColumnOf(schema, func(leaf leafColumn) {\n\t\ti := leaf.columnIndex\n\t\tj := conv.Column(int(leaf.columnIndex))\n\t\tif j < 0 {\n\t\t\tcolumns[i] = &missingColumnChunk{\n\t\t\t\ttyp:    leaf.node.Type(),\n\t\t\t\tcolumn: i,\n\t\t\t\t// TODO: 
we assume the number of values is the same as the\n\t\t\t\t// number of rows, which may not be accurate when the column is\n\t\t\t\t// part of a repeated group; neighbor columns may be repeated in\n\t\t\t\t// which case it would be impossible for this chunk not to be.\n\t\t\t\tnumRows:   numRows,\n\t\t\t\tnumValues: numRows,\n\t\t\t\tnumNulls:  numRows,\n\t\t\t}\n\t\t} else {\n\t\t\tcolumns[i] = rowGroupColumns[j]\n\t\t}\n\t})\n\n\t// Sorting columns must exist on the conversion schema in order to be\n\t// advertised on the converted row group otherwise the resulting rows\n\t// would not be in the right order.\n\tsorting := []SortingColumn{}\n\tfor _, col := range rowGroup.SortingColumns() {\n\t\tif !hasColumnPath(schema, col.Path()) {\n\t\t\tbreak\n\t\t}\n\t\tsorting = append(sorting, col)\n\t}\n\n\treturn &convertedRowGroup{\n\t\t// The pair of rowGroup+conv is retained to construct a converted row\n\t\t// reader by wrapping the underlying row reader of the row group because\n\t\t// it allows proper reconstruction of the repetition and definition\n\t\t// levels.\n\t\t//\n\t\t// TODO: can we figure out how to set the repetition and definition\n\t\t// levels when reading values from missing column pages? At first sight\n\t\t// it appears complex to do, however:\n\t\t//\n\t\t// * It is possible that having these levels when reading values of\n\t\t//   missing column pages is not necessary in some scenarios (e.g. when\n\t\t//   merging row groups).\n\t\t//\n\t\t// * We may be able to assume the repetition and definition levels at\n\t\t//   the call site (e.g. 
in the functions reading rows from columns).\n\t\t//\n\t\t// Columns of the source row group which do not exist in the target are\n\t\t// masked to prevent loading unneeded pages when reading rows from the\n\t\t// converted row group.\n\t\trowGroup: maskMissingRowGroupColumns(rowGroup, len(columns), conv),\n\t\tcolumns:  columns,\n\t\tsorting:  sorting,\n\t\tconv:     conv,\n\t}\n}\n\nfunc maskMissingRowGroupColumns(r RowGroup, numColumns int, conv Conversion) RowGroup {\n\trowGroupColumns := r.ColumnChunks()\n\tcolumns := make([]ColumnChunk, len(rowGroupColumns))\n\tmissing := make([]missingColumnChunk, len(columns))\n\tnumRows := r.NumRows()\n\n\tfor i := range missing {\n\t\tmissing[i] = missingColumnChunk{\n\t\t\ttyp:       rowGroupColumns[i].Type(),\n\t\t\tcolumn:    int16(i),\n\t\t\tnumRows:   numRows,\n\t\t\tnumValues: numRows,\n\t\t\tnumNulls:  numRows,\n\t\t}\n\t}\n\n\tfor i := range columns {\n\t\tcolumns[i] = &missing[i]\n\t}\n\n\tfor i := 0; i < numColumns; i++ {\n\t\tj := conv.Column(i)\n\t\tif j >= 0 && j < len(columns) {\n\t\t\tcolumns[j] = rowGroupColumns[j]\n\t\t}\n\t}\n\n\treturn &rowGroup{\n\t\tschema:  r.Schema(),\n\t\tnumRows: numRows,\n\t\tcolumns: columns,\n\t}\n}\n\ntype missingColumnChunk struct {\n\ttyp       Type\n\tcolumn    int16\n\tnumRows   int64\n\tnumValues int64\n\tnumNulls  int64\n}\n\nfunc (c *missingColumnChunk) Type() Type               { return c.typ }\nfunc (c *missingColumnChunk) Column() int              { return int(c.column) }\nfunc (c *missingColumnChunk) Pages() Pages             { return onePage(missingPage{c}) }\nfunc (c *missingColumnChunk) ColumnIndex() ColumnIndex { return missingColumnIndex{c} }\nfunc (c *missingColumnChunk) OffsetIndex() OffsetIndex { return missingOffsetIndex{} }\nfunc (c *missingColumnChunk) BloomFilter() BloomFilter { return missingBloomFilter{} }\nfunc (c *missingColumnChunk) NumValues() int64         { return 0 }\n\ntype missingColumnIndex struct{ *missingColumnChunk }\n\nfunc (i 
missingColumnIndex) NumPages() int       { return 1 }\nfunc (i missingColumnIndex) NullCount(int) int64 { return i.numNulls }\nfunc (i missingColumnIndex) NullPage(int) bool   { return true }\nfunc (i missingColumnIndex) MinValue(int) Value  { return Value{} }\nfunc (i missingColumnIndex) MaxValue(int) Value  { return Value{} }\nfunc (i missingColumnIndex) IsAscending() bool   { return true }\nfunc (i missingColumnIndex) IsDescending() bool  { return false }\n\ntype missingOffsetIndex struct{}\n\nfunc (missingOffsetIndex) NumPages() int                { return 1 }\nfunc (missingOffsetIndex) Offset(int) int64             { return 0 }\nfunc (missingOffsetIndex) CompressedPageSize(int) int64 { return 0 }\nfunc (missingOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype missingBloomFilter struct{}\n\nfunc (missingBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, io.EOF }\nfunc (missingBloomFilter) Size() int64                       { return 0 }\nfunc (missingBloomFilter) Check(Value) (bool, error)         { return false, nil }\n\ntype missingPage struct{ *missingColumnChunk }\n\nfunc (p missingPage) Column() int                       { return int(p.column) }\nfunc (p missingPage) Dictionary() Dictionary            { return nil }\nfunc (p missingPage) NumRows() int64                    { return p.numRows }\nfunc (p missingPage) NumValues() int64                  { return p.numValues }\nfunc (p missingPage) NumNulls() int64                   { return p.numNulls }\nfunc (p missingPage) Bounds() (min, max Value, ok bool) { return }\nfunc (p missingPage) Slice(i, j int64) Page             { return p }\nfunc (p missingPage) Size() int64                       { return 0 }\nfunc (p missingPage) RepetitionLevels() []byte          { return nil }\nfunc (p missingPage) DefinitionLevels() []byte          { return nil }\nfunc (p missingPage) Data() encoding.Values             { return p.typ.NewValues(nil, nil) }\nfunc (p missingPage) Values() ValueReader        
       { return &missingPageValues{page: p} }\n\ntype missingPageValues struct {\n\tpage missingPage\n\tread int64\n}\n\nfunc (r *missingPageValues) ReadValues(values []Value) (int, error) {\n\tremain := r.page.numValues - r.read\n\tif int64(len(values)) > remain {\n\t\tvalues = values[:remain]\n\t}\n\tfor i := range values {\n\t\t// TODO: how do we set the repetition and definition levels here?\n\t\tvalues[i] = Value{columnIndex: ^r.page.column}\n\t}\n\tif r.read += int64(len(values)); r.read == r.page.numValues {\n\t\treturn len(values), io.EOF\n\t}\n\treturn len(values), nil\n}\n\nfunc (r *missingPageValues) Close() error {\n\tr.read = r.page.numValues\n\treturn nil\n}\n\ntype convertedRowGroup struct {\n\trowGroup RowGroup\n\tcolumns  []ColumnChunk\n\tsorting  []SortingColumn\n\tconv     Conversion\n}\n\nfunc (c *convertedRowGroup) NumRows() int64                  { return c.rowGroup.NumRows() }\nfunc (c *convertedRowGroup) ColumnChunks() []ColumnChunk     { return c.columns }\nfunc (c *convertedRowGroup) Schema() *Schema                 { return c.conv.Schema() }\nfunc (c *convertedRowGroup) SortingColumns() []SortingColumn { return c.sorting }\nfunc (c *convertedRowGroup) Rows() Rows {\n\trows := c.rowGroup.Rows()\n\treturn &convertedRows{\n\t\tCloser: rows,\n\t\trows:   rows,\n\t\tconv:   c.conv,\n\t}\n}\n\n// ConvertRowReader constructs a wrapper of the given row reader which applies\n// the given schema conversion to the rows.\nfunc ConvertRowReader(rows RowReader, conv Conversion) RowReaderWithSchema {\n\treturn &convertedRows{rows: &forwardRowSeeker{rows: rows}, conv: conv}\n}\n\ntype convertedRows struct {\n\tio.Closer\n\trows RowReadSeeker\n\tconv Conversion\n}\n\nfunc (c *convertedRows) ReadRows(rows []Row) (int, error) {\n\tn, err := c.rows.ReadRows(rows)\n\tif n > 0 {\n\t\tvar convErr error\n\t\tn, convErr = c.conv.Convert(rows[:n])\n\t\tif convErr != nil {\n\t\t\terr = convErr\n\t\t}\n\t}\n\treturn n, err\n}\n\nfunc (c *convertedRows) Schema() 
*Schema {\n\treturn c.conv.Schema()\n}\n\nfunc (c *convertedRows) SeekToRow(rowIndex int64) error {\n\treturn c.rows.SeekToRow(rowIndex)\n}\n\nvar (\n\ttrueBytes  = []byte(`true`)\n\tfalseBytes = []byte(`false`)\n\tunixEpoch  = time.Date(1970, time.January, 1, 0, 0, 0, 0, time.UTC)\n)\n\nfunc convertBooleanToInt32(v Value) (Value, error) {\n\treturn v.convertToInt32(int32(v.byte())), nil\n}\n\nfunc convertBooleanToInt64(v Value) (Value, error) {\n\treturn v.convertToInt64(int64(v.byte())), nil\n}\n\nfunc convertBooleanToInt96(v Value) (Value, error) {\n\treturn v.convertToInt96(deprecated.Int96{0: uint32(v.byte())}), nil\n}\n\nfunc convertBooleanToFloat(v Value) (Value, error) {\n\treturn v.convertToFloat(float32(v.byte())), nil\n}\n\nfunc convertBooleanToDouble(v Value) (Value, error) {\n\treturn v.convertToDouble(float64(v.byte())), nil\n}\n\nfunc convertBooleanToByteArray(v Value) (Value, error) {\n\treturn v.convertToByteArray([]byte{v.byte()}), nil\n}\n\nfunc convertBooleanToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := []byte{v.byte()}\n\tc := make([]byte, size)\n\tcopy(c, b)\n\treturn v.convertToFixedLenByteArray(c), nil\n}\n\nfunc convertBooleanToString(v Value) (Value, error) {\n\tb := ([]byte)(nil)\n\tif v.boolean() {\n\t\tb = trueBytes\n\t} else {\n\t\tb = falseBytes\n\t}\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertInt32ToBoolean(v Value) (Value, error) {\n\treturn v.convertToBoolean(v.int32() != 0), nil\n}\n\nfunc convertInt32ToInt64(v Value) (Value, error) {\n\treturn v.convertToInt64(int64(v.int32())), nil\n}\n\nfunc convertInt32ToInt96(v Value) (Value, error) {\n\treturn v.convertToInt96(deprecated.Int32ToInt96(v.int32())), nil\n}\n\nfunc convertInt32ToFloat(v Value) (Value, error) {\n\treturn v.convertToFloat(float32(v.int32())), nil\n}\n\nfunc convertInt32ToDouble(v Value) (Value, error) {\n\treturn v.convertToDouble(float64(v.int32())), nil\n}\n\nfunc convertInt32ToByteArray(v Value) (Value, error) {\n\tb := 
make([]byte, 4)\n\tbinary.LittleEndian.PutUint32(b, v.uint32())\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertInt32ToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := make([]byte, 4)\n\tc := make([]byte, size)\n\tbinary.LittleEndian.PutUint32(b, v.uint32())\n\tcopy(c, b)\n\treturn v.convertToFixedLenByteArray(c), nil\n}\n\nfunc convertInt32ToString(v Value) (Value, error) {\n\treturn v.convertToByteArray(strconv.AppendInt(nil, int64(v.int32()), 10)), nil\n}\n\nfunc convertInt64ToBoolean(v Value) (Value, error) {\n\treturn v.convertToBoolean(v.int64() != 0), nil\n}\n\nfunc convertInt64ToInt32(v Value) (Value, error) {\n\treturn v.convertToInt32(int32(v.int64())), nil\n}\n\nfunc convertInt64ToInt96(v Value) (Value, error) {\n\treturn v.convertToInt96(deprecated.Int64ToInt96(v.int64())), nil\n}\n\nfunc convertInt64ToFloat(v Value) (Value, error) {\n\treturn v.convertToFloat(float32(v.int64())), nil\n}\n\nfunc convertInt64ToDouble(v Value) (Value, error) {\n\treturn v.convertToDouble(float64(v.int64())), nil\n}\n\nfunc convertInt64ToByteArray(v Value) (Value, error) {\n\tb := make([]byte, 8)\n\tbinary.LittleEndian.PutUint64(b, v.uint64())\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertInt64ToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := make([]byte, 8)\n\tc := make([]byte, size)\n\tbinary.LittleEndian.PutUint64(b, v.uint64())\n\tcopy(c, b)\n\treturn v.convertToFixedLenByteArray(c), nil\n}\n\nfunc convertInt64ToString(v Value) (Value, error) {\n\treturn v.convertToByteArray(strconv.AppendInt(nil, v.int64(), 10)), nil\n}\n\nfunc convertInt96ToBoolean(v Value) (Value, error) {\n\treturn v.convertToBoolean(!v.int96().IsZero()), nil\n}\n\nfunc convertInt96ToInt32(v Value) (Value, error) {\n\treturn v.convertToInt32(v.int96().Int32()), nil\n}\n\nfunc convertInt96ToInt64(v Value) (Value, error) {\n\treturn v.convertToInt64(v.int96().Int64()), nil\n}\n\nfunc convertInt96ToFloat(v Value) (Value, error) {\n\treturn v, 
invalidConversion(v, \"INT96\", \"FLOAT\")\n}\n\nfunc convertInt96ToDouble(v Value) (Value, error) {\n\treturn v, invalidConversion(v, \"INT96\", \"DOUBLE\")\n}\n\nfunc convertInt96ToByteArray(v Value) (Value, error) {\n\treturn v.convertToByteArray(v.byteArray()), nil\n}\n\nfunc convertInt96ToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := v.byteArray()\n\tif len(b) < size {\n\t\tc := make([]byte, size)\n\t\tcopy(c, b)\n\t\tb = c\n\t} else {\n\t\tb = b[:size]\n\t}\n\treturn v.convertToFixedLenByteArray(b), nil\n}\n\nfunc convertInt96ToString(v Value) (Value, error) {\n\treturn v.convertToByteArray([]byte(v.String())), nil\n}\n\nfunc convertFloatToBoolean(v Value) (Value, error) {\n\treturn v.convertToBoolean(v.float() != 0), nil\n}\n\nfunc convertFloatToInt32(v Value) (Value, error) {\n\treturn v.convertToInt32(int32(v.float())), nil\n}\n\nfunc convertFloatToInt64(v Value) (Value, error) {\n\treturn v.convertToInt64(int64(v.float())), nil\n}\n\nfunc convertFloatToInt96(v Value) (Value, error) {\n\treturn v, invalidConversion(v, \"FLOAT\", \"INT96\")\n}\n\nfunc convertFloatToDouble(v Value) (Value, error) {\n\treturn v.convertToDouble(float64(v.float())), nil\n}\n\nfunc convertFloatToByteArray(v Value) (Value, error) {\n\tb := make([]byte, 4)\n\tbinary.LittleEndian.PutUint32(b, v.uint32())\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertFloatToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := make([]byte, 4)\n\tc := make([]byte, size)\n\tbinary.LittleEndian.PutUint32(b, v.uint32())\n\tcopy(c, b)\n\treturn v.convertToFixedLenByteArray(c), nil\n}\n\nfunc convertFloatToString(v Value) (Value, error) {\n\treturn v.convertToByteArray(strconv.AppendFloat(nil, float64(v.float()), 'g', -1, 32)), nil\n}\n\nfunc convertDoubleToBoolean(v Value) (Value, error) {\n\treturn v.convertToBoolean(v.double() != 0), nil\n}\n\nfunc convertDoubleToInt32(v Value) (Value, error) {\n\treturn v.convertToInt32(int32(v.double())), nil\n}\n\nfunc 
convertDoubleToInt64(v Value) (Value, error) {\n\treturn v.convertToInt64(int64(v.double())), nil\n}\n\nfunc convertDoubleToInt96(v Value) (Value, error) {\n\treturn v, invalidConversion(v, \"DOUBLE\", \"INT96\")\n}\n\nfunc convertDoubleToFloat(v Value) (Value, error) {\n\treturn v.convertToFloat(float32(v.double())), nil\n}\n\nfunc convertDoubleToByteArray(v Value) (Value, error) {\n\tb := make([]byte, 8)\n\tbinary.LittleEndian.PutUint64(b, v.uint64())\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertDoubleToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := make([]byte, 8)\n\tc := make([]byte, size)\n\tbinary.LittleEndian.PutUint64(b, v.uint64())\n\tcopy(c, b)\n\treturn v.convertToFixedLenByteArray(c), nil\n}\n\nfunc convertDoubleToString(v Value) (Value, error) {\n\treturn v.convertToByteArray(strconv.AppendFloat(nil, v.double(), 'g', -1, 64)), nil\n}\n\nfunc convertByteArrayToBoolean(v Value) (Value, error) {\n\treturn v.convertToBoolean(!isZero(v.byteArray())), nil\n}\n\nfunc convertByteArrayToInt32(v Value) (Value, error) {\n\tb := make([]byte, 4)\n\tcopy(b, v.byteArray())\n\treturn v.convertToInt32(int32(binary.LittleEndian.Uint32(b))), nil\n}\n\nfunc convertByteArrayToInt64(v Value) (Value, error) {\n\tb := make([]byte, 8)\n\tcopy(b, v.byteArray())\n\treturn v.convertToInt64(int64(binary.LittleEndian.Uint64(b))), nil\n}\n\nfunc convertByteArrayToInt96(v Value) (Value, error) {\n\tb := make([]byte, 12)\n\tcopy(b, v.byteArray())\n\treturn v.convertToInt96(deprecated.Int96{\n\t\t0: binary.LittleEndian.Uint32(b[0:4]),\n\t\t1: binary.LittleEndian.Uint32(b[4:8]),\n\t\t2: binary.LittleEndian.Uint32(b[8:12]),\n\t}), nil\n}\n\nfunc convertByteArrayToFloat(v Value) (Value, error) {\n\tb := make([]byte, 4)\n\tcopy(b, v.byteArray())\n\treturn v.convertToFloat(math.Float32frombits(binary.LittleEndian.Uint32(b))), nil\n}\n\nfunc convertByteArrayToDouble(v Value) (Value, error) {\n\tb := make([]byte, 8)\n\tcopy(b, v.byteArray())\n\treturn 
v.convertToDouble(math.Float64frombits(binary.LittleEndian.Uint64(b))), nil\n}\n\nfunc convertByteArrayToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := v.byteArray()\n\tif len(b) < size {\n\t\tc := make([]byte, size)\n\t\tcopy(c, b)\n\t\tb = c\n\t} else {\n\t\tb = b[:size]\n\t}\n\treturn v.convertToFixedLenByteArray(b), nil\n}\n\nfunc convertFixedLenByteArrayToString(v Value) (Value, error) {\n\tb := v.byteArray()\n\tc := make([]byte, hex.EncodedLen(len(b)))\n\thex.Encode(c, b)\n\treturn v.convertToByteArray(c), nil\n}\n\nfunc convertStringToBoolean(v Value) (Value, error) {\n\tb, err := strconv.ParseBool(v.string())\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"BOOLEAN\", err)\n\t}\n\treturn v.convertToBoolean(b), nil\n}\n\nfunc convertStringToInt32(v Value) (Value, error) {\n\ti, err := strconv.ParseInt(v.string(), 10, 32)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"INT32\", err)\n\t}\n\treturn v.convertToInt32(int32(i)), nil\n}\n\nfunc convertStringToInt64(v Value) (Value, error) {\n\ti, err := strconv.ParseInt(v.string(), 10, 64)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"INT64\", err)\n\t}\n\treturn v.convertToInt64(i), nil\n}\n\nfunc convertStringToInt96(v Value) (Value, error) {\n\ti, ok := new(big.Int).SetString(v.string(), 10)\n\tif !ok {\n\t\treturn v, conversionError(v, \"STRING\", \"INT96\", strconv.ErrSyntax)\n\t}\n\tb := i.Bytes()\n\tc := make([]byte, 12)\n\tcopy(c, b)\n\ti96 := deprecated.BytesToInt96(c)\n\treturn v.convertToInt96(i96[0]), nil\n}\n\nfunc convertStringToFloat(v Value) (Value, error) {\n\tf, err := strconv.ParseFloat(v.string(), 32)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"FLOAT\", err)\n\t}\n\treturn v.convertToFloat(float32(f)), nil\n}\n\nfunc convertStringToDouble(v Value) (Value, error) {\n\tf, err := strconv.ParseFloat(v.string(), 64)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"DOUBLE\", 
err)\n\t}\n\treturn v.convertToDouble(f), nil\n}\n\nfunc convertStringToFixedLenByteArray(v Value, size int) (Value, error) {\n\tb := v.byteArray()\n\tc := make([]byte, size)\n\t_, err := hex.Decode(c, b)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"BYTE_ARRAY\", err)\n\t}\n\treturn v.convertToFixedLenByteArray(c), nil\n}\n\nfunc convertStringToDate(v Value, tz *time.Location) (Value, error) {\n\tt, err := time.ParseInLocation(\"2006-01-02\", v.string(), tz)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"DATE\", err)\n\t}\n\td := daysSinceUnixEpoch(t)\n\treturn v.convertToInt32(int32(d)), nil\n}\n\nfunc convertStringToTimeMillis(v Value, tz *time.Location) (Value, error) {\n\tt, err := time.ParseInLocation(\"15:04:05.999\", v.string(), tz)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"TIME\", err)\n\t}\n\tm := nearestMidnightLessThan(t)\n\tmilliseconds := t.Sub(m).Milliseconds()\n\treturn v.convertToInt32(int32(milliseconds)), nil\n}\n\nfunc convertStringToTimeMicros(v Value, tz *time.Location) (Value, error) {\n\tt, err := time.ParseInLocation(\"15:04:05.999999\", v.string(), tz)\n\tif err != nil {\n\t\treturn v, conversionError(v, \"STRING\", \"TIME\", err)\n\t}\n\tm := nearestMidnightLessThan(t)\n\tmicroseconds := t.Sub(m).Microseconds()\n\treturn v.convertToInt64(microseconds), nil\n}\n\nfunc convertDateToTimestamp(v Value, u format.TimeUnit, tz *time.Location) (Value, error) {\n\tt := unixEpoch.AddDate(0, 0, int(v.int32()))\n\td := timeUnitDuration(u)\n\treturn v.convertToInt64(int64(t.In(tz).Sub(unixEpoch) / d)), nil\n}\n\nfunc convertDateToString(v Value) (Value, error) {\n\tt := unixEpoch.AddDate(0, 0, int(v.int32()))\n\tb := t.AppendFormat(make([]byte, 0, 10), \"2006-01-02\")\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertTimeMillisToString(v Value, tz *time.Location) (Value, error) {\n\tt := time.UnixMilli(int64(v.int32())).In(tz)\n\tb := t.AppendFormat(make([]byte, 0, 12), 
\"15:04:05.999\")\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertTimeMicrosToString(v Value, tz *time.Location) (Value, error) {\n\tt := time.UnixMicro(v.int64()).In(tz)\n\tb := t.AppendFormat(make([]byte, 0, 15), \"15:04:05.999999\")\n\treturn v.convertToByteArray(b), nil\n}\n\nfunc convertTimestampToDate(v Value, u format.TimeUnit, tz *time.Location) (Value, error) {\n\tt := timestamp(v, u, tz)\n\td := daysSinceUnixEpoch(t)\n\treturn v.convertToInt32(int32(d)), nil\n}\n\nfunc convertTimestampToTimeMillis(v Value, u format.TimeUnit, sourceZone, targetZone *time.Location) (Value, error) {\n\tt := timestamp(v, u, sourceZone)\n\tm := nearestMidnightLessThan(t)\n\tmilliseconds := t.In(targetZone).Sub(m).Milliseconds()\n\treturn v.convertToInt32(int32(milliseconds)), nil\n}\n\nfunc convertTimestampToTimeMicros(v Value, u format.TimeUnit, sourceZone, targetZone *time.Location) (Value, error) {\n\tt := timestamp(v, u, sourceZone)\n\tm := nearestMidnightLessThan(t)\n\tmicroseconds := t.In(targetZone).Sub(m).Microseconds()\n\treturn v.convertToInt64(int64(microseconds)), nil\n}\n\nfunc convertTimestampToTimestamp(v Value, sourceUnit, targetUnit format.TimeUnit) (Value, error) {\n\tsourceScale := timeUnitDuration(sourceUnit).Nanoseconds()\n\ttargetScale := timeUnitDuration(targetUnit).Nanoseconds()\n\ttargetValue := (v.int64() * sourceScale) / targetScale\n\treturn v.convertToInt64(targetValue), nil\n}\n\nconst nanosecondsPerDay = 24 * 60 * 60 * 1e9\n\nfunc daysSinceUnixEpoch(t time.Time) int {\n\treturn int(t.Sub(unixEpoch).Hours()) / 24\n}\n\nfunc nearestMidnightLessThan(t time.Time) time.Time {\n\ty, m, d := t.Date()\n\treturn time.Date(y, m, d, 0, 0, 0, 0, t.Location())\n}\n\nfunc timestamp(v Value, u format.TimeUnit, tz *time.Location) time.Time {\n\treturn unixEpoch.In(tz).Add(time.Duration(v.int64()) * timeUnitDuration(u))\n}\n\nfunc timeUnitDuration(unit format.TimeUnit) time.Duration {\n\tswitch {\n\tcase unit.Millis != nil:\n\t\treturn 
time.Millisecond\n\tcase unit.Micros != nil:\n\t\treturn time.Microsecond\n\tdefault:\n\t\treturn time.Nanosecond\n\t}\n}\n\nfunc invalidConversion(value Value, from, to string) error {\n\treturn fmt.Errorf(\"%s to %s: %s: %w\", from, to, value, ErrInvalidConversion)\n}\n\nfunc conversionError(value Value, from, to string, err error) error {\n\treturn fmt.Errorf(\"%s to %s: %q: %s: %w\", from, to, value.string(), err, ErrInvalidConversion)\n}\n"
  },
  {
    "path": "convert_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n)\n\ntype AddressBook1 struct {\n\tOwner             string   `parquet:\"owner,zstd\"`\n\tOwnerPhoneNumbers []string `parquet:\"ownerPhoneNumbers,gzip\"`\n}\n\ntype AddressBook2 struct {\n\tOwner             string    `parquet:\"owner,zstd\"`\n\tOwnerPhoneNumbers []string  `parquet:\"ownerPhoneNumbers,gzip\"`\n\tContacts          []Contact `parquet:\"contacts\"`\n\tExtra             string    `parquet:\"extra\"`\n}\n\ntype AddressBook3 struct {\n\tOwner    string     `parquet:\"owner,zstd\"`\n\tContacts []Contact2 `parquet:\"contacts\"`\n}\n\ntype Contact2 struct {\n\tName         string   `parquet:\"name\"`\n\tPhoneNumbers []string `parquet:\"phoneNumbers,zstd\"`\n\tAddresses    []string `parquet:\"addresses,zstd\"`\n}\n\ntype AddressBook4 struct {\n\tOwner    string     `parquet:\"owner,zstd\"`\n\tContacts []Contact2 `parquet:\"contacts\"`\n\tExtra    string     `parquet:\"extra\"`\n}\n\ntype SimpleNumber struct {\n\tNumber *int64 `parquet:\"number,optional\"`\n}\n\ntype SimpleContact struct {\n\tNumbers []SimpleNumber `parquet:\"numbers\"`\n}\n\ntype SimpleAddressBook struct {\n\tName    string\n\tContact SimpleContact\n}\n\ntype SimpleAddressBook2 struct {\n\tName    string\n\tContact SimpleContact\n\tExtra   string\n}\n\ntype ListOfIDs struct {\n\tIDs []uint64\n}\n\nvar conversionTests = [...]struct {\n\tscenario string\n\tfrom     interface{}\n\tto       interface{}\n}{\n\t{\n\t\tscenario: \"convert between rows which have the same schema\",\n\t\tfrom: AddressBook{\n\t\t\tOwner: \"Julien Le Dem\",\n\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\"555 123 4567\",\n\t\t\t\t\"555 666 1337\",\n\t\t\t},\n\t\t\tContacts: []Contact{\n\t\t\t\t{\n\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Chris 
Aniszczyk\",\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t\tto: AddressBook{\n\t\t\tOwner: \"Julien Le Dem\",\n\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\"555 123 4567\",\n\t\t\t\t\"555 666 1337\",\n\t\t\t},\n\t\t\tContacts: []Contact{\n\t\t\t\t{\n\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"missing column\",\n\t\tfrom:     struct{ FirstName, LastName string }{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\tto:       struct{ LastName string }{LastName: \"Skywalker\"},\n\t},\n\n\t{\n\t\tscenario: \"missing optional column\",\n\t\tfrom: struct {\n\t\t\tFirstName *string\n\t\t\tLastName  string\n\t\t}{FirstName: newString(\"Luke\"), LastName: \"Skywalker\"},\n\t\tto: struct{ LastName string }{LastName: \"Skywalker\"},\n\t},\n\n\t{\n\t\tscenario: \"missing repeated column\",\n\t\tfrom: struct {\n\t\t\tID    uint64\n\t\t\tNames []string\n\t\t}{ID: 42, Names: []string{\"me\", \"myself\", \"I\"}},\n\t\tto: struct{ ID uint64 }{ID: 42},\n\t},\n\n\t{\n\t\tscenario: \"extra column\",\n\t\tfrom:     struct{ LastName string }{LastName: \"Skywalker\"},\n\t\tto:       struct{ FirstName, LastName string }{LastName: \"Skywalker\"},\n\t},\n\n\t{\n\t\tscenario: \"extra optional column\",\n\t\tfrom:     struct{ ID uint64 }{ID: 2},\n\t\tto: struct {\n\t\t\tID      uint64\n\t\t\tDetails *struct{ FirstName, LastName string }\n\t\t}{ID: 2, Details: nil},\n\t},\n\n\t{\n\t\tscenario: \"extra repeated column\",\n\t\tfrom:     struct{ ID uint64 }{ID: 1},\n\t\tto: struct {\n\t\t\tID    uint64\n\t\t\tNames []string\n\t\t}{ID: 1, Names: []string{}},\n\t},\n\n\t{\n\t\tscenario: \"extra required column from repeated\",\n\t\tfrom: struct{ ListOfIDs ListOfIDs }{\n\t\t\tListOfIDs: ListOfIDs{IDs: []uint64{0, 1, 2}},\n\t\t},\n\t\tto: struct {\n\t\t\tMainID    uint64\n\t\t\tListOfIDs ListOfIDs\n\t\t}{\n\t\t\tListOfIDs: ListOfIDs{IDs: []uint64{0, 1, 
2}},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"extra fields in repeated group\",\n\t\tfrom: struct{ Books []AddressBook1 }{\n\t\t\tBooks: []AddressBook1{\n\t\t\t\t{\n\t\t\t\t\tOwner:             \"me\",\n\t\t\t\t\tOwnerPhoneNumbers: []string{\"123-456-7890\", \"321-654-0987\"},\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tOwner:             \"you\",\n\t\t\t\t\tOwnerPhoneNumbers: []string{\"000-000-0000\"},\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t\tto: struct{ Books []AddressBook2 }{\n\t\t\tBooks: []AddressBook2{\n\t\t\t\t{\n\t\t\t\t\tOwner:             \"me\",\n\t\t\t\t\tOwnerPhoneNumbers: []string{\"123-456-7890\", \"321-654-0987\"},\n\t\t\t\t\tContacts:          []Contact{},\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tOwner:             \"you\",\n\t\t\t\t\tOwnerPhoneNumbers: []string{\"000-000-0000\"},\n\t\t\t\t\tContacts:          []Contact{},\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"extra column on complex struct\",\n\t\tfrom: AddressBook{\n\t\t\tOwner:             \"Julien Le Dem\",\n\t\t\tOwnerPhoneNumbers: []string{},\n\t\t\tContacts: []Contact{\n\t\t\t\t{\n\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t\tto: AddressBook2{\n\t\t\tOwner:             \"Julien Le Dem\",\n\t\t\tOwnerPhoneNumbers: []string{},\n\t\t\tContacts: []Contact{\n\t\t\t\t{\n\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"required to optional leaf\",\n\t\tfrom:     struct{ Name string }{Name: \"Luke\"},\n\t\tto:       struct{ Name *string }{Name: newString(\"Luke\")},\n\t},\n\n\t{\n\t\tscenario: \"required to repeated leaf\",\n\t\tfrom:     struct{ Name string }{Name: \"Luke\"},\n\t\tto:       struct{ Name []string }{Name: []string{\"Luke\"}},\n\t},\n\n\t{\n\t\tscenario: \"optional to required leaf\",\n\t\tfrom:     
struct{ Name *string }{Name: newString(\"Luke\")},\n\t\tto:       struct{ Name string }{Name: \"Luke\"},\n\t},\n\n\t{\n\t\tscenario: \"optional to repeated leaf\",\n\t\tfrom:     struct{ Name *string }{Name: newString(\"Luke\")},\n\t\tto:       struct{ Name []string }{Name: []string{\"Luke\"}},\n\t},\n\n\t{\n\t\tscenario: \"optional to repeated leaf (null)\",\n\t\tfrom:     struct{ Name *string }{Name: nil},\n\t\tto:       struct{ Name []string }{Name: []string{}},\n\t},\n\n\t{\n\t\tscenario: \"repeated to required leaf\",\n\t\tfrom:     struct{ Name []string }{Name: []string{\"Luke\", \"Han\", \"Leia\"}},\n\t\tto:       struct{ Name string }{Name: \"Luke\"},\n\t},\n\n\t{\n\t\tscenario: \"repeated to optional leaf\",\n\t\tfrom:     struct{ Name []string }{Name: []string{\"Luke\", \"Han\", \"Leia\"}},\n\t\tto:       struct{ Name *string }{Name: newString(\"Luke\")},\n\t},\n\n\t{\n\t\tscenario: \"required to optional group\",\n\t\tfrom: struct{ Book AddressBook }{\n\t\t\tBook: AddressBook{\n\t\t\t\tOwner: \"Julien Le Dem\",\n\t\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t\"555 666 1337\",\n\t\t\t\t},\n\t\t\t\tContacts: []Contact{\n\t\t\t\t\t{\n\t\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t\tto: struct{ Book *AddressBook }{\n\t\t\tBook: &AddressBook{\n\t\t\t\tOwner: \"Julien Le Dem\",\n\t\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t\"555 666 1337\",\n\t\t\t\t},\n\t\t\t\tContacts: []Contact{\n\t\t\t\t\t{\n\t\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"required to optional group (empty)\",\n\t\tfrom: struct{ Book AddressBook }{\n\t\t\tBook: AddressBook{},\n\t\t},\n\t\tto: 
struct{ Book *AddressBook }{\n\t\t\tBook: &AddressBook{\n\t\t\t\tOwnerPhoneNumbers: []string{},\n\t\t\t\tContacts:          []Contact{},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"optional to required group (null)\",\n\t\tfrom: struct{ Book *AddressBook }{\n\t\t\tBook: nil,\n\t\t},\n\t\tto: struct{ Book AddressBook }{\n\t\t\tBook: AddressBook{\n\t\t\t\tOwnerPhoneNumbers: []string{},\n\t\t\t\tContacts:          []Contact{},\n\t\t\t},\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"optional to repeated group (null)\",\n\t\tfrom:     struct{ Book *AddressBook }{Book: nil},\n\t\tto:       struct{ Book []AddressBook }{Book: []AddressBook{}},\n\t},\n\n\t{\n\t\tscenario: \"optional to repeated optional group (null)\",\n\t\tfrom:     struct{ Book *AddressBook }{Book: nil},\n\t\tto:       struct{ Book []*AddressBook }{Book: []*AddressBook{}},\n\t},\n\n\t{\n\t\tscenario: \"handle nested repeated elements during conversion\",\n\t\tfrom: AddressBook3{\n\t\t\tOwner: \"Julien Le Dem\",\n\t\t\tContacts: []Contact2{\n\t\t\t\t{\n\t\t\t\t\tName: \"Dmitriy Ryaboy\",\n\t\t\t\t\tPhoneNumbers: []string{\n\t\t\t\t\t\t\"555 987 6543\",\n\t\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t},\n\t\t\t\t\tAddresses: []string{},\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\tPhoneNumbers: []string{\n\t\t\t\t\t\t\"555 345 8129\",\n\t\t\t\t\t},\n\t\t\t\t\tAddresses: []string{\n\t\t\t\t\t\t\"42 Wallaby Way Sydney\",\n\t\t\t\t\t\t\"1 White House Way\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Bob Ross\",\n\t\t\t\t\tPhoneNumbers: []string{\n\t\t\t\t\t\t\"555 198 3628\",\n\t\t\t\t\t},\n\t\t\t\t\tAddresses: []string{\n\t\t\t\t\t\t\"::1\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t\tto: AddressBook4{\n\t\t\tOwner: \"Julien Le Dem\",\n\t\t\tContacts: []Contact2{\n\t\t\t\t{\n\t\t\t\t\tName: \"Dmitriy Ryaboy\",\n\t\t\t\t\tPhoneNumbers: []string{\n\t\t\t\t\t\t\"555 987 6543\",\n\t\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t},\n\t\t\t\t\tAddresses: 
[]string{},\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\tPhoneNumbers: []string{\n\t\t\t\t\t\t\"555 345 8129\",\n\t\t\t\t\t},\n\t\t\t\t\tAddresses: []string{\n\t\t\t\t\t\t\"42 Wallaby Way Sydney\",\n\t\t\t\t\t\t\"1 White House Way\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\tName: \"Bob Ross\",\n\t\t\t\t\tPhoneNumbers: []string{\n\t\t\t\t\t\t\"555 198 3628\",\n\t\t\t\t\t},\n\t\t\t\t\tAddresses: []string{\n\t\t\t\t\t\t\"::1\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t\tExtra: \"\",\n\t\t},\n\t},\n\n\t{\n\t\tscenario: \"handle nested repeated elements during conversion\",\n\t\tfrom: SimpleAddressBook{\n\t\t\tName: \"New Contact\",\n\t\t\tContact: SimpleContact{\n\t\t\t\tNumbers: []SimpleNumber{\n\t\t\t\t\t{\n\t\t\t\t\t\tNumber: nil,\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tNumber: newInt64(1329),\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t\tto: SimpleAddressBook2{\n\t\t\tName: \"New Contact\",\n\t\t\tContact: SimpleContact{\n\t\t\t\tNumbers: []SimpleNumber{\n\t\t\t\t\t{\n\t\t\t\t\t\tNumber: nil,\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tNumber: newInt64(1329),\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t\tExtra: \"\",\n\t\t},\n\t},\n}\n\nfunc TestConvert(t *testing.T) {\n\tfor _, test := range conversionTests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tto := parquet.SchemaOf(test.to)\n\t\t\tfrom := parquet.SchemaOf(test.from)\n\n\t\t\tconv, err := parquet.Convert(to, from)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\trow := from.Deconstruct(nil, test.from)\n\t\t\trowbuf := []parquet.Row{row}\n\t\t\tn, err := conv.Convert(rowbuf)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\t\t\tif n != 1 {\n\t\t\t\tt.Errorf(\"wrong number of rows got converted: want=1 got=%d\", n)\n\t\t\t}\n\t\t\trow = rowbuf[0]\n\n\t\t\tvalue := reflect.New(reflect.TypeOf(test.to))\n\t\t\tif err := to.Reconstruct(value.Interface(), row); err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tvalue = value.Elem()\n\t\t\tif 
!reflect.DeepEqual(value.Interface(), test.to) {\n\t\t\t\tt.Errorf(\"converted value mismatch:\\nwant = %#v\\ngot  = %#v\", test.to, value.Interface())\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc newInt64(i int64) *int64    { return &i }\nfunc newString(s string) *string { return &s }\n\nfunc TestConvertValue(t *testing.T) {\n\tnow := time.Unix(42, 0)\n\tms := now.UnixMilli()\n\tus := now.UnixMicro()\n\tns := now.UnixNano()\n\n\tmsType := parquet.Timestamp(parquet.Millisecond).Type()\n\tmsVal := parquet.ValueOf(ms)\n\tif msVal.Int64() != ms {\n\t\tt.Errorf(\"converted value mismatch:\\nwant = %+v\\ngot  = %+v\", ms, msVal.Int64())\n\t}\n\n\tusType := parquet.Timestamp(parquet.Microsecond).Type()\n\tusVal := parquet.ValueOf(us)\n\tif usVal.Int64() != us {\n\t\tt.Errorf(\"converted value mismatch:\\nwant = %+v\\ngot  = %+v\", us, usVal.Int64())\n\t}\n\n\tnsType := parquet.Timestamp(parquet.Nanosecond).Type()\n\tnsVal := parquet.ValueOf(ns)\n\tif nsVal.Int64() != ns {\n\t\tt.Errorf(\"converted value mismatch:\\nwant = %+v\\ngot  = %+v\", ns, nsVal.Int64())\n\t}\n\n\tvar timestampConversionTests = [...]struct {\n\t\tscenario  string\n\t\tfromType  parquet.Type\n\t\tfromValue parquet.Value\n\t\ttoType    parquet.Type\n\t\ttoValue   parquet.Value\n\t}{\n\t\t{\n\t\t\tscenario:  \"true to boolean\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(true),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to int32\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   parquet.Int32Value(1),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to int64\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(1),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to int96\",\n\t\t\tfromType:  
parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.Int96Type,\n\t\t\ttoValue:   parquet.Int96Value(deprecated.Int96{0: 1}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to float\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(1),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to double\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(1),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to byte array\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.ByteArrayType,\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte{1}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to fixed length byte array\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.FixedLenByteArrayType(4),\n\t\t\ttoValue:   parquet.FixedLenByteArrayValue([]byte{1, 0, 0, 0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"true to string\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(true),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`true`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to boolean\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(false),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to int32\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   parquet.Int32Value(0),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to int64\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    
parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(0),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to int96\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.Int96Type,\n\t\t\ttoValue:   parquet.Int96Value(deprecated.Int96{}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to float\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(0),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to double\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(0),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to byte array\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.ByteArrayType,\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte{0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to fixed length byte array\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.FixedLenByteArrayType(4),\n\t\t\ttoValue:   parquet.FixedLenByteArrayValue([]byte{0, 0, 0, 0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"false to string\",\n\t\t\tfromType:  parquet.BooleanType,\n\t\t\tfromValue: parquet.BooleanValue(false),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`false`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to true\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(10),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(true),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to false\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(0),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(false),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to 
int32\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(42),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   parquet.Int32Value(42),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to int64\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(-21),\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(-21),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to int96\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(123),\n\t\t\ttoType:    parquet.Int96Type,\n\t\t\ttoValue:   parquet.Int96Value(deprecated.Int96{0: 123}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to float\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(9),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(9),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to double\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(100),\n\t\t\ttoType:    parquet.DoubleType,\n\t\t\ttoValue:   parquet.DoubleValue(100),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to byte array\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(1 << 8),\n\t\t\ttoType:    parquet.ByteArrayType,\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte{0, 1, 0, 0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to fixed length byte array\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(1 << 8),\n\t\t\ttoType:    parquet.FixedLenByteArrayType(3),\n\t\t\ttoValue:   parquet.FixedLenByteArrayValue([]byte{0, 1, 0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int32 to string\",\n\t\t\tfromType:  parquet.Int32Type,\n\t\t\tfromValue: parquet.Int32Value(12345),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`12345`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to true\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(10),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   
parquet.BooleanValue(true),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to false\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(0),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(false),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to int32\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(-21),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   parquet.Int32Value(-21),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to int64\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(42),\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(42),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to int96\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(123),\n\t\t\ttoType:    parquet.Int96Type,\n\t\t\ttoValue:   parquet.Int96Value(deprecated.Int96{0: 123}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to float\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(9),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(9),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to double\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(100),\n\t\t\ttoType:    parquet.DoubleType,\n\t\t\ttoValue:   parquet.DoubleValue(100),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to byte array\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(1 << 8),\n\t\t\ttoType:    parquet.ByteArrayType,\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte{0, 1, 0, 0, 0, 0, 0, 0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to fixed length byte array\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: parquet.Int64Value(1 << 8),\n\t\t\ttoType:    parquet.FixedLenByteArrayType(3),\n\t\t\ttoValue:   parquet.FixedLenByteArrayValue([]byte{0, 1, 0}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to string\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: 
parquet.Int64Value(1234567890),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`1234567890`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to true\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(0.1),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(true),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to false\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(0),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(false),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to int32\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(9.9),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   parquet.Int32Value(9),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to int64\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(-1.5),\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(-1),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to float\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(1.234),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(1.234),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to double\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(-0.5),\n\t\t\ttoType:    parquet.DoubleType,\n\t\t\ttoValue:   parquet.DoubleValue(-0.5),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"float to string\",\n\t\t\tfromType:  parquet.FloatType,\n\t\t\tfromValue: parquet.FloatValue(0.125),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`0.125`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to true\",\n\t\t\tfromType:  parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(0.1),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(true),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to false\",\n\t\t\tfromType:  
parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(0),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(false),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to int32\",\n\t\t\tfromType:  parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(9.9),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   parquet.Int32Value(9),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to int64\",\n\t\t\tfromType:  parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(-1.5),\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(-1),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to float\",\n\t\t\tfromType:  parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(1.234),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(1.234),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to double\",\n\t\t\tfromType:  parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(-0.5),\n\t\t\ttoType:    parquet.DoubleType,\n\t\t\ttoValue:   parquet.DoubleValue(-0.5),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"double to string\",\n\t\t\tfromType:  parquet.DoubleType,\n\t\t\tfromValue: parquet.DoubleValue(0.125),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`0.125`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to true\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`true`)),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(true),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to false\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`false`)),\n\t\t\ttoType:    parquet.BooleanType,\n\t\t\ttoValue:   parquet.BooleanValue(false),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to int32\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`-21`)),\n\t\t\ttoType:    parquet.Int32Type,\n\t\t\ttoValue:   
parquet.Int32Value(-21),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to int64\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`42`)),\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(42),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to int96\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`123`)),\n\t\t\ttoType:    parquet.Int96Type,\n\t\t\ttoValue:   parquet.Int96Value(deprecated.Int96{0: 123}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to float\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`-0.5`)),\n\t\t\ttoType:    parquet.FloatType,\n\t\t\ttoValue:   parquet.FloatValue(-0.5),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to double\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`0.5`)),\n\t\t\ttoType:    parquet.DoubleType,\n\t\t\ttoValue:   parquet.DoubleValue(0.5),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to byte array\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`ABC`)),\n\t\t\ttoType:    parquet.ByteArrayType,\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`ABC`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to fixed length byte array\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`99B816772522447EBF76821A7C5ADF65`)),\n\t\t\ttoType:    parquet.FixedLenByteArrayType(16),\n\t\t\ttoValue: parquet.FixedLenByteArrayValue([]byte{\n\t\t\t\t0x99, 0xb8, 0x16, 0x77, 0x25, 0x22, 0x44, 0x7e,\n\t\t\t\t0xbf, 0x76, 0x82, 0x1a, 0x7c, 0x5a, 0xdf, 0x65,\n\t\t\t}),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to string\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`Hello World!`)),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`Hello World!`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  
\"string to date\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`1970-01-03`)),\n\t\t\ttoType:    parquet.Date().Type(),\n\t\t\ttoValue:   parquet.Int32Value(2),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to millisecond time\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`12:34:56.789`)),\n\t\t\ttoType:    parquet.Time(parquet.Millisecond).Type(),\n\t\t\ttoValue:   parquet.Int32Value(45296789),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"string to microsecond time\",\n\t\t\tfromType:  parquet.String().Type(),\n\t\t\tfromValue: parquet.ByteArrayValue([]byte(`12:34:56.789012`)),\n\t\t\ttoType:    parquet.Time(parquet.Microsecond).Type(),\n\t\t\ttoValue:   parquet.Int64Value(45296789012),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"date to millisecond timestamp\",\n\t\t\tfromType:  parquet.Date().Type(),\n\t\t\tfromValue: parquet.Int32Value(19338),\n\t\t\ttoType:    parquet.Timestamp(parquet.Millisecond).Type(),\n\t\t\ttoValue:   parquet.Int64Value(1670803200000),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"date to microsecond timestamp\",\n\t\t\tfromType:  parquet.Date().Type(),\n\t\t\tfromValue: parquet.Int32Value(19338),\n\t\t\ttoType:    parquet.Timestamp(parquet.Microsecond).Type(),\n\t\t\ttoValue:   parquet.Int64Value(1670803200000000),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"date to string\",\n\t\t\tfromType:  parquet.Date().Type(),\n\t\t\tfromValue: parquet.Int32Value(18995),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`2022-01-03`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"millisecond time to string\",\n\t\t\tfromType:  parquet.Time(parquet.Millisecond).Type(),\n\t\t\tfromValue: parquet.Int32Value(45296789),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`12:34:56.789`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"microsecond time to string\",\n\t\t\tfromType:  
parquet.Time(parquet.Microsecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(45296789012),\n\t\t\ttoType:    parquet.String().Type(),\n\t\t\ttoValue:   parquet.ByteArrayValue([]byte(`12:34:56.789012`)),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"millisecond timestamp to date\",\n\t\t\tfromType:  parquet.Timestamp(parquet.Millisecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(1670888613000),\n\t\t\ttoType:    parquet.Date().Type(),\n\t\t\ttoValue:   parquet.Int32Value(19338),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"microsecond timestamp to date\",\n\t\t\tfromType:  parquet.Timestamp(parquet.Microsecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(1670888613000123),\n\t\t\ttoType:    parquet.Date().Type(),\n\t\t\ttoValue:   parquet.Int32Value(19338),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"millisecond timestamp to millisecond time\",\n\t\t\tfromType:  parquet.Timestamp(parquet.Millisecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(1670888613123),\n\t\t\ttoType:    parquet.Time(parquet.Millisecond).Type(),\n\t\t\ttoValue:   parquet.Int32Value(85413123),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"millisecond timestamp to microsecond time\",\n\t\t\tfromType:  parquet.Timestamp(parquet.Millisecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(1670888613123),\n\t\t\ttoType:    parquet.Time(parquet.Microsecond).Type(),\n\t\t\ttoValue:   parquet.Int64Value(85413123000),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"microsecond timestamp to millisecond time\",\n\t\t\tfromType:  parquet.Timestamp(parquet.Microsecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(1670888613123456),\n\t\t\ttoType:    parquet.Time(parquet.Millisecond).Type(),\n\t\t\ttoValue:   parquet.Int32Value(85413123),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"microsecond timestamp to microsecond time\",\n\t\t\tfromType:  parquet.Timestamp(parquet.Microsecond).Type(),\n\t\t\tfromValue: parquet.Int64Value(1670888613123456),\n\t\t\ttoType:    parquet.Time(parquet.Microsecond).Type(),\n\t\t\ttoValue:   
parquet.Int64Value(85413123456),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"micros to nanos\",\n\t\t\tfromType:  usType,\n\t\t\tfromValue: usVal,\n\t\t\ttoType:    nsType,\n\t\t\ttoValue:   parquet.Int64Value(ns),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"millis to nanos\",\n\t\t\tfromType:  msType,\n\t\t\tfromValue: msVal,\n\t\t\ttoType:    nsType,\n\t\t\ttoValue:   parquet.Int64Value(ns),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"nanos to micros\",\n\t\t\tfromType:  nsType,\n\t\t\tfromValue: nsVal,\n\t\t\ttoType:    usType,\n\t\t\ttoValue:   parquet.Int64Value(us),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"nanos to nanos\",\n\t\t\tfromType:  nsType,\n\t\t\tfromValue: nsVal,\n\t\t\ttoType:    nsType,\n\t\t\ttoValue:   parquet.Int64Value(ns),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to nanos\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: nsVal,\n\t\t\ttoType:    nsType,\n\t\t\ttoValue:   parquet.Int64Value(ns),\n\t\t},\n\n\t\t{\n\t\t\tscenario:  \"int64 to int64\",\n\t\t\tfromType:  parquet.Int64Type,\n\t\t\tfromValue: nsVal,\n\t\t\ttoType:    parquet.Int64Type,\n\t\t\ttoValue:   parquet.Int64Value(ns),\n\t\t},\n\t}\n\n\tfor _, test := range timestampConversionTests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\t// Set levels to ensure that they are retained by the conversion.\n\t\t\tfrom := test.fromValue.Level(1, 2, 3)\n\t\t\twant := test.toValue.Level(1, 2, 3)\n\n\t\t\tgot, err := test.toType.ConvertValue(from, test.fromType)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tif !parquet.DeepEqual(want, got) {\n\t\t\t\tt.Errorf(\"converted value mismatch:\\nwant = %+v\\ngot  = %+v\", want, got)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "dedupe.go",
    "content": "package parquet\n\n// DedupeRowReader constructs a row reader which drops duplicated consecutive\n// rows, according to the comparator function passed as argument.\n//\n// If the underlying reader produces a sequence of rows sorted by the same\n// comparison predicate, the output is guaranteed to produce unique rows only.\nfunc DedupeRowReader(reader RowReader, compare func(Row, Row) int) RowReader {\n\treturn &dedupeRowReader{reader: reader, compare: compare}\n}\n\ntype dedupeRowReader struct {\n\treader  RowReader\n\tcompare func(Row, Row) int\n\tdedupe\n}\n\nfunc (d *dedupeRowReader) ReadRows(rows []Row) (int, error) {\n\tfor {\n\t\tn, err := d.reader.ReadRows(rows)\n\t\tn = d.deduplicate(rows[:n], d.compare)\n\n\t\tif n > 0 || err != nil {\n\t\t\treturn n, err\n\t\t}\n\t}\n}\n\n// DedupeRowWriter constructs a row writer which drops duplicated consecutive\n// rows, according to the comparator function passed as argument.\n//\n// If the writer is given a sequence of rows sorted by the same comparison\n// predicate, the output is guaranteed to contain unique rows only.\nfunc DedupeRowWriter(writer RowWriter, compare func(Row, Row) int) RowWriter {\n\treturn &dedupeRowWriter{writer: writer, compare: compare}\n}\n\ntype dedupeRowWriter struct {\n\twriter  RowWriter\n\tcompare func(Row, Row) int\n\tdedupe\n\trows []Row\n}\n\nfunc (d *dedupeRowWriter) WriteRows(rows []Row) (int, error) {\n\t// We need to make a copy because we cannot modify the rows slice received\n\t// as argument to respect the RowWriter contract.\n\td.rows = append(d.rows[:0], rows...)\n\tdefer func() {\n\t\tfor i := range d.rows {\n\t\t\td.rows[i] = Row{}\n\t\t}\n\t}()\n\n\tif n := d.deduplicate(d.rows, d.compare); n > 0 {\n\t\tw, err := d.writer.WriteRows(d.rows[:n])\n\t\tif err != nil {\n\t\t\treturn w, err\n\t\t}\n\t}\n\n\t// Return the number of rows received instead of the number of deduplicated\n\t// rows actually written to the underlying writer because we have to 
repsect\n\t// the RowWriter contract.\n\treturn len(rows), nil\n}\n\ntype dedupe struct {\n\talloc   rowAllocator\n\tlastRow Row\n\tuniq    []Row\n\tdupe    []Row\n}\n\nfunc (d *dedupe) reset() {\n\td.alloc.reset()\n\td.lastRow = d.lastRow[:0]\n}\n\nfunc (d *dedupe) deduplicate(rows []Row, compare func(Row, Row) int) int {\n\tdefer func() {\n\t\tfor i := range d.uniq {\n\t\t\td.uniq[i] = Row{}\n\t\t}\n\t\tfor i := range d.dupe {\n\t\t\td.dupe[i] = Row{}\n\t\t}\n\t\td.uniq = d.uniq[:0]\n\t\td.dupe = d.dupe[:0]\n\t}()\n\n\tlastRow := d.lastRow\n\n\tfor _, row := range rows {\n\t\tif len(lastRow) != 0 && compare(row, lastRow) == 0 {\n\t\t\td.dupe = append(d.dupe, row)\n\t\t} else {\n\t\t\tlastRow = row\n\t\t\td.uniq = append(d.uniq, row)\n\t\t}\n\t}\n\n\trows = rows[:0]\n\trows = append(rows, d.uniq...)\n\trows = append(rows, d.dupe...)\n\n\td.alloc.reset()\n\td.alloc.capture(lastRow)\n\td.lastRow = append(d.lastRow[:0], lastRow...)\n\treturn len(d.uniq)\n}\n"
  },
  {
    "path": "dedupe_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestDedupeRowReader(t *testing.T) {\n\ttype Row struct {\n\t\tValue int32 `parquet:\"value\"`\n\t}\n\n\trows := make([]Row, 1000)\n\tfor i := range rows {\n\t\trows[i].Value = int32(i / 3)\n\t}\n\n\tdedupeMap := make(map[Row]struct{}, len(rows))\n\tfor _, row := range rows {\n\t\tdedupeMap[row] = struct{}{}\n\t}\n\n\tdedupeRows := make([]Row, 0, len(dedupeMap))\n\tfor row := range dedupeMap {\n\t\tdedupeRows = append(dedupeRows, row)\n\t}\n\n\tsort.Slice(dedupeRows, func(i, j int) bool {\n\t\treturn dedupeRows[i].Value < dedupeRows[j].Value\n\t})\n\n\tbuffer1 := parquet.NewRowBuffer[Row]()\n\tbuffer1.Write(rows)\n\n\tbuffer1Rows := buffer1.Rows()\n\tdefer buffer1Rows.Close()\n\n\tbuffer2 := parquet.NewRowBuffer[Row]()\n\n\t_, err := parquet.CopyRows(buffer2,\n\t\tparquet.DedupeRowReader(buffer1Rows,\n\t\t\tbuffer1.Schema().Comparator(parquet.Ascending(\"value\")),\n\t\t),\n\t)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\treader := parquet.NewGenericRowGroupReader[Row](buffer2)\n\tdefer reader.Close()\n\n\tn, _ := reader.Read(rows)\n\tassertRowsEqual(t, dedupeRows, rows[:n])\n}\n\nfunc TestDedupeRowWriter(t *testing.T) {\n\ttype Row struct {\n\t\tValue int32 `parquet:\"value\"`\n\t}\n\n\trows := make([]Row, 1000)\n\tfor i := range rows {\n\t\trows[i].Value = int32(i / 3)\n\t}\n\n\tdedupeMap := make(map[Row]struct{}, len(rows))\n\tfor _, row := range rows {\n\t\tdedupeMap[row] = struct{}{}\n\t}\n\n\tdedupeRows := make([]Row, 0, len(dedupeMap))\n\tfor row := range dedupeMap {\n\t\tdedupeRows = append(dedupeRows, row)\n\t}\n\n\tsort.Slice(dedupeRows, func(i, j int) bool {\n\t\treturn dedupeRows[i].Value < dedupeRows[j].Value\n\t})\n\n\tbuffer1 := parquet.NewRowBuffer[Row]()\n\tbuffer1.Write(rows)\n\n\tbuffer1Rows := buffer1.Rows()\n\tdefer buffer1Rows.Close()\n\n\tbuffer2 := parquet.NewRowBuffer[Row]()\n\n\t_, err := 
parquet.CopyRows(\n\t\tparquet.DedupeRowWriter(buffer2,\n\t\t\tbuffer1.Schema().Comparator(parquet.Ascending(\"value\")),\n\t\t),\n\t\tbuffer1Rows,\n\t)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\treader := parquet.NewGenericRowGroupReader[Row](buffer2)\n\tdefer reader.Close()\n\n\tn, _ := reader.Read(rows)\n\tassertRowsEqual(t, dedupeRows, rows[:n])\n}\n"
  },
  {
    "path": "deprecated/int96.go",
    "content": "package deprecated\n\nimport (\n\t\"math/big\"\n\t\"math/bits\"\n\t\"unsafe\"\n)\n\n// Int96 is an implementation of the deprecated INT96 parquet type.\ntype Int96 [3]uint32\n\n// Int32ToInt96 converts a int32 value to a Int96.\nfunc Int32ToInt96(value int32) (i96 Int96) {\n\tif value < 0 {\n\t\ti96[2] = 0xFFFFFFFF\n\t\ti96[1] = 0xFFFFFFFF\n\t}\n\ti96[0] = uint32(value)\n\treturn\n}\n\n// Int64ToInt96 converts a int64 value to Int96.\nfunc Int64ToInt96(value int64) (i96 Int96) {\n\tif value < 0 {\n\t\ti96[2] = 0xFFFFFFFF\n\t}\n\ti96[1] = uint32(value >> 32)\n\ti96[0] = uint32(value)\n\treturn\n}\n\n// IsZero returns true if i is the zero-value.\nfunc (i Int96) IsZero() bool { return i == Int96{} }\n\n// Negative returns true if i is a negative value.\nfunc (i Int96) Negative() bool {\n\treturn (i[2] >> 31) != 0\n}\n\n// Less returns true if i < j.\n//\n// The method implements a signed comparison between the two operands.\nfunc (i Int96) Less(j Int96) bool {\n\tif i.Negative() {\n\t\tif !j.Negative() {\n\t\t\treturn true\n\t\t}\n\t} else {\n\t\tif j.Negative() {\n\t\t\treturn false\n\t\t}\n\t}\n\tfor k := 2; k >= 0; k-- {\n\t\ta, b := i[k], j[k]\n\t\tswitch {\n\t\tcase a < b:\n\t\t\treturn true\n\t\tcase a > b:\n\t\t\treturn false\n\t\t}\n\t}\n\treturn false\n}\n\n// Int converts i to a big.Int representation.\nfunc (i Int96) Int() *big.Int {\n\tz := new(big.Int)\n\tz.Or(z, big.NewInt(int64(i[2])<<32|int64(i[1])))\n\tz.Lsh(z, 32)\n\tz.Or(z, big.NewInt(int64(i[0])))\n\treturn z\n}\n\n// Int32 converts i to a int32, potentially truncating the value.\nfunc (i Int96) Int32() int32 {\n\treturn int32(i[0])\n}\n\n// Int64 converts i to a int64, potentially truncating the value.\nfunc (i Int96) Int64() int64 {\n\treturn int64(i[1])<<32 | int64(i[0])\n}\n\n// String returns a string representation of i.\nfunc (i Int96) String() string {\n\treturn i.Int().String()\n}\n\n// Len returns the minimum length in bits required to store the value of i.\nfunc (i 
Int96) Len() int {\n\tswitch {\n\tcase i[2] != 0:\n\t\treturn 64 + bits.Len32(i[2])\n\tcase i[1] != 0:\n\t\treturn 32 + bits.Len32(i[1])\n\tdefault:\n\t\treturn bits.Len32(i[0])\n\t}\n}\n\n// Int96ToBytes converts the slice of Int96 values to a slice of bytes sharing\n// the same backing array.\nfunc Int96ToBytes(data []Int96) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 12*len(data))\n}\n\n// BytesToInt96 converts the byte slice passed as argument to a slice of Int96\n// sharing the same backing array.\n//\n// When the number of bytes in the input is not a multiple of 12, the function\n// truncates it in the returned slice.\nfunc BytesToInt96(data []byte) []Int96 {\n\treturn unsafe.Slice(*(**Int96)(unsafe.Pointer(&data)), len(data)/12)\n}\n\nfunc MaxLenInt96(data []Int96) int {\n\tmax := 0\n\tfor i := range data {\n\t\tn := data[i].Len()\n\t\tif n > max {\n\t\t\tmax = n\n\t\t}\n\t}\n\treturn max\n}\n\nfunc MinInt96(data []Int96) (min Int96) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tfor _, v := range data[1:] {\n\t\t\tif v.Less(min) {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc MaxInt96(data []Int96) (max Int96) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\t\tfor _, v := range data[1:] {\n\t\t\tif max.Less(v) {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc MinMaxInt96(data []Int96) (min, max Int96) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\t\tfor _, v := range data[1:] {\n\t\t\tif v.Less(min) {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif max.Less(v) {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc OrderOfInt96(data []Int96) int {\n\tif len(data) > 1 {\n\t\tif int96AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif int96AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc int96AreInAscendingOrder(data []Int96) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i].Less(data[i-1]) {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn 
true\n}\n\nfunc int96AreInDescendingOrder(data []Int96) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1].Less(data[i]) {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n"
  },
  {
    "path": "deprecated/int96_test.go",
    "content": "package deprecated_test\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n)\n\nfunc TestInt96Less(t *testing.T) {\n\ttests := []struct {\n\t\ti    deprecated.Int96\n\t\tj    deprecated.Int96\n\t\tless bool\n\t}{\n\t\t{\n\t\t\ti:    deprecated.Int96{},\n\t\t\tj:    deprecated.Int96{},\n\t\t\tless: false,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 1},\n\t\t\tj:    deprecated.Int96{0: 2},\n\t\t\tless: true,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 1},\n\t\t\tj:    deprecated.Int96{1: 1},\n\t\t\tless: true,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 1},\n\t\t\tj:    deprecated.Int96{2: 1},\n\t\t\tless: true,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 0xFFFFFFFF, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -1\n\t\t\tj:    deprecated.Int96{},                                            // 0\n\t\t\tless: true,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{},                                            // 0\n\t\t\tj:    deprecated.Int96{0: 0xFFFFFFFF, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -1\n\t\t\tless: false,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 0xFFFFFFFF, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -1\n\t\t\tj:    deprecated.Int96{0: 0xFFFFFFFF, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -1\n\t\t\tless: false,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 0xFFFFFFFF, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -1\n\t\t\tj:    deprecated.Int96{0: 0xFFFFFFFE, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -2\n\t\t\tless: false,\n\t\t},\n\n\t\t{\n\t\t\ti:    deprecated.Int96{0: 0xFFFFFFFE, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -2\n\t\t\tj:    deprecated.Int96{0: 0xFFFFFFFF, 1: 0xFFFFFFFF, 2: 0xFFFFFFFF}, // -1\n\t\t\tless: true,\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tscenario := \"\"\n\t\tif test.less {\n\t\t\tscenario = fmt.Sprintf(\"%s<%s\", test.i, test.j)\n\t\t} else {\n\t\t\tscenario = fmt.Sprintf(\"%s>=%s\", test.i, test.j)\n\t\t}\n\t\tt.Run(scenario, func(t *testing.T) {\n\t\t\tif test.i.Less(test.j) != 
test.less {\n\t\t\t\tt.Error(\"FAIL\")\n\t\t\t}\n\t\t\tif test.less {\n\t\t\t\tif test.j.Less(test.i) {\n\t\t\t\t\tt.Error(\"FAIL (inverse)\")\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc TestMaxLenInt96(t *testing.T) {\n\tfor _, test := range []struct {\n\t\tdata   []deprecated.Int96\n\t\tmaxlen int\n\t}{\n\t\t{\n\t\t\tdata:   nil,\n\t\t\tmaxlen: 0,\n\t\t},\n\n\t\t{\n\t\t\tdata:   []deprecated.Int96{{}, {}, {}, {}, {}},\n\t\t\tmaxlen: 0,\n\t\t},\n\n\t\t{\n\t\t\tdata:   []deprecated.Int96{{0: 0x01}, {0: 0xFF}, {1: 0x02}, {0: 0xF0}},\n\t\t\tmaxlen: 34,\n\t\t},\n\t} {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tif maxlen := deprecated.MaxLenInt96(test.data); maxlen != test.maxlen {\n\t\t\t\tt.Errorf(\"want=%d got=%d\", test.maxlen, maxlen)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "deprecated/parquet.go",
    "content": "package deprecated\n\n// DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet.\n// ConvertedType is superseded by LogicalType.  This enum should not be extended.\n//\n// See LogicalTypes.md for conversion between ConvertedType and LogicalType.\ntype ConvertedType int32\n\nconst (\n\t// a BYTE_ARRAY actually contains UTF8 encoded chars\n\tUTF8 ConvertedType = 0\n\n\t// a map is converted as an optional field containing a repeated key/value pair\n\tMap ConvertedType = 1\n\n\t// a key/value pair is converted into a group of two fields\n\tMapKeyValue ConvertedType = 2\n\n\t// a list is converted into an optional field containing a repeated field for its\n\t// values\n\tList ConvertedType = 3\n\n\t// an enum is converted into a binary field\n\tEnum ConvertedType = 4\n\n\t// A decimal value.\n\t//\n\t// This may be used to annotate binary or fixed primitive types. The\n\t// underlying byte array stores the unscaled value encoded as two's\n\t// complement using big-endian byte order (the most significant byte is the\n\t// zeroth element). The value of the decimal is the value * 10^{-scale}.\n\t//\n\t// This must be accompanied by a (maximum) precision and a scale in the\n\t// SchemaElement. The precision specifies the number of digits in the decimal\n\t// and the scale stores the location of the decimal point. For example 1.23\n\t// would have precision 3 (3 total digits) and scale 2 (the decimal point is\n\t// 2 digits over).\n\tDecimal ConvertedType = 5\n\n\t// A Date\n\t//\n\t// Stored as days since Unix epoch, encoded as the INT32 physical type.\n\tDate ConvertedType = 6\n\n\t// A time\n\t//\n\t// The total number of milliseconds since midnight.  The value is stored\n\t// as an INT32 physical type.\n\tTimeMillis ConvertedType = 7\n\n\t// A time.\n\t//\n\t// The total number of microseconds since midnight.  
The value is stored as\n\t// an INT64 physical type.\n\tTimeMicros ConvertedType = 8\n\n\t// A date/time combination\n\t//\n\t// Date and time recorded as milliseconds since the Unix epoch.  Recorded as\n\t// a physical type of INT64.\n\tTimestampMillis ConvertedType = 9\n\n\t// A date/time combination\n\t//\n\t// Date and time recorded as microseconds since the Unix epoch.  The value is\n\t// stored as an INT64 physical type.\n\tTimestampMicros ConvertedType = 10\n\n\t// An unsigned integer value.\n\t//\n\t// The number describes the maximum number of meaningful data bits in\n\t// the stored value. 8, 16 and 32 bit values are stored using the\n\t// INT32 physical type.  64 bit values are stored using the INT64\n\t// physical type.\n\tUint8  ConvertedType = 11\n\tUint16 ConvertedType = 12\n\tUint32 ConvertedType = 13\n\tUint64 ConvertedType = 14\n\n\t// A signed integer value.\n\t//\n\t// The number describes the maximum number of meaningful data bits in\n\t// the stored value. 8, 16 and 32 bit values are stored using the\n\t// INT32 physical type.  64 bit values are stored using the INT64\n\t// physical type.\n\tInt8  ConvertedType = 15\n\tInt16 ConvertedType = 16\n\tInt32 ConvertedType = 17\n\tInt64 ConvertedType = 18\n\n\t// An embedded JSON document\n\t//\n\t// A JSON document embedded within a single UTF8 column.\n\tJson ConvertedType = 19\n\n\t// An embedded BSON document\n\t//\n\t// A BSON document embedded within a single BINARY column.\n\tBson ConvertedType = 20\n\n\t// An interval of time\n\t//\n\t// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12\n\t// This data is composed of three separate little endian unsigned\n\t// integers.  Each stores a component of a duration of time.  
The first\n\t// integer identifies the number of months associated with the duration,\n\t// the second identifies the number of days associated with the duration\n\t// and the third identifies the number of milliseconds associated with\n\t// the provided duration.  This duration of time is independent of any\n\t// particular timezone or date.\n\tInterval ConvertedType = 21\n)\n"
  },
  {
    "path": "dictionary.go",
    "content": "package parquet\n\nimport (\n\t\"io\"\n\t\"math/bits\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/hashprobe\"\n\t\"github.com/segmentio/parquet-go/internal/bitpack\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nconst (\n\t// Maximum load of probing tables. This parameter configures the balance\n\t// between memory density and compute time of probing operations. Valid\n\t// values are floating point numbers between 0 and 1.\n\t//\n\t// Smaller values result in lower collision probability when inserting\n\t// values in probing tables, but also increase memory utilization.\n\t//\n\t// TODO: make this configurable by the application?\n\thashprobeTableMaxLoad = 0.85\n\n\t// An estimate of the CPU cache footprint used by insert operations.\n\t//\n\t// This constant is used to determine a useful chunk size depending on the\n\t// size of values being inserted in dictionaries. More values of small size\n\t// can fit in CPU caches, so the inserts can operation on larger chunks.\n\tinsertsTargetCacheFootprint = 8192\n)\n\n// The Dictionary interface represents type-specific implementations of parquet\n// dictionaries.\n//\n// Programs can instantiate dictionaries by call the NewDictionary method of a\n// Type object.\n//\n// The current implementation has a limitation which prevents applications from\n// providing custom versions of this interface because it contains unexported\n// methods. The only way to create Dictionary values is to call the\n// NewDictionary of Type instances. 
This limitation may be lifted in future\n// releases.\ntype Dictionary interface {\n\t// Returns the type that the dictionary was created from.\n\tType() Type\n\n\t// Returns the number of value indexed in the dictionary.\n\tLen() int\n\n\t// Returns the dictionary value at the given index.\n\tIndex(index int32) Value\n\n\t// Inserts values from the second slice to the dictionary and writes the\n\t// indexes at which each value was inserted to the first slice.\n\t//\n\t// The method panics if the length of the indexes slice is smaller than the\n\t// length of the values slice.\n\tInsert(indexes []int32, values []Value)\n\n\t// Given an array of dictionary indexes, lookup the values into the array\n\t// of values passed as second argument.\n\t//\n\t// The method panics if len(indexes) > len(values), or one of the indexes\n\t// is negative or greater than the highest index in the dictionary.\n\tLookup(indexes []int32, values []Value)\n\n\t// Returns the min and max values found in the given indexes.\n\tBounds(indexes []int32) (min, max Value)\n\n\t// Resets the dictionary to its initial state, removing all values.\n\tReset()\n\n\t// Returns a Page representing the content of the dictionary.\n\t//\n\t// The returned page shares the underlying memory of the buffer, it remains\n\t// valid to use until the dictionary's Reset method is called.\n\tPage() Page\n\n\t// See ColumnBuffer.writeValues for details on the use of unexported methods\n\t// on interfaces.\n\tinsert(indexes []int32, rows sparse.Array)\n\t//lookup(indexes []int32, rows sparse.Array)\n}\n\nfunc checkLookupIndexBounds(indexes []int32, rows sparse.Array) {\n\tif rows.Len() < len(indexes) {\n\t\tpanic(\"dictionary lookup with more indexes than values\")\n\t}\n}\n\n// The boolean dictionary always contains two values for true and false.\ntype booleanDictionary struct {\n\tbooleanPage\n\t// There are only two possible values for booleans, false and true.\n\t// Rather than using a Go map, we track the indexes 
of each values\n\t// in an array of two 32 bits integers. When inserting values in the\n\t// dictionary, we ensure that an index exist for each boolean value,\n\t// then use the value 0 or 1 (false or true) to perform a lookup in\n\t// the dictionary's map.\n\ttable [2]int32\n}\n\nfunc newBooleanDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *booleanDictionary {\n\tindexOfFalse, indexOfTrue, values := int32(-1), int32(-1), data.Boolean()\n\n\tfor i := int32(0); i < numValues && indexOfFalse < 0 && indexOfTrue < 0; i += 8 {\n\t\tv := values[i]\n\t\tif v != 0x00 {\n\t\t\tindexOfTrue = i + int32(bits.TrailingZeros8(v))\n\t\t}\n\t\tif v != 0xFF {\n\t\t\tindexOfFalse = i + int32(bits.TrailingZeros8(^v))\n\t\t}\n\t}\n\n\treturn &booleanDictionary{\n\t\tbooleanPage: booleanPage{\n\t\t\ttyp:         typ,\n\t\t\tbits:        values[:bitpack.ByteCount(uint(numValues))],\n\t\t\tnumValues:   numValues,\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t\ttable: [2]int32{\n\t\t\t0: indexOfFalse,\n\t\t\t1: indexOfTrue,\n\t\t},\n\t}\n}\n\nfunc (d *booleanDictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *booleanDictionary) Len() int { return int(d.numValues) }\n\nfunc (d *booleanDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *booleanDictionary) index(i int32) bool { return d.valueAt(int(i)) }\n\nfunc (d *booleanDictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *booleanDictionary) insert(indexes []int32, rows sparse.Array) {\n\t_ = indexes[:rows.Len()]\n\n\tif d.table[0] < 0 {\n\t\td.table[0] = d.numValues\n\t\td.numValues++\n\t\td.bits = plain.AppendBoolean(d.bits, int(d.table[0]), false)\n\t}\n\n\tif d.table[1] < 0 {\n\t\td.table[1] = d.numValues\n\t\td.numValues++\n\t\td.bits = plain.AppendBoolean(d.bits, int(d.table[1]), true)\n\t}\n\n\tvalues := rows.Uint8Array()\n\tdict := 
d.table\n\n\tfor i := 0; i < rows.Len(); i++ {\n\t\tv := values.Index(i) & 1\n\t\tindexes[i] = dict[v]\n\t}\n}\n\nfunc (d *booleanDictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValue(false)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *booleanDictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*bool)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *booleanDictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\thasFalse, hasTrue := false, false\n\n\t\tfor _, i := range indexes {\n\t\t\tv := d.index(i)\n\t\t\tif v {\n\t\t\t\thasTrue = true\n\t\t\t} else {\n\t\t\t\thasFalse = true\n\t\t\t}\n\t\t\tif hasTrue && hasFalse {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\n\t\tmin = d.makeValue(!hasFalse)\n\t\tmax = d.makeValue(hasTrue)\n\t}\n\treturn min, max\n}\n\nfunc (d *booleanDictionary) Reset() {\n\td.bits = d.bits[:0]\n\td.offset = 0\n\td.numValues = 0\n\td.table = [2]int32{-1, -1}\n}\n\nfunc (d *booleanDictionary) Page() Page {\n\treturn &d.booleanPage\n}\n\ntype int32Dictionary struct {\n\tint32Page\n\ttable *hashprobe.Int32Table\n}\n\nfunc newInt32Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *int32Dictionary {\n\treturn &int32Dictionary{\n\t\tint32Page: int32Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Int32()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *int32Dictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *int32Dictionary) Len() int { return len(d.values) }\n\nfunc (d *int32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *int32Dictionary) index(i int32) int32 { return d.values[i] }\n\nfunc (d *int32Dictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, 
unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *int32Dictionary) init(indexes []int32) {\n\td.table = hashprobe.NewInt32Table(len(d.values), hashprobeTableMaxLoad)\n\n\tn := min(len(d.values), len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *int32Dictionary) insert(indexes []int32, rows sparse.Array) {\n\t// Iterating over the input in chunks helps keep relevant data in CPU\n\t// caches when a large number of values are inserted into the dictionary with\n\t// a single method call.\n\t//\n\t// Without this chunking, memory areas from the head of the indexes and\n\t// values arrays end up being evicted from CPU caches as the probing\n\t// operation iterates through the array. The subsequent scan of the indexes\n\t// required to determine which values must be inserted into the page then\n\t// stalls on retrieving data from main memory.\n\t//\n\t// We measured as much as ~37% drop in throughput when disabling the\n\t// chunking, and did not observe any penalties from having it on smaller\n\t// inserts.\n\tconst chunkSize = insertsTargetCacheFootprint / 4\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Int32Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *int32Dictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValue(0)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *int32Dictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = 
d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *int32Dictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil {\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *int32Dictionary) Page() Page {\n\treturn &d.int32Page\n}\n\ntype int64Dictionary struct {\n\tint64Page\n\ttable *hashprobe.Int64Table\n}\n\nfunc newInt64Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *int64Dictionary {\n\treturn &int64Dictionary{\n\t\tint64Page: int64Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Int64()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *int64Dictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *int64Dictionary) Len() int { return len(d.values) }\n\nfunc (d *int64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *int64Dictionary) index(i int32) int64 { return d.values[i] }\n\nfunc (d *int64Dictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *int64Dictionary) init(indexes []int32) {\n\td.table = hashprobe.NewInt64Table(len(d.values), hashprobeTableMaxLoad)\n\n\tn := min(len(d.values), len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *int64Dictionary) insert(indexes []int32, rows sparse.Array) {\n\tconst chunkSize = insertsTargetCacheFootprint / 8\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Int64Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *int64Dictionary) Lookup(indexes []int32, values 
[]Value) {\n\tmodel := d.makeValue(0)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *int64Dictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *int64Dictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil {\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *int64Dictionary) Page() Page {\n\treturn &d.int64Page\n}\n\ntype int96Dictionary struct {\n\tint96Page\n\thashmap map[deprecated.Int96]int32\n}\n\nfunc newInt96Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *int96Dictionary {\n\treturn &int96Dictionary{\n\t\tint96Page: int96Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Int96()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *int96Dictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *int96Dictionary) Len() int { return len(d.values) }\n\nfunc (d *int96Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *int96Dictionary) index(i int32) deprecated.Int96 { return d.values[i] }\n\nfunc (d *int96Dictionary) Insert(indexes []int32, values []Value) {\n\td.insertValues(indexes, len(values), func(i int) deprecated.Int96 {\n\t\treturn values[i].Int96()\n\t})\n}\n\nfunc (d *int96Dictionary) insert(indexes []int32, rows sparse.Array) {\n\td.insertValues(indexes, rows.Len(), func(i int) deprecated.Int96 {\n\t\treturn *(*deprecated.Int96)(rows.Index(i))\n\t})\n}\n\nfunc (d *int96Dictionary) insertValues(indexes []int32, count int, valueAt func(int) deprecated.Int96) {\n\t_ = indexes[:count]\n\n\tif d.hashmap == nil {\n\t\td.hashmap = make(map[deprecated.Int96]int32, len(d.values))\n\t\tfor i, v := range d.values {\n\t\t\td.hashmap[v] = int32(i)\n\t\t}\n\t}\n\n\tfor i := 0; i < count; i++ {\n\t\tvalue := 
valueAt(i)\n\n\t\tindex, exists := d.hashmap[value]\n\t\tif !exists {\n\t\t\tindex = int32(len(d.values))\n\t\t\td.values = append(d.values, value)\n\t\t\td.hashmap[value] = index\n\t\t}\n\n\t\tindexes[i] = index\n\t}\n}\n\nfunc (d *int96Dictionary) Lookup(indexes []int32, values []Value) {\n\tfor i, j := range indexes {\n\t\tvalues[i] = d.Index(j)\n\t}\n}\n\nfunc (d *int96Dictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue := d.index(indexes[0])\n\t\tmaxValue := minValue\n\n\t\tfor _, i := range indexes[1:] {\n\t\t\tvalue := d.index(i)\n\t\t\tswitch {\n\t\t\tcase value.Less(minValue):\n\t\t\t\tminValue = value\n\t\t\tcase maxValue.Less(value):\n\t\t\t\tmaxValue = value\n\t\t\t}\n\t\t}\n\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *int96Dictionary) Reset() {\n\td.values = d.values[:0]\n\td.hashmap = nil\n}\n\nfunc (d *int96Dictionary) Page() Page {\n\treturn &d.int96Page\n}\n\ntype floatDictionary struct {\n\tfloatPage\n\ttable *hashprobe.Float32Table\n}\n\nfunc newFloatDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *floatDictionary {\n\treturn &floatDictionary{\n\t\tfloatPage: floatPage{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Float()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *floatDictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *floatDictionary) Len() int { return len(d.values) }\n\nfunc (d *floatDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *floatDictionary) index(i int32) float32 { return d.values[i] }\n\nfunc (d *floatDictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *floatDictionary) init(indexes []int32) {\n\td.table = hashprobe.NewFloat32Table(len(d.values), hashprobeTableMaxLoad)\n\n\tn := min(len(d.values), 
len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *floatDictionary) insert(indexes []int32, rows sparse.Array) {\n\tconst chunkSize = insertsTargetCacheFootprint / 4\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Float32Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *floatDictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValue(0)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *floatDictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *floatDictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil {\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *floatDictionary) Page() Page {\n\treturn &d.floatPage\n}\n\ntype doubleDictionary struct {\n\tdoublePage\n\ttable *hashprobe.Float64Table\n}\n\nfunc newDoubleDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *doubleDictionary {\n\treturn &doubleDictionary{\n\t\tdoublePage: doublePage{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Double()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *doubleDictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *doubleDictionary) Len() int { return len(d.values) }\n\nfunc (d *doubleDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *doubleDictionary) index(i int32) float64 
{ return d.values[i] }\n\nfunc (d *doubleDictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *doubleDictionary) init(indexes []int32) {\n\td.table = hashprobe.NewFloat64Table(len(d.values), hashprobeTableMaxLoad)\n\n\tn := min(len(d.values), len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *doubleDictionary) insert(indexes []int32, rows sparse.Array) {\n\tconst chunkSize = insertsTargetCacheFootprint / 8\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Float64Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *doubleDictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValue(0)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *doubleDictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *doubleDictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil {\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *doubleDictionary) Page() Page {\n\treturn &d.doublePage\n}\n\ntype byteArrayDictionary struct {\n\tbyteArrayPage\n\ttable map[string]int32\n\talloc allocator\n}\n\nfunc newByteArrayDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *byteArrayDictionary {\n\tvalues, offsets := data.ByteArray()\n\t// The first offset must always be 
zero, and the last offset is the length\n\t// of the values in bytes.\n\t//\n\t// As an optimization we make the assumption that the backing array of the\n\t// offsets slice belongs to the dictionary.\n\tswitch {\n\tcase cap(offsets) == 0:\n\t\toffsets = make([]uint32, 1, 8)\n\tcase len(offsets) == 0:\n\t\toffsets = append(offsets[:0], 0)\n\t}\n\treturn &byteArrayDictionary{\n\t\tbyteArrayPage: byteArrayPage{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      values,\n\t\t\toffsets:     offsets,\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *byteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *byteArrayDictionary) Len() int { return d.len() }\n\nfunc (d *byteArrayDictionary) Index(i int32) Value { return d.makeValueBytes(d.index(int(i))) }\n\nfunc (d *byteArrayDictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))\n}\n\nfunc (d *byteArrayDictionary) init() {\n\tnumValues := d.len()\n\td.table = make(map[string]int32, numValues)\n\n\tfor i := 0; i < numValues; i++ {\n\t\td.table[string(d.index(i))] = int32(len(d.table))\n\t}\n}\n\nfunc (d *byteArrayDictionary) insert(indexes []int32, rows sparse.Array) {\n\tif d.table == nil {\n\t\td.init()\n\t}\n\n\tvalues := rows.StringArray()\n\n\tfor i := range indexes {\n\t\tvalue := values.Index(i)\n\n\t\tindex, exists := d.table[value]\n\t\tif !exists {\n\t\t\tvalue = d.alloc.copyString(value)\n\t\t\tindex = int32(len(d.table))\n\t\t\td.table[value] = index\n\t\t\td.values = append(d.values, value...)\n\t\t\td.offsets = append(d.offsets, uint32(len(d.values)))\n\t\t}\n\n\t\tindexes[i] = index\n\t}\n}\n\nfunc (d *byteArrayDictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValueString(\"\")\n\tmemsetValues(values, model)\n\td.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))\n}\n\nfunc (d *byteArrayDictionary) Bounds(indexes []int32) (min, max Value) 
{\n\tif len(indexes) > 0 {\n\t\tbase := d.index(int(indexes[0]))\n\t\tminValue := unsafecast.BytesToString(base)\n\t\tmaxValue := minValue\n\t\tvalues := [64]string{}\n\n\t\tfor i := 1; i < len(indexes); i += len(values) {\n\t\t\tn := len(indexes) - i\n\t\t\tif n > len(values) {\n\t\t\t\tn = len(values)\n\t\t\t}\n\t\t\tj := i + n\n\t\t\td.lookupString(indexes[i:j:j], makeArrayString(values[:n:n]))\n\n\t\t\tfor _, value := range values[:n:n] {\n\t\t\t\tswitch {\n\t\t\t\tcase value < minValue:\n\t\t\t\t\tminValue = value\n\t\t\t\tcase value > maxValue:\n\t\t\t\t\tmaxValue = value\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\tmin = d.makeValueString(minValue)\n\t\tmax = d.makeValueString(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *byteArrayDictionary) Reset() {\n\td.offsets = d.offsets[:1]\n\td.values = d.values[:0]\n\tfor k := range d.table {\n\t\tdelete(d.table, k)\n\t}\n\td.alloc.reset()\n}\n\nfunc (d *byteArrayDictionary) Page() Page {\n\treturn &d.byteArrayPage\n}\n\ntype fixedLenByteArrayDictionary struct {\n\tfixedLenByteArrayPage\n\thashmap map[string]int32\n}\n\nfunc newFixedLenByteArrayDictionary(typ Type, columnIndex int16, numValues int32, values encoding.Values) *fixedLenByteArrayDictionary {\n\tdata, size := values.FixedLenByteArray()\n\treturn &fixedLenByteArrayDictionary{\n\t\tfixedLenByteArrayPage: fixedLenByteArrayPage{\n\t\t\ttyp:         typ,\n\t\t\tsize:        size,\n\t\t\tdata:        data,\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *fixedLenByteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *fixedLenByteArrayDictionary) Len() int { return len(d.data) / d.size }\n\nfunc (d *fixedLenByteArrayDictionary) Index(i int32) Value {\n\treturn d.makeValueBytes(d.index(i))\n}\n\nfunc (d *fixedLenByteArrayDictionary) index(i int32) []byte {\n\tj := (int(i) + 0) * d.size\n\tk := (int(i) + 1) * d.size\n\treturn d.data[j:k:k]\n}\n\nfunc (d *fixedLenByteArrayDictionary) Insert(indexes []int32, values []Value) 
{\n\td.insertValues(indexes, len(values), func(i int) *byte {\n\t\treturn values[i].ptr\n\t})\n}\n\nfunc (d *fixedLenByteArrayDictionary) insert(indexes []int32, rows sparse.Array) {\n\td.insertValues(indexes, rows.Len(), func(i int) *byte {\n\t\treturn (*byte)(rows.Index(i))\n\t})\n}\n\nfunc (d *fixedLenByteArrayDictionary) insertValues(indexes []int32, count int, valueAt func(int) *byte) {\n\t_ = indexes[:count]\n\n\tif d.hashmap == nil {\n\t\td.hashmap = make(map[string]int32, cap(d.data)/d.size)\n\t\tfor i, j := 0, int32(0); i < len(d.data); i += d.size {\n\t\t\td.hashmap[string(d.data[i:i+d.size])] = j\n\t\t\tj++\n\t\t}\n\t}\n\n\tfor i := 0; i < count; i++ {\n\t\tvalue := unsafe.Slice(valueAt(i), d.size)\n\n\t\tindex, exists := d.hashmap[string(value)]\n\t\tif !exists {\n\t\t\tindex = int32(d.Len())\n\t\t\tstart := len(d.data)\n\t\t\td.data = append(d.data, value...)\n\t\t\td.hashmap[string(d.data[start:])] = index\n\t\t}\n\n\t\tindexes[i] = index\n\t}\n}\n\nfunc (d *fixedLenByteArrayDictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValueString(\"\")\n\tmemsetValues(values, model)\n\td.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))\n}\n\nfunc (d *fixedLenByteArrayDictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tbase := d.index(indexes[0])\n\t\tminValue := unsafecast.BytesToString(base)\n\t\tmaxValue := minValue\n\t\tvalues := [64]string{}\n\n\t\tfor i := 1; i < len(indexes); i += len(values) {\n\t\t\tn := len(indexes) - i\n\t\t\tif n > len(values) {\n\t\t\t\tn = len(values)\n\t\t\t}\n\t\t\tj := i + n\n\t\t\td.lookupString(indexes[i:j:j], makeArrayString(values[:n:n]))\n\n\t\t\tfor _, value := range values[:n:n] {\n\t\t\t\tswitch {\n\t\t\t\tcase value < minValue:\n\t\t\t\t\tminValue = value\n\t\t\t\tcase value > maxValue:\n\t\t\t\t\tmaxValue = value\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\tmin = d.makeValueString(minValue)\n\t\tmax = d.makeValueString(maxValue)\n\t}\n\treturn min, 
max\n}\n\nfunc (d *fixedLenByteArrayDictionary) Reset() {\n\td.data = d.data[:0]\n\td.hashmap = nil\n}\n\nfunc (d *fixedLenByteArrayDictionary) Page() Page {\n\treturn &d.fixedLenByteArrayPage\n}\n\ntype uint32Dictionary struct {\n\tuint32Page\n\ttable *hashprobe.Uint32Table\n}\n\nfunc newUint32Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *uint32Dictionary {\n\treturn &uint32Dictionary{\n\t\tuint32Page: uint32Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Uint32()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *uint32Dictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *uint32Dictionary) Len() int { return len(d.values) }\n\nfunc (d *uint32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *uint32Dictionary) index(i int32) uint32 { return d.values[i] }\n\nfunc (d *uint32Dictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *uint32Dictionary) init(indexes []int32) {\n\td.table = hashprobe.NewUint32Table(len(d.values), hashprobeTableMaxLoad)\n\n\tn := min(len(d.values), len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *uint32Dictionary) insert(indexes []int32, rows sparse.Array) {\n\tconst chunkSize = insertsTargetCacheFootprint / 4\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Uint32Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *uint32Dictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := 
d.makeValue(0)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *uint32Dictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *uint32Dictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil {\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *uint32Dictionary) Page() Page {\n\treturn &d.uint32Page\n}\n\ntype uint64Dictionary struct {\n\tuint64Page\n\ttable *hashprobe.Uint64Table\n}\n\nfunc newUint64Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *uint64Dictionary {\n\treturn &uint64Dictionary{\n\t\tuint64Page: uint64Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Uint64()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *uint64Dictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *uint64Dictionary) Len() int { return len(d.values) }\n\nfunc (d *uint64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *uint64Dictionary) index(i int32) uint64 { return d.values[i] }\n\nfunc (d *uint64Dictionary) Insert(indexes []int32, values []Value) {\n\tmodel := Value{}\n\td.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *uint64Dictionary) init(indexes []int32) {\n\td.table = hashprobe.NewUint64Table(len(d.values), hashprobeTableMaxLoad)\n\n\tn := min(len(d.values), len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *uint64Dictionary) insert(indexes []int32, rows sparse.Array) {\n\tconst chunkSize = insertsTargetCacheFootprint / 8\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Uint64Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := 
min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *uint64Dictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValue(0)\n\tmemsetValues(values, model)\n\td.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))\n}\n\nfunc (d *uint64Dictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *uint64Dictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil {\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *uint64Dictionary) Page() Page {\n\treturn &d.uint64Page\n}\n\ntype be128Dictionary struct {\n\tbe128Page\n\ttable *hashprobe.Uint128Table\n}\n\nfunc newBE128Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *be128Dictionary {\n\treturn &be128Dictionary{\n\t\tbe128Page: be128Page{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      data.Uint128()[:numValues],\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (d *be128Dictionary) Type() Type { return newIndexedType(d.typ, d) }\n\nfunc (d *be128Dictionary) Len() int { return len(d.values) }\n\nfunc (d *be128Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }\n\nfunc (d *be128Dictionary) index(i int32) *[16]byte { return &d.values[i] }\n\nfunc (d *be128Dictionary) Insert(indexes []int32, values []Value) {\n\t_ = indexes[:len(values)]\n\n\tfor _, v := range values {\n\t\tif v.kind != ^int8(FixedLenByteArray) {\n\t\t\tpanic(\"values inserted in BE128 dictionary must be of type BYTE_ARRAY\")\n\t\t}\n\t\tif v.u64 != 16 {\n\t\t\tpanic(\"values inserted in BE128 dictionary must be of length 16\")\n\t\t}\n\t}\n\n\tif d.table == nil 
{\n\t\td.init(indexes)\n\t}\n\n\tconst chunkSize = insertsTargetCacheFootprint / 16\n\tvar buffer [chunkSize][16]byte\n\n\tfor i := 0; i < len(values); i += chunkSize {\n\t\tj := min(chunkSize+i, len(values))\n\t\tn := min(chunkSize, len(values)-i)\n\n\t\tprobe := buffer[:n:n]\n\t\twritePointersBE128(probe, makeArrayValue(values[i:j], unsafe.Offsetof(values[i].ptr)))\n\n\t\tif d.table.Probe(probe, indexes[i:j:j]) > 0 {\n\t\t\tfor k, v := range probe {\n\t\t\t\tif indexes[i+k] == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, v)\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *be128Dictionary) init(indexes []int32) {\n\td.table = hashprobe.NewUint128Table(len(d.values), 0.75)\n\n\tn := min(len(d.values), len(indexes))\n\n\tfor i := 0; i < len(d.values); i += n {\n\t\tj := min(i+n, len(d.values))\n\t\td.table.Probe(d.values[i:j:j], indexes[:n:n])\n\t}\n}\n\nfunc (d *be128Dictionary) insert(indexes []int32, rows sparse.Array) {\n\tconst chunkSize = insertsTargetCacheFootprint / 16\n\n\tif d.table == nil {\n\t\td.init(indexes)\n\t}\n\n\tvalues := rows.Uint128Array()\n\n\tfor i := 0; i < values.Len(); i += chunkSize {\n\t\tj := min(i+chunkSize, values.Len())\n\n\t\tif d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {\n\t\t\tfor k, index := range indexes[i:j] {\n\t\t\t\tif index == int32(len(d.values)) {\n\t\t\t\t\td.values = append(d.values, values.Index(i+k))\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (d *be128Dictionary) Lookup(indexes []int32, values []Value) {\n\tmodel := d.makeValueString(\"\")\n\tmemsetValues(values, model)\n\td.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))\n}\n\nfunc (d *be128Dictionary) Bounds(indexes []int32) (min, max Value) {\n\tif len(indexes) > 0 {\n\t\tminValue, maxValue := d.bounds(indexes)\n\t\tmin = d.makeValue(minValue)\n\t\tmax = d.makeValue(maxValue)\n\t}\n\treturn min, max\n}\n\nfunc (d *be128Dictionary) Reset() {\n\td.values = d.values[:0]\n\tif d.table != nil 
{\n\t\td.table.Reset()\n\t}\n}\n\nfunc (d *be128Dictionary) Page() Page {\n\treturn &d.be128Page\n}\n\n// indexedType is a wrapper around a Type value which overrides object\n// constructors to use indexed versions referencing values in the dictionary\n// instead of storing plain values.\ntype indexedType struct {\n\tType\n\tdict Dictionary\n}\n\nfunc newIndexedType(typ Type, dict Dictionary) *indexedType {\n\treturn &indexedType{Type: typ, dict: dict}\n}\n\nfunc (t *indexedType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newIndexedColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t *indexedType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newIndexedPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\n// indexedPage is an implementation of the Page interface which stores\n// indexes instead of plain value. The indexes reference the values in a\n// dictionary that the page was created for.\ntype indexedPage struct {\n\ttyp         *indexedType\n\tvalues      []int32\n\tcolumnIndex int16\n}\n\nfunc newIndexedPage(typ *indexedType, columnIndex int16, numValues int32, data encoding.Values) *indexedPage {\n\t// RLE encoded values that contain dictionary indexes in data pages are\n\t// sometimes truncated when they contain only zeros. 
We account for this\n\t// special case here and extend the values buffer if it is shorter than\n\t// needed to hold `numValues`.\n\tsize := int(numValues)\n\tvalues := data.Int32()\n\n\tif len(values) < size {\n\t\tif cap(values) < size {\n\t\t\ttmp := make([]int32, size)\n\t\t\tcopy(tmp, values)\n\t\t\tvalues = tmp\n\t\t} else {\n\t\t\tclear := values[len(values) : len(values)+size]\n\t\t\tfor i := range clear {\n\t\t\t\tclear[i] = 0\n\t\t\t}\n\t\t}\n\t}\n\n\treturn &indexedPage{\n\t\ttyp:         typ,\n\t\tvalues:      values[:size],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *indexedPage) Type() Type { return indexedPageType{page.typ} }\n\nfunc (page *indexedPage) Column() int { return int(^page.columnIndex) }\n\nfunc (page *indexedPage) Dictionary() Dictionary { return page.typ.dict }\n\nfunc (page *indexedPage) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *indexedPage) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *indexedPage) NumNulls() int64 { return 0 }\n\nfunc (page *indexedPage) Size() int64 { return 4 * int64(len(page.values)) }\n\nfunc (page *indexedPage) RepetitionLevels() []byte { return nil }\n\nfunc (page *indexedPage) DefinitionLevels() []byte { return nil }\n\nfunc (page *indexedPage) Data() encoding.Values { return encoding.Int32Values(page.values) }\n\nfunc (page *indexedPage) Values() ValueReader { return &indexedPageValues{page: page} }\n\nfunc (page *indexedPage) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tmin, max = page.typ.dict.Bounds(page.values)\n\t\tmin.columnIndex = page.columnIndex\n\t\tmax.columnIndex = page.columnIndex\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *indexedPage) Slice(i, j int64) Page {\n\treturn &indexedPage{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\n// indexedPageType is an adapter for the indexedType returned when accessing\n// the type of an indexedPage 
value. It overrides the Encode/Decode methods to\n// account for the fact that an indexed page is holding indexes of values into\n// its dictionary instead of plain values.\ntype indexedPageType struct{ *indexedType }\n\nfunc (t indexedPageType) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.Int32ValuesFromBytes(values)\n}\n\nfunc (t indexedPageType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeInt32(dst, src, enc)\n}\n\nfunc (t indexedPageType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeInt32(dst, src, enc)\n}\n\nfunc (t indexedPageType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn Int32Type.EstimateDecodeSize(numValues, src, enc)\n}\n\ntype indexedPageValues struct {\n\tpage   *indexedPage\n\toffset int\n}\n\nfunc (r *indexedPageValues) ReadValues(values []Value) (n int, err error) {\n\tif n = len(r.page.values) - r.offset; n == 0 {\n\t\treturn 0, io.EOF\n\t}\n\tif n > len(values) {\n\t\tn = len(values)\n\t}\n\tr.page.typ.dict.Lookup(r.page.values[r.offset:r.offset+n], values[:n])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\n// indexedColumnBuffer is an implementation of the ColumnBuffer interface which\n// builds a page of indexes into a parent dictionary when values are written.\ntype indexedColumnBuffer struct{ indexedPage }\n\nfunc newIndexedColumnBuffer(typ *indexedType, columnIndex int16, numValues int32) *indexedColumnBuffer {\n\treturn &indexedColumnBuffer{\n\t\tindexedPage: indexedPage{\n\t\t\ttyp:         typ,\n\t\t\tvalues:      make([]int32, 0, numValues),\n\t\t\tcolumnIndex: ^columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *indexedColumnBuffer) Clone() ColumnBuffer {\n\treturn &indexedColumnBuffer{\n\t\tindexedPage: indexedPage{\n\t\t\ttyp:         col.typ,\n\t\t\tvalues:      append([]int32{}, 
col.values...),\n\t\t\tcolumnIndex: col.columnIndex,\n\t\t},\n\t}\n}\n\nfunc (col *indexedColumnBuffer) Type() Type { return col.typ.Type }\n\nfunc (col *indexedColumnBuffer) ColumnIndex() ColumnIndex { return indexedColumnIndex{col} }\n\nfunc (col *indexedColumnBuffer) OffsetIndex() OffsetIndex { return indexedOffsetIndex{col} }\n\nfunc (col *indexedColumnBuffer) BloomFilter() BloomFilter { return nil }\n\nfunc (col *indexedColumnBuffer) Dictionary() Dictionary { return col.typ.dict }\n\nfunc (col *indexedColumnBuffer) Pages() Pages { return onePage(col.Page()) }\n\nfunc (col *indexedColumnBuffer) Page() Page { return &col.indexedPage }\n\nfunc (col *indexedColumnBuffer) Reset() { col.values = col.values[:0] }\n\nfunc (col *indexedColumnBuffer) Cap() int { return cap(col.values) }\n\nfunc (col *indexedColumnBuffer) Len() int { return len(col.values) }\n\nfunc (col *indexedColumnBuffer) Less(i, j int) bool {\n\tu := col.typ.dict.Index(col.values[i])\n\tv := col.typ.dict.Index(col.values[j])\n\treturn col.typ.Compare(u, v) < 0\n}\n\nfunc (col *indexedColumnBuffer) Swap(i, j int) {\n\tcol.values[i], col.values[j] = col.values[j], col.values[i]\n}\n\nfunc (col *indexedColumnBuffer) WriteValues(values []Value) (int, error) {\n\ti := len(col.values)\n\tj := len(col.values) + len(values)\n\n\tif j <= cap(col.values) {\n\t\tcol.values = col.values[:j]\n\t} else {\n\t\ttmp := make([]int32, j, 2*j)\n\t\tcopy(tmp, col.values)\n\t\tcol.values = tmp\n\t}\n\n\tcol.typ.dict.Insert(col.values[i:], values)\n\treturn len(values), nil\n}\n\nfunc (col *indexedColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {\n\ti := len(col.values)\n\tj := len(col.values) + rows.Len()\n\n\tif j <= cap(col.values) {\n\t\tcol.values = col.values[:j]\n\t} else {\n\t\ttmp := make([]int32, j, 2*j)\n\t\tcopy(tmp, col.values)\n\t\tcol.values = tmp\n\t}\n\n\tcol.typ.dict.insert(col.values[i:], rows)\n}\n\nfunc (col *indexedColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err 
error) {\n\ti := int(offset)\n\tswitch {\n\tcase i < 0:\n\t\treturn 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))\n\tcase i >= len(col.values):\n\t\treturn 0, io.EOF\n\tdefault:\n\t\tfor n < len(values) && i < len(col.values) {\n\t\t\tvalues[n] = col.typ.dict.Index(col.values[i])\n\t\t\tvalues[n].columnIndex = col.columnIndex\n\t\t\tn++\n\t\t\ti++\n\t\t}\n\t\tif n < len(values) {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n}\n\nfunc (col *indexedColumnBuffer) ReadRowAt(row Row, index int64) (Row, error) {\n\tswitch {\n\tcase index < 0:\n\t\treturn row, errRowIndexOutOfBounds(index, int64(len(col.values)))\n\tcase index >= int64(len(col.values)):\n\t\treturn row, io.EOF\n\tdefault:\n\t\tv := col.typ.dict.Index(col.values[index])\n\t\tv.columnIndex = col.columnIndex\n\t\treturn append(row, v), nil\n\t}\n}\n\ntype indexedColumnIndex struct{ col *indexedColumnBuffer }\n\nfunc (index indexedColumnIndex) NumPages() int       { return 1 }\nfunc (index indexedColumnIndex) NullCount(int) int64 { return 0 }\nfunc (index indexedColumnIndex) NullPage(int) bool   { return false }\nfunc (index indexedColumnIndex) MinValue(int) Value {\n\tmin, _, _ := index.col.Bounds()\n\treturn min\n}\nfunc (index indexedColumnIndex) MaxValue(int) Value {\n\t_, max, _ := index.col.Bounds()\n\treturn max\n}\nfunc (index indexedColumnIndex) IsAscending() bool {\n\tmin, max, _ := index.col.Bounds()\n\treturn index.col.typ.Compare(min, max) <= 0\n}\nfunc (index indexedColumnIndex) IsDescending() bool {\n\tmin, max, _ := index.col.Bounds()\n\treturn index.col.typ.Compare(min, max) > 0\n}\n\ntype indexedOffsetIndex struct{ col *indexedColumnBuffer }\n\nfunc (index indexedOffsetIndex) NumPages() int                { return 1 }\nfunc (index indexedOffsetIndex) Offset(int) int64             { return 0 }\nfunc (index indexedOffsetIndex) CompressedPageSize(int) int64 { return index.col.Size() }\nfunc (index indexedOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n"
  },
  {
    "path": "dictionary_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\nimport (\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\n//go:noescape\nfunc dictionaryBoundsInt32(dict []int32, indexes []int32) (min, max int32, err errno)\n\n//go:noescape\nfunc dictionaryBoundsInt64(dict []int64, indexes []int32) (min, max int64, err errno)\n\n//go:noescape\nfunc dictionaryBoundsFloat32(dict []float32, indexes []int32) (min, max float32, err errno)\n\n//go:noescape\nfunc dictionaryBoundsFloat64(dict []float64, indexes []int32) (min, max float64, err errno)\n\n//go:noescape\nfunc dictionaryBoundsUint32(dict []uint32, indexes []int32) (min, max uint32, err errno)\n\n//go:noescape\nfunc dictionaryBoundsUint64(dict []uint64, indexes []int32) (min, max uint64, err errno)\n\n//go:noescape\nfunc dictionaryBoundsBE128(dict [][16]byte, indexes []int32) (min, max *[16]byte, err errno)\n\n//go:noescape\nfunc dictionaryLookup32(dict []uint32, indexes []int32, rows sparse.Array) errno\n\n//go:noescape\nfunc dictionaryLookup64(dict []uint64, indexes []int32, rows sparse.Array) errno\n\n//go:noescape\nfunc dictionaryLookupByteArrayString(dict []uint32, page []byte, indexes []int32, rows sparse.Array) errno\n\n//go:noescape\nfunc dictionaryLookupFixedLenByteArrayString(dict []byte, len int, indexes []int32, rows sparse.Array) errno\n\n//go:noescape\nfunc dictionaryLookupFixedLenByteArrayPointer(dict []byte, len int, indexes []int32, rows sparse.Array) errno\n\nfunc (d *int32Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tdict := unsafecast.Int32ToUint32(d.values)\n\tdictionaryLookup32(dict, indexes, rows).check()\n}\n\nfunc (d *int64Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tdict := unsafecast.Int64ToUint64(d.values)\n\tdictionaryLookup64(dict, indexes, rows).check()\n}\n\nfunc (d *floatDictionary) lookup(indexes 
[]int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tdict := unsafecast.Float32ToUint32(d.values)\n\tdictionaryLookup32(dict, indexes, rows).check()\n}\n\nfunc (d *doubleDictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tdict := unsafecast.Float64ToUint64(d.values)\n\tdictionaryLookup64(dict, indexes, rows).check()\n}\n\nfunc (d *byteArrayDictionary) lookupString(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\t// TODO: this optimization is disabled for now because it appears to race\n\t// with the garbage collector and result in writing pointers to free objects\n\t// to the output.\n\t//\n\t// This command was used to trigger the problem:\n\t//\n\t//\tGOMAXPROCS=8 go test -run TestIssue368 -count 10\n\t//\n\t// https://github.com/segmentio/parquet-go/issues/368\n\t//\n\t//dictionaryLookupByteArrayString(d.offsets, d.values, indexes, rows).check()\n\tfor i, j := range indexes {\n\t\tv := d.index(int(j))\n\t\t*(*string)(rows.Index(i)) = *(*string)(unsafe.Pointer(&v))\n\t}\n}\n\nfunc (d *fixedLenByteArrayDictionary) lookupString(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\t//dictionaryLookupFixedLenByteArrayString(d.data, d.size, indexes, rows).check()\n\tfor i, j := range indexes {\n\t\tv := d.index(j)\n\t\t*(*string)(rows.Index(i)) = *(*string)(unsafe.Pointer(&v))\n\t}\n}\n\nfunc (d *uint32Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tdictionaryLookup32(d.values, indexes, rows).check()\n}\n\nfunc (d *uint64Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tdictionaryLookup64(d.values, indexes, rows).check()\n}\n\nfunc (d *be128Dictionary) lookupString(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\t//dict := 
unsafecast.Uint128ToBytes(d.values)\n\t//dictionaryLookupFixedLenByteArrayString(dict, 16, indexes, rows).check()\n\ts := \"0123456789ABCDEF\"\n\tfor i, j := range indexes {\n\t\t*(**[16]byte)(unsafe.Pointer(&s)) = d.index(j)\n\t\t*(*string)(rows.Index(i)) = s\n\t}\n}\n\nfunc (d *be128Dictionary) lookupPointer(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\t//dict := unsafecast.Uint128ToBytes(d.values)\n\t//dictionaryLookupFixedLenByteArrayPointer(dict, 16, indexes, rows).check()\n\tfor i, j := range indexes {\n\t\t*(**[16]byte)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *int32Dictionary) bounds(indexes []int32) (min, max int32) {\n\tmin, max, err := dictionaryBoundsInt32(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n\nfunc (d *int64Dictionary) bounds(indexes []int32) (min, max int64) {\n\tmin, max, err := dictionaryBoundsInt64(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n\nfunc (d *floatDictionary) bounds(indexes []int32) (min, max float32) {\n\tmin, max, err := dictionaryBoundsFloat32(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n\nfunc (d *doubleDictionary) bounds(indexes []int32) (min, max float64) {\n\tmin, max, err := dictionaryBoundsFloat64(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n\nfunc (d *uint32Dictionary) bounds(indexes []int32) (min, max uint32) {\n\tmin, max, err := dictionaryBoundsUint32(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n\nfunc (d *uint64Dictionary) bounds(indexes []int32) (min, max uint64) {\n\tmin, max, err := dictionaryBoundsUint64(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n\nfunc (d *be128Dictionary) bounds(indexes []int32) (min, max *[16]byte) {\n\tmin, max, err := dictionaryBoundsBE128(d.values, indexes)\n\terr.check()\n\treturn min, max\n}\n"
  },
  {
    "path": "dictionary_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define errnoIndexOutOfBounds 1\n\n// func dictionaryBoundsInt32(dict []int32, indexes []int32) (min, max int32, err errno)\nTEXT ·dictionaryBoundsInt32(SB), NOSPLIT, $0-64\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    XORQ R10, R10 // min\n    XORQ R11, R11 // max\n    XORQ R12, R12 // err\n    XORQ SI, SI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVL (AX)(DI*4), R10\n    MOVL R10, R11\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ $0xFFFF, R8\n    KMOVW R8, K1\n\n    VPBROADCASTD BX, Y2  // [len(dict)...]\n    VPBROADCASTD R10, Y3 // [min...]\n    VMOVDQU32 Y3, Y4     // [max...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y2, Y0, K2\n    KMOVW K2, R9\n    CMPB R9, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDD (AX)(Y0*4), K1, Y1\n    VPMINSD Y1, Y3, Y3\n    VPMAXSD Y1, Y4, Y4\n    KMOVW R8, K1\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n\n    VPERM2I128 $1, Y3, Y3, Y0\n    VPERM2I128 $1, Y4, Y4, Y1\n    VPMINSD Y0, Y3, Y3\n    VPMAXSD Y1, Y4, Y4\n\n    VPSHUFD $0b1110, Y3, Y0\n    VPSHUFD $0b1110, Y4, Y1\n    VPMINSD Y0, Y3, Y3\n    VPMAXSD Y1, Y4, Y4\n\n    VPSHUFD $1, Y3, Y0\n    VPSHUFD $1, Y4, Y1\n    VPMINSD Y0, Y3, Y3\n    VPMAXSD Y1, Y4, Y4\n\n    MOVQ X3, R10\n    MOVQ X4, R11\n    ANDQ $0xFFFFFFFF, R10\n    ANDQ $0xFFFFFFFF, R11\n\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVL (AX)(DI*4), DI\n    CMPL DI, R10\n    CMOVLLT DI, R10\n    CMPL DI, R11\n    CMOVLGT DI, R11\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\nreturn:\n    MOVL R10, min+48(FP)\n    MOVL R11, max+52(FP)\n    MOVQ R12, err+56(FP)\n    RET\nindexOutOfBounds:\n    MOVQ 
$errnoIndexOutOfBounds, R12\n    JMP return\n\n// func dictionaryBoundsInt64(dict []int64, indexes []int32) (min, max int64, err errno)\nTEXT ·dictionaryBoundsInt64(SB), NOSPLIT, $0-72\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    XORQ R10, R10 // min\n    XORQ R11, R11 // max\n    XORQ R12, R12 // err\n    XORQ SI, SI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVQ (AX)(DI*8), R10\n    MOVQ R10, R11\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ $0xFFFF, R8\n    KMOVW R8, K1\n\n    VPBROADCASTD BX, Y2  // [len(dict)...]\n    VPBROADCASTQ R10, Z3 // [min...]\n    VMOVDQU64 Z3, Z4     // [max...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y2, Y0, K2\n    KMOVW K2, R9\n    CMPB R9, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDQ (AX)(Y0*8), K1, Z1\n    VPMINSQ Z1, Z3, Z3\n    VPMAXSQ Z1, Z4, Z4\n    KMOVW R8, K1\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n\n    VPERMQ $0b1110, Z3, Z0\n    VPERMQ $0b1110, Z4, Z1\n    VPMINSQ Z0, Z3, Z3\n    VPMAXSQ Z1, Z4, Z4\n\n    VPERMQ $1, Z3, Z0\n    VPERMQ $1, Z4, Z1\n    VPMINSQ Z0, Z3, Z3\n    VPMAXSQ Z1, Z4, Z4\n\n    VSHUFF64X2 $2, Z3, Z3, Z0\n    VSHUFF64X2 $2, Z4, Z4, Z1\n    VPMINSQ Z0, Z3, Z3\n    VPMAXSQ Z1, Z4, Z4\n\n    MOVQ X3, R10\n    MOVQ X4, R11\n\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVQ (AX)(DI*8), DI\n    CMPQ DI, R10\n    CMOVQLT DI, R10\n    CMPQ DI, R11\n    CMOVQGT DI, R11\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\nreturn:\n    MOVQ R10, min+48(FP)\n    MOVQ R11, max+56(FP)\n    MOVQ R12, err+64(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, R12\n    JMP return\n\n// func dictionaryBoundsFloat32(dict []float32, indexes []int32) (min, max 
float32, err errno)\nTEXT ·dictionaryBoundsFloat32(SB), NOSPLIT, $0-64\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    PXOR X3, X3   // min\n    PXOR X4, X4   // max\n    XORQ R12, R12 // err\n    XORQ SI, SI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVSS (AX)(DI*4), X3\n    MOVAPS X3, X4\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ $0xFFFF, R8\n    KMOVW R8, K1\n\n    VPBROADCASTD BX, Y2 // [len(dict)...]\n    VPBROADCASTD X3, Y3 // [min...]\n    VMOVDQU32 Y3, Y4    // [max...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y2, Y0, K2\n    KMOVW K2, R9\n    CMPB R9, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDD (AX)(Y0*4), K1, Y1\n    VMINPS Y1, Y3, Y3\n    VMAXPS Y1, Y4, Y4\n    KMOVW R8, K1\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n\n    VPERM2I128 $1, Y3, Y3, Y0\n    VPERM2I128 $1, Y4, Y4, Y1\n    VMINPS Y0, Y3, Y3\n    VMAXPS Y1, Y4, Y4\n\n    VPSHUFD $0b1110, Y3, Y0\n    VPSHUFD $0b1110, Y4, Y1\n    VMINPS Y0, Y3, Y3\n    VMAXPS Y1, Y4, Y4\n\n    VPSHUFD $1, Y3, Y0\n    VPSHUFD $1, Y4, Y1\n    VMINPS Y0, Y3, Y3\n    VMAXPS Y1, Y4, Y4\n\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVSS (AX)(DI*4), X1\n    UCOMISS X3, X1\n    JAE skipAssignMin\n    MOVAPS X1, X3\nskipAssignMin:\n    UCOMISS X4, X1\n    JBE skipAssignMax\n    MOVAPS X1, X4\nskipAssignMax:\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\nreturn:\n    MOVSS X3, min+48(FP)\n    MOVSS X4, max+52(FP)\n    MOVQ R12, err+56(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, R12\n    JMP return\n\n// func dictionaryBoundsFloat64(dict []float64, indexes []int32) (min, max float64, err errno)\nTEXT ·dictionaryBoundsFloat64(SB), NOSPLIT, $0-72\n    MOVQ 
dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    PXOR X3, X3   // min\n    PXOR X4, X4   // max\n    XORQ R12, R12 // err\n    XORQ SI, SI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVSD (AX)(DI*8), X3\n    MOVAPS X3, X4\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ $0xFFFF, R8\n    KMOVW R8, K1\n\n    VPBROADCASTD BX, Y2 // [len(dict)...]\n    VPBROADCASTQ X3, Z3 // [min...]\n    VMOVDQU64 Z3, Z4    // [max...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y2, Y0, K2\n    KMOVW K2, R9\n    CMPB R9, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDQ (AX)(Y0*8), K1, Z1\n    VMINPD Z1, Z3, Z3\n    VMAXPD Z1, Z4, Z4\n    KMOVW R8, K1\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n\n    VPERMQ $0b1110, Z3, Z0\n    VPERMQ $0b1110, Z4, Z1\n    VMINPD Z0, Z3, Z3\n    VMAXPD Z1, Z4, Z4\n\n    VPERMQ $1, Z3, Z0\n    VPERMQ $1, Z4, Z1\n    VMINPD Z0, Z3, Z3\n    VMAXPD Z1, Z4, Z4\n\n    VSHUFF64X2 $2, Z3, Z3, Z0\n    VSHUFF64X2 $2, Z4, Z4, Z1\n    VMINPD Z0, Z3, Z3\n    VMAXPD Z1, Z4, Z4\n\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVSD (AX)(DI*8), X1\n    UCOMISD X3, X1\n    JAE skipAssignMin\n    MOVAPD X1, X3\nskipAssignMin:\n    UCOMISD X4, X1\n    JBE skipAssignMax\n    MOVAPD X1, X4\nskipAssignMax:\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\nreturn:\n    MOVSD X3, min+48(FP)\n    MOVSD X4, max+56(FP)\n    MOVQ R12, err+64(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, R12\n    JMP return\n\n// func dictionaryBoundsUint32(dict []uint32, indexes []int32) (min, max uint32, err errno)\nTEXT ·dictionaryBoundsUint32(SB), NOSPLIT, $0-64\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    
MOVQ indexes_len+32(FP), DX\n\n    XORQ R10, R10 // min\n    XORQ R11, R11 // max\n    XORQ R12, R12 // err\n    XORQ SI, SI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVL (AX)(DI*4), R10\n    MOVL R10, R11\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ $0xFFFF, R8\n    KMOVW R8, K1\n\n    VPBROADCASTD BX, Y2  // [len(dict)...]\n    VPBROADCASTD R10, Y3 // [min...]\n    VMOVDQU32 Y3, Y4     // [max...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y2, Y0, K2\n    KMOVW K2, R9\n    CMPB R9, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDD (AX)(Y0*4), K1, Y1\n    VPMINUD Y1, Y3, Y3\n    VPMAXUD Y1, Y4, Y4\n    KMOVW R8, K1\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n\n    VPERM2I128 $1, Y3, Y3, Y0\n    VPERM2I128 $1, Y4, Y4, Y1\n    VPMINUD Y0, Y3, Y3\n    VPMAXUD Y1, Y4, Y4\n\n    VPSHUFD $0b1110, Y3, Y0\n    VPSHUFD $0b1110, Y4, Y1\n    VPMINUD Y0, Y3, Y3\n    VPMAXUD Y1, Y4, Y4\n\n    VPSHUFD $1, Y3, Y0\n    VPSHUFD $1, Y4, Y1\n    VPMINUD Y0, Y3, Y3\n    VPMAXUD Y1, Y4, Y4\n\n    MOVQ X3, R10\n    MOVQ X4, R11\n    ANDQ $0xFFFFFFFF, R10\n    ANDQ $0xFFFFFFFF, R11\n\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVL (AX)(DI*4), DI\n    CMPL DI, R10\n    CMOVLCS DI, R10\n    CMPL DI, R11\n    CMOVLHI DI, R11\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\nreturn:\n    MOVL R10, min+48(FP)\n    MOVL R11, max+52(FP)\n    MOVQ R12, err+56(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, R12\n    JMP return\n\n// func dictionaryBoundsUint64(dict []uint64, indexes []int32) (min, max uint64, err errno)\nTEXT ·dictionaryBoundsUint64(SB), NOSPLIT, $0-72\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    XORQ R10, R10 // min\n    
XORQ R11, R11 // max\n    XORQ R12, R12 // err\n    XORQ SI, SI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVQ (AX)(DI*8), R10\n    MOVQ R10, R11\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ $0xFFFF, R8\n    KMOVW R8, K1\n\n    VPBROADCASTD BX, Y2  // [len(dict)...]\n    VPBROADCASTQ R10, Z3 // [min...]\n    VMOVDQU64 Z3, Z4     // [max...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y2, Y0, K2\n    KMOVW K2, R9\n    CMPB R9, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDQ (AX)(Y0*8), K1, Z1\n    VPMINUQ Z1, Z3, Z3\n    VPMAXUQ Z1, Z4, Z4\n    KMOVW R8, K1\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n\n    VPERMQ $0b1110, Z3, Z0\n    VPERMQ $0b1110, Z4, Z1\n    VPMINUQ Z0, Z3, Z3\n    VPMAXUQ Z1, Z4, Z4\n\n    VPERMQ $1, Z3, Z0\n    VPERMQ $1, Z4, Z1\n    VPMINUQ Z0, Z3, Z3\n    VPMAXUQ Z1, Z4, Z4\n\n    VSHUFF64X2 $2, Z3, Z3, Z0\n    VSHUFF64X2 $2, Z4, Z4, Z1\n    VPMINUQ Z0, Z3, Z3\n    VPMAXUQ Z1, Z4, Z4\n\n    MOVQ X3, R10\n    MOVQ X4, R11\n\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVQ (AX)(DI*8), DI\n    CMPQ DI, R10\n    CMOVQCS DI, R10\n    CMPQ DI, R11\n    CMOVQHI DI, R11\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\nreturn:\n    MOVQ R10, min+48(FP)\n    MOVQ R11, max+56(FP)\n    MOVQ R12, err+64(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, R12\n    JMP return\n\n// func dictionaryBoundsBE128(dict [][16]byte, indexes []int32) (min, max *[16]byte, err errno)\nTEXT ·dictionaryBoundsBE128(SB), NOSPLIT, $0-72\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n    SHLQ $2, DX // x 4\n    ADDQ CX, DX // end\n\n    XORQ R8, R8 // min (pointer)\n    XORQ R9, R9 // max (pointer)\n    XORQ SI, SI // err\n 
   XORQ DI, DI\n\n    CMPQ DX, $0\n    JE return\n\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    SHLQ $4, DI // the dictionary contains 16 byte words\n    LEAQ (AX)(DI*1), R8\n    MOVQ R8, R9\n    MOVQ 0(AX)(DI*1), R10 // min (high)\n    MOVQ 8(AX)(DI*1), R11 // min (low)\n    BSWAPQ R10\n    BSWAPQ R11\n    MOVQ R10, R12 // max (high)\n    MOVQ R11, R13 // max (low)\n\n    JMP next\nloop:\n    MOVL (CX), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    SHLQ $4, DI\n    MOVQ 0(AX)(DI*1), R14\n    MOVQ 8(AX)(DI*1), R15\n    BSWAPQ R14\n    BSWAPQ R15\ntestLessThan:\n    CMPQ R14, R10\n    JA testGreaterThan\n    JB lessThan\n    CMPQ R15, R11\n    JAE testGreaterThan\nlessThan:\n    LEAQ (AX)(DI*1), R8\n    MOVQ R14, R10\n    MOVQ R15, R11\n    JMP next\ntestGreaterThan:\n    CMPQ R14, R12\n    JB next\n    JA greaterThan\n    CMPQ R15, R13\n    JBE next\ngreaterThan:\n    LEAQ (AX)(DI*1), R9\n    MOVQ R14, R12\n    MOVQ R15, R13\nnext:\n    ADDQ $4, CX\n    CMPQ CX, DX\n    JNE loop\nreturn:\n    MOVQ R8, min+48(FP)\n    MOVQ R9, max+56(FP)\n    MOVQ SI, err+64(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, SI\n    JMP return\n\n// The lookup functions provide optimized versions of the dictionary index\n// lookup logic.\n//\n// When AVX512 is available, the AVX512 versions of the functions are used\n// which use the VPGATHER* instructions to perform 8 parallel lookups of the\n// values in the dictionary, then VPSCATTER* to do 8 parallel writes to the\n// sparse output buffer.\n\n// func dictionaryLookup32(dict []uint32, indexes []int32, rows sparse.Array) errno\nTEXT ·dictionaryLookup32(SB), NOSPLIT, $0-80\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    MOVQ rows_array_ptr+48(FP), R8\n    MOVQ rows_array_off+64(FP), R9\n\n    XORQ SI, SI\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    
SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ R9, R10\n    SHLQ $3, R10 // 8 * size\n\n    MOVW $0xFFFF, R11\n    KMOVW R11, K1\n    KMOVW R11, K2\n\n    VPBROADCASTD R9, Y2           // [size...]\n    VPMULLD ·range0n8(SB), Y2, Y2 // [0*size,1*size,...]\n    VPBROADCASTD BX, Y3           // [len(dict)...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y3, Y0, K3\n    KMOVW K3, R11\n    CMPB R11, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDD (AX)(Y0*4), K1, Y1\n    VPSCATTERDD Y1, K2, (R8)(Y2*1)\n    KMOVW R11, K1\n    KMOVW R11, K2\n    ADDQ R10, R8\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVL (AX)(DI*4), DI\n    MOVL DI, (R8)\n    ADDQ R9, R8\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\n    XORQ AX, AX\nreturn:\n    MOVQ AX, ret+72(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, AX\n    JMP return\n\n// func dictionaryLookup64(dict []uint64, indexes []int32, rows sparse.Array) errno\nTEXT ·dictionaryLookup64(SB), NOSPLIT, $0-80\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ indexes_base+24(FP), CX\n    MOVQ indexes_len+32(FP), DX\n\n    MOVQ rows_array_ptr+48(FP), R8\n    MOVQ rows_array_off+64(FP), R9\n\n    XORQ SI, SI\n\n    CMPQ DX, $8\n    JB test\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    MOVQ R9, R10\n    SHLQ $3, R10 // 8 * size\n\n    MOVW $0xFFFF, R11\n    KMOVW R11, K1\n    KMOVW R11, K2\n\n    VPBROADCASTD R9, Y2           // [size...]\n    VPMULLD ·range0n8(SB), Y2, Y2 // [0*size,1*size,...]\n    VPBROADCASTD BX, Y3           // [len(dict)...]\nloopAVX512:\n    VMOVDQU32 (CX)(SI*4), Y0\n    VPCMPUD $1, Y3, Y0, K3\n    KMOVW K3, R11\n    CMPB R11, $0xFF\n    JNE indexOutOfBounds\n    VPGATHERDQ (AX)(Y0*8), K1, Z1\n    VPSCATTERDQ Z1, K2, (R8)(Y2*1)\n    KMOVW R11, K1\n    KMOVW R11, K2\n    ADDQ R10, R8\n    
ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX512\n    VZEROUPPER\n    JMP test\nloop:\n    MOVL (CX)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n    MOVQ (AX)(DI*8), DI\n    MOVQ DI, (R8)\n    ADDQ R9, R8\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\n    XORQ AX, AX\nreturn:\n    MOVQ AX, ret+72(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, AX\n    JMP return\n\n// func dictionaryLookupByteArrayString(dict []uint32, page []byte, indexes []int32, rows sparse.Array) errno\nTEXT ·dictionaryLookupByteArrayString(SB), NOSPLIT, $0-104\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n    DECQ BX // the offsets have the total length as last element\n\n    MOVQ page_base+24(FP), CX\n\n    MOVQ indexes_base+48(FP), R8\n    MOVQ indexes_len+56(FP), R9\n\n    MOVQ rows_array_ptr+72(FP), R10\n    MOVQ rows_array_off+88(FP), R11\n\n    XORQ DI, DI\n    XORQ SI, SI\nloop:\n    // Load the index that we want to read the value from. This may come from\n    // user input so we must validate that the indexes are within the bounds of\n    // the dictionary.\n    MOVL (R8)(SI*4), DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n\n    // Load the offsets within the dictionary page where the value is stored.\n    // We trust the offsets to be correct since they are generated internally by\n    // the dictionary code, there is no need to check that they are within the\n    // bounds of the dictionary page.\n    MOVL 0(AX)(DI*4), DX\n    MOVL 4(AX)(DI*4), DI\n\n    // Compute the length of the value (the difference between two consecutive\n    // offsets), and the pointer to the first byte of the string value.\n    SUBL DX, DI\n    LEAQ (CX)(DX*1), DX\n\n    // Store the length and pointer to the value into the output location.\n    // The memory layout is expected to hold a pointer and length, which are\n    // both 64 bits words. 
This is the layout used by parquet.Value and the Go\n    // string value type.\n    MOVQ DX, (R10)\n    MOVQ DI, 8(R10)\n\n    ADDQ R11, R10\n    INCQ SI\ntest:\n    CMPQ SI, R9\n    JNE loop\n    XORQ AX, AX\nreturn:\n    MOVQ AX, ret+96(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, AX\n    JMP return\n\n// func dictionaryLookupFixedLenByteArrayString(dict []byte, len int, indexes []int32, rows sparse.Array) errno\nTEXT ·dictionaryLookupFixedLenByteArrayString(SB), NOSPLIT, $0-88\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ len+24(FP), CX\n\n    MOVQ indexes_base+32(FP), DX\n    MOVQ indexes_len+40(FP), R8\n\n    MOVQ rows_array_ptr+56(FP), R9\n    MOVQ rows_array_off+72(FP), R10\n\n    XORQ DI, DI\n    XORQ SI, SI\nloop:\n    MOVL (DX)(SI*4), DI\n    IMULQ CX, DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n\n    ADDQ AX, DI\n    MOVQ DI, (R9)\n    MOVQ CX, 8(R9)\n\n    ADDQ R10, R9\n    INCQ SI\ntest:\n    CMPQ SI, R8\n    JNE loop\n    XORQ AX, AX\nreturn:\n    MOVQ AX, ret+80(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, AX\n    JMP return\n\n// This is the same algorithm as dictionaryLookupFixedLenByteArrayString but we\n// only store the pointer to the location holding the value instead of storing\n// the pair of pointer and length. 
Since the length is fixed for this dictionary\n// type, the application can assume it at the call site.\n//\n// func dictionaryLookupFixedLenByteArrayPointer(dict []byte, len int, indexes []int32, rows sparse.Array) errno\nTEXT ·dictionaryLookupFixedLenByteArrayPointer(SB), NOSPLIT, $0-88\n    MOVQ dict_base+0(FP), AX\n    MOVQ dict_len+8(FP), BX\n\n    MOVQ len+24(FP), CX\n\n    MOVQ indexes_base+32(FP), DX\n    MOVQ indexes_len+40(FP), R8\n\n    MOVQ rows_array_ptr+56(FP), R9\n    MOVQ rows_array_off+72(FP), R10\n\n    XORQ DI, DI\n    XORQ SI, SI\nloop:\n    MOVL (DX)(SI*4), DI\n    IMULQ CX, DI\n    CMPL DI, BX\n    JAE indexOutOfBounds\n\n    ADDQ AX, DI\n    MOVQ DI, (R9)\n\n    ADDQ R10, R9\n    INCQ SI\ntest:\n    CMPQ SI, R8\n    JNE loop\n    XORQ AX, AX\nreturn:\n    MOVQ AX, ret+80(FP)\n    RET\nindexOutOfBounds:\n    MOVQ $errnoIndexOutOfBounds, AX\n    JMP return\n\nGLOBL ·range0n8(SB), RODATA|NOPTR, $40\nDATA ·range0n8+0(SB)/4, $0\nDATA ·range0n8+4(SB)/4, $1\nDATA ·range0n8+8(SB)/4, $2\nDATA ·range0n8+12(SB)/4, $3\nDATA ·range0n8+16(SB)/4, $4\nDATA ·range0n8+20(SB)/4, $5\nDATA ·range0n8+24(SB)/4, $6\nDATA ·range0n8+28(SB)/4, $7\nDATA ·range0n8+32(SB)/4, $8\n"
  },
  {
    "path": "dictionary_purego.go",
    "content": "//go:build purego || !amd64\n\npackage parquet\n\nimport (\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nfunc (d *int32Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*int32)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *int64Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*int64)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *floatDictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*float32)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *doubleDictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*float64)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *byteArrayDictionary) lookupString(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\tv := d.index(int(j))\n\t\t*(*string)(rows.Index(i)) = *(*string)(unsafe.Pointer(&v))\n\t}\n}\n\nfunc (d *fixedLenByteArrayDictionary) lookupString(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\tv := d.index(j)\n\t\t*(*string)(rows.Index(i)) = *(*string)(unsafe.Pointer(&v))\n\t}\n}\n\nfunc (d *uint32Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*uint32)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *uint64Dictionary) lookup(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(*uint64)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *be128Dictionary) lookupString(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\ts := \"0123456789ABCDEF\"\n\tfor 
i, j := range indexes {\n\t\t*(**[16]byte)(unsafe.Pointer(&s)) = d.index(j)\n\t\t*(*string)(rows.Index(i)) = s\n\t}\n}\n\nfunc (d *be128Dictionary) lookupPointer(indexes []int32, rows sparse.Array) {\n\tcheckLookupIndexBounds(indexes, rows)\n\tfor i, j := range indexes {\n\t\t*(**[16]byte)(rows.Index(i)) = d.index(j)\n\t}\n}\n\nfunc (d *int32Dictionary) bounds(indexes []int32) (min, max int32) {\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor _, i := range indexes[1:] {\n\t\tvalue := d.index(i)\n\t\tif value < min {\n\t\t\tmin = value\n\t\t}\n\t\tif value > max {\n\t\t\tmax = value\n\t\t}\n\t}\n\n\treturn min, max\n}\n\nfunc (d *int64Dictionary) bounds(indexes []int32) (min, max int64) {\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor _, i := range indexes[1:] {\n\t\tvalue := d.index(i)\n\t\tif value < min {\n\t\t\tmin = value\n\t\t}\n\t\tif value > max {\n\t\t\tmax = value\n\t\t}\n\t}\n\n\treturn min, max\n}\n\nfunc (d *floatDictionary) bounds(indexes []int32) (min, max float32) {\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor _, i := range indexes[1:] {\n\t\tvalue := d.index(i)\n\t\tif value < min {\n\t\t\tmin = value\n\t\t}\n\t\tif value > max {\n\t\t\tmax = value\n\t\t}\n\t}\n\n\treturn min, max\n}\n\nfunc (d *doubleDictionary) bounds(indexes []int32) (min, max float64) {\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor _, i := range indexes[1:] {\n\t\tvalue := d.index(i)\n\t\tif value < min {\n\t\t\tmin = value\n\t\t}\n\t\tif value > max {\n\t\t\tmax = value\n\t\t}\n\t}\n\n\treturn min, max\n}\n\nfunc (d *uint32Dictionary) bounds(indexes []int32) (min, max uint32) {\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor _, i := range indexes[1:] {\n\t\tvalue := d.index(i)\n\t\tif value < min {\n\t\t\tmin = value\n\t\t}\n\t\tif value > max {\n\t\t\tmax = value\n\t\t}\n\t}\n\n\treturn min, max\n}\n\nfunc (d *uint64Dictionary) bounds(indexes []int32) (min, max uint64) {\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor _, i := range indexes[1:] 
{\n\t\tvalue := d.index(i)\n\t\tif value < min {\n\t\t\tmin = value\n\t\t}\n\t\tif value > max {\n\t\t\tmax = value\n\t\t}\n\t}\n\n\treturn min, max\n}\n\nfunc (d *be128Dictionary) bounds(indexes []int32) (min, max *[16]byte) {\n\tvalues := [64]*[16]byte{}\n\tmin = d.index(indexes[0])\n\tmax = min\n\n\tfor i := 1; i < len(indexes); i += len(values) {\n\t\tn := len(indexes) - i\n\t\tif n > len(values) {\n\t\t\tn = len(values)\n\t\t}\n\t\tj := i + n\n\t\td.lookupPointer(indexes[i:j:j], makeArrayBE128(values[:n:n]))\n\n\t\tfor _, value := range values[:n:n] {\n\t\t\tswitch {\n\t\t\tcase lessBE128(value, min):\n\t\t\t\tmin = value\n\t\t\tcase lessBE128(max, value):\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min, max\n}\n"
  },
  {
    "path": "dictionary_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"math/rand\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nvar dictionaryTypes = [...]parquet.Type{\n\tparquet.BooleanType,\n\tparquet.Int32Type,\n\tparquet.Int64Type,\n\tparquet.Int96Type,\n\tparquet.FloatType,\n\tparquet.DoubleType,\n\tparquet.ByteArrayType,\n\tparquet.FixedLenByteArrayType(10),\n\tparquet.FixedLenByteArrayType(16),\n\tparquet.Uint(32).Type(),\n\tparquet.Uint(64).Type(),\n}\n\nfunc TestDictionary(t *testing.T) {\n\tfor _, typ := range dictionaryTypes {\n\t\tfor _, numValues := range []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 1e2, 1e3, 1e4} {\n\t\t\tt.Run(fmt.Sprintf(\"%s/N=%d\", typ, numValues), func(t *testing.T) {\n\t\t\t\ttestDictionary(t, typ, numValues)\n\t\t\t})\n\t\t}\n\t}\n}\n\nfunc testDictionary(t *testing.T, typ parquet.Type, numValues int) {\n\tconst columnIndex = 1\n\n\tdict := typ.NewDictionary(columnIndex, 0, typ.NewValues(nil, nil))\n\tvalues := make([]parquet.Value, numValues)\n\tindexes := make([]int32, numValues)\n\tlookups := make([]parquet.Value, numValues)\n\n\tf := randValueFuncOf(typ)\n\tr := rand.New(rand.NewSource(int64(numValues)))\n\n\tfor i := range values {\n\t\tvalues[i] = f(r)\n\t\tvalues[i] = values[i].Level(0, 0, columnIndex)\n\t}\n\n\tmapping := make(map[int32]parquet.Value, numValues)\n\n\tfor i := 0; i < numValues; {\n\t\tj := i + ((numValues-i)/2 + 1)\n\t\tif j > numValues {\n\t\t\tj = numValues\n\t\t}\n\n\t\tdict.Insert(indexes[i:j], values[i:j])\n\n\t\tfor k, v := range values[i:j] {\n\t\t\tmapping[indexes[i+k]] = v\n\t\t}\n\n\t\tfor _, index := range indexes[i:j] {\n\t\t\tif index < 0 || index >= int32(dict.Len()) {\n\t\t\t\tt.Fatalf(\"index out of bounds: %d\", index)\n\t\t\t}\n\t\t}\n\n\t\t// second insert is a no-op since all the values are already in the dictionary\n\t\tlastDictLen := dict.Len()\n\t\tdict.Insert(indexes[i:j], values[i:j])\n\n\t\tif dict.Len() != lastDictLen {\n\t\t\tfor k, index := 
range indexes[i:j] {\n\t\t\t\tif index >= int32(len(mapping)) {\n\t\t\t\t\tt.Log(values[i+k])\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tt.Fatalf(\"%d values were inserted on the second pass\", dict.Len()-len(mapping))\n\t\t}\n\n\t\tr.Shuffle(j-i, func(a, b int) {\n\t\t\tindexes[a+i], indexes[b+i] = indexes[b+i], indexes[a+i]\n\t\t})\n\n\t\tdict.Lookup(indexes[i:j], lookups[i:j])\n\n\t\tfor lookupIndex, valueIndex := range indexes[i:j] {\n\t\t\twant := mapping[valueIndex]\n\t\t\tgot := lookups[lookupIndex+i]\n\n\t\t\tif !parquet.DeepEqual(want, got) {\n\t\t\t\tt.Fatalf(\"wrong value looked up at index %d: want=%#v got=%#v\", valueIndex, want, got)\n\t\t\t}\n\t\t}\n\n\t\tminValue := values[i]\n\t\tmaxValue := values[i]\n\n\t\tfor _, value := range values[i+1 : j] {\n\t\t\tswitch {\n\t\t\tcase typ.Compare(value, minValue) < 0:\n\t\t\t\tminValue = value\n\t\t\tcase typ.Compare(value, maxValue) > 0:\n\t\t\t\tmaxValue = value\n\t\t\t}\n\t\t}\n\n\t\tlowerBound, upperBound := dict.Bounds(indexes[i:j])\n\t\tif !parquet.DeepEqual(lowerBound, minValue) {\n\t\t\tt.Errorf(\"wrong lower bound between indexes %d and %d: want=%#v got=%#v\", i, j, minValue, lowerBound)\n\t\t}\n\t\tif !parquet.DeepEqual(upperBound, maxValue) {\n\t\t\tt.Errorf(\"wrong upper bound between indexes %d and %d: want=%#v got=%#v\", i, j, maxValue, upperBound)\n\t\t}\n\n\t\ti = j\n\t}\n\n\tfor i := range lookups {\n\t\tlookups[i] = parquet.Value{}\n\t}\n\n\tdict.Lookup(indexes, lookups)\n\n\tfor lookupIndex, valueIndex := range indexes {\n\t\twant := mapping[valueIndex]\n\t\tgot := lookups[lookupIndex]\n\n\t\tif !parquet.Equal(want, got) {\n\t\t\tt.Fatalf(\"wrong value looked up at index %d: want=%+v got=%+v\", valueIndex, want, got)\n\t\t}\n\t}\n}\n\nfunc BenchmarkDictionary(b *testing.B) {\n\ttests := []struct {\n\t\tscenario string\n\t\tinit     func(parquet.Dictionary, []int32, []parquet.Value)\n\t\ttest     func(parquet.Dictionary, []int32, []parquet.Value)\n\t}{\n\t\t{\n\t\t\tscenario: \"Bounds\",\n\t\t\tinit:     
parquet.Dictionary.Insert,\n\t\t\ttest: func(dict parquet.Dictionary, indexes []int32, _ []parquet.Value) {\n\t\t\t\tdict.Bounds(indexes)\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"Insert\",\n\t\t\ttest:     parquet.Dictionary.Insert,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"Lookup\",\n\t\t\tinit:     parquet.Dictionary.Insert,\n\t\t\ttest:     parquet.Dictionary.Lookup,\n\t\t},\n\t}\n\n\tfor i, test := range tests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tfor j, typ := range dictionaryTypes {\n\t\t\t\tfor _, numValues := range []int{1e2, 1e3, 1e4, 1e5, 1e6} {\n\t\t\t\t\tbuf := typ.NewValues(make([]byte, 0, 4*numValues), nil)\n\t\t\t\t\tdict := typ.NewDictionary(0, 0, buf)\n\t\t\t\t\tvalues := make([]parquet.Value, numValues)\n\n\t\t\t\t\tf := randValueFuncOf(typ)\n\t\t\t\t\tr := rand.New(rand.NewSource(int64(i * j * numValues)))\n\n\t\t\t\t\tfor i := range values {\n\t\t\t\t\t\tvalues[i] = f(r)\n\t\t\t\t\t}\n\n\t\t\t\t\tindexes := make([]int32, len(values))\n\t\t\t\t\tif test.init != nil {\n\t\t\t\t\t\ttest.init(dict, indexes, values)\n\t\t\t\t\t}\n\n\t\t\t\t\tb.Run(fmt.Sprintf(\"%s/N=%d\", typ, numValues), func(b *testing.B) {\n\t\t\t\t\t\tstart := time.Now()\n\n\t\t\t\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\t\t\t\ttest.test(dict, indexes, values)\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tseconds := time.Since(start).Seconds()\n\t\t\t\t\t\tb.ReportMetric(float64(numValues*b.N)/seconds, \"value/s\")\n\t\t\t\t\t})\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc TestIssue312(t *testing.T) {\n\tnode := parquet.String()\n\tnode = parquet.Encoded(node, &parquet.RLEDictionary)\n\tg := parquet.Group{}\n\tg[\"mystring\"] = node\n\tschema := parquet.NewSchema(\"test\", g)\n\n\trows := []parquet.Row{[]parquet.Value{parquet.ValueOf(\"hello\").Level(0, 0, 0)}}\n\n\tvar storage bytes.Buffer\n\n\ttests := []struct {\n\t\tname        string\n\t\tgetRowGroup func(t *testing.T) parquet.RowGroup\n\t}{\n\t\t{\n\t\t\tname: \"Writer\",\n\t\t\tgetRowGroup: func(t *testing.T) 
parquet.RowGroup {\n\t\t\t\tt.Helper()\n\n\t\t\t\tw := parquet.NewWriter(&storage, schema)\n\t\t\t\t_, err := w.WriteRows(rows)\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\tif err := w.Close(); err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\n\t\t\t\tr := bytes.NewReader(storage.Bytes())\n\t\t\t\tf, err := parquet.OpenFile(r, int64(storage.Len()))\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f.RowGroups()[0]\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tname: \"Buffer\",\n\t\t\tgetRowGroup: func(t *testing.T) parquet.RowGroup {\n\t\t\t\tt.Helper()\n\n\t\t\t\tb := parquet.NewBuffer(schema)\n\t\t\t\t_, err := b.WriteRows(rows)\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn b\n\t\t\t},\n\t\t},\n\t}\n\n\tfor _, testCase := range tests {\n\t\tt.Run(testCase.name, func(t *testing.T) {\n\t\t\trowGroup := testCase.getRowGroup(t)\n\n\t\t\tchunk := rowGroup.ColumnChunks()[0]\n\t\t\tidx := chunk.ColumnIndex()\n\t\t\tval := idx.MinValue(0)\n\t\t\tcolumnType := chunk.Type()\n\t\t\tvalues := columnType.NewValues(val.Bytes(), []uint32{0, uint32(len(val.Bytes()))})\n\n\t\t\t// This test ensures that the dictionary type created by column\n\t\t\t// chunks of parquet readers and buffers are the same. We want the\n\t\t\t// column chunk type to be the actual value type, even when the\n\t\t\t// schema uses a dictionary encoding.\n\t\t\t//\n\t\t\t// https://github.com/segmentio/parquet-go/issues/312\n\t\t\t_ = columnType.NewDictionary(0, 1, values)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "encoding/bitpacked/bitpacked.go",
    "content": "package bitpacked\n\nimport (\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype Encoding struct {\n\tencoding.NotSupported\n\tBitWidth int\n}\n\nfunc (e *Encoding) String() string {\n\treturn \"BIT_PACKED\"\n}\n\nfunc (e *Encoding) Encoding() format.Encoding {\n\treturn format.BitPacked\n}\n\nfunc (e *Encoding) EncodeLevels(dst []byte, src []uint8) ([]byte, error) {\n\tdst, err := encodeLevels(dst[:0], src, uint(e.BitWidth))\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) DecodeLevels(dst []uint8, src []byte) ([]uint8, error) {\n\tdst, err := decodeLevels(dst[:0], src, uint(e.BitWidth))\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) wrap(err error) error {\n\tif err != nil {\n\t\terr = encoding.Error(e, err)\n\t}\n\treturn err\n}\n\nfunc encodeLevels(dst, src []byte, bitWidth uint) ([]byte, error) {\n\tif bitWidth == 0 || len(src) == 0 {\n\t\treturn append(dst[:0], 0), nil\n\t}\n\n\tn := ((int(bitWidth) * len(src)) + 7) / 8\n\tc := n + 1\n\n\tif cap(dst) < c {\n\t\tdst = make([]byte, c, 2*c)\n\t} else {\n\t\tdst = dst[:c]\n\t\tfor i := range dst {\n\t\t\tdst[i] = 0\n\t\t}\n\t}\n\n\tbitMask := byte(1<<bitWidth) - 1\n\tbitShift := 8 - bitWidth\n\tbitOffset := uint(0)\n\n\tfor _, value := range src {\n\t\tv := bitFlip(value) >> bitShift\n\t\ti := bitOffset / 8\n\t\tj := bitOffset % 8\n\t\tdst[i+0] |= (v & bitMask) << j\n\t\tdst[i+1] |= (v >> (8 - j))\n\t\tbitOffset += bitWidth\n\t}\n\n\treturn dst[:n], nil\n}\n\nfunc decodeLevels(dst, src []byte, bitWidth uint) ([]byte, error) {\n\tif bitWidth == 0 || len(src) == 0 {\n\t\treturn append(dst[:0], 0), nil\n\t}\n\n\tnumBits := 8 * uint(len(src))\n\tnumValues := int(numBits / bitWidth)\n\tif (numBits % bitWidth) != 0 {\n\t\tnumValues++\n\t}\n\n\tif cap(dst) < numValues {\n\t\tdst = make([]byte, numValues, 2*numValues)\n\t} else {\n\t\tdst = dst[:numValues]\n\t\tfor i := range dst {\n\t\t\tdst[i] = 0\n\t\t}\n\t}\n\n\tbitMask := byte(1<<bitWidth) 
- 1\n\tbitShift := 8 - bitWidth\n\tbitOffset := uint(0)\n\n\tfor k := range dst {\n\t\ti := bitOffset / 8\n\t\tj := bitOffset % 8\n\t\tv := (src[i+0] >> j)\n\t\tif int(i+1) < len(src) {\n\t\t\tv |= (src[i+1] << (8 - j))\n\t\t}\n\t\tv &= bitMask\n\t\tdst[k] = bitFlip(v) >> bitShift\n\t\tbitOffset += bitWidth\n\t}\n\n\treturn dst, nil\n}\n\nfunc bitFlip(b byte) byte {\n\treturn (((b >> 0) & 1) << 7) |\n\t\t(((b >> 1) & 1) << 6) |\n\t\t(((b >> 2) & 1) << 5) |\n\t\t(((b >> 3) & 1) << 4) |\n\t\t(((b >> 4) & 1) << 3) |\n\t\t(((b >> 5) & 1) << 2) |\n\t\t(((b >> 6) & 1) << 1) |\n\t\t(((b >> 7) & 1) << 0)\n}\n"
  },
  {
    "path": "encoding/bitpacked/bitpacked_test.go",
    "content": "//go:build go1.18\n// +build go1.18\n\npackage bitpacked_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/encoding/fuzz\"\n\t\"github.com/segmentio/parquet-go/encoding/rle\"\n)\n\nfunc FuzzEncodeLevels(f *testing.F) {\n\tfuzz.EncodeLevels(f, &rle.Encoding{BitWidth: 8})\n}\n"
  },
  {
    "path": "encoding/bytestreamsplit/bytestreamsplit.go",
    "content": "package bytestreamsplit\n\nimport (\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\n// This encoder implements a version of the Byte Stream Split encoding as described\n// in https://github.com/apache/parquet-format/blob/master/Encodings.md#byte-stream-split-byte_stream_split--9\ntype Encoding struct {\n\tencoding.NotSupported\n}\n\nfunc (e *Encoding) String() string {\n\treturn \"BYTE_STREAM_SPLIT\"\n}\n\nfunc (e *Encoding) Encoding() format.Encoding {\n\treturn format.ByteStreamSplit\n}\n\nfunc (e *Encoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) {\n\tdst = resize(dst, 4*len(src))\n\tencodeFloat(dst, unsafecast.Float32ToBytes(src))\n\treturn dst, nil\n}\n\nfunc (e *Encoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) {\n\tdst = resize(dst, 8*len(src))\n\tencodeDouble(dst, unsafecast.Float64ToBytes(src))\n\treturn dst, nil\n}\n\nfunc (e *Encoding) DecodeFloat(dst []float32, src []byte) ([]float32, error) {\n\tif (len(src) % 4) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"FLOAT\", len(src))\n\t}\n\tbuf := resize(unsafecast.Float32ToBytes(dst), len(src))\n\tdecodeFloat(buf, src)\n\treturn unsafecast.BytesToFloat32(buf), nil\n}\n\nfunc (e *Encoding) DecodeDouble(dst []float64, src []byte) ([]float64, error) {\n\tif (len(src) % 8) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"DOUBLE\", len(src))\n\t}\n\tbuf := resize(unsafecast.Float64ToBytes(dst), len(src))\n\tdecodeDouble(buf, src)\n\treturn unsafecast.BytesToFloat64(buf), nil\n}\n\nfunc resize(buf []byte, size int) []byte {\n\tif cap(buf) < size {\n\t\tbuf = make([]byte, size, 2*size)\n\t} else {\n\t\tbuf = buf[:size]\n\t}\n\treturn buf\n}\n"
  },
  {
    "path": "encoding/bytestreamsplit/bytestreamsplit_amd64.go",
    "content": "//go:build !purego\n\npackage bytestreamsplit\n\nimport (\n\t\"golang.org/x/sys/cpu\"\n)\n\nvar encodeFloatHasAVX512 = cpu.X86.HasAVX512 &&\n\tcpu.X86.HasAVX512F &&\n\tcpu.X86.HasAVX512VL\n\nvar encodeDoubleHasAVX512 = cpu.X86.HasAVX512 &&\n\tcpu.X86.HasAVX512F &&\n\tcpu.X86.HasAVX512VL &&\n\tcpu.X86.HasAVX512VBMI // VPERMB\n\nvar decodeFloatHasAVX2 = cpu.X86.HasAVX2\n\nvar decodeDoubleHasAVX512 = cpu.X86.HasAVX512 &&\n\tcpu.X86.HasAVX512F &&\n\tcpu.X86.HasAVX512VL &&\n\tcpu.X86.HasAVX512VBMI // VPERMB\n\n//go:noescape\nfunc encodeFloat(dst, src []byte)\n\n//go:noescape\nfunc encodeDouble(dst, src []byte)\n\n//go:noescape\nfunc decodeFloat(dst, src []byte)\n\n//go:noescape\nfunc decodeDouble(dst, src []byte)\n"
  },
  {
    "path": "encoding/bytestreamsplit/bytestreamsplit_amd64.s",
    "content": " //go:build !purego\n\n#include \"textflag.h\"\n\n// This file contains optimizations of the BYTE_STREAM_SPLIT encoding using AVX2\n// and AVX512 (when available).\n//\n// The AVX2/512 instruction set comes with instructions to load memory from, or\n// store memory at sparse locations called VPGATHER and VPSCATTER. VPGATHER was\n// available in the AVX2 instruction set, VPSCATTER was introduced in AVX512\n// (when the AVX512_VBMI extension is supported). Gathering bytes are sparse\n// memory locations is useful during the decoding process since we are\n// recomposing 32 or 64 bit floating point values from 4 or 8 bytes dispatched\n// in the input byte array.\n//\n// To either deconstruct or reconstruct floating point values, we need to\n// reorder the bytes of each value. If we have 4 32 bit floats, we can permute\n// their bytes so that the first one contains all the first bytes, the second\n// contains all the second bytes, etc... The VPSHUFB instruction is used to\n// perform the byte permutation, or the VPERMB instruction for 64 bit floats.\n//\n// We use different instructions because the VPSHUFB instruction works on two\n// lanes of 16 bytes when used on YMM registers. 4 32 bit floats take 16 bytes,\n// so a a YMM register can hold two lanes of 4 32 bit floats and the VPSHUFB\n// can permute the two sets of values in a single invocation. For 64 bit floats\n// we need to permute 8 values, which take 64 bytes and therefore need to be\n// held in a ZMM register and apply permutations across the entire register,\n// which is only possible using VPERMB.\n//\n// Technically we could use ZMM registers when working on 32 bit floats to work\n// on 16 values per iteration. However, measurements indicated that the latency\n// of VPGATHERDD/VPSCATTERDD on ZMM registers did not provide any improvements\n// to the throughput of the algorithms, but working on more values increased the\n// code complexity. 
Using YMM registers offered the best balance between\n// performance and maintainability.\n//\n// At a high level the vectorized algorithms are the following:\n//\n// encoding\n// --------\n//   * Load a vector of data from the input buffer\n//   * Permute bytes, grouping bytes by index\n//   * Scatter bytes of the register to the output buffer\n//\n// decoding\n// --------\n//   * Gather sparse bytes from the input buffer\n//   * Permute bytes, reconstructing the original values\n//   * Store the vector in the output buffer\n//\n// When AVX instructions are not available, the functions fallback to scalar\n// implementations of the algorithms. These yield much lower throughput, but\n// performed 20-30% better than the code generated by the Go compiler.\n\n// func encodeFloat(dst, src []byte)\nTEXT ·encodeFloat(SB), NOSPLIT, $0-48\n    MOVQ src_base+24(FP), AX\n    MOVQ src_len+32(FP), BX\n    MOVQ dst_base+0(FP), DX\n\n    MOVQ AX, CX\n    ADDQ BX, CX // end\n    SHRQ $2, BX // len\n\n    CMPQ BX, $0\n    JE done\n\n    CMPB ·encodeFloatHasAVX512(SB), $0\n    JE loop1x4\n\n    CMPQ BX, $8\n    JB loop1x4\n\n    MOVQ CX, DI\n    SUBQ AX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    ADDQ AX, DI\n\n    VMOVDQU32 shuffle8x4<>(SB), Y0\n    VPBROADCASTD BX, Y2\n    VPMULLD scale8x4<>(SB), Y2, Y2\n    VPADDD offset8x4<>(SB), Y2, Y2\nloop8x4:\n    KXORQ K1, K1, K1\n    KNOTQ K1, K1\n\n    VMOVDQU32 (AX), Y1\n    VPSHUFB Y0, Y1, Y1\n    VPSCATTERDD Y1, K1, (DX)(Y2*1)\n\n    ADDQ $32, AX\n    ADDQ $8, DX\n    CMPQ AX, DI\n    JNE loop8x4\n    VZEROUPPER\n\n    CMPQ AX, CX\n    JE done\nloop1x4:\n    MOVL (AX), SI\n    MOVQ DX, DI\n\n    MOVB SI, (DI)\n    SHRL $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRL $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRL $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n\n    ADDQ $4, AX\n    INCQ DX\n    CMPQ AX, CX\n    JB loop1x4\ndone:\n    RET\n\n// func encodeDouble(dst, src []byte)\nTEXT ·encodeDouble(SB), NOSPLIT, $0-48\n    
MOVQ src_base+24(FP), AX\n    MOVQ src_len+32(FP), BX\n    MOVQ dst_base+0(FP), DX\n\n    MOVQ AX, CX\n    ADDQ BX, CX\n    SHRQ $3, BX\n\n    CMPQ BX, $0\n    JE done\n\n    CMPB ·encodeDoubleHasAVX512(SB), $0\n    JE loop1x8\n\n    CMPQ BX, $8\n    JB loop1x8\n\n    MOVQ CX, DI\n    SUBQ AX, DI\n    SHRQ $6, DI\n    SHLQ $6, DI\n    ADDQ AX, DI\n\n    VMOVDQU64 shuffle8x8<>(SB), Z0\n    VPBROADCASTQ BX, Z2\n    VPMULLQ scale8x8<>(SB), Z2, Z2\nloop8x8:\n    KXORQ K1, K1, K1\n    KNOTQ K1, K1\n\n    VMOVDQU64 (AX), Z1\n    VPERMB Z1, Z0, Z1\n    VPSCATTERQQ Z1, K1, (DX)(Z2*1)\n\n    ADDQ $64, AX\n    ADDQ $8, DX\n    CMPQ AX, DI\n    JNE loop8x8\n    VZEROUPPER\n\n    CMPQ AX, CX\n    JE done\nloop1x8:\n    MOVQ (AX), SI\n    MOVQ DX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n    SHRQ $8, SI\n    ADDQ BX, DI\n\n    MOVB SI, (DI)\n\n    ADDQ $8, AX\n    INCQ DX\n    CMPQ AX, CX\n    JB loop1x8\ndone:\n    RET\n\n// func decodeFloat(dst, src []byte)\nTEXT ·decodeFloat(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVQ src_base+24(FP), DX\n\n    MOVQ AX, CX\n    ADDQ BX, CX // end\n    SHRQ $2, BX // len\n\n    CMPQ BX, $0\n    JE done\n\n    CMPB ·decodeFloatHasAVX2(SB), $0\n    JE loop1x4\n\n    CMPQ BX, $8\n    JB loop1x4\n\n    MOVQ CX, DI\n    SUBQ AX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    ADDQ AX, DI\n\n    MOVQ $0xFFFFFFFF, SI\n    MOVQ BX, X5\n    MOVQ SI, X6\n    VMOVDQU shuffle8x4<>(SB), Y0\n    VPBROADCASTD X5, Y2\n    VPBROADCASTD X6, Y3\n    VPMULLD scale8x4<>(SB), Y2, Y2\n    VPADDD offset8x4<>(SB), Y2, Y2\n    VMOVDQU Y3, Y4\nloop8x4:\n    VPGATHERDD Y4, (DX)(Y2*1), Y1\n    VPSHUFB Y0, Y1, Y1\n    VMOVDQU 
Y1, (AX)\n    VMOVDQU Y3, Y4\n\n    ADDQ $32, AX\n    ADDQ $8, DX\n    CMPQ AX, DI\n    JNE loop8x4\n    VZEROUPPER\n\n    CMPQ AX, CX\n    JE done\nloop1x4:\n    MOVQ DX, DI\n    MOVBLZX (DI), R8\n    ADDQ BX, DI\n    MOVBLZX (DI), R9\n    ADDQ BX, DI\n    MOVBLZX (DI), R10\n    ADDQ BX, DI\n    MOVBLZX (DI), R11\n\n    SHLL $8, R9\n    SHLL $16, R10\n    SHLL $24, R11\n\n    ORL R9, R8\n    ORL R10, R8\n    ORL R11, R8\n\n    MOVL R8, (AX)\n\n    ADDQ $4, AX\n    INCQ DX\n    CMPQ AX, CX\n    JB loop1x4\ndone:\n    RET\n\n// func decodeDouble(dst, src []byte)\nTEXT ·decodeDouble(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVQ src_base+24(FP), DX\n\n    MOVQ AX, CX\n    ADDQ BX, CX\n    SHRQ $3, BX\n\n    CMPQ BX, $0\n    JE done\n\n    CMPB ·decodeDoubleHasAVX512(SB), $0\n    JE loop1x8\n\n    CMPQ BX, $8\n    JB loop1x8\n\n    MOVQ CX, DI\n    SUBQ AX, DI\n    SHRQ $6, DI\n    SHLQ $6, DI\n    ADDQ AX, DI\n\n    VMOVDQU64 shuffle8x8<>(SB), Z0\n    VPBROADCASTQ BX, Z2\n    VPMULLQ scale8x8<>(SB), Z2, Z2\nloop8x8:\n    KXORQ K1, K1, K1\n    KNOTQ K1, K1\n\n    VPGATHERQQ (DX)(Z2*1), K1, Z1\n    VPERMB Z1, Z0, Z1\n    VMOVDQU64 Z1, (AX)\n\n    ADDQ $64, AX\n    ADDQ $8, DX\n    CMPQ AX, DI\n    JNE loop8x8\n    VZEROUPPER\n\n    CMPQ AX, CX\n    JE done\nloop1x8:\n    MOVQ DX, DI\n    XORQ R12, R12\n\n    MOVBQZX (DI), R8\n    ADDQ BX, DI\n    MOVBQZX (DI), R9\n    ADDQ BX, DI\n    MOVBQZX (DI), R10\n    ADDQ BX, DI\n    MOVBQZX (DI), R11\n    ADDQ BX, DI\n\n    SHLQ $8, R9\n    SHLQ $16, R10\n    SHLQ $24, R11\n\n    ORQ R8, R12\n    ORQ R9, R12\n    ORQ R10, R12\n    ORQ R11, R12\n\n    MOVBQZX (DI), R8\n    ADDQ BX, DI\n    MOVBQZX (DI), R9\n    ADDQ BX, DI\n    MOVBQZX (DI), R10\n    ADDQ BX, DI\n    MOVBQZX (DI), R11\n\n    SHLQ $32, R8\n    SHLQ $40, R9\n    SHLQ $48, R10\n    SHLQ $56, R11\n\n    ORQ R8, R12\n    ORQ R9, R12\n    ORQ R10, R12\n    ORQ R11, R12\n\n    MOVQ R12, (AX)\n\n    ADDQ $8, AX\n    INCQ DX\n    
CMPQ AX, CX\n    JB loop1x8\ndone:\n    RET\n\nGLOBL scale8x4<>(SB), RODATA|NOPTR, $32\nDATA scale8x4<>+0(SB)/4,  $0\nDATA scale8x4<>+4(SB)/4,  $1\nDATA scale8x4<>+8(SB)/4,  $2\nDATA scale8x4<>+12(SB)/4, $3\nDATA scale8x4<>+16(SB)/4, $0\nDATA scale8x4<>+20(SB)/4, $1\nDATA scale8x4<>+24(SB)/4, $2\nDATA scale8x4<>+28(SB)/4, $3\n\nGLOBL offset8x4<>(SB), RODATA|NOPTR, $32\nDATA offset8x4<>+0(SB)/4,  $0\nDATA offset8x4<>+4(SB)/4,  $0\nDATA offset8x4<>+8(SB)/4,  $0\nDATA offset8x4<>+12(SB)/4, $0\nDATA offset8x4<>+16(SB)/4, $4\nDATA offset8x4<>+20(SB)/4, $4\nDATA offset8x4<>+24(SB)/4, $4\nDATA offset8x4<>+28(SB)/4, $4\n\nGLOBL shuffle8x4<>(SB), RODATA|NOPTR, $32\nDATA shuffle8x4<>+0(SB)/4,  $0x0C080400\nDATA shuffle8x4<>+4(SB)/4,  $0x0D090501\nDATA shuffle8x4<>+8(SB)/4,  $0x0E0A0602\nDATA shuffle8x4<>+12(SB)/4, $0x0F0B0703\nDATA shuffle8x4<>+16(SB)/4, $0x0C080400\nDATA shuffle8x4<>+20(SB)/4, $0x0D090501\nDATA shuffle8x4<>+24(SB)/4, $0x0E0A0602\nDATA shuffle8x4<>+28(SB)/4, $0x0F0B0703\n\nGLOBL scale8x8<>(SB), RODATA|NOPTR, $64\nDATA scale8x8<>+0(SB)/8,  $0\nDATA scale8x8<>+8(SB)/8,  $1\nDATA scale8x8<>+16(SB)/8, $2\nDATA scale8x8<>+24(SB)/8, $3\nDATA scale8x8<>+32(SB)/8, $4\nDATA scale8x8<>+40(SB)/8, $5\nDATA scale8x8<>+48(SB)/8, $6\nDATA scale8x8<>+56(SB)/8, $7\n\nGLOBL shuffle8x8<>(SB), RODATA|NOPTR, $64\nDATA shuffle8x8<>+0(SB)/8,  $0x3830282018100800\nDATA shuffle8x8<>+8(SB)/8,  $0x3931292119110901\nDATA shuffle8x8<>+16(SB)/8, $0x3A322A221A120A02\nDATA shuffle8x8<>+24(SB)/8, $0x3B332B231B130B03\nDATA shuffle8x8<>+32(SB)/8, $0x3C342C241C140C04\nDATA shuffle8x8<>+40(SB)/8, $0x3D352D251D150D05\nDATA shuffle8x8<>+48(SB)/8, $0x3E362E261E160E06\nDATA shuffle8x8<>+56(SB)/8, $0x3F372F271F170F07\n"
  },
  {
    "path": "encoding/bytestreamsplit/bytestreamsplit_purego.go",
    "content": "//go:build purego || !amd64\n\npackage bytestreamsplit\n\nimport \"github.com/segmentio/parquet-go/internal/unsafecast\"\n\nfunc encodeFloat(dst, src []byte) {\n\tn := len(src) / 4\n\tb0 := dst[0*n : 1*n]\n\tb1 := dst[1*n : 2*n]\n\tb2 := dst[2*n : 3*n]\n\tb3 := dst[3*n : 4*n]\n\n\tfor i, v := range unsafecast.BytesToUint32(src) {\n\t\tb0[i] = byte(v >> 0)\n\t\tb1[i] = byte(v >> 8)\n\t\tb2[i] = byte(v >> 16)\n\t\tb3[i] = byte(v >> 24)\n\t}\n}\n\nfunc encodeDouble(dst, src []byte) {\n\tn := len(src) / 8\n\tb0 := dst[0*n : 1*n]\n\tb1 := dst[1*n : 2*n]\n\tb2 := dst[2*n : 3*n]\n\tb3 := dst[3*n : 4*n]\n\tb4 := dst[4*n : 5*n]\n\tb5 := dst[5*n : 6*n]\n\tb6 := dst[6*n : 7*n]\n\tb7 := dst[7*n : 8*n]\n\n\tfor i, v := range unsafecast.BytesToUint64(src) {\n\t\tb0[i] = byte(v >> 0)\n\t\tb1[i] = byte(v >> 8)\n\t\tb2[i] = byte(v >> 16)\n\t\tb3[i] = byte(v >> 24)\n\t\tb4[i] = byte(v >> 32)\n\t\tb5[i] = byte(v >> 40)\n\t\tb6[i] = byte(v >> 48)\n\t\tb7[i] = byte(v >> 56)\n\t}\n}\n\nfunc decodeFloat(dst, src []byte) {\n\tn := len(src) / 4\n\tb0 := src[0*n : 1*n]\n\tb1 := src[1*n : 2*n]\n\tb2 := src[2*n : 3*n]\n\tb3 := src[3*n : 4*n]\n\n\tdst32 := unsafecast.BytesToUint32(dst)\n\tfor i := range dst32 {\n\t\tdst32[i] = uint32(b0[i]) |\n\t\t\tuint32(b1[i])<<8 |\n\t\t\tuint32(b2[i])<<16 |\n\t\t\tuint32(b3[i])<<24\n\t}\n}\n\nfunc decodeDouble(dst, src []byte) {\n\tn := len(src) / 8\n\tb0 := src[0*n : 1*n]\n\tb1 := src[1*n : 2*n]\n\tb2 := src[2*n : 3*n]\n\tb3 := src[3*n : 4*n]\n\tb4 := src[4*n : 5*n]\n\tb5 := src[5*n : 6*n]\n\tb6 := src[6*n : 7*n]\n\tb7 := src[7*n : 8*n]\n\n\tdst64 := unsafecast.BytesToUint64(dst)\n\tfor i := range dst64 {\n\t\tdst64[i] = uint64(b0[i]) |\n\t\t\tuint64(b1[i])<<8 |\n\t\t\tuint64(b2[i])<<16 |\n\t\t\tuint64(b3[i])<<24 |\n\t\t\tuint64(b4[i])<<32 |\n\t\t\tuint64(b5[i])<<40 |\n\t\t\tuint64(b6[i])<<48 |\n\t\t\tuint64(b7[i])<<56\n\t}\n}\n"
  },
  {
    "path": "encoding/bytestreamsplit/bytestreamsplit_test.go",
    "content": "//go:build go1.18\n// +build go1.18\n\npackage bytestreamsplit_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/encoding/bytestreamsplit\"\n\t\"github.com/segmentio/parquet-go/encoding/fuzz\"\n\t\"github.com/segmentio/parquet-go/encoding/test\"\n)\n\nfunc FuzzEncodeFloat(f *testing.F) {\n\tfuzz.EncodeFloat(f, new(bytestreamsplit.Encoding))\n}\n\nfunc FuzzEncodeDouble(f *testing.F) {\n\tfuzz.EncodeDouble(f, new(bytestreamsplit.Encoding))\n}\n\nfunc TestEncodeFloat(t *testing.T) {\n\ttest.EncodeFloat(t, new(bytestreamsplit.Encoding), 0, 100)\n}\n\nfunc TestEncodeDouble(t *testing.T) {\n\ttest.EncodeDouble(t, new(bytestreamsplit.Encoding), 0, 100)\n}\n"
  },
  {
    "path": "encoding/delta/binary_packed.go",
    "content": "package delta\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"io\"\n\t\"math\"\n\t\"math/bits\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/bitpack\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\ntype BinaryPackedEncoding struct {\n\tencoding.NotSupported\n}\n\nfunc (e *BinaryPackedEncoding) String() string {\n\treturn \"DELTA_BINARY_PACKED\"\n}\n\nfunc (e *BinaryPackedEncoding) Encoding() format.Encoding {\n\treturn format.DeltaBinaryPacked\n}\n\nfunc (e *BinaryPackedEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\treturn encodeInt32(dst[:0], src), nil\n}\n\nfunc (e *BinaryPackedEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) {\n\treturn encodeInt64(dst[:0], src), nil\n}\n\nfunc (e *BinaryPackedEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {\n\tbuf := unsafecast.Int32ToBytes(dst)\n\tbuf, _, err := decodeInt32(buf[:0], src)\n\treturn unsafecast.BytesToInt32(buf), e.wrap(err)\n}\n\nfunc (e *BinaryPackedEncoding) DecodeInt64(dst []int64, src []byte) ([]int64, error) {\n\tbuf := unsafecast.Int64ToBytes(dst)\n\tbuf, _, err := decodeInt64(buf[:0], src)\n\treturn unsafecast.BytesToInt64(buf), e.wrap(err)\n}\n\nfunc (e *BinaryPackedEncoding) wrap(err error) error {\n\tif err != nil {\n\t\terr = encoding.Error(e, err)\n\t}\n\treturn err\n}\n\nconst (\n\tblockSize     = 128\n\tnumMiniBlocks = 4\n\tminiBlockSize = blockSize / numMiniBlocks\n\t// The parquet spec does not enforce a limit to the block size, but we need\n\t// one otherwise invalid inputs may result in unbounded memory allocations.\n\t//\n\t// 65K+ values should be enough for any valid use case.\n\tmaxSupportedBlockSize = 65536\n\n\tmaxHeaderLength32    = 4 * binary.MaxVarintLen64\n\tmaxMiniBlockLength32 = binary.MaxVarintLen64 + numMiniBlocks + (4 * blockSize)\n\n\tmaxHeaderLength64    = 8 * 
binary.MaxVarintLen64\n\tmaxMiniBlockLength64 = binary.MaxVarintLen64 + numMiniBlocks + (8 * blockSize)\n)\n\nvar (\n\tencodeInt32 = encodeInt32Default\n\tencodeInt64 = encodeInt64Default\n)\n\nfunc encodeInt32Default(dst []byte, src []int32) []byte {\n\ttotalValues := len(src)\n\tfirstValue := int32(0)\n\tif totalValues > 0 {\n\t\tfirstValue = src[0]\n\t}\n\n\tn := len(dst)\n\tdst = resize(dst, n+maxHeaderLength32)\n\tdst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, int64(firstValue))]\n\n\tif totalValues < 2 {\n\t\treturn dst\n\t}\n\n\tlastValue := firstValue\n\tfor i := 1; i < len(src); i += blockSize {\n\t\tblock := [blockSize]int32{}\n\t\tblockLength := copy(block[:], src[i:])\n\n\t\tlastValue = blockDeltaInt32(&block, lastValue)\n\t\tminDelta := blockMinInt32(&block)\n\t\tblockSubInt32(&block, minDelta)\n\t\tblockClearInt32(&block, blockLength)\n\n\t\tbitWidths := [numMiniBlocks]byte{}\n\t\tblockBitWidthsInt32(&bitWidths, &block)\n\n\t\tn := len(dst)\n\t\tdst = resize(dst, n+maxMiniBlockLength32+4)\n\t\tn += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths)\n\n\t\tfor i, bitWidth := range bitWidths {\n\t\t\tif bitWidth != 0 {\n\t\t\t\tminiBlock := (*[miniBlockSize]int32)(block[i*miniBlockSize:])\n\t\t\t\tencodeMiniBlockInt32(dst[n:], miniBlock, uint(bitWidth))\n\t\t\t\tn += (miniBlockSize * int(bitWidth)) / 8\n\t\t\t}\n\t\t}\n\n\t\tdst = dst[:n]\n\t}\n\n\treturn dst\n}\n\nfunc encodeInt64Default(dst []byte, src []int64) []byte {\n\ttotalValues := len(src)\n\tfirstValue := int64(0)\n\tif totalValues > 0 {\n\t\tfirstValue = src[0]\n\t}\n\n\tn := len(dst)\n\tdst = resize(dst, n+maxHeaderLength64)\n\tdst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, firstValue)]\n\n\tif totalValues < 2 {\n\t\treturn dst\n\t}\n\n\tlastValue := firstValue\n\tfor i := 1; i < len(src); i += blockSize {\n\t\tblock := [blockSize]int64{}\n\t\tblockLength := copy(block[:], src[i:])\n\n\t\tlastValue = 
blockDeltaInt64(&block, lastValue)\n\t\tminDelta := blockMinInt64(&block)\n\t\tblockSubInt64(&block, minDelta)\n\t\tblockClearInt64(&block, blockLength)\n\n\t\tbitWidths := [numMiniBlocks]byte{}\n\t\tblockBitWidthsInt64(&bitWidths, &block)\n\n\t\tn := len(dst)\n\t\tdst = resize(dst, n+maxMiniBlockLength64+8)\n\t\tn += encodeBlockHeader(dst[n:], minDelta, bitWidths)\n\n\t\tfor i, bitWidth := range bitWidths {\n\t\t\tif bitWidth != 0 {\n\t\t\t\tminiBlock := (*[miniBlockSize]int64)(block[i*miniBlockSize:])\n\t\t\t\tencodeMiniBlockInt64(dst[n:], miniBlock, uint(bitWidth))\n\t\t\t\tn += (miniBlockSize * int(bitWidth)) / 8\n\t\t\t}\n\t\t}\n\n\t\tdst = dst[:n]\n\t}\n\n\treturn dst\n}\n\nfunc encodeBinaryPackedHeader(dst []byte, blockSize, numMiniBlocks, totalValues int, firstValue int64) (n int) {\n\tn += binary.PutUvarint(dst[n:], uint64(blockSize))\n\tn += binary.PutUvarint(dst[n:], uint64(numMiniBlocks))\n\tn += binary.PutUvarint(dst[n:], uint64(totalValues))\n\tn += binary.PutVarint(dst[n:], firstValue)\n\treturn n\n}\n\nfunc encodeBlockHeader(dst []byte, minDelta int64, bitWidths [numMiniBlocks]byte) (n int) {\n\tn += binary.PutVarint(dst, int64(minDelta))\n\tn += copy(dst[n:], bitWidths[:])\n\treturn n\n}\n\nfunc blockClearInt32(block *[blockSize]int32, blockLength int) {\n\tif blockLength < blockSize {\n\t\tclear := block[blockLength:]\n\t\tfor i := range clear {\n\t\t\tclear[i] = 0\n\t\t}\n\t}\n}\n\nfunc blockDeltaInt32(block *[blockSize]int32, lastValue int32) int32 {\n\tfor i, v := range block {\n\t\tblock[i], lastValue = v-lastValue, v\n\t}\n\treturn lastValue\n}\n\nfunc blockMinInt32(block *[blockSize]int32) int32 {\n\tmin := block[0]\n\tfor _, v := range block[1:] {\n\t\tif v < min {\n\t\t\tmin = v\n\t\t}\n\t}\n\treturn min\n}\n\nfunc blockSubInt32(block *[blockSize]int32, value int32) {\n\tfor i := range block {\n\t\tblock[i] -= value\n\t}\n}\n\nfunc blockBitWidthsInt32(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32) {\n\tfor i := range bitWidths 
{\n\t\tj := (i + 0) * miniBlockSize\n\t\tk := (i + 1) * miniBlockSize\n\t\tbitWidth := 0\n\n\t\tfor _, v := range block[j:k] {\n\t\t\tif n := bits.Len32(uint32(v)); n > bitWidth {\n\t\t\t\tbitWidth = n\n\t\t\t}\n\t\t}\n\n\t\tbitWidths[i] = byte(bitWidth)\n\t}\n}\n\nfunc blockClearInt64(block *[blockSize]int64, blockLength int) {\n\tif blockLength < blockSize {\n\t\tclear := block[blockLength:]\n\t\tfor i := range clear {\n\t\t\tclear[i] = 0\n\t\t}\n\t}\n}\n\nfunc blockDeltaInt64(block *[blockSize]int64, lastValue int64) int64 {\n\tfor i, v := range block {\n\t\tblock[i], lastValue = v-lastValue, v\n\t}\n\treturn lastValue\n}\n\nfunc blockMinInt64(block *[blockSize]int64) int64 {\n\tmin := block[0]\n\tfor _, v := range block[1:] {\n\t\tif v < min {\n\t\t\tmin = v\n\t\t}\n\t}\n\treturn min\n}\n\nfunc blockSubInt64(block *[blockSize]int64, value int64) {\n\tfor i := range block {\n\t\tblock[i] -= value\n\t}\n}\n\nfunc blockBitWidthsInt64(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64) {\n\tfor i := range bitWidths {\n\t\tj := (i + 0) * miniBlockSize\n\t\tk := (i + 1) * miniBlockSize\n\t\tbitWidth := 0\n\n\t\tfor _, v := range block[j:k] {\n\t\t\tif n := bits.Len64(uint64(v)); n > bitWidth {\n\t\t\t\tbitWidth = n\n\t\t\t}\n\t\t}\n\n\t\tbitWidths[i] = byte(bitWidth)\n\t}\n}\n\nfunc decodeInt32(dst, src []byte) ([]byte, []byte, error) {\n\tblockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src)\n\tif err != nil {\n\t\treturn dst, src, err\n\t}\n\tif totalValues == 0 {\n\t\treturn dst, src, nil\n\t}\n\tif firstValue < math.MinInt32 || firstValue > math.MaxInt32 {\n\t\treturn dst, src, fmt.Errorf(\"first value out of range: %d\", firstValue)\n\t}\n\n\twriteOffset := len(dst)\n\tdst = resize(dst, len(dst)+4*totalValues)\n\tout := unsafecast.BytesToInt32(dst)\n\tout[writeOffset] = int32(firstValue)\n\twriteOffset++\n\ttotalValues--\n\tlastValue := int32(firstValue)\n\tnumValuesInMiniBlock := blockSize / numMiniBlocks\n\n\tconst 
padding = 16\n\tminiBlockTemp := make([]byte, 256+padding)\n\n\tfor totalValues > 0 && len(src) > 0 {\n\t\tvar minDelta int64\n\t\tvar bitWidths []byte\n\t\tminDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks)\n\t\tif err != nil {\n\t\t\treturn dst, src, err\n\t\t}\n\n\t\tblockOffset := writeOffset\n\n\t\tfor _, bitWidth := range bitWidths {\n\t\t\tn := min(numValuesInMiniBlock, totalValues)\n\t\t\tif bitWidth != 0 {\n\t\t\t\tminiBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8\n\t\t\t\tminiBlockData := src\n\t\t\t\tif miniBlockSize <= len(src) {\n\t\t\t\t\tminiBlockData = miniBlockData[:miniBlockSize]\n\t\t\t\t}\n\t\t\t\tsrc = src[len(miniBlockData):]\n\t\t\t\tif cap(miniBlockData) < miniBlockSize+bitpack.PaddingInt32 {\n\t\t\t\t\tminiBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt32)\n\t\t\t\t\tminiBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)]\n\t\t\t\t}\n\t\t\t\tminiBlockData = miniBlockData[:miniBlockSize]\n\t\t\t\tbitpack.UnpackInt32(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth))\n\t\t\t}\n\t\t\twriteOffset += n\n\t\t\ttotalValues -= n\n\t\t\tif totalValues == 0 {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\n\t\tlastValue = decodeBlockInt32(out[blockOffset:writeOffset], int32(minDelta), lastValue)\n\t}\n\n\tif totalValues > 0 {\n\t\treturn dst, src, fmt.Errorf(\"%d missing values: %w\", totalValues, io.ErrUnexpectedEOF)\n\t}\n\n\treturn dst, src, nil\n}\n\nfunc decodeInt64(dst, src []byte) ([]byte, []byte, error) {\n\tblockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src)\n\tif err != nil {\n\t\treturn dst, src, err\n\t}\n\tif totalValues == 0 {\n\t\treturn dst, src, nil\n\t}\n\n\twriteOffset := len(dst)\n\tdst = resize(dst, len(dst)+8*totalValues)\n\tout := unsafecast.BytesToInt64(dst)\n\tout[writeOffset] = firstValue\n\twriteOffset++\n\ttotalValues--\n\tlastValue := firstValue\n\tnumValuesInMiniBlock := blockSize / numMiniBlocks\n\n\tconst padding 
= 16\n\tminiBlockTemp := make([]byte, 512+padding)\n\n\tfor totalValues > 0 && len(src) > 0 {\n\t\tvar minDelta int64\n\t\tvar bitWidths []byte\n\t\tminDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks)\n\t\tif err != nil {\n\t\t\treturn dst, src, err\n\t\t}\n\t\tblockOffset := writeOffset\n\n\t\tfor _, bitWidth := range bitWidths {\n\t\t\tn := min(numValuesInMiniBlock, totalValues)\n\t\t\tif bitWidth != 0 {\n\t\t\t\tminiBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8\n\t\t\t\tminiBlockData := src\n\t\t\t\tif miniBlockSize <= len(src) {\n\t\t\t\t\tminiBlockData = src[:miniBlockSize]\n\t\t\t\t}\n\t\t\t\tsrc = src[len(miniBlockData):]\n\t\t\t\tif len(miniBlockData) < miniBlockSize+bitpack.PaddingInt64 {\n\t\t\t\t\tminiBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt64)\n\t\t\t\t\tminiBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)]\n\t\t\t\t}\n\t\t\t\tminiBlockData = miniBlockData[:miniBlockSize]\n\t\t\t\tbitpack.UnpackInt64(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth))\n\t\t\t}\n\t\t\twriteOffset += n\n\t\t\ttotalValues -= n\n\t\t\tif totalValues == 0 {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\n\t\tlastValue = decodeBlockInt64(out[blockOffset:writeOffset], minDelta, lastValue)\n\t}\n\n\tif totalValues > 0 {\n\t\treturn dst, src, fmt.Errorf(\"%d missing values: %w\", totalValues, io.ErrUnexpectedEOF)\n\t}\n\n\treturn dst, src, nil\n}\n\nfunc decodeBinaryPackedHeader(src []byte) (blockSize, numMiniBlocks, totalValues int, firstValue int64, next []byte, err error) {\n\tu := uint64(0)\n\tn := 0\n\ti := 0\n\n\tif u, n, err = decodeUvarint(src[i:], \"block size\"); err != nil {\n\t\treturn\n\t}\n\ti += n\n\tblockSize = int(u)\n\n\tif u, n, err = decodeUvarint(src[i:], \"number of mini-blocks\"); err != nil {\n\t\treturn\n\t}\n\ti += n\n\tnumMiniBlocks = int(u)\n\n\tif u, n, err = decodeUvarint(src[i:], \"total values\"); err != nil {\n\t\treturn\n\t}\n\ti += n\n\ttotalValues = int(u)\n\n\tif 
firstValue, n, err = decodeVarint(src[i:], \"first value\"); err != nil {\n\t\treturn\n\t}\n\ti += n\n\n\tif numMiniBlocks == 0 {\n\t\terr = fmt.Errorf(\"invalid number of mini block (%d)\", numMiniBlocks)\n\t} else if (blockSize <= 0) || (blockSize%128) != 0 {\n\t\terr = fmt.Errorf(\"invalid block size is not a multiple of 128 (%d)\", blockSize)\n\t} else if blockSize > maxSupportedBlockSize {\n\t\terr = fmt.Errorf(\"invalid block size is too large (%d)\", blockSize)\n\t} else if miniBlockSize := blockSize / numMiniBlocks; (numMiniBlocks <= 0) || (miniBlockSize%32) != 0 {\n\t\terr = fmt.Errorf(\"invalid mini block size is not a multiple of 32 (%d)\", miniBlockSize)\n\t} else if totalValues < 0 {\n\t\terr = fmt.Errorf(\"invalid total number of values is negative (%d)\", totalValues)\n\t} else if totalValues > math.MaxInt32 {\n\t\terr = fmt.Errorf(\"too many values: %d\", totalValues)\n\t}\n\n\treturn blockSize, numMiniBlocks, totalValues, firstValue, src[i:], err\n}\n\nfunc decodeBinaryPackedBlock(src []byte, numMiniBlocks int) (minDelta int64, bitWidths, next []byte, err error) {\n\tminDelta, n, err := decodeVarint(src, \"min delta\")\n\tif err != nil {\n\t\treturn 0, nil, src, err\n\t}\n\tsrc = src[n:]\n\tif len(src) < numMiniBlocks {\n\t\tbitWidths, next = src, nil\n\t} else {\n\t\tbitWidths, next = src[:numMiniBlocks], src[numMiniBlocks:]\n\t}\n\treturn minDelta, bitWidths, next, nil\n}\n\nfunc decodeUvarint(buf []byte, what string) (u uint64, n int, err error) {\n\tu, n = binary.Uvarint(buf)\n\tif n == 0 {\n\t\treturn 0, 0, fmt.Errorf(\"decoding %s: %w\", what, io.ErrUnexpectedEOF)\n\t}\n\tif n < 0 {\n\t\treturn 0, 0, fmt.Errorf(\"overflow decoding %s (read %d/%d bytes)\", what, -n, len(buf))\n\t}\n\treturn u, n, nil\n}\n\nfunc decodeVarint(buf []byte, what string) (v int64, n int, err error) {\n\tv, n = binary.Varint(buf)\n\tif n == 0 {\n\t\treturn 0, 0, fmt.Errorf(\"decoding %s: %w\", what, io.ErrUnexpectedEOF)\n\t}\n\tif n < 0 {\n\t\treturn 0, 0, 
fmt.Errorf(\"overflow decoding %s (read %d/%d bytes)\", what, -n, len(buf))\n\t}\n\treturn v, n, nil\n}\n"
  },
  {
    "path": "encoding/delta/binary_packed_amd64.go",
    "content": "//go:build !purego\n\npackage delta\n\nimport (\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"golang.org/x/sys/cpu\"\n)\n\nfunc init() {\n\tif cpu.X86.HasAVX2 {\n\t\tencodeInt32 = encodeInt32AVX2\n\t\tencodeInt64 = encodeInt64AVX2\n\t}\n}\n\n//go:noescape\nfunc blockDeltaInt32AVX2(block *[blockSize]int32, lastValue int32) int32\n\n//go:noescape\nfunc blockMinInt32AVX2(block *[blockSize]int32) int32\n\n//go:noescape\nfunc blockSubInt32AVX2(block *[blockSize]int32, value int32)\n\n//go:noescape\nfunc blockBitWidthsInt32AVX2(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32)\n\n//go:noescape\nfunc encodeMiniBlockInt32Default(dst *byte, src *[miniBlockSize]int32, bitWidth uint)\n\n//go:noescape\nfunc encodeMiniBlockInt32x1bitAVX2(dst *byte, src *[miniBlockSize]int32)\n\n//go:noescape\nfunc encodeMiniBlockInt32x2bitsAVX2(dst *byte, src *[miniBlockSize]int32)\n\n//go:noescape\nfunc encodeMiniBlockInt32x3to16bitsAVX2(dst *byte, src *[miniBlockSize]int32, bitWidth uint)\n\n//go:noescape\nfunc encodeMiniBlockInt32x32bitsAVX2(dst *byte, src *[miniBlockSize]int32)\n\nfunc encodeMiniBlockInt32(dst []byte, src *[miniBlockSize]int32, bitWidth uint) {\n\tencodeMiniBlockInt32Default(&dst[0], src, bitWidth)\n}\n\nfunc encodeMiniBlockInt32AVX2(dst *byte, src *[miniBlockSize]int32, bitWidth uint) {\n\tswitch {\n\tcase bitWidth == 1:\n\t\tencodeMiniBlockInt32x1bitAVX2(dst, src)\n\tcase bitWidth == 2:\n\t\tencodeMiniBlockInt32x2bitsAVX2(dst, src)\n\tcase bitWidth == 32:\n\t\tencodeMiniBlockInt32x32bitsAVX2(dst, src)\n\tcase bitWidth <= 16:\n\t\tencodeMiniBlockInt32x3to16bitsAVX2(dst, src, bitWidth)\n\tdefault:\n\t\tencodeMiniBlockInt32Default(dst, src, bitWidth)\n\t}\n}\n\nfunc encodeInt32AVX2(dst []byte, src []int32) []byte {\n\ttotalValues := len(src)\n\tfirstValue := int32(0)\n\tif totalValues > 0 {\n\t\tfirstValue = src[0]\n\t}\n\n\tn := len(dst)\n\tdst = resize(dst, n+maxHeaderLength32)\n\tdst = dst[:n+encodeBinaryPackedHeader(dst[n:], 
blockSize, numMiniBlocks, totalValues, int64(firstValue))]\n\n\tif totalValues < 2 {\n\t\treturn dst\n\t}\n\n\tlastValue := firstValue\n\tfor i := 1; i < len(src); i += blockSize {\n\t\tblock := [blockSize]int32{}\n\t\tblockLength := copy(block[:], src[i:])\n\n\t\tlastValue = blockDeltaInt32AVX2(&block, lastValue)\n\t\tminDelta := blockMinInt32AVX2(&block)\n\t\tblockSubInt32AVX2(&block, minDelta)\n\t\tblockClearInt32(&block, blockLength)\n\n\t\tbitWidths := [numMiniBlocks]byte{}\n\t\tblockBitWidthsInt32AVX2(&bitWidths, &block)\n\n\t\tn := len(dst)\n\t\tdst = resize(dst, n+maxMiniBlockLength32+16)\n\t\tn += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths)\n\n\t\tfor i, bitWidth := range bitWidths {\n\t\t\tif bitWidth != 0 {\n\t\t\t\tminiBlock := (*[miniBlockSize]int32)(block[i*miniBlockSize:])\n\t\t\t\tencodeMiniBlockInt32AVX2(&dst[n], miniBlock, uint(bitWidth))\n\t\t\t\tn += (miniBlockSize * int(bitWidth)) / 8\n\t\t\t}\n\t\t}\n\n\t\tdst = dst[:n]\n\t}\n\n\treturn dst\n}\n\n//go:noescape\nfunc blockDeltaInt64AVX2(block *[blockSize]int64, lastValue int64) int64\n\n//go:noescape\nfunc blockMinInt64AVX2(block *[blockSize]int64) int64\n\n//go:noescape\nfunc blockSubInt64AVX2(block *[blockSize]int64, value int64)\n\n//go:noescape\nfunc blockBitWidthsInt64AVX2(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64)\n\n//go:noescape\nfunc encodeMiniBlockInt64Default(dst *byte, src *[miniBlockSize]int64, bitWidth uint)\n\n//go:noescape\nfunc encodeMiniBlockInt64x1bitAVX2(dst *byte, src *[miniBlockSize]int64)\n\n//go:noescape\nfunc encodeMiniBlockInt64x2bitsAVX2(dst *byte, src *[miniBlockSize]int64)\n\n//go:noescape\nfunc encodeMiniBlockInt64x64bitsAVX2(dst *byte, src *[miniBlockSize]int64)\n\nfunc encodeMiniBlockInt64(dst []byte, src *[miniBlockSize]int64, bitWidth uint) {\n\tencodeMiniBlockInt64Default(&dst[0], src, bitWidth)\n}\n\nfunc encodeMiniBlockInt64AVX2(dst *byte, src *[miniBlockSize]int64, bitWidth uint) {\n\tswitch {\n\tcase bitWidth == 
1:\n\t\tencodeMiniBlockInt64x1bitAVX2(dst, src)\n\tcase bitWidth == 2:\n\t\tencodeMiniBlockInt64x2bitsAVX2(dst, src)\n\tcase bitWidth == 64:\n\t\tencodeMiniBlockInt64x64bitsAVX2(dst, src)\n\tdefault:\n\t\tencodeMiniBlockInt64Default(dst, src, bitWidth)\n\t}\n}\n\nfunc encodeInt64AVX2(dst []byte, src []int64) []byte {\n\ttotalValues := len(src)\n\tfirstValue := int64(0)\n\tif totalValues > 0 {\n\t\tfirstValue = src[0]\n\t}\n\n\tn := len(dst)\n\tdst = resize(dst, n+maxHeaderLength64)\n\tdst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, int64(firstValue))]\n\n\tif totalValues < 2 {\n\t\treturn dst\n\t}\n\n\tlastValue := firstValue\n\tfor i := 1; i < len(src); i += blockSize {\n\t\tblock := [blockSize]int64{}\n\t\tblockLength := copy(block[:], src[i:])\n\n\t\tlastValue = blockDeltaInt64AVX2(&block, lastValue)\n\t\tminDelta := blockMinInt64AVX2(&block)\n\t\tblockSubInt64AVX2(&block, minDelta)\n\t\tblockClearInt64(&block, blockLength)\n\n\t\tbitWidths := [numMiniBlocks]byte{}\n\t\tblockBitWidthsInt64AVX2(&bitWidths, &block)\n\n\t\tn := len(dst)\n\t\tdst = resize(dst, n+maxMiniBlockLength64+16)\n\t\tn += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths)\n\n\t\tfor i, bitWidth := range bitWidths {\n\t\t\tif bitWidth != 0 {\n\t\t\t\tminiBlock := (*[miniBlockSize]int64)(block[i*miniBlockSize:])\n\t\t\t\tencodeMiniBlockInt64AVX2(&dst[n], miniBlock, uint(bitWidth))\n\t\t\t\tn += (miniBlockSize * int(bitWidth)) / 8\n\t\t\t}\n\t\t}\n\n\t\tdst = dst[:n]\n\t}\n\n\treturn dst\n}\n\n//go:noescape\nfunc decodeBlockInt32Default(dst []int32, minDelta, lastValue int32) int32\n\n//go:noescape\nfunc decodeBlockInt32AVX2(dst []int32, minDelta, lastValue int32) int32\n\nfunc decodeBlockInt32(dst []int32, minDelta, lastValue int32) int32 {\n\tswitch {\n\tcase cpu.X86.HasAVX2:\n\t\treturn decodeBlockInt32AVX2(dst, minDelta, lastValue)\n\tdefault:\n\t\treturn decodeBlockInt32Default(dst, minDelta, lastValue)\n\t}\n}\n\n//go:noescape\nfunc 
decodeMiniBlockInt32Default(dst []int32, src []uint32, bitWidth uint)\n\n//go:noescape\nfunc decodeMiniBlockInt32x1to16bitsAVX2(dst []int32, src []uint32, bitWidth uint)\n\n//go:noescape\nfunc decodeMiniBlockInt32x17to26bitsAVX2(dst []int32, src []uint32, bitWidth uint)\n\n//go:noescape\nfunc decodeMiniBlockInt32x27to31bitsAVX2(dst []int32, src []uint32, bitWidth uint)\n\nfunc decodeMiniBlockInt32(dst []int32, src []uint32, bitWidth uint) {\n\thasAVX2 := cpu.X86.HasAVX2\n\tswitch {\n\tcase hasAVX2 && bitWidth <= 16:\n\t\tdecodeMiniBlockInt32x1to16bitsAVX2(dst, src, bitWidth)\n\tcase hasAVX2 && bitWidth <= 26:\n\t\tdecodeMiniBlockInt32x17to26bitsAVX2(dst, src, bitWidth)\n\tcase hasAVX2 && bitWidth <= 31:\n\t\tdecodeMiniBlockInt32x27to31bitsAVX2(dst, src, bitWidth)\n\tcase bitWidth == 32:\n\t\tcopy(dst, unsafecast.Uint32ToInt32(src))\n\tdefault:\n\t\tdecodeMiniBlockInt32Default(dst, src, bitWidth)\n\t}\n}\n\n//go:noescape\nfunc decodeBlockInt64Default(dst []int64, minDelta, lastValue int64) int64\n\nfunc decodeBlockInt64(dst []int64, minDelta, lastValue int64) int64 {\n\treturn decodeBlockInt64Default(dst, minDelta, lastValue)\n}\n\n//go:noescape\nfunc decodeMiniBlockInt64Default(dst []int64, src []uint32, bitWidth uint)\n\nfunc decodeMiniBlockInt64(dst []int64, src []uint32, bitWidth uint) {\n\tswitch {\n\tcase bitWidth == 64:\n\t\tcopy(dst, unsafecast.Uint32ToInt64(src))\n\tdefault:\n\t\tdecodeMiniBlockInt64Default(dst, src, bitWidth)\n\t}\n}\n"
  },
  {
    "path": "encoding/delta/binary_packed_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define blockSize 128\n#define numMiniBlocks 4\n#define miniBlockSize 32\n\n// -----------------------------------------------------------------------------\n// 32 bits\n// -----------------------------------------------------------------------------\n\n#define deltaInt32AVX2x8(baseAddr) \\\n    VMOVDQU baseAddr, Y1    \\ // [0,1,2,3,4,5,6,7]\n    VPERMD Y1, Y3, Y2       \\ // [7,0,1,2,3,4,5,6]\n    VPBLENDD $1, Y0, Y2, Y2 \\ // [x,0,1,2,3,4,5,6]\n    VPSUBD Y2, Y1, Y2       \\ // [0,1,2,...] - [x,0,1,...]\n    VMOVDQU Y2, baseAddr    \\\n    VPERMD Y1, Y3, Y0\n\n// func blockDeltaInt32AVX2(block *[blockSize]int32, lastValue int32) int32\nTEXT ·blockDeltaInt32AVX2(SB), NOSPLIT, $0-20\n    MOVQ block+0(FP), AX\n    MOVL 4*blockSize-4(AX), CX\n    MOVL CX, ret+16(FP)\n\n    VPBROADCASTD lastValue+8(FP), Y0\n    VMOVDQU ·rotateLeft32(SB), Y3\n\n    XORQ SI, SI\nloop:\n    deltaInt32AVX2x8(0(AX)(SI*4))\n    deltaInt32AVX2x8(32(AX)(SI*4))\n    deltaInt32AVX2x8(64(AX)(SI*4))\n    deltaInt32AVX2x8(96(AX)(SI*4))\n    ADDQ $32, SI\n    CMPQ SI, $blockSize\n    JNE loop\n    VZEROUPPER\n    RET\n\n// func blockMinInt32AVX2(block *[blockSize]int32) int32\nTEXT ·blockMinInt32AVX2(SB), NOSPLIT, $0-12\n    MOVQ block+0(FP), AX\n    VPBROADCASTD (AX), Y15\n\n    VPMINSD 0(AX), Y15, Y0\n    VPMINSD 32(AX), Y15, Y1\n    VPMINSD 64(AX), Y15, Y2\n    VPMINSD 96(AX), Y15, Y3\n    VPMINSD 128(AX), Y15, Y4\n    VPMINSD 160(AX), Y15, Y5\n    VPMINSD 192(AX), Y15, Y6\n    VPMINSD 224(AX), Y15, Y7\n    VPMINSD 256(AX), Y15, Y8\n    VPMINSD 288(AX), Y15, Y9\n    VPMINSD 320(AX), Y15, Y10\n    VPMINSD 352(AX), Y15, Y11\n    VPMINSD 384(AX), Y15, Y12\n    VPMINSD 416(AX), Y15, Y13\n    VPMINSD 448(AX), Y15, Y14\n    VPMINSD 480(AX), Y15, Y15\n\n    VPMINSD Y1, Y0, Y0\n    VPMINSD Y3, Y2, Y2\n    VPMINSD Y5, Y4, Y4\n    VPMINSD Y7, Y6, Y6\n    VPMINSD Y9, Y8, Y8\n    VPMINSD Y11, Y10, Y10\n    VPMINSD Y13, Y12, Y12\n    VPMINSD 
Y15, Y14, Y14\n\n    VPMINSD Y2, Y0, Y0\n    VPMINSD Y6, Y4, Y4\n    VPMINSD Y10, Y8, Y8\n    VPMINSD Y14, Y12, Y12\n\n    VPMINSD Y4, Y0, Y0\n    VPMINSD Y12, Y8, Y8\n\n    VPMINSD Y8, Y0, Y0\n\n    VPERM2I128 $1, Y0, Y0, Y1\n    VPMINSD Y1, Y0, Y0\n\n    VPSHUFD $0b00011011, Y0, Y1\n    VPMINSD Y1, Y0, Y0\n    VZEROUPPER\n\n    MOVQ X0, CX\n    MOVL CX, BX\n    SHRQ $32, CX\n    CMPL CX, BX\n    CMOVLLT CX, BX\n    MOVL BX, ret+8(FP)\n    RET\n\n#define subInt32AVX2x32(baseAddr, offset) \\\n    VMOVDQU offset+0(baseAddr), Y1      \\\n    VMOVDQU offset+32(baseAddr), Y2     \\\n    VMOVDQU offset+64(baseAddr), Y3     \\\n    VMOVDQU offset+96(baseAddr), Y4     \\\n    VPSUBD Y0, Y1, Y1                   \\\n    VPSUBD Y0, Y2, Y2                   \\\n    VPSUBD Y0, Y3, Y3                   \\\n    VPSUBD Y0, Y4, Y4                   \\\n    VMOVDQU Y1, offset+0(baseAddr)      \\\n    VMOVDQU Y2, offset+32(baseAddr)     \\\n    VMOVDQU Y3, offset+64(baseAddr)     \\\n    VMOVDQU Y4, offset+96(baseAddr)\n\n// func blockSubInt32AVX2(block *[blockSize]int32, value int32)\nTEXT ·blockSubInt32AVX2(SB), NOSPLIT, $0-12\n    MOVQ block+0(FP), AX\n    VPBROADCASTD value+8(FP), Y0\n    subInt32AVX2x32(AX, 0)\n    subInt32AVX2x32(AX, 128)\n    subInt32AVX2x32(AX, 256)\n    subInt32AVX2x32(AX, 384)\n    VZEROUPPER\n    RET\n\n// func blockBitWidthsInt32AVX2(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32)\nTEXT ·blockBitWidthsInt32AVX2(SB), NOSPLIT, $0-16\n    MOVQ bitWidths+0(FP), AX\n    MOVQ block+8(FP), BX\n\n    // AVX2 only has signed comparisons (and min/max), we emulate working on\n    // unsigned values by adding -2^31 to the values. 
Y5 is a vector of -2^31\n    // used to offset 8 packed 32 bits integers in other YMM registers where\n    // the block data are loaded.\n    VPCMPEQD Y5, Y5, Y5\n    VPSLLD $31, Y5, Y5\n\n    XORQ DI, DI\nloop:\n    VPBROADCASTD (BX), Y0 // max\n    VPADDD Y5, Y0, Y0\n\n    VMOVDQU (BX), Y1\n    VMOVDQU 32(BX), Y2\n    VMOVDQU 64(BX), Y3\n    VMOVDQU 96(BX), Y4\n\n    VPADDD Y5, Y1, Y1\n    VPADDD Y5, Y2, Y2\n    VPADDD Y5, Y3, Y3\n    VPADDD Y5, Y4, Y4\n\n    VPMAXSD Y2, Y1, Y1\n    VPMAXSD Y4, Y3, Y3\n    VPMAXSD Y3, Y1, Y1\n    VPMAXSD Y1, Y0, Y0\n\n    VPERM2I128 $1, Y0, Y0, Y1\n    VPMAXSD Y1, Y0, Y0\n\n    VPSHUFD $0b00011011, Y0, Y1\n    VPMAXSD Y1, Y0, Y0\n    VPSUBD Y5, Y0, Y0\n\n    MOVQ X0, CX\n    MOVL CX, DX\n    SHRQ $32, CX\n    CMPL CX, DX\n    CMOVLHI CX, DX\n\n    LZCNTL DX, DX\n    NEGL DX\n    ADDL $32, DX\n    MOVB DX, (AX)(DI*1)\n\n    ADDQ $128, BX\n    INCQ DI\n    CMPQ DI, $numMiniBlocks\n    JNE loop\n    VZEROUPPER\n    RET\n\n// encodeMiniBlockInt32Default is the generic implementation of the algorithm to\n// pack 32 bit integers into values of a given bit width (<=32).\n//\n// This algorithm is much slower than the vectorized versions, but is useful\n// as a reference implementation to run the tests against, and as fallback when\n// the code runs on a CPU which does not support the AVX2 instruction set.\n//\n// func encodeMiniBlockInt32Default(dst *byte, src *[miniBlockSize]int32, bitWidth uint)\nTEXT ·encodeMiniBlockInt32Default(SB), NOSPLIT, $0-24\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n    MOVQ bitWidth+16(FP), R9\n\n    XORQ DI, DI // bitOffset\n    XORQ SI, SI\nloop:\n    MOVQ DI, CX\n    MOVQ DI, DX\n\n    ANDQ $0b11111, CX // bitOffset % 32\n    SHRQ $5, DX       // bitOffset / 32\n\n    MOVLQZX (BX)(SI*4), R8\n    SHLQ CX, R8\n    ORQ R8, (AX)(DX*4)\n\n    ADDQ R9, DI\n    INCQ SI\n    CMPQ SI, $miniBlockSize\n    JNE loop\n    RET\n\n// encodeMiniBlockInt32x1bitAVX2 packs 32 bit integers into 1 bit values in the\n// 
output buffer.\n//\n// The algorithm uses MOVMSKPS to extract the 8 relevant bits from the 8 values\n// packed in YMM registers, then combines 4 of these into a 32 bit word which\n// then gets written to the output. The result is 32 bits because each mini\n// block has 32 values (the block size is 128 and there are 4 mini blocks per\n// block).\n//\n// func encodeMiniBlockInt32x1bitAVX2(dst *byte, src *[miniBlockSize]int32)\nTEXT ·encodeMiniBlockInt32x1bitAVX2(SB), NOSPLIT, $0-16\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n\n    VMOVDQU 0(BX), Y0\n    VMOVDQU 32(BX), Y1\n    VMOVDQU 64(BX), Y2\n    VMOVDQU 96(BX), Y3\n\n    VPSLLD $31, Y0, Y0\n    VPSLLD $31, Y1, Y1\n    VPSLLD $31, Y2, Y2\n    VPSLLD $31, Y3, Y3\n\n    VMOVMSKPS Y0, R8\n    VMOVMSKPS Y1, R9\n    VMOVMSKPS Y2, R10\n    VMOVMSKPS Y3, R11\n\n    SHLL $8, R9\n    SHLL $16, R10\n    SHLL $24, R11\n\n    ORL R9, R8\n    ORL R10, R8\n    ORL R11, R8\n    MOVL R8, (AX)\n    VZEROUPPER\n    RET\n\n// encodeMiniBlockInt32x2bitsAVX2 implements an algorithm for packing 32 bit\n// integers into 2 bit values.\n//\n// The algorithm is derived from the one employed in encodeMiniBlockInt32x1bitAVX2\n// but needs to perform a bit extra work since MOVMSKPS can only extract one bit\n// per packed integer of each YMM vector. 
We run two passes to extract the two\n// bits needed to compose each item of the result, and merge the values by\n// interleaving the first and second bits with PDEP.\n//\n// func encodeMiniBlockInt32x2bitsAVX2(dst *byte, src *[miniBlockSize]int32)\nTEXT ·encodeMiniBlockInt32x2bitsAVX2(SB), NOSPLIT, $0-16\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n\n    VMOVDQU 0(BX), Y0\n    VMOVDQU 32(BX), Y1\n    VMOVDQU 64(BX), Y2\n    VMOVDQU 96(BX), Y3\n\n    VPSLLD $31, Y0, Y4\n    VPSLLD $31, Y1, Y5\n    VPSLLD $31, Y2, Y6\n    VPSLLD $31, Y3, Y7\n\n    VMOVMSKPS Y4, R8\n    VMOVMSKPS Y5, R9\n    VMOVMSKPS Y6, R10\n    VMOVMSKPS Y7, R11\n\n    SHLQ $8, R9\n    SHLQ $16, R10\n    SHLQ $24, R11\n    ORQ R9, R8\n    ORQ R10, R8\n    ORQ R11, R8\n\n    MOVQ $0x5555555555555555, DX // 0b010101...\n    PDEPQ DX, R8, R8\n\n    VPSLLD $30, Y0, Y8\n    VPSLLD $30, Y1, Y9\n    VPSLLD $30, Y2, Y10\n    VPSLLD $30, Y3, Y11\n\n    VMOVMSKPS Y8, R12\n    VMOVMSKPS Y9, R13\n    VMOVMSKPS Y10, R14\n    VMOVMSKPS Y11, R15\n\n    SHLQ $8, R13\n    SHLQ $16, R14\n    SHLQ $24, R15\n    ORQ R13, R12\n    ORQ R14, R12\n    ORQ R15, R12\n\n    MOVQ $0xAAAAAAAAAAAAAAAA, DI // 0b101010...\n    PDEPQ DI, R12, R12\n\n    ORQ R12, R8\n    MOVQ R8, (AX)\n    VZEROUPPER\n    RET\n\n// encodeMiniBlockInt32x32bitsAVX2 is a specialization of the bit packing logic\n// for 32 bit integers when the output bit width is also 32, in which case a\n// simple copy of the mini block to the output buffer produces the result.\n//\n// func encodeMiniBlockInt32x32bitsAVX2(dst *byte, src *[miniBlockSize]int32)\nTEXT ·encodeMiniBlockInt32x32bitsAVX2(SB), NOSPLIT, $0-16\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n    VMOVDQU 0(BX), Y0\n    VMOVDQU 32(BX), Y1\n    VMOVDQU 64(BX), Y2\n    VMOVDQU 96(BX), Y3\n    VMOVDQU Y0, 0(AX)\n    VMOVDQU Y1, 32(AX)\n    VMOVDQU Y2, 64(AX)\n    VMOVDQU Y3, 96(AX)\n    VZEROUPPER\n    RET\n\n// encodeMiniBlockInt32x3to16bitsAVX2 is the algorithm used to bit-pack 32 bit\n// 
integers into values of width 3 to 16 bits.\n//\n// This function is a small overhead due to having to initialize registers with\n// values that depend on the bit width. We measured this cost at ~10% throughput\n// in synthetic benchmarks compared to generating constant shifts and offsets\n// using a macro. Using a single function rather than generating one for each\n// bit width has the benefit of reducing the code size, which in practice can\n// also yield benefits like reducing CPU cache misses. Not using a macro also\n// has other advantages like providing accurate line number of stack traces and\n// enabling the use of breakpoints when debugging. Overall, this approach seemed\n// to be the right trade off between performance and maintainability.\n//\n// The algorithm treats chunks of 8 values in 4 iterations to process all 32\n// values of the mini block. Writes to the output buffer are aligned on 128 bits\n// since we may write up to 128 bits (8 x 16 bits). Padding is therefore\n// required in the output buffer to avoid triggering a segfault.\n// The encodeInt32AVX2 method adds enough padding when sizing the output buffer\n// to account for this requirement.\n//\n// We leverage the two lanes of YMM registers to work on two sets of 4 values\n// (in the sequence of VMOVDQU/VPSHUFD, VPAND, VPSLLQ, VPOR), resulting in having\n// two sets of bit-packed values in the lower 64 bits of each YMM lane.\n// The upper lane is then permuted into a lower lane to merge the two results,\n// which may not be aligned on byte boundaries so we shift the lower and upper\n// bits and compose two sets of 128 bits sequences (VPSLLQ, VPSRLQ, VBLENDPD),\n// merge them and write the 16 bytes result to the output buffer.\nTEXT ·encodeMiniBlockInt32x3to16bitsAVX2(SB), NOSPLIT, $0-24\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n    MOVQ bitWidth+16(FP), CX\n\n    VPBROADCASTQ bitWidth+16(FP), Y6 // [1*bitWidth...]\n    VPSLLQ $1, Y6, Y7                // [2*bitWidth...]\n    VPADDQ 
Y6, Y7, Y8                // [3*bitWidth...]\n    VPSLLQ $2, Y6, Y9                // [4*bitWidth...]\n\n    VPBROADCASTQ sixtyfour<>(SB), Y10\n    VPSUBQ Y6, Y10, Y11 // [64-1*bitWidth...]\n    VPSUBQ Y9, Y10, Y12 // [64-4*bitWidth...]\n    VPCMPEQQ Y4, Y4, Y4\n    VPSRLVQ Y11, Y4, Y4\n\n    VPXOR Y5, Y5, Y5\n    XORQ SI, SI\nloop:\n    VMOVDQU (BX)(SI*4), Y0\n    VPSHUFD $0b01010101, Y0, Y1\n    VPSHUFD $0b10101010, Y0, Y2\n    VPSHUFD $0b11111111, Y0, Y3\n\n    VPAND Y4, Y0, Y0\n    VPAND Y4, Y1, Y1\n    VPAND Y4, Y2, Y2\n    VPAND Y4, Y3, Y3\n\n    VPSLLVQ Y6, Y1, Y1\n    VPSLLVQ Y7, Y2, Y2\n    VPSLLVQ Y8, Y3, Y3\n\n    VPOR Y1, Y0, Y0\n    VPOR Y3, Y2, Y2\n    VPOR Y2, Y0, Y0\n\n    VPERMQ $0b00001010, Y0, Y1\n\n    VPSLLVQ X9, X1, X2\n    VPSRLQ X12, X1, X3\n    VBLENDPD $0b10, X3, X2, X1\n    VBLENDPD $0b10, X5, X0, X0\n    VPOR X1, X0, X0\n\n    VMOVDQU X0, (AX)\n\n    ADDQ CX, AX\n    ADDQ $8, SI\n    CMPQ SI, $miniBlockSize\n    JNE loop\n    VZEROUPPER\n    RET\n\nGLOBL sixtyfour<>(SB), RODATA|NOPTR, $32\nDATA sixtyfour<>+0(SB)/8, $64\nDATA sixtyfour<>+8(SB)/8, $64\nDATA sixtyfour<>+16(SB)/8, $64\nDATA sixtyfour<>+24(SB)/8, $64\n\n// func decodeBlockInt32Default(dst []int32, minDelta, lastValue int32) int32\nTEXT ·decodeBlockInt32Default(SB), NOSPLIT, $0-36\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVLQZX minDelta+24(FP), CX\n    MOVLQZX lastValue+28(FP), DX\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVL (AX)(SI*4), DI\n    ADDL CX, DI\n    ADDL DI, DX\n    MOVL DX, (AX)(SI*4)\n    INCQ SI\ntest:\n    CMPQ SI, BX\n    JNE loop\ndone:\n    MOVL DX, ret+32(FP)\n    RET\n\n// func decodeBlockInt32AVX2(dst []int32, minDelta, lastValue int32) int32\nTEXT ·decodeBlockInt32AVX2(SB), NOSPLIT, $0-36\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVLQZX minDelta+24(FP), CX\n    MOVLQZX lastValue+28(FP), DX\n    XORQ SI, SI\n\n    CMPQ BX, $8\n    JB test\n\n    MOVQ BX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n\n    VPXOR X1, 
X1, X1\n    MOVQ CX, X0\n    MOVQ DX, X1\n    VPBROADCASTD X0, Y0\nloopAVX2:\n    VMOVDQU (AX)(SI*4), Y2\n    VPADDD Y0, Y2, Y2 // Y2[:] += minDelta\n    VPADDD Y1, Y2, Y2 // Y2[0] += lastValue\n\n    VPSLLDQ $4, Y2, Y3\n    VPADDD Y3, Y2, Y2\n\n    VPSLLDQ $8, Y2, Y3\n    VPADDD Y3, Y2, Y2\n\n    VPSHUFD $0xFF, X2, X1\n    VPERM2I128 $1, Y2, Y2, Y3\n    VPADDD X1, X3, X3\n\n    VMOVDQU X2, (AX)(SI*4)\n    VMOVDQU X3, 16(AX)(SI*4)\n    VPSRLDQ $12, X3, X1 // lastValue\n\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loopAVX2\n    VZEROUPPER\n    MOVQ X1, DX\n    JMP test\nloop:\n    MOVL (AX)(SI*4), DI\n    ADDL CX, DI\n    ADDL DI, DX\n    MOVL DX, (AX)(SI*4)\n    INCQ SI\ntest:\n    CMPQ SI, BX\n    JNE loop\ndone:\n    MOVL DX, ret+32(FP)\n    RET\n\n// -----------------------------------------------------------------------------\n// 64 bits\n// -----------------------------------------------------------------------------\n\n#define deltaInt64AVX2x4(baseAddr)  \\\n    VMOVDQU baseAddr, Y1            \\ // [0,1,2,3]\n    VPERMQ $0b10010011, Y1, Y2      \\ // [3,0,1,2]\n    VPBLENDD $3, Y0, Y2, Y2         \\ // [x,0,1,2]\n    VPSUBQ Y2, Y1, Y2               \\ // [0,1,2,3] - [x,0,1,2]\n    VMOVDQU Y2, baseAddr            \\\n    VPERMQ $0b10010011, Y1, Y0\n\n// func blockDeltaInt64AVX2(block *[blockSize]int64, lastValue int64) int64\nTEXT ·blockDeltaInt64AVX2(SB), NOSPLIT, $0-24\n    MOVQ block+0(FP), AX\n    MOVQ 8*blockSize-8(AX), CX\n    MOVQ CX, ret+16(FP)\n\n    VPBROADCASTQ lastValue+8(FP), Y0\n    XORQ SI, SI\nloop:\n    deltaInt64AVX2x4((AX)(SI*8))\n    deltaInt64AVX2x4(32(AX)(SI*8))\n    deltaInt64AVX2x4(64(AX)(SI*8))\n    deltaInt64AVX2x4(96(AX)(SI*8))\n    ADDQ $16, SI\n    CMPQ SI, $blockSize\n    JNE loop\n    VZEROUPPER\n    RET\n\n// vpminsq is an emulation of the AVX-512 VPMINSQ instruction with AVX2.\n#define vpminsq(ones, tmp, arg2, arg1, ret) \\\n    VPCMPGTQ arg1, arg2, tmp \\\n    VPBLENDVB tmp, arg1, arg2, ret\n\n// func blockMinInt64AVX2(block 
*[blockSize]int64) int64\nTEXT ·blockMinInt64AVX2(SB), NOSPLIT, $0-16\n    MOVQ block+0(FP), AX\n    XORQ SI, SI\n    VPCMPEQQ Y9, Y9, Y9 // ones\n    VPBROADCASTQ (AX), Y0\nloop:\n    VMOVDQU 0(AX)(SI*8), Y1\n    VMOVDQU 32(AX)(SI*8), Y2\n    VMOVDQU 64(AX)(SI*8), Y3\n    VMOVDQU 96(AX)(SI*8), Y4\n    VMOVDQU 128(AX)(SI*8), Y5\n    VMOVDQU 160(AX)(SI*8), Y6\n    VMOVDQU 192(AX)(SI*8), Y7\n    VMOVDQU 224(AX)(SI*8), Y8\n\n    vpminsq(Y9, Y10, Y0, Y1, Y1)\n    vpminsq(Y9, Y11, Y0, Y2, Y2)\n    vpminsq(Y9, Y12, Y0, Y3, Y3)\n    vpminsq(Y9, Y13, Y0, Y4, Y4)\n    vpminsq(Y9, Y14, Y0, Y5, Y5)\n    vpminsq(Y9, Y15, Y0, Y6, Y6)\n    vpminsq(Y9, Y10, Y0, Y7, Y7)\n    vpminsq(Y9, Y11, Y0, Y8, Y8)\n\n    vpminsq(Y9, Y12, Y2, Y1, Y1)\n    vpminsq(Y9, Y13, Y4, Y3, Y3)\n    vpminsq(Y9, Y14, Y6, Y5, Y5)\n    vpminsq(Y9, Y15, Y8, Y7, Y7)\n\n    vpminsq(Y9, Y10, Y3, Y1, Y1)\n    vpminsq(Y9, Y11, Y7, Y5, Y5)\n    vpminsq(Y9, Y12, Y5, Y1, Y0)\n\n    ADDQ $32, SI\n    CMPQ SI, $blockSize\n    JNE loop\n\n    VPERM2I128 $1, Y0, Y0, Y1\n    vpminsq(Y9, Y10, Y1, Y0, Y0)\n\n    MOVQ X0, CX\n    VPEXTRQ $1, X0, BX\n    CMPQ CX, BX\n    CMOVQLT CX, BX\n    MOVQ BX, ret+8(FP)\n    VZEROUPPER\n    RET\n\n#define subInt64AVX2x32(baseAddr, offset) \\\n    VMOVDQU offset+0(baseAddr), Y1      \\\n    VMOVDQU offset+32(baseAddr), Y2     \\\n    VMOVDQU offset+64(baseAddr), Y3     \\\n    VMOVDQU offset+96(baseAddr), Y4     \\\n    VMOVDQU offset+128(baseAddr), Y5    \\\n    VMOVDQU offset+160(baseAddr), Y6    \\\n    VMOVDQU offset+192(baseAddr), Y7    \\\n    VMOVDQU offset+224(baseAddr), Y8    \\\n    VPSUBQ Y0, Y1, Y1                   \\\n    VPSUBQ Y0, Y2, Y2                   \\\n    VPSUBQ Y0, Y3, Y3                   \\\n    VPSUBQ Y0, Y4, Y4                   \\\n    VPSUBQ Y0, Y5, Y5                   \\\n    VPSUBQ Y0, Y6, Y6                   \\\n    VPSUBQ Y0, Y7, Y7                   \\\n    VPSUBQ Y0, Y8, Y8                   \\\n    VMOVDQU Y1, offset+0(baseAddr)      \\\n    
VMOVDQU Y2, offset+32(baseAddr)     \\\n    VMOVDQU Y3, offset+64(baseAddr)     \\\n    VMOVDQU Y4, offset+96(baseAddr)     \\\n    VMOVDQU Y5, offset+128(baseAddr)    \\\n    VMOVDQU Y6, offset+160(baseAddr)    \\\n    VMOVDQU Y7, offset+192(baseAddr)    \\\n    VMOVDQU Y8, offset+224(baseAddr)\n\n// func blockSubInt64AVX2(block *[blockSize]int64, value int64)\nTEXT ·blockSubInt64AVX2(SB), NOSPLIT, $0-16\n    MOVQ block+0(FP), AX\n    VPBROADCASTQ value+8(FP), Y0\n    subInt64AVX2x32(AX, 0)\n    subInt64AVX2x32(AX, 256)\n    subInt64AVX2x32(AX, 512)\n    subInt64AVX2x32(AX, 768)\n    VZEROUPPER\n    RET\n\n// vpmaxsq is an emulation of the AVX-512 VPMAXSQ instruction with AVX2.\n#define vpmaxsq(tmp, arg2, arg1, ret) \\\n    VPCMPGTQ arg2, arg1, tmp \\\n    VPBLENDVB tmp, arg1, arg2, ret\n\n// func blockBitWidthsInt64AVX2(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64)\nTEXT ·blockBitWidthsInt64AVX2(SB), NOSPLIT, $0-16\n    MOVQ bitWidths+0(FP), AX\n    MOVQ block+8(FP), BX\n\n    // AVX2 only has signed comparisons (and min/max), we emulate working on\n    // unsigned values by adding -2^63 to the values. 
Y9 is a vector of -2^63\n    // used to offset 4 packed 64 bits integers in other YMM registers where\n    // the block data are loaded.\n    VPCMPEQQ Y9, Y9, Y9\n    VPSLLQ $63, Y9, Y9\n\n    XORQ DI, DI\nloop:\n    VPBROADCASTQ (BX), Y0 // max\n    VPADDQ Y9, Y0, Y0\n\n    VMOVDQU (BX), Y1\n    VMOVDQU 32(BX), Y2\n    VMOVDQU 64(BX), Y3\n    VMOVDQU 96(BX), Y4\n    VMOVDQU 128(BX), Y5\n    VMOVDQU 160(BX), Y6\n    VMOVDQU 192(BX), Y7\n    VMOVDQU 224(BX), Y8\n\n    VPADDQ Y9, Y1, Y1\n    VPADDQ Y9, Y2, Y2\n    VPADDQ Y9, Y3, Y3\n    VPADDQ Y9, Y4, Y4\n    VPADDQ Y9, Y5, Y5\n    VPADDQ Y9, Y6, Y6\n    VPADDQ Y9, Y7, Y7\n    VPADDQ Y9, Y8, Y8\n\n    vpmaxsq(Y10, Y2, Y1, Y1)\n    vpmaxsq(Y11, Y4, Y3, Y3)\n    vpmaxsq(Y12, Y6, Y5, Y5)\n    vpmaxsq(Y13, Y8, Y7, Y7)\n\n    vpmaxsq(Y10, Y3, Y1, Y1)\n    vpmaxsq(Y11, Y7, Y5, Y5)\n    vpmaxsq(Y12, Y5, Y1, Y1)\n    vpmaxsq(Y13, Y1, Y0, Y0)\n\n    VPERM2I128 $1, Y0, Y0, Y1\n    vpmaxsq(Y10, Y1, Y0, Y0)\n    VPSUBQ Y9, Y0, Y0\n\n    MOVQ X0, CX\n    VPEXTRQ $1, X0, DX\n    CMPQ CX, DX\n    CMOVQHI CX, DX\n\n    LZCNTQ DX, DX\n    NEGQ DX\n    ADDQ $64, DX\n    MOVB DX, (AX)(DI*1)\n\n    ADDQ $256, BX\n    INCQ DI\n    CMPQ DI, $numMiniBlocks\n    JNE loop\n    VZEROUPPER\n    RET\n\n// encodeMiniBlockInt64Default is the generic implementation of the algorithm to\n// pack 64 bit integers into values of a given bit width (<=64).\n//\n// This algorithm is much slower than the vectorized versions, but is useful\n// as a reference implementation to run the tests against, and as fallback when\n// the code runs on a CPU which does not support the AVX2 instruction set.\n//\n// func encodeMiniBlockInt64Default(dst *byte, src *[miniBlockSize]int64, bitWidth uint)\nTEXT ·encodeMiniBlockInt64Default(SB), NOSPLIT, $0-24\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n    MOVQ bitWidth+16(FP), R10\n\n    XORQ R11, R11 // zero\n    XORQ DI, DI // bitOffset\n    XORQ SI, SI\nloop:\n    MOVQ DI, CX\n    MOVQ DI, DX\n\n    ANDQ $0b111111, CX 
// bitOffset % 64\n    SHRQ $6, DX        // bitOffset / 64\n\n    MOVQ (BX)(SI*8), R8\n    MOVQ R8, R9\n    SHLQ CX, R8\n    NEGQ CX\n    ADDQ $64, CX\n    SHRQ CX, R9\n    CMPQ CX, $64\n    CMOVQEQ R11, R9 // needed because shifting by more than 63 is undefined\n\n    ORQ R8, 0(AX)(DX*8)\n    ORQ R9, 8(AX)(DX*8)\n\n    ADDQ R10, DI\n    INCQ SI\n    CMPQ SI, $miniBlockSize\n    JNE loop\n    RET\n\n// func encodeMiniBlockInt64x1bitAVX2(dst *byte, src *[miniBlockSize]int64)\nTEXT ·encodeMiniBlockInt64x1bitAVX2(SB), NOSPLIT, $0-16\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n\n    VMOVDQU 0(BX), Y0\n    VMOVDQU 32(BX), Y1\n    VMOVDQU 64(BX), Y2\n    VMOVDQU 96(BX), Y3\n    VMOVDQU 128(BX), Y4\n    VMOVDQU 160(BX), Y5\n    VMOVDQU 192(BX), Y6\n    VMOVDQU 224(BX), Y7\n\n    VPSLLQ $63, Y0, Y0\n    VPSLLQ $63, Y1, Y1\n    VPSLLQ $63, Y2, Y2\n    VPSLLQ $63, Y3, Y3\n    VPSLLQ $63, Y4, Y4\n    VPSLLQ $63, Y5, Y5\n    VPSLLQ $63, Y6, Y6\n    VPSLLQ $63, Y7, Y7\n\n    VMOVMSKPD Y0, R8\n    VMOVMSKPD Y1, R9\n    VMOVMSKPD Y2, R10\n    VMOVMSKPD Y3, R11\n    VMOVMSKPD Y4, R12\n    VMOVMSKPD Y5, R13\n    VMOVMSKPD Y6, R14\n    VMOVMSKPD Y7, R15\n\n    SHLL $4, R9\n    SHLL $8, R10\n    SHLL $12, R11\n    SHLL $16, R12\n    SHLL $20, R13\n    SHLL $24, R14\n    SHLL $28, R15\n\n    ORL R9, R8\n    ORL R11, R10\n    ORL R13, R12\n    ORL R15, R14\n    ORL R10, R8\n    ORL R14, R12\n    ORL R12, R8\n\n    MOVL R8, (AX)\n    VZEROUPPER\n    RET\n\n// func encodeMiniBlockInt64x2bitsAVX2(dst *byte, src *[miniBlockSize]int64)\nTEXT ·encodeMiniBlockInt64x2bitsAVX2(SB), NOSPLIT, $0-16\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n\n    VMOVDQU 0(BX), Y8\n    VMOVDQU 32(BX), Y9\n    VMOVDQU 64(BX), Y10\n    VMOVDQU 96(BX), Y11\n    VMOVDQU 128(BX), Y12\n    VMOVDQU 160(BX), Y13\n    VMOVDQU 192(BX), Y14\n    VMOVDQU 224(BX), Y15\n\n    VPSLLQ $63, Y8, Y0\n    VPSLLQ $63, Y9, Y1\n    VPSLLQ $63, Y10, Y2\n    VPSLLQ $63, Y11, Y3\n    VPSLLQ $63, Y12, Y4\n    VPSLLQ $63, Y13, 
Y5\n    VPSLLQ $63, Y14, Y6\n    VPSLLQ $63, Y15, Y7\n\n    VMOVMSKPD Y0, R8\n    VMOVMSKPD Y1, R9\n    VMOVMSKPD Y2, R10\n    VMOVMSKPD Y3, R11\n    VMOVMSKPD Y4, R12\n    VMOVMSKPD Y5, R13\n    VMOVMSKPD Y6, R14\n    VMOVMSKPD Y7, R15\n\n    SHLQ $4, R9\n    SHLQ $8, R10\n    SHLQ $12, R11\n    SHLQ $16, R12\n    SHLQ $20, R13\n    SHLQ $24, R14\n    SHLQ $28, R15\n\n    ORQ R9, R8\n    ORQ R11, R10\n    ORQ R13, R12\n    ORQ R15, R14\n    ORQ R10, R8\n    ORQ R14, R12\n    ORQ R12, R8\n\n    MOVQ $0x5555555555555555, CX // 0b010101...\n    PDEPQ CX, R8, CX\n\n    VPSLLQ $62, Y8, Y8\n    VPSLLQ $62, Y9, Y9\n    VPSLLQ $62, Y10, Y10\n    VPSLLQ $62, Y11, Y11\n    VPSLLQ $62, Y12, Y12\n    VPSLLQ $62, Y13, Y13\n    VPSLLQ $62, Y14, Y14\n    VPSLLQ $62, Y15, Y15\n\n    VMOVMSKPD Y8, R8\n    VMOVMSKPD Y9, R9\n    VMOVMSKPD Y10, R10\n    VMOVMSKPD Y11, R11\n    VMOVMSKPD Y12, R12\n    VMOVMSKPD Y13, R13\n    VMOVMSKPD Y14, R14\n    VMOVMSKPD Y15, R15\n\n    SHLQ $4, R9\n    SHLQ $8, R10\n    SHLQ $12, R11\n    SHLQ $16, R12\n    SHLQ $20, R13\n    SHLQ $24, R14\n    SHLQ $28, R15\n\n    ORQ R9, R8\n    ORQ R11, R10\n    ORQ R13, R12\n    ORQ R15, R14\n    ORQ R10, R8\n    ORQ R14, R12\n    ORQ R12, R8\n\n    MOVQ $0xAAAAAAAAAAAAAAAA, DX // 0b101010...\n    PDEPQ DX, R8, DX\n    ORQ DX, CX\n    MOVQ CX, (AX)\n    VZEROUPPER\n    RET\n\n// func encodeMiniBlockInt64x64bitsAVX2(dst *byte, src *[miniBlockSize]int64)\nTEXT ·encodeMiniBlockInt64x64bitsAVX2(SB), NOSPLIT, $0-16\n    MOVQ dst+0(FP), AX\n    MOVQ src+8(FP), BX\n    VMOVDQU 0(BX), Y0\n    VMOVDQU 32(BX), Y1\n    VMOVDQU 64(BX), Y2\n    VMOVDQU 96(BX), Y3\n    VMOVDQU 128(BX), Y4\n    VMOVDQU 160(BX), Y5\n    VMOVDQU 192(BX), Y6\n    VMOVDQU 224(BX), Y7\n    VMOVDQU Y0, 0(AX)\n    VMOVDQU Y1, 32(AX)\n    VMOVDQU Y2, 64(AX)\n    VMOVDQU Y3, 96(AX)\n    VMOVDQU Y4, 128(AX)\n    VMOVDQU Y5, 160(AX)\n    VMOVDQU Y6, 192(AX)\n    VMOVDQU Y7, 224(AX)\n    VZEROUPPER\n    RET\n\n// func decodeBlockInt64Default(dst 
[]int64, minDelta, lastValue int64) int64\nTEXT ·decodeBlockInt64Default(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVQ minDelta+24(FP), CX\n    MOVQ lastValue+32(FP), DX\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ (AX)(SI*8), DI\n    ADDQ CX, DI\n    ADDQ DI, DX\n    MOVQ DX, (AX)(SI*8)\n    INCQ SI\ntest:\n    CMPQ SI, BX\n    JNE loop\ndone:\n    MOVQ DX, ret+40(FP)\n    RET\n"
  },
  {
    "path": "encoding/delta/binary_packed_amd64_test.go",
    "content": "//go:build amd64 && !purego\n\npackage delta\n\nimport (\n\t\"testing\"\n\n\t\"golang.org/x/sys/cpu\"\n)\n\nfunc requireAVX2(t testing.TB) {\n\tif !cpu.X86.HasAVX2 {\n\t\tt.Skip(\"CPU does not support AVX2\")\n\t}\n}\n\nfunc TestBlockDeltaInt32AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockDeltaInt32(t, blockDeltaInt32AVX2)\n}\n\nfunc TestBlockMinInt32AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockMinInt32(t, blockMinInt32AVX2)\n}\n\nfunc TestBlockSubInt32AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockSubInt32(t, blockSubInt32AVX2)\n}\n\nfunc TestBlockBitWidthsInt32AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockBitWidthsInt32(t, blockBitWidthsInt32AVX2)\n}\n\nfunc TestEncodeMiniBlockInt32AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestEncodeMiniBlockInt32(t,\n\t\tfunc(dst []byte, src *[miniBlockSize]int32, bitWidth uint) {\n\t\t\tencodeMiniBlockInt32AVX2(&dst[0], src, bitWidth)\n\t\t},\n\t)\n}\n\nfunc BenchmarkBlockDeltaInt32AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockDeltaInt32(b, blockDeltaInt32AVX2)\n}\n\nfunc BenchmarkBlockMinInt32AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockMinInt32(b, blockMinInt32AVX2)\n}\n\nfunc BenchmarkBlockSubInt32AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockSubInt32(b, blockSubInt32AVX2)\n}\n\nfunc BenchmarkBlockBitWidthsInt32AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockBitWidthsInt32(b, blockBitWidthsInt32AVX2)\n}\n\nfunc BenchmarkEncodeMiniBlockInt32AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkEncodeMiniBlockInt32(b,\n\t\tfunc(dst []byte, src *[miniBlockSize]int32, bitWidth uint) {\n\t\t\tencodeMiniBlockInt32AVX2(&dst[0], src, bitWidth)\n\t\t},\n\t)\n}\n\nfunc TestBlockDeltaInt64AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockDeltaInt64(t, blockDeltaInt64AVX2)\n}\n\nfunc TestBlockMinInt64AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockMinInt64(t, blockMinInt64AVX2)\n}\n\nfunc TestBlockSubInt64AVX2(t *testing.T) 
{\n\trequireAVX2(t)\n\ttestBlockSubInt64(t, blockSubInt64AVX2)\n}\n\nfunc TestBlockBitWidthsInt64AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestBlockBitWidthsInt64(t, blockBitWidthsInt64AVX2)\n}\n\nfunc TestEncodeMiniBlockInt64AVX2(t *testing.T) {\n\trequireAVX2(t)\n\ttestEncodeMiniBlockInt64(t,\n\t\tfunc(dst []byte, src *[miniBlockSize]int64, bitWidth uint) {\n\t\t\tencodeMiniBlockInt64AVX2(&dst[0], src, bitWidth)\n\t\t},\n\t)\n}\n\nfunc BenchmarkBlockDeltaInt64AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockDeltaInt64(b, blockDeltaInt64AVX2)\n}\n\nfunc BenchmarkBlockMinInt64AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockMinInt64(b, blockMinInt64AVX2)\n}\n\nfunc BenchmarkBlockSubInt64AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockSubInt64(b, blockSubInt64AVX2)\n}\n\nfunc BenchmarkBlockBitWidthsInt64AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkBlockBitWidthsInt64(b, blockBitWidthsInt64AVX2)\n}\n\nfunc BenchmarkEncodeMiniBlockInt64AVX2(b *testing.B) {\n\trequireAVX2(b)\n\tbenchmarkEncodeMiniBlockInt64(b,\n\t\tfunc(dst []byte, src *[miniBlockSize]int64, bitWidth uint) {\n\t\t\tencodeMiniBlockInt64AVX2(&dst[0], src, bitWidth)\n\t\t},\n\t)\n}\n"
  },
  {
    "path": "encoding/delta/binary_packed_purego.go",
    "content": "//go:build purego || !amd64\n\npackage delta\n\nimport (\n\t\"encoding/binary\"\n)\n\nfunc encodeMiniBlockInt32(dst []byte, src *[miniBlockSize]int32, bitWidth uint) {\n\tbitMask := uint32(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor _, value := range src {\n\t\ti := bitOffset / 32\n\t\tj := bitOffset % 32\n\n\t\tlo := binary.LittleEndian.Uint32(dst[(i+0)*4:])\n\t\thi := binary.LittleEndian.Uint32(dst[(i+1)*4:])\n\n\t\tlo |= (uint32(value) & bitMask) << j\n\t\thi |= (uint32(value) >> (32 - j))\n\n\t\tbinary.LittleEndian.PutUint32(dst[(i+0)*4:], lo)\n\t\tbinary.LittleEndian.PutUint32(dst[(i+1)*4:], hi)\n\n\t\tbitOffset += bitWidth\n\t}\n}\n\nfunc encodeMiniBlockInt64(dst []byte, src *[miniBlockSize]int64, bitWidth uint) {\n\tbitMask := uint64(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor _, value := range src {\n\t\ti := bitOffset / 64\n\t\tj := bitOffset % 64\n\n\t\tlo := binary.LittleEndian.Uint64(dst[(i+0)*8:])\n\t\thi := binary.LittleEndian.Uint64(dst[(i+1)*8:])\n\n\t\tlo |= (uint64(value) & bitMask) << j\n\t\thi |= (uint64(value) >> (64 - j))\n\n\t\tbinary.LittleEndian.PutUint64(dst[(i+0)*8:], lo)\n\t\tbinary.LittleEndian.PutUint64(dst[(i+1)*8:], hi)\n\n\t\tbitOffset += bitWidth\n\t}\n}\n\nfunc decodeBlockInt32(block []int32, minDelta, lastValue int32) int32 {\n\tfor i := range block {\n\t\tblock[i] += minDelta\n\t\tblock[i] += lastValue\n\t\tlastValue = block[i]\n\t}\n\treturn lastValue\n}\n\nfunc decodeBlockInt64(block []int64, minDelta, lastValue int64) int64 {\n\tfor i := range block {\n\t\tblock[i] += minDelta\n\t\tblock[i] += lastValue\n\t\tlastValue = block[i]\n\t}\n\treturn lastValue\n}\n\nfunc decodeMiniBlockInt32(dst []int32, src []uint32, bitWidth uint) {\n\tbitMask := uint32(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor n := range dst {\n\t\ti := bitOffset / 32\n\t\tj := bitOffset % 32\n\t\td := (src[i] & (bitMask << j)) >> j\n\t\tif j+bitWidth > 32 {\n\t\t\tk := 32 - j\n\t\t\td |= (src[i+1] & (bitMask >> k)) << 
k\n\t\t}\n\t\tdst[n] = int32(d)\n\t\tbitOffset += bitWidth\n\t}\n}\n\nfunc decodeMiniBlockInt64(dst []int64, src []uint32, bitWidth uint) {\n\tbitMask := uint64(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor n := range dst {\n\t\ti := bitOffset / 32\n\t\tj := bitOffset % 32\n\t\td := (uint64(src[i]) & (bitMask << j)) >> j\n\t\tif j+bitWidth > 32 {\n\t\t\tk := 32 - j\n\t\t\td |= (uint64(src[i+1]) & (bitMask >> k)) << k\n\t\t\tif j+bitWidth > 64 {\n\t\t\t\tk := 64 - j\n\t\t\t\td |= (uint64(src[i+2]) & (bitMask >> k)) << k\n\t\t\t}\n\t\t}\n\t\tdst[n] = int64(d)\n\t\tbitOffset += bitWidth\n\t}\n}\n"
  },
  {
    "path": "encoding/delta/binary_packed_test.go",
    "content": "package delta\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"math/bits\"\n\t\"testing\"\n)\n\nfunc maxLen32(miniBlock []int32) (maxLen int) {\n\tfor _, v := range miniBlock {\n\t\tif n := bits.Len32(uint32(v)); n > maxLen {\n\t\t\tmaxLen = n\n\t\t}\n\t}\n\treturn maxLen\n}\n\nfunc maxLen64(miniBlock []int64) (maxLen int) {\n\tfor _, v := range miniBlock {\n\t\tif n := bits.Len64(uint64(v)); n > maxLen {\n\t\t\tmaxLen = n\n\t\t}\n\t}\n\treturn maxLen\n}\n\nfunc TestBlockDeltaInt32(t *testing.T) {\n\ttestBlockDeltaInt32(t, blockDeltaInt32)\n}\n\nfunc testBlockDeltaInt32(t *testing.T, f func(*[blockSize]int32, int32) int32) {\n\tt.Helper()\n\tblock := [blockSize]int32{}\n\tfor i := range block {\n\t\tblock[i] = int32(2 * (i + 1))\n\t}\n\tlastValue := f(&block, 0)\n\tif lastValue != 2*blockSize {\n\t\tt.Errorf(\"wrong last block value: want=%d got=%d\", 2*blockSize, lastValue)\n\t}\n\tfor i := range block {\n\t\tj := int32(2 * (i + 0))\n\t\tk := int32(2 * (i + 1))\n\t\tif block[i] != (k - j) {\n\t\t\tt.Errorf(\"wrong block delta at index %d: want=%d got=%d\", i, k-j, block[i])\n\t\t}\n\t}\n}\n\nfunc TestBlockMinInt32(t *testing.T) {\n\ttestBlockMinInt32(t, blockMinInt32)\n}\n\nfunc testBlockMinInt32(t *testing.T, f func(*[blockSize]int32) int32) {\n\tt.Helper()\n\tblock := [blockSize]int32{}\n\tfor i := range block {\n\t\tblock[i] = blockSize - int32(i)\n\t}\n\tif min := f(&block); min != 1 {\n\t\tt.Errorf(\"wrong min block value: want=1 got=%d\", min)\n\t}\n}\n\nfunc TestBlockSubInt32(t *testing.T) {\n\ttestBlockSubInt32(t, blockSubInt32)\n}\n\nfunc testBlockSubInt32(t *testing.T, f func(*[blockSize]int32, int32)) {\n\tt.Helper()\n\tblock := [blockSize]int32{}\n\tfor i := range block {\n\t\tblock[i] = int32(i)\n\t}\n\tf(&block, 1)\n\tfor i := range block {\n\t\tif block[i] != int32(i-1) {\n\t\t\tt.Errorf(\"wrong block value at index %d: want=%d got=%d\", i, i-1, block[i])\n\t\t}\n\t}\n}\n\nfunc TestBlockBitWidthsInt32(t *testing.T) 
{\n\ttestBlockBitWidthsInt32(t, blockBitWidthsInt32)\n}\n\nfunc testBlockBitWidthsInt32(t *testing.T, f func(*[numMiniBlocks]byte, *[blockSize]int32)) {\n\tt.Helper()\n\tbitWidths := [numMiniBlocks]byte{}\n\tblock := [blockSize]int32{}\n\tfor i := range block {\n\t\tblock[i] = int32(i)\n\t}\n\tf(&bitWidths, &block)\n\n\twant := [numMiniBlocks]byte{}\n\tfor i := range want {\n\t\tj := (i + 0) * miniBlockSize\n\t\tk := (i + 1) * miniBlockSize\n\t\twant[i] = byte(maxLen32(block[j:k]))\n\t}\n\n\tif bitWidths != want {\n\t\tt.Errorf(\"wrong bit widths: want=%d got=%d\", want, bitWidths)\n\t}\n}\n\nfunc TestEncodeMiniBlockInt32(t *testing.T) {\n\ttestEncodeMiniBlockInt32(t, encodeMiniBlockInt32)\n}\n\nfunc testEncodeMiniBlockInt32(t *testing.T, f func([]byte, *[miniBlockSize]int32, uint)) {\n\tt.Helper()\n\tfor bitWidth := uint(1); bitWidth <= 32; bitWidth++ {\n\t\tt.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(t *testing.T) {\n\t\t\tgot := [4*miniBlockSize + 32]byte{}\n\t\t\tsrc := [miniBlockSize]int32{}\n\t\t\tfor i := range src {\n\t\t\t\tsrc[i] = int32(i) & int32((1<<bitWidth)-1)\n\t\t\t}\n\n\t\t\twant := [4*miniBlockSize + 32]byte{}\n\t\t\tbitOffset := uint(0)\n\n\t\t\tfor _, bits := range src {\n\t\t\t\tfor b := uint(0); b < bitWidth; b++ {\n\t\t\t\t\tx := bitOffset / 8\n\t\t\t\t\ty := bitOffset % 8\n\t\t\t\t\twant[x] |= byte(((bits >> b) & 1) << y)\n\t\t\t\t\tbitOffset++\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tf(got[:], &src, bitWidth)\n\t\t\tn := (miniBlockSize * bitWidth) / 8\n\n\t\t\tif !bytes.Equal(want[:n], got[:n]) {\n\t\t\t\tt.Errorf(\"output mismatch: want=%08x got=%08x\", want[:n], got[:n])\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkBlockDeltaInt32(b *testing.B) {\n\tbenchmarkBlockDeltaInt32(b, blockDeltaInt32)\n}\n\nfunc benchmarkBlockDeltaInt32(b *testing.B, f func(*[blockSize]int32, int32) int32) {\n\tb.SetBytes(4 * blockSize)\n\tblock := [blockSize]int32{}\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f(&block, 0)\n\t}\n}\n\nfunc BenchmarkBlockMinInt32(b *testing.B) 
{\n\tbenchmarkBlockMinInt32(b, blockMinInt32)\n}\n\nfunc benchmarkBlockMinInt32(b *testing.B, f func(*[blockSize]int32) int32) {\n\tb.SetBytes(4 * blockSize)\n\tblock := [blockSize]int32{}\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f(&block)\n\t}\n}\n\nfunc BenchmarkBlockSubInt32(b *testing.B) {\n\tbenchmarkBlockSubInt32(b, blockSubInt32)\n}\n\nfunc benchmarkBlockSubInt32(b *testing.B, f func(*[blockSize]int32, int32)) {\n\tb.SetBytes(4 * blockSize)\n\tblock := [blockSize]int32{}\n\tfor i := 0; i < b.N; i++ {\n\t\tf(&block, 42)\n\t}\n}\n\nfunc BenchmarkBlockBitWidthsInt32(b *testing.B) {\n\tbenchmarkBlockBitWidthsInt32(b, blockBitWidthsInt32)\n}\n\nfunc benchmarkBlockBitWidthsInt32(b *testing.B, f func(*[numMiniBlocks]byte, *[blockSize]int32)) {\n\tb.SetBytes(4 * blockSize)\n\tbitWidths := [numMiniBlocks]byte{}\n\tblock := [blockSize]int32{}\n\tfor i := 0; i < b.N; i++ {\n\t\tf(&bitWidths, &block)\n\t}\n}\n\nfunc BenchmarkEncodeMiniBlockInt32(b *testing.B) {\n\tbenchmarkEncodeMiniBlockInt32(b, encodeMiniBlockInt32)\n}\n\nfunc benchmarkEncodeMiniBlockInt32(b *testing.B, f func([]byte, *[miniBlockSize]int32, uint)) {\n\tfor bitWidth := uint(1); bitWidth <= 32; bitWidth++ {\n\t\tb.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(b *testing.B) {\n\t\t\tb.SetBytes(4 * miniBlockSize)\n\t\t\tdst := [4*miniBlockSize + 32]byte{}\n\t\t\tsrc := [miniBlockSize]int32{}\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\tf(dst[:], &src, bitWidth)\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc TestBlockDeltaInt64(t *testing.T) {\n\ttestBlockDeltaInt64(t, blockDeltaInt64)\n}\n\nfunc testBlockDeltaInt64(t *testing.T, f func(*[blockSize]int64, int64) int64) {\n\tt.Helper()\n\tblock := [blockSize]int64{}\n\tfor i := range block {\n\t\tblock[i] = int64(2 * (i + 1))\n\t}\n\tlastValue := f(&block, 0)\n\tif lastValue != 2*blockSize {\n\t\tt.Errorf(\"wrong last block value: want=%d got=%d\", 2*blockSize, lastValue)\n\t}\n\tfor i := range block {\n\t\tj := int64(2 * (i + 0))\n\t\tk := int64(2 * (i + 1))\n\t\tif 
block[i] != (k - j) {\n\t\t\tt.Errorf(\"wrong block delta at index %d: want=%d got=%d\", i, k-j, block[i])\n\t\t}\n\t}\n}\n\nfunc TestBlockMinInt64(t *testing.T) {\n\ttestBlockMinInt64(t, blockMinInt64)\n}\n\nfunc testBlockMinInt64(t *testing.T, f func(*[blockSize]int64) int64) {\n\tblock := [blockSize]int64{}\n\tfor i := range block {\n\t\tblock[i] = blockSize - int64(i)\n\t}\n\tif min := f(&block); min != 1 {\n\t\tt.Errorf(\"wrong min block value: want=1 got=%d\", min)\n\t}\n}\n\nfunc TestBlockSubInt64(t *testing.T) {\n\ttestBlockSubInt64(t, blockSubInt64)\n}\n\nfunc testBlockSubInt64(t *testing.T, f func(*[blockSize]int64, int64)) {\n\tblock := [blockSize]int64{}\n\tfor i := range block {\n\t\tblock[i] = int64(i)\n\t}\n\tf(&block, 1)\n\tfor i := range block {\n\t\tif block[i] != int64(i-1) {\n\t\t\tt.Errorf(\"wrong block value at index %d: want=%d got=%d\", i, i-1, block[i])\n\t\t}\n\t}\n}\n\nfunc TestBlockBitWidthsInt64(t *testing.T) {\n\ttestBlockBitWidthsInt64(t, blockBitWidthsInt64)\n}\n\nfunc testBlockBitWidthsInt64(t *testing.T, f func(*[numMiniBlocks]byte, *[blockSize]int64)) {\n\tbitWidths := [numMiniBlocks]byte{}\n\tblock := [blockSize]int64{}\n\tfor i := range block {\n\t\tblock[i] = int64(i)\n\t}\n\tf(&bitWidths, &block)\n\n\twant := [numMiniBlocks]byte{}\n\tfor i := range want {\n\t\tj := (i + 0) * miniBlockSize\n\t\tk := (i + 1) * miniBlockSize\n\t\twant[i] = byte(maxLen64(block[j:k]))\n\t}\n\n\tif bitWidths != want {\n\t\tt.Errorf(\"wrong bit widths: want=%d got=%d\", want, bitWidths)\n\t}\n}\n\nfunc TestEncodeMiniBlockInt64(t *testing.T) {\n\ttestEncodeMiniBlockInt64(t, encodeMiniBlockInt64)\n}\n\nfunc testEncodeMiniBlockInt64(t *testing.T, f func([]byte, *[miniBlockSize]int64, uint)) {\n\tfor bitWidth := uint(1); bitWidth <= 64; bitWidth++ {\n\t\tt.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(t *testing.T) {\n\t\t\tgot := [8*miniBlockSize + 64]byte{}\n\t\t\tsrc := [miniBlockSize]int64{}\n\t\t\tfor i := range src {\n\t\t\t\tsrc[i] = int64(i) & 
int64((1<<bitWidth)-1)\n\t\t\t}\n\n\t\t\twant := [8*miniBlockSize + 64]byte{}\n\t\t\tbitOffset := uint(0)\n\n\t\t\tfor _, bits := range src {\n\t\t\t\tfor b := uint(0); b < bitWidth; b++ {\n\t\t\t\t\tx := bitOffset / 8\n\t\t\t\t\ty := bitOffset % 8\n\t\t\t\t\twant[x] |= byte(((bits >> b) & 1) << y)\n\t\t\t\t\tbitOffset++\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tf(got[:], &src, bitWidth)\n\t\t\tn := (miniBlockSize * bitWidth) / 8\n\n\t\t\tif !bytes.Equal(want[:n], got[:n]) {\n\t\t\t\tt.Errorf(\"output mismatch: want=%08x got=%08x\", want[:n], got[:n])\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkBlockDeltaInt64(b *testing.B) {\n\tbenchmarkBlockDeltaInt64(b, blockDeltaInt64)\n}\n\nfunc benchmarkBlockDeltaInt64(b *testing.B, f func(*[blockSize]int64, int64) int64) {\n\tb.SetBytes(8 * blockSize)\n\tblock := [blockSize]int64{}\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f(&block, 0)\n\t}\n}\n\nfunc BenchmarkBlockMinInt64(b *testing.B) {\n\tbenchmarkBlockMinInt64(b, blockMinInt64)\n}\n\nfunc benchmarkBlockMinInt64(b *testing.B, f func(*[blockSize]int64) int64) {\n\tb.SetBytes(8 * blockSize)\n\tblock := [blockSize]int64{}\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f(&block)\n\t}\n}\n\nfunc BenchmarkBlockSubInt64(b *testing.B) {\n\tbenchmarkBlockSubInt64(b, blockSubInt64)\n}\n\nfunc benchmarkBlockSubInt64(b *testing.B, f func(*[blockSize]int64, int64)) {\n\tb.SetBytes(8 * blockSize)\n\tblock := [blockSize]int64{}\n\tfor i := 0; i < b.N; i++ {\n\t\tf(&block, 42)\n\t}\n}\n\nfunc BenchmarkBlockBitWidthsInt64(b *testing.B) {\n\tbenchmarkBlockBitWidthsInt64(b, blockBitWidthsInt64)\n}\n\nfunc benchmarkBlockBitWidthsInt64(b *testing.B, f func(*[numMiniBlocks]byte, *[blockSize]int64)) {\n\tb.SetBytes(8 * blockSize)\n\tbitWidths := [numMiniBlocks]byte{}\n\tblock := [blockSize]int64{}\n\tfor i := 0; i < b.N; i++ {\n\t\tf(&bitWidths, &block)\n\t}\n}\n\nfunc BenchmarkEncodeMiniBlockInt64(b *testing.B) {\n\tbenchmarkEncodeMiniBlockInt64(b, encodeMiniBlockInt64)\n}\n\nfunc benchmarkEncodeMiniBlockInt64(b 
*testing.B, f func([]byte, *[miniBlockSize]int64, uint)) {\n\tfor bitWidth := uint(1); bitWidth <= 64; bitWidth++ {\n\t\tb.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(b *testing.B) {\n\t\t\tb.SetBytes(8 * miniBlockSize)\n\t\t\tdst := [8*miniBlockSize + 64]byte{}\n\t\t\tsrc := [miniBlockSize]int64{}\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\tf(dst[:], &src, bitWidth)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "encoding/delta/byte_array.go",
    "content": "package delta\n\nimport (\n\t\"bytes\"\n\t\"sort\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nconst (\n\tmaxLinearSearchPrefixLength = 64 // arbitrary\n)\n\ntype ByteArrayEncoding struct {\n\tencoding.NotSupported\n}\n\nfunc (e *ByteArrayEncoding) String() string {\n\treturn \"DELTA_BYTE_ARRAY\"\n}\n\nfunc (e *ByteArrayEncoding) Encoding() format.Encoding {\n\treturn format.DeltaByteArray\n}\n\nfunc (e *ByteArrayEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {\n\tprefix := getInt32Buffer()\n\tdefer putInt32Buffer(prefix)\n\n\tlength := getInt32Buffer()\n\tdefer putInt32Buffer(length)\n\n\ttotalSize := 0\n\tif len(offsets) > 0 {\n\t\tlastValue := ([]byte)(nil)\n\t\tbaseOffset := offsets[0]\n\n\t\tfor _, endOffset := range offsets[1:] {\n\t\t\tv := src[baseOffset:endOffset:endOffset]\n\t\t\tn := int(endOffset - baseOffset)\n\t\t\tp := 0\n\t\t\tbaseOffset = endOffset\n\n\t\t\tif len(v) <= maxLinearSearchPrefixLength {\n\t\t\t\tp = linearSearchPrefixLength(lastValue, v)\n\t\t\t} else {\n\t\t\t\tp = binarySearchPrefixLength(lastValue, v)\n\t\t\t}\n\n\t\t\tprefix.values = append(prefix.values, int32(p))\n\t\t\tlength.values = append(length.values, int32(n-p))\n\t\t\tlastValue = v\n\t\t\ttotalSize += n - p\n\t\t}\n\t}\n\n\tdst = dst[:0]\n\tdst = encodeInt32(dst, prefix.values)\n\tdst = encodeInt32(dst, length.values)\n\tdst = resize(dst, len(dst)+totalSize)\n\n\tif len(offsets) > 0 {\n\t\tb := dst[len(dst)-totalSize:]\n\t\ti := int(offsets[0])\n\t\tj := 0\n\n\t\t_ = length.values[:len(prefix.values)]\n\n\t\tfor k, p := range prefix.values {\n\t\t\tn := p + length.values[k]\n\t\t\tj += copy(b[j:], src[i+int(p):i+int(n)])\n\t\t\ti += int(n)\n\t\t}\n\t}\n\n\treturn dst, nil\n}\n\nfunc (e *ByteArrayEncoding) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {\n\t// The parquet specs say that this encoding is only supported for 
BYTE_ARRAY\n\t// values, but the reference Java implementation appears to support\n\t// FIXED_LEN_BYTE_ARRAY as well:\n\t// https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java#L211\n\tif size < 0 || size > encoding.MaxFixedLenByteArraySize {\n\t\treturn dst[:0], encoding.Error(e, encoding.ErrInvalidArgument)\n\t}\n\tif (len(src) % size) != 0 {\n\t\treturn dst[:0], encoding.ErrEncodeInvalidInputSize(e, \"FIXED_LEN_BYTE_ARRAY\", len(src))\n\t}\n\n\tprefix := getInt32Buffer()\n\tdefer putInt32Buffer(prefix)\n\n\tlength := getInt32Buffer()\n\tdefer putInt32Buffer(length)\n\n\ttotalSize := 0\n\tlastValue := ([]byte)(nil)\n\n\tfor i := size; i <= len(src); i += size {\n\t\tv := src[i-size : i : i]\n\t\tp := linearSearchPrefixLength(lastValue, v)\n\t\tn := size - p\n\t\tprefix.values = append(prefix.values, int32(p))\n\t\tlength.values = append(length.values, int32(n))\n\t\tlastValue = v\n\t\ttotalSize += n\n\t}\n\n\tdst = dst[:0]\n\tdst = encodeInt32(dst, prefix.values)\n\tdst = encodeInt32(dst, length.values)\n\tdst = resize(dst, len(dst)+totalSize)\n\n\tb := dst[len(dst)-totalSize:]\n\ti := 0\n\tj := 0\n\n\tfor _, p := range prefix.values {\n\t\tj += copy(b[j:], src[i+int(p):i+size])\n\t\ti += size\n\t}\n\n\treturn dst, nil\n}\n\nfunc (e *ByteArrayEncoding) DecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error) {\n\tdst, offsets = dst[:0], offsets[:0]\n\n\tprefix := getInt32Buffer()\n\tdefer putInt32Buffer(prefix)\n\n\tsuffix := getInt32Buffer()\n\tdefer putInt32Buffer(suffix)\n\n\tvar err error\n\tsrc, err = prefix.decode(src)\n\tif err != nil {\n\t\treturn dst, offsets, e.wrapf(\"decoding prefix lengths: %w\", err)\n\t}\n\tsrc, err = suffix.decode(src)\n\tif err != nil {\n\t\treturn dst, offsets, e.wrapf(\"decoding suffix lengths: %w\", err)\n\t}\n\tif len(prefix.values) != len(suffix.values) {\n\t\treturn dst, offsets, 
e.wrap(errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values)))\n\t}\n\treturn decodeByteArray(dst, src, prefix.values, suffix.values, offsets)\n}\n\nfunc (e *ByteArrayEncoding) DecodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {\n\tdst = dst[:0]\n\n\tif size < 0 || size > encoding.MaxFixedLenByteArraySize {\n\t\treturn dst, e.wrap(encoding.ErrInvalidArgument)\n\t}\n\n\tprefix := getInt32Buffer()\n\tdefer putInt32Buffer(prefix)\n\n\tsuffix := getInt32Buffer()\n\tdefer putInt32Buffer(suffix)\n\n\tvar err error\n\tsrc, err = prefix.decode(src)\n\tif err != nil {\n\t\treturn dst, e.wrapf(\"decoding prefix lengths: %w\", err)\n\t}\n\tsrc, err = suffix.decode(src)\n\tif err != nil {\n\t\treturn dst, e.wrapf(\"decoding suffix lengths: %w\", err)\n\t}\n\tif len(prefix.values) != len(suffix.values) {\n\t\treturn dst, e.wrap(errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values)))\n\t}\n\treturn decodeFixedLenByteArray(dst[:0], src, size, prefix.values, suffix.values)\n}\n\nfunc (e *ByteArrayEncoding) EstimateDecodeByteArraySize(src []byte) int {\n\tlength := getInt32Buffer()\n\tdefer putInt32Buffer(length)\n\tsrc, _ = length.decode(src)\n\tsum := int(length.sum())\n\tlength.decode(src)\n\treturn sum + int(length.sum())\n}\n\nfunc (e *ByteArrayEncoding) wrap(err error) error {\n\tif err != nil {\n\t\terr = encoding.Error(e, err)\n\t}\n\treturn err\n}\n\nfunc (e *ByteArrayEncoding) wrapf(msg string, args ...interface{}) error {\n\treturn encoding.Errorf(e, msg, args...)\n}\n\nfunc linearSearchPrefixLength(base, data []byte) (n int) {\n\tfor n < len(base) && n < len(data) && base[n] == data[n] {\n\t\tn++\n\t}\n\treturn n\n}\n\nfunc binarySearchPrefixLength(base, data []byte) int {\n\tn := len(base)\n\tif n > len(data) {\n\t\tn = len(data)\n\t}\n\treturn sort.Search(n, func(i int) bool {\n\t\treturn !bytes.Equal(base[:i+1], data[:i+1])\n\t})\n}\n"
  },
  {
    "path": "encoding/delta/byte_array_amd64.go",
    "content": "//go:build !purego\n\npackage delta\n\nimport (\n\t\"golang.org/x/sys/cpu\"\n)\n\n//go:noescape\nfunc validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, ok bool)\n\nfunc validatePrefixAndSuffixLengthValues(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, err error) {\n\tif cpu.X86.HasAVX2 {\n\t\ttotalPrefixLength, totalSuffixLength, ok := validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix, maxLength)\n\t\tif ok {\n\t\t\treturn totalPrefixLength, totalSuffixLength, nil\n\t\t}\n\t}\n\n\tlastValueLength := 0\n\n\tfor i := range prefix {\n\t\tp := int(prefix[i])\n\t\tn := int(suffix[i])\n\t\tif p < 0 {\n\t\t\terr = errInvalidNegativePrefixLength(p)\n\t\t\treturn\n\t\t}\n\t\tif n < 0 {\n\t\t\terr = errInvalidNegativeValueLength(n)\n\t\t\treturn\n\t\t}\n\t\tif p > lastValueLength {\n\t\t\terr = errPrefixLengthOutOfBounds(p, lastValueLength)\n\t\t\treturn\n\t\t}\n\t\ttotalPrefixLength += p\n\t\ttotalSuffixLength += n\n\t\tlastValueLength = p + n\n\t}\n\n\tif totalSuffixLength > maxLength {\n\t\terr = errValueLengthOutOfBounds(totalSuffixLength, maxLength)\n\t\treturn\n\t}\n\n\treturn totalPrefixLength, totalSuffixLength, nil\n}\n\n//go:noescape\nfunc decodeByteArrayOffsets(offsets []uint32, prefix, suffix []int32)\n\n//go:noescape\nfunc decodeByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int\n\nfunc decodeByteArray(dst, src []byte, prefix, suffix []int32, offsets []uint32) ([]byte, []uint32, error) {\n\ttotalPrefixLength, totalSuffixLength, err := validatePrefixAndSuffixLengthValues(prefix, suffix, len(src))\n\tif err != nil {\n\t\treturn dst, offsets, err\n\t}\n\n\ttotalLength := totalPrefixLength + totalSuffixLength\n\tdst = resizeNoMemclr(dst, totalLength+padding)\n\n\tif size := len(prefix) + 1; cap(offsets) < size {\n\t\toffsets = make([]uint32, size)\n\t} else {\n\t\toffsets = offsets[:size]\n\t}\n\n\t_ = prefix[:len(suffix)]\n\t_ = 
suffix[:len(prefix)]\n\tdecodeByteArrayOffsets(offsets, prefix, suffix)\n\n\tvar lastValue []byte\n\tvar i int\n\tvar j int\n\n\tif cpu.X86.HasAVX2 && len(src) > padding {\n\t\tk := len(suffix)\n\t\tn := 0\n\n\t\tfor k > 0 && n < padding {\n\t\t\tk--\n\t\t\tn += int(suffix[k])\n\t\t}\n\n\t\tif k > 0 && n >= padding {\n\t\t\ti = decodeByteArrayAVX2(dst, src, prefix[:k], suffix[:k])\n\t\t\tj = len(src) - n\n\t\t\tlastValue = dst[i-(int(prefix[k-1])+int(suffix[k-1])):]\n\t\t\tprefix = prefix[k:]\n\t\t\tsuffix = suffix[k:]\n\t\t}\n\t}\n\n\tfor k := range prefix {\n\t\tp := int(prefix[k])\n\t\tn := int(suffix[k])\n\t\tlastValueOffset := i\n\t\ti += copy(dst[i:], lastValue[:p])\n\t\ti += copy(dst[i:], src[j:j+n])\n\t\tj += n\n\t\tlastValue = dst[lastValueOffset:]\n\t}\n\n\treturn dst[:totalLength], offsets, nil\n}\n\n//go:noescape\nfunc decodeByteArrayAVX2x128bits(dst, src []byte, prefix, suffix []int32) int\n\nfunc decodeFixedLenByteArray(dst, src []byte, size int, prefix, suffix []int32) ([]byte, error) {\n\ttotalPrefixLength, totalSuffixLength, err := validatePrefixAndSuffixLengthValues(prefix, suffix, len(src))\n\tif err != nil {\n\t\treturn dst, err\n\t}\n\n\ttotalLength := totalPrefixLength + totalSuffixLength\n\tdst = resizeNoMemclr(dst, totalLength+padding)\n\n\t_ = prefix[:len(suffix)]\n\t_ = suffix[:len(prefix)]\n\n\tvar lastValue []byte\n\tvar i int\n\tvar j int\n\n\tif cpu.X86.HasAVX2 && len(src) > padding {\n\t\tk := len(suffix)\n\t\tn := 0\n\n\t\tfor k > 0 && n < padding {\n\t\t\tk--\n\t\t\tn += int(suffix[k])\n\t\t}\n\n\t\tif k > 0 && n >= padding {\n\t\t\tif size == 16 {\n\t\t\t\ti = decodeByteArrayAVX2x128bits(dst, src, prefix[:k], suffix[:k])\n\t\t\t} else {\n\t\t\t\ti = decodeByteArrayAVX2(dst, src, prefix[:k], suffix[:k])\n\t\t\t}\n\t\t\tj = len(src) - n\n\t\t\tprefix = prefix[k:]\n\t\t\tsuffix = suffix[k:]\n\t\t\tif i >= size {\n\t\t\t\tlastValue = dst[i-size:]\n\t\t\t}\n\t\t}\n\t}\n\n\tfor k := range prefix {\n\t\tp := int(prefix[k])\n\t\tn := 
int(suffix[k])\n\t\tk := i\n\t\ti += copy(dst[i:], lastValue[:p])\n\t\ti += copy(dst[i:], src[j:j+n])\n\t\tj += n\n\t\tlastValue = dst[k:]\n\t}\n\n\treturn dst[:totalLength], nil\n}\n"
  },
  {
    "path": "encoding/delta/byte_array_amd64.s",
    "content": "//go:build !purego\n\n#include \"funcdata.h\"\n#include \"textflag.h\"\n\n// func validatePrefixAndSuffixLengthValuesAVX2(prefix, suffix []int32, maxLength int) (totalPrefixLength, totalSuffixLength int, ok bool)\nTEXT ·validatePrefixAndSuffixLengthValuesAVX2(SB), NOSPLIT, $0-73\n    MOVQ prefix_base+0(FP), AX\n    MOVQ suffix_base+24(FP), BX\n    MOVQ suffix_len+32(FP), CX\n    MOVQ maxLength+48(FP), DX\n\n    XORQ SI, SI\n    XORQ DI, DI // lastValueLength\n    XORQ R8, R8\n    XORQ R9, R9\n    XORQ R10, R10 // totalPrefixLength\n    XORQ R11, R11 // totalSuffixLength\n    XORQ R12, R12 // ok\n\n    CMPQ CX, $8\n    JB test\n\n    MOVQ CX, R13\n    SHRQ $3, R13\n    SHLQ $3, R13\n\n    VPXOR X0, X0, X0 // lastValueLengths\n    VPXOR X1, X1, X1 // totalPrefixLengths\n    VPXOR X2, X2, X2 // totalSuffixLengths\n    VPXOR X3, X3, X3 // negative prefix length sentinels\n    VPXOR X4, X4, X4 // negative suffix length sentinels\n    VPXOR X5, X5, X5 // prefix length overflow sentinels\n    VMOVDQU ·rotateLeft32(SB), Y6\n\nloopAVX2:\n    VMOVDQU (AX)(SI*4), Y7 // p\n    VMOVDQU (BX)(SI*4), Y8 // n\n\n    VPADDD Y7, Y1, Y1\n    VPADDD Y8, Y2, Y2\n\n    VPOR Y7, Y3, Y3\n    VPOR Y8, Y4, Y4\n\n    VPADDD Y7, Y8, Y9 // p + n\n    VPERMD Y0, Y6, Y10\n    VPBLENDD $1, Y10, Y9, Y10\n    VPCMPGTD Y10, Y7, Y10\n    VPOR Y10, Y5, Y5\n\n    VMOVDQU Y9, Y0\n    ADDQ $8, SI\n    CMPQ SI, R13\n    JNE loopAVX2\n\n    // If any of the sentinel values has its most significant bit set then one\n    // of the values was negative or one of the prefixes was greater than the\n    // length of the previous value, return false.\n    VPOR Y4, Y3, Y3\n    VPOR Y5, Y3, Y3\n    VMOVMSKPS Y3, R13\n    CMPQ R13, $0\n    JNE done\n\n    // We computed 8 sums in parallel for the prefix and suffix arrays, they\n    // need to be accumulated into single values, which is what these reduction\n    // steps do.\n    VPSRLDQ $4, Y1, Y5\n    VPSRLDQ $8, Y1, Y6\n    VPSRLDQ $12, Y1, Y7\n    
VPADDD Y5, Y1, Y1\n    VPADDD Y6, Y1, Y1\n    VPADDD Y7, Y1, Y1\n    VPERM2I128 $1, Y1, Y1, Y0\n    VPADDD Y0, Y1, Y1\n    MOVQ X1, R10\n    ANDQ $0x7FFFFFFF, R10\n\n    VPSRLDQ $4, Y2, Y5\n    VPSRLDQ $8, Y2, Y6\n    VPSRLDQ $12, Y2, Y7\n    VPADDD Y5, Y2, Y2\n    VPADDD Y6, Y2, Y2\n    VPADDD Y7, Y2, Y2\n    VPERM2I128 $1, Y2, Y2, Y0\n    VPADDD Y0, Y2, Y2\n    MOVQ X2, R11\n    ANDQ $0x7FFFFFFF, R11\n\n    JMP test\nloop:\n    MOVLQSX (AX)(SI*4), R8\n    MOVLQSX (BX)(SI*4), R9\n\n    CMPQ R8, $0 // p < 0 ?\n    JL done\n\n    CMPQ R9, $0 // n < 0 ?\n    JL done\n\n    CMPQ R8, DI // p > lastValueLength ?\n    JG done\n\n    ADDQ R8, R10\n    ADDQ R9, R11\n    ADDQ R8, DI\n    ADDQ R9, DI\n\n    INCQ SI\ntest:\n    CMPQ SI, CX\n    JNE loop\n\n    CMPQ R11, DX // totalSuffixLength > maxLength ?\n    JG done\n\n    MOVB $1, R12\ndone:\n    MOVQ R10, totalPrefixLength+56(FP)\n    MOVQ R11, totalSuffixLength+64(FP)\n    MOVB R12, ok+72(FP)\n    RET\n\n// func decodeByteArrayOffsets(offsets []uint32, prefix, suffix []int32)\nTEXT ·decodeByteArrayOffsets(SB), NOSPLIT, $0-72\n    MOVQ offsets_base+0(FP), AX\n    MOVQ prefix_base+24(FP), BX\n    MOVQ suffix_base+48(FP), CX\n    MOVQ suffix_len+56(FP), DX\n\n    XORQ SI, SI\n    XORQ R10, R10\n    JMP test\nloop:\n    MOVL (BX)(SI*4), R8\n    MOVL (CX)(SI*4), R9\n    MOVL R10, (AX)(SI*4)\n    ADDL R8, R10\n    ADDL R9, R10\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\n    MOVL R10, (AX)(SI*4)\n    RET\n\n// func decodeByteArrayAVX2(dst, src []byte, prefix, suffix []int32) int\nTEXT ·decodeByteArrayAVX2(SB), NOSPLIT, $0-104\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_base+24(FP), BX\n    MOVQ prefix_base+48(FP), CX\n    MOVQ suffix_base+72(FP), DX\n    MOVQ suffix_len+80(FP), DI\n\n    XORQ SI, SI\n    XORQ R8, R8\n    XORQ R9, R9\n    MOVQ AX, R10 // last value\n\n    JMP test\nloop:\n    MOVLQZX (CX)(SI*4), R8 // prefix length\n    MOVLQZX (DX)(SI*4), R9 // suffix length\nprefix:\n    VMOVDQU (R10), Y0\n    VMOVDQU 
Y0, (AX)\n    CMPQ R8, $32\n    JA copyPrefix\nsuffix:\n    VMOVDQU (BX), Y1\n    VMOVDQU Y1, (AX)(R8*1)\n    CMPQ R9, $32\n    JA copySuffix\nnext:\n    MOVQ AX, R10\n    ADDQ R9, R8\n    LEAQ (AX)(R8*1), AX\n    LEAQ (BX)(R9*1), BX\n    INCQ SI\ntest:\n    CMPQ SI, DI\n    JNE loop\n    MOVQ dst_base+0(FP), BX\n    SUBQ BX, AX\n    MOVQ AX, ret+96(FP)\n    VZEROUPPER\n    RET\ncopyPrefix:\n    MOVQ $32, R12\ncopyPrefixLoop:\n    VMOVDQU (R10)(R12*1), Y0\n    VMOVDQU Y0, (AX)(R12*1)\n    ADDQ $32, R12\n    CMPQ R12, R8\n    JB copyPrefixLoop\n    JMP suffix\ncopySuffix:\n    MOVQ $32, R12\n    LEAQ (AX)(R8*1), R13\ncopySuffixLoop:\n    VMOVDQU (BX)(R12*1), Y1\n    VMOVDQU Y1, (R13)(R12*1)\n    ADDQ $32, R12\n    CMPQ R12, R9\n    JB copySuffixLoop\n    JMP next\n\n// func decodeByteArrayAVX2x128bits(dst, src []byte, prefix, suffix []int32) int\nTEXT ·decodeByteArrayAVX2x128bits(SB), NOSPLIT, $0-104\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_base+24(FP), BX\n    MOVQ prefix_base+48(FP), CX\n    MOVQ suffix_base+72(FP), DX\n    MOVQ suffix_len+80(FP), DI\n\n    XORQ SI, SI\n    XORQ R8, R8\n    XORQ R9, R9\n    VPXOR X0, X0, X0\n\n    JMP test\nloop:\n    MOVLQZX (CX)(SI*4), R8 // prefix length\n    MOVLQZX (DX)(SI*4), R9 // suffix length\n\n    VMOVDQU (BX), X1\n    VMOVDQU X0, (AX)\n    VMOVDQU X1, (AX)(R8*1)\n    VMOVDQU (AX), X0\n\n    ADDQ R9, R8\n    LEAQ (AX)(R8*1), AX\n    LEAQ (BX)(R9*1), BX\n    INCQ SI\ntest:\n    CMPQ SI, DI\n    JNE loop\n    MOVQ dst_base+0(FP), BX\n    SUBQ BX, AX\n    MOVQ AX, ret+96(FP)\n    VZEROUPPER\n    RET\n"
  },
  {
    "path": "encoding/delta/byte_array_purego.go",
    "content": "//go:build purego || !amd64\n\npackage delta\n\nfunc decodeByteArray(dst, src []byte, prefix, suffix []int32, offsets []uint32) ([]byte, []uint32, error) {\n\t_ = prefix[:len(suffix)]\n\t_ = suffix[:len(prefix)]\n\n\tvar lastValue []byte\n\tfor i := range suffix {\n\t\tn := int(suffix[i])\n\t\tp := int(prefix[i])\n\t\tif n < 0 {\n\t\t\treturn dst, offsets, errInvalidNegativeValueLength(n)\n\t\t}\n\t\tif n > len(src) {\n\t\t\treturn dst, offsets, errValueLengthOutOfBounds(n, len(src))\n\t\t}\n\t\tif p < 0 {\n\t\t\treturn dst, offsets, errInvalidNegativePrefixLength(p)\n\t\t}\n\t\tif p > len(lastValue) {\n\t\t\treturn dst, offsets, errPrefixLengthOutOfBounds(p, len(lastValue))\n\t\t}\n\t\tj := len(dst)\n\t\toffsets = append(offsets, uint32(j))\n\t\tdst = append(dst, lastValue[:p]...)\n\t\tdst = append(dst, src[:n]...)\n\t\tlastValue = dst[j:]\n\t\tsrc = src[n:]\n\t}\n\n\treturn dst, append(offsets, uint32(len(dst))), nil\n}\n\nfunc decodeFixedLenByteArray(dst, src []byte, size int, prefix, suffix []int32) ([]byte, error) {\n\t_ = prefix[:len(suffix)]\n\t_ = suffix[:len(prefix)]\n\n\tvar lastValue []byte\n\tfor i := range suffix {\n\t\tn := int(suffix[i])\n\t\tp := int(prefix[i])\n\t\tif n < 0 {\n\t\t\treturn dst, errInvalidNegativeValueLength(n)\n\t\t}\n\t\tif n > len(src) {\n\t\t\treturn dst, errValueLengthOutOfBounds(n, len(src))\n\t\t}\n\t\tif p < 0 {\n\t\t\treturn dst, errInvalidNegativePrefixLength(p)\n\t\t}\n\t\tif p > len(lastValue) {\n\t\t\treturn dst, errPrefixLengthOutOfBounds(p, len(lastValue))\n\t\t}\n\t\tj := len(dst)\n\t\tdst = append(dst, lastValue[:p]...)\n\t\tdst = append(dst, src[:n]...)\n\t\tlastValue = dst[j:]\n\t\tsrc = src[n:]\n\t}\n\treturn dst, nil\n}\n"
  },
  {
    "path": "encoding/delta/byte_array_test.go",
    "content": "package delta\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"testing\"\n)\n\nfunc TestLinearSearchPrefixLength(t *testing.T) {\n\ttestSearchPrefixLength(t, linearSearchPrefixLength)\n}\n\nfunc TestBinarySearchPrefixLength(t *testing.T) {\n\ttestSearchPrefixLength(t, func(base, data []byte) int {\n\t\treturn binarySearchPrefixLength(base, data)\n\t})\n}\n\nfunc testSearchPrefixLength(t *testing.T, prefixLength func(base, data []byte) int) {\n\ttests := []struct {\n\t\tbase string\n\t\tdata string\n\t\tlen  int\n\t}{\n\t\t{\n\t\t\tbase: \"\",\n\t\t\tdata: \"\",\n\t\t\tlen:  0,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"A\",\n\t\t\tdata: \"B\",\n\t\t\tlen:  0,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  0,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"H\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  1,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"He\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  2,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hel\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  3,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hell\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  4,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  5,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello \",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  6,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello W\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  7,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello Wo\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  8,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello Wor\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  9,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello Worl\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  10,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello World\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  11,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hello World!\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  12,\n\t\t},\n\n\t\t{\n\t\t\tbase: \"Hell.......\",\n\t\t\tdata: \"Hello World!\",\n\t\t\tlen:  4,\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(\"\", func(t 
*testing.T) {\n\t\t\tn := prefixLength([]byte(test.base), []byte(test.data))\n\t\t\tif n != test.len {\n\t\t\t\tt.Errorf(\"prefixLength(%q,%q): want=%d got=%d\", test.base, test.data, test.len, n)\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkLinearSearchPrefixLength(b *testing.B) {\n\tbenchmarkSearchPrefixLength(b, linearSearchPrefixLength)\n}\n\nfunc BenchmarkBinarySearchPrefixLength(b *testing.B) {\n\tbenchmarkSearchPrefixLength(b, func(base, data []byte) int {\n\t\treturn binarySearchPrefixLength(base, data)\n\t})\n}\n\nfunc benchmarkSearchPrefixLength(b *testing.B, prefixLength func(base, data []byte) int) {\n\tbuffer := bytes.Repeat([]byte(\"0123456789\"), 100)\n\n\tfor _, size := range []int{10, 100, 1000} {\n\t\tb.Run(fmt.Sprintf(\"size=%d\", size), func(b *testing.B) {\n\t\t\tbase := buffer[:size]\n\t\t\tdata := buffer[:size/2]\n\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\t_ = prefixLength(base, data)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "encoding/delta/delta.go",
    "content": "package delta\n\nimport (\n\t\"fmt\"\n\t\"sync\"\n\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\ntype int32Buffer struct {\n\tvalues []int32\n}\n\nfunc (buf *int32Buffer) resize(size int) {\n\tif cap(buf.values) < size {\n\t\tbuf.values = make([]int32, size, 2*size)\n\t} else {\n\t\tbuf.values = buf.values[:size]\n\t}\n}\n\nfunc (buf *int32Buffer) decode(src []byte) ([]byte, error) {\n\tvalues, remain, err := decodeInt32(unsafecast.Int32ToBytes(buf.values[:0]), src)\n\tbuf.values = unsafecast.BytesToInt32(values)\n\treturn remain, err\n}\n\nfunc (buf *int32Buffer) sum() (sum int32) {\n\tfor _, v := range buf.values {\n\t\tsum += v\n\t}\n\treturn sum\n}\n\nvar (\n\tint32BufferPool sync.Pool // *int32Buffer\n)\n\nfunc getInt32Buffer() *int32Buffer {\n\tb, _ := int32BufferPool.Get().(*int32Buffer)\n\tif b != nil {\n\t\tb.values = b.values[:0]\n\t} else {\n\t\tb = &int32Buffer{\n\t\t\tvalues: make([]int32, 0, 1024),\n\t\t}\n\t}\n\treturn b\n}\n\nfunc putInt32Buffer(b *int32Buffer) {\n\tint32BufferPool.Put(b)\n}\n\nfunc resizeNoMemclr(buf []byte, size int) []byte {\n\tif cap(buf) < size {\n\t\treturn grow(buf, size)\n\t}\n\treturn buf[:size]\n}\n\nfunc resize(buf []byte, size int) []byte {\n\tif cap(buf) < size {\n\t\treturn grow(buf, size)\n\t}\n\tif size > len(buf) {\n\t\tclear := buf[len(buf):size]\n\t\tfor i := range clear {\n\t\t\tclear[i] = 0\n\t\t}\n\t}\n\treturn buf[:size]\n}\n\nfunc grow(buf []byte, size int) []byte {\n\tnewCap := 2 * cap(buf)\n\tif newCap < size {\n\t\tnewCap = size\n\t}\n\tnewBuf := make([]byte, size, newCap)\n\tcopy(newBuf, buf)\n\treturn newBuf\n}\n\nfunc min(a, b int) int {\n\tif a < b {\n\t\treturn a\n\t}\n\treturn b\n}\n\nfunc errPrefixAndSuffixLengthMismatch(prefixLength, suffixLength int) error {\n\treturn fmt.Errorf(\"length of prefix and suffix mismatch: %d != %d\", prefixLength, suffixLength)\n}\n\nfunc errInvalidNegativeValueLength(length int) error {\n\treturn fmt.Errorf(\"invalid negative value 
length: %d\", length)\n}\n\nfunc errInvalidNegativePrefixLength(length int) error {\n\treturn fmt.Errorf(\"invalid negative prefix length: %d\", length)\n}\n\nfunc errValueLengthOutOfBounds(length, maxLength int) error {\n\treturn fmt.Errorf(\"value length is larger than the input size: %d > %d\", length, maxLength)\n}\n\nfunc errPrefixLengthOutOfBounds(length, maxLength int) error {\n\treturn fmt.Errorf(\"prefix length %d is larger than the last value of size %d\", length, maxLength)\n}\n"
  },
  {
    "path": "encoding/delta/delta_amd64.go",
    "content": "//go:build !purego\n\npackage delta\n\nconst (\n\tpadding = 64\n)\n\nfunc findNegativeLength(lengths []int32) int {\n\tfor _, n := range lengths {\n\t\tif n < 0 {\n\t\t\treturn int(n)\n\t\t}\n\t}\n\treturn -1\n}\n"
  },
  {
    "path": "encoding/delta/delta_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\nGLOBL ·rotateLeft32(SB), RODATA|NOPTR, $32\nDATA ·rotateLeft32+0(SB)/4, $7\nDATA ·rotateLeft32+4(SB)/4, $0\nDATA ·rotateLeft32+8(SB)/4, $1\nDATA ·rotateLeft32+12(SB)/4, $2\nDATA ·rotateLeft32+16(SB)/4, $3\nDATA ·rotateLeft32+20(SB)/4, $4\nDATA ·rotateLeft32+24(SB)/4, $5\nDATA ·rotateLeft32+28(SB)/4, $6\n"
  },
  {
    "path": "encoding/delta/delta_test.go",
    "content": "//go:build go1.18\n// +build go1.18\n\npackage delta_test\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/encoding/delta\"\n\t\"github.com/segmentio/parquet-go/encoding/fuzz\"\n\t\"github.com/segmentio/parquet-go/encoding/test\"\n)\n\nfunc FuzzDeltaBinaryPackedInt32(f *testing.F) {\n\tfuzz.EncodeInt32(f, new(delta.BinaryPackedEncoding))\n}\n\nfunc FuzzDeltaBinaryPackedInt64(f *testing.F) {\n\tfuzz.EncodeInt64(f, new(delta.BinaryPackedEncoding))\n}\n\nfunc FuzzDeltaLengthByteArray(f *testing.F) {\n\tfuzz.EncodeByteArray(f, new(delta.LengthByteArrayEncoding))\n}\n\nfunc FuzzDeltaByteArray(f *testing.F) {\n\tfuzz.EncodeByteArray(f, new(delta.ByteArrayEncoding))\n}\n\nconst (\n\tencodeMinNumValues = 0\n\tencodeMaxNumValues = 200\n)\n\nfunc TestEncodeInt32(t *testing.T) {\n\tfor bitWidth := uint(0); bitWidth <= 32; bitWidth++ {\n\t\tt.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(t *testing.T) {\n\t\t\ttest.EncodeInt32(t,\n\t\t\t\tnew(delta.BinaryPackedEncoding),\n\t\t\t\tencodeMinNumValues,\n\t\t\t\tencodeMaxNumValues,\n\t\t\t\tbitWidth,\n\t\t\t)\n\t\t})\n\t}\n}\n\nfunc TestEncodeInt64(t *testing.T) {\n\tfor bitWidth := uint(0); bitWidth <= 64; bitWidth++ {\n\t\tt.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(t *testing.T) {\n\t\t\ttest.EncodeInt64(t,\n\t\t\t\tnew(delta.BinaryPackedEncoding),\n\t\t\t\tencodeMinNumValues,\n\t\t\t\tencodeMaxNumValues,\n\t\t\t\tbitWidth,\n\t\t\t)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "encoding/delta/length_byte_array.go",
    "content": "package delta\n\nimport (\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype LengthByteArrayEncoding struct {\n\tencoding.NotSupported\n}\n\nfunc (e *LengthByteArrayEncoding) String() string {\n\treturn \"DELTA_LENGTH_BYTE_ARRAY\"\n}\n\nfunc (e *LengthByteArrayEncoding) Encoding() format.Encoding {\n\treturn format.DeltaLengthByteArray\n}\n\nfunc (e *LengthByteArrayEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {\n\tif len(offsets) == 0 {\n\t\treturn dst[:0], nil\n\t}\n\n\tlength := getInt32Buffer()\n\tdefer putInt32Buffer(length)\n\n\tlength.resize(len(offsets) - 1)\n\tencodeByteArrayLengths(length.values, offsets)\n\n\tdst = dst[:0]\n\tdst = encodeInt32(dst, length.values)\n\tdst = append(dst, src...)\n\treturn dst, nil\n}\n\nfunc (e *LengthByteArrayEncoding) DecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error) {\n\tdst, offsets = dst[:0], offsets[:0]\n\n\tlength := getInt32Buffer()\n\tdefer putInt32Buffer(length)\n\n\tsrc, err := length.decode(src)\n\tif err != nil {\n\t\treturn dst, offsets, e.wrap(err)\n\t}\n\n\tif size := len(length.values) + 1; cap(offsets) < size {\n\t\toffsets = make([]uint32, size, 2*size)\n\t} else {\n\t\toffsets = offsets[:size]\n\t}\n\n\tlastOffset, invalidLength := decodeByteArrayLengths(offsets, length.values)\n\tif invalidLength != 0 {\n\t\treturn dst, offsets, e.wrap(errInvalidNegativeValueLength(int(invalidLength)))\n\t}\n\tif int(lastOffset) > len(src) {\n\t\treturn dst, offsets, e.wrap(errValueLengthOutOfBounds(int(lastOffset), len(src)))\n\t}\n\n\treturn append(dst, src[:lastOffset]...), offsets, nil\n}\n\nfunc (e *LengthByteArrayEncoding) EstimateDecodeByteArraySize(src []byte) int {\n\tlength := getInt32Buffer()\n\tdefer putInt32Buffer(length)\n\tlength.decode(src)\n\treturn int(length.sum())\n}\n\nfunc (e *LengthByteArrayEncoding) CanDecodeInPlace() bool {\n\treturn true\n}\n\nfunc (e 
*LengthByteArrayEncoding) wrap(err error) error {\n\tif err != nil {\n\t\terr = encoding.Error(e, err)\n\t}\n\treturn err\n}\n"
  },
  {
    "path": "encoding/delta/length_byte_array_amd64.go",
    "content": "//go:build !purego\n\npackage delta\n\n//go:noescape\nfunc encodeByteArrayLengths(lengths []int32, offsets []uint32)\n\n//go:noescape\nfunc decodeByteArrayLengths(offsets []uint32, lengths []int32) (lastOffset uint32, invalidLength int32)\n"
  },
  {
    "path": "encoding/delta/length_byte_array_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func encodeByteArrayLengths(lengths []int32, offsets []uint32)\nTEXT ·encodeByteArrayLengths(SB), NOSPLIT, $0-48\n    MOVQ lengths_base+0(FP), AX\n    MOVQ lengths_len+8(FP), CX\n    MOVQ offsets_base+24(FP), BX\n    XORQ SI, SI\n\n    CMPQ CX, $4\n    JB test\n\n    MOVQ CX, DX\n    SHRQ $2, DX\n    SHLQ $2, DX\n\nloopSSE2:\n    MOVOU 0(BX)(SI*4), X0\n    MOVOU 4(BX)(SI*4), X1\n    PSUBL X0, X1\n    MOVOU X1, (AX)(SI*4)\n    ADDQ $4, SI\n    CMPQ SI, DX\n    JNE loopSSE2\n    JMP test\nloop:\n    MOVL 0(BX)(SI*4), R8\n    MOVL 4(BX)(SI*4), R9\n    SUBL R8, R9\n    MOVL R9, (AX)(SI*4)\n    INCQ SI\ntest:\n    CMPQ SI, CX\n    JNE loop\n    RET\n\n// func decodeByteArrayLengths(offsets []uint32, length []int32) (lastOffset uint32, invalidLength int32)\nTEXT ·decodeByteArrayLengths(SB), NOSPLIT, $0-56\n    MOVQ offsets_base+0(FP), AX\n    MOVQ lengths_base+24(FP), BX\n    MOVQ lengths_len+32(FP), CX\n\n    XORQ DX, DX // lastOffset\n    XORQ DI, DI // invalidLength\n    XORQ SI, SI\n\n    CMPQ CX, $4\n    JL test\n\n    MOVQ CX, R8\n    SHRQ $2, R8\n    SHLQ $2, R8\n\n    MOVL $0, (AX)\n    PXOR X0, X0\n    PXOR X3, X3\n    // This loop computes the prefix sum of the lengths array in order to\n    // generate values of the offsets array.\n    //\n    // We stick to SSE2 to keep the code simple (the Go compiler appears to\n    // assume that SSE2 must be supported on AMD64) which already yields most\n    // of the performance that we could get on this subroutine if we were using\n    // AVX2.\n    //\n    // The X3 register also accumulates a mask of all length values, which is\n    // checked after the loop to determine whether any of the lengths were\n    // negative.\n    //\n    // The following article contains a description of the prefix sum algorithm\n    // used in this function: https://en.algorithmica.org/hpc/algorithms/prefix/\nloopSSE2:\n    MOVOU (BX)(SI*4), X1\n    POR X1, X3\n\n    
MOVOA X1, X2\n    PSLLDQ $4, X2\n    PADDD X2, X1\n\n    MOVOA X1, X2\n    PSLLDQ $8, X2\n    PADDD X2, X1\n\n    PADDD X1, X0\n    MOVOU X0, 4(AX)(SI*4)\n\n    PSHUFD $0b11111111, X0, X0\n\n    ADDQ $4, SI\n    CMPQ SI, R8\n    JNE loopSSE2\n\n    // If any of the most significant bits of double words in the X3 register\n    // are set to 1, it indicates that one of the lengths was negative and\n    // therefore the prefix sum is invalid.\n    //\n    // TODO: we report the invalid length as -1, effectively losing the original\n    // value due to the aggregation within X3. This is something that we might\n    // want to address in the future to provide better error reporting.\n    MOVMSKPS X3, R8\n    MOVL $-1, R9\n    CMPL R8, $0\n    CMOVLNE R9, DI\n\n    MOVQ X0, DX\n    JMP test\nloop:\n    MOVL (BX)(SI*4), R8\n    MOVL DX, (AX)(SI*4)\n    ADDL R8, DX\n    CMPL R8, $0\n    CMOVLLT R8, DI\n    INCQ SI\ntest:\n    CMPQ SI, CX\n    JNE loop\n\n    MOVL DX, (AX)(SI*4)\n    MOVL DX, lastOffset+48(FP)\n    MOVL DI, invalidLength+52(FP)\n    RET\n"
  },
  {
    "path": "encoding/delta/length_byte_array_purego.go",
    "content": "//go:build purego || !amd64\n\npackage delta\n\nfunc encodeByteArrayLengths(lengths []int32, offsets []uint32) {\n\tfor i := range lengths {\n\t\tlengths[i] = int32(offsets[i+1] - offsets[i])\n\t}\n}\n\nfunc decodeByteArrayLengths(offsets []uint32, lengths []int32) (uint32, int32) {\n\tlastOffset := uint32(0)\n\n\tfor i, n := range lengths {\n\t\tif n < 0 {\n\t\t\treturn lastOffset, n\n\t\t}\n\t\toffsets[i] = lastOffset\n\t\tlastOffset += uint32(n)\n\t}\n\n\toffsets[len(lengths)] = lastOffset\n\treturn lastOffset, 0\n}\n"
  },
  {
    "path": "encoding/delta/length_byte_array_test.go",
    "content": "package delta\n\nimport \"testing\"\n\nfunc TestDecodeByteArrayLengths(t *testing.T) {\n\tlengths := make([]int32, 999)\n\toffsets := make([]uint32, len(lengths)+1)\n\n\ttotalLength := uint32(0)\n\tfor i := range lengths {\n\t\tlengths[i] = int32(i)\n\t\ttotalLength += uint32(i)\n\t}\n\n\tlastOffset, invalidLength := decodeByteArrayLengths(offsets, lengths)\n\tif invalidLength != 0 {\n\t\tt.Fatal(\"wrong invalid length:\", invalidLength)\n\t}\n\tif lastOffset != totalLength {\n\t\tt.Fatalf(\"wrong last offset: want=%d got=%d\", lastOffset, totalLength)\n\t}\n\n\texpectOffset := uint32(0)\n\tfor i, offset := range offsets[:len(lengths)] {\n\t\tif offset != expectOffset {\n\t\t\tt.Fatalf(\"wrong offset at index %d: want=%d got=%d\", i, expectOffset, offset)\n\t\t}\n\t\texpectOffset += uint32(lengths[i])\n\t}\n\n\tif offsets[len(lengths)] != lastOffset {\n\t\tt.Fatalf(\"wrong last offset: want=%d got=%d\", lastOffset, offsets[len(lengths)])\n\t}\n}\n"
  },
  {
    "path": "encoding/delta/testdata/fuzz/FuzzDeltaByteArray/2404234dd7e87c04303eb7e58208d5b2ccb04fb616c18f3254e2375c4bc327e3",
    "content": "go test fuzz v1\n[]byte(\"\\x80\\xf8\\xa9\\xaf\\x14\\xfc\\r\\rR1000\")\nint64(13)\n"
  },
  {
    "path": "encoding/delta/testdata/fuzz/FuzzDeltaByteArray/4cf9c92e5a2096e3d6c42eaf9b1e31d2567854d33e06c8d2d7a8c46437345850",
    "content": "go test fuzz v1\n[]byte(\"\\xa1\\xa1\\xa1\\xa1\\xa1\\xa1\\xa1\\xa1\\xa100\")\nint64(-180)\n"
  },
  {
    "path": "encoding/delta/testdata/fuzz/FuzzDeltaByteArray/9b210529f5e34e2dea5824929bf0d8242dc9c3165c0dce10bb376c50e21b38cc",
    "content": "go test fuzz v1\n[]byte(\"\\x800000\\xc9\\xc9\\xc9\\xc9\\xc9\\xc9\\xc9\\xc9\\xc900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\")\nint64(-79)\n"
  },
  {
    "path": "encoding/delta/testdata/fuzz/FuzzDeltaByteArray/fbe137144bcda3a149c8ea109703f3242192c5480ea1e82dde0ea24e94f3afef",
    "content": "go test fuzz v1\n[]byte(\"\\x8000000\")\nint64(-97)\n"
  },
  {
    "path": "encoding/encoding.go",
    "content": "// Package encoding provides the generic APIs implemented by parquet encodings\n// in its sub-packages.\npackage encoding\n\nimport (\n\t\"math\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nconst (\n\tMaxFixedLenByteArraySize = math.MaxInt16\n)\n\n// The Encoding interface is implemented by types representing parquet column\n// encodings.\n//\n// Encoding instances must be safe to use concurrently from multiple goroutines.\ntype Encoding interface {\n\t// Returns a human-readable name for the encoding.\n\tString() string\n\n\t// Returns the parquet code representing the encoding.\n\tEncoding() format.Encoding\n\n\t// Encode methods serialize the source sequence of values into the\n\t// destination buffer, potentially reallocating it if it was too short to\n\t// contain the output.\n\t//\n\t// The methods panic if the type of src values differ from the type of\n\t// values being encoded.\n\tEncodeLevels(dst []byte, src []uint8) ([]byte, error)\n\tEncodeBoolean(dst []byte, src []byte) ([]byte, error)\n\tEncodeInt32(dst []byte, src []int32) ([]byte, error)\n\tEncodeInt64(dst []byte, src []int64) ([]byte, error)\n\tEncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error)\n\tEncodeFloat(dst []byte, src []float32) ([]byte, error)\n\tEncodeDouble(dst []byte, src []float64) ([]byte, error)\n\tEncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error)\n\tEncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error)\n\n\t// Decode methods deserialize from the source buffer into the destination\n\t// slice, potentially growing it if it was too short to contain the result.\n\t//\n\t// The methods panic if the type of dst values differ from the type of\n\t// values being decoded.\n\tDecodeLevels(dst []uint8, src []byte) ([]uint8, error)\n\tDecodeBoolean(dst []byte, src []byte) ([]byte, error)\n\tDecodeInt32(dst []int32, src []byte) ([]int32, error)\n\tDecodeInt64(dst 
[]int64, src []byte) ([]int64, error)\n\tDecodeInt96(dst []deprecated.Int96, src []byte) ([]deprecated.Int96, error)\n\tDecodeFloat(dst []float32, src []byte) ([]float32, error)\n\tDecodeDouble(dst []float64, src []byte) ([]float64, error)\n\tDecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error)\n\tDecodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error)\n\n\t// Computes an estimation of the output size of decoding the encoded page\n\t// of values passed as argument.\n\t//\n\t// Note that this is an estimate, it is useful to preallocate the output\n\t// buffer that will be passed to the decode method, but the actual output\n\t// size may be different.\n\t//\n\t// The estimate never errors since it is not intended to be used as an\n\t// input validation method.\n\tEstimateDecodeByteArraySize(src []byte) int\n\n\t// When this method returns true, the encoding supports receiving the same\n\t// buffer as source and destination.\n\tCanDecodeInPlace() bool\n}\n"
  },
  {
    "path": "encoding/encoding_test.go",
    "content": "package encoding_test\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"math\"\n\t\"math/bits\"\n\t\"math/rand\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/encoding/bitpacked\"\n\t\"github.com/segmentio/parquet-go/encoding/bytestreamsplit\"\n\t\"github.com/segmentio/parquet-go/encoding/delta\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/encoding/rle\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc repeatInt64(seq []int64, n int) []int64 {\n\trep := make([]int64, len(seq)*n)\n\tfor i := 0; i < n; i++ {\n\t\tcopy(rep[i*len(seq):], seq)\n\t}\n\treturn rep\n}\n\nvar booleanTests = [...][]bool{\n\t{},\n\t{true},\n\t{false},\n\t{true, false, true, false, true, true, true, false, false, true},\n\t{ // repeating 32x\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue, true, true, true, true, true, true, true,\n\t},\n\t{ // repeating 33x\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue, true, true, true, true, true, true, true,\n\t\ttrue,\n\t},\n\t{ // alternating 15x\n\t\tfalse, true, false, true, false, true, false, true,\n\t\tfalse, true, false, true, false, true, false,\n\t},\n\t{ // alternating 16x\n\t\tfalse, true, false, true, false, true, false, true,\n\t\tfalse, true, false, true, false, true, false, true,\n\t},\n}\n\nvar levelsTests = [...][]byte{\n\t{},\n\t{0},\n\t{1},\n\t{0, 1, 0, 2, 3, 4, 5, 6, math.MaxInt8, math.MaxInt8, 0},\n\t{ // repeating 24x\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t},\n\t{ // never repeating\n\t\t0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
0x07,\n\t\t0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,\n\t\t0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,\n\t\t0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,\n\t},\n\t{ // streaks of repeating values\n\t\t0, 0, 0, 0, 1, 1, 1, 1,\n\t\t2, 2, 2, 2, 3, 3, 3, 3,\n\t\t4, 4, 4, 4, 5, 5, 5, 5,\n\t\t6, 6, 6, 7, 7, 7, 8, 8,\n\t\t8, 9, 9, 9,\n\t},\n}\n\nvar int32Tests = [...][]int32{\n\t{},\n\t{0},\n\t{1},\n\t{-1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxInt32, math.MaxInt32, 0},\n\t{ // repeating 24x\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t},\n\t{ // never repeating\n\t\t0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n\t\t0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,\n\t\t0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,\n\t\t0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,\n\t},\n\t{ // streaks of repeating values\n\t\t0, 0, 0, 0, 1, 1, 1, 1,\n\t\t2, 2, 2, 2, 3, 3, 3, 3,\n\t\t4, 4, 4, 4, 5, 5, 5, 5,\n\t\t6, 6, 6, 7, 7, 7, 8, 8,\n\t\t8, 9, 9, 9,\n\t},\n\t{ // a sequence that triggered a bug in the delta binary packed encoding\n\t\t24, 36, 47, 32, 29, 4, 9, 20, 2, 18,\n\t},\n}\n\nvar int64Tests = [...][]int64{\n\t{},\n\t{0},\n\t{1},\n\t{-1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxInt64, math.MaxInt64, 0},\n\t{ // repeating 24x\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t\t42, 42, 42, 42, 42, 42, 42, 42,\n\t},\n\t{ // never repeating\n\t\t0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n\t\t0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,\n\t\t0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,\n\t\t0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,\n\t},\n\t{ // streaks of repeating values\n\t\t0, 0, 0, 0, 1, 1, 1, 1,\n\t\t2, 2, 2, 2, 3, 3, 3, 3,\n\t\t4, 4, 4, 4, 5, 5, 5, 5,\n\t\t6, 6, 6, 7, 7, 7, 8, 8,\n\t\t8, 9, 9, 9,\n\t},\n\t{ // streaks of repeating values\n\t\t0, 0, 0, 0, 1, 1, 1, 1,\n\t\t2, 2, 2, 2, 3, 3, 3, 3,\n\t\t4, 4, 4, 4, 5, 5, 5, 5,\n\t\t6, 6, 6, 7, 7, 7, 8, 8,\n\t\t8, 9, 9, 
9,\n\t},\n\trepeatInt64( // a sequence resulting in 64 bits words in the delta binary packed encoding\n\t\t[]int64{\n\t\t\tmath.MinInt64, math.MaxInt64, math.MinInt64, math.MaxInt64,\n\t\t\tmath.MinInt64, math.MaxInt64, math.MinInt64, math.MaxInt64,\n\n\t\t\t0, math.MaxInt64, math.MinInt64, math.MaxInt64,\n\t\t\tmath.MinInt64, math.MaxInt64, math.MinInt64, math.MaxInt64,\n\t\t},\n\t\t5,\n\t),\n}\n\nvar int96Tests = [...][]deprecated.Int96{\n\t{},\n\t{{0: 0}},\n\t{{0: 1}},\n}\n\nvar floatTests = [...][]float32{\n\t{},\n\t{0},\n\t{1},\n\t{0, 1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxFloat32, math.MaxFloat32, 0},\n\t{-1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxFloat32, math.MaxFloat32, 0},\n}\n\nvar doubleTests = [...][]float64{\n\t{},\n\t{0},\n\t{1},\n\t{-1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxFloat64, math.MaxFloat64, 0},\n}\n\nvar byteArrayTests = [...][][]byte{\n\t{},\n\t{[]byte(\"\")},\n\t{[]byte(\"A\"), []byte(\"B\"), []byte(\"C\")},\n\t{[]byte(\"hello world!\"), bytes.Repeat([]byte(\"1234567890\"), 100)},\n}\n\nvar fixedLenByteArrayTests = [...]struct {\n\tsize int\n\tdata []byte\n}{\n\t{size: 1, data: []byte(\"\")},\n\t{size: 1, data: []byte(\"ABCDEFGH\")},\n\t{size: 2, data: []byte(\"ABCDEFGH\")},\n\t{size: 4, data: []byte(\"ABCDEFGH\")},\n\t{size: 8, data: []byte(\"ABCDEFGH\")},\n\t{size: 10, data: bytes.Repeat([]byte(\"123456789\"), 100)},\n\t{size: 16, data: bytes.Repeat([]byte(\"1234567890\"), 160)},\n}\n\nvar encodings = [...]encoding.Encoding{\n\tnew(plain.Encoding),\n\tnew(rle.Encoding),\n\tnew(bitpacked.Encoding),\n\tnew(plain.DictionaryEncoding),\n\tnew(rle.DictionaryEncoding),\n\tnew(delta.BinaryPackedEncoding),\n\tnew(delta.LengthByteArrayEncoding),\n\tnew(delta.ByteArrayEncoding),\n\tnew(bytestreamsplit.Encoding),\n}\n\nfunc TestEncoding(t *testing.T) {\n\tfor _, encoding := range encodings {\n\t\tt.Run(encoding.String(), func(t *testing.T) { testEncoding(t, encoding) })\n\t}\n}\n\nfunc testEncoding(t *testing.T, e encoding.Encoding) {\n\tfor _, test := range 
[...]struct {\n\t\tscenario string\n\t\tfunction func(*testing.T, encoding.Encoding)\n\t}{\n\t\t{\n\t\t\tscenario: \"boolean\",\n\t\t\tfunction: testBooleanEncoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"levels\",\n\t\t\tfunction: testLevelsEncoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"int32\",\n\t\t\tfunction: testInt32Encoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"int64\",\n\t\t\tfunction: testInt64Encoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"int96\",\n\t\t\tfunction: testInt96Encoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"float\",\n\t\t\tfunction: testFloatEncoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"double\",\n\t\t\tfunction: testDoubleEncoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"byte array\",\n\t\t\tfunction: testByteArrayEncoding,\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"fixed length byte array\",\n\t\t\tfunction: testFixedLenByteArrayEncoding,\n\t\t},\n\t} {\n\t\tt.Run(test.scenario, func(t *testing.T) { test.function(t, e) })\n\t}\n}\n\nfunc setBitWidth(enc encoding.Encoding, bitWidth int) {\n\tswitch e := enc.(type) {\n\tcase *rle.Encoding:\n\t\te.BitWidth = bitWidth\n\tcase *bitpacked.Encoding:\n\t\te.BitWidth = bitWidth\n\t}\n}\n\ntype encodingFunc func(encoding.Encoding, []byte, []byte) ([]byte, error)\n\nfunc testBooleanEncoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeBoolean(t, e)\n\tbuffer := []byte{}\n\tvalues := []byte{}\n\tinput := []byte{}\n\tsetBitWidth(e, 1)\n\n\tfor _, test := range booleanTests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\n\t\t\tinput = input[:0]\n\t\t\tcount := 0\n\t\t\tfor _, value := range test {\n\t\t\t\tinput = plain.AppendBoolean(input, count, value)\n\t\t\t\tcount++\n\t\t\t}\n\n\t\t\tbuffer, err = e.EncodeBoolean(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeBoolean(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualBytes(t, input, values)\n\t\t})\n\t}\n}\n\nfunc testLevelsEncoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeLevels(t, e)\n\tbuffer 
:= []byte{}\n\tvalues := []byte{}\n\n\tfor _, input := range levelsTests {\n\t\tsetBitWidth(e, maxLenInt8(unsafecast.BytesToInt8(input)))\n\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeLevels(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeLevels(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualBytes(t, input, values[:len(input)])\n\t\t})\n\t}\n}\n\nfunc testInt32Encoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeInt32(t, e)\n\tbuffer := []byte{}\n\tvalues := []int32{}\n\n\tfor _, input := range int32Tests {\n\t\tsetBitWidth(e, maxLenInt32(input))\n\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeInt32(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeInt32(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualInt32(t, input, values)\n\t\t})\n\t}\n}\n\nfunc testInt64Encoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeInt64(t, e)\n\tbuffer := []byte{}\n\tvalues := []int64{}\n\n\tfor _, input := range int64Tests {\n\t\tsetBitWidth(e, maxLenInt64(input))\n\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeInt64(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeInt64(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualInt64(t, input, values)\n\t\t})\n\t}\n}\n\nfunc testInt96Encoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeInt96(t, e)\n\tbuffer := []byte{}\n\tvalues := []deprecated.Int96{}\n\n\tfor _, input := range int96Tests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeInt96(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeInt96(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualInt96(t, input, values)\n\t\t})\n\t}\n}\n\nfunc testFloatEncoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeFloat(t, e)\n\tbuffer := 
[]byte{}\n\tvalues := []float32{}\n\n\tfor _, input := range floatTests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeFloat(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeFloat(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualFloat32(t, input, values)\n\t\t})\n\t}\n}\n\nfunc testDoubleEncoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeDouble(t, e)\n\tbuffer := []byte{}\n\tvalues := []float64{}\n\n\tfor _, input := range doubleTests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeDouble(buffer, input)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeDouble(values, buffer)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualFloat64(t, input, values)\n\t\t})\n\t}\n}\n\nfunc testByteArrayEncoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeByteArray(t, e)\n\tinput := []byte{}\n\tbuffer := []byte{}\n\tvalues := []byte{}\n\toffsets := []uint32{}\n\n\tfor _, test := range byteArrayTests {\n\t\toffsets, input = offsets[:0], input[:0]\n\t\tlastOffset := uint32(0)\n\n\t\tfor _, value := range test {\n\t\t\toffsets = append(offsets, lastOffset)\n\t\t\tinput = append(input, value...)\n\t\t\tlastOffset += uint32(len(value))\n\t\t}\n\n\t\toffsets = append(offsets, lastOffset)\n\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeByteArray(buffer, input, offsets)\n\t\t\tassertNoError(t, err)\n\t\t\testimatedOutputSize := e.EstimateDecodeByteArraySize(buffer)\n\t\t\tvalues, _, err = e.DecodeByteArray(values, buffer, offsets)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualBytes(t, input, values)\n\t\t\tif len(values) > estimatedOutputSize {\n\t\t\t\tt.Errorf(\"the decode output was larger than the estimate: %d>%d\", len(values), estimatedOutputSize)\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc testFixedLenByteArrayEncoding(t *testing.T, e encoding.Encoding) {\n\ttestCanEncodeFixedLenByteArray(t, e)\n\tbuffer := 
[]byte{}\n\tvalues := []byte{}\n\n\tfor _, test := range fixedLenByteArrayTests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tvar err error\n\t\t\tbuffer, err = e.EncodeFixedLenByteArray(buffer, test.data, test.size)\n\t\t\tassertNoError(t, err)\n\t\t\tvalues, err = e.DecodeFixedLenByteArray(values, buffer, test.size)\n\t\t\tassertNoError(t, err)\n\t\t\tassertEqualBytes(t, test.data, values)\n\t\t})\n\t}\n}\n\nfunc testCanEncodeBoolean(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeBoolean)\n}\n\nfunc testCanEncodeLevels(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeLevels)\n}\n\nfunc testCanEncodeInt32(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeInt32)\n}\n\nfunc testCanEncodeInt64(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeInt64)\n}\n\nfunc testCanEncodeInt96(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeInt96)\n}\n\nfunc testCanEncodeFloat(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeFloat)\n}\n\nfunc testCanEncodeDouble(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeDouble)\n}\n\nfunc testCanEncodeByteArray(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeByteArray)\n}\n\nfunc testCanEncodeFixedLenByteArray(t testing.TB, e encoding.Encoding) {\n\ttestCanEncode(t, e, encoding.CanEncodeFixedLenByteArray)\n}\n\nfunc testCanEncode(t testing.TB, e encoding.Encoding, test func(encoding.Encoding) bool) {\n\tif !test(e) {\n\t\tt.Skip(\"encoding not supported\")\n\t}\n}\n\nfunc assertNoError(t *testing.T, err error) {\n\tt.Helper()\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n}\n\nfunc assertEqualBytes(t *testing.T, want, got []byte) {\n\tt.Helper()\n\tif !bytes.Equal(want, got) {\n\t\tt.Fatalf(\"values mismatch:\\nwant = %q\\ngot  = %q\", want, got)\n\t}\n}\n\nfunc assertEqualInt32(t *testing.T, want, got []int32) 
{\n\tt.Helper()\n\tassertEqualBytes(t, unsafecast.Int32ToBytes(want), unsafecast.Int32ToBytes(got))\n}\n\nfunc assertEqualInt64(t *testing.T, want, got []int64) {\n\tt.Helper()\n\tassertEqualBytes(t, unsafecast.Int64ToBytes(want), unsafecast.Int64ToBytes(got))\n}\n\nfunc assertEqualInt96(t *testing.T, want, got []deprecated.Int96) {\n\tt.Helper()\n\tassertEqualBytes(t, deprecated.Int96ToBytes(want), deprecated.Int96ToBytes(got))\n}\n\nfunc assertEqualFloat32(t *testing.T, want, got []float32) {\n\tt.Helper()\n\tassertEqualBytes(t, unsafecast.Float32ToBytes(want), unsafecast.Float32ToBytes(got))\n}\n\nfunc assertEqualFloat64(t *testing.T, want, got []float64) {\n\tt.Helper()\n\tassertEqualBytes(t, unsafecast.Float64ToBytes(want), unsafecast.Float64ToBytes(got))\n}\n\nconst (\n\tbenchmarkNumValues = 10e3\n)\n\nfunc newRand() *rand.Rand {\n\treturn rand.New(rand.NewSource(1))\n}\n\nfunc BenchmarkEncode(b *testing.B) {\n\tfor _, encoding := range encodings {\n\t\tb.Run(encoding.String(), func(b *testing.B) { benchmarkEncode(b, encoding) })\n\t}\n}\n\nfunc benchmarkEncode(b *testing.B, e encoding.Encoding) {\n\tfor _, test := range [...]struct {\n\t\tscenario string\n\t\tfunction func(*testing.B, encoding.Encoding)\n\t}{\n\t\t{\n\t\t\tscenario: \"boolean\",\n\t\t\tfunction: benchmarkEncodeBoolean,\n\t\t},\n\t\t{\n\t\t\tscenario: \"levels\",\n\t\t\tfunction: benchmarkEncodeLevels,\n\t\t},\n\t\t{\n\t\t\tscenario: \"int32\",\n\t\t\tfunction: benchmarkEncodeInt32,\n\t\t},\n\t\t{\n\t\t\tscenario: \"int64\",\n\t\t\tfunction: benchmarkEncodeInt64,\n\t\t},\n\t\t{\n\t\t\tscenario: \"float\",\n\t\t\tfunction: benchmarkEncodeFloat,\n\t\t},\n\t\t{\n\t\t\tscenario: \"double\",\n\t\t\tfunction: benchmarkEncodeDouble,\n\t\t},\n\t\t{\n\t\t\tscenario: \"byte array\",\n\t\t\tfunction: benchmarkEncodeByteArray,\n\t\t},\n\t\t{\n\t\t\tscenario: \"fixed length byte array\",\n\t\t\tfunction: benchmarkEncodeFixedLenByteArray,\n\t\t},\n\t} {\n\t\tb.Run(test.scenario, func(b *testing.B) { 
test.function(b, e) })\n\t}\n}\n\nfunc benchmarkEncodeBoolean(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeBoolean(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues := generateBooleanValues(benchmarkNumValues, newRand())\n\tsetBitWidth(e, 1)\n\n\treportThroughput(b, benchmarkNumValues, len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeBoolean(buffer, values)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeLevels(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeLevels(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues := generateLevelValues(benchmarkNumValues, newRand())\n\tsetBitWidth(e, maxLenInt8(unsafecast.BytesToInt8(values)))\n\n\treportThroughput(b, benchmarkNumValues, len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeLevels(buffer, values)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeInt32(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeInt32(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues := generateInt32Values(benchmarkNumValues, newRand())\n\tsetBitWidth(e, maxLenInt32(values))\n\n\treportThroughput(b, benchmarkNumValues, 4*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeInt32(buffer, values)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeInt64(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeInt64(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues := generateInt64Values(benchmarkNumValues, newRand())\n\tsetBitWidth(e, maxLenInt64(values))\n\n\treportThroughput(b, benchmarkNumValues, 8*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeInt64(buffer, values)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeFloat(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeFloat(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues := generateFloatValues(benchmarkNumValues, newRand())\n\n\treportThroughput(b, benchmarkNumValues, 4*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeFloat(buffer, 
values)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeDouble(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeDouble(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues := generateDoubleValues(benchmarkNumValues, newRand())\n\n\treportThroughput(b, benchmarkNumValues, 8*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeDouble(buffer, values)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeByteArray(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeByteArray(b, e)\n\tbuffer := make([]byte, 0)\n\tvalues, offsets := generateByteArrayValues(benchmarkNumValues, newRand())\n\n\tnumBytes := len(values) + 4*len(offsets)\n\treportThroughput(b, benchmarkNumValues, numBytes, func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeByteArray(buffer, values, offsets)\n\t\t})\n\t})\n}\n\nfunc benchmarkEncodeFixedLenByteArray(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeFixedLenByteArray(b, e)\n\tconst size = 16\n\tbuffer := make([]byte, 0)\n\tvalues := generateFixedLenByteArrayValues(benchmarkNumValues, newRand(), size)\n\n\treportThroughput(b, benchmarkNumValues, len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tbuffer, _ = e.EncodeFixedLenByteArray(buffer, values, size)\n\t\t})\n\t})\n}\n\nfunc BenchmarkDecode(b *testing.B) {\n\tfor _, encoding := range encodings {\n\t\tb.Run(encoding.String(), func(b *testing.B) { benchmarkDecode(b, encoding) })\n\t}\n}\n\nfunc benchmarkDecode(b *testing.B, e encoding.Encoding) {\n\tfor _, test := range [...]struct {\n\t\tscenario string\n\t\tfunction func(*testing.B, encoding.Encoding)\n\t}{\n\t\t{\n\t\t\tscenario: \"boolean\",\n\t\t\tfunction: benchmarkDecodeBoolean,\n\t\t},\n\t\t{\n\t\t\tscenario: \"levels\",\n\t\t\tfunction: benchmarkDecodeLevels,\n\t\t},\n\t\t{\n\t\t\tscenario: \"int32\",\n\t\t\tfunction: benchmarkDecodeInt32,\n\t\t},\n\t\t{\n\t\t\tscenario: \"int64\",\n\t\t\tfunction: benchmarkDecodeInt64,\n\t\t},\n\t\t{\n\t\t\tscenario: 
\"float\",\n\t\t\tfunction: benchmarkDecodeFloat,\n\t\t},\n\t\t{\n\t\t\tscenario: \"double\",\n\t\t\tfunction: benchmarkDecodeDouble,\n\t\t},\n\t\t{\n\t\t\tscenario: \"byte array\",\n\t\t\tfunction: benchmarkDecodeByteArray,\n\t\t},\n\t\t{\n\t\t\tscenario: \"fixed length byte array\",\n\t\t\tfunction: benchmarkDecodeFixedLenByteArray,\n\t\t},\n\t} {\n\t\tb.Run(test.scenario, func(b *testing.B) { test.function(b, e) })\n\t}\n}\n\nfunc benchmarkDecodeBoolean(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeBoolean(b, e)\n\tvalues := generateBooleanValues(benchmarkNumValues, newRand())\n\tsetBitWidth(e, 1)\n\tbuffer, _ := e.EncodeBoolean(nil, values)\n\n\treportThroughput(b, benchmarkNumValues, len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeBoolean(values, buffer)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeLevels(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeLevels(b, e)\n\tvalues := generateLevelValues(benchmarkNumValues, newRand())\n\tsetBitWidth(e, maxLenInt8(unsafecast.BytesToInt8(values)))\n\tbuffer, _ := e.EncodeLevels(nil, values)\n\n\treportThroughput(b, benchmarkNumValues, len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeLevels(values, buffer)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeInt32(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeInt32(b, e)\n\tvalues := generateInt32Values(benchmarkNumValues, newRand())\n\tsetBitWidth(e, maxLenInt32(values))\n\tbuffer, _ := e.EncodeInt32(nil, values)\n\n\treportThroughput(b, benchmarkNumValues, 4*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeInt32(values, buffer)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeInt64(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeInt64(b, e)\n\tvalues := generateInt64Values(benchmarkNumValues, newRand())\n\tsetBitWidth(e, maxLenInt64(values))\n\tbuffer, _ := e.EncodeInt64(nil, values)\n\n\treportThroughput(b, benchmarkNumValues, 8*len(values), 
func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeInt64(values, buffer)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeFloat(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeFloat(b, e)\n\tvalues := generateFloatValues(benchmarkNumValues, newRand())\n\tbuffer, _ := e.EncodeFloat(nil, values)\n\n\treportThroughput(b, benchmarkNumValues, 4*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeFloat(values, buffer)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeDouble(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeDouble(b, e)\n\tvalues := generateDoubleValues(benchmarkNumValues, newRand())\n\tbuffer, _ := e.EncodeDouble(nil, values)\n\n\treportThroughput(b, benchmarkNumValues, 8*len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeDouble(values, buffer)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeByteArray(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeByteArray(b, e)\n\tvalues, offsets := generateByteArrayValues(benchmarkNumValues, newRand())\n\tbuffer, _ := e.EncodeByteArray(nil, values, offsets)\n\n\tnumBytes := len(values) + 4*len(offsets)\n\treportThroughput(b, benchmarkNumValues, numBytes, func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, offsets, _ = e.DecodeByteArray(values, buffer, offsets)\n\t\t})\n\t})\n}\n\nfunc benchmarkDecodeFixedLenByteArray(b *testing.B, e encoding.Encoding) {\n\ttestCanEncodeFixedLenByteArray(b, e)\n\tconst size = 16\n\tvalues := generateFixedLenByteArrayValues(benchmarkNumValues, newRand(), size)\n\tbuffer, _ := e.EncodeFixedLenByteArray(nil, values, size)\n\n\treportThroughput(b, benchmarkNumValues, len(values), func() {\n\t\tbenchmarkZeroAllocsPerRun(b, func() {\n\t\t\tvalues, _ = e.DecodeFixedLenByteArray(values, buffer, size)\n\t\t})\n\t})\n}\n\nfunc benchmarkZeroAllocsPerRun(b *testing.B, f func()) {\n\tif allocs := testing.AllocsPerRun(b.N, f); allocs != 0 && !testing.Short() {\n\t\tb.Errorf(\"too many 
memory allocations: %g\", allocs)\n\t}\n}\n\nfunc reportThroughput(b *testing.B, numValues, numBytes int, do func()) {\n\tstart := time.Now()\n\tdo()\n\tseconds := time.Since(start).Seconds()\n\tb.SetBytes(int64(numBytes))\n\tb.ReportMetric(float64(b.N*numValues)/seconds, \"value/s\")\n}\n\nfunc generateLevelValues(n int, r *rand.Rand) []uint8 {\n\tvalues := make([]uint8, n)\n\tfor i := range values {\n\t\tvalues[i] = uint8(r.Intn(6))\n\t}\n\treturn values\n}\n\nfunc generateBooleanValues(n int, r *rand.Rand) []byte {\n\tvalues := make([]byte, n/8+1)\n\tio.ReadFull(r, values)\n\treturn values\n}\n\nfunc generateInt32Values(n int, r *rand.Rand) []int32 {\n\tvalues := make([]int32, n)\n\tfor i := range values {\n\t\tvalues[i] = r.Int31n(100)\n\t}\n\treturn values\n}\n\nfunc generateInt64Values(n int, r *rand.Rand) []int64 {\n\tvalues := make([]int64, n)\n\tfor i := range values {\n\t\tvalues[i] = r.Int63n(100)\n\t}\n\treturn values\n}\n\nfunc generateFloatValues(n int, r *rand.Rand) []float32 {\n\tvalues := make([]float32, n)\n\tfor i := range values {\n\t\tvalues[i] = r.Float32()\n\t}\n\treturn values\n}\n\nfunc generateDoubleValues(n int, r *rand.Rand) []float64 {\n\tvalues := make([]float64, n)\n\tfor i := range values {\n\t\tvalues[i] = r.Float64()\n\t}\n\treturn values\n}\n\nfunc generateByteArrayValues(n int, r *rand.Rand) ([]byte, []uint32) {\n\tconst maxLen = 21\n\toffsets := make([]uint32, n+1)\n\tvalues := make([]byte, n*maxLen)\n\tlength := 0\n\n\tfor i := 0; i < n; i++ {\n\t\tk := r.Intn(maxLen) + 1\n\t\tio.ReadFull(r, values[length:length+k])\n\t\toffsets[i] = uint32(length)\n\t\tlength += k\n\t}\n\n\toffsets[n] = uint32(length)\n\treturn values[:length], offsets\n}\n\nfunc generateFixedLenByteArrayValues(n int, r *rand.Rand, size int) []byte {\n\tvalues := make([]byte, n*size)\n\tio.ReadFull(r, values)\n\treturn values\n}\n\nfunc maxLenInt8(data []int8) int {\n\tmax := 0\n\tfor _, v := range data {\n\t\tif n := bits.Len8(uint8(v)); n > max {\n\t\t\tmax 
= n\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxLenInt32(data []int32) int {\n\tmax := 0\n\tfor _, v := range data {\n\t\tif n := bits.Len32(uint32(v)); n > max {\n\t\t\tmax = n\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxLenInt64(data []int64) int {\n\tmax := 0\n\tfor _, v := range data {\n\t\tif n := bits.Len64(uint64(v)); n > max {\n\t\t\tmax = n\n\t\t}\n\t}\n\treturn max\n}\n"
  },
  {
    "path": "encoding/fuzz/fuzz.go",
    "content": "//go:build go1.18\n// +build go1.18\n\n// Package fuzz contains functions to help fuzz test parquet encodings.\npackage fuzz\n\nimport (\n\t\"math/rand\"\n\t\"testing\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc EncodeBoolean(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tencoding.Encoding.EncodeBoolean,\n\t\tencoding.Encoding.DecodeBoolean,\n\t\tgenerate[byte],\n\t)\n}\n\nfunc EncodeLevels(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tencoding.Encoding.EncodeLevels,\n\t\tencoding.Encoding.DecodeLevels,\n\t\tgenerate[byte],\n\t)\n}\n\nfunc EncodeInt32(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tencoding.Encoding.EncodeInt32,\n\t\tencoding.Encoding.DecodeInt32,\n\t\tgenerate[int32],\n\t)\n}\n\nfunc EncodeInt64(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tencoding.Encoding.EncodeInt64,\n\t\tencoding.Encoding.DecodeInt64,\n\t\tgenerate[int64],\n\t)\n}\n\nfunc EncodeFloat(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tencoding.Encoding.EncodeFloat,\n\t\tencoding.Encoding.DecodeFloat,\n\t\tgenerate[float32],\n\t)\n}\n\nfunc EncodeDouble(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tencoding.Encoding.EncodeDouble,\n\t\tencoding.Encoding.DecodeDouble,\n\t\tgenerate[float64],\n\t)\n}\n\nfunc EncodeByteArray(f *testing.F, e encoding.Encoding) {\n\tencode(f, e,\n\t\tfunc(enc encoding.Encoding, dst []byte, src []string) ([]byte, error) {\n\t\t\tsize := 0\n\t\t\tfor _, s := range src {\n\t\t\t\tsize += len(s)\n\t\t\t}\n\n\t\t\toffsets := make([]uint32, 0, len(src)+1)\n\t\t\tvalues := make([]byte, 0, size)\n\n\t\t\tfor _, s := range src {\n\t\t\t\toffsets = append(offsets, uint32(len(values)))\n\t\t\t\tvalues = append(values, s...)\n\t\t\t}\n\n\t\t\toffsets = append(offsets, uint32(len(values)))\n\t\t\treturn enc.EncodeByteArray(dst, values, offsets)\n\t\t},\n\n\t\tfunc(enc encoding.Encoding, dst 
[]string, src []byte) ([]string, error) {\n\t\t\tdst = dst[:0]\n\n\t\t\tvalues, offsets, err := enc.DecodeByteArray(nil, src, nil)\n\t\t\tif err != nil {\n\t\t\t\treturn dst, err\n\t\t\t}\n\n\t\t\tif len(offsets) > 0 {\n\t\t\t\tbaseOffset := offsets[0]\n\n\t\t\t\tfor _, endOffset := range offsets[1:] {\n\t\t\t\t\tdst = append(dst, unsafecast.BytesToString(values[baseOffset:endOffset]))\n\t\t\t\t\tbaseOffset = endOffset\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn dst, nil\n\t\t},\n\n\t\tfunc(dst []string, src []byte, prng *rand.Rand) []string {\n\t\t\tlimit := len(src)/10 + 1\n\n\t\t\tfor i := 0; i < len(src); {\n\t\t\t\tn := prng.Intn(limit) + 1\n\t\t\t\tr := len(src) - i\n\t\t\t\tif n > r {\n\t\t\t\t\tn = r\n\t\t\t\t}\n\t\t\t\tdst = append(dst, unsafecast.BytesToString(src[i:i+n]))\n\t\t\t\ti += n\n\t\t\t}\n\n\t\t\treturn dst\n\t\t},\n\t)\n}\n\ntype encodingFunc[T comparable] func(encoding.Encoding, []byte, []T) ([]byte, error)\n\ntype decodingFunc[T comparable] func(encoding.Encoding, []T, []byte) ([]T, error)\n\ntype generateFunc[T comparable] func(dst []T, src []byte, prng *rand.Rand) []T\n\nfunc encode[T comparable](f *testing.F, e encoding.Encoding, encode encodingFunc[T], decode decodingFunc[T], generate generateFunc[T]) {\n\tconst bufferSize = 64 * 1024\n\tvar zero T\n\tvar err error\n\tvar buf = make([]T, bufferSize/unsafe.Sizeof(zero))\n\tvar src = make([]T, bufferSize/unsafe.Sizeof(zero))\n\tvar dst = make([]byte, bufferSize)\n\tvar prng = rand.New(rand.NewSource(0))\n\n\tf.Fuzz(func(t *testing.T, input []byte, seed int64) {\n\t\tprng.Seed(seed)\n\t\tsrc = generate(src[:0], input, prng)\n\n\t\tdst, err = encode(e, dst, src)\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn\n\t\t}\n\n\t\tbuf, err = decode(e, buf, dst)\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn\n\t\t}\n\n\t\tif !equal(buf, src) {\n\t\t\tt.Error(\"decoded output does not match the original input\")\n\t\t\treturn\n\t\t}\n\t})\n}\n\nfunc equal[T comparable](a, b []T) bool {\n\tif 
len(a) != len(b) {\n\t\treturn false\n\t}\n\tfor i := range a {\n\t\tif a[i] != b[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc generate[T comparable](dst []T, src []byte, prng *rand.Rand) []T {\n\treturn append(dst[:0], unsafecast.Slice[T](src)...)\n}\n"
  },
  {
    "path": "encoding/notsupported.go",
    "content": "package encoding\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nvar (\n\t// ErrNotSupported is an error returned when the underlying encoding does\n\t// not support the type of values being encoded or decoded.\n\t//\n\t// This error may be wrapped with type information, applications must use\n\t// errors.Is rather than equality comparisons to test the error values\n\t// returned by encoders and decoders.\n\tErrNotSupported = errors.New(\"encoding not supported\")\n\n\t// ErrInvalidArgument is an error returned one or more arguments passed to\n\t// the encoding functions are incorrect.\n\t//\n\t// As with ErrNotSupported, this error may be wrapped with specific\n\t// information about the problem and applications are expected to use\n\t// errors.Is for comparisons.\n\tErrInvalidArgument = errors.New(\"invalid argument\")\n)\n\n// Error constructs an error which wraps err and indicates that it originated\n// from the given encoding.\nfunc Error(e Encoding, err error) error {\n\treturn fmt.Errorf(\"%s: %w\", e, err)\n}\n\n// Errorf is like Error but constructs the error message from the given format\n// and arguments.\nfunc Errorf(e Encoding, msg string, args ...interface{}) error {\n\treturn Error(e, fmt.Errorf(msg, args...))\n}\n\n// ErrEncodeInvalidInputSize constructs an error indicating that encoding failed\n// due to the size of the input.\nfunc ErrEncodeInvalidInputSize(e Encoding, typ string, size int) error {\n\treturn errInvalidInputSize(e, \"encode\", typ, size)\n}\n\n// ErrDecodeInvalidInputSize constructs an error indicating that decoding failed\n// due to the size of the input.\nfunc ErrDecodeInvalidInputSize(e Encoding, typ string, size int) error {\n\treturn errInvalidInputSize(e, \"decode\", typ, size)\n}\n\nfunc errInvalidInputSize(e Encoding, op, typ string, size int) error {\n\treturn Errorf(e, \"cannot %s %s from input of size %d: %w\", 
op, typ, size, ErrInvalidArgument)\n}\n\n// CanEncodeInt8 reports whether e can encode LEVELS values.\nfunc CanEncodeLevels(e Encoding) bool {\n\t_, err := e.EncodeLevels(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeBoolean reports whether e can encode BOOLEAN values.\nfunc CanEncodeBoolean(e Encoding) bool {\n\t_, err := e.EncodeBoolean(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeInt32 reports whether e can encode INT32 values.\nfunc CanEncodeInt32(e Encoding) bool {\n\t_, err := e.EncodeInt32(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeInt64 reports whether e can encode INT64 values.\nfunc CanEncodeInt64(e Encoding) bool {\n\t_, err := e.EncodeInt64(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeInt96 reports whether e can encode INT96 values.\nfunc CanEncodeInt96(e Encoding) bool {\n\t_, err := e.EncodeInt96(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeFloat reports whether e can encode FLOAT values.\nfunc CanEncodeFloat(e Encoding) bool {\n\t_, err := e.EncodeFloat(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeDouble reports whether e can encode DOUBLE values.\nfunc CanEncodeDouble(e Encoding) bool {\n\t_, err := e.EncodeDouble(nil, nil)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeByteArray reports whether e can encode BYTE_ARRAY values.\nfunc CanEncodeByteArray(e Encoding) bool {\n\t_, err := e.EncodeByteArray(nil, nil, zeroOffsets[:])\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\n// CanEncodeFixedLenByteArray reports whether e can encode\n// FIXED_LEN_BYTE_ARRAY values.\nfunc CanEncodeFixedLenByteArray(e Encoding) bool {\n\t_, err := e.EncodeFixedLenByteArray(nil, nil, 1)\n\treturn !errors.Is(err, ErrNotSupported)\n}\n\nvar zeroOffsets [1]uint32\n\n// NotSupported is a type satisfying the Encoding interface which does not\n// support encoding nor decoding any value types.\ntype 
NotSupported struct {\n}\n\nfunc (NotSupported) String() string {\n\treturn \"NOT_SUPPORTED\"\n}\n\nfunc (NotSupported) Encoding() format.Encoding {\n\treturn -1\n}\n\nfunc (NotSupported) EncodeLevels(dst []byte, src []uint8) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"LEVELS\")\n}\n\nfunc (NotSupported) EncodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"BOOLEAN\")\n}\n\nfunc (NotSupported) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"INT32\")\n}\n\nfunc (NotSupported) EncodeInt64(dst []byte, src []int64) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"INT64\")\n}\n\nfunc (NotSupported) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"INT96\")\n}\n\nfunc (NotSupported) EncodeFloat(dst []byte, src []float32) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"FLOAT\")\n}\n\nfunc (NotSupported) EncodeDouble(dst []byte, src []float64) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"DOUBLE\")\n}\n\nfunc (NotSupported) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"BYTE_ARRAY\")\n}\n\nfunc (NotSupported) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {\n\treturn dst[:0], errNotSupported(\"FIXED_LEN_BYTE_ARRAY\")\n}\n\nfunc (NotSupported) DecodeLevels(dst []uint8, src []byte) ([]uint8, error) {\n\treturn dst, errNotSupported(\"LEVELS\")\n}\n\nfunc (NotSupported) DecodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\treturn dst, errNotSupported(\"BOOLEAN\")\n}\n\nfunc (NotSupported) DecodeInt32(dst []int32, src []byte) ([]int32, error) {\n\treturn dst, errNotSupported(\"INT32\")\n}\n\nfunc (NotSupported) DecodeInt64(dst []int64, src []byte) ([]int64, error) {\n\treturn dst, errNotSupported(\"INT64\")\n}\n\nfunc (NotSupported) DecodeInt96(dst []deprecated.Int96, src []byte) ([]deprecated.Int96, error) 
{\n\treturn dst, errNotSupported(\"INT96\")\n}\n\nfunc (NotSupported) DecodeFloat(dst []float32, src []byte) ([]float32, error) {\n\treturn dst, errNotSupported(\"FLOAT\")\n}\n\nfunc (NotSupported) DecodeDouble(dst []float64, src []byte) ([]float64, error) {\n\treturn dst, errNotSupported(\"DOUBLE\")\n}\n\nfunc (NotSupported) DecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error) {\n\treturn dst, offsets, errNotSupported(\"BYTE_ARRAY\")\n}\n\nfunc (NotSupported) DecodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {\n\treturn dst, errNotSupported(\"FIXED_LEN_BYTE_ARRAY\")\n}\n\nfunc (NotSupported) EstimateDecodeByteArraySize(src []byte) int {\n\treturn 0\n}\n\nfunc (NotSupported) CanDecodeInPlace() bool {\n\treturn false\n}\n\nfunc errNotSupported(typ string) error {\n\treturn fmt.Errorf(\"%w for type %s\", ErrNotSupported, typ)\n}\n\nvar (\n\t_ Encoding = NotSupported{}\n)\n"
  },
  {
    "path": "encoding/plain/dictionary.go",
    "content": "package plain\n\nimport (\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype DictionaryEncoding struct {\n\tencoding.NotSupported\n\tplain Encoding\n}\n\nfunc (e *DictionaryEncoding) String() string {\n\treturn \"PLAIN_DICTIONARY\"\n}\n\nfunc (e *DictionaryEncoding) Encoding() format.Encoding {\n\treturn format.PlainDictionary\n}\n\nfunc (e *DictionaryEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\treturn e.plain.EncodeInt32(dst, src)\n}\n\nfunc (e *DictionaryEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {\n\treturn e.plain.DecodeInt32(dst, src)\n}\n"
  },
  {
    "path": "encoding/plain/plain.go",
    "content": "// Package plain implements the PLAIN parquet encoding.\n//\n// https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0\npackage plain\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"io\"\n\t\"math\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nconst (\n\tByteArrayLengthSize = 4\n\tMaxByteArrayLength  = math.MaxInt32\n)\n\ntype Encoding struct {\n\tencoding.NotSupported\n}\n\nfunc (e *Encoding) String() string {\n\treturn \"PLAIN\"\n}\n\nfunc (e *Encoding) Encoding() format.Encoding {\n\treturn format.Plain\n}\n\nfunc (e *Encoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\treturn append(dst[:0], src...), nil\n}\n\nfunc (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\treturn append(dst[:0], unsafecast.Int32ToBytes(src)...), nil\n}\n\nfunc (e *Encoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) {\n\treturn append(dst[:0], unsafecast.Int64ToBytes(src)...), nil\n}\n\nfunc (e *Encoding) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) {\n\treturn append(dst[:0], deprecated.Int96ToBytes(src)...), nil\n}\n\nfunc (e *Encoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) {\n\treturn append(dst[:0], unsafecast.Float32ToBytes(src)...), nil\n}\n\nfunc (e *Encoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) {\n\treturn append(dst[:0], unsafecast.Float64ToBytes(src)...), nil\n}\n\nfunc (e *Encoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {\n\tdst = dst[:0]\n\n\tif len(offsets) > 0 {\n\t\tbaseOffset := offsets[0]\n\n\t\tfor _, endOffset := range offsets[1:] {\n\t\t\tdst = AppendByteArray(dst, src[baseOffset:endOffset:endOffset])\n\t\t\tbaseOffset = endOffset\n\t\t}\n\t}\n\n\treturn dst, nil\n}\n\nfunc (e *Encoding) EncodeFixedLenByteArray(dst []byte, 
src []byte, size int) ([]byte, error) {\n\tif size < 0 || size > encoding.MaxFixedLenByteArraySize {\n\t\treturn dst[:0], encoding.Error(e, encoding.ErrInvalidArgument)\n\t}\n\treturn append(dst[:0], src...), nil\n}\n\nfunc (e *Encoding) DecodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\treturn append(dst[:0], src...), nil\n}\n\nfunc (e *Encoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {\n\tif (len(src) % 4) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"INT32\", len(src))\n\t}\n\treturn append(dst[:0], unsafecast.BytesToInt32(src)...), nil\n}\n\nfunc (e *Encoding) DecodeInt64(dst []int64, src []byte) ([]int64, error) {\n\tif (len(src) % 8) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"INT64\", len(src))\n\t}\n\treturn append(dst[:0], unsafecast.BytesToInt64(src)...), nil\n}\n\nfunc (e *Encoding) DecodeInt96(dst []deprecated.Int96, src []byte) ([]deprecated.Int96, error) {\n\tif (len(src) % 12) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"INT96\", len(src))\n\t}\n\treturn append(dst[:0], deprecated.BytesToInt96(src)...), nil\n}\n\nfunc (e *Encoding) DecodeFloat(dst []float32, src []byte) ([]float32, error) {\n\tif (len(src) % 4) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"FLOAT\", len(src))\n\t}\n\treturn append(dst[:0], unsafecast.BytesToFloat32(src)...), nil\n}\n\nfunc (e *Encoding) DecodeDouble(dst []float64, src []byte) ([]float64, error) {\n\tif (len(src) % 8) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"DOUBLE\", len(src))\n\t}\n\treturn append(dst[:0], unsafecast.BytesToFloat64(src)...), nil\n}\n\nfunc (e *Encoding) DecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error) {\n\tdst, offsets = dst[:0], offsets[:0]\n\n\tfor i := 0; i < len(src); {\n\t\tif (len(src) - i) < ByteArrayLengthSize {\n\t\t\treturn dst, offsets, ErrTooShort(len(src))\n\t\t}\n\t\tn := ByteArrayLength(src[i:])\n\t\tif n > (len(src) - 
ByteArrayLengthSize) {\n\t\t\treturn dst, offsets, ErrTooShort(len(src))\n\t\t}\n\t\ti += ByteArrayLengthSize\n\t\toffsets = append(offsets, uint32(len(dst)))\n\t\tdst = append(dst, src[i:i+n]...)\n\t\ti += n\n\t}\n\n\treturn dst, append(offsets, uint32(len(dst))), nil\n}\n\nfunc (e *Encoding) DecodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {\n\tif size < 0 || size > encoding.MaxFixedLenByteArraySize {\n\t\treturn dst, encoding.Error(e, encoding.ErrInvalidArgument)\n\t}\n\tif (len(src) % size) != 0 {\n\t\treturn dst, encoding.ErrDecodeInvalidInputSize(e, \"FIXED_LEN_BYTE_ARRAY\", len(src))\n\t}\n\treturn append(dst[:0], src...), nil\n}\n\nfunc (e *Encoding) EstimateDecodeByteArraySize(src []byte) int {\n\treturn len(src)\n}\n\nfunc (e *Encoding) CanDecodeInPlace() bool {\n\treturn true\n}\n\nfunc Boolean(v bool) []byte { return AppendBoolean(nil, 0, v) }\n\nfunc Int32(v int32) []byte { return AppendInt32(nil, v) }\n\nfunc Int64(v int64) []byte { return AppendInt64(nil, v) }\n\nfunc Int96(v deprecated.Int96) []byte { return AppendInt96(nil, v) }\n\nfunc Float(v float32) []byte { return AppendFloat(nil, v) }\n\nfunc Double(v float64) []byte { return AppendDouble(nil, v) }\n\nfunc ByteArray(v []byte) []byte { return AppendByteArray(nil, v) }\n\nfunc AppendBoolean(b []byte, n int, v bool) []byte {\n\ti := n / 8\n\tj := n % 8\n\n\tif cap(b) > i {\n\t\tb = b[:i+1]\n\t} else {\n\t\ttmp := make([]byte, i+1, 2*(i+1))\n\t\tcopy(tmp, b)\n\t\tb = tmp\n\t}\n\n\tk := uint(j)\n\tx := byte(0)\n\tif v {\n\t\tx = 1\n\t}\n\n\tb[i] = (b[i] & ^(1 << k)) | (x << k)\n\treturn b\n}\n\nfunc AppendInt32(b []byte, v int32) []byte {\n\tx := [4]byte{}\n\tbinary.LittleEndian.PutUint32(x[:], uint32(v))\n\treturn append(b, x[:]...)\n}\n\nfunc AppendInt64(b []byte, v int64) []byte {\n\tx := [8]byte{}\n\tbinary.LittleEndian.PutUint64(x[:], uint64(v))\n\treturn append(b, x[:]...)\n}\n\nfunc AppendInt96(b []byte, v deprecated.Int96) []byte {\n\tx := 
[12]byte{}\n\tbinary.LittleEndian.PutUint32(x[0:4], v[0])\n\tbinary.LittleEndian.PutUint32(x[4:8], v[1])\n\tbinary.LittleEndian.PutUint32(x[8:12], v[2])\n\treturn append(b, x[:]...)\n}\n\nfunc AppendFloat(b []byte, v float32) []byte {\n\tx := [4]byte{}\n\tbinary.LittleEndian.PutUint32(x[:], math.Float32bits(v))\n\treturn append(b, x[:]...)\n}\n\nfunc AppendDouble(b []byte, v float64) []byte {\n\tx := [8]byte{}\n\tbinary.LittleEndian.PutUint64(x[:], math.Float64bits(v))\n\treturn append(b, x[:]...)\n}\n\nfunc AppendByteArray(b, v []byte) []byte {\n\tlength := [ByteArrayLengthSize]byte{}\n\tPutByteArrayLength(length[:], len(v))\n\tb = append(b, length[:]...)\n\tb = append(b, v...)\n\treturn b\n}\n\nfunc AppendByteArrayString(b []byte, v string) []byte {\n\tlength := [ByteArrayLengthSize]byte{}\n\tPutByteArrayLength(length[:], len(v))\n\tb = append(b, length[:]...)\n\tb = append(b, v...)\n\treturn b\n}\n\nfunc AppendByteArrayLength(b []byte, n int) []byte {\n\tlength := [ByteArrayLengthSize]byte{}\n\tPutByteArrayLength(length[:], n)\n\treturn append(b, length[:]...)\n}\n\nfunc ByteArrayLength(b []byte) int {\n\treturn int(binary.LittleEndian.Uint32(b))\n}\n\nfunc PutByteArrayLength(b []byte, n int) {\n\tbinary.LittleEndian.PutUint32(b, uint32(n))\n}\n\nfunc RangeByteArray(b []byte, do func([]byte) error) (err error) {\n\tfor len(b) > 0 {\n\t\tvar v []byte\n\t\tif v, b, err = NextByteArray(b); err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif err = do(v); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\treturn nil\n}\n\nfunc NextByteArray(b []byte) (v, r []byte, err error) {\n\tif len(b) < ByteArrayLengthSize {\n\t\treturn nil, b, ErrTooShort(len(b))\n\t}\n\tn := ByteArrayLength(b)\n\tif n > (len(b) - ByteArrayLengthSize) {\n\t\treturn nil, b, ErrTooShort(len(b))\n\t}\n\tif n > MaxByteArrayLength {\n\t\treturn nil, b, ErrTooLarge(n)\n\t}\n\tn += ByteArrayLengthSize\n\treturn b[ByteArrayLengthSize:n:n], b[n:len(b):len(b)], nil\n}\n\nfunc ErrTooShort(length int) error 
{\n\treturn fmt.Errorf(\"input of length %d is too short to contain a PLAIN encoded byte array value: %w\", length, io.ErrUnexpectedEOF)\n}\n\nfunc ErrTooLarge(length int) error {\n\treturn fmt.Errorf(\"byte array of length %d is too large to be encoded\", length)\n}\n"
  },
  {
    "path": "encoding/plain/plain_test.go",
    "content": "package plain_test\n\nimport (\n\t\"bytes\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n)\n\nfunc TestAppendBoolean(t *testing.T) {\n\tvalues := []byte{}\n\n\tfor i := 0; i < 100; i++ {\n\t\tvalues = plain.AppendBoolean(values, i, (i%2) != 0)\n\t}\n\n\tif !bytes.Equal(values, []byte{\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b10101010,\n\t\t0b00001010,\n\t}) {\n\t\tt.Errorf(\"%08b\\n\", values)\n\t}\n}\n"
  },
  {
    "path": "encoding/rle/dictionary.go",
    "content": "package rle\n\nimport (\n\t\"math/bits\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\ntype DictionaryEncoding struct {\n\tencoding.NotSupported\n}\n\nfunc (e *DictionaryEncoding) String() string {\n\treturn \"RLE_DICTIONARY\"\n}\n\nfunc (e *DictionaryEncoding) Encoding() format.Encoding {\n\treturn format.RLEDictionary\n}\n\nfunc (e *DictionaryEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\tbitWidth := maxLenInt32(src)\n\tdst = append(dst[:0], byte(bitWidth))\n\tdst, err := encodeInt32(dst, src, uint(bitWidth))\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *DictionaryEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {\n\tif len(src) == 0 {\n\t\treturn dst[:0], nil\n\t}\n\tbuf := unsafecast.Int32ToBytes(dst)\n\tbuf, err := decodeInt32(buf[:0], src[1:], uint(src[0]))\n\treturn unsafecast.BytesToInt32(buf), e.wrap(err)\n}\n\nfunc (e *DictionaryEncoding) wrap(err error) error {\n\tif err != nil {\n\t\terr = encoding.Error(e, err)\n\t}\n\treturn err\n}\n\nfunc clearInt32(data []int32) {\n\tfor i := range data {\n\t\tdata[i] = 0\n\t}\n}\n\nfunc maxLenInt32(data []int32) (max int) {\n\tfor _, v := range data {\n\t\tif n := bits.Len32(uint32(v)); n > max {\n\t\t\tmax = n\n\t\t}\n\t}\n\treturn max\n}\n"
  },
  {
    "path": "encoding/rle/rle.go",
    "content": "// Package rle implements the hybrid RLE/Bit-Packed encoding employed in\n// repetition and definition levels, dictionary indexed data pages, and\n// boolean values in the PLAIN encoding.\n//\n// https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3\npackage rle\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"io\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/bitpack\"\n\t\"github.com/segmentio/parquet-go/internal/bytealg\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nconst (\n\t// This limit is intended to prevent unbounded memory allocations when\n\t// decoding runs.\n\t//\n\t// We use a generous limit which allows for over 16 million values per page\n\t// if there is only one run to encode the repetition or definition levels\n\t// (this should be uncommon).\n\tmaxSupportedValueCount = 16 * 1024 * 1024\n)\n\ntype Encoding struct {\n\tencoding.NotSupported\n\tBitWidth int\n}\n\nfunc (e *Encoding) String() string {\n\treturn \"RLE\"\n}\n\nfunc (e *Encoding) Encoding() format.Encoding {\n\treturn format.RLE\n}\n\nfunc (e *Encoding) EncodeLevels(dst []byte, src []uint8) ([]byte, error) {\n\tdst, err := encodeBytes(dst[:0], src, uint(e.BitWidth))\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\t// In the case of encoding a boolean values, the 4 bytes length of the\n\t// output is expected by the parquet format. 
We add the bytes as placeholder\n\t// before appending the encoded data.\n\tdst = append(dst[:0], 0, 0, 0, 0)\n\tdst, err := encodeBits(dst, src)\n\tbinary.LittleEndian.PutUint32(dst, uint32(len(dst))-4)\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {\n\tdst, err := encodeInt32(dst[:0], src, uint(e.BitWidth))\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) DecodeLevels(dst []uint8, src []byte) ([]uint8, error) {\n\tdst, err := decodeBytes(dst[:0], src, uint(e.BitWidth))\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) DecodeBoolean(dst []byte, src []byte) ([]byte, error) {\n\tif len(src) == 4 {\n\t\treturn dst[:0], nil\n\t}\n\tif len(src) < 4 {\n\t\treturn dst[:0], fmt.Errorf(\"input shorter than 4 bytes: %w\", io.ErrUnexpectedEOF)\n\t}\n\tn := int(binary.LittleEndian.Uint32(src))\n\tsrc = src[4:]\n\tif n > len(src) {\n\t\treturn dst[:0], fmt.Errorf(\"input shorter than length prefix: %d < %d: %w\", len(src), n, io.ErrUnexpectedEOF)\n\t}\n\tdst, err := decodeBits(dst[:0], src[:n])\n\treturn dst, e.wrap(err)\n}\n\nfunc (e *Encoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {\n\tbuf := unsafecast.Int32ToBytes(dst)\n\tbuf, err := decodeInt32(buf[:0], src, uint(e.BitWidth))\n\treturn unsafecast.BytesToInt32(buf), e.wrap(err)\n}\n\nfunc (e *Encoding) wrap(err error) error {\n\tif err != nil {\n\t\terr = encoding.Error(e, err)\n\t}\n\treturn err\n}\n\nfunc encodeBits(dst, src []byte) ([]byte, error) {\n\tif len(src) == 0 || isZero(src) || isOnes(src) {\n\t\tdst = appendUvarint(dst, uint64(8*len(src))<<1)\n\t\tif len(src) > 0 {\n\t\t\tdst = append(dst, src[0])\n\t\t}\n\t\treturn dst, nil\n\t}\n\n\tfor i := 0; i < len(src); {\n\t\tj := i + 1\n\n\t\t// Look for contiguous sections of 8 bits, all zeros or ones; these\n\t\t// are run-length encoded as it only takes 2 or 3 bytes to store these\n\t\t// sequences.\n\t\tif src[i] == 0 || src[i] == 0xFF {\n\t\t\tfor j < len(src) && src[i] == src[j] 
{\n\t\t\t\tj++\n\t\t\t}\n\n\t\t\tif n := j - i; n > 1 {\n\t\t\t\tdst = appendRunLengthBits(dst, 8*n, src[i])\n\t\t\t\ti = j\n\t\t\t\tcontinue\n\t\t\t}\n\t\t}\n\n\t\t// Sequences of bits that are neither all zeroes or ones are bit-packed,\n\t\t// which is a simple copy of the input to the output preceded with the\n\t\t// bit-pack header.\n\t\tfor j < len(src) && (src[j-1] != src[j] || (src[j] != 0 && src[j] == 0xFF)) {\n\t\t\tj++\n\t\t}\n\n\t\tif (j-i) > 1 && j < len(src) {\n\t\t\tj--\n\t\t}\n\n\t\tdst = appendBitPackedBits(dst, src[i:j])\n\t\ti = j\n\t}\n\treturn dst, nil\n}\n\nfunc encodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) {\n\tif bitWidth > 8 {\n\t\treturn dst, errEncodeInvalidBitWidth(\"INT8\", bitWidth)\n\t}\n\tif bitWidth == 0 {\n\t\tif !isZero(src) {\n\t\t\treturn dst, errEncodeInvalidBitWidth(\"INT8\", bitWidth)\n\t\t}\n\t\treturn appendUvarint(dst, uint64(len(src))<<1), nil\n\t}\n\n\tif len(src) >= 8 {\n\t\twords := unsafe.Slice((*uint64)(unsafe.Pointer(&src[0])), len(src)/8)\n\n\t\tfor i := 0; i < len(words); {\n\t\t\tj := i\n\t\t\tpattern := broadcast8x1(words[i])\n\n\t\t\tfor j < len(words) && words[j] == pattern {\n\t\t\t\tj++\n\t\t\t}\n\n\t\t\tif i < j {\n\t\t\t\tdst = appendRunLengthBytes(dst, 8*(j-i), byte(pattern))\n\t\t\t} else {\n\t\t\t\tj++\n\n\t\t\t\tfor j < len(words) && words[j] != broadcast8x1(words[j-1]) {\n\t\t\t\t\tj++\n\t\t\t\t}\n\n\t\t\t\tdst = appendBitPackedBytes(dst, words[i:j], bitWidth)\n\t\t\t}\n\n\t\t\ti = j\n\t\t}\n\t}\n\n\tfor i := (len(src) / 8) * 8; i < len(src); {\n\t\tj := i + 1\n\n\t\tfor j < len(src) && src[i] == src[j] {\n\t\t\tj++\n\t\t}\n\n\t\tdst = appendRunLengthBytes(dst, j-i, src[i])\n\t\ti = j\n\t}\n\n\treturn dst, nil\n}\n\nfunc encodeInt32(dst []byte, src []int32, bitWidth uint) ([]byte, error) {\n\tif bitWidth > 32 {\n\t\treturn dst, errEncodeInvalidBitWidth(\"INT32\", bitWidth)\n\t}\n\tif bitWidth == 0 {\n\t\tif !isZero(unsafecast.Int32ToBytes(src)) {\n\t\t\treturn dst, 
errEncodeInvalidBitWidth(\"INT32\", bitWidth)\n\t\t}\n\t\treturn appendUvarint(dst, uint64(len(src))<<1), nil\n\t}\n\n\tif len(src) >= 8 {\n\t\twords := unsafe.Slice((*[8]int32)(unsafe.Pointer(&src[0])), len(src)/8)\n\n\t\tfor i := 0; i < len(words); {\n\t\t\tj := i\n\t\t\tpattern := broadcast8x4(words[i][0])\n\n\t\t\tfor j < len(words) && words[j] == pattern {\n\t\t\t\tj++\n\t\t\t}\n\n\t\t\tif i < j {\n\t\t\t\tdst = appendRunLengthInt32(dst, 8*(j-i), pattern[0], bitWidth)\n\t\t\t} else {\n\t\t\t\tj += 1\n\t\t\t\tj += encodeInt32IndexEqual8Contiguous(words[j:])\n\t\t\t\tdst = appendBitPackedInt32(dst, words[i:j], bitWidth)\n\t\t\t}\n\n\t\t\ti = j\n\t\t}\n\t}\n\n\tfor i := (len(src) / 8) * 8; i < len(src); {\n\t\tj := i + 1\n\n\t\tfor j < len(src) && src[i] == src[j] {\n\t\t\tj++\n\t\t}\n\n\t\tdst = appendRunLengthInt32(dst, j-i, src[i], bitWidth)\n\t\ti = j\n\t}\n\n\treturn dst, nil\n}\n\nfunc decodeBits(dst, src []byte) ([]byte, error) {\n\tfor i := 0; i < len(src); {\n\t\tu, n := binary.Uvarint(src[i:])\n\t\tif n == 0 {\n\t\t\treturn dst, fmt.Errorf(\"decoding run-length block header: %w\", io.ErrUnexpectedEOF)\n\t\t}\n\t\tif n < 0 {\n\t\t\treturn dst, fmt.Errorf(\"overflow after decoding %d/%d bytes of run-length block header\", -n+i, len(src))\n\t\t}\n\t\ti += n\n\n\t\tcount, bitpacked := uint(u>>1), (u&1) != 0\n\t\tif count > maxSupportedValueCount {\n\t\t\treturn dst, fmt.Errorf(\"decoded run-length block cannot have more than %d values\", maxSupportedValueCount)\n\t\t}\n\t\tif bitpacked {\n\t\t\tn := int(count)\n\t\t\tj := i + n\n\n\t\t\tif j > len(src) {\n\t\t\t\treturn dst, fmt.Errorf(\"decoding bit-packed block of %d values: %w\", n, io.ErrUnexpectedEOF)\n\t\t\t}\n\n\t\t\tdst = append(dst, src[i:j]...)\n\t\t\ti = j\n\t\t} else {\n\t\t\tword := byte(0)\n\t\t\tif i < len(src) {\n\t\t\t\tword = src[i]\n\t\t\t\ti++\n\t\t\t}\n\n\t\t\toffset := len(dst)\n\t\t\tlength := bitpack.ByteCount(count)\n\t\t\tdst = resize(dst, 
offset+length)\n\t\t\tbytealg.Broadcast(dst[offset:], word)\n\t\t}\n\t}\n\treturn dst, nil\n}\n\nfunc decodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) {\n\tif bitWidth > 8 {\n\t\treturn dst, errDecodeInvalidBitWidth(\"INT8\", bitWidth)\n\t}\n\n\tfor i := 0; i < len(src); {\n\t\tu, n := binary.Uvarint(src[i:])\n\t\tif n == 0 {\n\t\t\treturn dst, fmt.Errorf(\"decoding run-length block header: %w\", io.ErrUnexpectedEOF)\n\t\t}\n\t\tif n < 0 {\n\t\t\treturn dst, fmt.Errorf(\"overflow after decoding %d/%d bytes of run-length block header\", -n+i, len(src))\n\t\t}\n\t\ti += n\n\n\t\tcount, bitpacked := uint(u>>1), (u&1) != 0\n\t\tif count > maxSupportedValueCount {\n\t\t\treturn dst, fmt.Errorf(\"decoded run-length block cannot have more than %d values\", maxSupportedValueCount)\n\t\t}\n\t\tif bitpacked {\n\t\t\tcount *= 8\n\t\t\tj := i + bitpack.ByteCount(count*bitWidth)\n\n\t\t\tif j > len(src) {\n\t\t\t\treturn dst, fmt.Errorf(\"decoding bit-packed block of %d values: %w\", 8*count, io.ErrUnexpectedEOF)\n\t\t\t}\n\n\t\t\toffset := len(dst)\n\t\t\tlength := int(count)\n\t\t\tdst = resize(dst, offset+length)\n\t\t\tdecodeBytesBitpack(dst[offset:], src[i:j], count, bitWidth)\n\n\t\t\ti = j\n\t\t} else {\n\t\t\tif bitWidth != 0 && (i+1) > len(src) {\n\t\t\t\treturn dst, fmt.Errorf(\"decoding run-length block of %d values: %w\", count, io.ErrUnexpectedEOF)\n\t\t\t}\n\n\t\t\tword := byte(0)\n\t\t\tif bitWidth != 0 {\n\t\t\t\tword = src[i]\n\t\t\t\ti++\n\t\t\t}\n\n\t\t\toffset := len(dst)\n\t\t\tlength := int(count)\n\t\t\tdst = resize(dst, offset+length)\n\t\t\tbytealg.Broadcast(dst[offset:], word)\n\t\t}\n\t}\n\n\treturn dst, nil\n}\n\nfunc decodeInt32(dst, src []byte, bitWidth uint) ([]byte, error) {\n\tif bitWidth > 32 {\n\t\treturn dst, errDecodeInvalidBitWidth(\"INT32\", bitWidth)\n\t}\n\n\tbuf := make([]byte, 2*bitpack.PaddingInt32)\n\n\tfor i := 0; i < len(src); {\n\t\tu, n := binary.Uvarint(src[i:])\n\t\tif n == 0 {\n\t\t\treturn dst, 
fmt.Errorf(\"decoding run-length block header: %w\", io.ErrUnexpectedEOF)\n\t\t}\n\t\tif n < 0 {\n\t\t\treturn dst, fmt.Errorf(\"overflow after decoding %d/%d bytes of run-length block header\", -n+i, len(src))\n\t\t}\n\t\ti += n\n\n\t\tcount, bitpacked := uint(u>>1), (u&1) != 0\n\t\tif count > maxSupportedValueCount {\n\t\t\treturn dst, fmt.Errorf(\"decoded run-length block cannot have more than %d values\", maxSupportedValueCount)\n\t\t}\n\t\tif bitpacked {\n\t\t\toffset := len(dst)\n\t\t\tlength := int(count * bitWidth)\n\t\t\tdst = resize(dst, offset+4*8*int(count))\n\n\t\t\t// The bitpack.UnpackInt32 function requires the input to be padded\n\t\t\t// or the function panics. If there is enough room in the input\n\t\t\t// buffer we can use it, otherwise we have to copy it to a larger\n\t\t\t// location (which should rarely happen).\n\t\t\tin := src[i : i+length]\n\t\t\tif (cap(in) - len(in)) >= bitpack.PaddingInt32 {\n\t\t\t\tin = in[:cap(in)]\n\t\t\t} else {\n\t\t\t\tbuf = resize(buf, len(in)+bitpack.PaddingInt32)\n\t\t\t\tcopy(buf, in)\n\t\t\t\tin = buf\n\t\t\t}\n\n\t\t\tout := unsafecast.BytesToInt32(dst[offset:])\n\t\t\tbitpack.UnpackInt32(out, in, bitWidth)\n\t\t\ti += length\n\t\t} else {\n\t\t\tj := i + bitpack.ByteCount(bitWidth)\n\n\t\t\tif j > len(src) {\n\t\t\t\treturn dst, fmt.Errorf(\"decoding run-length block of %d values: %w\", count, io.ErrUnexpectedEOF)\n\t\t\t}\n\n\t\t\tbits := [4]byte{}\n\t\t\tcopy(bits[:], src[i:j])\n\t\t\tdst = appendRepeat(dst, bits[:], count)\n\t\t\ti = j\n\t\t}\n\t}\n\n\treturn dst, nil\n}\n\nfunc errEncodeInvalidBitWidth(typ string, bitWidth uint) error {\n\treturn errInvalidBitWidth(\"encode\", typ, bitWidth)\n}\n\nfunc errDecodeInvalidBitWidth(typ string, bitWidth uint) error {\n\treturn errInvalidBitWidth(\"decode\", typ, bitWidth)\n}\n\nfunc errInvalidBitWidth(op, typ string, bitWidth uint) error {\n\treturn fmt.Errorf(\"cannot %s %s with invalid bit-width=%d\", op, typ, bitWidth)\n}\n\nfunc appendRepeat(dst, pattern 
[]byte, count uint) []byte {\n\toffset := len(dst)\n\tlength := int(count) * len(pattern)\n\tdst = resize(dst, offset+length)\n\ti := offset + copy(dst[offset:], pattern)\n\tfor i < len(dst) {\n\t\ti += copy(dst[i:], dst[offset:i])\n\t}\n\treturn dst\n}\n\nfunc appendUvarint(dst []byte, u uint64) []byte {\n\tvar b [binary.MaxVarintLen64]byte\n\tvar n = binary.PutUvarint(b[:], u)\n\treturn append(dst, b[:n]...)\n}\n\nfunc appendRunLengthBits(dst []byte, count int, value byte) []byte {\n\treturn appendRunLengthBytes(dst, count, value)\n}\n\nfunc appendBitPackedBits(dst []byte, words []byte) []byte {\n\tn := len(dst)\n\tdst = resize(dst, n+binary.MaxVarintLen64+len(words))\n\tn += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1)\n\tn += copy(dst[n:], words)\n\treturn dst[:n]\n}\n\nfunc appendRunLengthBytes(dst []byte, count int, value byte) []byte {\n\tn := len(dst)\n\tdst = resize(dst, n+binary.MaxVarintLen64+1)\n\tn += binary.PutUvarint(dst[n:], uint64(count)<<1)\n\tdst[n] = value\n\treturn dst[:n+1]\n}\n\nfunc appendBitPackedBytes(dst []byte, words []uint64, bitWidth uint) []byte {\n\tn := len(dst)\n\tdst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+8)\n\tn += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1)\n\tn += encodeBytesBitpack(dst[n:], words, bitWidth)\n\treturn dst[:n]\n}\n\nfunc appendRunLengthInt32(dst []byte, count int, value int32, bitWidth uint) []byte {\n\tn := len(dst)\n\tdst = resize(dst, n+binary.MaxVarintLen64+4)\n\tn += binary.PutUvarint(dst[n:], uint64(count)<<1)\n\tbinary.LittleEndian.PutUint32(dst[n:], uint32(value))\n\treturn dst[:n+bitpack.ByteCount(bitWidth)]\n}\n\nfunc appendBitPackedInt32(dst []byte, words [][8]int32, bitWidth uint) []byte {\n\tn := len(dst)\n\tdst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+32)\n\tn += binary.PutUvarint(dst[n:], uint64(len(words))<<1|1)\n\tn += encodeInt32Bitpack(dst[n:], words, bitWidth)\n\treturn dst[:n]\n}\n\nfunc broadcast8x1(v uint64) uint64 
{\n\treturn (v & 0xFF) * 0x0101010101010101\n}\n\nfunc broadcast8x4(v int32) [8]int32 {\n\treturn [8]int32{v, v, v, v, v, v, v, v}\n}\n\nfunc isZero(data []byte) bool {\n\treturn bytealg.Count(data, 0x00) == len(data)\n}\n\nfunc isOnes(data []byte) bool {\n\treturn bytealg.Count(data, 0xFF) == len(data)\n}\n\nfunc resize(buf []byte, size int) []byte {\n\tif cap(buf) < size {\n\t\treturn grow(buf, size)\n\t}\n\treturn buf[:size]\n}\n\nfunc grow(buf []byte, size int) []byte {\n\tnewCap := 2 * cap(buf)\n\tif newCap < size {\n\t\tnewCap = size\n\t}\n\tnewBuf := make([]byte, size, newCap)\n\tcopy(newBuf, buf)\n\treturn newBuf\n}\n\nfunc encodeInt32BitpackDefault(dst []byte, src [][8]int32, bitWidth uint) int {\n\tbits := unsafe.Slice((*int32)(unsafe.Pointer(&src[0])), len(src)*8)\n\tbitpack.PackInt32(dst, bits, bitWidth)\n\treturn bitpack.ByteCount(uint(len(src)*8) * bitWidth)\n}\n\nfunc encodeBytesBitpackDefault(dst []byte, src []uint64, bitWidth uint) int {\n\tbitMask := uint64(1<<bitWidth) - 1\n\tn := 0\n\n\tfor _, word := range src {\n\t\tword = (word & bitMask) |\n\t\t\t(((word >> 8) & bitMask) << (1 * bitWidth)) |\n\t\t\t(((word >> 16) & bitMask) << (2 * bitWidth)) |\n\t\t\t(((word >> 24) & bitMask) << (3 * bitWidth)) |\n\t\t\t(((word >> 32) & bitMask) << (4 * bitWidth)) |\n\t\t\t(((word >> 40) & bitMask) << (5 * bitWidth)) |\n\t\t\t(((word >> 48) & bitMask) << (6 * bitWidth)) |\n\t\t\t(((word >> 56) & bitMask) << (7 * bitWidth))\n\t\tbinary.LittleEndian.PutUint64(dst[n:], word)\n\t\tn += int(bitWidth)\n\t}\n\n\treturn n\n}\n\nfunc decodeBytesBitpackDefault(dst, src []byte, count, bitWidth uint) {\n\tdst = dst[:0]\n\n\tbitMask := uint64(1<<bitWidth) - 1\n\tbyteCount := bitpack.ByteCount(8 * bitWidth)\n\n\tfor i := 0; count > 0; count -= 8 {\n\t\tj := i + byteCount\n\n\t\tbits := [8]byte{}\n\t\tcopy(bits[:], src[i:j])\n\t\tword := binary.LittleEndian.Uint64(bits[:])\n\n\t\tdst = 
append(dst,\n\t\t\tbyte((word>>(0*bitWidth))&bitMask),\n\t\t\tbyte((word>>(1*bitWidth))&bitMask),\n\t\t\tbyte((word>>(2*bitWidth))&bitMask),\n\t\t\tbyte((word>>(3*bitWidth))&bitMask),\n\t\t\tbyte((word>>(4*bitWidth))&bitMask),\n\t\t\tbyte((word>>(5*bitWidth))&bitMask),\n\t\t\tbyte((word>>(6*bitWidth))&bitMask),\n\t\t\tbyte((word>>(7*bitWidth))&bitMask),\n\t\t)\n\n\t\ti = j\n\t}\n}\n"
  },
  {
    "path": "encoding/rle/rle_amd64.go",
    "content": "//go:build !purego\n\npackage rle\n\nimport (\n\t\"golang.org/x/sys/cpu\"\n)\n\nvar (\n\tencodeInt32IndexEqual8Contiguous func(words [][8]int32) int\n\tencodeInt32Bitpack               func(dst []byte, src [][8]int32, bitWidth uint) int\n\tencodeBytesBitpack               func(dst []byte, src []uint64, bitWidth uint) int\n\tdecodeBytesBitpack               func(dst, src []byte, count, bitWidth uint)\n)\n\nfunc init() {\n\tswitch {\n\tcase cpu.X86.HasAVX2:\n\t\tencodeInt32IndexEqual8Contiguous = encodeInt32IndexEqual8ContiguousAVX2\n\t\tencodeInt32Bitpack = encodeInt32BitpackAVX2\n\tdefault:\n\t\tencodeInt32IndexEqual8Contiguous = encodeInt32IndexEqual8ContiguousSSE\n\t\tencodeInt32Bitpack = encodeInt32BitpackDefault\n\t}\n\n\tswitch {\n\tcase cpu.X86.HasBMI2:\n\t\tencodeBytesBitpack = encodeBytesBitpackBMI2\n\t\tdecodeBytesBitpack = decodeBytesBitpackBMI2\n\tdefault:\n\t\tencodeBytesBitpack = encodeBytesBitpackDefault\n\t\tdecodeBytesBitpack = decodeBytesBitpackDefault\n\t}\n}\n\n//go:noescape\nfunc encodeBytesBitpackBMI2(dst []byte, src []uint64, bitWidth uint) int\n\n//go:noescape\nfunc encodeInt32IndexEqual8ContiguousAVX2(words [][8]int32) int\n\n//go:noescape\nfunc encodeInt32IndexEqual8ContiguousSSE(words [][8]int32) int\n\n//go:noescape\nfunc encodeInt32Bitpack1to16bitsAVX2(dst []byte, src [][8]int32, bitWidth uint) int\n\nfunc encodeInt32BitpackAVX2(dst []byte, src [][8]int32, bitWidth uint) int {\n\tswitch {\n\tcase bitWidth == 0:\n\t\treturn 0\n\tcase bitWidth <= 16:\n\t\treturn encodeInt32Bitpack1to16bitsAVX2(dst, src, bitWidth)\n\tdefault:\n\t\treturn encodeInt32BitpackDefault(dst, src, bitWidth)\n\t}\n}\n\n//go:noescape\nfunc decodeBytesBitpackBMI2(dst, src []byte, count, bitWidth uint)\n"
  },
  {
    "path": "encoding/rle/rle_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\nGLOBL bitMasks<>(SB), RODATA|NOPTR, $64\nDATA bitMasks<>+0(SB)/8,  $0b0000000100000001000000010000000100000001000000010000000100000001\nDATA bitMasks<>+8(SB)/8,  $0b0000001100000011000000110000001100000011000000110000001100000011\nDATA bitMasks<>+16(SB)/8, $0b0000011100000111000001110000011100000111000001110000011100000111\nDATA bitMasks<>+24(SB)/8, $0b0000111100001111000011110000111100001111000011110000111100001111\nDATA bitMasks<>+32(SB)/8, $0b0001111100011111000111110001111100011111000111110001111100011111\nDATA bitMasks<>+40(SB)/8, $0b0011111100111111001111110011111100111111001111110011111100111111\nDATA bitMasks<>+48(SB)/8, $0b0111111101111111011111110111111101111111011111110111111101111111\nDATA bitMasks<>+56(SB)/8, $0b1111111111111111111111111111111111111111111111111111111111111111\n\n// func decodeBytesBitpackBMI2(dst, src []byte, count, bitWidth uint)\nTEXT ·decodeBytesBitpackBMI2(SB), NOSPLIT, $0-64\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_base+24(FP), BX\n    MOVQ count+48(FP), CX\n    MOVQ bitWidth+56(FP), DX\n    LEAQ bitMasks<>(SB), DI\n    MOVQ -8(DI)(DX*8), DI\n    XORQ SI, SI\n    SHRQ $3, CX\n    JMP test\nloop:\n    MOVQ (BX), R8\n    PDEPQ DI, R8, R8\n    MOVQ R8, (AX)(SI*8)\n    ADDQ DX, BX\n    INCQ SI\ntest:\n    CMPQ SI, CX\n    JNE loop\n    RET\n\n// func encodeBytesBitpackBMI2(dst []byte, src []uint64, bitWidth uint) int\nTEXT ·encodeBytesBitpackBMI2(SB), NOSPLIT, $0-64\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_base+24(FP), BX\n    MOVQ src_len+32(FP), CX\n    MOVQ bitWidth+48(FP), DX\n    LEAQ bitMasks<>(SB), DI\n    MOVQ -8(DI)(DX*8), DI\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ (BX)(SI*8), R8\n    PEXTQ DI, R8, R8\n    MOVQ R8, (AX)\n    ADDQ DX, AX\n    INCQ SI\ntest:\n    CMPQ SI, CX\n    JNE loop\ndone:\n    SUBQ dst+0(FP), AX\n    MOVQ AX, ret+56(FP)\n    RET\n\n// func encodeInt32IndexEqual8ContiguousAVX2(words [][8]int32) int\nTEXT 
·encodeInt32IndexEqual8ContiguousAVX2(SB), NOSPLIT, $0-32\n    MOVQ words_base+0(FP), AX\n    MOVQ words_len+8(FP), BX\n    XORQ SI, SI\n    SHLQ $5, BX\n    JMP test\nloop:\n    VMOVDQU (AX)(SI*1), Y0\n    VPSHUFD $0, Y0, Y1\n    VPCMPEQD Y1, Y0, Y0\n    VMOVMSKPS Y0, CX\n    CMPL CX, $0xFF\n    JE done\n    ADDQ $32, SI\ntest:\n    CMPQ SI, BX\n    JNE loop\ndone:\n    VZEROUPPER\n    SHRQ $5, SI\n    MOVQ SI, ret+24(FP)\n    RET\n\n// func encodeInt32IndexEqual8ContiguousSSE(words [][8]int32) int\nTEXT ·encodeInt32IndexEqual8ContiguousSSE(SB), NOSPLIT, $0-32\n    MOVQ words_base+0(FP), AX\n    MOVQ words_len+8(FP), BX\n    XORQ SI, SI\n    SHLQ $5, BX\n    JMP test\nloop:\n    MOVOU (AX)(SI*1), X0\n    MOVOU 16(AX)(SI*1), X1\n    PSHUFD $0, X0, X2\n    PCMPEQL X2, X0\n    PCMPEQL X2, X1\n    MOVMSKPS X0, CX\n    MOVMSKPS X1, DX\n    ANDL DX, CX\n    CMPL CX, $0xF\n    JE done\n    ADDQ $32, SI\ntest:\n    CMPQ SI, BX\n    JNE loop\ndone:\n    SHRQ $5, SI\n    MOVQ SI, ret+24(FP)\n    RET\n\n// func encodeInt32Bitpack1to16bitsAVX2(dst []byte, src [][8]int32, bitWidth uint) int\nTEXT ·encodeInt32Bitpack1to16bitsAVX2(SB), NOSPLIT, $0-64\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_base+24(FP), BX\n    MOVQ src_len+32(FP), CX\n    MOVQ bitWidth+48(FP), DX\n\n    MOVQ DX, X0\n    VPBROADCASTQ X0, Y6 // [1*bitWidth...]\n    VPSLLQ $1, Y6, Y7   // [2*bitWidth...]\n    VPADDQ Y6, Y7, Y8   // [3*bitWidth...]\n    VPSLLQ $2, Y6, Y9   // [4*bitWidth...]\n\n    MOVQ $64, DI\n    MOVQ DI, X1\n    VPBROADCASTQ X1, Y10\n    VPSUBQ Y6, Y10, Y11 // [64-1*bitWidth...]\n    VPSUBQ Y9, Y10, Y12 // [64-4*bitWidth...]\n    VPCMPEQQ Y4, Y4, Y4\n    VPSRLVQ Y11, Y4, Y4\n\n    VPXOR Y5, Y5, Y5\n    XORQ SI, SI\n    SHLQ $5, CX\n    JMP test\nloop:\n    VMOVDQU (BX)(SI*1), Y0\n    VPSHUFD $0b01010101, Y0, Y1\n    VPSHUFD $0b10101010, Y0, Y2\n    VPSHUFD $0b11111111, Y0, Y3\n\n    VPAND Y4, Y0, Y0\n    VPAND Y4, Y1, Y1\n    VPAND Y4, Y2, Y2\n    VPAND Y4, Y3, Y3\n\n    VPSLLVQ Y6, Y1, Y1\n  
  VPSLLVQ Y7, Y2, Y2\n    VPSLLVQ Y8, Y3, Y3\n\n    VPOR Y1, Y0, Y0\n    VPOR Y3, Y2, Y2\n    VPOR Y2, Y0, Y0\n\n    VPERMQ $0b00001010, Y0, Y1\n\n    VPSLLVQ X9, X1, X2\n    VPSRLQ X12, X1, X3\n    VBLENDPD $0b10, X3, X2, X1\n    VBLENDPD $0b10, X5, X0, X0\n    VPOR X1, X0, X0\n\n    VMOVDQU X0, (AX)\n\n    ADDQ DX, AX\n    ADDQ $32, SI\ntest:\n    CMPQ SI, CX\n    JNE loop\n    VZEROUPPER\n    SUBQ dst+0(FP), AX\n    MOVQ AX, ret+56(FP)\n    RET\n"
  },
  {
    "path": "encoding/rle/rle_amd64_test.go",
    "content": "//go:build go1.18 && !purego && amd64\n// +build go1.18,!purego,amd64\n\npackage rle\n\nimport \"testing\"\n\nfunc TestEncodeInt32IndexEqual8ContiguousAVX2(t *testing.T) {\n\ttestEncodeInt32IndexEqual8Contiguous(t, encodeInt32IndexEqual8ContiguousAVX2)\n}\n\nfunc TestEncodeInt32IndexEqual8ContiguousSSE(t *testing.T) {\n\ttestEncodeInt32IndexEqual8Contiguous(t, encodeInt32IndexEqual8ContiguousSSE)\n}\n\nfunc BenchmarkEncodeInt32IndexEqual8ContiguousAVX2(b *testing.B) {\n\tbenchmarkEncodeInt32IndexEqual8Contiguous(b, encodeInt32IndexEqual8ContiguousAVX2)\n}\n\nfunc BenchmarkEncodeInt32IndexEqual8ContiguousSSE(b *testing.B) {\n\tbenchmarkEncodeInt32IndexEqual8Contiguous(b, encodeInt32IndexEqual8ContiguousSSE)\n}\n"
  },
  {
    "path": "encoding/rle/rle_purego.go",
    "content": "//go:build purego || !amd64\n\npackage rle\n\nfunc encodeBytesBitpack(dst []byte, src []uint64, bitWidth uint) int {\n\treturn encodeBytesBitpackDefault(dst, src, bitWidth)\n}\n\nfunc encodeInt32IndexEqual8Contiguous(words [][8]int32) (n int) {\n\tfor n < len(words) && words[n] != broadcast8x4(words[n][0]) {\n\t\tn++\n\t}\n\treturn n\n}\n\nfunc encodeInt32Bitpack(dst []byte, src [][8]int32, bitWidth uint) int {\n\treturn encodeInt32BitpackDefault(dst, src, bitWidth)\n}\n\nfunc decodeBytesBitpack(dst, src []byte, count, bitWidth uint) {\n\tdecodeBytesBitpackDefault(dst, src, count, bitWidth)\n}\n"
  },
  {
    "path": "encoding/rle/rle_test.go",
    "content": "//go:build go1.18\n// +build go1.18\n\npackage rle\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/encoding/fuzz\"\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nfunc FuzzEncodeBoolean(f *testing.F) {\n\tfuzz.EncodeBoolean(f, &Encoding{BitWidth: 1})\n}\n\nfunc FuzzEncodeLevels(f *testing.F) {\n\tfuzz.EncodeLevels(f, &Encoding{BitWidth: 8})\n}\n\nfunc FuzzEncodeInt32(f *testing.F) {\n\tfuzz.EncodeInt32(f, &Encoding{BitWidth: 32})\n}\n\nfunc TestEncodeInt32IndexEqual8Contiguous(t *testing.T) {\n\ttestEncodeInt32IndexEqual8Contiguous(t, encodeInt32IndexEqual8Contiguous)\n}\n\nfunc testEncodeInt32IndexEqual8Contiguous(t *testing.T, f func([][8]int32) int) {\n\tt.Helper()\n\n\terr := quick.Check(func(words [][8]int32) bool {\n\t\twant := 0\n\n\t\tfor want < len(words) && words[want] != broadcast8x4(words[want][0]) {\n\t\t\twant++\n\t\t}\n\n\t\tif got := f(words); got != want {\n\t\t\tt.Errorf(\"want=%d got=%d\", want, got)\n\t\t\treturn false\n\t\t}\n\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc BenchmarkEncodeInt32IndexEqual8Contiguous(b *testing.B) {\n\tbenchmarkEncodeInt32IndexEqual8Contiguous(b, encodeInt32IndexEqual8Contiguous)\n}\n\nfunc benchmarkEncodeInt32IndexEqual8Contiguous(b *testing.B, f func([][8]int32) int) {\n\twords := make([][8]int32, 1000)\n\tfor i := range words {\n\t\twords[i][0] = 1\n\t}\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f(words)\n\t}\n\tb.SetBytes(32 * int64(len(words)))\n}\n"
  },
  {
    "path": "encoding/rle/testdata/fuzz/FuzzEncodeBoolean/6be5e340694798c2e5b94c758f0262edd2edf8af5795d4c6c60f6e02643bbb96",
    "content": "go test fuzz v1\n[]byte(\"0\\x00\\x00\")\nint64(93)\n"
  },
  {
    "path": "encoding/rle/testdata/fuzz/FuzzEncodeBoolean/9772b3f21a6f61810fe38d120bcc9da6d78540f22dc819a4201283608671fdf4",
    "content": "go test fuzz v1\n[]byte(\"00000001\")\nint64(0)\n"
  },
  {
    "path": "encoding/rle/testdata/fuzz/FuzzEncodeInt32/06ba4bdb19de593e669c642987e270fe2488d4d58ecd712db136a3e011071253",
    "content": "go test fuzz v1\n[]byte(\"0000\")\nint64(0)\n"
  },
  {
    "path": "encoding/rle/testdata/fuzz/FuzzEncodeLevels/0468684de48f926219bfc47be13ddf085b5a0ed9fbd9c40a005641b253e88d33",
    "content": "go test fuzz v1\n[]byte(\"\\xba\\xba\\xba\\xba0\\xba\\xba\\xba\\xba\\xba\\xba\")\nint64(0)\n"
  },
  {
    "path": "encoding/test/test_go17.go",
    "content": "//go:build !go1.17\n\npackage test\n"
  },
  {
    "path": "encoding/test/test_go18.go",
    "content": "//go:build go1.18\n\npackage test\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n)\n\nfunc EncodeInt32(t *testing.T, enc encoding.Encoding, min, max int, bitWidth uint) {\n\tt.Helper()\n\tencode(t, enc, min, max,\n\t\tencoding.Encoding.EncodeInt32,\n\t\tencoding.Encoding.DecodeInt32,\n\t\tfunc(i int) int32 {\n\t\t\tvalue := int32(i)\n\t\t\tmask := int32((1 << bitWidth) - 1)\n\t\t\tif (i % 2) != 0 {\n\t\t\t\tvalue = -value\n\t\t\t}\n\t\t\treturn value & mask\n\t\t},\n\t)\n}\n\nfunc EncodeInt64(t *testing.T, enc encoding.Encoding, min, max int, bitWidth uint) {\n\tt.Helper()\n\tencode(t, enc, min, max,\n\t\tencoding.Encoding.EncodeInt64,\n\t\tencoding.Encoding.DecodeInt64,\n\t\tfunc(i int) int64 {\n\t\t\tvalue := int64(i)\n\t\t\tmask := int64((1 << bitWidth) - 1)\n\t\t\tif (i % 2) != 0 {\n\t\t\t\tvalue = -value\n\t\t\t}\n\t\t\treturn value & mask\n\t\t},\n\t)\n}\n\nfunc EncodeFloat(t *testing.T, enc encoding.Encoding, min, max int) {\n\tt.Helper()\n\tencode(t, enc, min, max,\n\t\tencoding.Encoding.EncodeFloat,\n\t\tencoding.Encoding.DecodeFloat,\n\t\tfunc(i int) float32 { return float32(i) },\n\t)\n}\n\nfunc EncodeDouble(t *testing.T, enc encoding.Encoding, min, max int) {\n\tt.Helper()\n\tencode(t, enc, min, max,\n\t\tencoding.Encoding.EncodeDouble,\n\t\tencoding.Encoding.DecodeDouble,\n\t\tfunc(i int) float64 { return float64(i) },\n\t)\n}\n\ntype encodingFunc[T comparable] func(encoding.Encoding, []byte, []T) ([]byte, error)\n\ntype decodingFunc[T comparable] func(encoding.Encoding, []T, []byte) ([]T, error)\n\nfunc encode[T comparable](t *testing.T, enc encoding.Encoding, min, max int, encode encodingFunc[T], decode decodingFunc[T], valueOf func(int) T) {\n\tt.Helper()\n\n\tfor k := min; k <= max; k++ {\n\t\tt.Run(fmt.Sprintf(\"N=%d\", k), func(t *testing.T) {\n\t\t\tsrc := make([]T, k)\n\t\t\tfor i := range src {\n\t\t\t\tsrc[i] = valueOf(i)\n\t\t\t}\n\n\t\t\tbuf, err := encode(enc, nil, src)\n\t\t\tif 
err != nil {\n\t\t\t\tt.Fatalf(\"encoding %d values: %v\", k, err)\n\t\t\t}\n\n\t\t\tres, err := decode(enc, nil, buf)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatalf(\"decoding %d values: %v\", k, err)\n\t\t\t}\n\n\t\t\tif err := assertEqual(src, res); err != nil {\n\t\t\t\tt.Fatalf(\"testing %d values: %v\", k, err)\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc assertEqual[T comparable](want, got []T) error {\n\tif len(want) != len(got) {\n\t\treturn fmt.Errorf(\"number of values mismatch: want=%d got=%d\", len(want), len(got))\n\t}\n\n\tfor i := range want {\n\t\tif want[i] != got[i] {\n\t\t\treturn fmt.Errorf(\"values at index %d/%d mismatch: want=%+v got=%+v\", i, len(want), want[i], got[i])\n\t\t}\n\t}\n\n\treturn nil\n}\n"
  },
  {
    "path": "encoding/values.go",
    "content": "package encoding\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\ntype Kind int32\n\nconst (\n\tUndefined Kind = iota\n\tBoolean\n\tInt32\n\tInt64\n\tInt96\n\tFloat\n\tDouble\n\tByteArray\n\tFixedLenByteArray\n)\n\nfunc (kind Kind) String() string {\n\tswitch kind {\n\tcase Boolean:\n\t\treturn \"BOOLEAN\"\n\tcase Int32:\n\t\treturn \"INT32\"\n\tcase Int64:\n\t\treturn \"INT64\"\n\tcase Int96:\n\t\treturn \"INT96\"\n\tcase Float:\n\t\treturn \"FLOAT\"\n\tcase Double:\n\t\treturn \"DOUBLE\"\n\tcase ByteArray:\n\t\treturn \"BYTE_ARRAY\"\n\tcase FixedLenByteArray:\n\t\treturn \"FIXED_LEN_BYTE_ARRAY\"\n\tdefault:\n\t\treturn \"UNDEFINED\"\n\t}\n}\n\ntype Values struct {\n\tkind    Kind\n\tsize    int32\n\tdata    []byte\n\toffsets []uint32\n}\n\nfunc (v *Values) assertKind(kind Kind) {\n\tif kind != v.kind {\n\t\tpanic(fmt.Sprintf(\"cannot convert values of type %s to type %s\", v.kind, kind))\n\t}\n}\n\nfunc (v *Values) assertSize(size int) {\n\tif size != int(v.size) {\n\t\tpanic(fmt.Sprintf(\"cannot convert values of size %d to size %d\", v.size, size))\n\t}\n}\n\nfunc (v *Values) Size() int64 {\n\treturn int64(len(v.data))\n}\n\nfunc (v *Values) Kind() Kind {\n\treturn v.kind\n}\n\nfunc (v *Values) Data() (data []byte, offsets []uint32) {\n\treturn v.data, v.offsets\n}\n\nfunc (v *Values) Boolean() []byte {\n\tv.assertKind(Boolean)\n\treturn v.data\n}\n\nfunc (v *Values) Int32() []int32 {\n\tv.assertKind(Int32)\n\treturn unsafecast.BytesToInt32(v.data)\n}\n\nfunc (v *Values) Int64() []int64 {\n\tv.assertKind(Int64)\n\treturn unsafecast.BytesToInt64(v.data)\n}\n\nfunc (v *Values) Int96() []deprecated.Int96 {\n\tv.assertKind(Int96)\n\treturn deprecated.BytesToInt96(v.data)\n}\n\nfunc (v *Values) Float() []float32 {\n\tv.assertKind(Float)\n\treturn unsafecast.BytesToFloat32(v.data)\n}\n\nfunc (v *Values) Double() []float64 {\n\tv.assertKind(Double)\n\treturn 
unsafecast.BytesToFloat64(v.data)\n}\n\nfunc (v *Values) ByteArray() (data []byte, offsets []uint32) {\n\tv.assertKind(ByteArray)\n\treturn v.data, v.offsets\n}\n\nfunc (v *Values) FixedLenByteArray() (data []byte, size int) {\n\tv.assertKind(FixedLenByteArray)\n\treturn v.data, int(v.size)\n}\n\nfunc (v *Values) Uint32() []uint32 {\n\tv.assertKind(Int32)\n\treturn unsafecast.BytesToUint32(v.data)\n}\n\nfunc (v *Values) Uint64() []uint64 {\n\tv.assertKind(Int64)\n\treturn unsafecast.BytesToUint64(v.data)\n}\n\nfunc (v *Values) Uint128() [][16]byte {\n\tv.assertKind(FixedLenByteArray)\n\tv.assertSize(16)\n\treturn unsafecast.BytesToUint128(v.data)\n}\n\nfunc BooleanValues(values []byte) Values {\n\treturn Values{\n\t\tkind: Boolean,\n\t\tdata: values,\n\t}\n}\n\nfunc Int32Values(values []int32) Values {\n\treturn Values{\n\t\tkind: Int32,\n\t\tdata: unsafecast.Int32ToBytes(values),\n\t}\n}\n\nfunc Int64Values(values []int64) Values {\n\treturn Values{\n\t\tkind: Int64,\n\t\tdata: unsafecast.Int64ToBytes(values),\n\t}\n}\n\nfunc Int96Values(values []deprecated.Int96) Values {\n\treturn Values{\n\t\tkind: Int96,\n\t\tdata: deprecated.Int96ToBytes(values),\n\t}\n}\n\nfunc FloatValues(values []float32) Values {\n\treturn Values{\n\t\tkind: Float,\n\t\tdata: unsafecast.Float32ToBytes(values),\n\t}\n}\n\nfunc DoubleValues(values []float64) Values {\n\treturn Values{\n\t\tkind: Double,\n\t\tdata: unsafecast.Float64ToBytes(values),\n\t}\n}\n\nfunc ByteArrayValues(values []byte, offsets []uint32) Values {\n\treturn Values{\n\t\tkind:    ByteArray,\n\t\tdata:    values,\n\t\toffsets: offsets,\n\t}\n}\n\nfunc FixedLenByteArrayValues(values []byte, size int) Values {\n\treturn Values{\n\t\tkind: FixedLenByteArray,\n\t\tsize: int32(size),\n\t\tdata: values,\n\t}\n}\n\nfunc Uint32Values(values []uint32) Values {\n\treturn Int32Values(unsafecast.Uint32ToInt32(values))\n}\n\nfunc Uint64Values(values []uint64) Values {\n\treturn 
Int64Values(unsafecast.Uint64ToInt64(values))\n}\n\nfunc Uint128Values(values [][16]byte) Values {\n\treturn FixedLenByteArrayValues(unsafecast.Uint128ToBytes(values), 16)\n}\n\nfunc Int32ValuesFromBytes(values []byte) Values {\n\treturn Values{\n\t\tkind: Int32,\n\t\tdata: values,\n\t}\n}\n\nfunc Int64ValuesFromBytes(values []byte) Values {\n\treturn Values{\n\t\tkind: Int64,\n\t\tdata: values,\n\t}\n}\n\nfunc Int96ValuesFromBytes(values []byte) Values {\n\treturn Values{\n\t\tkind: Int96,\n\t\tdata: values,\n\t}\n}\n\nfunc FloatValuesFromBytes(values []byte) Values {\n\treturn Values{\n\t\tkind: Float,\n\t\tdata: values,\n\t}\n}\n\nfunc DoubleValuesFromBytes(values []byte) Values {\n\treturn Values{\n\t\tkind: Double,\n\t\tdata: values,\n\t}\n}\n\nfunc EncodeBoolean(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\treturn enc.EncodeBoolean(dst, src.Boolean())\n}\n\nfunc EncodeInt32(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\treturn enc.EncodeInt32(dst, src.Int32())\n}\n\nfunc EncodeInt64(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\treturn enc.EncodeInt64(dst, src.Int64())\n}\n\nfunc EncodeInt96(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\treturn enc.EncodeInt96(dst, src.Int96())\n}\n\nfunc EncodeFloat(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\treturn enc.EncodeFloat(dst, src.Float())\n}\n\nfunc EncodeDouble(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\treturn enc.EncodeDouble(dst, src.Double())\n}\n\nfunc EncodeByteArray(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\tvalues, offsets := src.ByteArray()\n\treturn enc.EncodeByteArray(dst, values, offsets)\n}\n\nfunc EncodeFixedLenByteArray(dst []byte, src Values, enc Encoding) ([]byte, error) {\n\tdata, size := src.FixedLenByteArray()\n\treturn enc.EncodeFixedLenByteArray(dst, data, size)\n}\n\nfunc DecodeBoolean(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, err := enc.DecodeBoolean(dst.Boolean(), 
src)\n\treturn BooleanValues(values), err\n}\n\nfunc DecodeInt32(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, err := enc.DecodeInt32(dst.Int32(), src)\n\treturn Int32Values(values), err\n}\n\nfunc DecodeInt64(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, err := enc.DecodeInt64(dst.Int64(), src)\n\treturn Int64Values(values), err\n}\n\nfunc DecodeInt96(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, err := enc.DecodeInt96(dst.Int96(), src)\n\treturn Int96Values(values), err\n}\n\nfunc DecodeFloat(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, err := enc.DecodeFloat(dst.Float(), src)\n\treturn FloatValues(values), err\n}\n\nfunc DecodeDouble(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, err := enc.DecodeDouble(dst.Double(), src)\n\treturn DoubleValues(values), err\n}\n\nfunc DecodeByteArray(dst Values, src []byte, enc Encoding) (Values, error) {\n\tvalues, offsets := dst.ByteArray()\n\tvalues, offsets, err := enc.DecodeByteArray(values, src, offsets)\n\treturn ByteArrayValues(values, offsets), err\n}\n\nfunc DecodeFixedLenByteArray(dst Values, src []byte, enc Encoding) (Values, error) {\n\tdata, size := dst.FixedLenByteArray()\n\tvalues, err := enc.DecodeFixedLenByteArray(data, src, size)\n\treturn FixedLenByteArrayValues(values, size), err\n}\n"
  },
  {
    "path": "encoding/values_test.go",
    "content": "package encoding_test\n\nimport (\n\t\"testing\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n)\n\nfunc TestValuesSize(t *testing.T) {\n\tt.Log(unsafe.Sizeof(encoding.Values{}))\n}\n"
  },
  {
    "path": "encoding.go",
    "content": "package parquet\n\nimport (\n\t\"math/bits\"\n\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/encoding/bitpacked\"\n\t\"github.com/segmentio/parquet-go/encoding/bytestreamsplit\"\n\t\"github.com/segmentio/parquet-go/encoding/delta\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/encoding/rle\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nvar (\n\t// Plain is the default parquet encoding.\n\tPlain plain.Encoding\n\n\t// RLE is the hybrid bit-pack/run-length parquet encoding.\n\tRLE rle.Encoding\n\n\t// BitPacked is the deprecated bit-packed encoding for repetition and\n\t// definition levels.\n\tBitPacked bitpacked.Encoding\n\n\t// PlainDictionary is the plain dictionary parquet encoding.\n\t//\n\t// This encoding should not be used anymore in parquet 2.0 and later,\n\t// it is implemented for backwards compatibility to support reading\n\t// files that were encoded with older parquet libraries.\n\tPlainDictionary plain.DictionaryEncoding\n\n\t// RLEDictionary is the RLE dictionary parquet encoding.\n\tRLEDictionary rle.DictionaryEncoding\n\n\t// DeltaBinaryPacked is the delta binary packed parquet encoding.\n\tDeltaBinaryPacked delta.BinaryPackedEncoding\n\n\t// DeltaLengthByteArray is the delta length byte array parquet encoding.\n\tDeltaLengthByteArray delta.LengthByteArrayEncoding\n\n\t// DeltaByteArray is the delta byte array parquet encoding.\n\tDeltaByteArray delta.ByteArrayEncoding\n\n\t// ByteStreamSplit is an encoding for floating-point data.\n\tByteStreamSplit bytestreamsplit.Encoding\n\n\t// Table indexing the encodings supported by this package.\n\tencodings = [...]encoding.Encoding{\n\t\tformat.Plain:                &Plain,\n\t\tformat.PlainDictionary:      &PlainDictionary,\n\t\tformat.BitPacked:            &BitPacked,\n\t\tformat.RLE:                  &RLE,\n\t\tformat.RLEDictionary:        &RLEDictionary,\n\t\tformat.DeltaBinaryPacked:    
&DeltaBinaryPacked,\n\t\tformat.DeltaLengthByteArray: &DeltaLengthByteArray,\n\t\tformat.DeltaByteArray:       &DeltaByteArray,\n\t\tformat.ByteStreamSplit:      &ByteStreamSplit,\n\t}\n\n\t// Table indexing RLE encodings for repetition and definition levels of\n\t// all supported bit widths.\n\tlevelEncodingsRLE = [...]rle.Encoding{\n\t\t0: {BitWidth: 1},\n\t\t1: {BitWidth: 2},\n\t\t2: {BitWidth: 3},\n\t\t3: {BitWidth: 4},\n\t\t4: {BitWidth: 5},\n\t\t5: {BitWidth: 6},\n\t\t6: {BitWidth: 7},\n\t\t7: {BitWidth: 8},\n\t}\n\n\tlevelEncodingsBitPacked = [...]bitpacked.Encoding{\n\t\t0: {BitWidth: 1},\n\t\t1: {BitWidth: 2},\n\t\t2: {BitWidth: 3},\n\t\t3: {BitWidth: 4},\n\t\t4: {BitWidth: 5},\n\t\t5: {BitWidth: 6},\n\t\t6: {BitWidth: 7},\n\t\t7: {BitWidth: 8},\n\t}\n)\n\nfunc isDictionaryEncoding(encoding encoding.Encoding) bool {\n\treturn isDictionaryFormat(encoding.Encoding())\n}\n\nfunc isDictionaryFormat(encoding format.Encoding) bool {\n\treturn encoding == format.PlainDictionary || encoding == format.RLEDictionary\n}\n\n// LookupEncoding returns the parquet encoding associated with the given code.\n//\n// The function never returns nil. 
If the encoding is not supported,\n// encoding.NotSupported is returned.\nfunc LookupEncoding(enc format.Encoding) encoding.Encoding {\n\tif enc >= 0 && int(enc) < len(encodings) {\n\t\tif e := encodings[enc]; e != nil {\n\t\t\treturn e\n\t\t}\n\t}\n\treturn encoding.NotSupported{}\n}\n\nfunc lookupLevelEncoding(enc format.Encoding, max byte) encoding.Encoding {\n\ti := bits.Len8(max) - 1\n\tswitch enc {\n\tcase format.RLE:\n\t\treturn &levelEncodingsRLE[i]\n\tcase format.BitPacked:\n\t\treturn &levelEncodingsBitPacked[i]\n\tdefault:\n\t\treturn encoding.NotSupported{}\n\t}\n}\n\nfunc canEncode(e encoding.Encoding, k Kind) bool {\n\tif isDictionaryEncoding(e) {\n\t\treturn true\n\t}\n\tswitch k {\n\tcase Boolean:\n\t\treturn encoding.CanEncodeBoolean(e)\n\tcase Int32:\n\t\treturn encoding.CanEncodeInt32(e)\n\tcase Int64:\n\t\treturn encoding.CanEncodeInt64(e)\n\tcase Int96:\n\t\treturn encoding.CanEncodeInt96(e)\n\tcase Float:\n\t\treturn encoding.CanEncodeFloat(e)\n\tcase Double:\n\t\treturn encoding.CanEncodeDouble(e)\n\tcase ByteArray:\n\t\treturn encoding.CanEncodeByteArray(e)\n\tcase FixedLenByteArray:\n\t\treturn encoding.CanEncodeFixedLenByteArray(e)\n\tdefault:\n\t\treturn false\n\t}\n}\n"
  },
  {
    "path": "errors.go",
    "content": "package parquet\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n)\n\nvar (\n\t// ErrCorrupted is an error returned by the Err method of ColumnPages\n\t// instances when they encountered a mismatch between the CRC checksum\n\t// recorded in a page header and the one computed while reading the page\n\t// data.\n\tErrCorrupted = errors.New(\"corrupted parquet page\")\n\n\t// ErrMissingRootColumn is an error returned when opening an invalid parquet\n\t// file which does not have a root column.\n\tErrMissingRootColumn = errors.New(\"parquet file is missing a root column\")\n\n\t// ErrRowGroupSchemaMissing is an error returned when attempting to write a\n\t// row group but the source has no schema.\n\tErrRowGroupSchemaMissing = errors.New(\"cannot write rows to a row group which has no schema\")\n\n\t// ErrRowGroupSchemaMismatch is an error returned when attempting to write a\n\t// row group but the source and destination schemas differ.\n\tErrRowGroupSchemaMismatch = errors.New(\"cannot write row groups with mismatching schemas\")\n\n\t// ErrRowGroupSortingColumnsMismatch is an error returned when attempting to\n\t// write a row group but the sorting columns differ in the source and\n\t// destination.\n\tErrRowGroupSortingColumnsMismatch = errors.New(\"cannot write row groups with mismatching sorting columns\")\n\n\t// ErrSeekOutOfRange is an error returned when seeking to a row index which\n\t// is less than the first row of a page.\n\tErrSeekOutOfRange = errors.New(\"seek to row index out of page range\")\n\n\t// ErrUnexpectedDictionaryPage is an error returned when a page reader\n\t// encounters a dictionary page after the first page, or in a column\n\t// which does not use a dictionary encoding.\n\tErrUnexpectedDictionaryPage = errors.New(\"unexpected dictionary page\")\n\n\t// ErrMissingPageHeader is an error returned when a page reader encounters\n\t// a malformed page header which is missing page-type-specific information.\n\tErrMissingPageHeader = 
errors.New(\"missing page header\")\n\n\t// ErrUnexpectedRepetitionLevels is an error returned when attempting to\n\t// decode repetition levels into a page which is not part of a repeated\n\t// column.\n\tErrUnexpectedRepetitionLevels = errors.New(\"unexpected repetition levels\")\n\n\t// ErrUnexpectedDefinitionLevels is an error returned when attempting to\n\t// decode definition levels into a page which is part of a required column.\n\tErrUnexpectedDefinitionLevels = errors.New(\"unexpected definition levels\")\n\n\t// ErrTooManyRowGroups is returned when attempting to generate a parquet\n\t// file with more than MaxRowGroups row groups.\n\tErrTooManyRowGroups = errors.New(\"the limit of 32767 row groups has been reached\")\n\n\t// ErrInvalidConversion is used to indicate that a conversion between two\n\t// values cannot be done because there are no rules to translate between\n\t// their physical types.\n\tErrInvalidConversion = errors.New(\"invalid conversion between parquet values\")\n)\n\ntype errno int\n\nconst (\n\tok errno = iota\n\tindexOutOfBounds\n)\n\nfunc (e errno) check() {\n\tswitch e {\n\tcase ok:\n\tcase indexOutOfBounds:\n\t\tpanic(\"index out of bounds\")\n\tdefault:\n\t\tpanic(\"BUG: unknown error code\")\n\t}\n}\n\nfunc errRowIndexOutOfBounds(rowIndex, rowCount int64) error {\n\treturn fmt.Errorf(\"row index out of bounds: %d/%d\", rowIndex, rowCount)\n}\n"
  },
  {
    "path": "example_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"fmt\"\n\t\"io\"\n\t\"io/ioutil\"\n\t\"log\"\n\t\"os\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc Example() {\n\t// parquet-go uses the same struct-tag definition style as JSON and XML\n\ttype Contact struct {\n\t\tName string `parquet:\"name\"`\n\t\t// \"zstd\" specifies the compression for this column\n\t\tPhoneNumber string `parquet:\"phoneNumber,optional,zstd\"`\n\t}\n\n\ttype AddressBook struct {\n\t\tOwner             string    `parquet:\"owner,zstd\"`\n\t\tOwnerPhoneNumbers []string  `parquet:\"ownerPhoneNumbers,gzip\"`\n\t\tContacts          []Contact `parquet:\"contacts\"`\n\t}\n\n\tf, _ := ioutil.TempFile(\"\", \"parquet-example-\")\n\twriter := parquet.NewWriter(f)\n\trows := []AddressBook{\n\t\t{Owner: \"UserA\", Contacts: []Contact{\n\t\t\t{Name: \"Alice\", PhoneNumber: \"+15505551234\"},\n\t\t\t{Name: \"Bob\"},\n\t\t}},\n\t\t// Add more rows here.\n\t}\n\tfor _, row := range rows {\n\t\tif err := writer.Write(row); err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t}\n\t_ = writer.Close()\n\t_ = f.Close()\n\n\t// Now, we can read from the file.\n\trf, _ := os.Open(f.Name())\n\tpf := parquet.NewReader(rf)\n\taddrs := make([]AddressBook, 0)\n\tfor {\n\t\tvar addr AddressBook\n\t\terr := pf.Read(&addr)\n\t\tif err == io.EOF {\n\t\t\tbreak\n\t\t}\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t\taddrs = append(addrs, addr)\n\t}\n\tfmt.Println(addrs[0].Owner)\n\t// Output: UserA\n}\n"
  },
  {
    "path": "file.go",
    "content": "package parquet\n\nimport (\n\t\"bufio\"\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"hash/crc32\"\n\t\"io\"\n\t\"sort\"\n\t\"strings\"\n\t\"sync\"\n\n\t\"github.com/segmentio/encoding/thrift\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nconst (\n\tdefaultDictBufferSize = 8192\n\tdefaultReadBufferSize = 4096\n)\n\n// File represents a parquet file. The layout of a Parquet file can be found\n// here: https://github.com/apache/parquet-format#file-format\ntype File struct {\n\tmetadata      format.FileMetaData\n\tprotocol      thrift.CompactProtocol\n\treader        io.ReaderAt\n\tsize          int64\n\tschema        *Schema\n\troot          *Column\n\tcolumnIndexes []format.ColumnIndex\n\toffsetIndexes []format.OffsetIndex\n\trowGroups     []RowGroup\n\tconfig        *FileConfig\n}\n\n// OpenFile opens a parquet file and reads the content between offset 0 and the given\n// size in r.\n//\n// Only the parquet magic bytes and footer are read, column chunks and other\n// parts of the file are left untouched; this means that successfully opening\n// a file does not validate that the pages have valid checksums.\nfunc OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) {\n\tb := make([]byte, 8)\n\tc, err := NewFileConfig(options...)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\tf := &File{reader: r, size: size, config: c}\n\n\tif _, err := r.ReadAt(b[:4], 0); err != nil {\n\t\treturn nil, fmt.Errorf(\"reading magic header of parquet file: %w\", err)\n\t}\n\tif string(b[:4]) != \"PAR1\" {\n\t\treturn nil, fmt.Errorf(\"invalid magic header of parquet file: %q\", b[:4])\n\t}\n\n\tif cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok {\n\t\tcast.SetMagicFooterSection(size-8, 8)\n\t}\n\tif n, err := r.ReadAt(b[:8], size-8); n != 8 {\n\t\treturn nil, fmt.Errorf(\"reading magic footer of parquet file: %w\", err)\n\t}\n\tif string(b[4:8]) != \"PAR1\" {\n\t\treturn nil, fmt.Errorf(\"invalid magic footer of 
parquet file: %q\", b[4:8])\n\t}\n\n\tfooterSize := int64(binary.LittleEndian.Uint32(b[:4]))\n\tfooterData := make([]byte, footerSize)\n\n\tif cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok {\n\t\tcast.SetFooterSection(size-(footerSize+8), footerSize)\n\t}\n\tif _, err := f.reader.ReadAt(footerData, size-(footerSize+8)); err != nil {\n\t\treturn nil, fmt.Errorf(\"reading footer of parquet file: %w\", err)\n\t}\n\tif err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil {\n\t\treturn nil, fmt.Errorf(\"reading parquet file metadata: %w\", err)\n\t}\n\tif len(f.metadata.Schema) == 0 {\n\t\treturn nil, ErrMissingRootColumn\n\t}\n\n\tif !c.SkipPageIndex {\n\t\tif f.columnIndexes, f.offsetIndexes, err = f.ReadPageIndex(); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"reading page index of parquet file: %w\", err)\n\t\t}\n\t}\n\n\tif f.root, err = openColumns(f); err != nil {\n\t\treturn nil, fmt.Errorf(\"opening columns of parquet file: %w\", err)\n\t}\n\n\tvar schema *Schema\n\tif c.Schema != nil {\n\t\tschema = c.Schema\n\t} else {\n\t\tschema = NewSchema(f.root.Name(), f.root)\n\t}\n\tcolumns := make([]*Column, 0, numLeafColumnsOf(f.root))\n\tf.schema = schema\n\tf.root.forEachLeaf(func(c *Column) { columns = append(columns, c) })\n\n\trowGroups := make([]fileRowGroup, len(f.metadata.RowGroups))\n\tfor i := range rowGroups {\n\t\trowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i])\n\t}\n\tf.rowGroups = make([]RowGroup, len(rowGroups))\n\tfor i := range rowGroups {\n\t\tf.rowGroups[i] = &rowGroups[i]\n\t}\n\n\tif !c.SkipBloomFilters {\n\t\tsection := io.NewSectionReader(r, 0, size)\n\t\trbuf, rbufpool := getBufioReader(section, c.ReadBufferSize)\n\t\tdefer putBufioReader(rbuf, rbufpool)\n\n\t\theader := format.BloomFilterHeader{}\n\t\tcompact := thrift.CompactProtocol{}\n\t\tdecoder := thrift.NewDecoder(compact.NewReader(rbuf))\n\n\t\tfor i := range rowGroups {\n\t\t\tg := &rowGroups[i]\n\n\t\t\tfor j := 
range g.columns {\n\t\t\t\tc := g.columns[j].(*fileColumnChunk)\n\n\t\t\t\tif offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 {\n\t\t\t\t\tsection.Seek(offset, io.SeekStart)\n\t\t\t\t\trbuf.Reset(section)\n\n\t\t\t\t\theader = format.BloomFilterHeader{}\n\t\t\t\t\tif err := decoder.Decode(&header); err != nil {\n\t\t\t\t\t\treturn nil, fmt.Errorf(\"decoding bloom filter header: %w\", err)\n\t\t\t\t\t}\n\n\t\t\t\t\toffset, _ = section.Seek(0, io.SeekCurrent)\n\t\t\t\t\toffset -= int64(rbuf.Buffered())\n\n\t\t\t\t\tif cast, ok := r.(interface{ SetBloomFilterSection(offset, length int64) }); ok {\n\t\t\t\t\t\tbloomFilterOffset := c.chunk.MetaData.BloomFilterOffset\n\t\t\t\t\t\tbloomFilterLength := (offset - bloomFilterOffset) + int64(header.NumBytes)\n\t\t\t\t\t\tcast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength)\n\t\t\t\t\t}\n\n\t\t\t\t\tc.bloomFilter = newBloomFilter(r, offset, &header)\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tsortKeyValueMetadata(f.metadata.KeyValueMetadata)\n\treturn f, nil\n}\n\n// ReadPageIndex reads the page index section of the parquet file f.\n//\n// If the file did not contain a page index, the method returns two empty slices\n// and a nil error.\n//\n// Only leaf columns have indexes, the returned indexes are arranged using the\n// following layout:\n//\n//\t------------------\n//\t| col 0: chunk 0 |\n//\t------------------\n//\t| col 1: chunk 0 |\n//\t------------------\n//\t| ...            |\n//\t------------------\n//\t| col 0: chunk 1 |\n//\t------------------\n//\t| col 1: chunk 1 |\n//\t------------------\n//\t| ...            |\n//\t------------------\n//\n// This method is useful in combination with the SkipPageIndex option to delay\n// reading the page index section until after the file was opened. 
Note that in\n// this case the page index is not cached within the file, programs are expected\n// to make use of it independently from the parquet package.\nfunc (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) {\n\tif len(f.metadata.RowGroups) == 0 {\n\t\treturn nil, nil, nil\n\t}\n\n\tcolumnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset\n\toffsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset\n\tcolumnIndexLength := int64(0)\n\toffsetIndexLength := int64(0)\n\n\tforEachColumnChunk := func(do func(int, int, *format.ColumnChunk) error) error {\n\t\tfor i := range f.metadata.RowGroups {\n\t\t\tfor j := range f.metadata.RowGroups[i].Columns {\n\t\t\t\tc := &f.metadata.RowGroups[i].Columns[j]\n\t\t\t\tif err := do(i, j, c); err != nil {\n\t\t\t\t\treturn err\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn nil\n\t}\n\n\tforEachColumnChunk(func(_, _ int, c *format.ColumnChunk) error {\n\t\tcolumnIndexLength += int64(c.ColumnIndexLength)\n\t\toffsetIndexLength += int64(c.OffsetIndexLength)\n\t\treturn nil\n\t})\n\n\tif columnIndexLength == 0 && offsetIndexLength == 0 {\n\t\treturn nil, nil, nil\n\t}\n\n\tnumRowGroups := len(f.metadata.RowGroups)\n\tnumColumns := len(f.metadata.RowGroups[0].Columns)\n\tnumColumnChunks := numRowGroups * numColumns\n\n\tcolumnIndexes := make([]format.ColumnIndex, numColumnChunks)\n\toffsetIndexes := make([]format.OffsetIndex, numColumnChunks)\n\tindexBuffer := make([]byte, max(int(columnIndexLength), int(offsetIndexLength)))\n\n\tif columnIndexOffset > 0 {\n\t\tcolumnIndexData := indexBuffer[:columnIndexLength]\n\n\t\tif cast, ok := f.reader.(interface{ SetColumnIndexSection(offset, length int64) }); ok {\n\t\t\tcast.SetColumnIndexSection(columnIndexOffset, columnIndexLength)\n\t\t}\n\t\tif _, err := f.reader.ReadAt(columnIndexData, columnIndexOffset); err != nil {\n\t\t\treturn nil, nil, fmt.Errorf(\"reading %d bytes column index at offset %d: %w\", columnIndexLength, 
columnIndexOffset, err)\n\t\t}\n\n\t\terr := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {\n\t\t\t// Some parquet files are missing the column index on some columns.\n\t\t\t//\n\t\t\t// An example of this file is testdata/alltypes_tiny_pages_plain.parquet\n\t\t\t// which was added in https://github.com/apache/parquet-testing/pull/24.\n\t\t\tif c.ColumnIndexOffset > 0 {\n\t\t\t\toffset := c.ColumnIndexOffset - columnIndexOffset\n\t\t\t\tlength := int64(c.ColumnIndexLength)\n\t\t\t\tbuffer := columnIndexData[offset : offset+length]\n\t\t\t\tif err := thrift.Unmarshal(&f.protocol, buffer, &columnIndexes[(i*numColumns)+j]); err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"decoding column index: rowGroup=%d columnChunk=%d/%d: %w\", i, j, numColumns, err)\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn nil\n\t\t})\n\t\tif err != nil {\n\t\t\treturn nil, nil, err\n\t\t}\n\t}\n\n\tif offsetIndexOffset > 0 {\n\t\toffsetIndexData := indexBuffer[:offsetIndexLength]\n\n\t\tif cast, ok := f.reader.(interface{ SetOffsetIndexSection(offset, length int64) }); ok {\n\t\t\tcast.SetOffsetIndexSection(offsetIndexOffset, offsetIndexLength)\n\t\t}\n\t\tif _, err := f.reader.ReadAt(offsetIndexData, offsetIndexOffset); err != nil {\n\t\t\treturn nil, nil, fmt.Errorf(\"reading %d bytes offset index at offset %d: %w\", offsetIndexLength, offsetIndexOffset, err)\n\t\t}\n\n\t\terr := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {\n\t\t\tif c.OffsetIndexOffset > 0 {\n\t\t\t\toffset := c.OffsetIndexOffset - offsetIndexOffset\n\t\t\t\tlength := int64(c.OffsetIndexLength)\n\t\t\t\tbuffer := offsetIndexData[offset : offset+length]\n\t\t\t\tif err := thrift.Unmarshal(&f.protocol, buffer, &offsetIndexes[(i*numColumns)+j]); err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"decoding column index: rowGroup=%d columnChunk=%d/%d: %w\", i, j, numColumns, err)\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn nil\n\t\t})\n\t\tif err != nil {\n\t\t\treturn nil, nil, err\n\t\t}\n\t}\n\n\treturn columnIndexes, 
offsetIndexes, nil\n}\n\n// NumRows returns the number of rows in the file.\nfunc (f *File) NumRows() int64 { return f.metadata.NumRows }\n\n// RowGroups returns the list of row groups in the file.\nfunc (f *File) RowGroups() []RowGroup { return f.rowGroups }\n\n// Root returns the root column of f.\nfunc (f *File) Root() *Column { return f.root }\n\n// Schema returns the schema of f.\nfunc (f *File) Schema() *Schema { return f.schema }\n\n// Metadata returns the metadata of f.\nfunc (f *File) Metadata() *format.FileMetaData { return &f.metadata }\n\n// Size returns the size of f (in bytes).\nfunc (f *File) Size() int64 { return f.size }\n\n// ReadAt reads bytes into b from f at the given offset.\n//\n// The method satisfies the io.ReaderAt interface.\nfunc (f *File) ReadAt(b []byte, off int64) (int, error) {\n\tif off < 0 || off >= f.size {\n\t\treturn 0, io.EOF\n\t}\n\n\tif limit := f.size - off; limit < int64(len(b)) {\n\t\tn, err := f.reader.ReadAt(b[:limit], off)\n\t\tif err == nil {\n\t\t\terr = io.EOF\n\t\t}\n\t\treturn n, err\n\t}\n\n\treturn f.reader.ReadAt(b, off)\n}\n\n// ColumnIndexes returns the page index of the parquet file f.\n//\n// If the file did not contain a column index, the method returns an empty slice\n// and nil error.\nfunc (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes }\n\n// OffsetIndexes returns the page index of the parquet file f.\n//\n// If the file did not contain an offset index, the method returns an empty\n// slice and nil error.\nfunc (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes }\n\n// Lookup returns the value associated with the given key in the file key/value\n// metadata.\n//\n// The ok boolean will be true if the key was found, false otherwise.\nfunc (f *File) Lookup(key string) (value string, ok bool) {\n\treturn lookupKeyValueMetadata(f.metadata.KeyValueMetadata, key)\n}\n\nfunc (f *File) hasIndexes() bool {\n\treturn f.columnIndexes != nil && f.offsetIndexes != 
nil\n}\n\nvar _ io.ReaderAt = (*File)(nil)\n\nfunc sortKeyValueMetadata(keyValueMetadata []format.KeyValue) {\n\tsort.Slice(keyValueMetadata, func(i, j int) bool {\n\t\tswitch {\n\t\tcase keyValueMetadata[i].Key < keyValueMetadata[j].Key:\n\t\t\treturn true\n\t\tcase keyValueMetadata[i].Key > keyValueMetadata[j].Key:\n\t\t\treturn false\n\t\tdefault:\n\t\t\treturn keyValueMetadata[i].Value < keyValueMetadata[j].Value\n\t\t}\n\t})\n}\n\nfunc lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) {\n\ti := sort.Search(len(keyValueMetadata), func(i int) bool {\n\t\treturn keyValueMetadata[i].Key >= key\n\t})\n\tif i == len(keyValueMetadata) || keyValueMetadata[i].Key != key {\n\t\treturn \"\", false\n\t}\n\treturn keyValueMetadata[i].Value, true\n}\n\ntype fileRowGroup struct {\n\tschema   *Schema\n\trowGroup *format.RowGroup\n\tcolumns  []ColumnChunk\n\tsorting  []SortingColumn\n\tconfig   *FileConfig\n}\n\nfunc (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) {\n\tg.schema = schema\n\tg.rowGroup = rowGroup\n\tg.config = file.config\n\tg.columns = make([]ColumnChunk, len(rowGroup.Columns))\n\tg.sorting = make([]SortingColumn, len(rowGroup.SortingColumns))\n\tfileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns))\n\n\tfor i := range g.columns {\n\t\tfileColumnChunks[i] = fileColumnChunk{\n\t\t\tfile:     file,\n\t\t\tcolumn:   columns[i],\n\t\t\trowGroup: rowGroup,\n\t\t\tchunk:    &rowGroup.Columns[i],\n\t\t}\n\n\t\tif file.hasIndexes() {\n\t\t\tj := (int(rowGroup.Ordinal) * len(columns)) + i\n\t\t\tfileColumnChunks[i].columnIndex = &file.columnIndexes[j]\n\t\t\tfileColumnChunks[i].offsetIndex = &file.offsetIndexes[j]\n\t\t}\n\n\t\tg.columns[i] = &fileColumnChunks[i]\n\t}\n\n\tfor i := range g.sorting {\n\t\tg.sorting[i] = &fileSortingColumn{\n\t\t\tcolumn:     columns[rowGroup.SortingColumns[i].ColumnIdx],\n\t\t\tdescending: 
rowGroup.SortingColumns[i].Descending,\n\t\t\tnullsFirst: rowGroup.SortingColumns[i].NullsFirst,\n\t\t}\n\t}\n}\n\nfunc (g *fileRowGroup) Schema() *Schema                 { return g.schema }\nfunc (g *fileRowGroup) NumRows() int64                  { return g.rowGroup.NumRows }\nfunc (g *fileRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }\nfunc (g *fileRowGroup) SortingColumns() []SortingColumn { return g.sorting }\nfunc (g *fileRowGroup) Rows() Rows                      { return newRowGroupRows(g, g.config.ReadMode) }\n\ntype fileSortingColumn struct {\n\tcolumn     *Column\n\tdescending bool\n\tnullsFirst bool\n}\n\nfunc (s *fileSortingColumn) Path() []string   { return s.column.Path() }\nfunc (s *fileSortingColumn) Descending() bool { return s.descending }\nfunc (s *fileSortingColumn) NullsFirst() bool { return s.nullsFirst }\nfunc (s *fileSortingColumn) String() string {\n\tb := new(strings.Builder)\n\tif s.nullsFirst {\n\t\tb.WriteString(\"nulls_first+\")\n\t}\n\tif s.descending {\n\t\tb.WriteString(\"descending(\")\n\t} else {\n\t\tb.WriteString(\"ascending(\")\n\t}\n\tb.WriteString(columnPath(s.Path()).String())\n\tb.WriteString(\")\")\n\treturn b.String()\n}\n\ntype fileColumnChunk struct {\n\tfile        *File\n\tcolumn      *Column\n\tbloomFilter *bloomFilter\n\trowGroup    *format.RowGroup\n\tcolumnIndex *format.ColumnIndex\n\toffsetIndex *format.OffsetIndex\n\tchunk       *format.ColumnChunk\n}\n\nfunc (c *fileColumnChunk) Type() Type {\n\treturn c.column.Type()\n}\n\nfunc (c *fileColumnChunk) Column() int {\n\treturn int(c.column.Index())\n}\n\nfunc (c *fileColumnChunk) Pages() Pages {\n\tr := new(filePages)\n\tr.init(c)\n\treturn r\n}\n\nfunc (c *fileColumnChunk) ColumnIndex() ColumnIndex {\n\tif c.columnIndex == nil {\n\t\treturn nil\n\t}\n\treturn fileColumnIndex{c}\n}\n\nfunc (c *fileColumnChunk) OffsetIndex() OffsetIndex {\n\tif c.offsetIndex == nil {\n\t\treturn nil\n\t}\n\treturn (*fileOffsetIndex)(c.offsetIndex)\n}\n\nfunc (c 
*fileColumnChunk) BloomFilter() BloomFilter {\n\tif c.bloomFilter == nil {\n\t\treturn nil\n\t}\n\treturn c.bloomFilter\n}\n\nfunc (c *fileColumnChunk) NumValues() int64 {\n\treturn c.chunk.MetaData.NumValues\n}\n\ntype filePages struct {\n\tchunk    *fileColumnChunk\n\trbuf     *bufio.Reader\n\trbufpool *sync.Pool\n\tsection  io.SectionReader\n\n\tprotocol thrift.CompactProtocol\n\tdecoder  thrift.Decoder\n\n\tbaseOffset int64\n\tdataOffset int64\n\tdictOffset int64\n\tindex      int\n\tskip       int64\n\tdictionary Dictionary\n\n\tbufferSize int\n}\n\nfunc (f *filePages) init(c *fileColumnChunk) {\n\tf.chunk = c\n\tf.baseOffset = c.chunk.MetaData.DataPageOffset\n\tf.dataOffset = f.baseOffset\n\tf.bufferSize = c.file.config.ReadBufferSize\n\n\tif c.chunk.MetaData.DictionaryPageOffset != 0 {\n\t\tf.baseOffset = c.chunk.MetaData.DictionaryPageOffset\n\t\tf.dictOffset = f.baseOffset\n\t}\n\n\tf.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize)\n\tf.rbuf, f.rbufpool = getBufioReader(&f.section, f.bufferSize)\n\tf.decoder.Reset(f.protocol.NewReader(f.rbuf))\n}\n\nfunc (f *filePages) ReadPage() (Page, error) {\n\tif f.chunk == nil {\n\t\treturn nil, io.EOF\n\t}\n\n\theader := getPageHeader()\n\tdefer putPageHeader(header)\n\n\tfor {\n\t\tif err := f.decoder.Decode(header); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\tdata, err := f.readPage(header, f.rbuf)\n\t\tif err != nil {\n\t\t\treturn nil, err\n\t\t}\n\n\t\tvar page Page\n\t\tswitch header.Type {\n\t\tcase format.DataPageV2:\n\t\t\tpage, err = f.readDataPageV2(header, data)\n\t\tcase format.DataPage:\n\t\t\tpage, err = f.readDataPageV1(header, data)\n\t\tcase format.DictionaryPage:\n\t\t\t// Sometimes parquet files do not have the dictionary page offset\n\t\t\t// recorded in the column metadata. 
We account for this by lazily\n\t\t\t// reading dictionary pages when we encounter them.\n\t\t\terr = f.readDictionaryPage(header, data)\n\t\tdefault:\n\t\t\terr = fmt.Errorf(\"cannot read values of type %s from page\", header.Type)\n\t\t}\n\n\t\tdata.unref()\n\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"decoding page %d of column %q: %w\", f.index, f.columnPath(), err)\n\t\t}\n\n\t\tif page == nil {\n\t\t\tcontinue\n\t\t}\n\n\t\tf.index++\n\t\tif f.skip == 0 {\n\t\t\treturn page, nil\n\t\t}\n\n\t\t// TODO: what about pages that don't embed the number of rows?\n\t\t// (data page v1 with no offset index in the column chunk).\n\t\tnumRows := page.NumRows()\n\n\t\tif numRows <= f.skip {\n\t\t\tRelease(page)\n\t\t} else {\n\t\t\ttail := page.Slice(f.skip, numRows)\n\t\t\tRelease(page)\n\t\t\tf.skip = 0\n\t\t\treturn tail, nil\n\t\t}\n\n\t\tf.skip -= numRows\n\t}\n}\n\nfunc (f *filePages) readDictionary() error {\n\tchunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize)\n\trbuf, pool := getBufioReader(chunk, f.bufferSize)\n\tdefer putBufioReader(rbuf, pool)\n\n\tdecoder := thrift.NewDecoder(f.protocol.NewReader(rbuf))\n\n\theader := getPageHeader()\n\tdefer putPageHeader(header)\n\n\tif err := decoder.Decode(header); err != nil {\n\t\treturn err\n\t}\n\n\tpage := buffers.get(int(header.CompressedPageSize))\n\tdefer page.unref()\n\n\tif _, err := io.ReadFull(rbuf, page.data); err != nil {\n\t\treturn err\n\t}\n\n\treturn f.readDictionaryPage(header, page)\n}\n\nfunc (f *filePages) readDictionaryPage(header *format.PageHeader, page *buffer) error {\n\tif header.DictionaryPageHeader == nil {\n\t\treturn ErrMissingPageHeader\n\t}\n\td, err := f.chunk.column.decodeDictionary(DictionaryPageHeader{header.DictionaryPageHeader}, page, header.UncompressedPageSize)\n\tif err != nil {\n\t\treturn err\n\t}\n\tf.dictionary = d\n\treturn nil\n}\n\nfunc (f *filePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, 
error) {\n\tif header.DataPageHeader == nil {\n\t\treturn nil, ErrMissingPageHeader\n\t}\n\tif isDictionaryFormat(header.DataPageHeader.Encoding) && f.dictionary == nil {\n\t\tif err := f.readDictionary(); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t}\n\treturn f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, page, f.dictionary, header.UncompressedPageSize)\n}\n\nfunc (f *filePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) {\n\tif header.DataPageHeaderV2 == nil {\n\t\treturn nil, ErrMissingPageHeader\n\t}\n\tif isDictionaryFormat(header.DataPageHeaderV2.Encoding) && f.dictionary == nil {\n\t\t// If the program seeked to a row past the first page, the dictionary\n\t\t// page may not have been seen, in which case we have to lazily load it\n\t\t// from the beginning of the column chunk.\n\t\tif err := f.readDictionary(); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t}\n\treturn f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, page, f.dictionary, header.UncompressedPageSize)\n}\n\nfunc (f *filePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) {\n\tpage := buffers.get(int(header.CompressedPageSize))\n\tdefer page.unref()\n\n\tif _, err := io.ReadFull(reader, page.data); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif header.CRC != 0 {\n\t\theaderChecksum := uint32(header.CRC)\n\t\tbufferChecksum := crc32.ChecksumIEEE(page.data)\n\n\t\tif headerChecksum != bufferChecksum {\n\t\t\t// The parquet specs indicate that corruption errors could be\n\t\t\t// handled gracefully by skipping pages, though this may not always\n\t\t\t// be practical. 
Depending on how the pages are consumed,\n\t\t\t// missing rows may cause unpredictable behaviors in algorithms.\n\t\t\t//\n\t\t\t// For now, we assume these errors to be fatal, but we may\n\t\t\t// revisit later and improve error handling to be more resilient\n\t\t\t// to data corruption.\n\t\t\treturn nil, fmt.Errorf(\"crc32 checksum mismatch in page of column %q: want=0x%08X got=0x%08X: %w\",\n\t\t\t\tf.columnPath(),\n\t\t\t\theaderChecksum,\n\t\t\t\tbufferChecksum,\n\t\t\t\tErrCorrupted,\n\t\t\t)\n\t\t}\n\t}\n\n\tpage.ref()\n\treturn page, nil\n}\n\nfunc (f *filePages) SeekToRow(rowIndex int64) (err error) {\n\tif f.chunk == nil {\n\t\treturn io.ErrClosedPipe\n\t}\n\tif f.chunk.offsetIndex == nil {\n\t\t_, err = f.section.Seek(f.dataOffset-f.baseOffset, io.SeekStart)\n\t\tf.skip = rowIndex\n\t\tf.index = 0\n\t\tif f.dictOffset > 0 {\n\t\t\tf.index = 1\n\t\t}\n\t} else {\n\t\tpages := f.chunk.offsetIndex.PageLocations\n\t\tindex := sort.Search(len(pages), func(i int) bool {\n\t\t\treturn pages[i].FirstRowIndex > rowIndex\n\t\t}) - 1\n\t\tif index < 0 {\n\t\t\treturn ErrSeekOutOfRange\n\t\t}\n\t\t_, err = f.section.Seek(pages[index].Offset-f.baseOffset, io.SeekStart)\n\t\tf.skip = rowIndex - pages[index].FirstRowIndex\n\t\tf.index = index\n\t}\n\tf.rbuf.Reset(&f.section)\n\treturn err\n}\n\nfunc (f *filePages) Close() error {\n\tputBufioReader(f.rbuf, f.rbufpool)\n\tf.chunk = nil\n\tf.section = io.SectionReader{}\n\tf.rbuf = nil\n\tf.rbufpool = nil\n\tf.baseOffset = 0\n\tf.dataOffset = 0\n\tf.dictOffset = 0\n\tf.index = 0\n\tf.skip = 0\n\tf.dictionary = nil\n\treturn nil\n}\n\nfunc (f *filePages) columnPath() columnPath {\n\treturn columnPath(f.chunk.column.Path())\n}\n\ntype putBufioReaderFunc func()\n\nvar (\n\tbufioReaderPoolLock sync.Mutex\n\tbufioReaderPool     = map[int]*sync.Pool{}\n)\n\nfunc getBufioReader(r io.Reader, bufferSize int) (*bufio.Reader, *sync.Pool) {\n\tpool := getBufioReaderPool(bufferSize)\n\trbuf, _ := pool.Get().(*bufio.Reader)\n\tif 
rbuf == nil {\n\t\trbuf = bufio.NewReaderSize(r, bufferSize)\n\t} else {\n\t\trbuf.Reset(r)\n\t}\n\treturn rbuf, pool\n}\n\nfunc putBufioReader(rbuf *bufio.Reader, pool *sync.Pool) {\n\tif rbuf != nil && pool != nil {\n\t\trbuf.Reset(nil)\n\t\tpool.Put(rbuf)\n\t}\n}\n\nfunc getBufioReaderPool(size int) *sync.Pool {\n\tbufioReaderPoolLock.Lock()\n\tdefer bufioReaderPoolLock.Unlock()\n\n\tif pool := bufioReaderPool[size]; pool != nil {\n\t\treturn pool\n\t}\n\n\tpool := &sync.Pool{}\n\tbufioReaderPool[size] = pool\n\treturn pool\n}\n\nvar pageHeaderPool = &sync.Pool{}\n\nfunc getPageHeader() *format.PageHeader {\n\th, _ := pageHeaderPool.Get().(*format.PageHeader)\n\tif h != nil {\n\t\treturn h\n\t}\n\treturn new(format.PageHeader)\n}\n\nfunc putPageHeader(h *format.PageHeader) {\n\tif h != nil {\n\t\th.CRC = 0\n\t\tpageHeaderPool.Put(h)\n\t}\n}\n"
  },
  {
    "path": "file_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nvar testdataFiles []string\n\nfunc init() {\n\tentries, _ := os.ReadDir(\"testdata\")\n\tfor _, e := range entries {\n\t\ttestdataFiles = append(testdataFiles, filepath.Join(\"testdata\", e.Name()))\n\t}\n}\n\nfunc TestOpenFile(t *testing.T) {\n\tfor _, path := range testdataFiles {\n\t\tt.Run(path, func(t *testing.T) {\n\t\t\tf, err := os.Open(path)\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\t\t\tdefer f.Close()\n\n\t\t\ts, err := f.Stat()\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tp, err := parquet.OpenFile(f, s.Size())\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tif size := p.Size(); size != s.Size() {\n\t\t\t\tt.Errorf(\"file size mismatch: want=%d got=%d\", s.Size(), size)\n\t\t\t}\n\n\t\t\troot := p.Root()\n\t\t\tb := new(strings.Builder)\n\t\t\tparquet.PrintSchema(b, root.Name(), root)\n\t\t\tt.Log(b)\n\n\t\t\tprintColumns(t, p.Root(), \"\")\n\t\t})\n\t}\n}\n\nfunc printColumns(t *testing.T, col *parquet.Column, indent string) {\n\tif t.Failed() {\n\t\treturn\n\t}\n\n\tpath := strings.Join(col.Path(), \".\")\n\tif col.Leaf() {\n\t\tt.Logf(\"%s%s %v %v\", indent, path, col.Encoding(), col.Compression())\n\t} else {\n\t\tt.Logf(\"%s%s\", indent, path)\n\t}\n\tindent += \". 
\"\n\n\tbuffer := make([]parquet.Value, 42)\n\tpages := col.Pages()\n\tdefer pages.Close()\n\tfor {\n\t\tp, err := pages.ReadPage()\n\t\tif err != nil {\n\t\t\tif err != io.EOF {\n\t\t\t\tt.Error(err)\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\n\t\tvalues := p.Values()\n\t\tnumValues := int64(0)\n\t\tnullCount := int64(0)\n\n\t\tfor {\n\t\t\tn, err := values.ReadValues(buffer)\n\t\t\tfor _, v := range buffer[:n] {\n\t\t\t\tif v.Column() != col.Index() {\n\t\t\t\t\tt.Errorf(\"value read from page of column %d says it belongs to column %d\", col.Index(), v.Column())\n\t\t\t\t\treturn\n\t\t\t\t}\n\t\t\t\tif v.IsNull() {\n\t\t\t\t\tnullCount++\n\t\t\t\t}\n\t\t\t}\n\t\t\tnumValues += int64(n)\n\t\t\tif err != nil {\n\t\t\t\tif err != io.EOF {\n\t\t\t\t\tt.Error(err)\n\t\t\t\t\treturn\n\t\t\t\t}\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\n\t\tif numValues != p.NumValues() {\n\t\t\tt.Errorf(\"page of column %d declared %d values but %d were read\", col.Index(), p.NumValues(), numValues)\n\t\t\treturn\n\t\t}\n\n\t\tif nullCount != p.NumNulls() {\n\t\t\tt.Errorf(\"page of column %d declared %d nulls but %d were read\", col.Index(), p.NumNulls(), nullCount)\n\t\t\treturn\n\t\t}\n\n\t\tparquet.Release(p)\n\t}\n\n\tfor _, child := range col.Columns() {\n\t\tprintColumns(t, child, indent)\n\t}\n}\n\nfunc TestFileKeyValueMetadata(t *testing.T) {\n\ttype Row struct {\n\t\tName string\n\t}\n\n\tf, err := createParquetFile(\n\t\tmakeRows([]Row{{Name: \"A\"}, {Name: \"B\"}, {Name: \"C\"}}),\n\t\tparquet.KeyValueMetadata(\"hello\", \"ignore this one\"),\n\t\tparquet.KeyValueMetadata(\"hello\", \"world\"),\n\t\tparquet.KeyValueMetadata(\"answer\", \"42\"),\n\t)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tfor _, want := range [][2]string{\n\t\t{\"hello\", \"world\"},\n\t\t{\"answer\", \"42\"},\n\t} {\n\t\tkey, value := want[0], want[1]\n\t\tif found, ok := f.Lookup(key); !ok || found != value {\n\t\t\tt.Errorf(\"key/value metadata mismatch: want %q=%q but got %q=%q (found=%t)\", key, value, key, found, 
ok)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "filter.go",
    "content": "package parquet\n\n// FilterRowReader constructs a RowReader which exposes rows from reader for\n// which the predicate has returned true.\nfunc FilterRowReader(reader RowReader, predicate func(Row) bool) RowReader {\n\tf := &filterRowReader{reader: reader, predicate: predicate}\n\tfor i := range f.rows {\n\t\tf.rows[i] = f.values[i : i : i+1]\n\t}\n\treturn f\n}\n\ntype filterRowReader struct {\n\treader    RowReader\n\tpredicate func(Row) bool\n\trows      [defaultRowBufferSize]Row\n\tvalues    [defaultRowBufferSize]Value\n}\n\nfunc (f *filterRowReader) ReadRows(rows []Row) (n int, err error) {\n\tfor n < len(rows) {\n\t\tr := len(rows) - n\n\n\t\tif r > len(f.rows) {\n\t\t\tr = len(f.rows)\n\t\t}\n\n\t\tr, err = f.reader.ReadRows(f.rows[:r])\n\n\t\tfor i := 0; i < r; i++ {\n\t\t\tif f.predicate(f.rows[i]) {\n\t\t\t\trows[n] = append(rows[n][:0], f.rows[i]...)\n\t\t\t\tn++\n\t\t\t}\n\t\t}\n\n\t\tif err != nil {\n\t\t\tbreak\n\t\t}\n\t}\n\treturn n, err\n}\n\n// FilterRowWriter constructs a RowWriter which writes rows to writer for which\n// the predicate has returned true.\nfunc FilterRowWriter(writer RowWriter, predicate func(Row) bool) RowWriter {\n\treturn &filterRowWriter{writer: writer, predicate: predicate}\n}\n\ntype filterRowWriter struct {\n\twriter    RowWriter\n\tpredicate func(Row) bool\n\trows      [defaultRowBufferSize]Row\n}\n\nfunc (f *filterRowWriter) WriteRows(rows []Row) (n int, err error) {\n\tdefer func() {\n\t\tclear := f.rows[:]\n\t\tfor i := range clear {\n\t\t\tclearValues(clear[i])\n\t\t}\n\t}()\n\n\tfor n < len(rows) {\n\t\ti := 0\n\t\tj := len(rows) - n\n\n\t\tif j > len(f.rows) {\n\t\t\tj = len(f.rows)\n\t\t}\n\n\t\tfor _, row := range rows[n : n+j] {\n\t\t\tif f.predicate(row) {\n\t\t\t\tf.rows[i] = row\n\t\t\t\ti++\n\t\t\t}\n\t\t}\n\n\t\tif i > 0 {\n\t\t\t_, err = f.writer.WriteRows(f.rows[:i])\n\t\t\tif err != nil {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\n\t\tn += j\n\t}\n\n\treturn n, err\n}\n"
  },
  {
    "path": "filter_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestFilterRowReader(t *testing.T) {\n\trows := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(2)},\n\t\t{parquet.Int64Value(3)},\n\t\t{parquet.Int64Value(4)},\n\t}\n\n\twant := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(2)},\n\t\t{parquet.Int64Value(4)},\n\t}\n\n\treader := parquet.FilterRowReader(&bufferedRows{rows: rows},\n\t\tfunc(row parquet.Row) bool {\n\t\t\treturn row[0].Int64()%2 == 0\n\t\t},\n\t)\n\n\twriter := &bufferedRows{}\n\t_, err := parquet.CopyRows(writer, reader)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertEqualRows(t, want, writer.rows)\n}\n\nfunc TestFilterRowWriter(t *testing.T) {\n\trows := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(2)},\n\t\t{parquet.Int64Value(3)},\n\t\t{parquet.Int64Value(4)},\n\t}\n\n\twant := []parquet.Row{\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(3)},\n\t}\n\n\tbuffer := &bufferedRows{}\n\twriter := parquet.FilterRowWriter(buffer,\n\t\tfunc(row parquet.Row) bool {\n\t\t\treturn row[0].Int64()%2 == 1\n\t\t},\n\t)\n\n\treader := &bufferedRows{rows: rows}\n\t_, err := parquet.CopyRows(writer, reader)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertEqualRows(t, want, buffer.rows)\n}\n"
  },
  {
    "path": "format/parquet.go",
    "content": "package format\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n)\n\n// Types supported by Parquet. These types are intended to be used in combination\n// with the encodings to control the on disk storage format. For example INT16\n// is not included as a type since a good encoding of INT32 would handle this.\ntype Type int32\n\nconst (\n\tBoolean           Type = 0\n\tInt32             Type = 1\n\tInt64             Type = 2\n\tInt96             Type = 3 // deprecated, only used by legacy implementations.\n\tFloat             Type = 4\n\tDouble            Type = 5\n\tByteArray         Type = 6\n\tFixedLenByteArray Type = 7\n)\n\nfunc (t Type) String() string {\n\tswitch t {\n\tcase Boolean:\n\t\treturn \"BOOLEAN\"\n\tcase Int32:\n\t\treturn \"INT32\"\n\tcase Int64:\n\t\treturn \"INT64\"\n\tcase Int96:\n\t\treturn \"INT96\"\n\tcase Float:\n\t\treturn \"FLOAT\"\n\tcase Double:\n\t\treturn \"DOUBLE\"\n\tcase ByteArray:\n\t\treturn \"BYTE_ARRAY\"\n\tcase FixedLenByteArray:\n\t\treturn \"FIXED_LEN_BYTE_ARRAY\"\n\tdefault:\n\t\treturn \"Type(?)\"\n\t}\n}\n\n// Representation of Schemas.\ntype FieldRepetitionType int32\n\nconst (\n\t// The field is required (can not be null) and each record has exactly 1 value.\n\tRequired FieldRepetitionType = 0\n\t// The field is optional (can be null) and each record has 0 or 1 values.\n\tOptional FieldRepetitionType = 1\n\t// The field is repeated and can contain 0 or more values.\n\tRepeated FieldRepetitionType = 2\n)\n\nfunc (t FieldRepetitionType) String() string {\n\tswitch t {\n\tcase Required:\n\t\treturn \"REQUIRED\"\n\tcase Optional:\n\t\treturn \"OPTIONAL\"\n\tcase Repeated:\n\t\treturn \"REPEATED\"\n\tdefault:\n\t\treturn \"FieldRepeationaType(?)\"\n\t}\n}\n\n// Statistics per row group and per page.\n// All fields are optional.\ntype Statistics struct {\n\t// DEPRECATED: min and max value of the column. 
Use min_value and max_value.\n\t//\n\t// Values are encoded using PLAIN encoding, except that variable-length byte\n\t// arrays do not include a length prefix.\n\t//\n\t// These fields encode min and max values determined by signed comparison\n\t// only. New files should use the correct order for a column's logical type\n\t// and store the values in the min_value and max_value fields.\n\t//\n\t// To support older readers, these may be set when the column order is\n\t// signed.\n\tMax []byte `thrift:\"1\"`\n\tMin []byte `thrift:\"2\"`\n\t// Count of null value in the column.\n\tNullCount int64 `thrift:\"3\"`\n\t// Count of distinct values occurring.\n\tDistinctCount int64 `thrift:\"4\"`\n\t// Min and max values for the column, determined by its ColumnOrder.\n\t//\n\t// Values are encoded using PLAIN encoding, except that variable-length byte\n\t// arrays do not include a length prefix.\n\tMaxValue []byte `thrift:\"5\"`\n\tMinValue []byte `thrift:\"6\"`\n}\n\n// Empty structs to use as logical type annotations.\ntype StringType struct{} // allowed for BINARY, must be encoded with UTF-8\ntype UUIDType struct{}   // allowed for FIXED[16], must encode raw UUID bytes\ntype MapType struct{}    // see see LogicalTypes.md\ntype ListType struct{}   // see LogicalTypes.md\ntype EnumType struct{}   // allowed for BINARY, must be encoded with UTF-8\ntype DateType struct{}   // allowed for INT32\n\nfunc (*StringType) String() string { return \"STRING\" }\nfunc (*UUIDType) String() string   { return \"UUID\" }\nfunc (*MapType) String() string    { return \"MAP\" }\nfunc (*ListType) String() string   { return \"LIST\" }\nfunc (*EnumType) String() string   { return \"ENUM\" }\nfunc (*DateType) String() string   { return \"DATE\" }\n\n// Logical type to annotate a column that is always null.\n//\n// Sometimes when discovering the schema of existing data, values are always\n// null and the physical type can't be determined. 
This annotation signals\n// the case where the physical type was guessed from all null values.\ntype NullType struct{}\n\nfunc (*NullType) String() string { return \"NULL\" }\n\n// Decimal logical type annotation\n//\n// To maintain forward-compatibility in v1, implementations using this logical\n// type must also set scale and precision on the annotated SchemaElement.\n//\n// Allowed for physical types: INT32, INT64, FIXED, and BINARY\ntype DecimalType struct {\n\tScale     int32 `thrift:\"1,required\"`\n\tPrecision int32 `thrift:\"2,required\"`\n}\n\nfunc (t *DecimalType) String() string {\n\t// Matching parquet-cli's decimal string format: https://github.com/apache/parquet-mr/blob/d057b39d93014fe40f5067ee4a33621e65c91552/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java#L249-L265\n\treturn fmt.Sprintf(\"DECIMAL(%d,%d)\", t.Precision, t.Scale)\n}\n\n// Time units for logical types.\ntype MilliSeconds struct{}\ntype MicroSeconds struct{}\ntype NanoSeconds struct{}\n\nfunc (*MilliSeconds) String() string { return \"MILLIS\" }\nfunc (*MicroSeconds) String() string { return \"MICROS\" }\nfunc (*NanoSeconds) String() string  { return \"NANOS\" }\n\ntype TimeUnit struct { // union\n\tMillis *MilliSeconds `thrift:\"1\"`\n\tMicros *MicroSeconds `thrift:\"2\"`\n\tNanos  *NanoSeconds  `thrift:\"3\"`\n}\n\nfunc (u *TimeUnit) String() string {\n\tswitch {\n\tcase u.Millis != nil:\n\t\treturn u.Millis.String()\n\tcase u.Micros != nil:\n\t\treturn u.Micros.String()\n\tcase u.Nanos != nil:\n\t\treturn u.Nanos.String()\n\tdefault:\n\t\treturn \"\"\n\t}\n}\n\n// Timestamp logical type annotation\n//\n// Allowed for physical types: INT64\ntype TimestampType struct {\n\tIsAdjustedToUTC bool     `thrift:\"1,required\"`\n\tUnit            TimeUnit `thrift:\"2,required\"`\n}\n\nfunc (t *TimestampType) String() string {\n\treturn fmt.Sprintf(\"TIMESTAMP(isAdjustedToUTC=%t,unit=%s)\", t.IsAdjustedToUTC, &t.Unit)\n}\n\n// Time logical type annotation\n//\n// 
Allowed for physical types: INT32 (millis), INT64 (micros, nanos)\ntype TimeType struct {\n\tIsAdjustedToUTC bool     `thrift:\"1,required\"`\n\tUnit            TimeUnit `thrift:\"2,required\"`\n}\n\nfunc (t *TimeType) String() string {\n\treturn fmt.Sprintf(\"TIME(isAdjustedToUTC=%t,unit=%s)\", t.IsAdjustedToUTC, &t.Unit)\n}\n\n// Integer logical type annotation\n//\n// bitWidth must be 8, 16, 32, or 64.\n//\n// Allowed for physical types: INT32, INT64\ntype IntType struct {\n\tBitWidth int8 `thrift:\"1,required\"`\n\tIsSigned bool `thrift:\"2,required\"`\n}\n\nfunc (t *IntType) String() string {\n\treturn fmt.Sprintf(\"INT(%d,%t)\", t.BitWidth, t.IsSigned)\n}\n\n// Embedded JSON logical type annotation\n//\n// Allowed for physical types: BINARY\ntype JsonType struct{}\n\nfunc (t *JsonType) String() string { return \"JSON\" }\n\n// Embedded BSON logical type annotation\n//\n// Allowed for physical types: BINARY\ntype BsonType struct{}\n\nfunc (t *BsonType) String() string { return \"BSON\" }\n\n// LogicalType annotations to replace ConvertedType.\n//\n// To maintain compatibility, implementations using LogicalType for a\n// SchemaElement must also set the corresponding ConvertedType (if any)\n// from the following table.\ntype LogicalType struct { // union\n\tUTF8    *StringType  `thrift:\"1\"` // use ConvertedType UTF8\n\tMap     *MapType     `thrift:\"2\"` // use ConvertedType Map\n\tList    *ListType    `thrift:\"3\"` // use ConvertedType List\n\tEnum    *EnumType    `thrift:\"4\"` // use ConvertedType Enum\n\tDecimal *DecimalType `thrift:\"5\"` // use ConvertedType Decimal + SchemaElement.{Scale, Precision}\n\tDate    *DateType    `thrift:\"6\"` // use ConvertedType Date\n\n\t// use ConvertedType TimeMicros for Time{IsAdjustedToUTC: *, Unit: Micros}\n\t// use ConvertedType TimeMillis for Time{IsAdjustedToUTC: *, Unit: Millis}\n\tTime *TimeType `thrift:\"7\"`\n\n\t// use ConvertedType TimestampMicros for Timestamp{IsAdjustedToUTC: *, Unit: Micros}\n\t// use 
ConvertedType TimestampMillis for Timestamp{IsAdjustedToUTC: *, Unit: Millis}\n\tTimestamp *TimestampType `thrift:\"8\"`\n\n\t// 9: reserved for Interval\n\tInteger *IntType  `thrift:\"10\"` // use ConvertedType Int* or Uint*\n\tUnknown *NullType `thrift:\"11\"` // no compatible ConvertedType\n\tJson    *JsonType `thrift:\"12\"` // use ConvertedType JSON\n\tBson    *BsonType `thrift:\"13\"` // use ConvertedType BSON\n\tUUID    *UUIDType `thrift:\"14\"` // no compatible ConvertedType\n}\n\nfunc (t *LogicalType) String() string {\n\tswitch {\n\tcase t.UTF8 != nil:\n\t\treturn t.UTF8.String()\n\tcase t.Map != nil:\n\t\treturn t.Map.String()\n\tcase t.List != nil:\n\t\treturn t.List.String()\n\tcase t.Enum != nil:\n\t\treturn t.Enum.String()\n\tcase t.Decimal != nil:\n\t\treturn t.Decimal.String()\n\tcase t.Date != nil:\n\t\treturn t.Date.String()\n\tcase t.Time != nil:\n\t\treturn t.Time.String()\n\tcase t.Timestamp != nil:\n\t\treturn t.Timestamp.String()\n\tcase t.Integer != nil:\n\t\treturn t.Integer.String()\n\tcase t.Unknown != nil:\n\t\treturn t.Unknown.String()\n\tcase t.Json != nil:\n\t\treturn t.Json.String()\n\tcase t.Bson != nil:\n\t\treturn t.Bson.String()\n\tcase t.UUID != nil:\n\t\treturn t.UUID.String()\n\tdefault:\n\t\treturn \"\"\n\t}\n}\n\n// Represents a element inside a schema definition.\n//\n//   - if it is a group (inner node) then type is undefined and num_children is\n//     defined\n//\n//   - if it is a primitive type (leaf) then type is defined and num_children is\n//     undefined\n//\n// The nodes are listed in depth first traversal order.\ntype SchemaElement struct {\n\t// Data type for this field. Not set if the current element is a non-leaf node.\n\tType *Type `thrift:\"1,optional\"`\n\n\t// If type is FixedLenByteArray, this is the byte length of the values.\n\t// Otherwise, if specified, this is the maximum bit length to store any of the values.\n\t// (e.g. a low cardinality INT col could have this set to 3).  
Note that this is\n\t// in the schema, and therefore fixed for the entire file.\n\tTypeLength *int32 `thrift:\"2,optional\"`\n\n\t// repetition of the field. The root of the schema does not have a repetition_type.\n\t// All other nodes must have one.\n\tRepetitionType *FieldRepetitionType `thrift:\"3,optional\"`\n\n\t// Name of the field in the schema.\n\tName string `thrift:\"4,required\"`\n\n\t// Nested fields.  Since thrift does not support nested fields,\n\t// the nesting is flattened to a single list by a depth-first traversal.\n\t// The children count is used to construct the nested relationship.\n\t// This field is not set when the element is a primitive type\n\tNumChildren int32 `thrift:\"5,optional\"`\n\n\t// DEPRECATED: When the schema is the result of a conversion from another model.\n\t// Used to record the original type to help with cross conversion.\n\t//\n\t// This is superseded by logicalType.\n\tConvertedType *deprecated.ConvertedType `thrift:\"6,optional\"`\n\n\t// DEPRECATED: Used when this column contains decimal data.\n\t// See the DECIMAL converted type for more details.\n\t//\n\t// This is superseded by using the DecimalType annotation in logicalType.\n\tScale     *int32 `thrift:\"7,optional\"`\n\tPrecision *int32 `thrift:\"8,optional\"`\n\n\t// When the original schema supports field ids, this will save the\n\t// original field id in the parquet schema.\n\tFieldID int32 `thrift:\"9,optional\"`\n\n\t// The logical type of this SchemaElement\n\t//\n\t// LogicalType replaces ConvertedType, but ConvertedType is still required\n\t// for some logical types to ensure forward-compatibility in format v1.\n\tLogicalType *LogicalType `thrift:\"10,optional\"`\n}\n\n// Encodings supported by Parquet. Not all encodings are valid for all types.\n// These enums are also used to specify the encoding of definition and\n// repetition levels. 
See the accompanying doc for the details of the more\n// complicated encodings.\ntype Encoding int32\n\nconst (\n\t// Default encoding.\n\t// Boolean - 1 bit per value. 0 is false; 1 is true.\n\t// Int32 - 4 bytes per value. Stored as little-endian.\n\t// Int64 - 8 bytes per value. Stored as little-endian.\n\t// Float - 4 bytes per value. IEEE. Stored as little-endian.\n\t// Double - 8 bytes per value. IEEE. Stored as little-endian.\n\t// ByteArray - 4 byte length stored as little endian, followed by bytes.\n\t// FixedLenByteArray - Just the bytes.\n\tPlain Encoding = 0\n\n\t// Group VarInt encoding for Int32/Int64.\n\t// This encoding is deprecated. It was never used.\n\t// GroupVarInt Encoding = 1\n\n\t// Deprecated: Dictionary encoding. The values in the dictionary are encoded\n\t// in the plain type.\n\t// In a data page use RLEDictionary instead.\n\t// In a Dictionary page use Plain instead.\n\tPlainDictionary Encoding = 2\n\n\t// Group packed run length encoding. Usable for definition/repetition levels\n\t// encoding and Booleans (on one bit: 0 is false 1 is true.)\n\tRLE Encoding = 3\n\n\t// Bit packed encoding. This can only be used if the data has a known max\n\t// width. Usable for definition/repetition levels encoding.\n\tBitPacked Encoding = 4\n\n\t// Delta encoding for integers. This can be used for int columns and works best\n\t// on sorted data.\n\tDeltaBinaryPacked Encoding = 5\n\n\t// Encoding for byte arrays to separate the length values and the data.\n\t// The lengths are encoded using DeltaBinaryPacked.\n\tDeltaLengthByteArray Encoding = 6\n\n\t// Incremental-encoded byte array. 
Prefix lengths are encoded using DELTA_BINARY_PACKED.\n\t// Suffixes are stored as delta length byte arrays.\n\tDeltaByteArray Encoding = 7\n\n\t// Dictionary encoding: the ids are encoded using the RLE encoding\n\tRLEDictionary Encoding = 8\n\n\t// Encoding for floating-point data.\n\t// K byte-streams are created where K is the size in bytes of the data type.\n\t// The individual bytes of an FP value are scattered to the corresponding stream and\n\t// the streams are concatenated.\n\t// This itself does not reduce the size of the data but can lead to better compression\n\t// afterwards.\n\tByteStreamSplit Encoding = 9\n)\n\nfunc (e Encoding) String() string {\n\tswitch e {\n\tcase Plain:\n\t\treturn \"PLAIN\"\n\tcase PlainDictionary:\n\t\treturn \"PLAIN_DICTIONARY\"\n\tcase RLE:\n\t\treturn \"RLE\"\n\tcase BitPacked:\n\t\treturn \"BIT_PACKED\"\n\tcase DeltaBinaryPacked:\n\t\treturn \"DELTA_BINARY_PACKED\"\n\tcase DeltaLengthByteArray:\n\t\treturn \"DELTA_LENGTH_BYTE_ARRAY\"\n\tcase DeltaByteArray:\n\t\treturn \"DELTA_BYTE_ARRAY\"\n\tcase RLEDictionary:\n\t\treturn \"RLE_DICTIONARY\"\n\tcase ByteStreamSplit:\n\t\treturn \"BYTE_STREAM_SPLIT\"\n\tdefault:\n\t\treturn \"Encoding(?)\"\n\t}\n}\n\n// Supported compression algorithms.\n//\n// Codecs added in format version X.Y can be read by readers based on X.Y and later.\n// Codec support may vary between readers based on the format version and\n// libraries available at runtime.\n//\n// See Compression.md for a detailed specification of these algorithms.\ntype CompressionCodec int32\n\nconst (\n\tUncompressed CompressionCodec = 0\n\tSnappy       CompressionCodec = 1\n\tGzip         CompressionCodec = 2\n\tLZO          CompressionCodec = 3\n\tBrotli       CompressionCodec = 4 // Added in 2.4\n\tLz4          CompressionCodec = 5 // DEPRECATED (Added in 2.4)\n\tZstd         CompressionCodec = 6 // Added in 2.4\n\tLz4Raw       CompressionCodec = 7 // Added in 2.9\n)\n\nfunc (c CompressionCodec) String() string {\n\tswitch 
c {\n\tcase Uncompressed:\n\t\treturn \"UNCOMPRESSED\"\n\tcase Snappy:\n\t\treturn \"SNAPPY\"\n\tcase Gzip:\n\t\treturn \"GZIP\"\n\tcase LZO:\n\t\treturn \"LZO\"\n\tcase Brotli:\n\t\treturn \"BROTLI\"\n\tcase Lz4:\n\t\treturn \"LZ4\"\n\tcase Zstd:\n\t\treturn \"ZSTD\"\n\tcase Lz4Raw:\n\t\treturn \"LZ4_RAW\"\n\tdefault:\n\t\treturn \"CompressionCodec(?)\"\n\t}\n}\n\ntype PageType int32\n\nconst (\n\tDataPage       PageType = 0\n\tIndexPage      PageType = 1\n\tDictionaryPage PageType = 2\n\t// Version 2 is indicated in the PageHeader and the use of DataPageHeaderV2,\n\t// and allows you to read repetition and definition level data without\n\t// decompressing the Page.\n\tDataPageV2 PageType = 3\n)\n\nfunc (p PageType) String() string {\n\tswitch p {\n\tcase DataPage:\n\t\treturn \"DATA_PAGE\"\n\tcase IndexPage:\n\t\treturn \"INDEX_PAGE\"\n\tcase DictionaryPage:\n\t\treturn \"DICTIONARY_PAGE\"\n\tcase DataPageV2:\n\t\treturn \"DATA_PAGE_V2\"\n\tdefault:\n\t\treturn \"PageType(?)\"\n\t}\n}\n\n// Enum to annotate whether lists of min/max elements inside ColumnIndex\n// are ordered and if so, in which direction.\ntype BoundaryOrder int32\n\nconst (\n\tUnordered  BoundaryOrder = 0\n\tAscending  BoundaryOrder = 1\n\tDescending BoundaryOrder = 2\n)\n\nfunc (b BoundaryOrder) String() string {\n\tswitch b {\n\tcase Unordered:\n\t\treturn \"UNORDERED\"\n\tcase Ascending:\n\t\treturn \"ASCENDING\"\n\tcase Descending:\n\t\treturn \"DESCENDING\"\n\tdefault:\n\t\treturn \"BoundaryOrder(?)\"\n\t}\n}\n\n// Data page header.\ntype DataPageHeader struct {\n\t// Number of values, including NULLs, in this data page.\n\tNumValues int32 `thrift:\"1,required\"`\n\n\t// Encoding used for this data page.\n\tEncoding Encoding `thrift:\"2,required\"`\n\n\t// Encoding used for definition levels.\n\tDefinitionLevelEncoding Encoding `thrift:\"3,required\"`\n\n\t// Encoding used for repetition levels.\n\tRepetitionLevelEncoding Encoding `thrift:\"4,required\"`\n\n\t// Optional statistics for the 
data in this page.\n\tStatistics Statistics `thrift:\"5,optional\"`\n}\n\ntype IndexPageHeader struct {\n\t// TODO\n}\n\n// The dictionary page must be placed at the first position of the column chunk\n// if it is partly or completely dictionary encoded. At most one dictionary page\n// can be placed in a column chunk.\ntype DictionaryPageHeader struct {\n\t// Number of values in the dictionary.\n\tNumValues int32 `thrift:\"1,required\"`\n\n\t// Encoding using this dictionary page.\n\tEncoding Encoding `thrift:\"2,required\"`\n\n\t// If true, the entries in the dictionary are sorted in ascending order.\n\tIsSorted bool `thrift:\"3,optional\"`\n}\n\n// New page format allowing reading levels without decompressing the data\n// Repetition and definition levels are uncompressed\n// The remaining section containing the data is compressed if is_compressed is\n// true.\ntype DataPageHeaderV2 struct {\n\t// Number of values, including NULLs, in this data page.\n\tNumValues int32 `thrift:\"1,required\"`\n\t// Number of NULL values, in this data page.\n\t// Number of non-null = num_values - num_nulls which is also the number of\n\t// values in the data section.\n\tNumNulls int32 `thrift:\"2,required\"`\n\t// Number of rows in this data page. 
which means pages change on record boundaries (r = 0).\n\tNumRows int32 `thrift:\"3,required\"`\n\t// Encoding used for data in this page.\n\tEncoding Encoding `thrift:\"4,required\"`\n\n\t// Repetition levels and definition levels are always using RLE (without size in it).\n\n\t// Length of the definition levels.\n\tDefinitionLevelsByteLength int32 `thrift:\"5,required\"`\n\t// Length of the repetition levels.\n\tRepetitionLevelsByteLength int32 `thrift:\"6,required\"`\n\n\t// Whether the values are compressed.\n\t// Which means the section of the page between\n\t// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)\n\t// is compressed with the compression_codec.\n\t// If missing it is considered compressed.\n\tIsCompressed *bool `thrift:\"7,optional\"`\n\n\t// Optional statistics for the data in this page.\n\tStatistics Statistics `thrift:\"8,optional\"`\n}\n\n// Block-based algorithm type annotation.\ntype SplitBlockAlgorithm struct{}\n\n// The algorithm used in Bloom filter.\ntype BloomFilterAlgorithm struct { // union\n\tBlock *SplitBlockAlgorithm `thrift:\"1\"`\n}\n\n// Hash strategy type annotation. xxHash is an extremely fast non-cryptographic\n// hash algorithm. It uses 64 bits version of xxHash.\ntype XxHash struct{}\n\n// The hash function used in Bloom filter. 
This function takes the hash of a\n// column value using plain encoding.\ntype BloomFilterHash struct { // union\n\tXxHash *XxHash `thrift:\"1\"`\n}\n\n// The compression used in the Bloom filter.\ntype BloomFilterUncompressed struct{}\ntype BloomFilterCompression struct { // union\n\tUncompressed *BloomFilterUncompressed `thrift:\"1\"`\n}\n\n// Bloom filter header is stored at beginning of Bloom filter data of each column\n// and followed by its bitset.\ntype BloomFilterHeader struct {\n\t// The size of bitset in bytes.\n\tNumBytes int32 `thrift:\"1,required\"`\n\t// The algorithm for setting bits.\n\tAlgorithm BloomFilterAlgorithm `thrift:\"2,required\"`\n\t// The hash function used for Bloom filter.\n\tHash BloomFilterHash `thrift:\"3,required\"`\n\t// The compression used in the Bloom filter.\n\tCompression BloomFilterCompression `thrift:\"4,required\"`\n}\n\ntype PageHeader struct {\n\t// The type of the page indicates which of the *Header fields below is set.\n\tType PageType `thrift:\"1,required\"`\n\n\t// Uncompressed page size in bytes (not including this header).\n\tUncompressedPageSize int32 `thrift:\"2,required\"`\n\n\t// Compressed (and potentially encrypted) page size in bytes, not including\n\t// this header.\n\tCompressedPageSize int32 `thrift:\"3,required\"`\n\n\t// The 32bit CRC for the page, to be be calculated as follows:\n\t// - Using the standard CRC32 algorithm\n\t// - On the data only, i.e. this header should not be included. 'Data'\n\t//   hereby refers to the concatenation of the repetition levels, the\n\t//   definition levels and the column value, in this exact order.\n\t// - On the encoded versions of the repetition levels, definition levels and\n\t//   column values.\n\t// - On the compressed versions of the repetition levels, definition levels\n\t//   and column values where possible;\n\t//   - For v1 data pages, the repetition levels, definition levels and column\n\t//     values are always compressed together. 
If a compression scheme is\n\t//     specified, the CRC shall be calculated on the compressed version of\n\t//     this concatenation. If no compression scheme is specified, the CRC\n\t//     shall be calculated on the uncompressed version of this concatenation.\n\t//   - For v2 data pages, the repetition levels and definition levels are\n\t//     handled separately from the data and are never compressed (only\n\t//     encoded). If a compression scheme is specified, the CRC shall be\n\t//     calculated on the concatenation of the uncompressed repetition levels,\n\t//     uncompressed definition levels and the compressed column values.\n\t//     If no compression scheme is specified, the CRC shall be calculated on\n\t//     the uncompressed concatenation.\n\t// - In encrypted columns, CRC is calculated after page encryption; the\n\t//   encryption itself is performed after page compression (if compressed)\n\t// If enabled, this allows for disabling checksumming in HDFS if only a few\n\t// pages need to be read.\n\tCRC int32 `thrift:\"4,optional\"`\n\n\t// Headers for page specific data. 
One only will be set.\n\tDataPageHeader       *DataPageHeader       `thrift:\"5,optional\"`\n\tIndexPageHeader      *IndexPageHeader      `thrift:\"6,optional\"`\n\tDictionaryPageHeader *DictionaryPageHeader `thrift:\"7,optional\"`\n\tDataPageHeaderV2     *DataPageHeaderV2     `thrift:\"8,optional\"`\n}\n\n// Wrapper struct to store key values.\ntype KeyValue struct {\n\tKey   string `thrift:\"1,required\"`\n\tValue string `thrift:\"2,required\"`\n}\n\n// Wrapper struct to specify sort order.\ntype SortingColumn struct {\n\t// The column index (in this row group)\n\tColumnIdx int32 `thrift:\"1,required\"`\n\n\t// If true, indicates this column is sorted in descending order.\n\tDescending bool `thrift:\"2,required\"`\n\n\t// If true, nulls will come before non-null values, otherwise,\n\t// nulls go at the end.\n\tNullsFirst bool `thrift:\"3,required\"`\n}\n\n// Statistics of a given page type and encoding.\ntype PageEncodingStats struct {\n\t// The page type (data/dic/...).\n\tPageType PageType `thrift:\"1,required\"`\n\n\t// Encoding of the page.\n\tEncoding Encoding `thrift:\"2,required\"`\n\n\t// Number of pages of this type with this encoding.\n\tCount int32 `thrift:\"3,required\"`\n}\n\n// Description for column metadata.\ntype ColumnMetaData struct {\n\t// Type of this column.\n\tType Type `thrift:\"1,required\"`\n\n\t// Set of all encodings used for this column. 
The purpose is to validate\n\t// whether we can decode those pages.\n\tEncoding []Encoding `thrift:\"2,required\"`\n\n\t// Path in schema.\n\tPathInSchema []string `thrift:\"3,required\"`\n\n\t// Compression codec.\n\tCodec CompressionCodec `thrift:\"4,required\"`\n\n\t// Number of values in this column.\n\tNumValues int64 `thrift:\"5,required\"`\n\n\t// Total byte size of all uncompressed pages in this column chunk (including the headers).\n\tTotalUncompressedSize int64 `thrift:\"6,required\"`\n\n\t// Total byte size of all compressed, and potentially encrypted, pages\n\t// in this column chunk (including the headers).\n\tTotalCompressedSize int64 `thrift:\"7,required\"`\n\n\t// Optional key/value metadata.\n\tKeyValueMetadata []KeyValue `thrift:\"8,optional\"`\n\n\t// Byte offset from beginning of file to first data page.\n\tDataPageOffset int64 `thrift:\"9,required\"`\n\n\t// Byte offset from beginning of file to root index page.\n\tIndexPageOffset int64 `thrift:\"10,optional\"`\n\n\t// Byte offset from the beginning of file to first (only) dictionary page.\n\tDictionaryPageOffset int64 `thrift:\"11,optional\"`\n\n\t// optional statistics for this column chunk.\n\tStatistics Statistics `thrift:\"12,optional\"`\n\n\t// Set of all encodings used for pages in this column chunk.\n\t// This information can be used to determine if all data pages are\n\t// dictionary encoded for example.\n\tEncodingStats []PageEncodingStats `thrift:\"13,optional\"`\n\n\t// Byte offset from beginning of file to Bloom filter data.\n\tBloomFilterOffset int64 `thrift:\"14,optional\"`\n}\n\ntype EncryptionWithFooterKey struct{}\n\ntype EncryptionWithColumnKey struct {\n\t// Column path in schema.\n\tPathInSchema []string `thrift:\"1,required\"`\n\n\t// Retrieval metadata of column encryption key.\n\tKeyMetadata []byte `thrift:\"2,optional\"`\n}\n\ntype ColumnCryptoMetaData struct {\n\tEncryptionWithFooterKey *EncryptionWithFooterKey `thrift:\"1\"`\n\tEncryptionWithColumnKey 
*EncryptionWithColumnKey `thrift:\"2\"`\n}\n\ntype ColumnChunk struct {\n\t// File where column data is stored.  If not set, assumed to be same file as\n\t// metadata.  This path is relative to the current file.\n\tFilePath string `thrift:\"1,optional\"`\n\n\t// Byte offset in file_path to the ColumnMetaData.\n\tFileOffset int64 `thrift:\"2,required\"`\n\n\t// Column metadata for this chunk. This is the same content as what is at\n\t// file_path/file_offset. Having it here has it replicated in the file\n\t// metadata.\n\tMetaData ColumnMetaData `thrift:\"3,optional\"`\n\n\t// File offset of ColumnChunk's OffsetIndex.\n\tOffsetIndexOffset int64 `thrift:\"4,optional\"`\n\n\t// Size of ColumnChunk's OffsetIndex, in bytes.\n\tOffsetIndexLength int32 `thrift:\"5,optional\"`\n\n\t// File offset of ColumnChunk's ColumnIndex.\n\tColumnIndexOffset int64 `thrift:\"6,optional\"`\n\n\t// Size of ColumnChunk's ColumnIndex, in bytes.\n\tColumnIndexLength int32 `thrift:\"7,optional\"`\n\n\t// Crypto metadata of encrypted columns.\n\tCryptoMetadata ColumnCryptoMetaData `thrift:\"8,optional\"`\n\n\t// Encrypted column metadata for this chunk.\n\tEncryptedColumnMetadata []byte `thrift:\"9,optional\"`\n}\n\ntype RowGroup struct {\n\t// Metadata for each column chunk in this row group.\n\t// This list must have the same order as the SchemaElement list in FileMetaData.\n\tColumns []ColumnChunk `thrift:\"1,required\"`\n\n\t// Total byte size of all the uncompressed column data in this row group.\n\tTotalByteSize int64 `thrift:\"2,required\"`\n\n\t// Number of rows in this row group.\n\tNumRows int64 `thrift:\"3,required\"`\n\n\t// If set, specifies a sort ordering of the rows in this RowGroup.\n\t// The sorting columns can be a subset of all the columns.\n\tSortingColumns []SortingColumn `thrift:\"4,optional\"`\n\n\t// Byte offset from beginning of file to first page (data or dictionary)\n\t// in this row group\n\tFileOffset int64 `thrift:\"5,optional\"`\n\n\t// Total byte size of all 
compressed (and potentially encrypted) column data\n\t// in this row group.\n\tTotalCompressedSize int64 `thrift:\"6,optional\"`\n\n\t// Row group ordinal in the file.\n\tOrdinal int16 `thrift:\"7,optional\"`\n}\n\n// Empty struct to signal the order defined by the physical or logical type.\ntype TypeDefinedOrder struct{}\n\n// Union to specify the order used for the min_value and max_value fields for a\n// column. This union takes the role of an enhanced enum that allows rich\n// elements (which will be needed for a collation-based ordering in the future).\n//\n// Possible values are:\n//\n//\tTypeDefinedOrder - the column uses the order defined by its logical or\n//\t                   physical type (if there is no logical type).\n//\n// If the reader does not support the value of this union, min and max stats\n// for this column should be ignored.\ntype ColumnOrder struct { // union\n\t// The sort orders for logical types are:\n\t//   UTF8 - unsigned byte-wise comparison\n\t//   INT8 - signed comparison\n\t//   INT16 - signed comparison\n\t//   INT32 - signed comparison\n\t//   INT64 - signed comparison\n\t//   UINT8 - unsigned comparison\n\t//   UINT16 - unsigned comparison\n\t//   UINT32 - unsigned comparison\n\t//   UINT64 - unsigned comparison\n\t//   DECIMAL - signed comparison of the represented value\n\t//   DATE - signed comparison\n\t//   TIME_MILLIS - signed comparison\n\t//   TIME_MICROS - signed comparison\n\t//   TIMESTAMP_MILLIS - signed comparison\n\t//   TIMESTAMP_MICROS - signed comparison\n\t//   INTERVAL - unsigned comparison\n\t//   JSON - unsigned byte-wise comparison\n\t//   BSON - unsigned byte-wise comparison\n\t//   ENUM - unsigned byte-wise comparison\n\t//   LIST - undefined\n\t//   MAP - undefined\n\t//\n\t// In the absence of logical types, the sort order is determined by the physical type:\n\t//   BOOLEAN - false, true\n\t//   INT32 - signed comparison\n\t//   INT64 - signed comparison\n\t//   INT96 (only used for legacy timestamps) 
- undefined\n\t//   FLOAT - signed comparison of the represented value (*)\n\t//   DOUBLE - signed comparison of the represented value (*)\n\t//   BYTE_ARRAY - unsigned byte-wise comparison\n\t//   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison\n\t//\n\t// (*) Because the sorting order is not specified properly for floating\n\t//     point values (relations vs. total ordering) the following\n\t//     compatibility rules should be applied when reading statistics:\n\t//     - If the min is a NaN, it should be ignored.\n\t//     - If the max is a NaN, it should be ignored.\n\t//     - If the min is +0, the row group may contain -0 values as well.\n\t//     - If the max is -0, the row group may contain +0 values as well.\n\t//     - When looking for NaN values, min and max should be ignored.\n\tTypeOrder *TypeDefinedOrder `thrift:\"1\"`\n}\n\ntype PageLocation struct {\n\t// Offset of the page in the file.\n\tOffset int64 `thrift:\"1,required\"`\n\n\t// Size of the page, including header. Sum of compressed_page_size and\n\t// header length.\n\tCompressedPageSize int32 `thrift:\"2,required\"`\n\n\t// Index within the RowGroup of the first row of the page; this means\n\t// pages change on record boundaries (r = 0).\n\tFirstRowIndex int64 `thrift:\"3,required\"`\n}\n\ntype OffsetIndex struct {\n\t// PageLocations, ordered by increasing PageLocation.offset. It is required\n\t// that page_locations[i].first_row_index < page_locations[i+1].first_row_index.\n\tPageLocations []PageLocation `thrift:\"1,required\"`\n}\n\n// Description for ColumnIndex.\n// Each <array-field>[i] refers to the page at OffsetIndex.PageLocations[i]\ntype ColumnIndex struct {\n\t// A list of Boolean values to determine the validity of the corresponding\n\t// min and max values. If true, a page contains only null values, and writers\n\t// have to set the corresponding entries in min_values and max_values to\n\t// byte[0], so that all lists have the same length. 
If false, the\n\t// corresponding entries in min_values and max_values must be valid.\n\tNullPages []bool `thrift:\"1,required\"`\n\n\t// Two lists containing lower and upper bounds for the values of each page\n\t// determined by the ColumnOrder of the column. These may be the actual\n\t// minimum and maximum values found on a page, but can also be (more compact)\n\t// values that do not exist on a page. For example, instead of storing \"\"Blart\n\t// Versenwald III\", a writer may set min_values[i]=\"B\", max_values[i]=\"C\".\n\t// Such more compact values must still be valid values within the column's\n\t// logical type. Readers must make sure that list entries are populated before\n\t// using them by inspecting null_pages.\n\tMinValues [][]byte `thrift:\"2,required\"`\n\tMaxValues [][]byte `thrift:\"3,required\"`\n\n\t// Stores whether both min_values and max_values are ordered and if so, in\n\t// which direction. This allows readers to perform binary searches in both\n\t// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even\n\t// if the lists are ordered.\n\tBoundaryOrder BoundaryOrder `thrift:\"4,required\"`\n\n\t// A list containing the number of null values for each page.\n\tNullCounts []int64 `thrift:\"5,optional\"`\n}\n\ntype AesGcmV1 struct {\n\t// AAD prefix.\n\tAadPrefix []byte `thrift:\"1,optional\"`\n\n\t// Unique file identifier part of AAD suffix.\n\tAadFileUnique []byte `thrift:\"2,optional\"`\n\n\t// In files encrypted with AAD prefix without storing it,\n\t// readers must supply the prefix.\n\tSupplyAadPrefix bool `thrift:\"3,optional\"`\n}\n\ntype AesGcmCtrV1 struct {\n\t// AAD prefix.\n\tAadPrefix []byte `thrift:\"1,optional\"`\n\n\t// Unique file identifier part of AAD suffix.\n\tAadFileUnique []byte `thrift:\"2,optional\"`\n\n\t// In files encrypted with AAD prefix without storing it,\n\t// readers must supply the prefix.\n\tSupplyAadPrefix bool `thrift:\"3,optional\"`\n}\n\ntype EncryptionAlgorithm struct { // 
union\n\tAesGcmV1    *AesGcmV1    `thrift:\"1\"`\n\tAesGcmCtrV1 *AesGcmCtrV1 `thrift:\"2\"`\n}\n\n// Description for file metadata.\ntype FileMetaData struct {\n\t// Version of this file.\n\tVersion int32 `thrift:\"1,required\"`\n\n\t// Parquet schema for this file.  This schema contains metadata for all the columns.\n\t// The schema is represented as a tree with a single root.  The nodes of the tree\n\t// are flattened to a list by doing a depth-first traversal.\n\t// The column metadata contains the path in the schema for that column which can be\n\t// used to map columns to nodes in the schema.\n\t// The first element is the root.\n\tSchema []SchemaElement `thrift:\"2,required\"`\n\n\t// Number of rows in this file.\n\tNumRows int64 `thrift:\"3,required\"`\n\n\t// Row groups in this file.\n\tRowGroups []RowGroup `thrift:\"4,required\"`\n\n\t// Optional key/value metadata.\n\tKeyValueMetadata []KeyValue `thrift:\"5,optional\"`\n\n\t// String for application that wrote this file.  This should be in the format\n\t// <Application> version <App Version> (build <App Build Hash>).\n\t// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)\n\tCreatedBy string `thrift:\"6,optional\"`\n\n\t// Sort order used for the min_value and max_value fields in the Statistics\n\t// objects and the min_values and max_values fields in the ColumnIndex\n\t// objects of each column in this file. Sort orders are listed in the order\n\t// matching the columns in the schema. The indexes are not necessary the same\n\t// though, because only leaf nodes of the schema are represented in the list\n\t// of sort orders.\n\t//\n\t// Without column_orders, the meaning of the min_value and max_value fields\n\t// in the Statistics object and the ColumnIndex object is undefined. 
To ensure\n\t// well-defined behavior, if these fields are written to a Parquet file,\n\t// column_orders must be written as well.\n\t//\n\t// The obsolete min and max fields in the Statistics object are always sorted\n\t// by signed comparison regardless of column_orders.\n\tColumnOrders []ColumnOrder `thrift:\"7,optional\"`\n\n\t// Encryption algorithm. This field is set only in encrypted files\n\t// with plaintext footer. Files with encrypted footer store algorithm id\n\t// in FileCryptoMetaData structure.\n\tEncryptionAlgorithm EncryptionAlgorithm `thrift:\"8,optional\"`\n\n\t// Retrieval metadata of key used for signing the footer.\n\t// Used only in encrypted files with plaintext footer.\n\tFooterSigningKeyMetadata []byte `thrift:\"9,optional\"`\n}\n\n// Crypto metadata for files with encrypted footer.\ntype FileCryptoMetaData struct {\n\t// Encryption algorithm. This field is only used for files\n\t// with encrypted footer. Files with plaintext footer store algorithm id\n\t// inside footer (FileMetaData structure).\n\tEncryptionAlgorithm EncryptionAlgorithm `thrift:\"1,required\"`\n\n\t// Retrieval metadata of key used for encryption of footer,\n\t// and (possibly) columns.\n\tKeyMetadata []byte `thrift:\"2,optional\"`\n}\n"
  },
  {
    "path": "format/parquet_test.go",
    "content": "package format_test\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/encoding/thrift\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\nfunc TestMarshalUnmarshalSchemaMetadata(t *testing.T) {\n\tprotocol := &thrift.CompactProtocol{}\n\tmetadata := &format.FileMetaData{\n\t\tVersion: 1,\n\t\tSchema: []format.SchemaElement{\n\t\t\t{\n\t\t\t\tName: \"hello\",\n\t\t\t},\n\t\t},\n\t\tRowGroups: []format.RowGroup{},\n\t}\n\n\tb, err := thrift.Marshal(protocol, metadata)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tdecoded := &format.FileMetaData{}\n\tif err := thrift.Unmarshal(protocol, b, &decoded); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif !reflect.DeepEqual(metadata, decoded) {\n\t\tt.Error(\"values mismatch:\")\n\t\tt.Logf(\"expected:\\n%#v\", metadata)\n\t\tt.Logf(\"found:\\n%#v\", decoded)\n\t}\n}\n"
  },
  {
    "path": "go.mod",
    "content": "module github.com/segmentio/parquet-go\n\ngo 1.19\n\nrequire (\n\tgithub.com/andybalholm/brotli v1.0.3\n\tgithub.com/google/uuid v1.3.0\n\tgithub.com/hexops/gotextdiff v1.0.3\n\tgithub.com/klauspost/compress v1.15.9\n\tgithub.com/olekukonko/tablewriter v0.0.5\n\tgithub.com/pierrec/lz4/v4 v4.1.9\n\tgithub.com/segmentio/encoding v0.3.5\n\tgolang.org/x/sys v0.0.0-20211110154304-99a53858aa08\n\tgoogle.golang.org/protobuf v1.30.0\n)\n\nrequire github.com/mattn/go-runewidth v0.0.9 // indirect\n"
  },
  {
    "path": "go.sum",
    "content": "github.com/andybalholm/brotli v1.0.3 h1:fpcw+r1N1h0Poc1F/pHbW40cUm/lMEQslZtCkBQ0UnM=\ngithub.com/andybalholm/brotli v1.0.3/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=\ngithub.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=\ngithub.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=\ngithub.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=\ngithub.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=\ngithub.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=\ngithub.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=\ngithub.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=\ngithub.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=\ngithub.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=\ngithub.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0=\ngithub.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=\ngithub.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=\ngithub.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=\ngithub.com/pierrec/lz4/v4 v4.1.9 h1:xkrjwpOP5xg1k4Nn4GX4a4YFGhscyQL/3EddJ1Xxqm8=\ngithub.com/pierrec/lz4/v4 v4.1.9/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=\ngithub.com/segmentio/asm v1.1.3/go.mod h1:Ld3L4ZXGNcSLRg4JBsZ3//1+f/TjYl0Mzen/DQy1EJg=\ngithub.com/segmentio/encoding v0.3.5 h1:UZEiaZ55nlXGDL92scoVuw00RmiRCazIEmvPSbSvt8Y=\ngithub.com/segmentio/encoding v0.3.5/go.mod h1:n0JeuIqEQrQoPDGsjo8UNd1iA0U8d8+oHAA4E3G3OxM=\ngolang.org/x/sys v0.0.0-20211110154304-99a53858aa08 h1:WecRHqgE09JBkh/584XIE6PMz5KKE/vER4izNUi30AQ=\ngolang.org/x/sys v0.0.0-20211110154304-99a53858aa08/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=\ngolang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=\ngolang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=\ngoogle.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=\ngoogle.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=\ngoogle.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=\n"
  },
  {
    "path": "hashprobe/aeshash/aeshash.go",
    "content": "// Package aeshash implements hashing functions derived from the Go runtime's\n// internal hashing based on the support of AES encryption in CPU instructions.\n//\n// On architecture where the CPU does not provide instructions for AES\n// encryption, the aeshash.Enabled function always returns false, and attempting\n// to call any other function will trigger a panic.\npackage aeshash\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\nfunc MultiHash32(hashes []uintptr, values []uint32, seed uintptr) {\n\tMultiHashUint32Array(hashes, sparse.MakeUint32Array(values), seed)\n}\n\nfunc MultiHash64(hashes []uintptr, values []uint64, seed uintptr) {\n\tMultiHashUint64Array(hashes, sparse.MakeUint64Array(values), seed)\n}\n\nfunc MultiHash128(hashes []uintptr, values [][16]byte, seed uintptr) {\n\tMultiHashUint128Array(hashes, sparse.MakeUint128Array(values), seed)\n}\n"
  },
  {
    "path": "hashprobe/aeshash/aeshash_amd64.go",
    "content": "//go:build !purego\n\npackage aeshash\n\nimport (\n\t\"github.com/segmentio/parquet-go/sparse\"\n\t\"golang.org/x/sys/cpu\"\n)\n\n// Enabled returns true if AES hash is available on the system.\n//\n// The function uses the same logic than the Go runtime since we depend on\n// the AES hash state being initialized.\n//\n// See https://go.dev/src/runtime/alg.go\nfunc Enabled() bool { return cpu.X86.HasAES && cpu.X86.HasSSSE3 && cpu.X86.HasSSE41 }\n\n//go:noescape\nfunc Hash32(value uint32, seed uintptr) uintptr\n\n//go:noescape\nfunc Hash64(value uint64, seed uintptr) uintptr\n\n//go:noescape\nfunc Hash128(value [16]byte, seed uintptr) uintptr\n\n//go:noescape\nfunc MultiHashUint32Array(hashes []uintptr, values sparse.Uint32Array, seed uintptr)\n\n//go:noescape\nfunc MultiHashUint64Array(hashes []uintptr, values sparse.Uint64Array, seed uintptr)\n\n//go:noescape\nfunc MultiHashUint128Array(hashes []uintptr, values sparse.Uint128Array, seed uintptr)\n"
  },
  {
    "path": "hashprobe/aeshash/aeshash_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func Hash32(value uint32, seed uintptr) uintptr\nTEXT ·Hash32(SB), NOSPLIT, $0-24\n    MOVL value+0(FP), AX\n    MOVQ seed+8(FP), BX\n\n    MOVOU runtime·aeskeysched+0(SB), X1\n    MOVOU runtime·aeskeysched+16(SB), X2\n    MOVOU runtime·aeskeysched+32(SB), X3\n\n    MOVQ BX, X0\n    PINSRD $2, AX, X0\n\n    AESENC X1, X0\n    AESENC X2, X0\n    AESENC X3, X0\n\n    MOVQ X0, ret+16(FP)\n    RET\n\n// func Hash64(value uint64, seed uintptr) uintptr\nTEXT ·Hash64(SB), NOSPLIT, $0-24\n    MOVQ value+0(FP), AX\n    MOVQ seed+8(FP), BX\n\n    MOVOU runtime·aeskeysched+0(SB), X1\n    MOVOU runtime·aeskeysched+16(SB), X2\n    MOVOU runtime·aeskeysched+32(SB), X3\n\n    MOVQ BX, X0\n    PINSRQ $1, AX, X0\n\n    AESENC X1, X0\n    AESENC X2, X0\n    AESENC X3, X0\n\n    MOVQ X0, ret+16(FP)\n    RET\n\n// func Hash128(value [16]byte, seed uintptr) uintptr\nTEXT ·Hash128(SB), NOSPLIT, $0-32\n    LEAQ value+0(FP), AX\n    MOVQ seed+16(FP), BX\n    MOVQ $16, CX\n\n    MOVQ BX, X0                      // 64 bits of per-table hash seed\n    PINSRW $4, CX, X0                // 16 bits of length\n    PSHUFHW $0, X0, X0               // repeat length 4 times total\n    PXOR runtime·aeskeysched(SB), X0 // xor in per-process seed\n    AESENC X0, X0                    // scramble seed\n\n    MOVOU (AX), X1\n    PXOR X0, X1\n    AESENC X1, X1\n    AESENC X1, X1\n    AESENC X1, X1\n\n    MOVQ X1, ret+24(FP)\n    RET\n\n// func MultiHashUint32Array(hashes []uintptr, values sparse.Uint32Array, seed uintptr)\nTEXT ·MultiHashUint32Array(SB), NOSPLIT, $0-56\n    MOVQ hashes_base+0(FP), AX\n    MOVQ values_array_ptr+24(FP), BX\n    MOVQ values_array_len+32(FP), CX\n    MOVQ values_array_off+40(FP), R8\n    MOVQ seed+48(FP), DX\n\n    MOVOU runtime·aeskeysched+0(SB), X1\n    MOVOU runtime·aeskeysched+16(SB), X2\n    MOVOU runtime·aeskeysched+32(SB), X3\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ DX, X0\n    PINSRD $2, 
(BX), X0\n\n    AESENC X1, X0\n    AESENC X2, X0\n    AESENC X3, X0\n\n    MOVQ X0, (AX)(SI*8)\n    INCQ SI\n    ADDQ R8, BX\ntest:\n    CMPQ SI, CX\n    JNE loop\n    RET\n\n// func MultiHashUint64Array(hashes []uintptr, values sparse.Uint64Array, seed uintptr)\nTEXT ·MultiHashUint64Array(SB), NOSPLIT, $0-56\n    MOVQ hashes_base+0(FP), AX\n    MOVQ values_array_ptr+24(FP), BX\n    MOVQ values_array_len+32(FP), CX\n    MOVQ values_array_off+40(FP), R8\n    MOVQ seed+48(FP), DX\n\n    MOVOU runtime·aeskeysched+0(SB), X1\n    MOVOU runtime·aeskeysched+16(SB), X2\n    MOVOU runtime·aeskeysched+32(SB), X3\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ DX, X0\n    PINSRQ $1, (BX), X0\n\n    AESENC X1, X0\n    AESENC X2, X0\n    AESENC X3, X0\n\n    MOVQ X0, (AX)(SI*8)\n    INCQ SI\n    ADDQ R8, BX\ntest:\n    CMPQ SI, CX\n    JNE loop\n    RET\n\n// func MultiHashUint128Array(hashes []uintptr, values sparse.Uint128Array, seed uintptr)\nTEXT ·MultiHashUint128Array(SB), NOSPLIT, $0-56\n    MOVQ hashes_base+0(FP), AX\n    MOVQ values_array_ptr+24(FP), BX\n    MOVQ values_array_len+32(FP), CX\n    MOVQ values_array_off+40(FP), R8\n    MOVQ seed+48(FP), DX\n    MOVQ $16, DI\n\n    MOVQ DX, X0\n    PINSRW $4, DI, X0\n    PSHUFHW $0, X0, X0\n    PXOR runtime·aeskeysched(SB), X0\n    AESENC X0, X0\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVOU (BX), X1\n\n    PXOR X0, X1\n    AESENC X1, X1\n    AESENC X1, X1\n    AESENC X1, X1\n\n    MOVQ X1, (AX)(SI*8)\n    INCQ SI\n    ADDQ R8, BX\ntest:\n    CMPQ SI, CX\n    JNE loop\n    RET\n"
  },
  {
    "path": "hashprobe/aeshash/aeshash_purego.go",
    "content": "//go:build purego || !amd64\n\npackage aeshash\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\n// Enabled always returns false since we assume that AES instructions are not\n// available by default.\nfunc Enabled() bool { return false }\n\nconst unsupported = \"BUG: AES hash is not available on this platform\"\n\nfunc Hash32(value uint32, seed uintptr) uintptr { panic(unsupported) }\n\nfunc Hash64(value uint64, seed uintptr) uintptr { panic(unsupported) }\n\nfunc Hash128(value [16]byte, seed uintptr) uintptr { panic(unsupported) }\n\nfunc MultiHashUint32Array(hashes []uintptr, values sparse.Uint32Array, seed uintptr) {\n\tpanic(unsupported)\n}\n\nfunc MultiHashUint64Array(hashes []uintptr, values sparse.Uint64Array, seed uintptr) {\n\tpanic(unsupported)\n}\n\nfunc MultiHashUint128Array(hashes []uintptr, values sparse.Uint128Array, seed uintptr) {\n\tpanic(unsupported)\n}\n"
  },
  {
    "path": "hashprobe/aeshash/aeshash_test.go",
    "content": "package aeshash\n\nimport (\n\t\"encoding/binary\"\n\t\"testing\"\n\t\"time\"\n\t\"unsafe\"\n)\n\n//go:noescape\n//go:linkname runtime_memhash32 runtime.memhash32\nfunc runtime_memhash32(data unsafe.Pointer, seed uintptr) uintptr\n\n//go:noescape\n//go:linkname runtime_memhash64 runtime.memhash64\nfunc runtime_memhash64(data unsafe.Pointer, seed uintptr) uintptr\n\n//go:noescape\n//go:linkname runtime_memhash runtime.memhash\nfunc runtime_memhash(data unsafe.Pointer, seed, size uintptr) uintptr\n\nfunc memhash32(data uint32, seed uintptr) uintptr {\n\treturn runtime_memhash32(unsafe.Pointer(&data), seed)\n}\n\nfunc memhash64(data uint64, seed uintptr) uintptr {\n\treturn runtime_memhash64(unsafe.Pointer(&data), seed)\n}\n\nfunc memhash128(data [16]byte, seed uintptr) uintptr {\n\treturn runtime_memhash(unsafe.Pointer(&data), seed, 16)\n}\n\nfunc TestHash32(t *testing.T) {\n\tif !Enabled() {\n\t\tt.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\th0 := memhash32(42, 1)\n\th1 := Hash32(42, 1)\n\n\tif h0 != h1 {\n\t\tt.Errorf(\"want=%016x got=%016x\", h0, h1)\n\t}\n}\n\nfunc TestMultiHash32(t *testing.T) {\n\tif !Enabled() {\n\t\tt.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\tconst N = 10\n\thashes := [N]uintptr{}\n\tvalues := [N]uint32{}\n\tseed := uintptr(32)\n\n\tfor i := range values {\n\t\tvalues[i] = uint32(i)\n\t}\n\n\tMultiHash32(hashes[:], values[:], seed)\n\n\tfor i := range values {\n\t\th := Hash32(values[i], seed)\n\n\t\tif h != hashes[i] {\n\t\t\tt.Errorf(\"hash(%d): want=%016x got=%016x\", values[i], h, hashes[i])\n\t\t}\n\t}\n}\n\nfunc TestHash64(t *testing.T) {\n\tif !Enabled() {\n\t\tt.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\th0 := memhash64(42, 1)\n\th1 := Hash64(42, 1)\n\n\tif h0 != h1 {\n\t\tt.Errorf(\"want=%016x got=%016x\", h0, h1)\n\t}\n}\n\nfunc TestMultiHash64(t *testing.T) {\n\tif !Enabled() {\n\t\tt.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\tconst N = 10\n\thashes 
:= [N]uintptr{}\n\tvalues := [N]uint64{}\n\tseed := uintptr(64)\n\n\tfor i := range values {\n\t\tvalues[i] = uint64(i)\n\t}\n\n\tMultiHash64(hashes[:], values[:], seed)\n\n\tfor i := range values {\n\t\th := Hash64(values[i], seed)\n\n\t\tif h != hashes[i] {\n\t\t\tt.Errorf(\"hash(%d): want=%016x got=%016x\", values[i], h, hashes[i])\n\t\t}\n\t}\n}\n\nfunc BenchmarkMultiHash64(b *testing.B) {\n\tif !Enabled() {\n\t\tb.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\thashes := [512]uintptr{}\n\tvalues := [512]uint64{}\n\tb.SetBytes(8 * int64(len(hashes)))\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\tMultiHash64(hashes[:], values[:], seed)\n\t\treturn len(hashes)\n\t})\n}\n\nfunc TestHash128(t *testing.T) {\n\tif !Enabled() {\n\t\tt.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\th0 := memhash128([16]byte{0: 42}, 1)\n\th1 := Hash128([16]byte{0: 42}, 1)\n\n\tif h0 != h1 {\n\t\tt.Errorf(\"want=%016x got=%016x\", h0, h1)\n\t}\n}\n\nfunc TestMultiHash128(t *testing.T) {\n\tif !Enabled() {\n\t\tt.Skip(\"AES hash not supported on this platform\")\n\t}\n\n\tconst N = 10\n\thashes := [N]uintptr{}\n\tvalues := [N][16]byte{}\n\tseed := uintptr(128)\n\n\tfor i := range values {\n\t\tbinary.LittleEndian.PutUint64(values[i][:8], uint64(i))\n\t}\n\n\tMultiHash128(hashes[:], values[:], seed)\n\n\tfor i := range values {\n\t\th := Hash128(values[i], seed)\n\n\t\tif h != hashes[i] {\n\t\t\tt.Errorf(\"hash(%d): want=%016x got=%016x\", values[i], h, hashes[i])\n\t\t}\n\t}\n}\n\nfunc benchmarkHashThroughput(b *testing.B, f func(seed uintptr) int) {\n\thashes := int64(0)\n\tstart := time.Now()\n\n\tfor i := 0; i < b.N; i++ {\n\t\thashes += int64(f(uintptr(i)))\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(hashes)/seconds, \"hash/s\")\n}\n"
  },
  {
    "path": "hashprobe/hashprobe.go",
    "content": "// Package hashprobe provides implementations of probing tables for various\n// data types.\n//\n// Probing tables are specialized hash tables supporting only a single\n// \"probing\" operation which behave like a \"lookup or insert\". When a key\n// is probed, either its value is retrieved if it already existed in the table,\n// or it is inserted and assigned its index in the insert sequence as value.\n//\n// Values are represented as signed 32 bits integers, which means that probing\n// tables defined in this package may contain at most 2^31-1 entries.\n//\n// Probing tables have a method named Probe with the following signature:\n//\n//\tfunc (t *Int64Table) Probe(keys []int64, values []int32) int {\n//\t\t...\n//\t}\n//\n// The method takes an array of keys to probe as first argument, an array of\n// values where the indexes of each key will be written as second argument, and\n// returns the number of keys that were inserted during the call.\n//\n// Applications that need to determine which keys were inserted can capture the\n// length of the probing table prior to the call, and scan the list of values\n// looking for indexes greater or equal to the length of the table before the\n// call.\npackage hashprobe\n\nimport (\n\tcryptoRand \"crypto/rand\"\n\t\"encoding/binary\"\n\t\"math\"\n\t\"math/bits\"\n\t\"math/rand\"\n\t\"sync\"\n\n\t\"github.com/segmentio/parquet-go/hashprobe/aeshash\"\n\t\"github.com/segmentio/parquet-go/hashprobe/wyhash\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nconst (\n\t// Number of probes tested per iteration. 
This parameter balances between\n\t// the amount of memory allocated on the stack to hold the computed hashes\n\t// of the keys being probed, and amortizing the baseline cost of the probing\n\t// algorithm.\n\t//\n\t// The larger the value, the more memory is required, but lower the baseline\n\t// cost will be.\n\t//\n\t// We chose a value that is somewhat large, resulting in reserving 2KiB of\n\t// stack but mostly erasing the baseline cost.\n\tprobesPerLoop = 256\n)\n\nvar (\n\tprngSeed   [8]byte\n\tprngMutex  sync.Mutex\n\tprngSource rand.Source64\n)\n\nfunc init() {\n\t_, err := cryptoRand.Read(prngSeed[:])\n\tif err != nil {\n\t\tpanic(\"cannot seed random number generator from system source: \" + err.Error())\n\t}\n\tseed := int64(binary.LittleEndian.Uint64(prngSeed[:]))\n\tprngSource = rand.NewSource(seed).(rand.Source64)\n}\n\nfunc tableSizeAndMaxLen(groupSize, numValues int, maxLoad float64) (size, maxLen int) {\n\tn := int(math.Ceil((1 / maxLoad) * float64(numValues)))\n\tsize = nextPowerOf2((n + (groupSize - 1)) / groupSize)\n\tmaxLen = int(math.Ceil(maxLoad * float64(groupSize*size)))\n\treturn\n}\n\nfunc nextPowerOf2(n int) int {\n\treturn 1 << (64 - bits.LeadingZeros64(uint64(n-1)))\n}\n\nfunc randSeed() uintptr {\n\tprngMutex.Lock()\n\tdefer prngMutex.Unlock()\n\treturn uintptr(prngSource.Uint64())\n}\n\ntype Int32Table struct{ table32 }\n\nfunc NewInt32Table(cap int, maxLoad float64) *Int32Table {\n\treturn &Int32Table{makeTable32(cap, maxLoad)}\n}\n\nfunc (t *Int32Table) Reset() { t.reset() }\n\nfunc (t *Int32Table) Len() int { return t.len }\n\nfunc (t *Int32Table) Cap() int { return t.size() }\n\nfunc (t *Int32Table) Probe(keys, values []int32) int {\n\treturn t.probe(unsafecast.Int32ToUint32(keys), values)\n}\n\nfunc (t *Int32Table) ProbeArray(keys sparse.Int32Array, values []int32) int {\n\treturn t.probeArray(keys.Uint32Array(), values)\n}\n\ntype Float32Table struct{ table32 }\n\nfunc NewFloat32Table(cap int, maxLoad float64) *Float32Table 
{\n\treturn &Float32Table{makeTable32(cap, maxLoad)}\n}\n\nfunc (t *Float32Table) Reset() { t.reset() }\n\nfunc (t *Float32Table) Len() int { return t.len }\n\nfunc (t *Float32Table) Cap() int { return t.size() }\n\nfunc (t *Float32Table) Probe(keys []float32, values []int32) int {\n\treturn t.probe(unsafecast.Float32ToUint32(keys), values)\n}\n\nfunc (t *Float32Table) ProbeArray(keys sparse.Float32Array, values []int32) int {\n\treturn t.probeArray(keys.Uint32Array(), values)\n}\n\ntype Uint32Table struct{ table32 }\n\nfunc NewUint32Table(cap int, maxLoad float64) *Uint32Table {\n\treturn &Uint32Table{makeTable32(cap, maxLoad)}\n}\n\nfunc (t *Uint32Table) Reset() { t.reset() }\n\nfunc (t *Uint32Table) Len() int { return t.len }\n\nfunc (t *Uint32Table) Cap() int { return t.size() }\n\nfunc (t *Uint32Table) Probe(keys []uint32, values []int32) int {\n\treturn t.probe(keys, values)\n}\n\nfunc (t *Uint32Table) ProbeArray(keys sparse.Uint32Array, values []int32) int {\n\treturn t.probeArray(keys, values)\n}\n\n// table32 is the generic implementation of probing tables for 32 bit types.\n//\n// The table uses the following memory layout:\n//\n//\t[group 0][group 1][...][group N]\n//\n// Each group contains up to 7 key/value pairs, and is exactly 64 bytes in size,\n// which allows it to fit within a single cache line, and ensures that probes\n// can be performed with a single memory load per key.\n//\n// Groups fill up by appending new entries to the keys and values arrays. 
When a\n// group is full, the probe checks the next group.\n//\n// https://en.wikipedia.org/wiki/Linear_probing\ntype table32 struct {\n\tlen     int\n\tmaxLen  int\n\tmaxLoad float64\n\tseed    uintptr\n\ttable   []table32Group\n}\n\nconst table32GroupSize = 7\n\ntype table32Group struct {\n\tkeys   [table32GroupSize]uint32\n\tvalues [table32GroupSize]uint32\n\tbits   uint32\n\t_      uint32\n}\n\nfunc makeTable32(cap int, maxLoad float64) (t table32) {\n\tif maxLoad < 0 || maxLoad > 1 {\n\t\tpanic(\"max load of probing table must be a value between 0 and 1\")\n\t}\n\tif cap < table32GroupSize {\n\t\tcap = table32GroupSize\n\t}\n\tt.init(cap, maxLoad)\n\treturn t\n}\n\nfunc (t *table32) size() int {\n\treturn table32GroupSize * len(t.table)\n}\n\nfunc (t *table32) init(cap int, maxLoad float64) {\n\tsize, maxLen := tableSizeAndMaxLen(table32GroupSize, cap, maxLoad)\n\t*t = table32{\n\t\tmaxLen:  maxLen,\n\t\tmaxLoad: maxLoad,\n\t\tseed:    randSeed(),\n\t\ttable:   make([]table32Group, size),\n\t}\n}\n\nfunc (t *table32) grow(totalValues int) {\n\ttmp := table32{}\n\ttmp.init(totalValues, t.maxLoad)\n\ttmp.len = t.len\n\n\thashes := make([]uintptr, table32GroupSize)\n\tmodulo := uintptr(len(tmp.table)) - 1\n\n\tfor i := range t.table {\n\t\tg := &t.table[i]\n\t\tn := bits.OnesCount32(g.bits)\n\n\t\tif aeshash.Enabled() {\n\t\t\taeshash.MultiHash32(hashes[:n], g.keys[:n], tmp.seed)\n\t\t} else {\n\t\t\twyhash.MultiHash32(hashes[:n], g.keys[:n], tmp.seed)\n\t\t}\n\n\t\tfor j, hash := range hashes[:n] {\n\t\t\tfor {\n\t\t\t\tgroup := &tmp.table[hash&modulo]\n\n\t\t\t\tif n := bits.OnesCount32(group.bits); n < table32GroupSize {\n\t\t\t\t\tgroup.bits = (group.bits << 1) | 1\n\t\t\t\t\tgroup.keys[n] = g.keys[j]\n\t\t\t\t\tgroup.values[n] = g.values[j]\n\t\t\t\t\tbreak\n\t\t\t\t}\n\n\t\t\t\thash++\n\t\t\t}\n\t\t}\n\t}\n\n\t*t = tmp\n}\n\nfunc (t *table32) reset() {\n\tt.len = 0\n\n\tfor i := range t.table {\n\t\tt.table[i] = table32Group{}\n\t}\n}\n\nfunc (t *table32) 
probe(keys []uint32, values []int32) int {\n\treturn t.probeArray(sparse.MakeUint32Array(keys), values)\n}\n\nfunc (t *table32) probeArray(keys sparse.Uint32Array, values []int32) int {\n\tnumKeys := keys.Len()\n\n\tif totalValues := t.len + numKeys; totalValues > t.maxLen {\n\t\tt.grow(totalValues)\n\t}\n\n\tvar hashes [probesPerLoop]uintptr\n\tvar baseLength = t.len\n\tvar useAesHash = aeshash.Enabled()\n\n\t_ = values[:numKeys]\n\n\tfor i := 0; i < numKeys; {\n\t\tj := len(hashes) + i\n\t\tn := len(hashes)\n\n\t\tif j > numKeys {\n\t\t\tj = numKeys\n\t\t\tn = numKeys - i\n\t\t}\n\n\t\tk := keys.Slice(i, j)\n\t\tv := values[i:j:j]\n\t\th := hashes[:n:n]\n\n\t\tif useAesHash {\n\t\t\taeshash.MultiHashUint32Array(h, k, t.seed)\n\t\t} else {\n\t\t\twyhash.MultiHashUint32Array(h, k, t.seed)\n\t\t}\n\n\t\tt.len = multiProbe32(t.table, t.len, h, k, v)\n\t\ti = j\n\t}\n\n\treturn t.len - baseLength\n}\n\nfunc multiProbe32Default(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int {\n\tmodulo := uintptr(len(table)) - 1\n\n\tfor i, hash := range hashes {\n\t\tkey := keys.Index(i)\n\t\tfor {\n\t\t\tgroup := &table[hash&modulo]\n\t\t\tindex := table32GroupSize\n\t\t\tvalue := int32(0)\n\n\t\t\tfor j, k := range group.keys {\n\t\t\t\tif k == key {\n\t\t\t\t\tindex = j\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif n := bits.OnesCount32(group.bits); index < n {\n\t\t\t\tvalue = int32(group.values[index])\n\t\t\t} else {\n\t\t\t\tif n == table32GroupSize {\n\t\t\t\t\thash++\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\n\t\t\t\tvalue = int32(numKeys)\n\t\t\t\tgroup.bits = (group.bits << 1) | 1\n\t\t\t\tgroup.keys[n] = key\n\t\t\t\tgroup.values[n] = uint32(value)\n\t\t\t\tnumKeys++\n\t\t\t}\n\n\t\t\tvalues[i] = value\n\t\t\tbreak\n\t\t}\n\t}\n\n\treturn numKeys\n}\n\ntype Int64Table struct{ table64 }\n\nfunc NewInt64Table(cap int, maxLoad float64) *Int64Table {\n\treturn &Int64Table{makeTable64(cap, maxLoad)}\n}\n\nfunc (t *Int64Table) Reset() 
{ t.reset() }\n\nfunc (t *Int64Table) Len() int { return t.len }\n\nfunc (t *Int64Table) Cap() int { return t.size() }\n\nfunc (t *Int64Table) Probe(keys []int64, values []int32) int {\n\treturn t.probe(unsafecast.Int64ToUint64(keys), values)\n}\n\nfunc (t *Int64Table) ProbeArray(keys sparse.Int64Array, values []int32) int {\n\treturn t.probeArray(keys.Uint64Array(), values)\n}\n\ntype Float64Table struct{ table64 }\n\nfunc NewFloat64Table(cap int, maxLoad float64) *Float64Table {\n\treturn &Float64Table{makeTable64(cap, maxLoad)}\n}\n\nfunc (t *Float64Table) Reset() { t.reset() }\n\nfunc (t *Float64Table) Len() int { return t.len }\n\nfunc (t *Float64Table) Cap() int { return t.size() }\n\nfunc (t *Float64Table) Probe(keys []float64, values []int32) int {\n\treturn t.probe(unsafecast.Float64ToUint64(keys), values)\n}\n\nfunc (t *Float64Table) ProbeArray(keys sparse.Float64Array, values []int32) int {\n\treturn t.probeArray(keys.Uint64Array(), values)\n}\n\ntype Uint64Table struct{ table64 }\n\nfunc NewUint64Table(cap int, maxLoad float64) *Uint64Table {\n\treturn &Uint64Table{makeTable64(cap, maxLoad)}\n}\n\nfunc (t *Uint64Table) Reset() { t.reset() }\n\nfunc (t *Uint64Table) Len() int { return t.len }\n\nfunc (t *Uint64Table) Cap() int { return t.size() }\n\nfunc (t *Uint64Table) Probe(keys []uint64, values []int32) int {\n\treturn t.probe(keys, values)\n}\n\nfunc (t *Uint64Table) ProbeArray(keys sparse.Uint64Array, values []int32) int {\n\treturn t.probeArray(keys, values)\n}\n\n// table64 is the generic implementation of probing tables for 64 bit types.\n//\n// The table uses a layout similar to the one documented on the table for 32 bit\n// keys (see table32). Each group holds up to 4 key/value pairs (instead of 7\n// like table32) so that each group fits in a single CPU cache line. 
This table\n// version has a bit lower memory density, with ~23% of table memory being used\n// for padding.\n//\n// Technically we could hold up to 5 entries per group and still fit within the\n// 64 bytes of a CPU cache line; on x86 platforms, AVX2 registers can only hold\n// four 64 bit values, we would need twice as many instructions per probe if the\n// groups were holding 5 values. The trade off of memory for compute efficiency\n// appeared to be the right choice at the time.\ntype table64 struct {\n\tlen     int\n\tmaxLen  int\n\tmaxLoad float64\n\tseed    uintptr\n\ttable   []table64Group\n}\n\nconst table64GroupSize = 4\n\ntype table64Group struct {\n\tkeys   [table64GroupSize]uint64\n\tvalues [table64GroupSize]uint32\n\tbits   uint32\n\t_      uint32\n\t_      uint32\n\t_      uint32\n}\n\nfunc makeTable64(cap int, maxLoad float64) (t table64) {\n\tif maxLoad < 0 || maxLoad > 1 {\n\t\tpanic(\"max load of probing table must be a value between 0 and 1\")\n\t}\n\tif cap < table64GroupSize {\n\t\tcap = table64GroupSize\n\t}\n\tt.init(cap, maxLoad)\n\treturn t\n}\n\nfunc (t *table64) size() int {\n\treturn table64GroupSize * len(t.table)\n}\n\nfunc (t *table64) init(cap int, maxLoad float64) {\n\tsize, maxLen := tableSizeAndMaxLen(table64GroupSize, cap, maxLoad)\n\t*t = table64{\n\t\tmaxLen:  maxLen,\n\t\tmaxLoad: maxLoad,\n\t\tseed:    randSeed(),\n\t\ttable:   make([]table64Group, size),\n\t}\n}\n\nfunc (t *table64) grow(totalValues int) {\n\ttmp := table64{}\n\ttmp.init(totalValues, t.maxLoad)\n\ttmp.len = t.len\n\n\thashes := make([]uintptr, table64GroupSize)\n\tmodulo := uintptr(len(tmp.table)) - 1\n\n\tfor i := range t.table {\n\t\tg := &t.table[i]\n\t\tn := bits.OnesCount32(g.bits)\n\n\t\tif aeshash.Enabled() {\n\t\t\taeshash.MultiHash64(hashes[:n], g.keys[:n], tmp.seed)\n\t\t} else {\n\t\t\twyhash.MultiHash64(hashes[:n], g.keys[:n], tmp.seed)\n\t\t}\n\n\t\tfor j, hash := range hashes[:n] {\n\t\t\tfor {\n\t\t\t\tgroup := 
&tmp.table[hash&modulo]\n\n\t\t\t\tif n := bits.OnesCount32(group.bits); n < table64GroupSize {\n\t\t\t\t\tgroup.bits = (group.bits << 1) | 1\n\t\t\t\t\tgroup.keys[n] = g.keys[j]\n\t\t\t\t\tgroup.values[n] = g.values[j]\n\t\t\t\t\tbreak\n\t\t\t\t}\n\n\t\t\t\thash++\n\t\t\t}\n\t\t}\n\t}\n\n\t*t = tmp\n}\n\nfunc (t *table64) reset() {\n\tt.len = 0\n\n\tfor i := range t.table {\n\t\tt.table[i] = table64Group{}\n\t}\n}\n\nfunc (t *table64) probe(keys []uint64, values []int32) int {\n\treturn t.probeArray(sparse.MakeUint64Array(keys), values)\n}\n\nfunc (t *table64) probeArray(keys sparse.Uint64Array, values []int32) int {\n\tnumKeys := keys.Len()\n\n\tif totalValues := t.len + numKeys; totalValues > t.maxLen {\n\t\tt.grow(totalValues)\n\t}\n\n\tvar hashes [probesPerLoop]uintptr\n\tvar baseLength = t.len\n\tvar useAesHash = aeshash.Enabled()\n\n\t_ = values[:numKeys]\n\n\tfor i := 0; i < numKeys; {\n\t\tj := len(hashes) + i\n\t\tn := len(hashes)\n\n\t\tif j > numKeys {\n\t\t\tj = numKeys\n\t\t\tn = numKeys - i\n\t\t}\n\n\t\tk := keys.Slice(i, j)\n\t\tv := values[i:j:j]\n\t\th := hashes[:n:n]\n\n\t\tif useAesHash {\n\t\t\taeshash.MultiHashUint64Array(h, k, t.seed)\n\t\t} else {\n\t\t\twyhash.MultiHashUint64Array(h, k, t.seed)\n\t\t}\n\n\t\tt.len = multiProbe64(t.table, t.len, h, k, v)\n\t\ti = j\n\t}\n\n\treturn t.len - baseLength\n}\n\nfunc multiProbe64Default(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int {\n\tmodulo := uintptr(len(table)) - 1\n\n\tfor i, hash := range hashes {\n\t\tkey := keys.Index(i)\n\t\tfor {\n\t\t\tgroup := &table[hash&modulo]\n\t\t\tindex := table64GroupSize\n\t\t\tvalue := int32(0)\n\n\t\t\tfor i, k := range group.keys {\n\t\t\t\tif k == key {\n\t\t\t\t\tindex = i\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif n := bits.OnesCount32(group.bits); index < n {\n\t\t\t\tvalue = int32(group.values[index])\n\t\t\t} else {\n\t\t\t\tif n == table64GroupSize 
{\n\t\t\t\t\thash++\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\n\t\t\t\tvalue = int32(numKeys)\n\t\t\t\tgroup.bits = (group.bits << 1) | 1\n\t\t\t\tgroup.keys[n] = key\n\t\t\t\tgroup.values[n] = uint32(value)\n\t\t\t\tnumKeys++\n\t\t\t}\n\n\t\t\tvalues[i] = value\n\t\t\tbreak\n\t\t}\n\t}\n\n\treturn numKeys\n}\n\ntype Uint128Table struct{ table128 }\n\nfunc NewUint128Table(cap int, maxLoad float64) *Uint128Table {\n\treturn &Uint128Table{makeTable128(cap, maxLoad)}\n}\n\nfunc (t *Uint128Table) Reset() { t.reset() }\n\nfunc (t *Uint128Table) Len() int { return t.len }\n\nfunc (t *Uint128Table) Cap() int { return t.cap }\n\nfunc (t *Uint128Table) Probe(keys [][16]byte, values []int32) int {\n\treturn t.probe(keys, values)\n}\n\nfunc (t *Uint128Table) ProbeArray(keys sparse.Uint128Array, values []int32) int {\n\treturn t.probeArray(keys, values)\n}\n\n// table128 is the generic implementation of probing tables for 128 bit types.\n//\n// This table uses the following memory layout:\n//\n//\t[key A][key B][...][value A][value B][...]\n//\n// The table stores values as their actual value plus one, and uses zero as a\n// sentinel to determine whether a slot is occupied. A linear probing strategy\n// is used to resolve conflicts. 
This approach results in at most two memory\n// loads for every four keys being tested, since the location of a key and its\n// corresponding value will not be contiguous on the same CPU cache line, but\n// a cache line can hold four 16 byte keys.\ntype table128 struct {\n\tlen     int\n\tcap     int\n\tmaxLen  int\n\tmaxLoad float64\n\tseed    uintptr\n\ttable   []byte\n}\n\nfunc makeTable128(cap int, maxLoad float64) (t table128) {\n\tif maxLoad < 0 || maxLoad > 1 {\n\t\tpanic(\"max load of probing table must be a value between 0 and 1\")\n\t}\n\tif cap < 8 {\n\t\tcap = 8\n\t}\n\tt.init(cap, maxLoad)\n\treturn t\n}\n\nfunc (t *table128) init(cap int, maxLoad float64) {\n\tsize, maxLen := tableSizeAndMaxLen(1, cap, maxLoad)\n\t*t = table128{\n\t\tcap:     size,\n\t\tmaxLen:  maxLen,\n\t\tmaxLoad: maxLoad,\n\t\tseed:    randSeed(),\n\t\ttable:   make([]byte, 16*size+4*size),\n\t}\n}\n\nfunc (t *table128) kv() (keys [][16]byte, values []int32) {\n\ti := t.cap * 16\n\treturn unsafecast.BytesToUint128(t.table[:i]), unsafecast.BytesToInt32(t.table[i:])\n}\n\nfunc (t *table128) grow(totalValues int) {\n\ttmp := table128{}\n\ttmp.init(totalValues, t.maxLoad)\n\ttmp.len = t.len\n\n\tkeys, values := t.kv()\n\thashes := make([]uintptr, probesPerLoop)\n\tuseAesHash := aeshash.Enabled()\n\n\t_ = values[:len(keys)]\n\n\tfor i := 0; i < len(keys); {\n\t\tj := len(hashes) + i\n\t\tn := len(hashes)\n\n\t\tif j > len(keys) {\n\t\t\tj = len(keys)\n\t\t\tn = len(keys) - i\n\t\t}\n\n\t\th := hashes[:n:n]\n\t\tk := keys[i:j:j]\n\t\tv := values[i:j:j]\n\n\t\tif useAesHash {\n\t\t\taeshash.MultiHash128(h, k, tmp.seed)\n\t\t} else {\n\t\t\twyhash.MultiHash128(h, k, tmp.seed)\n\t\t}\n\n\t\ttmp.insert(h, k, v)\n\t\ti = j\n\t}\n\n\t*t = tmp\n}\n\nfunc (t *table128) insert(hashes []uintptr, keys [][16]byte, values []int32) {\n\ttableKeys, tableValues := t.kv()\n\tmodulo := uintptr(t.cap) - 1\n\n\tfor i, hash := range hashes {\n\t\tfor {\n\t\t\tj := hash & modulo\n\t\t\tv := 
tableValues[j]\n\n\t\t\tif v == 0 {\n\t\t\t\ttableKeys[j] = keys[i]\n\t\t\t\ttableValues[j] = values[i]\n\t\t\t\tbreak\n\t\t\t}\n\n\t\t\thash++\n\t\t}\n\t}\n}\n\nfunc (t *table128) reset() {\n\tt.len = 0\n\n\tfor i := range t.table {\n\t\tt.table[i] = 0\n\t}\n}\n\nfunc (t *table128) probe(keys [][16]byte, values []int32) int {\n\treturn t.probeArray(sparse.MakeUint128Array(keys), values)\n}\n\nfunc (t *table128) probeArray(keys sparse.Uint128Array, values []int32) int {\n\tnumKeys := keys.Len()\n\n\tif totalValues := t.len + numKeys; totalValues > t.maxLen {\n\t\tt.grow(totalValues)\n\t}\n\n\tvar hashes [probesPerLoop]uintptr\n\tvar baseLength = t.len\n\tvar useAesHash = aeshash.Enabled()\n\n\t_ = values[:numKeys]\n\n\tfor i := 0; i < numKeys; {\n\t\tj := len(hashes) + i\n\t\tn := len(hashes)\n\n\t\tif j > numKeys {\n\t\t\tj = numKeys\n\t\t\tn = numKeys - i\n\t\t}\n\n\t\tk := keys.Slice(i, j)\n\t\tv := values[i:j:j]\n\t\th := hashes[:n:n]\n\n\t\tif useAesHash {\n\t\t\taeshash.MultiHashUint128Array(h, k, t.seed)\n\t\t} else {\n\t\t\twyhash.MultiHashUint128Array(h, k, t.seed)\n\t\t}\n\n\t\tt.len = multiProbe128(t.table, t.cap, t.len, h, k, v)\n\t\ti = j\n\t}\n\n\treturn t.len - baseLength\n}\n\nfunc multiProbe128Default(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int {\n\tmodulo := uintptr(tableCap) - 1\n\toffset := uintptr(tableCap) * 16\n\ttableKeys := unsafecast.BytesToUint128(table[:offset])\n\ttableValues := unsafecast.BytesToInt32(table[offset:])\n\n\tfor i, hash := range hashes {\n\t\tkey := keys.Index(i)\n\t\tfor {\n\t\t\tj := hash & modulo\n\t\t\tv := tableValues[j]\n\n\t\t\tif v == 0 {\n\t\t\t\tvalues[i] = int32(tableLen)\n\t\t\t\ttableLen++\n\t\t\t\ttableKeys[j] = key\n\t\t\t\ttableValues[j] = int32(tableLen)\n\t\t\t\tbreak\n\t\t\t}\n\n\t\t\tif key == tableKeys[j] {\n\t\t\t\tvalues[i] = v - 1\n\t\t\t\tbreak\n\t\t\t}\n\n\t\t\thash++\n\t\t}\n\t}\n\n\treturn tableLen\n}\n"
  },
  {
    "path": "hashprobe/hashprobe_amd64.go",
    "content": "//go:build !purego\n\npackage hashprobe\n\nimport (\n\t\"github.com/segmentio/parquet-go/sparse\"\n\t\"golang.org/x/sys/cpu\"\n)\n\n//go:noescape\nfunc multiProbe32AVX2(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int\n\n//go:noescape\nfunc multiProbe64AVX2(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int\n\n//go:noescape\nfunc multiProbe128SSE2(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int\n\nfunc multiProbe32(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int {\n\tif cpu.X86.HasAVX2 {\n\t\treturn multiProbe32AVX2(table, numKeys, hashes, keys, values)\n\t}\n\treturn multiProbe32Default(table, numKeys, hashes, keys, values)\n}\n\nfunc multiProbe64(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int {\n\tif cpu.X86.HasAVX2 {\n\t\treturn multiProbe64AVX2(table, numKeys, hashes, keys, values)\n\t}\n\treturn multiProbe64Default(table, numKeys, hashes, keys, values)\n}\n\nfunc multiProbe128(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int {\n\tif cpu.X86.HasSSE2 {\n\t\treturn multiProbe128SSE2(table, tableCap, tableLen, hashes, keys, values)\n\t}\n\treturn multiProbe128Default(table, tableCap, tableLen, hashes, keys, values)\n}\n"
  },
  {
    "path": "hashprobe/hashprobe_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// This version of the probing algorithm for 32 bit keys takes advantage of\n// the memory layout of table groups and SIMD instructions to accelerate the\n// probing operations.\n//\n// The first 32 bytes of a table group contain the bit mask indicating which\n// slots are in use, and the array of keys, which fits into a single vector\n// register (YMM) and can be loaded and tested with a single instruction.\n//\n// A first version of the table group used the number of keys held in the\n// group instead of a bit mask, which required the probing operation to\n// reconstruct the bit mask during the lookup operation in order to identify\n// which elements of the VPCMPEQD result should be retained. The extra CPU\n// instructions used to reconstruct the bit mask had a measurable overhead.\n// By holding the bit mask in the data structure, we can determine the number\n// of keys in a group using the POPCNT instruction, and avoid recomputing the\n// mask during lookups.\n//\n// func multiProbe32AVX2(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int\nTEXT ·multiProbe32AVX2(SB), NOSPLIT, $0-112\n    MOVQ table_base+0(FP), AX\n    MOVQ table_len+8(FP), BX\n    MOVQ numKeys+24(FP), CX\n    MOVQ hashes_base+32(FP), DX\n    MOVQ hashes_len+40(FP), DI\n    MOVQ keys_array_ptr+56(FP), R8\n    MOVQ keys_array_off+72(FP), R15\n    MOVQ values_base+80(FP), R9\n    DECQ BX // modulo = len(table) - 1\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ (DX)(SI*8), R10  // hash\n    VPBROADCASTD (R8), Y0 // [key]\nprobe:\n    MOVQ R10, R11\n    ANDQ BX, R11 // hash & modulo\n    SHLQ $6, R11 // x 64 (size of table32Group)\n    LEAQ (AX)(R11*1), R12\n\n    VMOVDQU (R12), Y1\n    VPCMPEQD Y0, Y1, Y2\n    VMOVMSKPS Y2, R11\n    MOVL 56(R12), R13\n    TESTL R11, R13\n    JZ insert\n\n    TZCNTL R11, R13\n    MOVL 28(R12)(R13*4), R14\nnext:\n    MOVL R14, (R9)(SI*4)\n    INCQ SI\n   
 ADDQ R15, R8\ntest:\n    CMPQ SI, DI\n    JNE loop\n    MOVQ CX, ret+104(FP)\n    VZEROUPPER\n    RET\ninsert:\n    CMPL R13, $0b1111111\n    JE probeNextGroup\n\n    MOVL R13, R11\n    POPCNTL R13, R13\n    MOVQ X0, R14 // key\n    SHLL $1, R11\n    ORL $1, R11\n    MOVL R11, 56(R12)       // group.len = (group.len << 1) | 1\n    MOVL R14, (R12)(R13*4)  // group.keys[i] = key\n    MOVL CX, 28(R12)(R13*4) // group.values[i] = value\n    MOVL CX, R14\n    INCL CX\n    JMP next\nprobeNextGroup:\n    INCQ R10\n    JMP probe\n\n// func multiProbe64AVX2(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int\nTEXT ·multiProbe64AVX2(SB), NOSPLIT, $0-112\n    MOVQ table_base+0(FP), AX\n    MOVQ table_len+8(FP), BX\n    MOVQ numKeys+24(FP), CX\n    MOVQ hashes_base+32(FP), DX\n    MOVQ hashes_len+40(FP), DI\n    MOVQ keys_array_ptr+56(FP), R8\n    MOVQ keys_array_off+72(FP), R15\n    MOVQ values_base+80(FP), R9\n    DECQ BX // modulo = len(table) - 1\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ (DX)(SI*8), R10        // hash\n    VPBROADCASTQ (R8), Y0 // [key]\nprobe:\n    MOVQ R10, R11\n    ANDQ BX, R11 // hash & modulo\n    SHLQ $6, R11 // x 64 (size of table64Group)\n    LEAQ (AX)(R11*1), R12\n\n    VMOVDQU (R12), Y1\n    VPCMPEQQ Y0, Y1, Y2\n    VMOVMSKPD Y2, R11\n    MOVL 48(R12), R13\n    TESTL R11, R13\n    JZ insert\n\n    TZCNTL R11, R13\n    MOVL 32(R12)(R13*4), R14\nnext:\n    MOVL R14, (R9)(SI*4)\n    INCQ SI\n    ADDQ R15, R8\ntest:\n    CMPQ SI, DI\n    JNE loop\n    MOVQ CX, ret+104(FP)\n    VZEROUPPER\n    RET\ninsert:\n    CMPL R13, $0b1111\n    JE probeNextGroup\n\n    MOVL R13, R11\n    POPCNTL R13, R13\n    SHLL $1, R11\n    ORL $1, R11\n    MOVL R11, 48(R12)       // group.len = (group.len << 1) | 1\n    MOVQ X0, (R12)(R13*8)   // group.keys[i] = key\n    MOVL CX, 32(R12)(R13*4) // group.values[i] = value\n    MOVL CX, R14\n    INCL CX\n    JMP next\nprobeNextGroup:\n    INCQ R10\n    JMP probe\n\n// func 
multiProbe128SSE2(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int\nTEXT ·multiProbe128SSE2(SB), NOSPLIT, $0-120\n    MOVQ table_base+0(FP), AX\n    MOVQ tableCap+24(FP), BX\n    MOVQ tableLen+32(FP), CX\n    MOVQ hashes_base+40(FP), DX\n    MOVQ hashes_len+48(FP), DI\n    MOVQ keys_array_ptr+64(FP), R8\n    MOVQ keys_array_off+80(FP), R15\n    MOVQ values_base+88(FP), R9\n\n    MOVQ BX, R10\n    SHLQ $4, R10\n    LEAQ (AX)(R10*1), R10\n    DECQ BX // modulo = tableCap - 1\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ (DX)(SI*8), R11 // hash\n    MOVOU (R8), X0       // key\nprobe:\n    MOVQ R11, R12\n    ANDQ BX, R12\n\n    MOVL (R10)(R12*4), R14\n    CMPL R14, $0\n    JE insert\n\n    SHLQ $4, R12\n    MOVOU (AX)(R12*1), X1\n    PCMPEQL X0, X1\n    MOVMSKPS X1, R13\n    CMPL R13, $0b1111\n    JE next\n\n    INCQ R11\n    JMP probe\nnext:\n    DECL R14\n    MOVL R14, (R9)(SI*4)\n    INCQ SI\n    ADDQ R15, R8\ntest:\n    CMPQ SI, DI\n    JNE loop\n    MOVQ CX, ret+112(FP)\n    RET\ninsert:\n    INCL CX\n    MOVL CX, (R10)(R12*4)\n    MOVL CX, R14\n    SHLQ $4, R12\n    MOVOU X0, (AX)(R12*1)\n    JMP next\n"
  },
  {
    "path": "hashprobe/hashprobe_purego.go",
    "content": "//go:build purego || !amd64\n\npackage hashprobe\n\nimport (\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nfunc multiProbe32(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int {\n\treturn multiProbe32Default(table, numKeys, hashes, keys, values)\n}\n\nfunc multiProbe64(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int {\n\treturn multiProbe64Default(table, numKeys, hashes, keys, values)\n}\n\nfunc multiProbe128(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int {\n\treturn multiProbe128Default(table, tableCap, tableLen, hashes, keys, values)\n}\n"
  },
  {
    "path": "hashprobe/hashprobe_test.go",
    "content": "package hashprobe\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"math/rand\"\n\t\"testing\"\n\t\"time\"\n\t\"unsafe\"\n)\n\nfunc TestTable32GroupSize(t *testing.T) {\n\tif n := unsafe.Sizeof(table32Group{}); n != 64 {\n\t\tt.Errorf(\"size of 32 bit table group is not 64 bytes: %d\", n)\n\t}\n}\n\nfunc TestUint32TableProbeOneByOne(t *testing.T) {\n\tconst N = 500\n\ttable := NewUint32Table(0, 0.9)\n\n\tfor n := 0; n < 2; n++ {\n\t\t// Do two passes, both should behave the same.\n\t\tfor i := 1; i <= N; i++ {\n\t\t\tk := [1]uint32{}\n\t\t\tv := [1]int32{}\n\n\t\t\tk[0] = uint32(i)\n\t\t\ttable.Probe(k[:], v[:])\n\n\t\t\tif v[0] != int32(i-1) {\n\t\t\t\tt.Errorf(\"wrong value probed for key=%d: want=%d got=%d\", i, i-1, v[0])\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc TestUint32TableProbeBulk(t *testing.T) {\n\tconst N = 999\n\ttable := NewUint32Table(0, 0.9)\n\n\tk := make([]uint32, N)\n\tv := make([]int32, N)\n\n\tfor i := range k {\n\t\tk[i] = uint32(i)\n\t}\n\n\tfor n := 0; n < 2; n++ {\n\t\ttable.Probe(k, v)\n\n\t\tfor i := range v {\n\t\t\tif v[i] != int32(i) {\n\t\t\t\tt.Errorf(\"wrong value probed for key=%d: want=%d got=%d\", k[i], i, v[i])\n\t\t\t}\n\t\t}\n\n\t\tif t.Failed() {\n\t\t\tbreak\n\t\t}\n\n\t\tfor i := range v {\n\t\t\tv[i] = 0\n\t\t}\n\t}\n}\n\nfunc TestTable64GroupSize(t *testing.T) {\n\tif n := unsafe.Sizeof(table64Group{}); n != 64 {\n\t\tt.Errorf(\"size of 64 bit table group is not 64 bytes: %d\", n)\n\t}\n}\n\nfunc TestUint64TableProbeOneByOne(t *testing.T) {\n\tconst N = 500\n\ttable := NewUint64Table(0, 0.9)\n\n\tfor n := 0; n < 2; n++ {\n\t\t// Do two passes, both should behave the same.\n\t\tfor i := 1; i <= N; i++ {\n\t\t\tk := [1]uint64{}\n\t\t\tv := [1]int32{}\n\n\t\t\tk[0] = uint64(i)\n\t\t\ttable.Probe(k[:], v[:])\n\n\t\t\tif v[0] != int32(i-1) {\n\t\t\t\tt.Errorf(\"wrong value probed for key=%d: want=%d got=%d\", i, i-1, v[0])\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc TestUint64TableProbeBulk(t *testing.T) {\n\tconst N = 
999\n\ttable := NewUint64Table(0, 0.9)\n\n\tk := make([]uint64, N)\n\tv := make([]int32, N)\n\n\tfor i := range k {\n\t\tk[i] = uint64(i)\n\t}\n\n\tfor n := 0; n < 2; n++ {\n\t\ttable.Probe(k, v)\n\n\t\tfor i := range v {\n\t\t\tif v[i] != int32(i) {\n\t\t\t\tt.Errorf(\"wrong value probed for key=%d: want=%d got=%d\", k[i], i, v[i])\n\t\t\t}\n\t\t}\n\n\t\tif t.Failed() {\n\t\t\tbreak\n\t\t}\n\n\t\tfor i := range v {\n\t\t\tv[i] = 0\n\t\t}\n\t}\n}\n\nfunc TestUint128TableProbeOneByOne(t *testing.T) {\n\tconst N = 500\n\ttable := NewUint128Table(0, 0.9)\n\n\tfor n := 0; n < 2; n++ {\n\t\t// Do two passes, both should behave the same.\n\t\tfor i := 1; i <= N; i++ {\n\t\t\tk := [1][16]byte{}\n\t\t\tv := [1]int32{}\n\n\t\t\tbinary.LittleEndian.PutUint64(k[0][:8], uint64(i))\n\t\t\ttable.Probe(k[:], v[:])\n\n\t\t\tif v[0] != int32(i-1) {\n\t\t\t\tt.Errorf(\"wrong value probed for key=%x: want=%d got=%d\", i, i-1, v[0])\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc TestUint128TableProbeBulk(t *testing.T) {\n\tconst N = 999\n\ttable := NewUint128Table(0, 0.9)\n\n\tk := make([][16]byte, N)\n\tv := make([]int32, N)\n\n\tfor i := range k {\n\t\tbinary.LittleEndian.PutUint64(k[i][:8], uint64(i))\n\t}\n\n\tfor n := 0; n < 2; n++ {\n\t\ttable.Probe(k, v)\n\n\t\tfor i := range v {\n\t\t\tif v[i] != int32(i) {\n\t\t\t\tt.Errorf(\"wrong value probed for key=%x: want=%d got=%d\", k[i], i, v[i])\n\t\t\t}\n\t\t}\n\n\t\tif t.Failed() {\n\t\t\tbreak\n\t\t}\n\n\t\tfor i := range v {\n\t\t\tv[i] = 0\n\t\t}\n\t}\n}\n\nconst (\n\tbenchmarkProbesPerLoop = 500\n\tbenchmarkMaxLoad       = 0.9\n)\n\ntype uint32Table interface {\n\tReset()\n\tLen() int\n\tProbe([]uint32, []int32) int\n}\n\ntype uint32Map map[uint32]int32\n\nfunc (m uint32Map) Reset() {\n\tfor k := range m {\n\t\tdelete(m, k)\n\t}\n}\n\nfunc (m uint32Map) Len() int {\n\treturn len(m)\n}\n\nfunc (m uint32Map) Probe(keys []uint32, values []int32) (n int) {\n\t_ = values[:len(keys)]\n\n\tfor i, k := range keys {\n\t\tv, ok := m[k]\n\t\tif !ok 
{\n\t\t\tv = int32(len(m))\n\t\t\tm[k] = v\n\t\t\tn++\n\t\t}\n\t\tvalues[i] = v\n\t}\n\n\treturn n\n}\n\nfunc BenchmarkUint32Table(b *testing.B) {\n\tbenchmarkUint32Table(b, func(size int) uint32Table { return NewUint32Table(size, benchmarkMaxLoad) })\n}\n\nfunc BenchmarkGoUint32Map(b *testing.B) {\n\tbenchmarkUint32Table(b, func(size int) uint32Table { return make(uint32Map, size) })\n}\n\nfunc benchmarkUint32Table(b *testing.B, newTable func(size int) uint32Table) {\n\tfor n := 100; n <= 1e6; n *= 10 {\n\t\ttable := newTable(0)\n\t\tkeys, values := generateUint32Table(n)\n\n\t\tb.Run(fmt.Sprintf(\"N=%d\", n), func(b *testing.B) {\n\t\t\tbenchmarkUint32Loop(b, table.Probe, keys, values)\n\t\t})\n\t}\n}\n\nfunc benchmarkUint32Loop(b *testing.B, f func([]uint32, []int32) int, keys []uint32, values []int32) {\n\ti := 0\n\tj := benchmarkProbesPerLoop\n\tb.SetBytes(4 * int64(benchmarkProbesPerLoop))\n\n\t_ = keys[:len(values)]\n\t_ = values[:len(keys)]\n\tstart := time.Now()\n\n\tfor k := 0; k < b.N; k++ {\n\t\tif j > len(keys) {\n\t\t\tj = len(keys)\n\t\t}\n\t\tf(keys[i:j:j], values[i:j:j])\n\t\tif j == len(keys) {\n\t\t\ti, j = 0, benchmarkProbesPerLoop\n\t\t} else {\n\t\t\ti, j = j, j+benchmarkProbesPerLoop\n\t\t}\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(benchmarkProbesPerLoop*b.N)/seconds, \"probe/s\")\n}\n\nfunc generateUint32Table(n int) ([]uint32, []int32) {\n\tprng := rand.New(rand.NewSource(int64(n)))\n\tkeys := make([]uint32, n)\n\tvalues := make([]int32, n)\n\n\tfor i := range keys {\n\t\tkeys[i] = prng.Uint32()\n\t}\n\n\treturn keys, values\n}\n\ntype uint64Table interface {\n\tReset()\n\tLen() int\n\tProbe([]uint64, []int32) int\n}\n\ntype uint64Map map[uint64]int32\n\nfunc (m uint64Map) Reset() {\n\tfor k := range m {\n\t\tdelete(m, k)\n\t}\n}\n\nfunc (m uint64Map) Len() int {\n\treturn len(m)\n}\n\nfunc (m uint64Map) Probe(keys []uint64, values []int32) (n int) {\n\t_ = values[:len(keys)]\n\n\tfor i, k := range keys 
{\n\t\tv, ok := m[k]\n\t\tif !ok {\n\t\t\tv = int32(len(m))\n\t\t\tm[k] = v\n\t\t\tn++\n\t\t}\n\t\tvalues[i] = v\n\t}\n\n\treturn n\n}\n\nfunc BenchmarkUint64Table(b *testing.B) {\n\tbenchmarkUint64Table(b, func(size int) uint64Table { return NewUint64Table(size, benchmarkMaxLoad) })\n}\n\nfunc BenchmarkGoUint64Map(b *testing.B) {\n\tbenchmarkUint64Table(b, func(size int) uint64Table { return make(uint64Map, size) })\n}\n\nfunc benchmarkUint64Table(b *testing.B, newTable func(size int) uint64Table) {\n\tfor n := 100; n <= 1e6; n *= 10 {\n\t\ttable := newTable(0)\n\t\tkeys, values := generateUint64Table(n)\n\n\t\tb.Run(fmt.Sprintf(\"N=%d\", n), func(b *testing.B) {\n\t\t\tbenchmarkUint64Loop(b, table.Probe, keys, values)\n\t\t})\n\t}\n}\n\nfunc benchmarkUint64Loop(b *testing.B, f func([]uint64, []int32) int, keys []uint64, values []int32) {\n\ti := 0\n\tj := benchmarkProbesPerLoop\n\tb.SetBytes(8 * int64(benchmarkProbesPerLoop))\n\n\t_ = keys[:len(values)]\n\t_ = values[:len(keys)]\n\tstart := time.Now()\n\n\tfor k := 0; k < b.N; k++ {\n\t\tif j > len(keys) {\n\t\t\tj = len(keys)\n\t\t}\n\t\tf(keys[i:j:j], values[i:j:j])\n\t\tif j == len(keys) {\n\t\t\ti, j = 0, benchmarkProbesPerLoop\n\t\t} else {\n\t\t\ti, j = j, j+benchmarkProbesPerLoop\n\t\t}\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(benchmarkProbesPerLoop*b.N)/seconds, \"probe/s\")\n}\n\nfunc generateUint64Table(n int) ([]uint64, []int32) {\n\tprng := rand.New(rand.NewSource(int64(n)))\n\tkeys := make([]uint64, n)\n\tvalues := make([]int32, n)\n\n\tfor i := range keys {\n\t\tkeys[i] = prng.Uint64()\n\t}\n\n\treturn keys, values\n}\n\ntype uint128Table interface {\n\tReset()\n\tLen() int\n\tProbe([][16]byte, []int32) int\n}\n\ntype uint128Map map[[16]byte]int32\n\nfunc (m uint128Map) Reset() {\n\tfor k := range m {\n\t\tdelete(m, k)\n\t}\n}\n\nfunc (m uint128Map) Len() int {\n\treturn len(m)\n}\n\nfunc (m uint128Map) Probe(keys [][16]byte, values []int32) (n int) {\n\t_ = 
values[:len(keys)]\n\n\tfor i, k := range keys {\n\t\tv, ok := m[k]\n\t\tif !ok {\n\t\t\tv = int32(len(m))\n\t\t\tm[k] = v\n\t\t\tn++\n\t\t}\n\t\tvalues[i] = v\n\t}\n\n\treturn n\n}\n\nfunc BenchmarkUint128Table(b *testing.B) {\n\tbenchmarkUint128Table(b, func(size int) uint128Table { return NewUint128Table(size, benchmarkMaxLoad) })\n}\n\nfunc BenchmarkGoUint128Map(b *testing.B) {\n\tbenchmarkUint128Table(b, func(size int) uint128Table { return make(uint128Map, size) })\n}\n\nfunc benchmarkUint128Table(b *testing.B, newTable func(size int) uint128Table) {\n\tfor n := 100; n <= 1e6; n *= 10 {\n\t\ttable := newTable(0)\n\t\tkeys, values := generateUint128Table(n)\n\n\t\tb.Run(fmt.Sprintf(\"N=%d\", n), func(b *testing.B) {\n\t\t\tbenchmarkUint128Loop(b, table.Probe, keys, values)\n\t\t})\n\t}\n}\n\nfunc benchmarkUint128Loop(b *testing.B, f func([][16]byte, []int32) int, keys [][16]byte, values []int32) {\n\ti := 0\n\tj := benchmarkProbesPerLoop\n\tb.SetBytes(16 * int64(benchmarkProbesPerLoop))\n\n\t_ = keys[:len(values)]\n\t_ = values[:len(keys)]\n\tstart := time.Now()\n\n\tfor k := 0; k < b.N; k++ {\n\t\tif j > len(keys) {\n\t\t\tj = len(keys)\n\t\t}\n\t\tf(keys[i:j:j], values[i:j:j])\n\t\tif j == len(keys) {\n\t\t\ti, j = 0, benchmarkProbesPerLoop\n\t\t} else {\n\t\t\ti, j = j, j+benchmarkProbesPerLoop\n\t\t}\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(benchmarkProbesPerLoop*b.N)/seconds, \"probe/s\")\n}\n\nfunc generateUint128Table(n int) ([][16]byte, []int32) {\n\tprng := rand.New(rand.NewSource(int64(n)))\n\tkeys := make([][16]byte, n)\n\tvalues := make([]int32, n)\n\n\tfor i := range keys {\n\t\tprng.Read(keys[i][:])\n\t}\n\n\treturn keys, values\n}\n"
  },
  {
    "path": "hashprobe/wyhash/wyhash.go",
    "content": "// Package wyhash implements a hashing algorithm derived from the Go runtime's\n// internal hashing fallback, which uses a variation of the wyhash algorithm.\npackage wyhash\n\nimport (\n\t\"encoding/binary\"\n\t\"math/bits\"\n\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nconst (\n\tm1 = 0xa0761d6478bd642f\n\tm2 = 0xe7037ed1a0b428db\n\tm3 = 0x8ebc6af09c88c6e3\n\tm4 = 0x589965cc75374cc3\n\tm5 = 0x1d8e4e27c47d124f\n)\n\nfunc mix(a, b uint64) uint64 {\n\thi, lo := bits.Mul64(a, b)\n\treturn hi ^ lo\n}\n\nfunc Hash32(value uint32, seed uintptr) uintptr {\n\treturn uintptr(mix(m5^4, mix(uint64(value)^m2, uint64(value)^uint64(seed)^m1)))\n}\n\nfunc Hash64(value uint64, seed uintptr) uintptr {\n\treturn uintptr(mix(m5^8, mix(value^m2, value^uint64(seed)^m1)))\n}\n\nfunc Hash128(value [16]byte, seed uintptr) uintptr {\n\ta := binary.LittleEndian.Uint64(value[:8])\n\tb := binary.LittleEndian.Uint64(value[8:])\n\treturn uintptr(mix(m5^16, mix(a^m2, b^uint64(seed)^m1)))\n}\n\nfunc MultiHash32(hashes []uintptr, values []uint32, seed uintptr) {\n\tMultiHashUint32Array(hashes, sparse.MakeUint32Array(values), seed)\n}\n\nfunc MultiHash64(hashes []uintptr, values []uint64, seed uintptr) {\n\tMultiHashUint64Array(hashes, sparse.MakeUint64Array(values), seed)\n}\n\nfunc MultiHash128(hashes []uintptr, values [][16]byte, seed uintptr) {\n\tMultiHashUint128Array(hashes, sparse.MakeUint128Array(values), seed)\n}\n"
  },
  {
    "path": "hashprobe/wyhash/wyhash_amd64.go",
    "content": "//go:build !purego\n\npackage wyhash\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\n//go:noescape\nfunc MultiHashUint32Array(hashes []uintptr, values sparse.Uint32Array, seed uintptr)\n\n//go:noescape\nfunc MultiHashUint64Array(hashes []uintptr, values sparse.Uint64Array, seed uintptr)\n\n//go:noescape\nfunc MultiHashUint128Array(hashes []uintptr, values sparse.Uint128Array, seed uintptr)\n"
  },
  {
    "path": "hashprobe/wyhash/wyhash_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define m1 0xa0761d6478bd642f\n#define m2 0xe7037ed1a0b428db\n#define m3 0x8ebc6af09c88c6e3\n#define m4 0x589965cc75374cc3\n#define m5 0x1d8e4e27c47d124f\n\n// func MultiHashUint32Array(hashes []uintptr, values sparse.Uint32Array, seed uintptr)\nTEXT ·MultiHashUint32Array(SB), NOSPLIT, $0-56\n    MOVQ hashes_base+0(FP), R12\n    MOVQ values_array_ptr+24(FP), R13\n    MOVQ values_array_len+32(FP), R14\n    MOVQ values_array_off+40(FP), R15\n    MOVQ seed+48(FP), R11\n\n    MOVQ $m1, R8\n    MOVQ $m2, R9\n    MOVQ $m5^4, R10\n    XORQ R11, R8\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVL (R13), AX\n    MOVQ R8, BX\n\n    XORQ AX, BX\n    XORQ R9, AX\n\n    MULQ BX\n    XORQ DX, AX\n\n    MULQ R10\n    XORQ DX, AX\n\n    MOVQ AX, (R12)(SI*8)\n    INCQ SI\n    ADDQ R15, R13\ntest:\n    CMPQ SI, R14\n    JNE loop\n    RET\n\n// func MultiHashUint64Array(hashes []uintptr, values sparse.Uint64Array, seed uintptr)\nTEXT ·MultiHashUint64Array(SB), NOSPLIT, $0-56\n    MOVQ hashes_base+0(FP), R12\n    MOVQ values_array_ptr+24(FP), R13\n    MOVQ values_array_len+32(FP), R14\n    MOVQ values_array_off+40(FP), R15\n    MOVQ seed+48(FP), R11\n\n    MOVQ $m1, R8\n    MOVQ $m2, R9\n    MOVQ $m5^8, R10\n    XORQ R11, R8\n\n    XORQ SI, SI\n    JMP test\nloop:\n    MOVQ (R13), AX\n    MOVQ R8, BX\n\n    XORQ AX, BX\n    XORQ R9, AX\n\n    MULQ BX\n    XORQ DX, AX\n\n    MULQ R10\n    XORQ DX, AX\n\n    MOVQ AX, (R12)(SI*8)\n    INCQ SI\n    ADDQ R15, R13\ntest:\n    CMPQ SI, R14\n    JNE loop\n    RET\n\n// func MultiHashUint128Array(hashes []uintptr, values sparse.Uint128Array, seed uintptr)\nTEXT ·MultiHashUint128Array(SB), NOSPLIT, $0-56\n    MOVQ hashes_base+0(FP), R12\n    MOVQ values_array_ptr+24(FP), R13\n    MOVQ values_array_len+32(FP), R14\n    MOVQ values_array_off+40(FP), R15\n    MOVQ seed+48(FP), R11\n\n    MOVQ $m1, R8\n    MOVQ $m2, R9\n    MOVQ $m5^16, R10\n    XORQ R11, R8\n\n    XORQ SI, SI\n    
JMP test\nloop:\n    MOVQ 0(R13), AX\n    MOVQ 8(R13), DX\n    MOVQ R8, BX\n\n    XORQ DX, BX\n    XORQ R9, AX\n\n    MULQ BX\n    XORQ DX, AX\n\n    MULQ R10\n    XORQ DX, AX\n\n    MOVQ AX, (R12)(SI*8)\n    INCQ SI\n    ADDQ R15, R13\ntest:\n    CMPQ SI, R14\n    JNE loop\n    RET\n"
  },
  {
    "path": "hashprobe/wyhash/wyhash_purego.go",
    "content": "//go:build purego || !amd64\n\npackage wyhash\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\nfunc MultiHashUint32Array(hashes []uintptr, values sparse.Uint32Array, seed uintptr) {\n\tfor i := range hashes {\n\t\thashes[i] = Hash32(values.Index(i), seed)\n\t}\n}\n\nfunc MultiHashUint64Array(hashes []uintptr, values sparse.Uint64Array, seed uintptr) {\n\tfor i := range hashes {\n\t\thashes[i] = Hash64(values.Index(i), seed)\n\t}\n}\n\nfunc MultiHashUint128Array(hashes []uintptr, values sparse.Uint128Array, seed uintptr) {\n\tfor i := range hashes {\n\t\thashes[i] = Hash128(values.Index(i), seed)\n\t}\n}\n"
  },
  {
    "path": "hashprobe/wyhash/wyhash_test.go",
    "content": "package wyhash\n\nimport (\n\t\"encoding/binary\"\n\t\"math/rand\"\n\t\"testing\"\n\t\"time\"\n)\n\nfunc TestHash32(t *testing.T) {\n\tif h := Hash32(42, 1); h != 0xda93b6f668a0496e {\n\t\tt.Errorf(\"hash mismatch: %08x\", h)\n\t}\n}\n\nfunc TestMultiHash32(t *testing.T) {\n\tconst N = 10\n\thashes := [N]uintptr{}\n\tvalues := [N]uint32{}\n\tseed := uintptr(32)\n\n\tfor i := range values {\n\t\tvalues[i] = uint32(i)\n\t}\n\n\tMultiHash32(hashes[:], values[:], seed)\n\n\tfor i := range values {\n\t\th := Hash32(values[i], seed)\n\n\t\tif h != hashes[i] {\n\t\t\tt.Errorf(\"hash(%d): want=%08x got=%08x\", values[i], h, hashes[i])\n\t\t}\n\t}\n}\n\nfunc BenchmarkHash32(b *testing.B) {\n\tb.SetBytes(8)\n\tvalue := rand.Uint32()\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\tvalue = uint32(Hash32(value, seed))\n\t\treturn 1\n\t})\n}\n\nfunc BenchmarkMultiHash32(b *testing.B) {\n\thashes := [512]uintptr{}\n\tvalues := [512]uint32{}\n\tb.SetBytes(4 * int64(len(hashes)))\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\tMultiHash32(hashes[:], values[:], seed)\n\t\treturn len(hashes)\n\t})\n}\n\nfunc TestHash64(t *testing.T) {\n\tif h := Hash64(42, 1); h != 0x6e69a6ede6b5a25e {\n\t\tt.Errorf(\"hash mismatch: %016x\", h)\n\t}\n}\n\nfunc TestMultiHash64(t *testing.T) {\n\tconst N = 10\n\thashes := [N]uintptr{}\n\tvalues := [N]uint64{}\n\tseed := uintptr(64)\n\n\tfor i := range values {\n\t\tvalues[i] = uint64(i)\n\t}\n\n\tMultiHash64(hashes[:], values[:], seed)\n\n\tfor i := range values {\n\t\th := Hash64(values[i], seed)\n\n\t\tif h != hashes[i] {\n\t\t\tt.Errorf(\"hash(%d): want=%016x got=%016x\", values[i], h, hashes[i])\n\t\t}\n\t}\n}\n\nfunc BenchmarkHash64(b *testing.B) {\n\tb.SetBytes(8)\n\tvalue := rand.Uint64()\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\tvalue = uint64(Hash64(value, seed))\n\t\treturn 1\n\t})\n}\n\nfunc BenchmarkMultiHash64(b *testing.B) {\n\thashes := [512]uintptr{}\n\tvalues := 
[512]uint64{}\n\tb.SetBytes(8 * int64(len(hashes)))\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\tMultiHash64(hashes[:], values[:], seed)\n\t\treturn len(hashes)\n\t})\n}\n\nfunc TestHash128(t *testing.T) {\n\tif h := Hash128([16]byte{0: 42}, 1); h != 0xcd09fcdae9a79e7c {\n\t\tt.Errorf(\"hash mismatch: %016x\", h)\n\t}\n}\n\nfunc TestMultiHash128(t *testing.T) {\n\tconst N = 10\n\thashes := [N]uintptr{}\n\tvalues := [N][16]byte{}\n\tseed := uintptr(64)\n\n\tfor i := range values {\n\t\tbinary.LittleEndian.PutUint64(values[i][:8], uint64(i))\n\t}\n\n\tMultiHash128(hashes[:], values[:], seed)\n\n\tfor i := range values {\n\t\th := Hash128(values[i], seed)\n\n\t\tif h != hashes[i] {\n\t\t\tt.Errorf(\"hash(%d): want=%016x got=%016x\", values[i], h, hashes[i])\n\t\t}\n\t}\n}\n\nfunc BenchmarkHash128(b *testing.B) {\n\tb.SetBytes(8)\n\thash := uintptr(0)\n\tvalue := [16]byte{}\n\tbinary.LittleEndian.PutUint64(value[:8], rand.Uint64())\n\tbinary.LittleEndian.PutUint64(value[8:], rand.Uint64())\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\thash = Hash128(value, seed)\n\t\treturn 1\n\t})\n\t_ = hash\n}\n\nfunc BenchmarkMultiHash128(b *testing.B) {\n\thashes := [512]uintptr{}\n\tvalues := [512][16]byte{}\n\tb.SetBytes(16 * int64(len(hashes)))\n\tbenchmarkHashThroughput(b, func(seed uintptr) int {\n\t\tMultiHash128(hashes[:], values[:], seed)\n\t\treturn len(hashes)\n\t})\n}\n\nfunc benchmarkHashThroughput(b *testing.B, f func(seed uintptr) int) {\n\thashes := int64(0)\n\tstart := time.Now()\n\n\tfor i := 0; i < b.N; i++ {\n\t\thashes += int64(f(uintptr(i)))\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(hashes)/seconds, \"hash/s\")\n}\n"
  },
  {
    "path": "internal/bitpack/bitpack.go",
    "content": "// Package bitpack implements efficient bit packing and unpacking routines for\n// integers of various bit widths.\npackage bitpack\n\n// ByteCount returns the number of bytes needed to hold the given bit count.\nfunc ByteCount(bitCount uint) int {\n\treturn int((bitCount + 7) / 8)\n}\n"
  },
  {
    "path": "internal/bitpack/masks_int32_amd64.s",
"content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// -----------------------------------------------------------------------------\n// Shuffle masks used to broadcast bytes of bit-packed values into vector\n// registers at positions where they can then be shifted into the right\n// locations.\n// -----------------------------------------------------------------------------\n\n// Shuffle masks for unpacking values from bit widths 1 to 16.\n//\n// The masks are grouped in 32 bytes chunks containing 2 masks of 16 bytes, with\n// the following layout:\n//\n// - The first mask is used to shuffle values from the 16 bytes of input into\n//   the lower 16 bytes of output. These values are then shifted RIGHT to be\n//   aligned on the beginning of each 32 bit word.\n//\n// - The second mask selects values from the 16 bytes of input into the upper\n//   16 bytes of output. These values are then shifted RIGHT to be aligned on\n//   the beginning of each 32 bit word.\n//\n// The bit width is intended to be used as an index into this array, using this\n// formula to convert from the index to a byte offset:\n//\n//      offset = 32 * (bitWidth - 1)\n//\nGLOBL ·shuffleInt32x1to16bits(SB), RODATA|NOPTR, $512\n\n// 1 bit => 32 bits\n// -----------------\n// 0: [a,b,c,d,e,f,g,h]\n// ...\nDATA ·shuffleInt32x1to16bits+0+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+0+4(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+0+8(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+0+12(SB)/4, $0x80808000\n\nDATA ·shuffleInt32x1to16bits+0+16(SB)/4, $0x80808000\nDATA ·shuffleInt32x1to16bits+0+20(SB)/4, $0x80808000\nDATA ·shuffleInt32x1to16bits+0+24(SB)/4, $0x80808000\nDATA ·shuffleInt32x1to16bits+0+28(SB)/4, $0x80808000\n\n// 2 bits => 32 bits\n// -----------------\n// 0: [a,a,b,b,c,c,d,d]\n// 1: [e,e,f,f,g,g,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+32+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+32+4(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+32+8(SB)/4,  
$0x80808000\nDATA ·shuffleInt32x1to16bits+32+12(SB)/4, $0x80808000\n\nDATA ·shuffleInt32x1to16bits+32+16(SB)/4, $0x80808001\nDATA ·shuffleInt32x1to16bits+32+20(SB)/4, $0x80808001\nDATA ·shuffleInt32x1to16bits+32+24(SB)/4, $0x80808001\nDATA ·shuffleInt32x1to16bits+32+28(SB)/4, $0x80808001\n\n// 3 bits => 32 bits\n// -----------------\n// 0: [a,a,a,b,b,b,c,c]\n// 1: [c,d,d,d,e,e,e,f]\n// 2: [f,f,g,g,g,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+64+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+64+4(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+64+8(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+64+12(SB)/4, $0x80808001\n\nDATA ·shuffleInt32x1to16bits+64+16(SB)/4, $0x80808001\nDATA ·shuffleInt32x1to16bits+64+20(SB)/4, $0x80800201\nDATA ·shuffleInt32x1to16bits+64+24(SB)/4, $0x80808002\nDATA ·shuffleInt32x1to16bits+64+28(SB)/4, $0x80808002\n\n// 4 bits => 32 bits\n// -----------------\n// 0: [a,a,a,a,b,b,b,b]\n// 1: [c,c,c,c,d,d,d,d]\n// 2: [e,e,e,e,f,f,f,f]\n// 3: [g,g,g,g,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+96+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+96+4(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+96+8(SB)/4,  $0x80808001\nDATA ·shuffleInt32x1to16bits+96+12(SB)/4, $0x80808001\n\nDATA ·shuffleInt32x1to16bits+96+16(SB)/4, $0x80808002\nDATA ·shuffleInt32x1to16bits+96+20(SB)/4, $0x80808002\nDATA ·shuffleInt32x1to16bits+96+24(SB)/4, $0x80808003\nDATA ·shuffleInt32x1to16bits+96+28(SB)/4, $0x80808003\n\n// 5 bits => 32 bits\n// -----------------\n// 0: [a,a,a,a,a,b,b,b]\n// 1: [b,b,c,c,c,c,c,d]\n// 2: [d,d,d,d,e,e,e,e]\n// 3: [e,f,f,f,f,f,g,g]\n// 4: [g,g,g,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+128+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+128+4(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+128+8(SB)/4,  $0x80808001\nDATA ·shuffleInt32x1to16bits+128+12(SB)/4, $0x80800201\n\nDATA ·shuffleInt32x1to16bits+128+16(SB)/4, $0x80800302\nDATA ·shuffleInt32x1to16bits+128+20(SB)/4, $0x80808003\nDATA 
·shuffleInt32x1to16bits+128+24(SB)/4, $0x80800403\nDATA ·shuffleInt32x1to16bits+128+28(SB)/4, $0x80808004\n\n// 6 bits => 32 bits\n// -----------------\n// 0: [a,a,a,a,a,a,b,b]\n// 1: [b,b,b,b,c,c,c,c]\n// 2: [c,c,d,d,d,d,d,d]\n// 3: [e,e,e,e,e,e,f,f]\n// 4: [f,f,f,f,g,g,g,g]\n// 5: [g,g,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+160+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+160+4(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+160+8(SB)/4,  $0x80800201\nDATA ·shuffleInt32x1to16bits+160+12(SB)/4, $0x80808002\n\nDATA ·shuffleInt32x1to16bits+160+16(SB)/4, $0x80808003\nDATA ·shuffleInt32x1to16bits+160+20(SB)/4, $0x80800403\nDATA ·shuffleInt32x1to16bits+160+24(SB)/4, $0x80800504\nDATA ·shuffleInt32x1to16bits+160+28(SB)/4, $0x80808005\n\n// 7 bits => 32 bits\n// -----------------\n// 0: [a,a,a,a,a,a,a,b]\n// 1: [b,b,b,b,b,b,c,c]\n// 2: [c,c,c,c,c,d,d,d]\n// 3: [d,d,d,d,e,e,e,e]\n// 4: [e,e,e,f,f,f,f,f]\n// 5: [f,f,g,g,g,g,g,g]\n// 6: [g,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+192+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+192+4(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+192+8(SB)/4,  $0x80800201\nDATA ·shuffleInt32x1to16bits+192+12(SB)/4, $0x80800302\n\nDATA ·shuffleInt32x1to16bits+192+16(SB)/4, $0x80800403\nDATA ·shuffleInt32x1to16bits+192+20(SB)/4, $0x80800504\nDATA ·shuffleInt32x1to16bits+192+24(SB)/4, $0x80800605\nDATA ·shuffleInt32x1to16bits+192+28(SB)/4, $0x80808006\n\n// 8 bits => 32 bits\n// -----------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [b,b,b,b,b,b,b,b]\n// 2: [c,c,c,c,c,c,c,c]\n// 3: [d,d,d,d,d,d,d,d]\n// 4: [e,e,e,e,e,e,e,e]\n// 5: [f,f,f,f,f,f,f,f]\n// 6: [g,g,g,g,g,g,g,g]\n// 7: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+224+0(SB)/4,  $0x80808000\nDATA ·shuffleInt32x1to16bits+224+4(SB)/4,  $0x80808001\nDATA ·shuffleInt32x1to16bits+224+8(SB)/4,  $0x80808002\nDATA ·shuffleInt32x1to16bits+224+12(SB)/4, $0x80808003\n\nDATA ·shuffleInt32x1to16bits+224+16(SB)/4, $0x80808004\nDATA 
·shuffleInt32x1to16bits+224+20(SB)/4, $0x80808005\nDATA ·shuffleInt32x1to16bits+224+24(SB)/4, $0x80808006\nDATA ·shuffleInt32x1to16bits+224+28(SB)/4, $0x80808007\n\n// 9 bits => 32 bits\n// -----------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,b,b,b,b,b,b,b]\n// 2: [b,b,c,c,c,c,c,c]\n// 3: [c,c,c,d,d,d,d,d]\n// 4: [d,d,d,d,e,e,e,e]\n// 5: [e,e,e,e,e,f,f,f]\n// 6: [f,f,f,f,f,f,g,g]\n// 7: [g,g,g,g,g,g,g,h]\n// 8: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+256+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+256+4(SB)/4,  $0x80800201\nDATA ·shuffleInt32x1to16bits+256+8(SB)/4,  $0x80800302\nDATA ·shuffleInt32x1to16bits+256+12(SB)/4, $0x80800403\n\nDATA ·shuffleInt32x1to16bits+256+16(SB)/4, $0x80800504\nDATA ·shuffleInt32x1to16bits+256+20(SB)/4, $0x80800605\nDATA ·shuffleInt32x1to16bits+256+24(SB)/4, $0x80800706\nDATA ·shuffleInt32x1to16bits+256+28(SB)/4, $0x80800807\n\n// 10 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,b,b,b,b,b,b]\n// 2: [b,b,b,b,c,c,c,c]\n// 3: [c,c,c,c,c,c,d,d]\n// 4: [d,d,d,d,d,d,d,d]\n// 5: [e,e,e,e,e,e,e,e]\n// 6: [e,e,f,f,f,f,f,f]\n// 7: [f,f,f,f,g,g,g,g]\n// 8: [g,g,g,g,g,g,h,h]\n// 9: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+288+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+288+4(SB)/4,  $0x80800201\nDATA ·shuffleInt32x1to16bits+288+8(SB)/4,  $0x80800302\nDATA ·shuffleInt32x1to16bits+288+12(SB)/4, $0x80800403\n\nDATA ·shuffleInt32x1to16bits+288+16(SB)/4, $0x80800605\nDATA ·shuffleInt32x1to16bits+288+20(SB)/4, $0x80800706\nDATA ·shuffleInt32x1to16bits+288+24(SB)/4, $0x80800807\nDATA ·shuffleInt32x1to16bits+288+28(SB)/4, $0x80800908\n\n// 11 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,b,b,b,b,b]\n// 2: [b,b,b,b,b,b,c,c]\n// 3: [c,c,c,c,c,c,c,c]\n// 4: [c,d,d,d,d,d,d,d]\n// 5: [d,d,d,d,e,e,e,e]\n// 6: [e,e,e,e,e,e,e,f]\n// 7: [f,f,f,f,f,f,f,f]\n// 8: [f,f,g,g,g,g,g,g]\n// 9: [g,g,g,g,g,h,h,h]\n// A: [h,h,h,h,h,h,h,h]\n// ...\nDATA 
·shuffleInt32x1to16bits+320+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+320+4(SB)/4,  $0x80800201\nDATA ·shuffleInt32x1to16bits+320+8(SB)/4,  $0x80040302\nDATA ·shuffleInt32x1to16bits+320+12(SB)/4, $0x80800504\n\nDATA ·shuffleInt32x1to16bits+320+16(SB)/4, $0x80800605\nDATA ·shuffleInt32x1to16bits+320+20(SB)/4, $0x80080706\nDATA ·shuffleInt32x1to16bits+320+24(SB)/4, $0x80800908\nDATA ·shuffleInt32x1to16bits+320+28(SB)/4, $0x80800A09\n\n// 12 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,b,b,b,b]\n// 2: [b,b,b,b,b,b,b,b]\n// 3: [c,c,c,c,c,c,c,c]\n// 4: [c,c,c,c,d,d,d,d]\n// 5: [d,d,d,d,d,d,d,d]\n// 6: [e,e,e,e,e,e,e,e]\n// 7: [e,e,e,e,f,f,f,f]\n// 8: [f,f,f,f,f,f,f,f]\n// 9: [g,g,g,g,g,g,g,g]\n// A: [g,g,g,g,h,h,h,h]\n// B: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+352+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+352+4(SB)/4,  $0x80800201\nDATA ·shuffleInt32x1to16bits+352+8(SB)/4,  $0x80080403\nDATA ·shuffleInt32x1to16bits+352+12(SB)/4, $0x80800504\n\nDATA ·shuffleInt32x1to16bits+352+16(SB)/4, $0x80800706\nDATA ·shuffleInt32x1to16bits+352+20(SB)/4, $0x80800807\nDATA ·shuffleInt32x1to16bits+352+24(SB)/4, $0x80800A09\nDATA ·shuffleInt32x1to16bits+352+28(SB)/4, $0x80800B0A\n\n// 13 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,b,b,b]\n// 2: [b,b,b,b,b,b,b,b]\n// 3: [b,b,c,c,c,c,c,c]\n// 4: [c,c,c,c,c,c,c,d]\n// 5: [d,d,d,d,d,d,d,d]\n// 6: [d,d,d,d,e,e,e,e]\n// 7: [e,e,e,e,e,e,e,e]\n// 8: [e,f,f,f,f,f,f,f]\n// 9: [f,f,f,f,f,f,g,g]\n// A: [g,g,g,g,g,g,g,g]\n// B: [g,g,g,h,h,h,h,h]\n// C: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+384+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+384+4(SB)/4,  $0x80030201\nDATA ·shuffleInt32x1to16bits+384+8(SB)/4,  $0x80800403\nDATA ·shuffleInt32x1to16bits+384+12(SB)/4, $0x80060504\n\nDATA ·shuffleInt32x1to16bits+384+16(SB)/4, $0x80080706\nDATA ·shuffleInt32x1to16bits+384+20(SB)/4, $0x80800908\nDATA 
·shuffleInt32x1to16bits+384+24(SB)/4, $0x800B0A09\nDATA ·shuffleInt32x1to16bits+384+28(SB)/4, $0x80800C0B\n\n// 14 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,b,b]\n// 2: [b,b,b,b,b,b,b,b]\n// 3: [b,b,b,b,c,c,c,c]\n// 4: [c,c,c,c,c,c,c,c]\n// 5: [c,c,d,d,d,d,d,d]\n// 6: [d,d,d,d,d,d,d,d]\n// 7: [e,e,e,e,e,e,e,e]\n// 8: [e,e,e,e,e,e,f,f]\n// 9: [f,f,f,f,f,f,f,f]\n// A: [f,f,f,f,g,g,g,g]\n// B: [g,g,g,g,g,g,g,g]\n// C: [g,g,h,h,h,h,h,h]\n// D: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+416+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+416+4(SB)/4,  $0x80030201\nDATA ·shuffleInt32x1to16bits+416+8(SB)/4,  $0x80050403\nDATA ·shuffleInt32x1to16bits+416+12(SB)/4, $0x80800605\n\nDATA ·shuffleInt32x1to16bits+416+16(SB)/4, $0x80080807\nDATA ·shuffleInt32x1to16bits+416+20(SB)/4, $0x800A0908\nDATA ·shuffleInt32x1to16bits+416+24(SB)/4, $0x800C0B0A\nDATA ·shuffleInt32x1to16bits+416+28(SB)/4, $0x80800D0C\n\n// 15 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,b]\n// 2: [b,b,b,b,b,b,b,b]\n// 3: [b,b,b,b,b,b,c,c]\n// 4: [c,c,c,c,c,c,c,c]\n// 5: [c,c,c,c,c,d,d,d]\n// 6: [d,d,d,d,d,d,d,d]\n// 7: [d,d,d,d,e,e,e,e]\n// 8: [e,e,e,e,e,e,e,e]\n// 9: [e,e,e,f,f,f,f,f]\n// A: [f,f,f,f,f,f,f,f]\n// B: [f,f,g,g,g,g,g,g]\n// C: [g,g,g,g,g,g,g,g]\n// D: [g,h,h,h,h,h,h,h]\n// E: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+448+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+448+4(SB)/4,  $0x80030201\nDATA ·shuffleInt32x1to16bits+448+8(SB)/4,  $0x80050403\nDATA ·shuffleInt32x1to16bits+448+12(SB)/4, $0x80070605\n\nDATA ·shuffleInt32x1to16bits+448+16(SB)/4, $0x80090807\nDATA ·shuffleInt32x1to16bits+448+20(SB)/4, $0x800B0A09\nDATA ·shuffleInt32x1to16bits+448+24(SB)/4, $0x800D0C0B\nDATA ·shuffleInt32x1to16bits+448+28(SB)/4, $0x80800E0D\n\n// 16 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [b,b,b,b,b,b,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: 
[c,c,c,c,c,c,c,c]\n// 5: [c,c,c,c,c,c,c,c]\n// 6: [d,d,d,d,d,d,d,d]\n// 7: [d,d,d,d,d,d,d,d]\n// 8: [e,e,e,e,e,e,e,e]\n// 9: [e,e,e,e,e,e,e,e]\n// A: [f,f,f,f,f,f,f,f]\n// B: [f,f,f,f,f,f,f,f]\n// C: [g,g,g,g,g,g,g,g]\n// D: [g,g,g,g,g,g,g,g]\n// E: [h,h,h,h,h,h,h,h]\n// F: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x1to16bits+480+0(SB)/4,  $0x80800100\nDATA ·shuffleInt32x1to16bits+480+4(SB)/4,  $0x80800302\nDATA ·shuffleInt32x1to16bits+480+8(SB)/4,  $0x80800504\nDATA ·shuffleInt32x1to16bits+480+12(SB)/4, $0x80800706\n\nDATA ·shuffleInt32x1to16bits+480+16(SB)/4, $0x80800908\nDATA ·shuffleInt32x1to16bits+480+20(SB)/4, $0x80800B0A\nDATA ·shuffleInt32x1to16bits+480+24(SB)/4, $0x80800D0C\nDATA ·shuffleInt32x1to16bits+480+28(SB)/4, $0x80800F0E\n\n// Shuffle masks for unpacking values from bit widths 17 to 26.\n//\n// The masks are grouped in 48 bytes chunks containing 3 masks of 16 bytes, with\n// the following layout:\n//\n// - The first mask is used to shuffle values from the first 16 bytes of input\n//   into the lower 16 bytes of output. These values are then shifted RIGHT to\n//   be aligned on the beginning of each 32 bit word.\n//\n// - The second mask selects values from the first 16 bytes of input into the\n//   upper 16 bytes of output. These values are then shifted RIGHT to be aligned\n//   on the beginning of each 32 bit word.\n//\n// - The third mask selects values from the second 16 bytes of input into the\n//   upper 16 bytes of output. 
These values are then shifted RIGHT to be aligned\n//   on the beginning of each 32 bit word.\n//\n// The bit width is intended to be used as an index into this array, using this\n// formula to convert from the index to a byte offset:\n//\n//      offset = 48 * (bitWidth - 17)\n//\nGLOBL ·shuffleInt32x17to26bits(SB), RODATA|NOPTR, $480\n\n// 17 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,b,b,b,b,b,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,c,c,c,c,c,c]\n// 5: [c,c,c,c,c,c,c,c]\n// 6: [c,c,c,d,d,d,d,d]\n// 7: [d,d,d,d,d,d,d,d]\n// 8: [d,d,d,d,e,e,e,e]\n// 9: [e,e,e,e,e,e,e,e]\n// A: [e,e,e,e,e,f,f,f]\n// B: [f,f,f,f,f,f,f,f]\n// C: [f,f,f,f,f,f,g,g]\n// D: [g,g,g,g,g,g,g,g]\n// E: [g,g,g,g,g,g,g,h]\n// F: [h,h,h,h,h,h,h,h]\n// ---\n// 0: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+0+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+0+4(SB)/4,  $0x80040302\nDATA ·shuffleInt32x17to26bits+0+8(SB)/4,  $0x80060504\nDATA ·shuffleInt32x17to26bits+0+12(SB)/4, $0x80080706\n\nDATA ·shuffleInt32x17to26bits+0+16(SB)/4, $0x800A0908\nDATA ·shuffleInt32x17to26bits+0+20(SB)/4, $0x800C0B0A\nDATA ·shuffleInt32x17to26bits+0+24(SB)/4, $0x800E0D0C\nDATA ·shuffleInt32x17to26bits+0+28(SB)/4, $0x80800F0E\n\nDATA ·shuffleInt32x17to26bits+0+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+0+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+0+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+0+44(SB)/4, $0x80008080\n\n// 18 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,b,b,b,b,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,c,c,c,c]\n// 5: [c,c,c,c,c,c,c,c]\n// 6: [c,c,c,c,c,c,d,d]\n// 7: [d,d,d,d,d,d,d,d]\n// 8: [d,d,d,d,d,d,d,d]\n// 9: [e,e,e,e,e,e,e,e]\n// A: [e,e,e,e,e,e,e,e]\n// B: [e,e,f,f,f,f,f,f]\n// C: [f,f,f,f,f,f,f,f]\n// D: [f,f,f,f,g,g,g,g]\n// E: [g,g,g,g,g,g,g,g]\n// F: [g,g,g,g,g,g,h,h]\n// ---\n// 0: [h,h,h,h,h,h,h,h]\n// 1: 
[h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+48+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+48+4(SB)/4,  $0x80040302\nDATA ·shuffleInt32x17to26bits+48+8(SB)/4,  $0x80060504\nDATA ·shuffleInt32x17to26bits+48+12(SB)/4, $0x80080706\n\nDATA ·shuffleInt32x17to26bits+48+16(SB)/4, $0x800B0A09\nDATA ·shuffleInt32x17to26bits+48+20(SB)/4, $0x800D0C0B\nDATA ·shuffleInt32x17to26bits+48+24(SB)/4, $0x800F0E0D\nDATA ·shuffleInt32x17to26bits+48+28(SB)/4, $0x8080800F\n\nDATA ·shuffleInt32x17to26bits+48+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+48+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+48+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+48+44(SB)/4, $0x80010080\n\n// 19 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,b,b,b,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,c,c]\n// 5: [c,c,c,c,c,c,c,c]\n// 6: [c,c,c,c,c,c,c,c]\n// 7: [c,d,d,d,d,d,d,d]\n// 8: [d,d,d,d,d,d,d,d]\n// 9: [d,d,d,d,e,e,e,e]\n// A: [e,e,e,e,e,e,e,e]\n// B: [e,e,e,e,e,e,e,f]\n// C: [f,f,f,f,f,f,f,f]\n// D: [f,f,f,f,f,f,f,f]\n// E: [f,f,g,g,g,g,g,g]\n// F: [g,g,g,g,g,g,g,g]\n// ---\n// 0: [g,g,g,g,g,h,h,h]\n// 1: [h,h,h,h,h,h,h,h]\n// 2: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+96+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+96+4(SB)/4,  $0x80040302\nDATA ·shuffleInt32x17to26bits+96+8(SB)/4,  $0x07060504\nDATA ·shuffleInt32x17to26bits+96+12(SB)/4, $0x80090807\n\nDATA ·shuffleInt32x17to26bits+96+16(SB)/4, $0x800B0A09\nDATA ·shuffleInt32x17to26bits+96+20(SB)/4, $0x0E0D0C0B\nDATA ·shuffleInt32x17to26bits+96+24(SB)/4, $0x80800F0E\nDATA ·shuffleInt32x17to26bits+96+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+96+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+96+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+96+40(SB)/4, $0x80008080\nDATA ·shuffleInt32x17to26bits+96+44(SB)/4, $0x80020100\n\n// 20 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: 
[a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,b,b,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [c,c,c,c,c,c,c,c]\n// 6: [c,c,c,c,c,c,c,c]\n// 7: [c,c,c,c,d,d,d,d]\n// 8: [d,d,d,d,d,d,d,d]\n// 9: [d,d,d,d,d,d,d,d]\n// A: [e,e,e,e,e,e,e,e]\n// B: [e,e,e,e,e,e,e,e]\n// C: [e,e,e,e,f,f,f,f]\n// D: [f,f,f,f,f,f,f,f]\n// E: [f,f,f,f,f,f,f,f]\n// F: [g,g,g,g,g,g,g,g]\n// ---\n// 0: [g,g,g,g,g,g,g,g]\n// 1: [g,g,g,g,h,h,h,h]\n// 2: [h,h,h,h,h,h,h,h]\n// 3: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+144+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+144+4(SB)/4,  $0x80040302\nDATA ·shuffleInt32x17to26bits+144+8(SB)/4,  $0x80070605\nDATA ·shuffleInt32x17to26bits+144+12(SB)/4, $0x80090807\n\nDATA ·shuffleInt32x17to26bits+144+16(SB)/4, $0x800C0B0A\nDATA ·shuffleInt32x17to26bits+144+20(SB)/4, $0x800E0D0C\nDATA ·shuffleInt32x17to26bits+144+24(SB)/4, $0x8080800F\nDATA ·shuffleInt32x17to26bits+144+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+144+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+144+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+144+40(SB)/4, $0x80010080\nDATA ·shuffleInt32x17to26bits+144+44(SB)/4, $0x80030201\n\n// 21 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,b,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,c,c,c,c,c,c]\n// 6: [c,c,c,c,c,c,c,c]\n// 7: [c,c,c,c,c,c,c,d]\n// 8: [d,d,d,d,d,d,d,d]\n// 9: [d,d,d,d,d,d,d,d]\n// A: [d,d,d,d,e,e,e,e]\n// B: [e,e,e,e,e,e,e,e]\n// C: [e,e,e,e,e,e,e,e]\n// D: [e,f,f,f,f,f,f,f]\n// E: [f,f,f,f,f,f,f,f]\n// F: [f,f,f,f,f,f,g,g]\n// ---\n// 0: [g,g,g,g,g,g,g,g]\n// 1: [g,g,g,g,g,g,g,g]\n// 2: [g,g,g,h,h,h,h,h]\n// 3: [h,h,h,h,h,h,h,h]\n// 4: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+192+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+192+4(SB)/4,  $0x05040302\nDATA ·shuffleInt32x17to26bits+192+8(SB)/4,  $0x80070605\nDATA ·shuffleInt32x17to26bits+192+12(SB)/4, $0x0A090807\n\nDATA 
·shuffleInt32x17to26bits+192+16(SB)/4, $0x0D0C0B0A\nDATA ·shuffleInt32x17to26bits+192+20(SB)/4, $0x800F0E0D\nDATA ·shuffleInt32x17to26bits+192+24(SB)/4, $0x8080800F\nDATA ·shuffleInt32x17to26bits+192+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+192+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+192+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+192+40(SB)/4, $0x02010080\nDATA ·shuffleInt32x17to26bits+192+44(SB)/4, $0x80040302\n\n// 22 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,b,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,c,c,c,c]\n// 6: [c,c,c,c,c,c,c,c]\n// 7: [c,c,c,c,c,c,c,c]\n// 8: [c,c,d,d,d,d,d,d]\n// 9: [d,d,d,d,d,d,d,d]\n// A: [d,d,d,d,d,d,d,d]\n// B: [e,e,e,e,e,e,e,e]\n// C: [e,e,e,e,e,e,e,e]\n// D: [e,e,e,e,e,e,f,f]\n// E: [f,f,f,f,f,f,f,f]\n// F: [f,f,f,f,f,f,f,f]\n// ---\n// 0: [f,f,f,f,g,g,g,g]\n// 1: [g,g,g,g,g,g,g,g]\n// 2: [g,g,g,g,g,g,g,g]\n// 3: [g,g,h,h,h,h,h,h]\n// 4: [h,h,h,h,h,h,h,h]\n// 5: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+240+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+240+4(SB)/4,  $0x05040302\nDATA ·shuffleInt32x17to26bits+240+8(SB)/4,  $0x08070605\nDATA ·shuffleInt32x17to26bits+240+12(SB)/4, $0x800A0908\n\nDATA ·shuffleInt32x17to26bits+240+16(SB)/4, $0x800D0C0B\nDATA ·shuffleInt32x17to26bits+240+20(SB)/4, $0x800F0E0D\nDATA ·shuffleInt32x17to26bits+240+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+240+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+240+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+240+36(SB)/4, $0x00808080\nDATA ·shuffleInt32x17to26bits+240+40(SB)/4, $0x03020100\nDATA ·shuffleInt32x17to26bits+240+44(SB)/4, $0x80050403\n\n// 23 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,b]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,c,c]\n// 6: [c,c,c,c,c,c,c,c]\n// 7: 
[c,c,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,d,d,d]\n// 9: [d,d,d,d,d,d,d,d]\n// A: [d,d,d,d,d,d,d,d]\n// B: [d,d,d,d,e,e,e,e]\n// C: [e,e,e,e,e,e,e,e]\n// D: [e,e,e,e,e,e,e,e]\n// E: [e,e,e,f,f,f,f,f]\n// F: [f,f,f,f,f,f,f,f]\n// ---\n// 0: [f,f,f,f,f,f,f,f]\n// 1: [f,f,g,g,g,g,g,g]\n// 2: [g,g,g,g,g,g,g,g]\n// 3: [g,g,g,g,g,g,g,g]\n// 4: [g,h,h,h,h,h,h,h]\n// 5: [h,h,h,h,h,h,h,h]\n// 6: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+288+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+288+4(SB)/4,  $0x05040302\nDATA ·shuffleInt32x17to26bits+288+8(SB)/4,  $0x08070605\nDATA ·shuffleInt32x17to26bits+288+12(SB)/4, $0x0B0A0908\n\nDATA ·shuffleInt32x17to26bits+288+16(SB)/4, $0x0E0D0C0B\nDATA ·shuffleInt32x17to26bits+288+20(SB)/4, $0x80800F0E\nDATA ·shuffleInt32x17to26bits+288+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+288+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+288+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+288+36(SB)/4, $0x01008080\nDATA ·shuffleInt32x17to26bits+288+40(SB)/4, $0x04030201\nDATA ·shuffleInt32x17to26bits+288+44(SB)/4, $0x80060504\n\n// 24 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [b,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [c,c,c,c,c,c,c,c]\n// 7: [c,c,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [d,d,d,d,d,d,d,d]\n// A: [d,d,d,d,d,d,d,d]\n// B: [d,d,d,d,d,d,d,d]\n// C: [e,e,e,e,e,e,e,e]\n// D: [e,e,e,e,e,e,e,e]\n// E: [e,e,e,e,e,e,e,e]\n// F: [f,f,f,f,f,f,f,f]\n// ---\n// 0: [f,f,f,f,f,f,f,f]\n// 1: [f,f,f,f,f,f,f,f]\n// 2: [g,g,g,g,g,g,g,g]\n// 3: [g,g,g,g,g,g,g,g]\n// 4: [g,g,g,g,g,g,g,g]\n// 5: [h,h,h,h,h,h,h,h]\n// 6: [h,h,h,h,h,h,h,h]\n// 7: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+336+0(SB)/4,  $0x80020100\nDATA ·shuffleInt32x17to26bits+336+4(SB)/4,  $0x80050403\nDATA ·shuffleInt32x17to26bits+336+8(SB)/4,  $0x80080706\nDATA ·shuffleInt32x17to26bits+336+12(SB)/4, $0x800B0A09\n\nDATA 
·shuffleInt32x17to26bits+336+16(SB)/4, $0x800E0D0C\nDATA ·shuffleInt32x17to26bits+336+20(SB)/4, $0x8080800F\nDATA ·shuffleInt32x17to26bits+336+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+336+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+336+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+336+36(SB)/4, $0x80010080\nDATA ·shuffleInt32x17to26bits+336+40(SB)/4, $0x80040302\nDATA ·shuffleInt32x17to26bits+336+44(SB)/4, $0x80070605\n\n// 25 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,b,b,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,c,c,c,c,c,c]\n// 7: [c,c,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,d,d,d,d,d]\n// A: [d,d,d,d,d,d,d,d]\n// B: [d,d,d,d,d,d,d,d]\n// C: [d,d,d,d,e,e,e,e]\n// D: [e,e,e,e,e,e,e,e]\n// E: [e,e,e,e,e,e,e,e]\n// F: [e,e,e,e,e,f,f,f]\n// ---\n// 0: [f,f,f,f,f,f,f,f]\n// 1: [f,f,f,f,f,f,f,f]\n// 2: [f,f,f,f,f,f,g,g]\n// 3: [g,g,g,g,g,g,g,g]\n// 4: [g,g,g,g,g,g,g,g]\n// 5: [g,g,g,g,g,g,g,h]\n// 6: [h,h,h,h,h,h,h,h]\n// 7: [h,h,h,h,h,h,h,h]\n// 8: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+384+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x17to26bits+384+4(SB)/4,  $0x06050403\nDATA ·shuffleInt32x17to26bits+384+8(SB)/4,  $0x09080706\nDATA ·shuffleInt32x17to26bits+384+12(SB)/4, $0x0C0B0A09\n\nDATA ·shuffleInt32x17to26bits+384+16(SB)/4, $0x0F0E0D0C\nDATA ·shuffleInt32x17to26bits+384+20(SB)/4, $0x8080800F\nDATA ·shuffleInt32x17to26bits+384+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+384+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+384+32(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+384+36(SB)/4, $0x02010080\nDATA ·shuffleInt32x17to26bits+384+40(SB)/4, $0x05040302\nDATA ·shuffleInt32x17to26bits+384+44(SB)/4, $0x08070605\n\n// 26 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,a,b,b,b,b,b,b]\n// 4: 
[b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,b,b,c,c,c,c]\n// 7: [c,c,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,c,c,c,d,d]\n// A: [d,d,d,d,d,d,d,d]\n// B: [d,d,d,d,d,d,d,d]\n// C: [d,d,d,d,d,d,d,d]\n// D: [e,e,e,e,e,e,e,e]\n// E: [e,e,e,e,e,e,e,e]\n// F: [e,e,e,e,e,e,e,e]\n// ---\n// 0: [e,e,f,f,f,f,f,f]\n// 1: [f,f,f,f,f,f,f,f]\n// 2: [f,f,f,f,f,f,f,f]\n// 3: [f,f,f,f,g,g,g,g]\n// 4: [g,g,g,g,g,g,g,g]\n// 5: [g,g,g,g,g,g,g,g]\n// 6: [g,g,g,g,g,g,h,h]\n// 7: [h,h,h,h,h,h,h,h]\n// 8: [h,h,h,h,h,h,h,h]\n// 9: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x17to26bits+432+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x17to26bits+432+4(SB)/4,  $0x06050403\nDATA ·shuffleInt32x17to26bits+432+8(SB)/4,  $0x09080706\nDATA ·shuffleInt32x17to26bits+432+12(SB)/4, $0x0C0B0A09\n\nDATA ·shuffleInt32x17to26bits+432+16(SB)/4, $0x800F0E0D\nDATA ·shuffleInt32x17to26bits+432+20(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+432+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x17to26bits+432+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x17to26bits+432+32(SB)/4, $0x00808080\nDATA ·shuffleInt32x17to26bits+432+36(SB)/4, $0x03020100\nDATA ·shuffleInt32x17to26bits+432+40(SB)/4, $0x06050403\nDATA ·shuffleInt32x17to26bits+432+44(SB)/4, $0x09080706\n\n// Shuffle masks for unpacking values from bit widths 27 to 31.\n//\n// The masks are grouped in 80 bytes chunks containing 5 masks of 16 bytes, with\n// the following layout:\n//\n// - The first mask is used to shuffle values from the first 16 bytes of input\n//   into the lower 16 bytes of output. These values are then shifted RIGHT to\n//   be aligned on the begining of each 32 bit word.\n//\n// - The second mask is used to shuffle upper bits of bit-packed values of the\n//   first 16 bytes of input that spanned across 5 bytes. 
These extra bits cannot\n//   be selected by the first mask (which can select at most 4 bytes per word).\n//   The extra bits are then shifted LEFT to be positioned at the end of the\n//   words, after the bits extracted by the first mask.\n//\n// - The third mask selects values from the first 16 bytes of input into the\n//   upper 16 bytes of output. These values are then shifted RIGHT to be aligned\n//   on the beginning of each 32 bit word.\n//\n// - The fourth mask selects values from the second 16 bytes of input into the\n//   upper 16 bytes of output. These values are then shifted RIGHT to be aligned\n//   on the beginning of each 32 bit word.\n//\n// - The fifth mask is used to shuffle upper bits of bit-packed values of the\n//   second 16 bytes of input that spanned across 5 bytes. These values are then\n//   shifted LEFT to be aligned on the beginning of each 32 bit word.\n//\n// The bit width is intended to be used as an index into this array, using this\n// formula to convert from the index to a byte offset:\n//\n//      offset = 80 * (bitWidth - 27)\n//\nGLOBL ·shuffleInt32x27to31bits(SB), RODATA|NOPTR, $400\n\n// 27 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,a,a,b,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,b,b,b,b,c,c]\n// 7: [c,c,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,c,c,c,c,c]\n// A: [c,d,d,d,d,d,d,d]\n// B: [d,d,d,d,d,d,d,d]\n// C: [d,d,d,d,d,d,d,d]\n// D: [d,d,d,d,e,e,e,e]\n// E: [e,e,e,e,e,e,e,e]\n// F: [e,e,e,e,e,e,e,e]\n// ---\n// 0: [e,e,e,e,e,e,e,f]\n// 1: [f,f,f,f,f,f,f,f]\n// 2: [f,f,f,f,f,f,f,f]\n// 3: [f,f,f,f,f,f,f,f]\n// 4: [f,f,g,g,g,g,g,g]\n// 5: [g,g,g,g,g,g,g,g]\n// 6: [g,g,g,g,g,g,g,g]\n// 7: [g,g,g,g,g,h,h,h]\n// 8: [h,h,h,h,h,h,h,h]\n// 9: [h,h,h,h,h,h,h,h]\n// A: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x27to31bits+0+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x27to31bits+0+4(SB)/4,  $0x06050403\nDATA 
·shuffleInt32x27to31bits+0+8(SB)/4,  $0x09080706\nDATA ·shuffleInt32x27to31bits+0+12(SB)/4, $0x0D0C0B0A\n\nDATA ·shuffleInt32x27to31bits+0+16(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+0+20(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+0+24(SB)/4, $0x0A808080\nDATA ·shuffleInt32x27to31bits+0+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+0+32(SB)/4, $0x800F0E0D\nDATA ·shuffleInt32x27to31bits+0+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+0+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+0+44(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+0+48(SB)/4, $0x00808080\nDATA ·shuffleInt32x27to31bits+0+52(SB)/4, $0x03020100\nDATA ·shuffleInt32x27to31bits+0+56(SB)/4, $0x07060504\nDATA ·shuffleInt32x27to31bits+0+60(SB)/4, $0x0A090807\n\nDATA ·shuffleInt32x27to31bits+0+64(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+0+68(SB)/4, $0x04808080\nDATA ·shuffleInt32x27to31bits+0+72(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+0+76(SB)/4, $0x80808080\n\n// 28 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,a,a,a,b,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,b,b,b,b,b,b]\n// 7: [c,c,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,c,c,c,c,c]\n// A: [c,c,c,c,d,d,d,d]\n// B: [d,d,d,d,d,d,d,d]\n// C: [d,d,d,d,d,d,d,d]\n// D: [d,d,d,d,d,d,d,d]\n// E: [e,e,e,e,e,e,e,e]\n// F: [e,e,e,e,e,e,e,e]\n// ---\n// 0: [e,e,e,e,e,e,e,e]\n// 1: [e,e,e,e,f,f,f,f]\n// 2: [f,f,f,f,f,f,f,f]\n// 3: [f,f,f,f,f,f,f,f]\n// 4: [f,f,f,f,f,f,f,f]\n// 5: [g,g,g,g,g,g,g,g]\n// 6: [g,g,g,g,g,g,g,g]\n// 7: [g,g,g,g,g,g,g,g]\n// 8: [g,g,g,g,h,h,h,h]\n// 9: [h,h,h,h,h,h,h,h]\n// A: [h,h,h,h,h,h,h,h]\n// B: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x27to31bits+80+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x27to31bits+80+4(SB)/4,  $0x06050403\nDATA ·shuffleInt32x27to31bits+80+8(SB)/4,  $0x0A090807\nDATA ·shuffleInt32x27to31bits+80+12(SB)/4, $0x0D0C0B0A\n\nDATA 
·shuffleInt32x27to31bits+80+16(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+20(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+80+32(SB)/4, $0x80800F0E\nDATA ·shuffleInt32x27to31bits+80+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+44(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+80+48(SB)/4, $0x01008080\nDATA ·shuffleInt32x27to31bits+80+52(SB)/4, $0x04030201\nDATA ·shuffleInt32x27to31bits+80+56(SB)/4, $0x08070605\nDATA ·shuffleInt32x27to31bits+80+60(SB)/4, $0x0B0A0908\n\nDATA ·shuffleInt32x27to31bits+80+64(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+68(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+72(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+80+76(SB)/4, $0x80808080\n\n// 29 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,a,a,a,a,b,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,b,b,b,b,b,b]\n// 7: [b,b,c,c,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,c,c,c,c,c]\n// A: [c,c,c,c,c,c,c,d]\n// B: [d,d,d,d,d,d,d,d]\n// C: [d,d,d,d,d,d,d,d]\n// D: [d,d,d,d,d,d,d,d]\n// E: [d,d,d,d,e,e,e,e]\n// F: [e,e,e,e,e,e,e,e]\n// ---\n// 0: [e,e,e,e,e,e,e,e]\n// 1: [e,e,e,e,e,e,e,e]\n// 2: [e,f,f,f,f,f,f,f]\n// 3: [f,f,f,f,f,f,f,f]\n// 4: [f,f,f,f,f,f,f,f]\n// 5: [f,f,f,f,f,f,g,g]\n// 6: [g,g,g,g,g,g,g,g]\n// 7: [g,g,g,g,g,g,g,g]\n// 8: [g,g,g,g,g,g,g,g]\n// 9: [g,g,g,h,h,h,h,h]\n// A: [h,h,h,h,h,h,h,h]\n// B: [h,h,h,h,h,h,h,h]\n// C: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x27to31bits+160+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x27to31bits+160+4(SB)/4,  $0x06050403\nDATA ·shuffleInt32x27to31bits+160+8(SB)/4,  $0x0A090807\nDATA ·shuffleInt32x27to31bits+160+12(SB)/4, $0x0D0C0B0A\n\nDATA ·shuffleInt32x27to31bits+160+16(SB)/4, $0x80808080\nDATA 
·shuffleInt32x27to31bits+160+20(SB)/4, $0x07808080\nDATA ·shuffleInt32x27to31bits+160+24(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+160+28(SB)/4, $0x0E808080\n\nDATA ·shuffleInt32x27to31bits+160+32(SB)/4, $0x80800F0E\nDATA ·shuffleInt32x27to31bits+160+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+160+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+160+44(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+160+48(SB)/4, $0x01008080\nDATA ·shuffleInt32x27to31bits+160+52(SB)/4, $0x05040302\nDATA ·shuffleInt32x27to31bits+160+56(SB)/4, $0x08070605\nDATA ·shuffleInt32x27to31bits+160+60(SB)/4, $0x0C0B0A09\n\nDATA ·shuffleInt32x27to31bits+160+64(SB)/4, $0x02808080\nDATA ·shuffleInt32x27to31bits+160+68(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+160+72(SB)/4, $0x09808080\nDATA ·shuffleInt32x27to31bits+160+76(SB)/4, $0x80808080\n\n// 30 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,a,a,a,a,a,b,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,b,b,b,b,b,b]\n// 7: [b,b,b,b,c,c,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,c,c,c,c,c]\n// A: [c,c,c,c,c,c,c,c]\n// B: [c,c,d,d,d,d,d,d]\n// C: [d,d,d,d,d,d,d,d]\n// D: [d,d,d,d,d,d,d,d]\n// E: [d,d,d,d,d,d,d,d]\n// F: [e,e,e,e,e,e,e,e]\n// ---\n// 0: [e,e,e,e,e,e,e,e]\n// 1: [e,e,e,e,e,e,e,e]\n// 2: [e,e,e,e,e,e,f,f]\n// 3: [f,f,f,f,f,f,f,f]\n// 4: [f,f,f,f,f,f,f,f]\n// 5: [f,f,f,f,f,f,f,f]\n// 6: [f,f,f,f,g,g,g,g]\n// 7: [g,g,g,g,g,g,g,g]\n// 8: [g,g,g,g,g,g,g,g]\n// 9: [g,g,g,g,g,g,g,g]\n// A: [g,g,h,h,h,h,h,h]\n// B: [h,h,h,h,h,h,h,h]\n// C: [h,h,h,h,h,h,h,h]\n// D: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x27to31bits+240+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x27to31bits+240+4(SB)/4,  $0x06050403\nDATA ·shuffleInt32x27to31bits+240+8(SB)/4,  $0x0A090807\nDATA ·shuffleInt32x27to31bits+240+12(SB)/4, $0x0E0D0C0B\n\nDATA ·shuffleInt32x27to31bits+240+16(SB)/4, $0x80808080\nDATA 
·shuffleInt32x27to31bits+240+20(SB)/4, $0x07808080\nDATA ·shuffleInt32x27to31bits+240+24(SB)/4, $0x0B808080\nDATA ·shuffleInt32x27to31bits+240+28(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+240+32(SB)/4, $0x8080800F\nDATA ·shuffleInt32x27to31bits+240+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+240+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+240+44(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+240+48(SB)/4, $0x02010080\nDATA ·shuffleInt32x27to31bits+240+52(SB)/4, $0x05040302\nDATA ·shuffleInt32x27to31bits+240+56(SB)/4, $0x09080706\nDATA ·shuffleInt32x27to31bits+240+60(SB)/4, $0x0D0C0B0A\n\nDATA ·shuffleInt32x27to31bits+240+64(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+240+68(SB)/4, $0x06808080\nDATA ·shuffleInt32x27to31bits+240+72(SB)/4, $0x0A808080\nDATA ·shuffleInt32x27to31bits+240+76(SB)/4, $0x80808080\n\n// 31 bits => 32 bits\n// ------------------\n// 0: [a,a,a,a,a,a,a,a]\n// 1: [a,a,a,a,a,a,a,a]\n// 2: [a,a,a,a,a,a,a,a]\n// 3: [a,a,a,a,a,a,a,b]\n// 4: [b,b,b,b,b,b,b,b]\n// 5: [b,b,b,b,b,b,b,b]\n// 6: [b,b,b,b,b,b,b,b]\n// 7: [b,b,b,b,b,b,c,c]\n// 8: [c,c,c,c,c,c,c,c]\n// 9: [c,c,c,c,c,c,c,c]\n// A: [c,c,c,c,c,c,c,c]\n// B: [c,c,c,c,c,d,d,d]\n// C: [d,d,d,d,d,d,d,d]\n// D: [d,d,d,d,d,d,d,d]\n// E: [d,d,d,d,d,d,d,d]\n// F: [d,d,d,d,e,e,e,e]\n// ---\n// 0: [e,e,e,e,e,e,e,e]\n// 1: [e,e,e,e,e,e,e,e]\n// 2: [e,e,e,e,e,e,e,e]\n// 3: [e,e,e,f,f,f,f,f]\n// 4: [f,f,f,f,f,f,f,f]\n// 5: [f,f,f,f,f,f,f,f]\n// 6: [f,f,f,f,f,f,f,f]\n// 7: [f,f,g,g,g,g,g,g]\n// 8: [g,g,g,g,g,g,g,g]\n// 9: [g,g,g,g,g,g,g,g]\n// A: [g,g,g,g,g,g,g,g]\n// B: [g,h,h,h,h,h,h,h]\n// C: [h,h,h,h,h,h,h,h]\n// D: [h,h,h,h,h,h,h,h]\n// E: [h,h,h,h,h,h,h,h]\n// ...\nDATA ·shuffleInt32x27to31bits+320+0(SB)/4,  $0x03020100\nDATA ·shuffleInt32x27to31bits+320+4(SB)/4,  $0x06050403\nDATA ·shuffleInt32x27to31bits+320+8(SB)/4,  $0x0A090807\nDATA ·shuffleInt32x27to31bits+320+12(SB)/4, $0x0E0D0C0B\n\nDATA ·shuffleInt32x27to31bits+320+16(SB)/4, $0x80808080\nDATA 
·shuffleInt32x27to31bits+320+20(SB)/4, $0x07808080\nDATA ·shuffleInt32x27to31bits+320+24(SB)/4, $0x0B808080\nDATA ·shuffleInt32x27to31bits+320+28(SB)/4, $0x0F808080\n\nDATA ·shuffleInt32x27to31bits+320+32(SB)/4, $0x8080800F\nDATA ·shuffleInt32x27to31bits+320+36(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+320+40(SB)/4, $0x80808080\nDATA ·shuffleInt32x27to31bits+320+44(SB)/4, $0x80808080\n\nDATA ·shuffleInt32x27to31bits+320+48(SB)/4, $0x02010080\nDATA ·shuffleInt32x27to31bits+320+52(SB)/4, $0x06050403\nDATA ·shuffleInt32x27to31bits+320+56(SB)/4, $0x0A090807\nDATA ·shuffleInt32x27to31bits+320+60(SB)/4, $0x0E0D0C0B\n\nDATA ·shuffleInt32x27to31bits+320+64(SB)/4, $0x03808080\nDATA ·shuffleInt32x27to31bits+320+68(SB)/4, $0x07808080\nDATA ·shuffleInt32x27to31bits+320+72(SB)/4, $0x0B808080\nDATA ·shuffleInt32x27to31bits+320+76(SB)/4, $0x80808080\n\n// The RIGHT shifts to unpack 32 bits integers.\n//\n// The following formula was determined empirically as the expression which\n// generates shift values:\n//\n//      shift[i] = (i * bitWidth) % 8\n//\nGLOBL ·shiftRightInt32(SB), RODATA|NOPTR, $256\n\nDATA ·shiftRightInt32+0+0(SB)/4,  $0\nDATA ·shiftRightInt32+0+4(SB)/4,  $1\nDATA ·shiftRightInt32+0+8(SB)/4,  $2\nDATA ·shiftRightInt32+0+12(SB)/4, $3\nDATA ·shiftRightInt32+0+16(SB)/4, $4\nDATA ·shiftRightInt32+0+20(SB)/4, $5\nDATA ·shiftRightInt32+0+24(SB)/4, $6\nDATA ·shiftRightInt32+0+28(SB)/4, $7\n\nDATA ·shiftRightInt32+32+0(SB)/4,  $0\nDATA ·shiftRightInt32+32+4(SB)/4,  $2\nDATA ·shiftRightInt32+32+8(SB)/4,  $4\nDATA ·shiftRightInt32+32+12(SB)/4, $6\nDATA ·shiftRightInt32+32+16(SB)/4, $0\nDATA ·shiftRightInt32+32+20(SB)/4, $2\nDATA ·shiftRightInt32+32+24(SB)/4, $4\nDATA ·shiftRightInt32+32+28(SB)/4, $6\n\nDATA ·shiftRightInt32+64+0(SB)/4,  $0\nDATA ·shiftRightInt32+64+4(SB)/4,  $3\nDATA ·shiftRightInt32+64+8(SB)/4,  $6\nDATA ·shiftRightInt32+64+12(SB)/4, $1\nDATA ·shiftRightInt32+64+16(SB)/4, $4\nDATA ·shiftRightInt32+64+20(SB)/4, $7\nDATA 
·shiftRightInt32+64+24(SB)/4, $2\nDATA ·shiftRightInt32+64+28(SB)/4, $5\n\nDATA ·shiftRightInt32+96+0(SB)/4,  $0\nDATA ·shiftRightInt32+96+4(SB)/4,  $4\nDATA ·shiftRightInt32+96+8(SB)/4,  $0\nDATA ·shiftRightInt32+96+12(SB)/4, $4\nDATA ·shiftRightInt32+96+16(SB)/4, $0\nDATA ·shiftRightInt32+96+20(SB)/4, $4\nDATA ·shiftRightInt32+96+24(SB)/4, $0\nDATA ·shiftRightInt32+96+28(SB)/4, $4\n\nDATA ·shiftRightInt32+128+0(SB)/4,  $0\nDATA ·shiftRightInt32+128+4(SB)/4,  $5\nDATA ·shiftRightInt32+128+8(SB)/4,  $2\nDATA ·shiftRightInt32+128+12(SB)/4, $7\nDATA ·shiftRightInt32+128+16(SB)/4, $4\nDATA ·shiftRightInt32+128+20(SB)/4, $1\nDATA ·shiftRightInt32+128+24(SB)/4, $6\nDATA ·shiftRightInt32+128+28(SB)/4, $3\n\nDATA ·shiftRightInt32+160+0(SB)/4,  $0\nDATA ·shiftRightInt32+160+4(SB)/4,  $6\nDATA ·shiftRightInt32+160+8(SB)/4,  $4\nDATA ·shiftRightInt32+160+12(SB)/4, $2\nDATA ·shiftRightInt32+160+16(SB)/4, $0\nDATA ·shiftRightInt32+160+20(SB)/4, $6\nDATA ·shiftRightInt32+160+24(SB)/4, $4\nDATA ·shiftRightInt32+160+28(SB)/4, $2\n\nDATA ·shiftRightInt32+192+0(SB)/4,  $0\nDATA ·shiftRightInt32+192+4(SB)/4,  $7\nDATA ·shiftRightInt32+192+8(SB)/4,  $6\nDATA ·shiftRightInt32+192+12(SB)/4, $5\nDATA ·shiftRightInt32+192+16(SB)/4, $4\nDATA ·shiftRightInt32+192+20(SB)/4, $3\nDATA ·shiftRightInt32+192+24(SB)/4, $2\nDATA ·shiftRightInt32+192+28(SB)/4, $1\n\nDATA ·shiftRightInt32+224+0(SB)/4,  $0\nDATA ·shiftRightInt32+224+4(SB)/4,  $0\nDATA ·shiftRightInt32+224+8(SB)/4,  $0\nDATA ·shiftRightInt32+224+12(SB)/4, $0\nDATA ·shiftRightInt32+224+16(SB)/4, $0\nDATA ·shiftRightInt32+224+20(SB)/4, $0\nDATA ·shiftRightInt32+224+24(SB)/4, $0\nDATA ·shiftRightInt32+224+28(SB)/4, $0\n\n// The LEFT shifts to unpack 32 bits integers.\n//\n// The following formula was determined empirically as the expression which\n// generates shift values:\n//\n//      shift[i] = (8 - (i * bitWidth)) % 8\n//\nGLOBL ·shiftLeftInt32(SB), RODATA|NOPTR, $256\n\nDATA ·shiftLeftInt32+0+0(SB)/4,  $0\nDATA 
·shiftLeftInt32+0+4(SB)/4,  $7\nDATA ·shiftLeftInt32+0+8(SB)/4,  $6\nDATA ·shiftLeftInt32+0+12(SB)/4, $5\nDATA ·shiftLeftInt32+0+16(SB)/4, $4\nDATA ·shiftLeftInt32+0+20(SB)/4, $3\nDATA ·shiftLeftInt32+0+24(SB)/4, $2\nDATA ·shiftLeftInt32+0+28(SB)/4, $1\n\nDATA ·shiftLeftInt32+32+0(SB)/4,  $0\nDATA ·shiftLeftInt32+32+4(SB)/4,  $6\nDATA ·shiftLeftInt32+32+8(SB)/4,  $4\nDATA ·shiftLeftInt32+32+12(SB)/4, $2\nDATA ·shiftLeftInt32+32+16(SB)/4, $0\nDATA ·shiftLeftInt32+32+20(SB)/4, $6\nDATA ·shiftLeftInt32+32+24(SB)/4, $4\nDATA ·shiftLeftInt32+32+28(SB)/4, $2\n\nDATA ·shiftLeftInt32+64+0(SB)/4,  $0\nDATA ·shiftLeftInt32+64+4(SB)/4,  $5\nDATA ·shiftLeftInt32+64+8(SB)/4,  $2\nDATA ·shiftLeftInt32+64+12(SB)/4, $7\nDATA ·shiftLeftInt32+64+16(SB)/4, $4\nDATA ·shiftLeftInt32+64+20(SB)/4, $1\nDATA ·shiftLeftInt32+64+24(SB)/4, $6\nDATA ·shiftLeftInt32+64+28(SB)/4, $3\n\nDATA ·shiftLeftInt32+96+0(SB)/4,  $0\nDATA ·shiftLeftInt32+96+4(SB)/4,  $4\nDATA ·shiftLeftInt32+96+8(SB)/4,  $0\nDATA ·shiftLeftInt32+96+12(SB)/4, $4\nDATA ·shiftLeftInt32+96+16(SB)/4, $0\nDATA ·shiftLeftInt32+96+20(SB)/4, $4\nDATA ·shiftLeftInt32+96+24(SB)/4, $0\nDATA ·shiftLeftInt32+96+28(SB)/4, $4\n\nDATA ·shiftLeftInt32+128+0(SB)/4,  $0\nDATA ·shiftLeftInt32+128+4(SB)/4,  $3\nDATA ·shiftLeftInt32+128+8(SB)/4,  $6\nDATA ·shiftLeftInt32+128+12(SB)/4, $1\nDATA ·shiftLeftInt32+128+16(SB)/4, $4\nDATA ·shiftLeftInt32+128+20(SB)/4, $7\nDATA ·shiftLeftInt32+128+24(SB)/4, $2\nDATA ·shiftLeftInt32+128+28(SB)/4, $5\n\nDATA ·shiftLeftInt32+160+0(SB)/4,  $0\nDATA ·shiftLeftInt32+160+4(SB)/4,  $2\nDATA ·shiftLeftInt32+160+8(SB)/4,  $4\nDATA ·shiftLeftInt32+160+12(SB)/4, $6\nDATA ·shiftLeftInt32+160+16(SB)/4, $0\nDATA ·shiftLeftInt32+160+20(SB)/4, $2\nDATA ·shiftLeftInt32+160+24(SB)/4, $4\nDATA ·shiftLeftInt32+160+28(SB)/4, $6\n\nDATA ·shiftLeftInt32+192+0(SB)/4,  $0\nDATA ·shiftLeftInt32+192+4(SB)/4,  $1\nDATA ·shiftLeftInt32+192+8(SB)/4,  $2\nDATA ·shiftLeftInt32+192+12(SB)/4, $3\nDATA ·shiftLeftInt32+192+16(SB)/4, 
$4\nDATA ·shiftLeftInt32+192+20(SB)/4, $5\nDATA ·shiftLeftInt32+192+24(SB)/4, $6\nDATA ·shiftLeftInt32+192+28(SB)/4, $7\n\nDATA ·shiftLeftInt32+224+0(SB)/4,  $0\nDATA ·shiftLeftInt32+224+4(SB)/4,  $0\nDATA ·shiftLeftInt32+224+8(SB)/4,  $0\nDATA ·shiftLeftInt32+224+12(SB)/4, $0\nDATA ·shiftLeftInt32+224+16(SB)/4, $0\nDATA ·shiftLeftInt32+224+20(SB)/4, $0\nDATA ·shiftLeftInt32+224+24(SB)/4, $0\nDATA ·shiftLeftInt32+224+28(SB)/4, $0\n"
  },
  {
    "path": "internal/bitpack/pack.go",
    "content": "package bitpack\n\nimport (\n\t\"encoding/binary\"\n)\n\n// PackInt32 packs values from src to dst, each value is packed into the given\n// bit width regardless of how many bits are needed to represent it.\n//\n// The function panics if dst is too short to hold the bit packed values.\nfunc PackInt32(dst []byte, src []int32, bitWidth uint) {\n\tassertPack(dst, len(src), bitWidth)\n\tpackInt32(dst, src, bitWidth)\n}\n\nfunc packInt32(dst []byte, src []int32, bitWidth uint) {\n\tn := ByteCount(uint(len(src)) * bitWidth)\n\tb := dst[:n]\n\n\tfor i := range b {\n\t\tb[i] = 0\n\t}\n\n\tbitMask := uint32(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor _, value := range src {\n\t\ti := bitOffset / 32\n\t\tj := bitOffset % 32\n\n\t\tlo := binary.LittleEndian.Uint32(dst[(i+0)*4:])\n\t\thi := binary.LittleEndian.Uint32(dst[(i+1)*4:])\n\n\t\tlo |= (uint32(value) & bitMask) << j\n\t\thi |= (uint32(value) >> (32 - j))\n\n\t\tbinary.LittleEndian.PutUint32(dst[(i+0)*4:], lo)\n\t\tbinary.LittleEndian.PutUint32(dst[(i+1)*4:], hi)\n\n\t\tbitOffset += bitWidth\n\t}\n}\n\n// PackInt64 packs values from src to dst, each value is packed into the given\n// bit width regardless of how many bits are needed to represent it.\n//\n// The function panics if dst is too short to hold the bit packed values.\nfunc PackInt64(dst []byte, src []int64, bitWidth uint) {\n\tassertPack(dst, len(src), bitWidth)\n\tpackInt64(dst, src, bitWidth)\n}\n\nfunc packInt64(dst []byte, src []int64, bitWidth uint) {\n\tn := ByteCount(uint(len(src)) * bitWidth)\n\tb := dst[:n]\n\n\tfor i := range b {\n\t\tb[i] = 0\n\t}\n\n\tbitMask := uint64(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor _, value := range src {\n\t\ti := bitOffset / 64\n\t\tj := bitOffset % 64\n\n\t\tlo := binary.LittleEndian.Uint64(dst[(i+0)*8:])\n\t\thi := binary.LittleEndian.Uint64(dst[(i+1)*8:])\n\n\t\tlo |= (uint64(value) & bitMask) << j\n\t\thi |= (uint64(value) >> (64 - 
j))\n\n\t\tbinary.LittleEndian.PutUint64(dst[(i+0)*8:], lo)\n\t\tbinary.LittleEndian.PutUint64(dst[(i+1)*8:], hi)\n\n\t\tbitOffset += bitWidth\n\t}\n}\n\nfunc assertPack(dst []byte, count int, bitWidth uint) {\n\t_ = dst[:ByteCount(bitWidth*uint(count))]\n}\n"
  },
  {
    "path": "internal/bitpack/unpack.go",
    "content": "package bitpack\n\n// PaddingInt32 is the padding expected to exist after the end of input buffers\n// for the UnpackInt32 algorithm to avoid reading beyond the end of the input.\nconst PaddingInt32 = 16\n\n// PaddingInt64 is the padding expected to exist after the end of input buffers\n// for the UnpackInt32 algorithm to avoid reading beyond the end of the input.\nconst PaddingInt64 = 32\n\n// UnpackInt32 unpacks 32 bit integers from src to dst.\n//\n// The function unpacked len(dst) integers, it panics if src is too short to\n// contain len(dst) values of the given bit width.\nfunc UnpackInt32(dst []int32, src []byte, bitWidth uint) {\n\t_ = src[:ByteCount(bitWidth*uint(len(dst))+8*PaddingInt32)]\n\tunpackInt32(dst, src, bitWidth)\n}\n\n// UnpackInt64 unpacks 64 bit integers from src to dst.\n//\n// The function unpacked len(dst) integers, it panics if src is too short to\n// contain len(dst) values of the given bit width.\nfunc UnpackInt64(dst []int64, src []byte, bitWidth uint) {\n\t_ = src[:ByteCount(bitWidth*uint(len(dst))+8*PaddingInt64)]\n\tunpackInt64(dst, src, bitWidth)\n}\n"
  },
  {
    "path": "internal/bitpack/unpack_int32_amd64.go",
    "content": "//go:build !purego\n\npackage bitpack\n\nimport (\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"golang.org/x/sys/cpu\"\n)\n\n//go:noescape\nfunc unpackInt32Default(dst []int32, src []byte, bitWidth uint)\n\n//go:noescape\nfunc unpackInt32x1to16bitsAVX2(dst []int32, src []byte, bitWidth uint)\n\n//go:noescape\nfunc unpackInt32x17to26bitsAVX2(dst []int32, src []byte, bitWidth uint)\n\n//go:noescape\nfunc unpackInt32x27to31bitsAVX2(dst []int32, src []byte, bitWidth uint)\n\nfunc unpackInt32(dst []int32, src []byte, bitWidth uint) {\n\thasAVX2 := cpu.X86.HasAVX2\n\tswitch {\n\tcase hasAVX2 && bitWidth <= 16:\n\t\tunpackInt32x1to16bitsAVX2(dst, src, bitWidth)\n\tcase hasAVX2 && bitWidth <= 26:\n\t\tunpackInt32x17to26bitsAVX2(dst, src, bitWidth)\n\tcase hasAVX2 && bitWidth <= 31:\n\t\tunpackInt32x27to31bitsAVX2(dst, src, bitWidth)\n\tcase bitWidth == 32:\n\t\tcopy(dst, unsafecast.BytesToInt32(src))\n\tdefault:\n\t\tunpackInt32Default(dst, src, bitWidth)\n\t}\n}\n"
  },
  {
    "path": "internal/bitpack/unpack_int32_amd64.s",
    "content": "//go:build !purego\n\n#include \"funcdata.h\"\n#include \"textflag.h\"\n\n// func unpackInt32Default(dst []int32, src []byte, bitWidth uint)\nTEXT ·unpackInt32Default(SB), NOSPLIT, $0-56\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), DX\n    MOVQ src_base+24(FP), BX\n    MOVQ bitWidth+48(FP), CX\n\n    MOVQ $1, R8 // bitMask = (1 << bitWidth) - 1\n    SHLQ CX, R8\n    DECQ R8\n    MOVQ CX, R9 // bitWidth\n\n    XORQ DI, DI // bitOffset\n    XORQ SI, SI // index\n    JMP test\nloop:\n    MOVQ DI, R10\n    MOVQ DI, CX\n    SHRQ $5, R10      // i = bitOffset / 32\n    ANDQ $0b11111, CX // j = bitOffset % 32\n\n    MOVL (BX)(R10*4), R11\n    MOVL R8, R12  // d = bitMask\n    SHLL CX, R12  // d = d << j\n    ANDL R12, R11 // d = src[i] & d\n    SHRL CX, R11  // d = d >> j\n\n    MOVL CX, R13\n    ADDL R9, R13\n    CMPL R13, $32\n    JBE next // j+bitWidth <= 32 ?\n\n    MOVL 4(BX)(R10*4), R14\n    MOVL CX, R12\n    MOVL $32, CX\n    SUBL R12, CX  // k = 32 - j\n    MOVL R8, R12  // c = bitMask\n    SHRL CX, R12  // c = c >> k\n    ANDL R12, R14 // c = src[i+1] & c\n    SHLL CX, R14  // c = c << k\n    ORL R14, R11  // d = d | c\nnext:\n    MOVL R11, (AX)(SI*4) // dst[n] = d\n    ADDQ R9, DI          // bitOffset += bitWidth\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\n    RET\n\n// -----------------------------------------------------------------------------\n// The unpack* functions below are adaptations of the algorithms\n// described in \"Decoding billions of integers per second through vectorization\"\n// from D. Lemire & L. Boytsov, the following changes were made:\n//\n// - The paper described two methods for decoding integers called \"horizontal\"\n//   and \"vertical\". 
The \"horizontal\" version is the one that applies the best\n//   to the bit packing done in the Parquet delta encoding; however, it also\n//   differs in some ways, many compression techniques discussed in the paper\n//   are not implemented in the Parquet format.\n//\n// - The paper focuses on implementations based on SSE instructions, which\n//   describes how to use PMULLD to emulate the lack of variable bit shift\n//   for packed integers. Our version of the bit unpacking algorithms here\n//   uses AVX2 and can perform variable bit shifts using VPSRLVD, which yields\n//   better throughput since the instruction latency is a single CPU cycle,\n//   vs 10 for VPMULLD.\n//\n// - The reference implementation at https://github.com/lemire/FastPFor/ uses\n//   specializations for each bit size, resulting in 32 unique functions.\n//   Our version here are more generic, we provide 3 variations of the\n//   algorithm for bit widths 1 to 16, 17 to 26, and 27 to 31 (unpacking 32\n//   bits values is a simple copy). In that regard, our implementation is\n//   somewhat an improvement over the reference, since it uses less code and\n//   less memory to hold the shuffle masks and shift tables.\n//\n// Technically, each specialization of our functions could be expressed by the\n// algorithm used for unpacking values of 27 to 31 bits. However, multiple steps\n// of the main loop can be removed for lower bit widths, providing up to ~35%\n// better throughput for smaller sizes. 
Since we expect delta encoding to often\n// result in bit packing values to smaller bit widths, the specializations are\n// worth the extra complexity.\n//\n// For more details, see: https://arxiv.org/pdf/1209.2137v5.pdf\n// -----------------------------------------------------------------------------\n\n// unpackInt32x1to16bitsAVX2 is the implementation of the bit unpacking\n// algorithm for inputs of bit width 1 to 16.\n//\n// In this version of the algorithm, we can perform a single memory load in each\n// loop iteration since we know that 8 values will fit in a single XMM register.\n//\n// func unpackInt32x1to16bitsAVX2(dst []int32, src []byte, bitWidth uint)\nTEXT ·unpackInt32x1to16bitsAVX2(SB), NOSPLIT, $56-56\n    NO_LOCAL_POINTERS\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), DX\n    MOVQ src_base+24(FP), BX\n    MOVQ bitWidth+48(FP), CX\n\n    CMPQ DX, $8\n    JB tail\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n    XORQ SI, SI\n\n    MOVQ $1, R8\n    SHLQ CX, R8\n    DECQ R8\n    MOVQ R8, X0\n    VPBROADCASTD X0, X0 // bitMask = (1 << bitWidth) - 1\n\n    MOVQ CX, R9\n    DECQ R9\n    SHLQ $5, R9 // 32 * (bitWidth - 1)\n\n    MOVQ CX, R10\n    DECQ R10\n    SHLQ $5, R10\n    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256\n\n    LEAQ ·shuffleInt32x1to16bits(SB), R11\n    VMOVDQA (R11)(R9*1), X1\n    VMOVDQA 16(R11)(R9*1), X2\n\n    LEAQ ·shiftRightInt32(SB), R12\n    VMOVDQA (R12)(R10*1), X3\n    VMOVDQA 16(R12)(R10*1), X4\nloop:\n    VMOVDQU (BX), X7\n\n    VPSHUFB X1, X7, X5\n    VPSHUFB X2, X7, X6\n\n    VPSRLVD X3, X5, X5\n    VPSRLVD X4, X6, X6\n\n    VPAND X0, X5, X5\n    VPAND X0, X6, X6\n\n    VMOVDQU X5, (AX)(SI*4)\n    VMOVDQU X6, 16(AX)(SI*4)\n\n    ADDQ CX, BX\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loop\n    VZEROUPPER\n\n    CMPQ SI, DX\n    JE done\n    LEAQ (AX)(SI*4), AX\n    SUBQ SI, DX\ntail:\n    MOVQ AX, dst_base-56(SP)\n    MOVQ DX, dst_len-48(SP)\n    MOVQ BX, src_base-32(SP)\n    MOVQ CX, bitWidth-8(SP)\n    CALL 
·unpackInt32Default(SB)\ndone:\n    RET\n\n// unpackInt32x17to26bitsAVX2 is the implementation of the bit unpacking\n// algorithm for inputs of bit width 17 to 26.\n//\n// In this version of the algorithm, we need to 32 bytes at each loop iteration\n// because 8 bit-packed values will span across two XMM registers.\n//\n// func unpackInt32x17to26bitsAVX2(dst []int32, src []byte, bitWidth uint)\nTEXT ·unpackInt32x17to26bitsAVX2(SB), NOSPLIT, $56-56\n    NO_LOCAL_POINTERS\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), DX\n    MOVQ src_base+24(FP), BX\n    MOVQ bitWidth+48(FP), CX\n\n    CMPQ DX, $8\n    JB tail\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n    XORQ SI, SI\n\n    MOVQ $1, R8\n    SHLQ CX, R8\n    DECQ R8\n    MOVQ R8, X0\n    VPBROADCASTD X0, X0\n\n    MOVQ CX, R9\n    SUBQ $17, R9\n    IMULQ $48, R9 // 48 * (bitWidth - 17)\n\n    MOVQ CX, R10\n    DECQ R10\n    SHLQ $5, R10\n    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256\n\n    LEAQ ·shuffleInt32x17to26bits(SB), R11\n    VMOVDQA (R11)(R9*1), X1\n    VMOVDQA 16(R11)(R9*1), X2\n    VMOVDQA 32(R11)(R9*1), X3\n\n    LEAQ ·shiftRightInt32(SB), R12\n    VMOVDQA (R12)(R10*1), X4\n    VMOVDQA 16(R12)(R10*1), X5\nloop:\n    VMOVDQU (BX), X6\n    VMOVDQU 16(BX), X7\n\n    VPSHUFB X1, X6, X8\n    VPSHUFB X2, X6, X9\n    VPSHUFB X3, X7, X10\n    VPOR X10, X9, X9\n\n    VPSRLVD X4, X8, X8\n    VPSRLVD X5, X9, X9\n\n    VPAND X0, X8, X8\n    VPAND X0, X9, X9\n\n    VMOVDQU X8, (AX)(SI*4)\n    VMOVDQU X9, 16(AX)(SI*4)\n\n    ADDQ CX, BX\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loop\n    VZEROUPPER\n\n    CMPQ SI, DX\n    JE done\n    LEAQ (AX)(SI*4), AX\n    SUBQ SI, DX\ntail:\n    MOVQ AX, dst_base-56(SP)\n    MOVQ DX, dst_len-48(SP)\n    MOVQ BX, src_base-32(SP)\n    MOVQ CX, bitWidth-8(SP)\n    CALL ·unpackInt32Default(SB)\ndone:\n    RET\n\n// unpackInt32x27to31bitsAVX2 is the implementation of the bit unpacking\n// algorithm for inputs of bit width 27 to 31.\n//\n// In this version of the 
algorithm the bit-packed values may span across up to\n// 5 bytes. The simpler approach for smaller bit widths where we could perform a\n// single shuffle + shift to unpack the values do not work anymore.\n//\n// Values are unpacked in two steps: the first one extracts lower bits which are\n// shifted RIGHT to align on the beginning of 32 bit words, the second extracts\n// upper bits which are shifted LEFT to be moved to the end of the 32 bit words.\n//\n// The amount of LEFT shifts is always \"8 minus the amount of RIGHT shift\".\n//\n// func unpackInt32x27to31bitsAVX2(dst []int32, src []byte, bitWidth uint)\nTEXT ·unpackInt32x27to31bitsAVX2(SB), NOSPLIT, $56-56\n    NO_LOCAL_POINTERS\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), DX\n    MOVQ src_base+24(FP), BX\n    MOVQ bitWidth+48(FP), CX\n\n    CMPQ DX, $8\n    JB tail\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n    XORQ SI, SI\n\n    MOVQ $1, R8\n    SHLQ CX, R8\n    DECQ R8\n    MOVQ R8, X0\n    VPBROADCASTD X0, X0\n\n    MOVQ CX, R9\n    SUBQ $27, R9\n    IMULQ $80, R9 // (80 * (bitWidth - 27))\n\n    MOVQ CX, R10\n    DECQ R10\n    SHLQ $5, R10\n    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256\n\n    LEAQ ·shuffleInt32x27to31bits(SB), R11\n    VMOVDQA (R11)(R9*1), X1\n    VMOVDQA 16(R11)(R9*1), X2\n    VMOVDQA 32(R11)(R9*1), X3\n    VMOVDQA 48(R11)(R9*1), X4\n    VMOVDQA 64(R11)(R9*1), X5\n\n    LEAQ ·shiftRightInt32(SB), R12\n    LEAQ ·shiftLeftInt32(SB), R13\n    VMOVDQA (R12)(R10*1), X6\n    VMOVDQA (R13)(R10*1), X7\n    VMOVDQA 16(R12)(R10*1), X8\n    VMOVDQA 16(R13)(R10*1), X9\nloop:\n    VMOVDQU (BX), X10\n    VMOVDQU 16(BX), X11\n\n    VPSHUFB X1, X10, X12\n    VPSHUFB X2, X10, X13\n    VPSHUFB X3, X10, X14\n    VPSHUFB X4, X11, X15\n    VPSHUFB X5, X11, X11\n\n    VPSRLVD X6, X12, X12\n    VPSLLVD X7, X13, X13\n    VPSRLVD X8, X14, X14\n    VPSRLVD X8, X15, X15\n    VPSLLVD X9, X11, X11\n\n    VPOR X13, X12, X12\n    VPOR X15, X14, X14\n    VPOR X11, X14, X14\n\n    VPAND X0, 
X12, X12\n    VPAND X0, X14, X14\n\n    VMOVDQU X12, (AX)(SI*4)\n    VMOVDQU X14, 16(AX)(SI*4)\n\n    ADDQ CX, BX\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loop\n    VZEROUPPER\n\n    CMPQ SI, DX\n    JE done\n    LEAQ (AX)(SI*4), AX\n    SUBQ SI, DX\ntail:\n    MOVQ AX, dst_base-56(SP)\n    MOVQ DX, dst_len-48(SP)\n    MOVQ BX, src_base-32(SP)\n    MOVQ CX, bitWidth-8(SP)\n    CALL ·unpackInt32Default(SB)\ndone:\n    RET\n"
  },
  {
    "path": "internal/bitpack/unpack_int32_purego.go",
    "content": "//go:build purego || !amd64\n\npackage bitpack\n\nimport (\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc unpackInt32(dst []int32, src []byte, bitWidth uint) {\n\tbits := unsafecast.BytesToUint32(src)\n\tbitMask := uint32(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor n := range dst {\n\t\ti := bitOffset / 32\n\t\tj := bitOffset % 32\n\t\td := (bits[i] & (bitMask << j)) >> j\n\t\tif j+bitWidth > 32 {\n\t\t\tk := 32 - j\n\t\t\td |= (bits[i+1] & (bitMask >> k)) << k\n\t\t}\n\t\tdst[n] = int32(d)\n\t\tbitOffset += bitWidth\n\t}\n}\n"
  },
  {
    "path": "internal/bitpack/unpack_int64_amd64.go",
    "content": "//go:build !purego\n\npackage bitpack\n\nimport (\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"golang.org/x/sys/cpu\"\n)\n\n//go:noescape\nfunc unpackInt64Default(dst []int64, src []byte, bitWidth uint)\n\n//go:noescape\nfunc unpackInt64x1to32bitsAVX2(dst []int64, src []byte, bitWidth uint)\n\nfunc unpackInt64(dst []int64, src []byte, bitWidth uint) {\n\thasAVX2 := cpu.X86.HasAVX2\n\tswitch {\n\tcase hasAVX2 && bitWidth <= 32:\n\t\tunpackInt64x1to32bitsAVX2(dst, src, bitWidth)\n\tcase bitWidth == 64:\n\t\tcopy(dst, unsafecast.BytesToInt64(src))\n\tdefault:\n\t\tunpackInt64Default(dst, src, bitWidth)\n\t}\n}\n"
  },
  {
    "path": "internal/bitpack/unpack_int64_amd64.s",
    "content": "//go:build !purego\n\n#include \"funcdata.h\"\n#include \"textflag.h\"\n\n// func unpackInt64Default(dst []int64, src []uint32, bitWidth uint)\nTEXT ·unpackInt64Default(SB), NOSPLIT, $0-56\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), DX\n    MOVQ src_base+24(FP), BX\n    MOVQ bitWidth+48(FP), CX\n\n    MOVQ $1, R8 // bitMask = (1 << bitWidth) - 1\n    SHLQ CX, R8, R8\n    DECQ R8\n    MOVQ CX, R9 // bitWidth\n\n    XORQ DI, DI // bitOffset\n    XORQ SI, SI // index\n    XORQ R10, R10\n    XORQ R11, R11\n    XORQ R14, R14\n    JMP test\nloop:\n    MOVQ DI, R10\n    MOVQ DI, CX\n    SHRQ $5, R10      // i = bitOffset / 32\n    ANDQ $0b11111, CX // j = bitOffset % 32\n\n    MOVLQZX (BX)(R10*4), R11\n    MOVQ R8, R12  // d = bitMask\n    SHLQ CX, R12  // d = d << j\n    ANDQ R12, R11 // d = src[i] & d\n    SHRQ CX, R11  // d = d >> j\n\n    MOVQ CX, R13\n    ADDQ R9, R13\n    CMPQ R13, $32\n    JBE next // j+bitWidth <= 32 ?\n    MOVQ CX, R15 // j\n\n    MOVLQZX 4(BX)(R10*4), R14\n    MOVQ $32, CX\n    SUBQ R15, CX  // k = 32 - j\n    MOVQ R8, R12  // c = bitMask\n    SHRQ CX, R12  // c = c >> k\n    ANDQ R12, R14 // c = src[i+1] & c\n    SHLQ CX, R14  // c = c << k\n    ORQ R14, R11  // d = d | c\n\n    CMPQ R13, $64\n    JBE next\n\n    MOVLQZX 8(BX)(R10*4), R14\n    MOVQ $64, CX\n    SUBQ R15, CX  // k = 64 - j\n    MOVQ R8, R12  // c = bitMask\n    SHRQ CX, R12  // c = c >> k\n    ANDQ R12, R14 // c = src[i+2] & c\n    SHLQ CX, R14  // c = c << k\n    ORQ R14, R11  // d = d | c\nnext:\n    MOVQ R11, (AX)(SI*8) // dst[n] = d\n    ADDQ R9, DI          // bitOffset += bitWidth\n    INCQ SI\ntest:\n    CMPQ SI, DX\n    JNE loop\n    RET\n\n// This bit unpacking function was inspired from the 32 bit version, but\n// adapted to account for the fact that eight 64 bit values span across\n// two YMM registers, and across lanes of YMM registers.\n//\n// Because of the two lanes of YMM registers, we cannot use the VPSHUFB\n// instruction to dispatch 
bytes of the input to the registers. Instead we use\n// the VPERMD instruction, which has higher latency but supports dispatching\n// bytes across register lanes. Measurable throughput gains remain despite the\n// algorithm running on a few more CPU cycles per loop.\n//\n// The initialization phase of this algorithm generates masks for\n// permutations and shifts used to decode the bit-packed values.\n//\n// The permutation masks are written to Y7 and Y8, and contain the results\n// of this formula:\n//\n//      temp[i] = (bitWidth * i) / 32\n//      mask[i] = temp[i] | ((temp[i] + 1) << 32)\n//\n// Since VPERMQ only supports reading the permutation combination from an\n// immediate value, we use VPERMD and generate permutation for pairs of two\n// consecutive 32 bit words, which is why we have the upper part of each 64\n// bit word set with (x+1)<<32.\n//\n// The masks for right shifts are written to Y5 and Y6, and computed with\n// this formula:\n//\n//      shift[i] = (bitWidth * i) - (32 * ((bitWidth * i) / 32))\n//\n// The amount to shift by is the number of values previously unpacked, offseted\n// by the byte count of 32 bit words that we read from first bits from.\n//\n// Technically the masks could be precomputed and declared in global tables;\n// however, declaring masks for all bit width is tedious and makes code\n// maintenance more costly for no measurable benefits on production workloads.\n//\n// func unpackInt64x1to32bitsAVX2(dst []int64, src []byte, bitWidth uint)\nTEXT ·unpackInt64x1to32bitsAVX2(SB), NOSPLIT, $56-56\n    NO_LOCAL_POINTERS\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), DX\n    MOVQ src_base+24(FP), BX\n    MOVQ bitWidth+48(FP), CX\n\n    CMPQ DX, $8\n    JB tail\n\n    MOVQ DX, DI\n    SHRQ $3, DI\n    SHLQ $3, DI\n    XORQ SI, SI\n\n    MOVQ $1, R8\n    SHLQ CX, R8\n    DECQ R8\n    MOVQ R8, X0\n    VPBROADCASTQ X0, Y0 // bitMask = (1 << bitWidth) - 1\n\n    VPCMPEQQ Y1, Y1, Y1\n    VPSRLQ $63, Y1, Y1  // [1,1,1,1]\n\n    
MOVQ CX, X2\n    VPBROADCASTQ X2, Y2 // [bitWidth]\n\n    VMOVDQU range0n7<>+0(SB), Y3  // [0,1,2,3]\n    VMOVDQU range0n7<>+32(SB), Y4 // [4,5,6,7]\n\n    VPMULLD Y2, Y3, Y5 // [bitWidth] * [0,1,2,3]\n    VPMULLD Y2, Y4, Y6 // [bitWidth] * [4,5,6,7]\n\n    VPSRLQ $5, Y5, Y7 // ([bitWidth] * [0,1,2,3]) / 32\n    VPSRLQ $5, Y6, Y8 // ([bitWidth] * [4,5,6,7]) / 32\n\n    VPSLLQ $5, Y7, Y9  // (([bitWidth] * [0,1,2,3]) / 32) * 32\n    VPSLLQ $5, Y8, Y10 // (([bitWidth] * [4,5,6,7]) / 32) * 32\n\n    VPADDQ Y1, Y7, Y11\n    VPADDQ Y1, Y8, Y12\n    VPSLLQ $32, Y11, Y11\n    VPSLLQ $32, Y12, Y12\n    VPOR Y11, Y7, Y7 // permutations[i] = [i | ((i + 1) << 32)]\n    VPOR Y12, Y8, Y8 // permutations[i] = [i | ((i + 1) << 32)]\n\n    VPSUBQ Y9, Y5, Y5 // shifts\n    VPSUBQ Y10, Y6, Y6\nloop:\n    VMOVDQU (BX), Y1\n\n    VPERMD Y1, Y7, Y2\n    VPERMD Y1, Y8, Y3\n\n    VPSRLVQ Y5, Y2, Y2\n    VPSRLVQ Y6, Y3, Y3\n\n    VPAND Y0, Y2, Y2\n    VPAND Y0, Y3, Y3\n\n    VMOVDQU Y2, (AX)(SI*8)\n    VMOVDQU Y3, 32(AX)(SI*8)\n\n    ADDQ CX, BX\n    ADDQ $8, SI\n    CMPQ SI, DI\n    JNE loop\n    VZEROUPPER\n\n    CMPQ SI, DX\n    JE done\n    LEAQ (AX)(SI*8), AX\n    SUBQ SI, DX\ntail:\n    MOVQ AX, dst_base-56(SP)\n    MOVQ DX, dst_len-48(SP)\n    MOVQ BX, src_base-32(SP)\n    MOVQ CX, bitWidth-8(SP)\n    CALL ·unpackInt64Default(SB)\ndone:\n    RET\n\nGLOBL range0n7<>(SB), RODATA|NOPTR, $64\nDATA range0n7<>+0(SB)/8,  $0\nDATA range0n7<>+8(SB)/8,  $1\nDATA range0n7<>+16(SB)/8, $2\nDATA range0n7<>+24(SB)/8, $3\nDATA range0n7<>+32(SB)/8, $4\nDATA range0n7<>+40(SB)/8, $5\nDATA range0n7<>+48(SB)/8, $6\nDATA range0n7<>+56(SB)/8, $7\n"
  },
  {
    "path": "internal/bitpack/unpack_int64_purego.go",
    "content": "//go:build purego || !amd64\n\npackage bitpack\n\nimport \"github.com/segmentio/parquet-go/internal/unsafecast\"\n\nfunc unpackInt64(dst []int64, src []byte, bitWidth uint) {\n\tbits := unsafecast.BytesToUint32(src)\n\tbitMask := uint64(1<<bitWidth) - 1\n\tbitOffset := uint(0)\n\n\tfor n := range dst {\n\t\ti := bitOffset / 32\n\t\tj := bitOffset % 32\n\t\td := (uint64(bits[i]) & (bitMask << j)) >> j\n\t\tif j+bitWidth > 32 {\n\t\t\tk := 32 - j\n\t\t\td |= (uint64(bits[i+1]) & (bitMask >> k)) << k\n\t\t\tif j+bitWidth > 64 {\n\t\t\t\tk := 64 - j\n\t\t\t\td |= (uint64(bits[i+2]) & (bitMask >> k)) << k\n\t\t\t}\n\t\t}\n\t\tdst[n] = int64(d)\n\t\tbitOffset += bitWidth\n\t}\n}\n"
  },
  {
    "path": "internal/bitpack/unpack_test.go",
    "content": "package bitpack_test\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/bitpack\"\n)\n\nconst (\n\tblockSize = 128\n)\n\nfunc TestUnpackInt32(t *testing.T) {\n\tfor bitWidth := uint(1); bitWidth <= 32; bitWidth++ {\n\t\tt.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(t *testing.T) {\n\t\t\tblock := [blockSize]int32{}\n\t\t\tbitMask := int32(1<<bitWidth) - 1\n\n\t\t\tprng := rand.New(rand.NewSource(0))\n\t\t\tfor i := range block {\n\t\t\t\tblock[i] = prng.Int31() & bitMask\n\t\t\t}\n\n\t\t\tsize := (blockSize * bitWidth) / 8\n\t\t\tbuf := make([]byte, size+bitpack.PaddingInt32)\n\t\t\tbitpack.PackInt32(buf, block[:], bitWidth)\n\n\t\t\tsrc := buf[:size]\n\t\t\tdst := make([]int32, blockSize)\n\n\t\t\tfor n := 1; n <= blockSize; n++ {\n\t\t\t\tfor i := range dst {\n\t\t\t\t\tdst[i] = 0\n\t\t\t\t}\n\n\t\t\t\tbitpack.UnpackInt32(dst[:n], src, bitWidth)\n\n\t\t\t\tif !reflect.DeepEqual(block[:n], dst[:n]) {\n\t\t\t\t\tt.Fatalf(\"values mismatch for length=%d\\nwant: %v\\ngot:  %v\", n, block[:n], dst[:n])\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc TestUnpackInt64(t *testing.T) {\n\tfor bitWidth := uint(1); bitWidth <= 63; bitWidth++ {\n\t\tt.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(t *testing.T) {\n\t\t\tblock := [blockSize]int64{}\n\t\t\tbitMask := int64(1<<bitWidth) - 1\n\n\t\t\tprng := rand.New(rand.NewSource(0))\n\t\t\tfor i := range block {\n\t\t\t\tblock[i] = prng.Int63() & bitMask\n\t\t\t}\n\n\t\t\tsize := (blockSize * bitWidth) / 8\n\t\t\tbuf := make([]byte, size+bitpack.PaddingInt64)\n\t\t\tbitpack.PackInt64(buf, block[:], bitWidth)\n\n\t\t\tsrc := buf[:size]\n\t\t\tdst := make([]int64, blockSize)\n\n\t\t\tfor n := 1; n <= blockSize; n++ {\n\t\t\t\tfor i := range dst {\n\t\t\t\t\tdst[i] = 0\n\t\t\t\t}\n\n\t\t\t\tbitpack.UnpackInt64(dst[:n], src, bitWidth)\n\n\t\t\t\tif !reflect.DeepEqual(block[:n], dst[:n]) {\n\t\t\t\t\tt.Fatalf(\"values mismatch for length=%d\\nwant: %v\\ngot:  %v\", n, block[:n], dst[:n])\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkUnpackInt32(b *testing.B) {\n\tfor bitWidth := uint(1); bitWidth <= 32; bitWidth++ {\n\t\tblock := [blockSize]int32{}\n\t\tbuf := [4*blockSize + bitpack.PaddingInt32]byte{}\n\t\tbitpack.PackInt32(buf[:], block[:], bitWidth)\n\n\t\tb.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(b *testing.B) {\n\t\t\tdst := block[:]\n\t\t\tsrc := buf[:]\n\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\tbitpack.UnpackInt32(dst, src, bitWidth)\n\t\t\t}\n\n\t\t\tb.SetBytes(4 * blockSize)\n\t\t})\n\t}\n}\n\nfunc BenchmarkUnpackInt64(b *testing.B) {\n\tfor bitWidth := uint(1); bitWidth <= 64; bitWidth++ {\n\t\tblock := [blockSize]int64{}\n\t\tbuf := [8*blockSize + bitpack.PaddingInt64]byte{}\n\t\tbitpack.PackInt64(buf[:], block[:], bitWidth)\n\n\t\tb.Run(fmt.Sprintf(\"bitWidth=%d\", bitWidth), func(b *testing.B) {\n\t\t\tdst := block[:]\n\t\t\tsrc := buf[:]\n\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\tbitpack.UnpackInt64(dst, src, bitWidth)\n\t\t\t}\n\n\t\t\tb.SetBytes(8 * blockSize)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "internal/bytealg/broadcast_amd64.go",
    "content": "//go:build !purego\n\npackage bytealg\n\n//go:noescape\nfunc broadcastAVX2(dst []byte, src byte)\n\n// Broadcast writes the src value to all bytes of dst.\nfunc Broadcast(dst []byte, src byte) {\n\tif len(dst) >= 8 && hasAVX2 {\n\t\tbroadcastAVX2(dst, src)\n\t} else {\n\t\tfor i := range dst {\n\t\t\tdst[i] = src\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "internal/bytealg/broadcast_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func broadcastAVX2(dst []byte, src byte)\nTEXT ·broadcastAVX2(SB), NOSPLIT, $0-25\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), BX\n    MOVBQZX src+24(FP), CX\n\n    CMPQ BX, $8\n    JBE test\n\n    CMPQ BX, $64\n    JB init8\n\n    XORQ SI, SI\n    MOVQ BX, DX\n    SHRQ $6, DX\n    SHLQ $6, DX\n    MOVQ CX, X0\n    VPBROADCASTB X0, Y0\nloop64:\n    VMOVDQU Y0, (AX)(SI*1)\n    VMOVDQU Y0, 32(AX)(SI*1)\n    ADDQ $64, SI\n    CMPQ SI, DX\n    JNE loop64\n    VMOVDQU Y0, -64(AX)(BX*1)\n    VMOVDQU Y0, -32(AX)(BX*1)\n    VZEROUPPER\n    RET\n\ninit8:\n    MOVQ $0x0101010101010101, R8\n    IMULQ R8, CX\nloop8:\n    MOVQ CX, -8(AX)(BX*1)\n    SUBQ $8, BX\n    CMPQ BX, $8\n    JAE loop8\n    MOVQ CX, (AX)\n    RET\n\nloop:\n    MOVB CX, -1(AX)(BX*1)\n    DECQ BX\ntest:\n    CMPQ BX, $0\n    JNE loop\n    RET\n"
  },
  {
    "path": "internal/bytealg/broadcast_purego.go",
    "content": "//go:build purego || !amd64\n\npackage bytealg\n\nfunc Broadcast(dst []byte, src byte) {\n\tfor i := range dst {\n\t\tdst[i] = src\n\t}\n}\n"
  },
  {
    "path": "internal/bytealg/broadcast_test.go",
    "content": "package bytealg_test\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/bytealg\"\n)\n\nfunc TestBroadcast(t *testing.T) {\n\tconst N = 1_000_000\n\tbuffer := make([]byte, N)\n\n\tfor n := 1; n <= N; n = (n * 2) + 1 {\n\t\tt.Run(fmt.Sprintf(\"size=%d\", n), func(t *testing.T) {\n\t\t\tb := buffer[:n]\n\n\t\t\tfor i := range b {\n\t\t\t\tb[i] = 0\n\t\t\t}\n\n\t\t\tbytealg.Broadcast(b, 42)\n\n\t\t\tfor i, c := range b {\n\t\t\t\tif c != 42 {\n\t\t\t\t\tt.Fatalf(\"byte at index %d has value %d\", i, c)\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkBroadcast(b *testing.B) {\n\tfor _, size := range []int{0, 10, 100, 1000, 10_000} {\n\t\tb.Run(fmt.Sprintf(\"size=%d\", size), func(b *testing.B) {\n\t\t\tdata := make([]byte, size)\n\n\t\t\tfor i := 0; i < b.N; i++ {\n\t\t\t\tbytealg.Broadcast(data, 1)\n\t\t\t}\n\n\t\t\tb.SetBytes(int64(size))\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "internal/bytealg/bytealg.go",
    "content": "// Package bytealg contains optimized algorithms operating on byte slices.\npackage bytealg\n"
  },
  {
    "path": "internal/bytealg/bytealg_amd64.go",
    "content": "//go:build !purego\n\npackage bytealg\n\nimport \"golang.org/x/sys/cpu\"\n\nvar (\n\thasAVX2 = cpu.X86.HasAVX2\n\t// These use AVX-512 instructions in the countByte algorithm relies\n\t// operations that are available in the AVX512BW extension:\n\t// * VPCMPUB\n\t// * KMOVQ\n\t//\n\t// Note that the function will fallback to an AVX2 version if those\n\t// instructions are not available.\n\thasAVX512Count = cpu.X86.HasAVX512VL && cpu.X86.HasAVX512BW\n)\n"
  },
  {
    "path": "internal/bytealg/bytealg_test.go",
    "content": "package bytealg_test\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n)\n\nvar benchmarkBufferSizes = [...]int{\n\t4 * 1024,\n\t256 * 1024,\n\t2048 * 1024,\n}\n\nfunc forEachBenchmarkBufferSize(b *testing.B, f func(*testing.B, int)) {\n\tfor _, bufferSize := range benchmarkBufferSizes {\n\t\tb.Run(fmt.Sprintf(\"%dKiB\", bufferSize/1024), func(b *testing.B) {\n\t\t\tb.SetBytes(int64(bufferSize))\n\t\t\tf(b, bufferSize)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "internal/bytealg/count_amd64.go",
    "content": "//go:build !purego\n\npackage bytealg\n\n// This function is similar to using the standard bytes.Count function with a\n// one-byte separator. However, the implementation makes use of AVX-512 when\n// possible, which yields measurable throughput improvements:\n//\n// name       old time/op    new time/op    delta\n// CountByte    82.5ns ± 0%    43.9ns ± 0%  -46.74%  (p=0.000 n=10+10)\n//\n// name       old speed      new speed      delta\n// CountByte  49.6GB/s ± 0%  93.2GB/s ± 0%  +87.74%  (p=0.000 n=10+10)\n//\n// On systems that do not have AVX-512, the AVX2 version of the code is also\n// optimized to make use of multiple register lanes, which gives a bit better\n// throughput than the standard library function:\n//\n// name       old time/op    new time/op    delta\n// CountByte    82.5ns ± 0%    61.0ns ± 0%  -26.04%  (p=0.000 n=10+10)\n//\n// name       old speed      new speed      delta\n// CountByte  49.6GB/s ± 0%  67.1GB/s ± 0%  +35.21%  (p=0.000 n=10+10)\n//\n//go:noescape\nfunc Count(data []byte, value byte) int\n"
  },
  {
    "path": "internal/bytealg/count_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func Count(data []byte, value byte) int\nTEXT ·Count(SB), NOSPLIT, $0-40\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    MOVB value+24(FP), BX\n    MOVQ CX, DX // len\n    ADDQ AX, CX // end\n    XORQ SI, SI // count\n\n    CMPQ DX, $256\n    JB test\n\n    CMPB ·hasAVX2(SB), $0\n    JE test\n\n    XORQ R12, R12\n    XORQ R13, R13\n    XORQ R14, R14\n    XORQ R15, R15\n\n    CMPB ·hasAVX512Count(SB), $0\n    JE initAVX2\n\n    SHRQ $8, DX\n    SHLQ $8, DX\n    ADDQ AX, DX\n    VPBROADCASTB BX, Z0\nloopAVX512:\n    VMOVDQU64 (AX), Z1\n    VMOVDQU64 64(AX), Z2\n    VMOVDQU64 128(AX), Z3\n    VMOVDQU64 192(AX), Z4\n    VPCMPUB $0, Z0, Z1, K1\n    VPCMPUB $0, Z0, Z2, K2\n    VPCMPUB $0, Z0, Z3, K3\n    VPCMPUB $0, Z0, Z4, K4\n    KMOVQ K1, R8\n    KMOVQ K2, R9\n    KMOVQ K3, R10\n    KMOVQ K4, R11\n    POPCNTQ R8, R8\n    POPCNTQ R9, R9\n    POPCNTQ R10, R10\n    POPCNTQ R11, R11\n    ADDQ R8, R12\n    ADDQ R9, R13\n    ADDQ R10, R14\n    ADDQ R11, R15\n    ADDQ $256, AX\n    CMPQ AX, DX\n    JNE loopAVX512\n    ADDQ R12, R13\n    ADDQ R14, R15\n    ADDQ R13, SI\n    ADDQ R15, SI\n    JMP doneAVX\n\ninitAVX2:\n    SHRQ $6, DX\n    SHLQ $6, DX\n    ADDQ AX, DX\n    VPBROADCASTB value+24(FP), Y0\nloopAVX2:\n    VMOVDQU (AX), Y1\n    VMOVDQU 32(AX), Y2\n    VPCMPEQB Y0, Y1, Y1\n    VPCMPEQB Y0, Y2, Y2\n    VPMOVMSKB Y1, R12\n    VPMOVMSKB Y2, R13\n    POPCNTL R12, R12\n    POPCNTL R13, R13\n    ADDQ R12, R14\n    ADDQ R13, R15\n    ADDQ $64, AX\n    CMPQ AX, DX\n    JNE loopAVX2\n    ADDQ R14, SI\n    ADDQ R15, SI\n\ndoneAVX:\n    VZEROUPPER\n    JMP test\n\nloop:\n    MOVQ SI, DI\n    INCQ DI\n    MOVB (AX), R8\n    CMPB BX, R8\n    CMOVQEQ DI, SI\n    INCQ AX\ntest:\n    CMPQ AX, CX\n    JNE loop\ndone:\n    MOVQ SI, ret+32(FP)\n    RET\n"
  },
  {
    "path": "internal/bytealg/count_purego.go",
    "content": "//go:build purego || !amd64\n\npackage bytealg\n\nimport \"bytes\"\n\nfunc Count(data []byte, value byte) int {\n\treturn bytes.Count(data, []byte{value})\n}\n"
  },
  {
    "path": "internal/bytealg/count_test.go",
    "content": "package bytealg_test\n\nimport (\n\t\"bytes\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/bytealg\"\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nfunc TestCount(t *testing.T) {\n\terr := quick.Check(func(data []byte) bool {\n\t\tdata = bytes.Repeat(data, 8)\n\t\tfor _, c := range data {\n\t\t\tn1 := bytes.Count(data, []byte{c})\n\t\t\tn2 := bytealg.Count(data, c)\n\t\t\tif n1 != n2 {\n\t\t\t\tt.Errorf(\"got=%d want=%d\", n2, n1)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc BenchmarkCount(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tdata := make([]byte, bufferSize)\n\t\tfor i := range data {\n\t\t\tdata[i] = byte(i)\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tbytealg.Count(data, 0)\n\t\t}\n\t})\n}\n"
  },
  {
    "path": "internal/debug/debug.go",
    "content": "package debug\n\nimport (\n\t\"encoding/hex\"\n\t\"fmt\"\n\t\"io\"\n\t\"log\"\n\t\"os\"\n\t\"strconv\"\n\t\"strings\"\n)\n\nfunc ReaderAt(reader io.ReaderAt, prefix string) io.ReaderAt {\n\treturn &ioReaderAt{\n\t\treader: reader,\n\t\tprefix: prefix,\n\t}\n}\n\ntype ioReaderAt struct {\n\treader io.ReaderAt\n\tprefix string\n}\n\nfunc (d *ioReaderAt) ReadAt(b []byte, off int64) (int, error) {\n\tn, err := d.reader.ReadAt(b, off)\n\tfmt.Printf(\"%s: Read(%d) @%d => %d %v \\n%s\\n\", d.prefix, len(b), off, n, err, hex.Dump(b[:n]))\n\treturn n, err\n}\n\nfunc Reader(reader io.Reader, prefix string) io.Reader {\n\treturn &ioReader{\n\t\treader: reader,\n\t\tprefix: prefix,\n\t}\n}\n\ntype ioReader struct {\n\treader io.Reader\n\tprefix string\n\toffset int64\n}\n\nfunc (d *ioReader) Read(b []byte) (int, error) {\n\tn, err := d.reader.Read(b)\n\tfmt.Printf(\"%s: Read(%d) @%d => %d %v \\n%s\\n\", d.prefix, len(b), d.offset, n, err, hex.Dump(b[:n]))\n\td.offset += int64(n)\n\treturn n, err\n}\n\nfunc Writer(writer io.Writer, prefix string) io.Writer {\n\treturn &ioWriter{\n\t\twriter: writer,\n\t\tprefix: prefix,\n\t}\n}\n\ntype ioWriter struct {\n\twriter io.Writer\n\tprefix string\n\toffset int64\n}\n\nfunc (d *ioWriter) Write(b []byte) (int, error) {\n\tn, err := d.writer.Write(b)\n\tfmt.Printf(\"%s: Write(%d) @%d => %d %v \\n  %q\\n\", d.prefix, len(b), d.offset, n, err, b[:n])\n\td.offset += int64(n)\n\treturn n, err\n}\n\nvar (\n\tTRACEBUF int\n)\n\nfunc init() {\n\tfor _, arg := range strings.Split(os.Getenv(\"PARQUETGODEBUG\"), \",\") {\n\t\tk := arg\n\t\tv := \"\"\n\t\ti := strings.IndexByte(arg, '=')\n\t\tif i >= 0 {\n\t\t\tk, v = arg[:i], arg[i+1:]\n\t\t}\n\t\tvar err error\n\t\tswitch k {\n\t\tcase \"\":\n\t\t\t// ignore empty entries\n\t\tcase \"tracebuf\":\n\t\t\tif TRACEBUF, err = strconv.Atoi(v); err != nil {\n\t\t\t\tlog.Printf(\"PARQUETGODEBUG: invalid value for tracebuf: %q\", 
v)\n\t\t\t}\n\t\tdefault:\n\t\t\tlog.Printf(\"PARQUETGODEBUG: unrecognized debug option: %q\", k)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "internal/debug/finalizer_off.go",
    "content": "//go:build debug\n\npackage debug\n\n// SetFinalizer is a no-op when the debug tag is specified.\nfunc SetFinalizer(interface{}, interface{}) {}\n"
  },
  {
    "path": "internal/debug/finalizer_on.go",
    "content": "//go:build !debug\n\npackage debug\n\nimport \"runtime\"\n\nfunc SetFinalizer(obj, finalizer interface{}) { runtime.SetFinalizer(obj, finalizer) }\n"
  },
  {
    "path": "internal/quick/quick.go",
    "content": "package quick\n\nimport (\n\t\"fmt\"\n\t\"math\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"strings\"\n\t\"time\"\n)\n\nvar DefaultConfig = Config{\n\tSizes: []int{\n\t\t0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n\t\t10, 11, 12, 13, 14, 15, 16, 17, 18, 19,\n\t\t20, 21, 22, 23, 24, 25, 26, 27, 28, 29,\n\t\t30, 31, 32, 33, 34, 35, 36, 37, 38, 39,\n\t\t99, 100, 101,\n\t\t127, 128, 129,\n\t\t255, 256, 257,\n\t\t1000, 1023, 1024, 1025,\n\t\t2000, 2095, 2048, 2049,\n\t\t4000, 4095, 4096, 4097,\n\t},\n\tSeed: 0,\n}\n\n// Check is inspired by the standard quick.Check package, but enhances the\n// API and tests arrays of larger sizes than the maximum of 50 hardcoded in\n// testing/quick.\nfunc Check(f interface{}) error {\n\treturn DefaultConfig.Check(f)\n}\n\ntype Config struct {\n\tSizes []int\n\tSeed  int64\n}\n\nfunc (c *Config) Check(f interface{}) error {\n\tv := reflect.ValueOf(f)\n\tr := rand.New(rand.NewSource(c.Seed))\n\tt := v.Type().In(0)\n\n\tmakeValue := MakeValueFuncOf(t.Elem())\n\tmakeArray := func(n int) reflect.Value {\n\t\tarray := reflect.MakeSlice(t, n, n)\n\t\tfor i := 0; i < n; i++ {\n\t\t\tmakeValue(array.Index(i), r)\n\t\t}\n\t\treturn array\n\t}\n\n\tif makeArray == nil {\n\t\tpanic(\"cannot run quick check on function with input of type \" + v.Type().In(0).String())\n\t}\n\n\tfor _, n := range c.Sizes {\n\t\tfor i := 0; i < 3; i++ {\n\t\t\tin := makeArray(n)\n\t\t\tok := v.Call([]reflect.Value{in})\n\t\t\tif !ok[0].Bool() {\n\t\t\t\treturn fmt.Errorf(\"test #%d: failed on input of size %d: %#v\\n\", i+1, n, in.Interface())\n\t\t\t}\n\t\t}\n\t}\n\treturn nil\n\n}\n\ntype MakeValueFunc func(reflect.Value, *rand.Rand)\n\nfunc MakeValueFuncOf(t reflect.Type) MakeValueFunc {\n\tswitch t {\n\tcase reflect.TypeOf(time.Time{}):\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\t// TODO: This is a hack to support the matching of times in a precision\n\t\t\t// other than nanosecond by generating times rounded to the second. 
A\n\t\t\t// better solution would be to update columns types to add a compare\n\t\t\t// function.\n\t\t\tsec := r.Int63n(2524608000) // 2050-01-01\n\t\t\tv.Set(reflect.ValueOf(time.Unix(sec, 0).UTC()))\n\t\t}\n\t}\n\n\tswitch t.Kind() {\n\tcase reflect.Bool:\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tv.SetBool((r.Int() % 2) != 0)\n\t\t}\n\n\tcase reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tv.SetInt(r.Int63n(math.MaxInt32))\n\t\t}\n\n\tcase reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tv.SetUint(r.Uint64())\n\t\t}\n\n\tcase reflect.Float32, reflect.Float64:\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tv.SetFloat(r.Float64())\n\t\t}\n\n\tcase reflect.String:\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tconst characters = \"1234567890qwertyuiopasdfghjklzxcvbnm\"\n\t\t\ts := new(strings.Builder)\n\t\t\tn := r.Intn(10)\n\t\t\tfor i := 0; i < n; i++ {\n\t\t\t\ts.WriteByte(characters[i])\n\t\t\t}\n\t\t\tv.SetString(s.String())\n\t\t}\n\n\tcase reflect.Array:\n\t\tmakeElem := MakeValueFuncOf(t.Elem())\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tfor i, n := 0, v.Len(); i < n; i++ {\n\t\t\t\tmakeElem(v.Index(i), r)\n\t\t\t}\n\t\t}\n\n\tcase reflect.Slice:\n\t\tswitch e := t.Elem(); e.Kind() {\n\t\tcase reflect.Uint8:\n\t\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\t\tb := make([]byte, r.Intn(50))\n\t\t\t\tr.Read(b)\n\t\t\t\tv.SetBytes(b)\n\t\t\t}\n\t\tdefault:\n\t\t\tmakeElem := MakeValueFuncOf(t.Elem())\n\t\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\t\tn := r.Intn(10)\n\t\t\t\ts := reflect.MakeSlice(t, n, n)\n\t\t\t\tfor i := 0; i < n; i++ {\n\t\t\t\t\tmakeElem(s.Index(i), r)\n\t\t\t\t}\n\t\t\t\tv.Set(s)\n\t\t\t}\n\t\t}\n\n\tcase reflect.Map:\n\t\tmakeKey := MakeValueFuncOf(t.Key())\n\t\tmakeElem := 
MakeValueFuncOf(t.Elem())\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tm := reflect.MakeMap(t)\n\t\t\tn := r.Intn(10)\n\t\t\tk := reflect.New(t.Key()).Elem()\n\t\t\te := reflect.New(t.Elem()).Elem()\n\t\t\tfor i := 0; i < n; i++ {\n\t\t\t\tmakeKey(k, r)\n\t\t\t\tmakeElem(e, r)\n\t\t\t\tm.SetMapIndex(k, e)\n\t\t\t}\n\t\t\tv.Set(m)\n\t\t}\n\n\tcase reflect.Struct:\n\t\tfields := make([]reflect.StructField, 0, t.NumField())\n\t\tmakeValues := make([]MakeValueFunc, 0, cap(fields))\n\t\tfor i, n := 0, cap(fields); i < n; i++ {\n\t\t\tif f := t.Field(i); f.PkgPath == \"\" { // skip unexported fields\n\t\t\t\tfields = append(fields, f)\n\t\t\t\tmakeValues = append(makeValues, MakeValueFuncOf(f.Type))\n\t\t\t}\n\t\t}\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tfor i := range fields {\n\t\t\t\tmakeValues[i](v.FieldByIndex(fields[i].Index), r)\n\t\t\t}\n\t\t}\n\n\tcase reflect.Ptr:\n\t\tt = t.Elem()\n\t\tmakeValue := MakeValueFuncOf(t)\n\t\treturn func(v reflect.Value, r *rand.Rand) {\n\t\t\tv.Set(reflect.New(t))\n\t\t\tmakeValue(v.Elem(), r)\n\t\t}\n\n\tdefault:\n\t\tpanic(\"quick.Check does not support test values of type \" + t.String())\n\t}\n}\n"
  },
  {
    "path": "internal/unsafecast/unsafecast_go17.go",
    "content": "//go:build !go1.18\n\npackage unsafecast\n\nimport (\n\t\"reflect\"\n\t\"unsafe\"\n)\n\nfunc AddressOfBytes(data []byte) *byte {\n\treturn *(**byte)(unsafe.Pointer(&data))\n}\n\nfunc AddressOfString(data string) *byte {\n\treturn *(**byte)(unsafe.Pointer(&data))\n}\n\nfunc PointerOfValue(value reflect.Value) unsafe.Pointer {\n\treturn (*[2]unsafe.Pointer)(unsafe.Pointer(&value))[1]\n}\n\nfunc BoolToBytes(data []bool) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Int8ToBytes(data []int8) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Int16ToBytes(data []int16) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 2*len(data))\n}\n\nfunc Int32ToBytes(data []int32) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 4*len(data))\n}\n\nfunc Int64ToBytes(data []int64) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 8*len(data))\n}\n\nfunc Float32ToBytes(data []float32) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 4*len(data))\n}\n\nfunc Float64ToBytes(data []float64) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 8*len(data))\n}\n\nfunc Int16ToUint16(data []int16) []uint16 {\n\treturn unsafe.Slice(*(**uint16)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Int32ToUint32(data []int32) []uint32 {\n\treturn unsafe.Slice(*(**uint32)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Int64ToUint64(data []int64) []uint64 {\n\treturn unsafe.Slice(*(**uint64)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Float32ToUint32(data []float32) []uint32 {\n\treturn unsafe.Slice(*(**uint32)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Float64ToUint64(data []float64) []uint64 {\n\treturn unsafe.Slice(*(**uint64)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Uint32ToBytes(data []uint32) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 4*len(data))\n}\n\nfunc Uint64ToBytes(data []uint64) []byte 
{\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 8*len(data))\n}\n\nfunc Uint128ToBytes(data [][16]byte) []byte {\n\treturn unsafe.Slice(*(**byte)(unsafe.Pointer(&data)), 16*len(data))\n}\n\nfunc Uint32ToInt32(data []uint32) []int32 {\n\treturn unsafe.Slice(*(**int32)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc Uint32ToInt64(data []uint32) []int64 {\n\treturn unsafe.Slice(*(**int64)(unsafe.Pointer(&data)), len(data)/2)\n}\n\nfunc Uint64ToInt64(data []uint64) []int64 {\n\treturn unsafe.Slice(*(**int64)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc BytesToBool(data []byte) []bool {\n\treturn unsafe.Slice(*(**bool)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc BytesToInt8(data []byte) []int8 {\n\treturn unsafe.Slice(*(**int8)(unsafe.Pointer(&data)), len(data))\n}\n\nfunc BytesToInt16(data []byte) []int16 {\n\treturn unsafe.Slice(*(**int16)(unsafe.Pointer(&data)), len(data)/2)\n}\n\nfunc BytesToInt32(data []byte) []int32 {\n\treturn unsafe.Slice(*(**int32)(unsafe.Pointer(&data)), len(data)/4)\n}\n\nfunc BytesToInt64(data []byte) []int64 {\n\treturn unsafe.Slice(*(**int64)(unsafe.Pointer(&data)), len(data)/8)\n}\n\nfunc BytesToUint32(data []byte) []uint32 {\n\treturn unsafe.Slice(*(**uint32)(unsafe.Pointer(&data)), len(data)/4)\n}\n\nfunc BytesToUint64(data []byte) []uint64 {\n\treturn unsafe.Slice(*(**uint64)(unsafe.Pointer(&data)), len(data)/8)\n}\n\nfunc BytesToUint128(data []byte) [][16]byte {\n\treturn unsafe.Slice(*(**[16]byte)(unsafe.Pointer(&data)), len(data)/16)\n}\n\nfunc BytesToFloat32(data []byte) []float32 {\n\treturn unsafe.Slice(*(**float32)(unsafe.Pointer(&data)), len(data)/4)\n}\n\nfunc BytesToFloat64(data []byte) []float64 {\n\treturn unsafe.Slice(*(**float64)(unsafe.Pointer(&data)), len(data)/8)\n}\n\nfunc BytesToString(data []byte) string {\n\treturn *(*string)(unsafe.Pointer(&data))\n}\n\nfunc Bytes(data *byte, size int) []byte {\n\treturn unsafe.Slice(data, size)\n}\n"
  },
  {
    "path": "internal/unsafecast/unsafecast_go18.go",
    "content": "//go:build go1.18\n\n// Package unsafecast exposes functions to bypass the Go type system and perform\n// conversions between types that would otherwise not be possible.\n//\n// The functions of this package are mostly useful as optimizations to avoid\n// memory copies when converting between compatible memory layouts; for example,\n// casting a [][16]byte to a []byte in order to use functions of the standard\n// bytes package on the slices.\n//\n//\tWith great power comes great responsibility.\npackage unsafecast\n\nimport (\n\t\"reflect\"\n\t\"unsafe\"\n)\n\n// AddressOf returns the address to the first element in data, even if the slice\n// has length zero.\nfunc AddressOf[T any](data []T) *T {\n\treturn *(**T)(unsafe.Pointer(&data))\n}\n\n// AddressOfBytes returns the address of the first byte in data.\nfunc AddressOfBytes(data []byte) *byte {\n\treturn *(**byte)(unsafe.Pointer(&data))\n}\n\n// AddressOfString returns the address of the first byte in data.\nfunc AddressOfString(data string) *byte {\n\treturn *(**byte)(unsafe.Pointer(&data))\n}\n\n// PointerOf is like AddressOf but returns an unsafe.Pointer, losing type\n// information about the underlying data.\nfunc PointerOf[T any](data []T) unsafe.Pointer {\n\treturn unsafe.Pointer(AddressOf(data))\n}\n\n// PointerOfString is like AddressOfString but returns an unsafe.Pointer, losing\n// type information about the underlying data.\nfunc PointerOfString(data string) unsafe.Pointer {\n\treturn unsafe.Pointer(AddressOfString(data))\n}\n\n// PointerOfValue returns the address of the object packed in the given value.\n//\n// This function is like value.UnsafePointer but works for any underlying type,\n// bypassing the safety checks done by the reflect package.\nfunc PointerOfValue(value reflect.Value) unsafe.Pointer {\n\treturn (*[2]unsafe.Pointer)(unsafe.Pointer(&value))[1]\n}\n\n// The slice type represents the memory layout of slices in Go. 
It is similar to\n// reflect.SliceHeader but uses a unsafe.Pointer instead of uintptr to for the\n// backing array to allow the garbage collector to track track the reference.\ntype slice struct {\n\tptr unsafe.Pointer\n\tlen int\n\tcap int\n}\n\n// Slice converts the data slice of type []From to a slice of type []To sharing\n// the same backing array. The length and capacity of the returned slice are\n// scaled according to the size difference between the source and destination\n// types.\n//\n// Note that the function does not perform any checks to ensure that the memory\n// layouts of the types are compatible, it is possible to cause memory\n// corruption if the layouts mismatch (e.g. the pointers in the From are different\n// than the pointers in To).\nfunc Slice[To, From any](data []From) []To {\n\t// This function could use unsafe.Slice but it would drop the capacity\n\t// information, so instead we implement the type conversion.\n\tvar zf From\n\tvar zt To\n\tvar s = (*slice)(unsafe.Pointer(&data))\n\ts.len = int((uintptr(s.len) * unsafe.Sizeof(zf)) / unsafe.Sizeof(zt))\n\ts.cap = int((uintptr(s.cap) * unsafe.Sizeof(zf)) / unsafe.Sizeof(zt))\n\treturn *(*[]To)(unsafe.Pointer(s))\n}\n\n// Bytes constructs a byte slice. The pointer to the first element of the slice\n// is set to data, the length and capacity are set to size.\nfunc Bytes(data *byte, size int) []byte {\n\treturn *(*[]byte)(unsafe.Pointer(&slice{\n\t\tptr: unsafe.Pointer(data),\n\t\tlen: size,\n\t\tcap: size,\n\t}))\n}\n\n// BytesToString converts a byte slice to a string value. 
The returned string\n// shares the backing array of the byte slice.\n//\n// Programs using this function are responsible for ensuring that the data slice\n// is not modified while the returned string is in use, otherwise the guarantee\n// of immutability of Go string values will be violated, resulting in undefined\n// behavior.\nfunc BytesToString(data []byte) string {\n\treturn *(*string)(unsafe.Pointer(&data))\n}\n\n// StringToBytes applies the inverse conversion of BytesToString.\nfunc StringToBytes(data string) []byte {\n\treturn *(*[]byte)(unsafe.Pointer(&slice{\n\t\tptr: PointerOfString(data),\n\t\tlen: len(data),\n\t\tcap: len(data),\n\t}))\n}\n\n// -----------------------------------------------------------------------------\n// TODO: the functions below are used for backward compatibility with Go 1.17\n// where generics weren't available. We should remove them and inline calls to\n// unsafecast.Slice when we change our minimum supported Go version to 1.18.\n// -----------------------------------------------------------------------------\n\nfunc BoolToBytes(data []bool) []byte { return Slice[byte](data) }\n\nfunc Int8ToBytes(data []int8) []byte { return Slice[byte](data) }\n\nfunc Int16ToBytes(data []int16) []byte { return Slice[byte](data) }\n\nfunc Int32ToBytes(data []int32) []byte { return Slice[byte](data) }\n\nfunc Int64ToBytes(data []int64) []byte { return Slice[byte](data) }\n\nfunc Float32ToBytes(data []float32) []byte { return Slice[byte](data) }\n\nfunc Float64ToBytes(data []float64) []byte { return Slice[byte](data) }\n\nfunc Uint32ToBytes(data []uint32) []byte { return Slice[byte](data) }\n\nfunc Uint64ToBytes(data []uint64) []byte { return Slice[byte](data) }\n\nfunc Uint128ToBytes(data [][16]byte) []byte { return Slice[byte](data) }\n\nfunc Int16ToUint16(data []int16) []uint16 { return Slice[uint16](data) }\n\nfunc Int32ToUint32(data []int32) []uint32 { return Slice[uint32](data) }\n\nfunc Int64ToUint64(data []int64) []uint64 { return 
Slice[uint64](data) }\n\nfunc Float32ToUint32(data []float32) []uint32 { return Slice[uint32](data) }\n\nfunc Float64ToUint64(data []float64) []uint64 { return Slice[uint64](data) }\n\nfunc Uint32ToInt32(data []uint32) []int32 { return Slice[int32](data) }\n\nfunc Uint32ToInt64(data []uint32) []int64 { return Slice[int64](data) }\n\nfunc Uint64ToInt64(data []uint64) []int64 { return Slice[int64](data) }\n\nfunc BytesToBool(data []byte) []bool { return Slice[bool](data) }\n\nfunc BytesToInt8(data []byte) []int8 { return Slice[int8](data) }\n\nfunc BytesToInt16(data []byte) []int16 { return Slice[int16](data) }\n\nfunc BytesToInt32(data []byte) []int32 { return Slice[int32](data) }\n\nfunc BytesToInt64(data []byte) []int64 { return Slice[int64](data) }\n\nfunc BytesToUint32(data []byte) []uint32 { return Slice[uint32](data) }\n\nfunc BytesToUint64(data []byte) []uint64 { return Slice[uint64](data) }\n\nfunc BytesToUint128(data []byte) [][16]byte { return Slice[[16]byte](data) }\n\nfunc BytesToFloat32(data []byte) []float32 { return Slice[float32](data) }\n\nfunc BytesToFloat64(data []byte) []float64 { return Slice[float64](data) }\n"
  },
  {
    "path": "internal/unsafecast/unsafecast_go18_test.go",
    "content": "//go:build go1.18\n\npackage unsafecast_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc TestUnsafeCastSlice(t *testing.T) {\n\ta := make([]uint32, 4, 13)\n\ta[0] = 1\n\ta[1] = 0\n\ta[2] = 2\n\ta[3] = 0\n\n\tb := unsafecast.Slice[int64](a)\n\tif len(b) != 2 { // (4 * sizeof(uint32)) / sizeof(int64)\n\t\tt.Fatalf(\"length mismatch: want=2 got=%d\", len(b))\n\t}\n\tif cap(b) != 6 { // (13 * sizeof(uint32)) / sizeof(int64)\n\t\tt.Fatalf(\"capacity mismatch: want=6 got=%d\", cap(b))\n\t}\n\tif b[0] != 1 {\n\t\tt.Errorf(\"wrong value at index 0: want=1 got=%d\", b[0])\n\t}\n\tif b[1] != 2 {\n\t\tt.Errorf(\"wrong value at index 1: want=2 got=%d\", b[1])\n\t}\n\n\tc := unsafecast.Slice[uint32](b)\n\tif len(c) != 4 {\n\t\tt.Fatalf(\"length mismatch: want=4 got=%d\", len(c))\n\t}\n\tif cap(c) != 12 {\n\t\tt.Fatalf(\"capacity mismatch: want=12 got=%d\", cap(c))\n\t}\n\tfor i := range c {\n\t\tif c[i] != a[i] {\n\t\t\tt.Errorf(\"wrong value at index %d: want=%d got=%d\", i, a[i], c[i])\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "level.go",
    "content": "package parquet\n\nimport \"github.com/segmentio/parquet-go/internal/bytealg\"\n\nfunc countLevelsEqual(levels []byte, value byte) int {\n\treturn bytealg.Count(levels, value)\n}\n\nfunc countLevelsNotEqual(levels []byte, value byte) int {\n\treturn len(levels) - countLevelsEqual(levels, value)\n}\n\nfunc appendLevel(levels []byte, value byte, count int) []byte {\n\ti := len(levels)\n\tn := len(levels) + count\n\n\tif cap(levels) < n {\n\t\tnewLevels := make([]byte, n, 2*n)\n\t\tcopy(newLevels, levels)\n\t\tlevels = newLevels\n\t} else {\n\t\tlevels = levels[:n]\n\t}\n\n\tbytealg.Broadcast(levels[i:], value)\n\treturn levels\n}\n"
  },
  {
    "path": "limits.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\t\"math\"\n)\n\nconst (\n\t// MaxColumnDepth is the maximum column depth supported by this package.\n\tMaxColumnDepth = math.MaxInt8\n\n\t// MaxColumnIndex is the maximum column index supported by this package.\n\tMaxColumnIndex = math.MaxInt16\n\n\t// MaxRepetitionLevel is the maximum repetition level supported by this\n\t// package.\n\tMaxRepetitionLevel = math.MaxUint8\n\n\t// MaxDefinitionLevel is the maximum definition level supported by this\n\t// package.\n\tMaxDefinitionLevel = math.MaxUint8\n\n\t// MaxRowGroups is the maximum number of row groups which can be contained\n\t// in a single parquet file.\n\t//\n\t// This limit is enforced by the use of 16 bits signed integers in the file\n\t// metadata footer of parquet files. It is part of the parquet specification\n\t// and therefore cannot be changed.\n\tMaxRowGroups = math.MaxInt16\n)\n\nconst (\n\testimatedSizeOfByteArrayValues = 20\n)\n\nfunc makeRepetitionLevel(i int) byte {\n\tcheckIndexRange(\"repetition level\", i, 0, MaxRepetitionLevel)\n\treturn byte(i)\n}\n\nfunc makeDefinitionLevel(i int) byte {\n\tcheckIndexRange(\"definition level\", i, 0, MaxDefinitionLevel)\n\treturn byte(i)\n}\n\nfunc makeColumnIndex(i int) int16 {\n\tcheckIndexRange(\"column index\", i, 0, MaxColumnIndex)\n\treturn int16(i)\n}\n\nfunc makeNumValues(i int) int32 {\n\tcheckIndexRange(\"number of values\", i, 0, math.MaxInt32)\n\treturn int32(i)\n}\n\nfunc checkIndexRange(typ string, i, min, max int) {\n\tif i < min || i > max {\n\t\tpanic(errIndexOutOfRange(typ, i, min, max))\n\t}\n}\n\nfunc errIndexOutOfRange(typ string, i, min, max int) error {\n\treturn fmt.Errorf(\"%s out of range: %d not in [%d:%d]\", typ, i, min, max)\n}\n"
  },
  {
    "path": "merge.go",
    "content": "package parquet\n\nimport (\n\t\"container/heap\"\n\t\"fmt\"\n\t\"io\"\n)\n\n// MergeRowGroups constructs a row group which is a merged view of rowGroups. If\n// rowGroups are sorted and the passed options include sorting, the merged row\n// group will also be sorted.\n//\n// The function validates the input to ensure that the merge operation is\n// possible, ensuring that the schemas match or can be converted to an\n// optionally configured target schema passed as argument in the option list.\n//\n// The sorting columns of each row group are also consulted to determine whether\n// the output can be represented. If sorting columns are configured on the merge\n// they must be a prefix of sorting columns of all row groups being merged.\nfunc MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, error) {\n\tconfig, err := NewRowGroupConfig(options...)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\n\tschema := config.Schema\n\tif len(rowGroups) == 0 {\n\t\treturn newEmptyRowGroup(schema), nil\n\t}\n\tif schema == nil {\n\t\tschema = rowGroups[0].Schema()\n\n\t\tfor _, rowGroup := range rowGroups[1:] {\n\t\t\tif !nodesAreEqual(schema, rowGroup.Schema()) {\n\t\t\t\treturn nil, ErrRowGroupSchemaMismatch\n\t\t\t}\n\t\t}\n\t}\n\n\tmergedRowGroups := make([]RowGroup, len(rowGroups))\n\tcopy(mergedRowGroups, rowGroups)\n\n\tfor i, rowGroup := range mergedRowGroups {\n\t\tif rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) {\n\t\t\tconv, err := Convert(schema, rowGroupSchema)\n\t\t\tif err != nil {\n\t\t\t\treturn nil, fmt.Errorf(\"cannot merge row groups: %w\", err)\n\t\t\t}\n\t\t\tmergedRowGroups[i] = ConvertRowGroup(rowGroup, conv)\n\t\t}\n\t}\n\n\tm := &mergedRowGroup{sorting: config.Sorting.SortingColumns}\n\tm.init(schema, mergedRowGroups)\n\n\tif len(m.sorting) == 0 {\n\t\t// When the row group has no ordering, use a simpler version of the\n\t\t// merger which simply concatenates rows from each of the row 
groups.\n\t\t// This is preferable because it makes the output deterministic, the\n\t\t// heap merge may otherwise reorder rows across groups.\n\t\treturn &m.multiRowGroup, nil\n\t}\n\n\tfor _, rowGroup := range m.rowGroups {\n\t\tif !sortingColumnsHavePrefix(rowGroup.SortingColumns(), m.sorting) {\n\t\t\treturn nil, ErrRowGroupSortingColumnsMismatch\n\t\t}\n\t}\n\n\tm.compare = compareRowsFuncOf(schema, m.sorting)\n\treturn m, nil\n}\n\ntype mergedRowGroup struct {\n\tmultiRowGroup\n\tsorting []SortingColumn\n\tcompare func(Row, Row) int\n}\n\nfunc (m *mergedRowGroup) SortingColumns() []SortingColumn {\n\treturn m.sorting\n}\n\nfunc (m *mergedRowGroup) Rows() Rows {\n\t// The row group needs to respect a sorting order; the merged row reader\n\t// uses a heap to merge rows from the row groups.\n\trows := make([]Rows, len(m.rowGroups))\n\tfor i := range rows {\n\t\trows[i] = m.rowGroups[i].Rows()\n\t}\n\treturn &mergedRowGroupRows{\n\t\tmerge: mergedRowReader{\n\t\t\tcompare: m.compare,\n\t\t\treaders: makeBufferedRowReaders(len(rows), func(i int) RowReader { return rows[i] }),\n\t\t},\n\t\trows:   rows,\n\t\tschema: m.schema,\n\t}\n}\n\ntype mergedRowGroupRows struct {\n\tmerge     mergedRowReader\n\trowIndex  int64\n\tseekToRow int64\n\trows      []Rows\n\tschema    *Schema\n}\n\nfunc (r *mergedRowGroupRows) readInternal(rows []Row) (int, error) {\n\tn, err := r.merge.ReadRows(rows)\n\tr.rowIndex += int64(n)\n\treturn n, err\n}\n\nfunc (r *mergedRowGroupRows) Close() (lastErr error) {\n\tr.merge.close()\n\tr.rowIndex = 0\n\tr.seekToRow = 0\n\n\tfor _, rows := range r.rows {\n\t\tif err := rows.Close(); err != nil {\n\t\t\tlastErr = err\n\t\t}\n\t}\n\n\treturn lastErr\n}\n\nfunc (r *mergedRowGroupRows) ReadRows(rows []Row) (int, error) {\n\tfor r.rowIndex < r.seekToRow {\n\t\tn := int(r.seekToRow - r.rowIndex)\n\t\tif n > len(rows) {\n\t\t\tn = len(rows)\n\t\t}\n\t\tn, err := r.readInternal(rows[:n])\n\t\tif err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t}\n\n\treturn 
r.readInternal(rows)\n}\n\nfunc (r *mergedRowGroupRows) SeekToRow(rowIndex int64) error {\n\tif rowIndex >= r.rowIndex {\n\t\tr.seekToRow = rowIndex\n\t\treturn nil\n\t}\n\treturn fmt.Errorf(\"SeekToRow: merged row reader cannot seek backward from row %d to %d\", r.rowIndex, rowIndex)\n}\n\nfunc (r *mergedRowGroupRows) Schema() *Schema {\n\treturn r.schema\n}\n\n// MergeRowReader constructs a RowReader which creates an ordered sequence of\n// all the readers using the given compare function as the ordering predicate.\nfunc MergeRowReaders(readers []RowReader, compare func(Row, Row) int) RowReader {\n\treturn &mergedRowReader{\n\t\tcompare: compare,\n\t\treaders: makeBufferedRowReaders(len(readers), func(i int) RowReader { return readers[i] }),\n\t}\n}\n\nfunc makeBufferedRowReaders(numReaders int, readerAt func(int) RowReader) []*bufferedRowReader {\n\tbuffers := make([]bufferedRowReader, numReaders)\n\treaders := make([]*bufferedRowReader, numReaders)\n\n\tfor i := range readers {\n\t\tbuffers[i].rows = readerAt(i)\n\t\treaders[i] = &buffers[i]\n\t}\n\n\treturn readers\n}\n\ntype mergedRowReader struct {\n\tcompare     func(Row, Row) int\n\treaders     []*bufferedRowReader\n\tinitialized bool\n}\n\nfunc (m *mergedRowReader) initialize() error {\n\tfor i, r := range m.readers {\n\t\tswitch err := r.read(); err {\n\t\tcase nil:\n\t\tcase io.EOF:\n\t\t\tm.readers[i] = nil\n\t\tdefault:\n\t\t\tm.readers = nil\n\t\t\treturn err\n\t\t}\n\t}\n\n\tn := 0\n\tfor _, r := range m.readers {\n\t\tif r != nil {\n\t\t\tm.readers[n] = r\n\t\t\tn++\n\t\t}\n\t}\n\n\tclear := m.readers[n:]\n\tfor i := range clear {\n\t\tclear[i] = nil\n\t}\n\n\tm.readers = m.readers[:n]\n\theap.Init(m)\n\treturn nil\n}\n\nfunc (m *mergedRowReader) close() {\n\tfor _, r := range m.readers {\n\t\tr.close()\n\t}\n\tm.readers = nil\n}\n\nfunc (m *mergedRowReader) ReadRows(rows []Row) (n int, err error) {\n\tif !m.initialized {\n\t\tm.initialized = true\n\n\t\tif err := m.initialize(); err != nil 
{\n\t\t\treturn 0, err\n\t\t}\n\t}\n\n\tfor n < len(rows) && len(m.readers) != 0 {\n\t\tr := m.readers[0]\n\n\t\trows[n] = append(rows[n][:0], r.head()...)\n\t\tn++\n\n\t\tif err := r.next(); err != nil {\n\t\t\tif err != io.EOF {\n\t\t\t\treturn n, err\n\t\t\t}\n\t\t\theap.Pop(m)\n\t\t} else {\n\t\t\theap.Fix(m, 0)\n\t\t}\n\t}\n\n\tif len(m.readers) == 0 {\n\t\terr = io.EOF\n\t}\n\n\treturn n, err\n}\n\nfunc (m *mergedRowReader) Less(i, j int) bool {\n\treturn m.compare(m.readers[i].head(), m.readers[j].head()) < 0\n}\n\nfunc (m *mergedRowReader) Len() int {\n\treturn len(m.readers)\n}\n\nfunc (m *mergedRowReader) Swap(i, j int) {\n\tm.readers[i], m.readers[j] = m.readers[j], m.readers[i]\n}\n\nfunc (m *mergedRowReader) Push(x interface{}) {\n\tpanic(\"NOT IMPLEMENTED\")\n}\n\nfunc (m *mergedRowReader) Pop() interface{} {\n\ti := len(m.readers) - 1\n\tr := m.readers[i]\n\tm.readers = m.readers[:i]\n\treturn r\n}\n\ntype bufferedRowReader struct {\n\trows RowReader\n\toff  int32\n\tend  int32\n\tbuf  [10]Row\n}\n\nfunc (r *bufferedRowReader) head() Row {\n\treturn r.buf[r.off]\n}\n\nfunc (r *bufferedRowReader) next() error {\n\tif r.off++; r.off == r.end {\n\t\tr.off = 0\n\t\tr.end = 0\n\t\treturn r.read()\n\t}\n\treturn nil\n}\n\nfunc (r *bufferedRowReader) read() error {\n\tif r.rows == nil {\n\t\treturn io.EOF\n\t}\n\tn, err := r.rows.ReadRows(r.buf[r.end:])\n\tif err != nil && n == 0 {\n\t\treturn err\n\t}\n\tr.end += int32(n)\n\treturn nil\n}\n\nfunc (r *bufferedRowReader) close() {\n\tr.rows = nil\n\tr.off = 0\n\tr.end = 0\n}\n\nvar (\n\t_ RowReaderWithSchema = (*mergedRowGroupRows)(nil)\n)\n"
  },
  {
    "path": "merge_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"math/rand\"\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nconst (\n\tnumRowGroups = 3\n\trowsPerGroup = benchmarkNumRows\n)\n\ntype wrappedRowGroup struct {\n\tparquet.RowGroup\n\trowsCallback func(parquet.Rows) parquet.Rows\n}\n\nfunc (r wrappedRowGroup) Rows() parquet.Rows {\n\treturn r.rowsCallback(r.RowGroup.Rows())\n}\n\ntype wrappedRows struct {\n\tparquet.Rows\n\tclosed bool\n}\n\nfunc (r *wrappedRows) Close() error {\n\tr.closed = true\n\treturn r.Rows.Close()\n}\n\nfunc TestMergeRowGroups(t *testing.T) {\n\ttests := []struct {\n\t\tscenario string\n\t\toptions  []parquet.RowGroupOption\n\t\tinput    []parquet.RowGroup\n\t\toutput   parquet.RowGroup\n\t}{\n\t\t{\n\t\t\tscenario: \"no row groups\",\n\t\t\toptions: []parquet.RowGroupOption{\n\t\t\t\tparquet.SchemaOf(Person{}),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(\n\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\tparquet.SchemaOf(Person{}),\n\t\t\t\t},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"a single row group\",\n\t\t\tinput: []parquet.RowGroup{\n\t\t\t\tsortedRowGroup(nil,\n\t\t\t\t\tPerson{FirstName: \"some\", LastName: \"one\", Age: 30},\n\t\t\t\t\tPerson{FirstName: \"some\", LastName: \"one else\", Age: 31},\n\t\t\t\t\tPerson{FirstName: \"and\", LastName: \"you\", Age: 32},\n\t\t\t\t),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(nil,\n\t\t\t\tPerson{FirstName: \"some\", LastName: \"one\", Age: 30},\n\t\t\t\tPerson{FirstName: \"some\", LastName: \"one else\", Age: 31},\n\t\t\t\tPerson{FirstName: \"and\", LastName: \"you\", Age: 32},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"two row groups without ordering\",\n\t\t\tinput: []parquet.RowGroup{\n\t\t\t\tsortedRowGroup(nil, Person{FirstName: \"some\", LastName: \"one\", Age: 30}),\n\t\t\t\tsortedRowGroup(nil, Person{FirstName: \"some\", LastName: \"one else\", Age: 31}),\n\t\t\t},\n\t\t\toutput: 
sortedRowGroup(nil,\n\t\t\t\tPerson{FirstName: \"some\", LastName: \"one\", Age: 30},\n\t\t\t\tPerson{FirstName: \"some\", LastName: \"one else\", Age: 31},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"three row groups without ordering\",\n\t\t\tinput: []parquet.RowGroup{\n\t\t\t\tsortedRowGroup(nil, Person{FirstName: \"some\", LastName: \"one\", Age: 30}),\n\t\t\t\tsortedRowGroup(nil, Person{FirstName: \"some\", LastName: \"one else\", Age: 31}),\n\t\t\t\tsortedRowGroup(nil, Person{FirstName: \"question\", LastName: \"answer\", Age: 42}),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(nil,\n\t\t\t\tPerson{FirstName: \"some\", LastName: \"one\", Age: 30},\n\t\t\t\tPerson{FirstName: \"some\", LastName: \"one else\", Age: 31},\n\t\t\t\tPerson{FirstName: \"question\", LastName: \"answer\", Age: 42},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"row groups sorted by ascending last name\",\n\t\t\toptions: []parquet.RowGroupOption{\n\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t},\n\t\t\tinput: []parquet.RowGroup{\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t\t\tPerson{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t\t\t),\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t\t),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(nil,\n\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t\tPerson{FirstName: \"Luke\", LastName: 
\"Skywalker\"},\n\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"row groups sorted by descending last name\",\n\t\t\toptions: []parquet.RowGroupOption{\n\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\tparquet.Descending(\"LastName\"),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t},\n\t\t\tinput: []parquet.RowGroup{\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Descending(\"LastName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t\t\tPerson{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t\t\t),\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Descending(\"LastName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t\t),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(nil,\n\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t\tPerson{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"row groups sorted by ascending last and first name\",\n\t\t\toptions: []parquet.RowGroupOption{\n\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\tparquet.Ascending(\"FirstName\"),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t},\n\t\t\tinput: 
[]parquet.RowGroup{\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t\t\tparquet.Ascending(\"FirstName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t\t),\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t\t\tparquet.Ascending(\"FirstName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t\t\tPerson{FirstName: \"Anakin\", LastName: \"Skywalker\"},\n\t\t\t\t),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(nil,\n\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t\tPerson{FirstName: \"Anakin\", LastName: \"Skywalker\"},\n\t\t\t\tPerson{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t),\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"row groups with conversion to a different schema\",\n\t\t\toptions: []parquet.RowGroupOption{\n\t\t\t\tparquet.SchemaOf(LastNameOnly{}),\n\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t},\n\t\t\tinput: []parquet.RowGroup{\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t\t\tPerson{FirstName: \"Luke\", LastName: 
\"Skywalker\"},\n\t\t\t\t),\n\t\t\t\tsortedRowGroup(\n\t\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t},\n\t\t\t\t\tPerson{FirstName: \"Obiwan\", LastName: \"Kenobi\"},\n\t\t\t\t\tPerson{FirstName: \"Anakin\", LastName: \"Skywalker\"},\n\t\t\t\t),\n\t\t\t},\n\t\t\toutput: sortedRowGroup(\n\t\t\t\t[]parquet.RowGroupOption{\n\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\t\tparquet.Ascending(\"LastName\"),\n\t\t\t\t\t\t),\n\t\t\t\t\t),\n\t\t\t\t},\n\t\t\t\tLastNameOnly{LastName: \"Solo\"},\n\t\t\t\tLastNameOnly{LastName: \"Skywalker\"},\n\t\t\t\tLastNameOnly{LastName: \"Skywalker\"},\n\t\t\t\tLastNameOnly{LastName: \"Kenobi\"},\n\t\t\t),\n\t\t},\n\t}\n\n\tfor _, adapter := range []struct {\n\t\tscenario string\n\t\tfunction func(parquet.RowGroup) parquet.RowGroup\n\t}{\n\t\t{scenario: \"buffer\", function: selfRowGroup},\n\t\t{scenario: \"file\", function: fileRowGroup},\n\t} {\n\t\tt.Run(adapter.scenario, func(t *testing.T) {\n\t\t\tfor _, test := range tests {\n\t\t\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\t\t\tinput := make([]parquet.RowGroup, len(test.input))\n\t\t\t\t\tfor i := range test.input {\n\t\t\t\t\t\tinput[i] = adapter.function(test.input[i])\n\t\t\t\t\t}\n\n\t\t\t\t\tmerged, err := parquet.MergeRowGroups(test.input, test.options...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\tt.Fatal(err)\n\t\t\t\t\t}\n\t\t\t\t\tif merged.NumRows() != test.output.NumRows() {\n\t\t\t\t\t\tt.Fatalf(\"the number of rows mismatch: want=%d got=%d\", merged.NumRows(), test.output.NumRows())\n\t\t\t\t\t}\n\t\t\t\t\tif merged.Schema() != test.output.Schema() {\n\t\t\t\t\t\tt.Fatalf(\"the row group schemas mismatch:\\n%v\\n%v\", test.output.Schema(), merged.Schema())\n\t\t\t\t\t}\n\n\t\t\t\t\toptions := 
[]parquet.RowGroupOption{parquet.SchemaOf(Person{})}\n\t\t\t\t\toptions = append(options, test.options...)\n\t\t\t\t\t// We test two views of the resulting row group: the one originally\n\t\t\t\t\t// returned by MergeRowGroups, and one where the merged row group\n\t\t\t\t\t// has been copied into a new buffer. The intent is to exercise both\n\t\t\t\t\t// the row-by-row read as well as optimized code paths when CopyRows\n\t\t\t\t\t// bypasses the ReadRow/WriteRow calls and the row group is written\n\t\t\t\t\t// directly to the buffer by calling WriteRowsTo/WriteRowGroup.\n\t\t\t\t\tmergedCopy := parquet.NewBuffer(options...)\n\n\t\t\t\t\ttotalRows := test.output.NumRows()\n\t\t\t\t\tnumRows, err := copyRowsAndClose(mergedCopy, merged.Rows())\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\tt.Fatal(err)\n\t\t\t\t\t}\n\t\t\t\t\tif numRows != totalRows {\n\t\t\t\t\t\tt.Fatalf(\"wrong number of rows copied: want=%d got=%d\", totalRows, numRows)\n\t\t\t\t\t}\n\n\t\t\t\t\tfor _, merge := range []struct {\n\t\t\t\t\t\tscenario string\n\t\t\t\t\t\trowGroup parquet.RowGroup\n\t\t\t\t\t}{\n\t\t\t\t\t\t{scenario: \"self\", rowGroup: merged},\n\t\t\t\t\t\t{scenario: \"copy\", rowGroup: mergedCopy},\n\t\t\t\t\t} {\n\t\t\t\t\t\tt.Run(merge.scenario, func(t *testing.T) {\n\t\t\t\t\t\t\tvar expectedRows = test.output.Rows()\n\t\t\t\t\t\t\tvar mergedRows = merge.rowGroup.Rows()\n\t\t\t\t\t\t\tvar row1 = make([]parquet.Row, 1)\n\t\t\t\t\t\t\tvar row2 = make([]parquet.Row, 1)\n\t\t\t\t\t\t\tvar numRows int64\n\n\t\t\t\t\t\t\tdefer expectedRows.Close()\n\t\t\t\t\t\t\tdefer mergedRows.Close()\n\n\t\t\t\t\t\t\tfor {\n\t\t\t\t\t\t\t\t_, err1 := expectedRows.ReadRows(row1)\n\t\t\t\t\t\t\t\tn, err2 := mergedRows.ReadRows(row2)\n\n\t\t\t\t\t\t\t\tif err1 != err2 {\n\t\t\t\t\t\t\t\t\t// ReadRows may or may not return io.EOF\n\t\t\t\t\t\t\t\t\t// when it reads the last row, so we test\n\t\t\t\t\t\t\t\t\t// that the reference RowReader has also\n\t\t\t\t\t\t\t\t\t// reached the 
end.\n\t\t\t\t\t\t\t\t\tif err1 == nil && err2 == io.EOF {\n\t\t\t\t\t\t\t\t\t\t_, err1 = expectedRows.ReadRows(row1[:0])\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t\tif err1 != io.EOF {\n\t\t\t\t\t\t\t\t\t\tt.Fatalf(\"errors mismatched while comparing row %d/%d: want=%v got=%v\", numRows, totalRows, err1, err2)\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\t\tif n != 0 {\n\t\t\t\t\t\t\t\t\tif !row1[0].Equal(row2[0]) {\n\t\t\t\t\t\t\t\t\t\tt.Errorf(\"row at index %d/%d mismatch: want=%+v got=%+v\", numRows, totalRows, row1[0], row2[0])\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t\tnumRows++\n\t\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\t\tif err1 != nil {\n\t\t\t\t\t\t\t\t\tbreak\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tif numRows != totalRows {\n\t\t\t\t\t\t\t\tt.Errorf(\"expected to read %d rows but %d were found\", totalRows, numRows)\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t})\n\t\t\t\t\t}\n\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc TestMergeRowGroupsCursorsAreClosed(t *testing.T) {\n\ttype model struct {\n\t\tA int\n\t}\n\n\tschema := parquet.SchemaOf(model{})\n\toptions := []parquet.RowGroupOption{\n\t\tparquet.SortingRowGroupConfig(\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(schema.Columns()[0]...),\n\t\t\t),\n\t\t),\n\t}\n\n\tprng := rand.New(rand.NewSource(0))\n\trowGroups := make([]parquet.RowGroup, numRowGroups)\n\trows := make([]*wrappedRows, 0, numRowGroups)\n\n\tfor i := range rowGroups {\n\t\trowGroups[i] = wrappedRowGroup{\n\t\t\tRowGroup: sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, model{})...),\n\t\t\trowsCallback: func(r parquet.Rows) parquet.Rows {\n\t\t\t\twrapped := &wrappedRows{Rows: r}\n\t\t\t\trows = append(rows, wrapped)\n\t\t\t\treturn wrapped\n\t\t\t},\n\t\t}\n\t}\n\n\tm, err := parquet.MergeRowGroups(rowGroups, options...)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tfunc() {\n\t\tmergedRows := m.Rows()\n\t\tdefer mergedRows.Close()\n\n\t\t// Add 1 more slot to the buffer to force an io.EOF on the first 
read.\n\t\trbuf := make([]parquet.Row, (numRowGroups*rowsPerGroup)+1)\n\t\tif _, err := mergedRows.ReadRows(rbuf); !errors.Is(err, io.EOF) {\n\t\t\tt.Fatal(err)\n\t\t}\n\t}()\n\n\tfor i, wrapped := range rows {\n\t\tif !wrapped.closed {\n\t\t\tt.Fatalf(\"RowGroup %d not closed\", i)\n\t\t}\n\t}\n}\n\nfunc TestMergeRowGroupsSeekToRow(t *testing.T) {\n\ttype model struct {\n\t\tA int\n\t}\n\n\tschema := parquet.SchemaOf(model{})\n\toptions := []parquet.RowGroupOption{\n\t\tparquet.SortingRowGroupConfig(\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(schema.Columns()[0]...),\n\t\t\t),\n\t\t),\n\t}\n\n\trowGroups := make([]parquet.RowGroup, numRowGroups)\n\n\tcounter := 0\n\tfor i := range rowGroups {\n\t\trows := make([]interface{}, 0, rowsPerGroup)\n\t\tfor j := 0; j < rowsPerGroup; j++ {\n\t\t\trows = append(rows, model{A: counter})\n\t\t\tcounter++\n\t\t}\n\t\trowGroups[i] = sortedRowGroup(options, rows...)\n\t}\n\n\tm, err := parquet.MergeRowGroups(rowGroups, options...)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tfunc() {\n\t\tmergedRows := m.Rows()\n\t\tdefer mergedRows.Close()\n\n\t\trbuf := make([]parquet.Row, 1)\n\t\tcursor := int64(0)\n\t\tfor {\n\t\t\tif err := mergedRows.SeekToRow(cursor); err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tif _, err := mergedRows.ReadRows(rbuf); err != nil {\n\t\t\t\tif errors.Is(err, io.EOF) {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\t\t\tv := model{}\n\t\t\tif err := schema.Reconstruct(&v, rbuf[0]); err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\t\t\tif v.A != int(cursor) {\n\t\t\t\tt.Fatalf(\"expected value %d, got %d\", cursor, v.A)\n\t\t\t}\n\n\t\t\tcursor++\n\t\t}\n\t}()\n}\n\nfunc BenchmarkMergeRowGroups(b *testing.B) {\n\tfor _, test := range readerTests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tschema := parquet.SchemaOf(test.model)\n\n\t\t\toptions := 
[]parquet.RowGroupOption{\n\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\t\tparquet.Ascending(schema.Columns()[0]...),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t}\n\n\t\t\tprng := rand.New(rand.NewSource(0))\n\t\t\trowGroups := make([]parquet.RowGroup, numRowGroups)\n\n\t\t\tfor i := range rowGroups {\n\t\t\t\trowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...)\n\t\t\t}\n\n\t\t\tfor n := 1; n <= numRowGroups; n++ {\n\t\t\t\tb.Run(fmt.Sprintf(\"groups=%d,rows=%d\", n, n*rowsPerGroup), func(b *testing.B) {\n\t\t\t\t\tmergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t}\n\n\t\t\t\t\trows := mergedRowGroup.Rows()\n\t\t\t\t\trbuf := make([]parquet.Row, benchmarkRowsPerStep)\n\t\t\t\t\tdefer func() { rows.Close() }()\n\n\t\t\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\t\t\tn, err := rows.ReadRows(rbuf)\n\t\t\t\t\t\tif err != nil {\n\t\t\t\t\t\t\tif !errors.Is(err, io.EOF) {\n\t\t\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\trows.Close()\n\t\t\t\t\t\t\trows = mergedRowGroup.Rows()\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn n\n\t\t\t\t\t})\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkMergeFiles(b *testing.B) {\n\trowGroupBuffers := make([]bytes.Buffer, numRowGroups)\n\n\tfor _, test := range readerTests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tschema := parquet.SchemaOf(test.model)\n\n\t\t\tsortingOptions := []parquet.SortingOption{\n\t\t\t\tparquet.SortingColumns(\n\t\t\t\t\tparquet.Ascending(schema.Columns()[0]...),\n\t\t\t\t),\n\t\t\t}\n\n\t\t\toptions := []parquet.RowGroupOption{\n\t\t\t\tschema,\n\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\tsortingOptions...,\n\t\t\t\t),\n\t\t\t}\n\n\t\t\tbuffer := parquet.NewBuffer(options...)\n\n\t\t\tprng := rand.New(rand.NewSource(0))\n\t\t\tfiles := make([]*parquet.File, numRowGroups)\n\t\t\trowGroups := make([]parquet.RowGroup, 
numRowGroups)\n\n\t\t\tfor i := range files {\n\t\t\t\tfor _, row := range randomRowsOf(prng, rowsPerGroup, test.model) {\n\t\t\t\t\tbuffer.Write(row)\n\t\t\t\t}\n\t\t\t\tsort.Sort(buffer)\n\t\t\t\trowGroupBuffers[i].Reset()\n\t\t\t\twriter := parquet.NewWriter(&rowGroupBuffers[i],\n\t\t\t\t\tschema,\n\t\t\t\t\tparquet.SortingWriterConfig(\n\t\t\t\t\t\tsortingOptions...,\n\t\t\t\t\t),\n\t\t\t\t)\n\t\t\t\t_, err := copyRowsAndClose(writer, buffer.Rows())\n\t\t\t\tif err != nil {\n\t\t\t\t\tb.Fatal(err)\n\t\t\t\t}\n\t\t\t\tif err := writer.Close(); err != nil {\n\t\t\t\t\tb.Fatal(err)\n\t\t\t\t}\n\t\t\t\tr := bytes.NewReader(rowGroupBuffers[i].Bytes())\n\t\t\t\tf, err := parquet.OpenFile(r, r.Size())\n\t\t\t\tif err != nil {\n\t\t\t\t\tb.Fatal(err)\n\t\t\t\t}\n\t\t\t\tfiles[i], rowGroups[i] = f, f.RowGroups()[0]\n\t\t\t}\n\n\t\t\tfor n := 1; n <= numRowGroups; n++ {\n\t\t\t\tb.Run(fmt.Sprintf(\"groups=%d,rows=%d\", n, n*rowsPerGroup), func(b *testing.B) {\n\t\t\t\t\tmergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t}\n\n\t\t\t\t\trows := mergedRowGroup.Rows()\n\t\t\t\t\trbuf := make([]parquet.Row, benchmarkRowsPerStep)\n\t\t\t\t\tdefer func() { rows.Close() }()\n\n\t\t\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\t\t\tn, err := rows.ReadRows(rbuf)\n\t\t\t\t\t\tif err != nil {\n\t\t\t\t\t\t\tif !errors.Is(err, io.EOF) {\n\t\t\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\trows.Close()\n\t\t\t\t\t\t\trows = mergedRowGroup.Rows()\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn n\n\t\t\t\t\t})\n\n\t\t\t\t\ttotalSize := int64(0)\n\t\t\t\t\tfor _, f := range files[:n] {\n\t\t\t\t\t\ttotalSize += f.Size()\n\t\t\t\t\t}\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "multi_row_group.go",
    "content": "package parquet\n\nimport (\n\t\"io\"\n)\n\n// MultiRowGroup wraps multiple row groups to appear as if it was a single\n// RowGroup. RowGroups must have the same schema or it will error.\nfunc MultiRowGroup(rowGroups ...RowGroup) RowGroup {\n\treturn newMultiRowGroup(ReadModeSync, rowGroups...)\n}\n\nfunc newMultiRowGroup(pageReadMode ReadMode, rowGroups ...RowGroup) RowGroup {\n\tif len(rowGroups) == 0 {\n\t\treturn &emptyRowGroup{}\n\t}\n\tif len(rowGroups) == 1 {\n\t\treturn rowGroups[0]\n\t}\n\n\tschema, err := compatibleSchemaOf(rowGroups)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\trowGroupsCopy := make([]RowGroup, len(rowGroups))\n\tcopy(rowGroupsCopy, rowGroups)\n\n\tc := &multiRowGroup{\n\t\tpageReadMode: pageReadMode,\n\t}\n\tc.init(schema, rowGroupsCopy)\n\treturn c\n}\n\nfunc (c *multiRowGroup) init(schema *Schema, rowGroups []RowGroup) error {\n\tcolumns := make([]multiColumnChunk, len(schema.Columns()))\n\n\trowGroupColumnChunks := make([][]ColumnChunk, len(rowGroups))\n\tfor i, rowGroup := range rowGroups {\n\t\trowGroupColumnChunks[i] = rowGroup.ColumnChunks()\n\t}\n\n\tfor i := range columns {\n\t\tcolumns[i].rowGroup = c\n\t\tcolumns[i].column = i\n\t\tcolumns[i].chunks = make([]ColumnChunk, len(rowGroupColumnChunks))\n\n\t\tfor j, columnChunks := range rowGroupColumnChunks {\n\t\t\tcolumns[i].chunks[j] = columnChunks[i]\n\t\t}\n\t}\n\n\tc.schema = schema\n\tc.rowGroups = rowGroups\n\tc.columns = make([]ColumnChunk, len(columns))\n\n\tfor i := range columns {\n\t\tc.columns[i] = &columns[i]\n\t}\n\n\treturn nil\n}\n\nfunc compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) {\n\tschema := rowGroups[0].Schema()\n\n\t// Fast path: Many times all row groups have the exact same schema so a\n\t// pointer comparison is cheaper.\n\tsamePointer := true\n\tfor _, rowGroup := range rowGroups[1:] {\n\t\tif rowGroup.Schema() != schema {\n\t\t\tsamePointer = false\n\t\t\tbreak\n\t\t}\n\t}\n\tif samePointer {\n\t\treturn schema, 
nil\n\t}\n\n\t// Slow path: The schema pointers are not the same, but they still have to\n\t// be compatible.\n\tfor _, rowGroup := range rowGroups[1:] {\n\t\tif !nodesAreEqual(schema, rowGroup.Schema()) {\n\t\t\treturn nil, ErrRowGroupSchemaMismatch\n\t\t}\n\t}\n\n\treturn schema, nil\n}\n\ntype multiRowGroup struct {\n\tschema       *Schema\n\trowGroups    []RowGroup\n\tcolumns      []ColumnChunk\n\tpageReadMode ReadMode\n}\n\nfunc (c *multiRowGroup) NumRows() (numRows int64) {\n\tfor _, rowGroup := range c.rowGroups {\n\t\tnumRows += rowGroup.NumRows()\n\t}\n\treturn numRows\n}\n\nfunc (c *multiRowGroup) ColumnChunks() []ColumnChunk { return c.columns }\n\nfunc (c *multiRowGroup) SortingColumns() []SortingColumn { return nil }\n\nfunc (c *multiRowGroup) Schema() *Schema { return c.schema }\n\nfunc (c *multiRowGroup) Rows() Rows { return newRowGroupRows(c, c.pageReadMode) }\n\ntype multiColumnChunk struct {\n\trowGroup *multiRowGroup\n\tcolumn   int\n\tchunks   []ColumnChunk\n}\n\nfunc (c *multiColumnChunk) Type() Type {\n\tif len(c.chunks) != 0 {\n\t\treturn c.chunks[0].Type() // all chunks should be of the same type\n\t}\n\treturn nil\n}\n\nfunc (c *multiColumnChunk) NumValues() int64 {\n\tn := int64(0)\n\tfor i := range c.chunks {\n\t\tn += c.chunks[i].NumValues()\n\t}\n\treturn n\n}\n\nfunc (c *multiColumnChunk) Column() int {\n\treturn c.column\n}\n\nfunc (c *multiColumnChunk) Pages() Pages {\n\treturn &multiPages{column: c}\n}\n\nfunc (c *multiColumnChunk) ColumnIndex() ColumnIndex {\n\t// TODO: implement\n\treturn nil\n}\n\nfunc (c *multiColumnChunk) OffsetIndex() OffsetIndex {\n\t// TODO: implement\n\treturn nil\n}\n\nfunc (c *multiColumnChunk) BloomFilter() BloomFilter {\n\treturn multiBloomFilter{c}\n}\n\ntype multiBloomFilter struct{ *multiColumnChunk }\n\nfunc (f multiBloomFilter) ReadAt(b []byte, off int64) (int, error) {\n\t// TODO: add a test for this function\n\ti := 0\n\n\tfor i < len(f.chunks) {\n\t\tif r := f.chunks[i].BloomFilter(); r != nil 
{\n\t\t\tsize := r.Size()\n\t\t\tif off < size {\n\t\t\t\tbreak\n\t\t\t}\n\t\t\toff -= size\n\t\t}\n\t\ti++\n\t}\n\n\tif i == len(f.chunks) {\n\t\treturn 0, io.EOF\n\t}\n\n\trn := int(0)\n\tfor len(b) > 0 {\n\t\tif r := f.chunks[i].BloomFilter(); r != nil {\n\t\t\tn, err := r.ReadAt(b, off)\n\t\t\trn += n\n\t\t\tif err != nil {\n\t\t\t\treturn rn, err\n\t\t\t}\n\t\t\tif b = b[n:]; len(b) == 0 {\n\t\t\t\treturn rn, nil\n\t\t\t}\n\t\t\toff += int64(n)\n\t\t}\n\t\ti++\n\t}\n\n\tif i == len(f.chunks) {\n\t\treturn rn, io.EOF\n\t}\n\treturn rn, nil\n}\n\nfunc (f multiBloomFilter) Size() int64 {\n\tsize := int64(0)\n\tfor _, c := range f.chunks {\n\t\tif b := c.BloomFilter(); b != nil {\n\t\t\tsize += b.Size()\n\t\t}\n\t}\n\treturn size\n}\n\nfunc (f multiBloomFilter) Check(v Value) (bool, error) {\n\tfor _, c := range f.chunks {\n\t\tif b := c.BloomFilter(); b != nil {\n\t\t\tif ok, err := b.Check(v); ok || err != nil {\n\t\t\t\treturn ok, err\n\t\t\t}\n\t\t}\n\t}\n\treturn false, nil\n}\n\ntype multiPages struct {\n\tpages  Pages\n\tindex  int\n\tcolumn *multiColumnChunk\n}\n\nfunc (m *multiPages) ReadPage() (Page, error) {\n\tfor {\n\t\tif m.pages != nil {\n\t\t\tp, err := m.pages.ReadPage()\n\t\t\tif err == nil || err != io.EOF {\n\t\t\t\treturn p, err\n\t\t\t}\n\t\t\tif err := m.pages.Close(); err != nil {\n\t\t\t\treturn nil, err\n\t\t\t}\n\t\t\tm.pages = nil\n\t\t}\n\n\t\tif m.column == nil || m.index == len(m.column.chunks) {\n\t\t\treturn nil, io.EOF\n\t\t}\n\n\t\tm.pages = m.column.chunks[m.index].Pages()\n\t\tm.index++\n\t}\n}\n\nfunc (m *multiPages) SeekToRow(rowIndex int64) error {\n\tif m.column == nil {\n\t\treturn io.ErrClosedPipe\n\t}\n\n\tif m.pages != nil {\n\t\tif err := m.pages.Close(); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\trowGroups := m.column.rowGroup.rowGroups\n\tnumRows := int64(0)\n\tm.pages = nil\n\tm.index = 0\n\n\tfor m.index < len(rowGroups) {\n\t\tnumRows = rowGroups[m.index].NumRows()\n\t\tif rowIndex < numRows 
{\n\t\t\tbreak\n\t\t}\n\t\trowIndex -= numRows\n\t\tm.index++\n\t}\n\n\tif m.index < len(rowGroups) {\n\t\tm.pages = m.column.chunks[m.index].Pages()\n\t\tm.index++\n\t\treturn m.pages.SeekToRow(rowIndex)\n\t}\n\treturn nil\n}\n\nfunc (m *multiPages) Close() (err error) {\n\tif m.pages != nil {\n\t\terr = m.pages.Close()\n\t}\n\tm.pages = nil\n\tm.index = 0\n\tm.column = nil\n\treturn err\n}\n"
  },
  {
    "path": "node.go",
    "content": "package parquet\n\nimport (\n\t\"reflect\"\n\t\"sort\"\n\t\"unicode\"\n\t\"unicode/utf8\"\n\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\n// Node values represent nodes of a parquet schema.\n//\n// Nodes carry the type of values, as well as properties like whether the values\n// are optional or repeat. Nodes with one or more children represent parquet\n// groups and therefore do not have a logical type.\n//\n// Nodes are immutable values and therefore safe to use concurrently from\n// multiple goroutines.\ntype Node interface {\n\t// Returns a human-readable representation of the parquet node.\n\tString() string\n\n\t// For leaf nodes, returns the type of values of the parquet column.\n\t//\n\t// Calling this method on non-leaf nodes will panic.\n\tType() Type\n\n\t// Returns whether the parquet column is optional.\n\tOptional() bool\n\n\t// Returns whether the parquet column is repeated.\n\tRepeated() bool\n\n\t// Returns whether the parquet column is required.\n\tRequired() bool\n\n\t// Returns true if this a leaf node.\n\tLeaf() bool\n\n\t// Returns a mapping of the node's fields.\n\t//\n\t// As an optimization, the same slices may be returned by multiple calls to\n\t// this method, programs must treat the returned values as immutable.\n\t//\n\t// This method returns an empty mapping when called on leaf nodes.\n\tFields() []Field\n\n\t// Returns the encoding used by the node.\n\t//\n\t// The method may return nil to indicate that no specific encoding was\n\t// configured on the node, in which case a default encoding might be used.\n\tEncoding() encoding.Encoding\n\n\t// Returns compression codec used by the node.\n\t//\n\t// The method may return nil to indicate that no specific compression codec\n\t// was configured on the node, in which case a default compression might be\n\t// 
used.\n\tCompression() compress.Codec\n\n\t// Returns the Go type that best represents the parquet node.\n\t//\n\t// For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96,\n\t// float32, float64, string, []byte, or [N]byte.\n\t//\n\t// For groups, the method returns a struct type.\n\t//\n\t// If the method is called on a repeated node, the method returns a slice of\n\t// the underlying type.\n\t//\n\t// For optional nodes, the method returns a pointer of the underlying type.\n\t//\n\t// For nodes that were constructed from Go values (e.g. using SchemaOf), the\n\t// method returns the original Go type.\n\tGoType() reflect.Type\n}\n\n// Field instances represent fields of a parquet node, which associate a node to\n// their name in their parent node.\ntype Field interface {\n\tNode\n\n\t// Returns the name of this field in its parent node.\n\tName() string\n\n\t// Given a reference to the Go value matching the structure of the parent\n\t// node, returns the Go value of the field.\n\tValue(base reflect.Value) reflect.Value\n}\n\n// Encoded wraps the node passed as argument to use the given encoding.\n//\n// The function panics if it is called on a non-leaf node, or if the\n// encoding does not support the node type.\nfunc Encoded(node Node, encoding encoding.Encoding) Node {\n\tif !node.Leaf() {\n\t\tpanic(\"cannot add encoding to a non-leaf node\")\n\t}\n\tif encoding != nil {\n\t\tkind := node.Type().Kind()\n\t\tif !canEncode(encoding, kind) {\n\t\t\tpanic(\"cannot apply \" + encoding.Encoding().String() + \" to node of type \" + kind.String())\n\t\t}\n\t}\n\treturn &encodedNode{\n\t\tNode:     node,\n\t\tencoding: encoding,\n\t}\n}\n\ntype encodedNode struct {\n\tNode\n\tencoding encoding.Encoding\n}\n\nfunc (n *encodedNode) Encoding() encoding.Encoding {\n\treturn n.encoding\n}\n\n// Compressed wraps the node passed as argument to use the given compression\n// codec.\n//\n// If the codec is nil, the node's compression is left unchanged.\n//\n// 
The function panics if it is called on a non-leaf node.\nfunc Compressed(node Node, codec compress.Codec) Node {\n\tif !node.Leaf() {\n\t\tpanic(\"cannot add compression codec to a non-leaf node\")\n\t}\n\treturn &compressedNode{\n\t\tNode:  node,\n\t\tcodec: codec,\n\t}\n}\n\ntype compressedNode struct {\n\tNode\n\tcodec compress.Codec\n}\n\nfunc (n *compressedNode) Compression() compress.Codec {\n\treturn n.codec\n}\n\n// Optional wraps the given node to make it optional.\nfunc Optional(node Node) Node { return &optionalNode{node} }\n\ntype optionalNode struct{ Node }\n\nfunc (opt *optionalNode) Optional() bool       { return true }\nfunc (opt *optionalNode) Repeated() bool       { return false }\nfunc (opt *optionalNode) Required() bool       { return false }\nfunc (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) }\n\n// Repeated wraps the given node to make it repeated.\nfunc Repeated(node Node) Node { return &repeatedNode{node} }\n\ntype repeatedNode struct{ Node }\n\nfunc (rep *repeatedNode) Optional() bool       { return false }\nfunc (rep *repeatedNode) Repeated() bool       { return true }\nfunc (rep *repeatedNode) Required() bool       { return false }\nfunc (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) }\n\n// Required wraps the given node to make it required.\nfunc Required(node Node) Node { return &requiredNode{node} }\n\ntype requiredNode struct{ Node }\n\nfunc (req *requiredNode) Optional() bool       { return false }\nfunc (req *requiredNode) Repeated() bool       { return false }\nfunc (req *requiredNode) Required() bool       { return true }\nfunc (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() }\n\ntype node struct{}\n\n// Leaf returns a leaf node of the given type.\nfunc Leaf(typ Type) Node {\n\treturn &leafNode{typ: typ}\n}\n\ntype leafNode struct{ typ Type }\n\nfunc (n *leafNode) String() string { return sprint(\"\", n) }\n\nfunc (n *leafNode) 
Type() Type { return n.typ }\n\nfunc (n *leafNode) Optional() bool { return false }\n\nfunc (n *leafNode) Repeated() bool { return false }\n\nfunc (n *leafNode) Required() bool { return true }\n\nfunc (n *leafNode) Leaf() bool { return true }\n\nfunc (n *leafNode) Fields() []Field { return nil }\n\nfunc (n *leafNode) Encoding() encoding.Encoding { return nil }\n\nfunc (n *leafNode) Compression() compress.Codec { return nil }\n\nfunc (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) }\n\nvar repetitionTypes = [...]format.FieldRepetitionType{\n\t0: format.Required,\n\t1: format.Optional,\n\t2: format.Repeated,\n}\n\nfunc fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType {\n\tswitch {\n\tcase node.Required():\n\t\treturn &repetitionTypes[format.Required]\n\tcase node.Optional():\n\t\treturn &repetitionTypes[format.Optional]\n\tcase node.Repeated():\n\t\treturn &repetitionTypes[format.Repeated]\n\tdefault:\n\t\treturn nil\n\t}\n}\n\nfunc fieldRepetitionTypeOf(node Node) format.FieldRepetitionType {\n\tswitch {\n\tcase node.Optional():\n\t\treturn format.Optional\n\tcase node.Repeated():\n\t\treturn format.Repeated\n\tdefault:\n\t\treturn format.Required\n\t}\n}\n\nfunc applyFieldRepetitionType(t format.FieldRepetitionType, repetitionLevel, definitionLevel byte) (byte, byte) {\n\tswitch t {\n\tcase format.Optional:\n\t\tdefinitionLevel++\n\tcase format.Repeated:\n\t\trepetitionLevel++\n\t\tdefinitionLevel++\n\t}\n\treturn repetitionLevel, definitionLevel\n}\n\ntype Group map[string]Node\n\nfunc (g Group) String() string { return sprint(\"\", g) }\n\nfunc (g Group) Type() Type { return groupType{} }\n\nfunc (g Group) Optional() bool { return false }\n\nfunc (g Group) Repeated() bool { return false }\n\nfunc (g Group) Required() bool { return true }\n\nfunc (g Group) Leaf() bool { return false }\n\nfunc (g Group) Fields() []Field {\n\tgroupFields := make([]groupField, 0, len(g))\n\tfor name, node := range g {\n\t\tgroupFields = append(groupFields, 
groupField{\n\t\t\tNode: node,\n\t\t\tname: name,\n\t\t})\n\t}\n\tsort.Slice(groupFields, func(i, j int) bool {\n\t\treturn groupFields[i].name < groupFields[j].name\n\t})\n\tfields := make([]Field, len(groupFields))\n\tfor i := range groupFields {\n\t\tfields[i] = &groupFields[i]\n\t}\n\treturn fields\n}\n\nfunc (g Group) Encoding() encoding.Encoding { return nil }\n\nfunc (g Group) Compression() compress.Codec { return nil }\n\nfunc (g Group) GoType() reflect.Type { return goTypeOfGroup(g) }\n\ntype groupField struct {\n\tNode\n\tname string\n}\n\nfunc (f *groupField) Name() string { return f.name }\n\nfunc (f *groupField) Value(base reflect.Value) reflect.Value {\n\treturn base.MapIndex(reflect.ValueOf(&f.name).Elem())\n}\n\nfunc goTypeOf(node Node) reflect.Type {\n\tswitch {\n\tcase node.Optional():\n\t\treturn goTypeOfOptional(node)\n\tcase node.Repeated():\n\t\treturn goTypeOfRepeated(node)\n\tdefault:\n\t\treturn goTypeOfRequired(node)\n\t}\n}\n\nfunc goTypeOfOptional(node Node) reflect.Type {\n\treturn reflect.PtrTo(goTypeOfRequired(node))\n}\n\nfunc goTypeOfRepeated(node Node) reflect.Type {\n\treturn reflect.SliceOf(goTypeOfRequired(node))\n}\n\nfunc goTypeOfRequired(node Node) reflect.Type {\n\tif node.Leaf() {\n\t\treturn goTypeOfLeaf(node)\n\t} else {\n\t\treturn goTypeOfGroup(node)\n\t}\n}\n\nfunc goTypeOfLeaf(node Node) reflect.Type {\n\tt := node.Type()\n\tif convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok {\n\t\treturn convertibleType.GoType()\n\t}\n\tswitch t.Kind() {\n\tcase Boolean:\n\t\treturn reflect.TypeOf(false)\n\tcase Int32:\n\t\treturn reflect.TypeOf(int32(0))\n\tcase Int64:\n\t\treturn reflect.TypeOf(int64(0))\n\tcase Int96:\n\t\treturn reflect.TypeOf(deprecated.Int96{})\n\tcase Float:\n\t\treturn reflect.TypeOf(float32(0))\n\tcase Double:\n\t\treturn reflect.TypeOf(float64(0))\n\tcase ByteArray:\n\t\treturn reflect.TypeOf(([]byte)(nil))\n\tcase FixedLenByteArray:\n\t\treturn reflect.ArrayOf(t.Length(), 
reflect.TypeOf(byte(0)))\n\tdefault:\n\t\tpanic(\"BUG: parquet type returned an unsupported kind\")\n\t}\n}\n\nfunc goTypeOfGroup(node Node) reflect.Type {\n\tfields := node.Fields()\n\tstructFields := make([]reflect.StructField, len(fields))\n\tfor i, field := range fields {\n\t\tstructFields[i].Name = exportedStructFieldName(field.Name())\n\t\tstructFields[i].Type = field.GoType()\n\t\t// TODO: can we reconstruct a struct tag that would be valid if a value\n\t\t// of this type were passed to SchemaOf?\n\t}\n\treturn reflect.StructOf(structFields)\n}\n\nfunc exportedStructFieldName(name string) string {\n\tfirstRune, size := utf8.DecodeRuneInString(name)\n\treturn string([]rune{unicode.ToUpper(firstRune)}) + name[size:]\n}\n\nfunc isList(node Node) bool {\n\tlogicalType := node.Type().LogicalType()\n\treturn logicalType != nil && logicalType.List != nil\n}\n\nfunc isMap(node Node) bool {\n\tlogicalType := node.Type().LogicalType()\n\treturn logicalType != nil && logicalType.Map != nil\n}\n\nfunc numLeafColumnsOf(node Node) int16 {\n\treturn makeColumnIndex(numLeafColumns(node, 0))\n}\n\nfunc numLeafColumns(node Node, columnIndex int) int {\n\tif node.Leaf() {\n\t\treturn columnIndex + 1\n\t}\n\tfor _, field := range node.Fields() {\n\t\tcolumnIndex = numLeafColumns(field, columnIndex)\n\t}\n\treturn columnIndex\n}\n\nfunc listElementOf(node Node) Node {\n\tif !node.Leaf() {\n\t\tif list := fieldByName(node, \"list\"); list != nil {\n\t\t\tif elem := fieldByName(list, \"element\"); elem != nil {\n\t\t\t\treturn elem\n\t\t\t}\n\t\t}\n\t}\n\tpanic(\"node with logical type LIST is not composed of a repeated .list.element\")\n}\n\nfunc mapKeyValueOf(node Node) Node {\n\tif !node.Leaf() && (node.Required() || node.Optional()) {\n\t\tif keyValue := fieldByName(node, \"key_value\"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() {\n\t\t\tk := fieldByName(keyValue, \"key\")\n\t\t\tv := fieldByName(keyValue, \"value\")\n\t\t\tif k != nil && v != nil && 
k.Required() {\n\t\t\t\treturn keyValue\n\t\t\t}\n\t\t}\n\t}\n\tpanic(\"node with logical type MAP is not composed of a repeated .key_value group with key and value fields\")\n}\n\nfunc encodingOf(node Node) encoding.Encoding {\n\tencoding := node.Encoding()\n\t// The parquet-format documentation states that the\n\t// DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when\n\t// encoding BYTE_ARRAY values. We apply it as a default if\n\t// none were explicitly specified, which gives the application\n\t// the opportunity to override this behavior if needed.\n\t//\n\t// https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6\n\tif node.Type().Kind() == ByteArray && encoding == nil {\n\t\tencoding = &DeltaLengthByteArray\n\t}\n\tif encoding == nil {\n\t\tencoding = &Plain\n\t}\n\treturn encoding\n}\n\nfunc forEachNodeOf(name string, node Node, do func(string, Node)) {\n\tdo(name, node)\n\n\tfor _, f := range node.Fields() {\n\t\tforEachNodeOf(f.Name(), f, do)\n\t}\n}\n\nfunc fieldByName(node Node, name string) Field {\n\tfor _, f := range node.Fields() {\n\t\tif f.Name() == name {\n\t\t\treturn f\n\t\t}\n\t}\n\treturn nil\n}\n\nfunc nodesAreEqual(node1, node2 Node) bool {\n\tif node1.Leaf() {\n\t\treturn node2.Leaf() && leafNodesAreEqual(node1, node2)\n\t} else {\n\t\treturn !node2.Leaf() && groupNodesAreEqual(node1, node2)\n\t}\n}\n\nfunc typesAreEqual(type1, type2 Type) bool {\n\treturn type1.Kind() == type2.Kind() &&\n\t\ttype1.Length() == type2.Length() &&\n\t\treflect.DeepEqual(type1.LogicalType(), type2.LogicalType())\n}\n\nfunc repetitionsAreEqual(node1, node2 Node) bool {\n\treturn node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated()\n}\n\nfunc leafNodesAreEqual(node1, node2 Node) bool {\n\treturn typesAreEqual(node1.Type(), node2.Type()) && repetitionsAreEqual(node1, node2)\n}\n\nfunc groupNodesAreEqual(node1, node2 Node) bool {\n\tfields1 := node1.Fields()\n\tfields2 := 
node2.Fields()\n\n\tif len(fields1) != len(fields2) {\n\t\treturn false\n\t}\n\n\tif !repetitionsAreEqual(node1, node2) {\n\t\treturn false\n\t}\n\n\tfor i := range fields1 {\n\t\tf1 := fields1[i]\n\t\tf2 := fields2[i]\n\n\t\tif f1.Name() != f2.Name() {\n\t\t\treturn false\n\t\t}\n\n\t\tif !nodesAreEqual(f1, f2) {\n\t\t\treturn false\n\t\t}\n\t}\n\n\treturn true\n}\n"
  },
  {
    "path": "null.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"reflect\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/internal/bytealg\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\n// nullIndexFunc is the type of functions used to detect null values in rows.\n//\n// For each value of the rows array, the bitmap passed as first argument is\n// populated to indicate whether the values were null (0) or not (1).\n//\n// The function writes one bit to the output buffer for each row in the input,\n// the buffer must be sized accordingly.\ntype nullIndexFunc func(bits []uint64, rows sparse.Array)\n\nfunc nullIndex[T comparable](bits []uint64, rows sparse.Array) {\n\tvar zero T\n\tfor i := 0; i < rows.Len(); i++ {\n\t\tv := *(*T)(rows.Index(i))\n\t\tif v != zero {\n\t\t\tx := uint(i) / 64\n\t\t\ty := uint(i) % 64\n\t\t\tbits[x] |= 1 << y\n\t\t}\n\t}\n}\n\nfunc nullIndexStruct(bits []uint64, rows sparse.Array) {\n\tbytealg.Broadcast(unsafecast.Slice[byte](bits), 0xFF)\n}\n\nfunc nullIndexFuncOf(t reflect.Type) nullIndexFunc {\n\tswitch t {\n\tcase reflect.TypeOf(deprecated.Int96{}):\n\t\treturn nullIndex[deprecated.Int96]\n\t}\n\n\tswitch t.Kind() {\n\tcase reflect.Bool:\n\t\treturn nullIndexBool\n\n\tcase reflect.Int:\n\t\treturn nullIndexInt\n\n\tcase reflect.Int32:\n\t\treturn nullIndexInt32\n\n\tcase reflect.Int64:\n\t\treturn nullIndexInt64\n\n\tcase reflect.Uint:\n\t\treturn nullIndexUint\n\n\tcase reflect.Uint32:\n\t\treturn nullIndexUint32\n\n\tcase reflect.Uint64:\n\t\treturn nullIndexUint64\n\n\tcase reflect.Float32:\n\t\treturn nullIndexFloat32\n\n\tcase reflect.Float64:\n\t\treturn nullIndexFloat64\n\n\tcase reflect.String:\n\t\treturn nullIndexString\n\n\tcase reflect.Slice:\n\t\treturn nullIndexSlice\n\n\tcase reflect.Map:\n\t\treturn nullIndexPointer\n\n\tcase reflect.Array:\n\t\tif t.Elem().Kind() == reflect.Uint8 {\n\t\t\tswitch 
size := t.Len(); size {\n\t\t\tcase 16:\n\t\t\t\treturn nullIndexUint128\n\t\t\tdefault:\n\t\t\t\treturn nullIndexFuncOfByteArray(size)\n\t\t\t}\n\t\t}\n\n\tcase reflect.Pointer:\n\t\treturn nullIndexPointer\n\n\tcase reflect.Struct:\n\t\treturn nullIndexStruct\n\t}\n\n\tpanic(\"cannot convert Go values of type \" + typeNameOf(t) + \" to parquet value\")\n}\n\nfunc nullIndexFuncOfByteArray(n int) nullIndexFunc {\n\treturn func(bits []uint64, rows sparse.Array) {\n\t\tfor i := 0; i < rows.Len(); i++ {\n\t\t\tp := (*byte)(rows.Index(i))\n\t\t\tb := unsafe.Slice(p, n)\n\t\t\tif !isZero(b) {\n\t\t\t\tx := uint(i) / 64\n\t\t\t\ty := uint(i) % 64\n\t\t\t\tbits[x] |= 1 << y\n\t\t\t}\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "null_amd64.go",
    "content": "//go:build go1.18 && !purego\n\npackage parquet\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\n//go:noescape\nfunc nullIndex8(bits *uint64, rows sparse.Array)\n\n//go:noescape\nfunc nullIndex32(bits *uint64, rows sparse.Array)\n\n//go:noescape\nfunc nullIndex64(bits *uint64, rows sparse.Array)\n\n//go:noescape\nfunc nullIndex128(bits *uint64, rows sparse.Array)\n\nfunc nullIndexBool(bits []uint64, rows sparse.Array) {\n\tnullIndex8(&bits[0], rows)\n}\n\nfunc nullIndexInt(bits []uint64, rows sparse.Array) {\n\tnullIndex64(&bits[0], rows)\n}\n\nfunc nullIndexInt32(bits []uint64, rows sparse.Array) {\n\tnullIndex32(&bits[0], rows)\n}\n\nfunc nullIndexInt64(bits []uint64, rows sparse.Array) {\n\tnullIndex64(&bits[0], rows)\n}\n\nfunc nullIndexUint(bits []uint64, rows sparse.Array) {\n\tnullIndex64(&bits[0], rows)\n}\n\nfunc nullIndexUint32(bits []uint64, rows sparse.Array) {\n\tnullIndex32(&bits[0], rows)\n}\n\nfunc nullIndexUint64(bits []uint64, rows sparse.Array) {\n\tnullIndex64(&bits[0], rows)\n}\n\nfunc nullIndexUint128(bits []uint64, rows sparse.Array) {\n\tnullIndex128(&bits[0], rows)\n}\n\nfunc nullIndexFloat32(bits []uint64, rows sparse.Array) {\n\tnullIndex32(&bits[0], rows)\n}\n\nfunc nullIndexFloat64(bits []uint64, rows sparse.Array) {\n\tnullIndex64(&bits[0], rows)\n}\n\nfunc nullIndexString(bits []uint64, rows sparse.Array) {\n\t// We offset by an extra 8 bytes to test the lengths of string values where\n\t// the first field is the pointer and the second is the length which we want\n\t// to test.\n\tnullIndex64(&bits[0], rows.Offset(8))\n}\n\nfunc nullIndexSlice(bits []uint64, rows sparse.Array) {\n\t// Slice values are null if their pointer is nil, which is held in the first\n\t// 8 bytes of the object so we can simply test 64 bits words.\n\tnullIndex64(&bits[0], rows)\n}\n\nfunc nullIndexPointer(bits []uint64, rows sparse.Array) {\n\tnullIndex64(&bits[0], rows)\n}\n"
  },
  {
    "path": "null_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func nullIndex8(bits *uint64, rows sparse.Array)\nTEXT ·nullIndex8(SB), NOSPLIT, $0-32\n    MOVQ bits+0(FP), AX\n    MOVQ rows_array_ptr+8(FP), BX\n    MOVQ rows_array_len+16(FP), DI\n    MOVQ rows_array_off+24(FP), DX\n\n    MOVQ $1, CX\n    XORQ SI, SI\n\n    CMPQ DI, $0\n    JE done\nloop1x1:\n    XORQ R8, R8\n    MOVB (BX), R9\n    CMPB R9, $0\n    JE next1x1\n\n    MOVQ SI, R10\n    SHRQ $6, R10\n    ORQ CX, (AX)(R10*8)\nnext1x1:\n    ADDQ DX, BX\n    ROLQ $1, CX\n    INCQ SI\n    CMPQ SI, DI\n    JNE loop1x1\ndone:\n    RET\n\n// func nullIndex32(bits *uint64, rows sparse.Array)\nTEXT ·nullIndex32(SB), NOSPLIT, $0-32\n    MOVQ bits+0(FP), AX\n    MOVQ rows_array_ptr+8(FP), BX\n    MOVQ rows_array_len+16(FP), DI\n    MOVQ rows_array_off+24(FP), DX\n\n    MOVQ $1, CX\n    XORQ SI, SI\n\n    CMPQ DI, $0\n    JE done\n\n    CMPQ DI, $8\n    JB loop1x4\n\n    CMPB ·hasAVX2(SB), $0\n    JE loop1x4\n\n    MOVQ DI, R8\n    SHRQ $3, R8\n    SHLQ $3, R8\n\n    VPBROADCASTD rows_array_off+24(FP), Y0\n    VPMULLD ·range0n8(SB), Y0, Y0\n    VPCMPEQD Y1, Y1, Y1\n    VPCMPEQD Y2, Y2, Y2\n    VPXOR Y3, Y3, Y3\nloop8x4:\n    VPGATHERDD Y1, (BX)(Y0*1), Y4\n    VPCMPEQD Y3, Y4, Y4\n    VMOVMSKPS Y4, R9\n    VMOVDQU Y2, Y1\n\n    NOTQ R9\n    ANDQ $0b11111111, R9\n\n    MOVQ SI, CX\n    ANDQ $0b111111, CX\n\n    MOVQ SI, R10\n    SHRQ $6, R10\n\n    SHLQ CX, R9\n    ORQ R9, (AX)(R10*8)\n\n    LEAQ (BX)(DX*8), BX\n    ADDQ $8, SI\n    CMPQ SI, R8\n    JNE loop8x4\n    VZEROUPPER\n\n    CMPQ SI, DI\n    JE done\n\n    MOVQ $1, R8\n    MOVQ SI, CX\n    ANDQ $0b111111, R8\n    SHLQ CX, R8\n    MOVQ R8, CX\n\nloop1x4:\n    MOVL (BX), R8\n    CMPL R8, $0\n    JE next1x4\n\n    MOVQ SI, R9\n    SHRQ $6, R9\n    ORQ CX, (AX)(R9*8)\nnext1x4:\n    ADDQ DX, BX\n    ROLQ $1, CX\n    INCQ SI\n    CMPQ SI, DI\n    JNE loop1x4\ndone:\n    RET\n\n// func nullIndex64(bits *uint64, rows sparse.Array)\nTEXT ·nullIndex64(SB), 
NOSPLIT, $0-32\n    MOVQ bits+0(FP), AX\n    MOVQ rows_array_ptr+8(FP), BX\n    MOVQ rows_array_len+16(FP), DI\n    MOVQ rows_array_off+24(FP), DX\n\n    MOVQ $1, CX\n    XORQ SI, SI\n\n    CMPQ DI, $0\n    JE done\n\n    CMPQ DI, $4\n    JB loop1x8\n\n    CMPB ·hasAVX2(SB), $0\n    JE loop1x8\n\n    MOVQ DI, R8\n    SHRQ $2, R8\n    SHLQ $2, R8\n\n    VPBROADCASTQ rows_array_off+24(FP), Y0\n    VPMULLD scale4x8<>(SB), Y0, Y0\n    VPCMPEQQ Y1, Y1, Y1\n    VPCMPEQQ Y2, Y2, Y2\n    VPXOR Y3, Y3, Y3\nloop4x8:\n    VPGATHERQQ Y1, (BX)(Y0*1), Y4\n    VPCMPEQQ Y3, Y4, Y4\n    VMOVMSKPD Y4, R9\n    VMOVDQU Y2, Y1\n\n    NOTQ R9\n    ANDQ $0b1111, R9\n\n    MOVQ SI, CX\n    ANDQ $0b111111, CX\n\n    MOVQ SI, R10\n    SHRQ $6, R10\n\n    SHLQ CX, R9\n    ORQ R9, (AX)(R10*8)\n\n    LEAQ (BX)(DX*4), BX\n    ADDQ $4, SI\n    CMPQ SI, R8\n    JNE loop4x8\n    VZEROUPPER\n\n    CMPQ SI, DI\n    JE done\n\n    MOVQ $1, R8\n    MOVQ SI, CX\n    ANDQ $0b111111, R8\n    SHLQ CX, R8\n    MOVQ R8, CX\n\nloop1x8:\n    MOVQ (BX), R8\n    CMPQ R8, $0\n    JE next1x8\n\n    MOVQ SI, R9\n    SHRQ $6, R9\n    ORQ CX, (AX)(R9*8)\nnext1x8:\n    ADDQ DX, BX\n    ROLQ $1, CX\n    INCQ SI\n    CMPQ SI, DI\n    JNE loop1x8\ndone:\n    RET\n\nGLOBL scale4x8<>(SB), RODATA|NOPTR, $32\nDATA scale4x8<>+0(SB)/8,  $0\nDATA scale4x8<>+8(SB)/8,  $1\nDATA scale4x8<>+16(SB)/8, $2\nDATA scale4x8<>+24(SB)/8, $3\n\n// func nullIndex128(bits *uint64, rows sparse.Array)\nTEXT ·nullIndex128(SB), NOSPLIT, $0-32\n    MOVQ bits+0(FP), AX\n    MOVQ rows_array_ptr+8(FP), BX\n    MOVQ rows_array_len+16(FP), DI\n    MOVQ rows_array_off+24(FP), DX\n\n    CMPQ DI, $0\n    JE done\n\n    MOVQ $1, CX\n    XORQ SI, SI\n    PXOR X0, X0\nloop1x16:\n    MOVOU (BX), X1\n    PCMPEQQ X0, X1\n    MOVMSKPD X1, R8\n    CMPB R8, $0b11\n    JE next1x16\n\n    MOVQ SI, R9\n    SHRQ $6, R9\n    ORQ CX, (AX)(R9*8)\nnext1x16:\n    ADDQ DX, BX\n    ROLQ $1, CX\n    INCQ SI\n    CMPQ SI, DI\n    JNE loop1x16\ndone:\n    RET\n"
  },
  {
    "path": "null_purego.go",
    "content": "//go:build go1.18 && (purego || !amd64)\n\npackage parquet\n\nimport \"github.com/segmentio/parquet-go/sparse\"\n\nfunc nullIndexBool(bits []uint64, rows sparse.Array) {\n\tnullIndex[bool](bits, rows)\n}\n\nfunc nullIndexInt(bits []uint64, rows sparse.Array) {\n\tnullIndex[int](bits, rows)\n}\n\nfunc nullIndexInt32(bits []uint64, rows sparse.Array) {\n\tnullIndex[int32](bits, rows)\n}\n\nfunc nullIndexInt64(bits []uint64, rows sparse.Array) {\n\tnullIndex[int64](bits, rows)\n}\n\nfunc nullIndexUint(bits []uint64, rows sparse.Array) {\n\tnullIndex[uint](bits, rows)\n}\n\nfunc nullIndexUint32(bits []uint64, rows sparse.Array) {\n\tnullIndex[uint32](bits, rows)\n}\n\nfunc nullIndexUint64(bits []uint64, rows sparse.Array) {\n\tnullIndex[uint64](bits, rows)\n}\n\nfunc nullIndexUint128(bits []uint64, rows sparse.Array) {\n\tnullIndex[[16]byte](bits, rows)\n}\n\nfunc nullIndexFloat32(bits []uint64, rows sparse.Array) {\n\tnullIndex[float32](bits, rows)\n}\n\nfunc nullIndexFloat64(bits []uint64, rows sparse.Array) {\n\tnullIndex[float64](bits, rows)\n}\n\nfunc nullIndexString(bits []uint64, rows sparse.Array) {\n\tnullIndex[string](bits, rows)\n}\n\nfunc nullIndexSlice(bits []uint64, rows sparse.Array) {\n\tfor i := 0; i < rows.Len(); i++ {\n\t\tp := *(**struct{})(rows.Index(i))\n\t\tb := uint64(0)\n\t\tif p != nil {\n\t\t\tb = 1\n\t\t}\n\t\tbits[uint(i)/64] |= b << (uint(i) % 64)\n\t}\n}\n\nfunc nullIndexPointer(bits []uint64, rows sparse.Array) {\n\tnullIndex[*struct{}](bits, rows)\n}\n"
  },
  {
    "path": "null_test.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nfunc TestNullIndex(t *testing.T) {\n\ttestNullIndex[bool](t)\n\ttestNullIndex[int](t)\n\ttestNullIndex[int32](t)\n\ttestNullIndex[int64](t)\n\ttestNullIndex[uint](t)\n\ttestNullIndex[uint32](t)\n\ttestNullIndex[uint64](t)\n\ttestNullIndex[float32](t)\n\ttestNullIndex[float64](t)\n\ttestNullIndex[[10]byte](t)\n\ttestNullIndex[[16]byte](t)\n\ttestNullIndex[deprecated.Int96](t)\n\ttestNullIndex[string](t)\n\ttestNullIndex[*struct{}](t)\n}\n\nfunc testNullIndex[T comparable](t *testing.T) {\n\tvar zero T\n\tt.Helper()\n\tt.Run(reflect.TypeOf(zero).String(), func(t *testing.T) {\n\t\terr := quick.Check(func(data []T) bool {\n\t\t\tif len(data) == 0 {\n\t\t\t\treturn true\n\t\t\t}\n\n\t\t\twant := make([]uint64, (len(data)+63)/64)\n\t\t\tgot := make([]uint64, (len(data)+63)/64)\n\n\t\t\tfor i := range data {\n\t\t\t\tif (i % 2) == 0 {\n\t\t\t\t\tdata[i] = zero\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tarray := makeArrayOf(data)\n\t\t\tnullIndex[T](want, array)\n\t\t\tnullIndexFuncOf(reflect.TypeOf(zero))(got, array)\n\n\t\t\tif !reflect.DeepEqual(want, got) {\n\t\t\t\tt.Errorf(\"unexpected null index\\nwant = %064b\\ngot  = %064b\", want, got)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\treturn true\n\t\t})\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t}\n\t})\n}\n\nfunc BenchmarkNullIndex(b *testing.B) 
{\n\tbenchmarkNullIndex[bool](b)\n\tbenchmarkNullIndex[int](b)\n\tbenchmarkNullIndex[int32](b)\n\tbenchmarkNullIndex[int64](b)\n\tbenchmarkNullIndex[uint](b)\n\tbenchmarkNullIndex[uint32](b)\n\tbenchmarkNullIndex[uint64](b)\n\tbenchmarkNullIndex[float32](b)\n\tbenchmarkNullIndex[float64](b)\n\tbenchmarkNullIndex[[10]byte](b)\n\tbenchmarkNullIndex[[16]byte](b)\n\tbenchmarkNullIndex[deprecated.Int96](b)\n\tbenchmarkNullIndex[string](b)\n\tbenchmarkNullIndex[[]struct{}](b)\n\tbenchmarkNullIndex[*struct{}](b)\n}\n\nfunc benchmarkNullIndex[T any](b *testing.B) {\n\tconst N = 1000\n\n\tvar zero T\n\ttyp := reflect.TypeOf(zero)\n\tnull := nullIndexFuncOf(typ)\n\tdata := makeArrayOf(make([]T, N))\n\tbits := make([]uint64, (N+63)/64)\n\n\tb.Run(typ.String(), func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tnull(bits, data)\n\t\t}\n\t\tb.SetBytes(int64(typ.Size() * N))\n\t})\n}\n"
  },
  {
    "path": "offset_index.go",
    "content": "package parquet\n\nimport (\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\ntype OffsetIndex interface {\n\t// NumPages returns the number of pages in the offset index.\n\tNumPages() int\n\n\t// Offset returns the offset starting from the beginning of the file for the\n\t// page at the given index.\n\tOffset(int) int64\n\n\t// CompressedPageSize returns the size of the page at the given index\n\t// (in bytes).\n\tCompressedPageSize(int) int64\n\n\t// FirstRowIndex returns the the first row in the page at the given index.\n\t//\n\t// The returned row index is based on the row group that the page belongs\n\t// to, the first row has index zero.\n\tFirstRowIndex(int) int64\n}\n\ntype fileOffsetIndex format.OffsetIndex\n\nfunc (i *fileOffsetIndex) NumPages() int {\n\treturn len(i.PageLocations)\n}\n\nfunc (i *fileOffsetIndex) Offset(j int) int64 {\n\treturn i.PageLocations[j].Offset\n}\n\nfunc (i *fileOffsetIndex) CompressedPageSize(j int) int64 {\n\treturn int64(i.PageLocations[j].CompressedPageSize)\n}\n\nfunc (i *fileOffsetIndex) FirstRowIndex(j int) int64 {\n\treturn i.PageLocations[j].FirstRowIndex\n}\n\ntype emptyOffsetIndex struct{}\n\nfunc (emptyOffsetIndex) NumPages() int                { return 0 }\nfunc (emptyOffsetIndex) Offset(int) int64             { return 0 }\nfunc (emptyOffsetIndex) CompressedPageSize(int) int64 { return 0 }\nfunc (emptyOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype booleanOffsetIndex struct{ page *booleanPage }\n\nfunc (i booleanOffsetIndex) NumPages() int                { return 1 }\nfunc (i booleanOffsetIndex) Offset(int) int64             { return 0 }\nfunc (i booleanOffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i booleanOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype int32OffsetIndex struct{ page *int32Page }\n\nfunc (i int32OffsetIndex) NumPages() int                { return 1 }\nfunc (i int32OffsetIndex) Offset(int) int64             { return 0 
}\nfunc (i int32OffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i int32OffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype int64OffsetIndex struct{ page *int64Page }\n\nfunc (i int64OffsetIndex) NumPages() int                { return 1 }\nfunc (i int64OffsetIndex) Offset(int) int64             { return 0 }\nfunc (i int64OffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i int64OffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype int96OffsetIndex struct{ page *int96Page }\n\nfunc (i int96OffsetIndex) NumPages() int                { return 1 }\nfunc (i int96OffsetIndex) Offset(int) int64             { return 0 }\nfunc (i int96OffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i int96OffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype floatOffsetIndex struct{ page *floatPage }\n\nfunc (i floatOffsetIndex) NumPages() int                { return 1 }\nfunc (i floatOffsetIndex) Offset(int) int64             { return 0 }\nfunc (i floatOffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i floatOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype doubleOffsetIndex struct{ page *doublePage }\n\nfunc (i doubleOffsetIndex) NumPages() int                { return 1 }\nfunc (i doubleOffsetIndex) Offset(int) int64             { return 0 }\nfunc (i doubleOffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i doubleOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype byteArrayOffsetIndex struct{ page *byteArrayPage }\n\nfunc (i byteArrayOffsetIndex) NumPages() int                { return 1 }\nfunc (i byteArrayOffsetIndex) Offset(int) int64             { return 0 }\nfunc (i byteArrayOffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i byteArrayOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype fixedLenByteArrayOffsetIndex struct{ page *fixedLenByteArrayPage }\n\nfunc (i 
fixedLenByteArrayOffsetIndex) NumPages() int                { return 1 }\nfunc (i fixedLenByteArrayOffsetIndex) Offset(int) int64             { return 0 }\nfunc (i fixedLenByteArrayOffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i fixedLenByteArrayOffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype uint32OffsetIndex struct{ page *uint32Page }\n\nfunc (i uint32OffsetIndex) NumPages() int                { return 1 }\nfunc (i uint32OffsetIndex) Offset(int) int64             { return 0 }\nfunc (i uint32OffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i uint32OffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype uint64OffsetIndex struct{ page *uint64Page }\n\nfunc (i uint64OffsetIndex) NumPages() int                { return 1 }\nfunc (i uint64OffsetIndex) Offset(int) int64             { return 0 }\nfunc (i uint64OffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i uint64OffsetIndex) FirstRowIndex(int) int64      { return 0 }\n\ntype be128OffsetIndex struct{ page *be128Page }\n\nfunc (i be128OffsetIndex) NumPages() int                { return 1 }\nfunc (i be128OffsetIndex) Offset(int) int64             { return 0 }\nfunc (i be128OffsetIndex) CompressedPageSize(int) int64 { return i.page.Size() }\nfunc (i be128OffsetIndex) FirstRowIndex(int) int64      { return 0 }\n"
  },
  {
    "path": "order.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc orderOfBool(data []bool) int {\n\tswitch len(data) {\n\tcase 0, 1:\n\t\treturn 0\n\tdefault:\n\t\tk := 0\n\t\ti := 0\n\n\t\tif data[0] { // true => false: descending\n\t\t\tk = -1\n\t\t\ti = streakOfTrue(data)\n\t\t\tif i == len(data) {\n\t\t\t\tk = +1\n\t\t\t} else {\n\t\t\t\ti += streakOfFalse(data[i:])\n\t\t\t}\n\t\t} else { // false => true: ascending\n\t\t\tk = +1\n\t\t\ti = streakOfFalse(data)\n\t\t\ti += streakOfTrue(data[i:])\n\t\t}\n\n\t\tif i != len(data) {\n\t\t\tk = 0\n\t\t}\n\t\treturn k\n\t}\n}\n\nfunc streakOfTrue(data []bool) int {\n\tif i := bytes.IndexByte(unsafecast.BoolToBytes(data), 0); i >= 0 {\n\t\treturn i\n\t}\n\treturn len(data)\n}\n\nfunc streakOfFalse(data []bool) int {\n\tif i := bytes.IndexByte(unsafecast.BoolToBytes(data), 1); i >= 0 {\n\t\treturn i\n\t}\n\treturn len(data)\n}\n\nfunc orderOfBytes(data [][]byte) int {\n\tswitch len(data) {\n\tcase 0, 1:\n\t\treturn 0\n\t}\n\tdata = skipBytesStreak(data)\n\tif len(data) < 2 {\n\t\treturn 1\n\t}\n\tordering := bytes.Compare(data[0], data[1])\n\tswitch {\n\tcase ordering < 0:\n\t\tif bytesAreInAscendingOrder(data[1:]) {\n\t\t\treturn +1\n\t\t}\n\tcase ordering > 0:\n\t\tif bytesAreInDescendingOrder(data[1:]) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc skipBytesStreak(data [][]byte) [][]byte {\n\tfor i := 1; i < len(data); i++ {\n\t\tif !bytes.Equal(data[i], data[0]) {\n\t\t\treturn data[i-1:]\n\t\t}\n\t}\n\treturn data[len(data)-1:]\n}\n\nfunc bytesAreInAscendingOrder(data [][]byte) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tk := bytes.Compare(data[i-1], data[i])\n\t\tif k > 0 {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc bytesAreInDescendingOrder(data [][]byte) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tk := bytes.Compare(data[i-1], data[i])\n\t\tif k < 0 {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n"
  },
  {
    "path": "order_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\n//go:noescape\nfunc orderOfInt32(data []int32) int\n\n//go:noescape\nfunc orderOfInt64(data []int64) int\n\n//go:noescape\nfunc orderOfUint32(data []uint32) int\n\n//go:noescape\nfunc orderOfUint64(data []uint64) int\n\n//go:noescape\nfunc orderOfFloat32(data []float32) int\n\n//go:noescape\nfunc orderOfFloat64(data []float64) int\n"
  },
  {
    "path": "order_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define UNDEFINED 0\n#define ASCENDING 1\n#define DESCENDING -1\n\nDATA shift1x32<>+0(SB)/4, $1\nDATA shift1x32<>+4(SB)/4, $2\nDATA shift1x32<>+8(SB)/4, $3\nDATA shift1x32<>+12(SB)/4, $4\nDATA shift1x32<>+16(SB)/4, $5\nDATA shift1x32<>+20(SB)/4, $6\nDATA shift1x32<>+24(SB)/4, $7\nDATA shift1x32<>+28(SB)/4, $8\nDATA shift1x32<>+32(SB)/4, $9\nDATA shift1x32<>+36(SB)/4, $10\nDATA shift1x32<>+40(SB)/4, $11\nDATA shift1x32<>+44(SB)/4, $12\nDATA shift1x32<>+48(SB)/4, $13\nDATA shift1x32<>+52(SB)/4, $14\nDATA shift1x32<>+56(SB)/4, $15\nDATA shift1x32<>+60(SB)/4, $15\nGLOBL shift1x32<>(SB), RODATA|NOPTR, $64\n\nDATA shift1x64<>+0(SB)/4, $1\nDATA shift1x64<>+8(SB)/4, $2\nDATA shift1x64<>+16(SB)/4, $3\nDATA shift1x64<>+24(SB)/4, $4\nDATA shift1x64<>+32(SB)/4, $5\nDATA shift1x64<>+40(SB)/4, $6\nDATA shift1x64<>+48(SB)/4, $7\nDATA shift1x64<>+56(SB)/4, $7\nGLOBL shift1x64<>(SB), RODATA|NOPTR, $64\n\n// func orderOfInt32(data []int32) int\nTEXT ·orderOfInt32(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), R8\n    MOVQ data_len+8(FP), R9\n    XORQ SI, SI\n    XORQ DI, DI\n\n    CMPQ R9, $2\n    JB undefined\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    CMPQ R9, $16\n    JB test\n\n    XORQ DX, DX\n    MOVQ R9, AX\n    SHRQ $4, AX\n    SHLQ $4, AX\n    MOVQ $15, CX\n    IDIVQ CX\n    IMULQ $15, AX\n    DECQ R9\n\n    VMOVDQU32 shift1x32<>(SB), Z2\n    KXORW K2, K2, K2\ntestAscending15:\n    VMOVDQU32 (R8)(SI*4), Z0\n    VMOVDQU32 Z2, Z1\n    VPERMI2D Z0, Z0, Z1\n    VPCMPD $2, Z1, Z0, K1\n    KORTESTW K2, K1\n    JNC testDescending15\n    ADDQ $15, SI\n    CMPQ SI, AX\n    JNE testAscending15\n    VZEROUPPER\n    JMP testAscending\ntestDescending15:\n    VMOVDQU32 (R8)(DI*4), Z0\n    VMOVDQU32 Z2, Z1\n    VPERMI2D Z0, Z0, Z1\n    VPCMPD $5, Z1, Z0, K1\n    KORTESTW K2, K1\n    JNC undefined15\n    ADDQ $15, DI\n    CMPQ DI, AX\n    JNE testDescending15\n    VZEROUPPER\n    JMP testDescending\n\ntest:\n    DECQ 
R9\ntestAscending:\n    CMPQ SI, R9\n    JAE ascending\n    MOVL (R8)(SI*4), BX\n    MOVL 4(R8)(SI*4), DX\n    INCQ SI\n    CMPL BX, DX\n    JLE testAscending\n    JMP testDescending\nascending:\n    MOVQ $ASCENDING, ret+24(FP)\n    RET\ntestDescending:\n    CMPQ DI, R9\n    JAE descending\n    MOVL (R8)(DI*4), BX\n    MOVL 4(R8)(DI*4), DX\n    INCQ DI\n    CMPL BX, DX\n    JGE testDescending\n    JMP undefined\ndescending:\n    MOVQ $DESCENDING, ret+24(FP)\n    RET\nundefined15:\n    VZEROUPPER\nundefined:\n    MOVQ $UNDEFINED, ret+24(FP)\n    RET\n\n// func orderOfInt64(data []int64) int\nTEXT ·orderOfInt64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), R8\n    MOVQ data_len+8(FP), R9\n    XORQ SI, SI\n    XORQ DI, DI\n\n    CMPQ R9, $2\n    JB undefined\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    CMPQ R9, $8\n    JB test\n\n    XORQ DX, DX\n    MOVQ R9, AX\n    SHRQ $3, AX\n    SHLQ $3, AX\n    MOVQ $7, CX\n    IDIVQ CX\n    IMULQ $7, AX\n    DECQ R9\n\n    VMOVDQU64 shift1x64<>(SB), Z2\n    KXORB K2, K2, K2\ntestAscending7:\n    VMOVDQU64 (R8)(SI*8), Z0\n    VMOVDQU64 Z2, Z1\n    VPERMI2Q Z0, Z0, Z1\n    VPCMPQ $2, Z1, Z0, K1\n    KORTESTB K2, K1\n    JNC testDescending7\n    ADDQ $7, SI\n    CMPQ SI, AX\n    JNE testAscending7\n    VZEROUPPER\n    JMP testAscending\ntestDescending7:\n    VMOVDQU64 (R8)(DI*8), Z0\n    VMOVDQU64 Z2, Z1\n    VPERMI2Q Z0, Z0, Z1\n    VPCMPQ $5, Z1, Z0, K1\n    KORTESTB K2, K1\n    JNC undefined7\n    ADDQ $7, DI\n    CMPQ DI, AX\n    JNE testDescending7\n    VZEROUPPER\n    JMP testDescending\n\ntest:\n    DECQ R9\ntestAscending:\n    CMPQ SI, R9\n    JAE ascending\n    MOVQ (R8)(SI*8), BX\n    MOVQ 8(R8)(SI*8), DX\n    INCQ SI\n    CMPQ BX, DX\n    JLE testAscending\n    JMP testDescending\nascending:\n    MOVQ $ASCENDING, ret+24(FP)\n    RET\ntestDescending:\n    CMPQ DI, R9\n    JAE descending\n    MOVQ (R8)(DI*8), BX\n    MOVQ 8(R8)(DI*8), DX\n    INCQ DI\n    CMPQ BX, DX\n    JGE testDescending\n    JMP 
undefined\ndescending:\n    MOVQ $DESCENDING, ret+24(FP)\n    RET\nundefined7:\n    VZEROUPPER\nundefined:\n    MOVQ $UNDEFINED, ret+24(FP)\n    RET\n\n// func orderOfUint32(data []uint32) int\nTEXT ·orderOfUint32(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), R8\n    MOVQ data_len+8(FP), R9\n    XORQ SI, SI\n    XORQ DI, DI\n\n    CMPQ R9, $2\n    JB undefined\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    CMPQ R9, $16\n    JB test\n\n    XORQ DX, DX\n    MOVQ R9, AX\n    SHRQ $4, AX\n    SHLQ $4, AX\n    MOVQ $15, CX\n    IDIVQ CX\n    IMULQ $15, AX\n    DECQ R9\n\n    VMOVDQU32 shift1x32<>(SB), Z2\n    KXORW K2, K2, K2\ntestAscending15:\n    VMOVDQU32 (R8)(SI*4), Z0\n    VMOVDQU32 Z2, Z1\n    VPERMI2D Z0, Z0, Z1\n    VPCMPUD $2, Z1, Z0, K1\n    KORTESTW K2, K1\n    JNC testDescending15\n    ADDQ $15, SI\n    CMPQ SI, AX\n    JNE testAscending15\n    VZEROUPPER\n    JMP testAscending\ntestDescending15:\n    VMOVDQU32 (R8)(DI*4), Z0\n    VMOVDQU32 Z2, Z1\n    VPERMI2D Z0, Z0, Z1\n    VPCMPUD $5, Z1, Z0, K1\n    KORTESTW K2, K1\n    JNC undefined15\n    ADDQ $15, DI\n    CMPQ DI, AX\n    JNE testDescending15\n    VZEROUPPER\n    JMP testDescending\n\ntest:\n    DECQ R9\ntestAscending:\n    CMPQ SI, R9\n    JAE ascending\n    MOVL (R8)(SI*4), BX\n    MOVL 4(R8)(SI*4), DX\n    INCQ SI\n    CMPL BX, DX\n    JBE testAscending\n    JMP testDescending\nascending:\n    MOVQ $ASCENDING, ret+24(FP)\n    RET\ntestDescending:\n    CMPQ DI, R9\n    JAE descending\n    MOVL (R8)(DI*4), BX\n    MOVL 4(R8)(DI*4), DX\n    INCQ DI\n    CMPL BX, DX\n    JAE testDescending\n    JMP undefined\ndescending:\n    MOVQ $DESCENDING, ret+24(FP)\n    RET\nundefined15:\n    VZEROUPPER\nundefined:\n    MOVQ $UNDEFINED, ret+24(FP)\n    RET\n\n// func orderOfUint64(data []uint64) int\nTEXT ·orderOfUint64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), R8\n    MOVQ data_len+8(FP), R9\n    XORQ SI, SI\n    XORQ DI, DI\n\n    CMPQ R9, $2\n    JB undefined\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE 
test\n\n    CMPQ R9, $8\n    JB test\n\n    XORQ DX, DX\n    MOVQ R9, AX\n    SHRQ $3, AX\n    SHLQ $3, AX\n    MOVQ $7, CX\n    IDIVQ CX\n    IMULQ $7, AX\n    DECQ R9\n\n    VMOVDQU64 shift1x64<>(SB), Z2\n    KXORB K2, K2, K2\ntestAscending7:\n    VMOVDQU64 (R8)(SI*8), Z0\n    VMOVDQU64 Z2, Z1\n    VPERMI2Q Z0, Z0, Z1\n    VPCMPUQ $2, Z1, Z0, K1\n    KORTESTB K2, K1\n    JNC testDescending7\n    ADDQ $7, SI\n    CMPQ SI, AX\n    JNE testAscending7\n    VZEROUPPER\n    JMP testAscending\ntestDescending7:\n    VMOVDQU64 (R8)(DI*8), Z0\n    VMOVDQU64 Z2, Z1\n    VPERMI2Q Z0, Z0, Z1\n    VPCMPUQ $5, Z1, Z0, K1\n    KORTESTB K2, K1\n    JNC undefined7\n    ADDQ $7, DI\n    CMPQ DI, AX\n    JNE testDescending7\n    VZEROUPPER\n    JMP testDescending\n\ntest:\n    DECQ R9\ntestAscending:\n    CMPQ SI, R9\n    JAE ascending\n    MOVQ (R8)(SI*8), BX\n    MOVQ 8(R8)(SI*8), DX\n    INCQ SI\n    CMPQ BX, DX\n    JBE testAscending\n    JMP testDescending\nascending:\n    MOVQ $ASCENDING, ret+24(FP)\n    RET\ntestDescending:\n    CMPQ DI, R9\n    JAE descending\n    MOVQ (R8)(DI*8), BX\n    MOVQ 8(R8)(DI*8), DX\n    INCQ DI\n    CMPQ BX, DX\n    JAE testDescending\n    JMP undefined\ndescending:\n    MOVQ $DESCENDING, ret+24(FP)\n    RET\nundefined7:\n    VZEROUPPER\nundefined:\n    MOVQ $UNDEFINED, ret+24(FP)\n    RET\n\n// func orderOfFloat32(data []float32) int\nTEXT ·orderOfFloat32(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), R8\n    MOVQ data_len+8(FP), R9\n    XORQ SI, SI\n    XORQ DI, DI\n\n    CMPQ R9, $2\n    JB undefined\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    CMPQ R9, $16\n    JB test\n\n    XORQ DX, DX\n    MOVQ R9, AX\n    SHRQ $4, AX\n    SHLQ $4, AX\n    MOVQ $15, CX\n    IDIVQ CX\n    IMULQ $15, AX\n    DECQ R9\n\n    VMOVDQU32 shift1x32<>(SB), Z2\n    KXORW K2, K2, K2\ntestAscending15:\n    VMOVDQU32 (R8)(SI*4), Z0\n    VMOVDQU32 Z2, Z1\n    VPERMI2D Z0, Z0, Z1\n    VCMPPS $2, Z1, Z0, K1\n    KORTESTW K2, K1\n    JNC testDescending15\n    ADDQ 
$15, SI\n    CMPQ SI, AX\n    JNE testAscending15\n    VZEROUPPER\n    JMP testAscending\ntestDescending15:\n    VMOVDQU32 (R8)(DI*4), Z0\n    VMOVDQU32 Z2, Z1\n    VPERMI2D Z0, Z0, Z1\n    VCMPPS $5, Z1, Z0, K1\n    KORTESTW K2, K1\n    JNC undefined15\n    ADDQ $15, DI\n    CMPQ DI, AX\n    JNE testDescending15\n    VZEROUPPER\n    JMP testDescending\n\ntest:\n    DECQ R9\ntestAscending:\n    CMPQ SI, R9\n    JAE ascending\n    MOVLQZX (R8)(SI*4), BX\n    MOVLQZX 4(R8)(SI*4), DX\n    INCQ SI\n    MOVQ BX, X0\n    MOVQ DX, X1\n    UCOMISS X1, X0\n    JBE testAscending\n    JMP testDescending\nascending:\n    MOVQ $ASCENDING, ret+24(FP)\n    RET\ntestDescending:\n    CMPQ DI, R9\n    JAE descending\n    MOVLQZX (R8)(DI*4), BX\n    MOVLQZX 4(R8)(DI*4), DX\n    INCQ DI\n    MOVQ BX, X0\n    MOVQ DX, X1\n    UCOMISS X1, X0\n    JAE testDescending\n    JMP undefined\ndescending:\n    MOVQ $DESCENDING, ret+24(FP)\n    RET\nundefined15:\n    VZEROUPPER\nundefined:\n    MOVQ $UNDEFINED, ret+24(FP)\n    RET\n\n// func orderOfFloat64(data []uint64) int\nTEXT ·orderOfFloat64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), R8\n    MOVQ data_len+8(FP), R9\n    XORQ SI, SI\n    XORQ DI, DI\n\n    CMPQ R9, $2\n    JB undefined\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE test\n\n    CMPQ R9, $8\n    JB test\n\n    XORQ DX, DX\n    MOVQ R9, AX\n    SHRQ $3, AX\n    SHLQ $3, AX\n    MOVQ $7, CX\n    IDIVQ CX\n    IMULQ $7, AX\n    DECQ R9\n\n    VMOVDQU64 shift1x64<>(SB), Z2\n    KXORB K2, K2, K2\ntestAscending7:\n    VMOVDQU64 (R8)(SI*8), Z0\n    VMOVDQU64 Z2, Z1\n    VPERMI2Q Z0, Z0, Z1\n    VCMPPD $2, Z1, Z0, K1\n    KORTESTB K2, K1\n    JNC testDescending7\n    ADDQ $7, SI\n    CMPQ SI, AX\n    JNE testAscending7\n    VZEROUPPER\n    JMP testAscending\ntestDescending7:\n    VMOVDQU64 (R8)(DI*8), Z0\n    VMOVDQU64 Z2, Z1\n    VPERMI2Q Z0, Z0, Z1\n    VCMPPD $5, Z1, Z0, K1\n    KORTESTB K2, K1\n    JNC undefined7\n    ADDQ $7, DI\n    CMPQ DI, AX\n    JNE testDescending7\n    
VZEROUPPER\n    JMP testDescending\n\ntest:\n    DECQ R9\ntestAscending:\n    CMPQ SI, R9\n    JAE ascending\n    MOVQ (R8)(SI*8), BX\n    MOVQ 8(R8)(SI*8), DX\n    INCQ SI\n    MOVQ BX, X0\n    MOVQ DX, X1\n    UCOMISD X1, X0\n    JBE testAscending\n    JMP testDescending\nascending:\n    MOVQ $ASCENDING, ret+24(FP)\n    RET\ntestDescending:\n    CMPQ DI, R9\n    JAE descending\n    MOVQ (R8)(DI*8), BX\n    MOVQ 8(R8)(DI*8), DX\n    INCQ DI\n    MOVQ BX, X0\n    MOVQ DX, X1\n    UCOMISD X1, X0\n    JAE testDescending\n    JMP undefined\ndescending:\n    MOVQ $DESCENDING, ret+24(FP)\n    RET\nundefined7:\n    VZEROUPPER\nundefined:\n    MOVQ $UNDEFINED, ret+24(FP)\n    RET\n"
  },
  {
    "path": "order_purego.go",
    "content": "//go:build purego || !amd64\n\npackage parquet\n\n// -----------------------------------------------------------------------------\n// TODO: use generics versions of the these functions to reduce the amount of\n// code to maintain when we drop compatilibty with Go version older than 1.18.\n// -----------------------------------------------------------------------------\n\nfunc orderOfInt32(data []int32) int {\n\tif len(data) > 1 {\n\t\tif int32AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif int32AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc orderOfInt64(data []int64) int {\n\tif len(data) > 1 {\n\t\tif int64AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif int64AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc orderOfUint32(data []uint32) int {\n\tif len(data) > 1 {\n\t\tif uint32AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif uint32AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc orderOfUint64(data []uint64) int {\n\tif len(data) > 1 {\n\t\tif uint64AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif uint64AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc orderOfFloat32(data []float32) int {\n\tif len(data) > 1 {\n\t\tif float32AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif float32AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc orderOfFloat64(data []float64) int {\n\tif len(data) > 1 {\n\t\tif float64AreInAscendingOrder(data) {\n\t\t\treturn +1\n\t\t}\n\t\tif float64AreInDescendingOrder(data) {\n\t\t\treturn -1\n\t\t}\n\t}\n\treturn 0\n}\n\nfunc int32AreInAscendingOrder(data []int32) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] > data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc int32AreInDescendingOrder(data []int32) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] < data[i] 
{\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc int64AreInAscendingOrder(data []int64) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] > data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc int64AreInDescendingOrder(data []int64) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] < data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc uint32AreInAscendingOrder(data []uint32) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] > data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc uint32AreInDescendingOrder(data []uint32) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] < data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc uint64AreInAscendingOrder(data []uint64) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] > data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc uint64AreInDescendingOrder(data []uint64) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] < data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc float32AreInAscendingOrder(data []float32) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] > data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc float32AreInDescendingOrder(data []float32) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] < data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc float64AreInAscendingOrder(data []float64) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] > data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc float64AreInDescendingOrder(data []float64) bool {\n\tfor i := len(data) - 1; i > 0; i-- {\n\t\tif data[i-1] < data[i] {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n"
  },
  {
    "path": "order_test.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\ntype boolOrder []bool\n\nfunc (v boolOrder) Len() int           { return len(v) }\nfunc (v boolOrder) Less(i, j int) bool { return !v[i] && v[j] }\nfunc (v boolOrder) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype int32Order []int32\n\nfunc (v int32Order) Len() int           { return len(v) }\nfunc (v int32Order) Less(i, j int) bool { return v[i] < v[j] }\nfunc (v int32Order) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype int64Order []int64\n\nfunc (v int64Order) Len() int           { return len(v) }\nfunc (v int64Order) Less(i, j int) bool { return v[i] < v[j] }\nfunc (v int64Order) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype uint32Order []uint32\n\nfunc (v uint32Order) Len() int           { return len(v) }\nfunc (v uint32Order) Less(i, j int) bool { return v[i] < v[j] }\nfunc (v uint32Order) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype uint64Order []uint64\n\nfunc (v uint64Order) Len() int           { return len(v) }\nfunc (v uint64Order) Less(i, j int) bool { return v[i] < v[j] }\nfunc (v uint64Order) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype float32Order []float32\n\nfunc (v float32Order) Len() int           { return len(v) }\nfunc (v float32Order) Less(i, j int) bool { return v[i] < v[j] }\nfunc (v float32Order) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype float64Order []float64\n\nfunc (v float64Order) Len() int           { return len(v) }\nfunc (v float64Order) Less(i, j int) bool { return v[i] < v[j] }\nfunc (v float64Order) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\ntype bytesOrder [][]byte\n\nfunc (v bytesOrder) Len() int           { return len(v) }\nfunc (v bytesOrder) Less(i, j int) bool { return bytes.Compare(v[i], v[j]) < 0 }\nfunc (v bytesOrder) Swap(i, j int)      { v[i], v[j] = v[j], v[i] }\n\nfunc orderingName(ordering int) string 
{\n\tswitch {\n\tcase isAscending(ordering):\n\t\treturn \"ascending\"\n\tcase isDescending(ordering):\n\t\treturn \"descending\"\n\tdefault:\n\t\treturn \"undefined\"\n\t}\n}\n\nfunc isAscending(ordering int) bool {\n\treturn ordering > 0\n}\n\nfunc isDescending(ordering int) bool {\n\treturn ordering < 0\n}\n\nfunc isUndefined(ordering int) bool {\n\treturn ordering == 0\n}\n\nfunc isOrdered(set sort.Interface) bool {\n\treturn set.Len() > 1 && sort.IsSorted(set)\n}\n\nfunc checkOrdering(t *testing.T, set sort.Interface, ordering int) bool {\n\tt.Helper()\n\tswitch {\n\tcase isOrdered(set):\n\t\tif !isAscending(ordering) {\n\t\t\tt.Errorf(\"got=%s want=ascending\", orderingName(ordering))\n\t\t\treturn false\n\t\t}\n\tcase isOrdered(sort.Reverse(set)):\n\t\tif !isDescending(ordering) {\n\t\t\tt.Errorf(\"got=%s want=descending\", orderingName(ordering))\n\t\t\treturn false\n\t\t}\n\tdefault:\n\t\tif !isUndefined(ordering) {\n\t\t\tt.Errorf(\"got=%s want=undefined\", orderingName(ordering))\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc TestOrderOfBool(t *testing.T) {\n\tcheck := func(values []bool) bool {\n\t\treturn checkOrdering(t, boolOrder(values), orderOfBool(values))\n\t}\n\terr := quick.Check(func(values []bool) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(boolOrder(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(boolOrder(values)))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestOrderOfInt32(t *testing.T) {\n\tcheck := func(values []int32) bool {\n\t\treturn checkOrdering(t, int32Order(values), orderOfInt32(values))\n\t}\n\terr := quick.Check(func(values []int32) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(int32Order(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(int32Order(values)))\n\t\tif !check(values) {\n\t\t\treturn 
false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\n\t// This extra test validates that out-of-order values at 64 byte boundaries\n\t// are properly detected; it tests corner cases of the vectorized code path\n\t// which works on 64 bytes per loop iteration.\n\tvalues := []int32{\n\t\t0, 1, 2, 3, 4, 5, 6, 7,\n\t\t8, 9, 10, 11, 12, 13, 14, 15,\n\t\t// 15 > 14, the algorithm must detect that the values are not ordered.\n\t\t14, 17, 18, 19, 20, 21, 22, 23,\n\t\t24, 25, 26, 27, 28, 29, 30, 31,\n\t}\n\n\tif !check(values) {\n\t\tt.Error(\"failed due to not checking the connection between sequences of 16 elements\")\n\t}\n}\n\nfunc TestOrderOfInt64(t *testing.T) {\n\tcheck := func(values []int64) bool {\n\t\treturn checkOrdering(t, int64Order(values), orderOfInt64(values))\n\t}\n\terr := quick.Check(func(values []int64) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(int64Order(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(int64Order(values)))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\n\tvalues := []int64{\n\t\t0, 1, 2, 3, 4, 5, 6, 7,\n\t\t6, 9, 10, 11, 12, 13, 14, 15,\n\t\t14, 17, 18, 19, 20, 21, 22, 23,\n\t\t24, 25, 26, 27, 28, 29, 30, 31,\n\t}\n\n\tif !check(values) {\n\t\tt.Error(\"failed due to not checking the connection between sequences of 8 elements\")\n\t}\n}\n\nfunc TestOrderOfUint32(t *testing.T) {\n\tcheck := func(values []uint32) bool {\n\t\treturn checkOrdering(t, uint32Order(values), orderOfUint32(values))\n\t}\n\terr := quick.Check(func(values []uint32) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(uint32Order(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(uint32Order(values)))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\n\tvalues := 
[]uint32{\n\t\t0, 1, 2, 3, 4, 5, 6, 7,\n\t\t8, 9, 10, 11, 12, 13, 14, 15,\n\t\t14, 17, 18, 19, 20, 21, 22, 23,\n\t\t24, 25, 26, 27, 28, 29, 30, 31,\n\t}\n\n\tif !check(values) {\n\t\tt.Error(\"failed due to not checking the connection between sequences of 16 elements\")\n\t}\n}\n\nfunc TestOrderOfUint64(t *testing.T) {\n\tcheck := func(values []uint64) bool {\n\t\treturn checkOrdering(t, uint64Order(values), orderOfUint64(values))\n\t}\n\terr := quick.Check(func(values []uint64) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(uint64Order(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(uint64Order(values)))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\n\tvalues := []uint64{\n\t\t0, 1, 2, 3, 4, 5, 6, 7,\n\t\t6, 9, 10, 11, 12, 13, 14, 15,\n\t\t14, 17, 18, 19, 20, 21, 22, 23,\n\t\t24, 25, 26, 27, 28, 29, 30, 31,\n\t}\n\n\tif !check(values) {\n\t\tt.Error(\"failed due to not checking the connection between sequences of 8 elements\")\n\t}\n}\n\nfunc TestOrderOfFloat32(t *testing.T) {\n\tcheck := func(values []float32) bool {\n\t\treturn checkOrdering(t, float32Order(values), orderOfFloat32(values))\n\t}\n\terr := quick.Check(func(values []float32) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(float32Order(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(float32Order(values)))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\n\tvalues := []float32{\n\t\t0, 1, 2, 3, 4, 5, 6, 7,\n\t\t8, 9, 10, 11, 12, 13, 14, 15,\n\t\t14, 17, 18, 19, 20, 21, 22, 23,\n\t\t24, 25, 26, 27, 28, 29, 30, 31,\n\t}\n\n\tif !check(values) {\n\t\tt.Error(\"failed due to not checking the connection between sequences of 16 elements\")\n\t}\n}\n\nfunc TestOrderOfFloat64(t *testing.T) {\n\tcheck := func(values []float64) bool 
{\n\t\treturn checkOrdering(t, float64Order(values), orderOfFloat64(values))\n\t}\n\terr := quick.Check(func(values []float64) bool {\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(float64Order(values))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(float64Order(values)))\n\t\tif !check(values) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n\n\tvalues := []float64{\n\t\t0, 1, 2, 3, 4, 5, 6, 7,\n\t\t6, 9, 10, 11, 12, 13, 14, 15,\n\t\t14, 17, 18, 19, 20, 21, 22, 23,\n\t\t24, 25, 26, 27, 28, 29, 30, 31,\n\t}\n\n\tif !check(values) {\n\t\tt.Error(\"failed due to not checking the connection between sequences of 8 elements\")\n\t}\n}\n\nfunc TestOrderOfBytes(t *testing.T) {\n\tcheck := func(values [][]byte) bool {\n\t\treturn checkOrdering(t, bytesOrder(values), orderOfBytes(values))\n\t}\n\terr := quick.Check(func(values [][16]byte) bool {\n\t\tslices := make([][]byte, len(values))\n\t\tfor i := range values {\n\t\t\tslices[i] = values[i][:]\n\t\t}\n\t\tif !check(slices) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(bytesOrder(slices))\n\t\tif !check(slices) {\n\t\t\treturn false\n\t\t}\n\t\tsort.Sort(sort.Reverse(bytesOrder(slices)))\n\t\tif !check(slices) {\n\t\t\treturn false\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc BenchmarkOrderOfBool(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]bool, bufferSize/1)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfBool(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfInt32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int32, bufferSize/4)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfInt32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfInt64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int64, 
bufferSize/8)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfInt64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfUint32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint32, bufferSize/4)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfUint32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfUint64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint64, bufferSize/8)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfUint64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfFloat32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float32, bufferSize/4)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfFloat32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfFloat64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float64, bufferSize/8)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfFloat64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkOrderOfBytes(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tdata := make([]byte, bufferSize)\n\t\tvalues := make([][]byte, len(data)/16)\n\t\tfor i := range values {\n\t\t\tvalues[i] = data[i*16 : (i+1)*16]\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\torderOfBytes(values)\n\t\t}\n\t})\n}\n"
  },
  {
    "path": "page.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"io\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/internal/bitpack\"\n\t\"github.com/segmentio/parquet-go/internal/debug\"\n)\n\n// Page values represent sequences of parquet values. From the Parquet\n// documentation: \"Column chunks are a chunk of the data for a particular\n// column. They live in a particular row group and are guaranteed to be\n// contiguous in the file. Column chunks are divided up into pages. A page is\n// conceptually an indivisible unit (in terms of compression and encoding).\n// There can be multiple page types which are interleaved in a column chunk.\"\n//\n// https://github.com/apache/parquet-format#glossary\ntype Page interface {\n\t// Returns the type of values read from this page.\n\t//\n\t// The returned type can be used to encode the page data, in the case of\n\t// an indexed page (which has a dictionary), the type is configured to\n\t// encode the indexes stored in the page rather than the plain values.\n\tType() Type\n\n\t// Returns the column index that this page belongs to.\n\tColumn() int\n\n\t// If the page contains indexed values, calling this method returns the\n\t// dictionary in which the values are looked up. Otherwise, the method\n\t// returns nil.\n\tDictionary() Dictionary\n\n\t// Returns the number of rows, values, and nulls in the page. The number of\n\t// rows may be less than the number of values in the page if the page is\n\t// part of a repeated column.\n\tNumRows() int64\n\tNumValues() int64\n\tNumNulls() int64\n\n\t// Returns the page's min and max values.\n\t//\n\t// The third value is a boolean indicating whether the page bounds were\n\t// available. 
Page bounds may not be known if the page contained no values\n\t// or only nulls, or if they were read from a parquet file which had neither\n\t// page statistics nor a page index.\n\tBounds() (min, max Value, ok bool)\n\n\t// Returns the size of the page in bytes (uncompressed).\n\tSize() int64\n\n\t// Returns a reader exposing the values contained in the page.\n\t//\n\t// Depending on the underlying implementation, the returned reader may\n\t// support reading an array of typed Go values by implementing interfaces\n\t// like parquet.Int32Reader. Applications should use type assertions on\n\t// the returned reader to determine whether those optimizations are\n\t// available.\n\tValues() ValueReader\n\n\t// Returns a new page which is as slice of the receiver between row indexes\n\t// i and j.\n\tSlice(i, j int64) Page\n\n\t// Expose the lists of repetition and definition levels of the page.\n\t//\n\t// The returned slices may be empty when the page has no repetition or\n\t// definition levels.\n\tRepetitionLevels() []byte\n\tDefinitionLevels() []byte\n\n\t// Returns the in-memory buffer holding the page values.\n\t//\n\t// The intent is for the returned value to be used as input parameter when\n\t// calling the Encode method of the associated Type.\n\t//\n\t// The slices referenced by the encoding.Values may be the same across\n\t// multiple calls to this method, applications must treat the content as\n\t// immutable.\n\tData() encoding.Values\n}\n\n// PageReader is an interface implemented by types that support producing a\n// sequence of pages.\ntype PageReader interface {\n\t// Reads and returns the next page from the sequence. 
When all pages have\n\t// been read, or if the sequence was closed, the method returns io.EOF.\n\tReadPage() (Page, error)\n}\n\n// PageWriter is an interface implemented by types that support writing pages\n// to an underlying storage medium.\ntype PageWriter interface {\n\tWritePage(Page) (int64, error)\n}\n\n// Pages is an interface implemented by page readers returned by calling the\n// Pages method of ColumnChunk instances.\ntype Pages interface {\n\tPageReader\n\tRowSeeker\n\tio.Closer\n}\n\n// AsyncPages wraps the given Pages instance to perform page reads\n// asynchronously in a separate goroutine.\n//\n// Performing page reads asynchronously is important when the application may\n// be reading pages from a high latency backend, and the last\n// page read may be processed while initiating reading of the next page.\nfunc AsyncPages(pages Pages) Pages {\n\tp := new(asyncPages)\n\tp.init(pages, nil)\n\t// If the pages object gets garbage collected without Close being called,\n\t// this finalizer would ensure that the goroutine is stopped and doesn't\n\t// leak.\n\tdebug.SetFinalizer(p, func(p *asyncPages) { p.Close() })\n\treturn p\n}\n\ntype asyncPages struct {\n\tread    <-chan asyncPage\n\tseek    chan<- int64\n\tdone    chan<- struct{}\n\tversion int64\n}\n\ntype asyncPage struct {\n\tpage    Page\n\terr     error\n\tversion int64\n}\n\nfunc (pages *asyncPages) init(base Pages, done chan struct{}) {\n\tread := make(chan asyncPage)\n\tseek := make(chan int64, 1)\n\n\tpages.read = read\n\tpages.seek = seek\n\n\tif done == nil {\n\t\tdone = make(chan struct{})\n\t\tpages.done = done\n\t}\n\n\tgo readPages(base, read, seek, done)\n}\n\nfunc (pages *asyncPages) Close() (err error) {\n\tif pages.done != nil {\n\t\tclose(pages.done)\n\t\tpages.done = nil\n\t}\n\tfor p := range pages.read {\n\t\t// Capture the last error, which is the value returned from closing the\n\t\t// underlying Pages instance.\n\t\terr = p.err\n\t}\n\tpages.seek = nil\n\treturn 
err\n}\n\nfunc (pages *asyncPages) ReadPage() (Page, error) {\n\tfor {\n\t\tp, ok := <-pages.read\n\t\tif !ok {\n\t\t\treturn nil, io.EOF\n\t\t}\n\t\t// Because calls to SeekToRow might be made concurrently to reading\n\t\t// pages, it is possible for ReadPage to see pages that were read before\n\t\t// the last SeekToRow call.\n\t\t//\n\t\t// A version number is attached to each page read asynchronously to\n\t\t// discard outdated pages and ensure that we maintain a consistent view\n\t\t// of the sequence of pages read.\n\t\tif p.version == pages.version {\n\t\t\treturn p.page, p.err\n\t\t}\n\t}\n}\n\nfunc (pages *asyncPages) SeekToRow(rowIndex int64) error {\n\tif pages.seek == nil {\n\t\treturn io.ErrClosedPipe\n\t}\n\t// The seek channel has a capacity of 1 to allow the first SeekToRow call to\n\t// be non-blocking.\n\t//\n\t// If SeekToRow calls are performed faster than they can be handled by the\n\t// goroutine reading pages, this path might become a contention point.\n\tpages.seek <- rowIndex\n\tpages.version++\n\treturn nil\n}\n\nfunc readPages(pages Pages, read chan<- asyncPage, seek <-chan int64, done <-chan struct{}) {\n\tdefer func() {\n\t\tread <- asyncPage{err: pages.Close(), version: -1}\n\t\tclose(read)\n\t}()\n\n\tversion := int64(0)\n\tfor {\n\t\tpage, err := pages.ReadPage()\n\n\t\tfor {\n\t\t\tselect {\n\t\t\tcase <-done:\n\t\t\t\treturn\n\t\t\tcase read <- asyncPage{\n\t\t\t\tpage:    page,\n\t\t\t\terr:     err,\n\t\t\t\tversion: version,\n\t\t\t}:\n\t\t\tcase rowIndex := <-seek:\n\t\t\t\tversion++\n\t\t\t\terr = pages.SeekToRow(rowIndex)\n\t\t\t}\n\t\t\tif err == nil {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n}\n\ntype singlePage struct {\n\tpage    Page\n\tseek    int64\n\tnumRows int64\n}\n\nfunc (r *singlePage) ReadPage() (Page, error) {\n\tif r.page != nil {\n\t\tif r.seek < r.numRows {\n\t\t\tseek := r.seek\n\t\t\tr.seek = r.numRows\n\t\t\tif seek > 0 {\n\t\t\t\treturn r.page.Slice(seek, r.numRows), nil\n\t\t\t}\n\t\t\treturn r.page, 
nil\n\t\t}\n\t}\n\treturn nil, io.EOF\n}\n\nfunc (r *singlePage) SeekToRow(rowIndex int64) error {\n\tr.seek = rowIndex\n\treturn nil\n}\n\nfunc (r *singlePage) Close() error {\n\tr.page = nil\n\tr.seek = 0\n\treturn nil\n}\n\nfunc onePage(page Page) Pages {\n\treturn &singlePage{page: page, numRows: page.NumRows()}\n}\n\n// CopyPages copies pages from src to dst, returning the number of values that\n// were copied.\n//\n// The function returns any error it encounters reading or writing pages, except\n// for io.EOF from the reader which indicates that there were no more pages to\n// read.\nfunc CopyPages(dst PageWriter, src PageReader) (numValues int64, err error) {\n\tfor {\n\t\tp, err := src.ReadPage()\n\t\tif err != nil {\n\t\t\tif err == io.EOF {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn numValues, err\n\t\t}\n\t\tn, err := dst.WritePage(p)\n\t\tnumValues += n\n\t\tif err != nil {\n\t\t\treturn numValues, err\n\t\t}\n\t}\n}\n\n// errorPage is an implementation of the Page interface which always errors when\n// attempting to read its values.\n//\n// The error page declares that it contains one value (even if it does not)\n// as a way to ensure that it is not ignored due to being empty when written\n// to a file.\ntype errorPage struct {\n\ttyp         Type\n\terr         error\n\tcolumnIndex int\n}\n\nfunc newErrorPage(typ Type, columnIndex int, msg string, args ...interface{}) *errorPage {\n\treturn &errorPage{\n\t\ttyp:         typ,\n\t\terr:         fmt.Errorf(msg, args...),\n\t\tcolumnIndex: columnIndex,\n\t}\n}\n\nfunc (page *errorPage) Type() Type                        { return page.typ }\nfunc (page *errorPage) Column() int                       { return page.columnIndex }\nfunc (page *errorPage) Dictionary() Dictionary            { return nil }\nfunc (page *errorPage) NumRows() int64                    { return 1 }\nfunc (page *errorPage) NumValues() int64                  { return 1 }\nfunc (page *errorPage) NumNulls() int64                   { return 
0 }\nfunc (page *errorPage) Bounds() (min, max Value, ok bool) { return }\nfunc (page *errorPage) Slice(i, j int64) Page             { return page }\nfunc (page *errorPage) Size() int64                       { return 1 }\nfunc (page *errorPage) RepetitionLevels() []byte          { return nil }\nfunc (page *errorPage) DefinitionLevels() []byte          { return nil }\nfunc (page *errorPage) Data() encoding.Values             { return encoding.Values{} }\nfunc (page *errorPage) Values() ValueReader               { return errorPageValues{page: page} }\n\ntype errorPageValues struct{ page *errorPage }\n\nfunc (r errorPageValues) ReadValues([]Value) (int, error) { return 0, r.page.err }\nfunc (r errorPageValues) Close() error                    { return nil }\n\nfunc errPageBoundsOutOfRange(i, j, n int64) error {\n\treturn fmt.Errorf(\"page bounds out of range [%d:%d]: with length %d\", i, j, n)\n}\n\ntype optionalPage struct {\n\tbase               Page\n\tmaxDefinitionLevel byte\n\tdefinitionLevels   []byte\n}\n\nfunc newOptionalPage(base Page, maxDefinitionLevel byte, definitionLevels []byte) *optionalPage {\n\treturn &optionalPage{\n\t\tbase:               base,\n\t\tmaxDefinitionLevel: maxDefinitionLevel,\n\t\tdefinitionLevels:   definitionLevels,\n\t}\n}\n\nfunc (page *optionalPage) Type() Type { return page.base.Type() }\n\nfunc (page *optionalPage) Column() int { return page.base.Column() }\n\nfunc (page *optionalPage) Dictionary() Dictionary { return page.base.Dictionary() }\n\nfunc (page *optionalPage) NumRows() int64 { return int64(len(page.definitionLevels)) }\n\nfunc (page *optionalPage) NumValues() int64 { return int64(len(page.definitionLevels)) }\n\nfunc (page *optionalPage) NumNulls() int64 {\n\treturn int64(countLevelsNotEqual(page.definitionLevels, page.maxDefinitionLevel))\n}\n\nfunc (page *optionalPage) Bounds() (min, max Value, ok bool) { return page.base.Bounds() }\n\nfunc (page *optionalPage) Size() int64 { return 
int64(len(page.definitionLevels)) + page.base.Size() }\n\nfunc (page *optionalPage) RepetitionLevels() []byte { return nil }\n\nfunc (page *optionalPage) DefinitionLevels() []byte { return page.definitionLevels }\n\nfunc (page *optionalPage) Data() encoding.Values { return page.base.Data() }\n\nfunc (page *optionalPage) Values() ValueReader {\n\treturn &optionalPageValues{\n\t\tpage:   page,\n\t\tvalues: page.base.Values(),\n\t}\n}\n\nfunc (page *optionalPage) Slice(i, j int64) Page {\n\tmaxDefinitionLevel := page.maxDefinitionLevel\n\tdefinitionLevels := page.definitionLevels\n\tnumNulls1 := int64(countLevelsNotEqual(definitionLevels[:i], maxDefinitionLevel))\n\tnumNulls2 := int64(countLevelsNotEqual(definitionLevels[i:j], maxDefinitionLevel))\n\treturn newOptionalPage(\n\t\tpage.base.Slice(i-numNulls1, j-(numNulls1+numNulls2)),\n\t\tmaxDefinitionLevel,\n\t\tdefinitionLevels[i:j:j],\n\t)\n}\n\ntype repeatedPage struct {\n\tbase               Page\n\tmaxRepetitionLevel byte\n\tmaxDefinitionLevel byte\n\tdefinitionLevels   []byte\n\trepetitionLevels   []byte\n}\n\nfunc newRepeatedPage(base Page, maxRepetitionLevel, maxDefinitionLevel byte, repetitionLevels, definitionLevels []byte) *repeatedPage {\n\treturn &repeatedPage{\n\t\tbase:               base,\n\t\tmaxRepetitionLevel: maxRepetitionLevel,\n\t\tmaxDefinitionLevel: maxDefinitionLevel,\n\t\tdefinitionLevels:   definitionLevels,\n\t\trepetitionLevels:   repetitionLevels,\n\t}\n}\n\nfunc (page *repeatedPage) Type() Type { return page.base.Type() }\n\nfunc (page *repeatedPage) Column() int { return page.base.Column() }\n\nfunc (page *repeatedPage) Dictionary() Dictionary { return page.base.Dictionary() }\n\nfunc (page *repeatedPage) NumRows() int64 { return int64(countLevelsEqual(page.repetitionLevels, 0)) }\n\nfunc (page *repeatedPage) NumValues() int64 { return int64(len(page.definitionLevels)) }\n\nfunc (page *repeatedPage) NumNulls() int64 {\n\treturn int64(countLevelsNotEqual(page.definitionLevels, 
page.maxDefinitionLevel))\n}\n\nfunc (page *repeatedPage) Bounds() (min, max Value, ok bool) { return page.base.Bounds() }\n\nfunc (page *repeatedPage) Size() int64 {\n\treturn int64(len(page.repetitionLevels)) + int64(len(page.definitionLevels)) + page.base.Size()\n}\n\nfunc (page *repeatedPage) RepetitionLevels() []byte { return page.repetitionLevels }\n\nfunc (page *repeatedPage) DefinitionLevels() []byte { return page.definitionLevels }\n\nfunc (page *repeatedPage) Data() encoding.Values { return page.base.Data() }\n\nfunc (page *repeatedPage) Values() ValueReader {\n\treturn &repeatedPageValues{\n\t\tpage:   page,\n\t\tvalues: page.base.Values(),\n\t}\n}\n\nfunc (page *repeatedPage) Slice(i, j int64) Page {\n\tnumRows := page.NumRows()\n\tif i < 0 || i > numRows {\n\t\tpanic(errPageBoundsOutOfRange(i, j, numRows))\n\t}\n\tif j < 0 || j > numRows {\n\t\tpanic(errPageBoundsOutOfRange(i, j, numRows))\n\t}\n\tif i > j {\n\t\tpanic(errPageBoundsOutOfRange(i, j, numRows))\n\t}\n\n\tmaxRepetitionLevel := page.maxRepetitionLevel\n\tmaxDefinitionLevel := page.maxDefinitionLevel\n\trepetitionLevels := page.repetitionLevels\n\tdefinitionLevels := page.definitionLevels\n\n\trowIndex0 := 0\n\trowIndex1 := len(repetitionLevels)\n\trowIndex2 := len(repetitionLevels)\n\n\tfor k, def := range repetitionLevels {\n\t\tif def == 0 {\n\t\t\tif rowIndex0 == int(i) {\n\t\t\t\trowIndex1 = k\n\t\t\t\tbreak\n\t\t\t}\n\t\t\trowIndex0++\n\t\t}\n\t}\n\n\tfor k, def := range repetitionLevels[rowIndex1:] {\n\t\tif def == 0 {\n\t\t\tif rowIndex0 == int(j) {\n\t\t\t\trowIndex2 = rowIndex1 + k\n\t\t\t\tbreak\n\t\t\t}\n\t\t\trowIndex0++\n\t\t}\n\t}\n\n\tnumNulls1 := countLevelsNotEqual(definitionLevels[:rowIndex1], maxDefinitionLevel)\n\tnumNulls2 := countLevelsNotEqual(definitionLevels[rowIndex1:rowIndex2], maxDefinitionLevel)\n\n\ti = int64(rowIndex1 - numNulls1)\n\tj = int64(rowIndex2 - (numNulls1 + numNulls2))\n\n\treturn newRepeatedPage(\n\t\tpage.base.Slice(i, 
j),\n\t\tmaxRepetitionLevel,\n\t\tmaxDefinitionLevel,\n\t\trepetitionLevels[rowIndex1:rowIndex2:rowIndex2],\n\t\tdefinitionLevels[rowIndex1:rowIndex2:rowIndex2],\n\t)\n}\n\ntype booleanPage struct {\n\ttyp         Type\n\tbits        []byte\n\toffset      int32\n\tnumValues   int32\n\tcolumnIndex int16\n}\n\nfunc newBooleanPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *booleanPage {\n\treturn &booleanPage{\n\t\ttyp:         typ,\n\t\tbits:        values.Boolean()[:bitpack.ByteCount(uint(numValues))],\n\t\tnumValues:   numValues,\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *booleanPage) Type() Type { return page.typ }\n\nfunc (page *booleanPage) Column() int { return int(^page.columnIndex) }\n\nfunc (page *booleanPage) Dictionary() Dictionary { return nil }\n\nfunc (page *booleanPage) NumRows() int64 { return int64(page.numValues) }\n\nfunc (page *booleanPage) NumValues() int64 { return int64(page.numValues) }\n\nfunc (page *booleanPage) NumNulls() int64 { return 0 }\n\nfunc (page *booleanPage) Size() int64 { return int64(len(page.bits)) }\n\nfunc (page *booleanPage) RepetitionLevels() []byte { return nil }\n\nfunc (page *booleanPage) DefinitionLevels() []byte { return nil }\n\nfunc (page *booleanPage) Data() encoding.Values { return encoding.BooleanValues(page.bits) }\n\nfunc (page *booleanPage) Values() ValueReader { return &booleanPageValues{page: page} }\n\nfunc (page *booleanPage) valueAt(i int) bool {\n\tj := uint32(int(page.offset)+i) / 8\n\tk := uint32(int(page.offset)+i) % 8\n\treturn ((page.bits[j] >> k) & 1) != 0\n}\n\nfunc (page *booleanPage) min() bool {\n\tfor i := 0; i < int(page.numValues); i++ {\n\t\tif !page.valueAt(i) {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn page.numValues > 0\n}\n\nfunc (page *booleanPage) max() bool {\n\tfor i := 0; i < int(page.numValues); i++ {\n\t\tif page.valueAt(i) {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc (page *booleanPage) bounds() (min, max bool) {\n\thasFalse, 
hasTrue := false, false\n\n\tfor i := 0; i < int(page.numValues); i++ {\n\t\tv := page.valueAt(i)\n\t\tif v {\n\t\t\thasTrue = true\n\t\t} else {\n\t\t\thasFalse = true\n\t\t}\n\t\tif hasTrue && hasFalse {\n\t\t\tbreak\n\t\t}\n\t}\n\n\tmin = !hasFalse\n\tmax = hasTrue\n\treturn min, max\n}\n\nfunc (page *booleanPage) Bounds() (min, max Value, ok bool) {\n\tif ok = page.numValues > 0; ok {\n\t\tminBool, maxBool := page.bounds()\n\t\tmin = page.makeValue(minBool)\n\t\tmax = page.makeValue(maxBool)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *booleanPage) Slice(i, j int64) Page {\n\toff := i / 8\n\tend := j / 8\n\n\tif (j % 8) != 0 {\n\t\tend++\n\t}\n\n\treturn &booleanPage{\n\t\ttyp:         page.typ,\n\t\tbits:        page.bits[off:end],\n\t\toffset:      int32(i % 8),\n\t\tnumValues:   int32(j - i),\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *booleanPage) makeValue(v bool) Value {\n\tvalue := makeValueBoolean(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype int32Page struct {\n\ttyp         Type\n\tvalues      []int32\n\tcolumnIndex int16\n}\n\nfunc newInt32Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int32Page {\n\treturn &int32Page{\n\t\ttyp:         typ,\n\t\tvalues:      values.Int32()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *int32Page) Type() Type { return page.typ }\n\nfunc (page *int32Page) Column() int { return int(^page.columnIndex) }\n\nfunc (page *int32Page) Dictionary() Dictionary { return nil }\n\nfunc (page *int32Page) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *int32Page) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *int32Page) NumNulls() int64 { return 0 }\n\nfunc (page *int32Page) Size() int64 { return 4 * int64(len(page.values)) }\n\nfunc (page *int32Page) RepetitionLevels() []byte { return nil }\n\nfunc (page *int32Page) DefinitionLevels() []byte { return nil }\n\nfunc (page *int32Page) Data() encoding.Values 
{ return encoding.Int32Values(page.values) }\n\nfunc (page *int32Page) Values() ValueReader { return &int32PageValues{page: page} }\n\nfunc (page *int32Page) min() int32 { return minInt32(page.values) }\n\nfunc (page *int32Page) max() int32 { return maxInt32(page.values) }\n\nfunc (page *int32Page) bounds() (min, max int32) { return boundsInt32(page.values) }\n\nfunc (page *int32Page) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminInt32, maxInt32 := page.bounds()\n\t\tmin = page.makeValue(minInt32)\n\t\tmax = page.makeValue(maxInt32)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *int32Page) Slice(i, j int64) Page {\n\treturn &int32Page{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *int32Page) makeValue(v int32) Value {\n\tvalue := makeValueInt32(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype int64Page struct {\n\ttyp         Type\n\tvalues      []int64\n\tcolumnIndex int16\n}\n\nfunc newInt64Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int64Page {\n\treturn &int64Page{\n\t\ttyp:         typ,\n\t\tvalues:      values.Int64()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *int64Page) Type() Type { return page.typ }\n\nfunc (page *int64Page) Column() int { return int(^page.columnIndex) }\n\nfunc (page *int64Page) Dictionary() Dictionary { return nil }\n\nfunc (page *int64Page) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *int64Page) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *int64Page) NumNulls() int64 { return 0 }\n\nfunc (page *int64Page) Size() int64 { return 8 * int64(len(page.values)) }\n\nfunc (page *int64Page) RepetitionLevels() []byte { return nil }\n\nfunc (page *int64Page) DefinitionLevels() []byte { return nil }\n\nfunc (page *int64Page) Data() encoding.Values { return encoding.Int64Values(page.values) }\n\nfunc (page *int64Page) 
Values() ValueReader { return &int64PageValues{page: page} }\n\nfunc (page *int64Page) min() int64 { return minInt64(page.values) }\n\nfunc (page *int64Page) max() int64 { return maxInt64(page.values) }\n\nfunc (page *int64Page) bounds() (min, max int64) { return boundsInt64(page.values) }\n\nfunc (page *int64Page) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminInt64, maxInt64 := page.bounds()\n\t\tmin = page.makeValue(minInt64)\n\t\tmax = page.makeValue(maxInt64)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *int64Page) Slice(i, j int64) Page {\n\treturn &int64Page{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *int64Page) makeValue(v int64) Value {\n\tvalue := makeValueInt64(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype int96Page struct {\n\ttyp         Type\n\tvalues      []deprecated.Int96\n\tcolumnIndex int16\n}\n\nfunc newInt96Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int96Page {\n\treturn &int96Page{\n\t\ttyp:         typ,\n\t\tvalues:      values.Int96()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *int96Page) Type() Type { return page.typ }\n\nfunc (page *int96Page) Column() int { return int(^page.columnIndex) }\n\nfunc (page *int96Page) Dictionary() Dictionary { return nil }\n\nfunc (page *int96Page) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *int96Page) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *int96Page) NumNulls() int64 { return 0 }\n\nfunc (page *int96Page) Size() int64 { return 12 * int64(len(page.values)) }\n\nfunc (page *int96Page) RepetitionLevels() []byte { return nil }\n\nfunc (page *int96Page) DefinitionLevels() []byte { return nil }\n\nfunc (page *int96Page) Data() encoding.Values { return encoding.Int96Values(page.values) }\n\nfunc (page *int96Page) Values() ValueReader { return &int96PageValues{page: page} 
}\n\nfunc (page *int96Page) min() deprecated.Int96 { return deprecated.MinInt96(page.values) }\n\nfunc (page *int96Page) max() deprecated.Int96 { return deprecated.MaxInt96(page.values) }\n\nfunc (page *int96Page) bounds() (min, max deprecated.Int96) {\n\treturn deprecated.MinMaxInt96(page.values)\n}\n\nfunc (page *int96Page) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminInt96, maxInt96 := page.bounds()\n\t\tmin = page.makeValue(minInt96)\n\t\tmax = page.makeValue(maxInt96)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *int96Page) Slice(i, j int64) Page {\n\treturn &int96Page{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *int96Page) makeValue(v deprecated.Int96) Value {\n\tvalue := makeValueInt96(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype floatPage struct {\n\ttyp         Type\n\tvalues      []float32\n\tcolumnIndex int16\n}\n\nfunc newFloatPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *floatPage {\n\treturn &floatPage{\n\t\ttyp:         typ,\n\t\tvalues:      values.Float()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *floatPage) Type() Type { return page.typ }\n\nfunc (page *floatPage) Column() int { return int(^page.columnIndex) }\n\nfunc (page *floatPage) Dictionary() Dictionary { return nil }\n\nfunc (page *floatPage) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *floatPage) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *floatPage) NumNulls() int64 { return 0 }\n\nfunc (page *floatPage) Size() int64 { return 4 * int64(len(page.values)) }\n\nfunc (page *floatPage) RepetitionLevels() []byte { return nil }\n\nfunc (page *floatPage) DefinitionLevels() []byte { return nil }\n\nfunc (page *floatPage) Data() encoding.Values { return encoding.FloatValues(page.values) }\n\nfunc (page *floatPage) Values() ValueReader { return 
&floatPageValues{page: page} }\n\nfunc (page *floatPage) min() float32 { return minFloat32(page.values) }\n\nfunc (page *floatPage) max() float32 { return maxFloat32(page.values) }\n\nfunc (page *floatPage) bounds() (min, max float32) { return boundsFloat32(page.values) }\n\nfunc (page *floatPage) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminFloat32, maxFloat32 := page.bounds()\n\t\tmin = page.makeValue(minFloat32)\n\t\tmax = page.makeValue(maxFloat32)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *floatPage) Slice(i, j int64) Page {\n\treturn &floatPage{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *floatPage) makeValue(v float32) Value {\n\tvalue := makeValueFloat(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype doublePage struct {\n\ttyp         Type\n\tvalues      []float64\n\tcolumnIndex int16\n}\n\nfunc newDoublePage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *doublePage {\n\treturn &doublePage{\n\t\ttyp:         typ,\n\t\tvalues:      values.Double()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *doublePage) Type() Type { return page.typ }\n\nfunc (page *doublePage) Column() int { return int(^page.columnIndex) }\n\nfunc (page *doublePage) Dictionary() Dictionary { return nil }\n\nfunc (page *doublePage) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *doublePage) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *doublePage) NumNulls() int64 { return 0 }\n\nfunc (page *doublePage) Size() int64 { return 8 * int64(len(page.values)) }\n\nfunc (page *doublePage) RepetitionLevels() []byte { return nil }\n\nfunc (page *doublePage) DefinitionLevels() []byte { return nil }\n\nfunc (page *doublePage) Data() encoding.Values { return encoding.DoubleValues(page.values) }\n\nfunc (page *doublePage) Values() ValueReader { return &doublePageValues{page: page} 
}\n\nfunc (page *doublePage) min() float64 { return minFloat64(page.values) }\n\nfunc (page *doublePage) max() float64 { return maxFloat64(page.values) }\n\nfunc (page *doublePage) bounds() (min, max float64) { return boundsFloat64(page.values) }\n\nfunc (page *doublePage) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminFloat64, maxFloat64 := page.bounds()\n\t\tmin = page.makeValue(minFloat64)\n\t\tmax = page.makeValue(maxFloat64)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *doublePage) Slice(i, j int64) Page {\n\treturn &doublePage{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *doublePage) makeValue(v float64) Value {\n\tvalue := makeValueDouble(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype byteArrayPage struct {\n\ttyp         Type\n\tvalues      []byte\n\toffsets     []uint32\n\tcolumnIndex int16\n}\n\nfunc newByteArrayPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *byteArrayPage {\n\tdata, offsets := values.ByteArray()\n\treturn &byteArrayPage{\n\t\ttyp:         typ,\n\t\tvalues:      data,\n\t\toffsets:     offsets[:numValues+1],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *byteArrayPage) Type() Type { return page.typ }\n\nfunc (page *byteArrayPage) Column() int { return int(^page.columnIndex) }\n\nfunc (page *byteArrayPage) Dictionary() Dictionary { return nil }\n\nfunc (page *byteArrayPage) NumRows() int64 { return int64(page.len()) }\n\nfunc (page *byteArrayPage) NumValues() int64 { return int64(page.len()) }\n\nfunc (page *byteArrayPage) NumNulls() int64 { return 0 }\n\nfunc (page *byteArrayPage) Size() int64 { return int64(len(page.values)) + 4*int64(len(page.offsets)) }\n\nfunc (page *byteArrayPage) RepetitionLevels() []byte { return nil }\n\nfunc (page *byteArrayPage) DefinitionLevels() []byte { return nil }\n\nfunc (page *byteArrayPage) Data() encoding.Values {\n\treturn 
encoding.ByteArrayValues(page.values, page.offsets)\n}\n\nfunc (page *byteArrayPage) Values() ValueReader { return &byteArrayPageValues{page: page} }\n\nfunc (page *byteArrayPage) len() int { return len(page.offsets) - 1 }\n\nfunc (page *byteArrayPage) index(i int) []byte {\n\tj := page.offsets[i+0]\n\tk := page.offsets[i+1]\n\treturn page.values[j:k:k]\n}\n\nfunc (page *byteArrayPage) min() (min []byte) {\n\tif n := page.len(); n > 0 {\n\t\tmin = page.index(0)\n\n\t\tfor i := 1; i < n; i++ {\n\t\t\tv := page.index(i)\n\n\t\t\tif bytes.Compare(v, min) < 0 {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc (page *byteArrayPage) max() (max []byte) {\n\tif n := page.len(); n > 0 {\n\t\tmax = page.index(0)\n\n\t\tfor i := 1; i < n; i++ {\n\t\t\tv := page.index(i)\n\n\t\t\tif bytes.Compare(v, max) > 0 {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc (page *byteArrayPage) bounds() (min, max []byte) {\n\tif n := page.len(); n > 0 {\n\t\tmin = page.index(0)\n\t\tmax = min\n\n\t\tfor i := 1; i < n; i++ {\n\t\t\tv := page.index(i)\n\n\t\t\tswitch {\n\t\t\tcase bytes.Compare(v, min) < 0:\n\t\t\t\tmin = v\n\t\t\tcase bytes.Compare(v, max) > 0:\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc (page *byteArrayPage) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.offsets) > 1; ok {\n\t\tminBytes, maxBytes := page.bounds()\n\t\tmin = page.makeValueBytes(minBytes)\n\t\tmax = page.makeValueBytes(maxBytes)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *byteArrayPage) cloneValues() []byte {\n\tvalues := make([]byte, len(page.values))\n\tcopy(values, page.values)\n\treturn values\n}\n\nfunc (page *byteArrayPage) cloneOffsets() []uint32 {\n\toffsets := make([]uint32, len(page.offsets))\n\tcopy(offsets, page.offsets)\n\treturn offsets\n}\n\nfunc (page *byteArrayPage) Slice(i, j int64) Page {\n\treturn &byteArrayPage{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values,\n\t\toffsets:     page.offsets[i : 
j+1],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *byteArrayPage) makeValueBytes(v []byte) Value {\n\tvalue := makeValueBytes(ByteArray, v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\nfunc (page *byteArrayPage) makeValueString(v string) Value {\n\tvalue := makeValueString(ByteArray, v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype fixedLenByteArrayPage struct {\n\ttyp         Type\n\tdata        []byte\n\tsize        int\n\tcolumnIndex int16\n}\n\nfunc newFixedLenByteArrayPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *fixedLenByteArrayPage {\n\tdata, size := values.FixedLenByteArray()\n\treturn &fixedLenByteArrayPage{\n\t\ttyp:         typ,\n\t\tdata:        data[:int(numValues)*size],\n\t\tsize:        size,\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *fixedLenByteArrayPage) Type() Type { return page.typ }\n\nfunc (page *fixedLenByteArrayPage) Column() int { return int(^page.columnIndex) }\n\nfunc (page *fixedLenByteArrayPage) Dictionary() Dictionary { return nil }\n\nfunc (page *fixedLenByteArrayPage) NumRows() int64 { return int64(len(page.data) / page.size) }\n\nfunc (page *fixedLenByteArrayPage) NumValues() int64 { return int64(len(page.data) / page.size) }\n\nfunc (page *fixedLenByteArrayPage) NumNulls() int64 { return 0 }\n\nfunc (page *fixedLenByteArrayPage) Size() int64 { return int64(len(page.data)) }\n\nfunc (page *fixedLenByteArrayPage) RepetitionLevels() []byte { return nil }\n\nfunc (page *fixedLenByteArrayPage) DefinitionLevels() []byte { return nil }\n\nfunc (page *fixedLenByteArrayPage) Data() encoding.Values {\n\treturn encoding.FixedLenByteArrayValues(page.data, page.size)\n}\n\nfunc (page *fixedLenByteArrayPage) Values() ValueReader {\n\treturn &fixedLenByteArrayPageValues{page: page}\n}\n\nfunc (page *fixedLenByteArrayPage) min() []byte { return minFixedLenByteArray(page.data, page.size) }\n\nfunc (page *fixedLenByteArrayPage) max() []byte { return 
maxFixedLenByteArray(page.data, page.size) }\n\nfunc (page *fixedLenByteArrayPage) bounds() (min, max []byte) {\n\treturn boundsFixedLenByteArray(page.data, page.size)\n}\n\nfunc (page *fixedLenByteArrayPage) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.data) > 0; ok {\n\t\tminBytes, maxBytes := page.bounds()\n\t\tmin = page.makeValueBytes(minBytes)\n\t\tmax = page.makeValueBytes(maxBytes)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *fixedLenByteArrayPage) Slice(i, j int64) Page {\n\treturn &fixedLenByteArrayPage{\n\t\ttyp:         page.typ,\n\t\tdata:        page.data[i*int64(page.size) : j*int64(page.size)],\n\t\tsize:        page.size,\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *fixedLenByteArrayPage) makeValueBytes(v []byte) Value {\n\tvalue := makeValueBytes(FixedLenByteArray, v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\nfunc (page *fixedLenByteArrayPage) makeValueString(v string) Value {\n\tvalue := makeValueString(FixedLenByteArray, v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype uint32Page struct {\n\ttyp         Type\n\tvalues      []uint32\n\tcolumnIndex int16\n}\n\nfunc newUint32Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *uint32Page {\n\treturn &uint32Page{\n\t\ttyp:         typ,\n\t\tvalues:      values.Uint32()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *uint32Page) Type() Type { return page.typ }\n\nfunc (page *uint32Page) Column() int { return int(^page.columnIndex) }\n\nfunc (page *uint32Page) Dictionary() Dictionary { return nil }\n\nfunc (page *uint32Page) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *uint32Page) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *uint32Page) NumNulls() int64 { return 0 }\n\nfunc (page *uint32Page) Size() int64 { return 4 * int64(len(page.values)) }\n\nfunc (page *uint32Page) RepetitionLevels() []byte { return nil }\n\nfunc (page *uint32Page) 
DefinitionLevels() []byte { return nil }\n\nfunc (page *uint32Page) Data() encoding.Values { return encoding.Uint32Values(page.values) }\n\nfunc (page *uint32Page) Values() ValueReader { return &uint32PageValues{page: page} }\n\nfunc (page *uint32Page) min() uint32 { return minUint32(page.values) }\n\nfunc (page *uint32Page) max() uint32 { return maxUint32(page.values) }\n\nfunc (page *uint32Page) bounds() (min, max uint32) { return boundsUint32(page.values) }\n\nfunc (page *uint32Page) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminUint32, maxUint32 := page.bounds()\n\t\tmin = page.makeValue(minUint32)\n\t\tmax = page.makeValue(maxUint32)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *uint32Page) Slice(i, j int64) Page {\n\treturn &uint32Page{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *uint32Page) makeValue(v uint32) Value {\n\tvalue := makeValueUint32(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype uint64Page struct {\n\ttyp         Type\n\tvalues      []uint64\n\tcolumnIndex int16\n}\n\nfunc newUint64Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *uint64Page {\n\treturn &uint64Page{\n\t\ttyp:         typ,\n\t\tvalues:      values.Uint64()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *uint64Page) Type() Type { return page.typ }\n\nfunc (page *uint64Page) Column() int { return int(^page.columnIndex) }\n\nfunc (page *uint64Page) Dictionary() Dictionary { return nil }\n\nfunc (page *uint64Page) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *uint64Page) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *uint64Page) NumNulls() int64 { return 0 }\n\nfunc (page *uint64Page) Size() int64 { return 8 * int64(len(page.values)) }\n\nfunc (page *uint64Page) RepetitionLevels() []byte { return nil }\n\nfunc (page *uint64Page) DefinitionLevels() []byte { 
return nil }\n\nfunc (page *uint64Page) Data() encoding.Values { return encoding.Uint64Values(page.values) }\n\nfunc (page *uint64Page) Values() ValueReader { return &uint64PageValues{page: page} }\n\nfunc (page *uint64Page) min() uint64 { return minUint64(page.values) }\n\nfunc (page *uint64Page) max() uint64 { return maxUint64(page.values) }\n\nfunc (page *uint64Page) bounds() (min, max uint64) { return boundsUint64(page.values) }\n\nfunc (page *uint64Page) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminUint64, maxUint64 := page.bounds()\n\t\tmin = page.makeValue(minUint64)\n\t\tmax = page.makeValue(maxUint64)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *uint64Page) Slice(i, j int64) Page {\n\treturn &uint64Page{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *uint64Page) makeValue(v uint64) Value {\n\tvalue := makeValueUint64(v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype be128Page struct {\n\ttyp         Type\n\tvalues      [][16]byte\n\tcolumnIndex int16\n}\n\nfunc newBE128Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *be128Page {\n\treturn &be128Page{\n\t\ttyp:         typ,\n\t\tvalues:      values.Uint128()[:numValues],\n\t\tcolumnIndex: ^columnIndex,\n\t}\n}\n\nfunc (page *be128Page) Type() Type { return page.typ }\n\nfunc (page *be128Page) Column() int { return int(^page.columnIndex) }\n\nfunc (page *be128Page) Dictionary() Dictionary { return nil }\n\nfunc (page *be128Page) NumRows() int64 { return int64(len(page.values)) }\n\nfunc (page *be128Page) NumValues() int64 { return int64(len(page.values)) }\n\nfunc (page *be128Page) NumNulls() int64 { return 0 }\n\nfunc (page *be128Page) Size() int64 { return 16 * int64(len(page.values)) }\n\nfunc (page *be128Page) RepetitionLevels() []byte { return nil }\n\nfunc (page *be128Page) DefinitionLevels() []byte { return nil }\n\nfunc (page *be128Page) 
Data() encoding.Values { return encoding.Uint128Values(page.values) }\n\nfunc (page *be128Page) Values() ValueReader { return &be128PageValues{page: page} }\n\nfunc (page *be128Page) min() []byte { return minBE128(page.values) }\n\nfunc (page *be128Page) max() []byte { return maxBE128(page.values) }\n\nfunc (page *be128Page) bounds() (min, max []byte) { return boundsBE128(page.values) }\n\nfunc (page *be128Page) Bounds() (min, max Value, ok bool) {\n\tif ok = len(page.values) > 0; ok {\n\t\tminBytes, maxBytes := page.bounds()\n\t\tmin = page.makeValueBytes(minBytes)\n\t\tmax = page.makeValueBytes(maxBytes)\n\t}\n\treturn min, max, ok\n}\n\nfunc (page *be128Page) Slice(i, j int64) Page {\n\treturn &be128Page{\n\t\ttyp:         page.typ,\n\t\tvalues:      page.values[i:j],\n\t\tcolumnIndex: page.columnIndex,\n\t}\n}\n\nfunc (page *be128Page) makeValue(v *[16]byte) Value {\n\treturn page.makeValueBytes(v[:])\n}\n\nfunc (page *be128Page) makeValueBytes(v []byte) Value {\n\tvalue := makeValueBytes(FixedLenByteArray, v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\nfunc (page *be128Page) makeValueString(v string) Value {\n\tvalue := makeValueString(FixedLenByteArray, v)\n\tvalue.columnIndex = page.columnIndex\n\treturn value\n}\n\ntype nullPage struct {\n\ttyp    Type\n\tcolumn int\n\tcount  int\n}\n\nfunc newNullPage(typ Type, columnIndex int16, numValues int32) *nullPage {\n\treturn &nullPage{\n\t\ttyp:    typ,\n\t\tcolumn: int(columnIndex),\n\t\tcount:  int(numValues),\n\t}\n}\n\nfunc (page *nullPage) Type() Type                        { return page.typ }\nfunc (page *nullPage) Column() int                       { return page.column }\nfunc (page *nullPage) Dictionary() Dictionary            { return nil }\nfunc (page *nullPage) NumRows() int64                    { return int64(page.count) }\nfunc (page *nullPage) NumValues() int64                  { return int64(page.count) }\nfunc (page *nullPage) NumNulls() int64                   { return 
int64(page.count) }\nfunc (page *nullPage) Bounds() (min, max Value, ok bool) { return }\nfunc (page *nullPage) Size() int64                       { return 1 }\nfunc (page *nullPage) Values() ValueReader {\n\treturn &nullPageValues{column: page.column, remain: page.count}\n}\nfunc (page *nullPage) Slice(i, j int64) Page {\n\treturn &nullPage{column: page.column, count: page.count - int(j-i)}\n}\nfunc (page *nullPage) RepetitionLevels() []byte { return nil }\nfunc (page *nullPage) DefinitionLevels() []byte { return nil }\nfunc (page *nullPage) Data() encoding.Values    { return encoding.Values{} }\n"
  },
  {
    "path": "page_bounds.go",
    "content": "package parquet\n\nimport \"bytes\"\n\nfunc boundsFixedLenByteArray(data []byte, size int) (min, max []byte) {\n\tif len(data) > 0 {\n\t\tmin = data[:size]\n\t\tmax = data[:size]\n\n\t\tfor i, j := size, 2*size; j <= len(data); {\n\t\t\titem := data[i:j]\n\n\t\t\tif bytes.Compare(item, min) < 0 {\n\t\t\t\tmin = item\n\t\t\t}\n\t\t\tif bytes.Compare(item, max) > 0 {\n\t\t\t\tmax = item\n\t\t\t}\n\n\t\t\ti += size\n\t\t\tj += size\n\t\t}\n\t}\n\treturn min, max\n}\n"
  },
  {
    "path": "page_bounds_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\n// The min-max algorithms combine looking for the min and max values in a single\n// pass over the data. While the behavior is the same as calling functions to\n// look for the min and max values independently, doing both operations at the\n// same time means that we only load the data from memory once. When working on\n// large arrays the algorithms are limited by memory bandwidth, computing both\n// the min and max together shrinks by half the amount of data read from memory.\n//\n// The following benchmarks results were highlighting the benefits of combining\n// the min-max search, compared to calling the min and max functions separately:\n//\n// name                 old time/op    new time/op    delta\n// BoundsInt64/10240KiB    590µs ±15%     330µs ±10%  -44.01%  (p=0.000 n=10+10)\n//\n// name                 old speed      new speed      delta\n// BoundsInt64/10240KiB 17.9GB/s ±13%  31.8GB/s ±11%  +78.13%  (p=0.000 n=10+10)\n//\n// As expected, since the functions are memory-bound in those cases, and load\n// half as much data, we see significant improvements. The gains are not 2x because\n// running more AVX-512 instructions in the tight loops causes more contention\n// on CPU ports.\n//\n// Optimizations being trade offs, using min/max functions independently appears\n// to yield better throughput when the data resides in CPU caches:\n//\n// name             old time/op    new time/op    delta\n// BoundsInt64/4KiB   52.1ns ± 0%    46.2ns ± 1%  -12.65%  (p=0.000 n=10+10)\n//\n// name             old speed      new speed      delta\n// BoundsInt64/4KiB 78.6GB/s ± 0%  88.6GB/s ± 1%  +11.23%  (p=0.000 n=10+10)\n//\n// The probable explanation is that in those cases the algorithms are not\n// memory-bound anymore, but limited by contention on CPU ports, and the\n// individual min/max functions are able to better parallelize the work due\n// to running less instructions per loop. 
The performance starts to equalize\n// around 256KiB, and degrade beyond 1MiB, so we use this threshold to determine\n// which approach to prefer.\nconst combinedBoundsThreshold = 1 * 1024 * 1024\n\n//go:noescape\nfunc combinedBoundsBool(data []bool) (min, max bool)\n\n//go:noescape\nfunc combinedBoundsInt32(data []int32) (min, max int32)\n\n//go:noescape\nfunc combinedBoundsInt64(data []int64) (min, max int64)\n\n//go:noescape\nfunc combinedBoundsUint32(data []uint32) (min, max uint32)\n\n//go:noescape\nfunc combinedBoundsUint64(data []uint64) (min, max uint64)\n\n//go:noescape\nfunc combinedBoundsFloat32(data []float32) (min, max float32)\n\n//go:noescape\nfunc combinedBoundsFloat64(data []float64) (min, max float64)\n\n//go:noescape\nfunc combinedBoundsBE128(data [][16]byte) (min, max []byte)\n\nfunc boundsInt32(data []int32) (min, max int32) {\n\tif 4*len(data) >= combinedBoundsThreshold {\n\t\treturn combinedBoundsInt32(data)\n\t}\n\tmin = minInt32(data)\n\tmax = maxInt32(data)\n\treturn\n}\n\nfunc boundsInt64(data []int64) (min, max int64) {\n\tif 8*len(data) >= combinedBoundsThreshold {\n\t\treturn combinedBoundsInt64(data)\n\t}\n\tmin = minInt64(data)\n\tmax = maxInt64(data)\n\treturn\n}\n\nfunc boundsUint32(data []uint32) (min, max uint32) {\n\tif 4*len(data) >= combinedBoundsThreshold {\n\t\treturn combinedBoundsUint32(data)\n\t}\n\tmin = minUint32(data)\n\tmax = maxUint32(data)\n\treturn\n}\n\nfunc boundsUint64(data []uint64) (min, max uint64) {\n\tif 8*len(data) >= combinedBoundsThreshold {\n\t\treturn combinedBoundsUint64(data)\n\t}\n\tmin = minUint64(data)\n\tmax = maxUint64(data)\n\treturn\n}\n\nfunc boundsFloat32(data []float32) (min, max float32) {\n\tif 4*len(data) >= combinedBoundsThreshold {\n\t\treturn combinedBoundsFloat32(data)\n\t}\n\tmin = minFloat32(data)\n\tmax = maxFloat32(data)\n\treturn\n}\n\nfunc boundsFloat64(data []float64) (min, max float64) {\n\tif 8*len(data) >= combinedBoundsThreshold {\n\t\treturn 
combinedBoundsFloat64(data)\n\t}\n\tmin = minFloat64(data)\n\tmax = maxFloat64(data)\n\treturn\n}\n\nfunc boundsBE128(data [][16]byte) (min, max []byte) {\n\t// TODO: min/max BE128 is really complex to vectorize, and the returns\n\t// were barely better than doing the min and max independently, for all\n\t// input sizes. We should revisit if we find ways to improve the min or\n\t// max algorithms which can be transposed to the combined version.\n\tmin = minBE128(data)\n\tmax = maxBE128(data)\n\treturn\n}\n"
  },
  {
    "path": "page_bounds_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define bswap128lo 0x08080A0B0C0D0E0F\n#define bswap128hi 0x0001020304050607\n\nDATA bswap128+0(SB)/8, $bswap128lo\nDATA bswap128+8(SB)/8, $bswap128hi\nDATA bswap128+16(SB)/8, $bswap128lo\nDATA bswap128+24(SB)/8, $bswap128hi\nDATA bswap128+32(SB)/8, $bswap128lo\nDATA bswap128+40(SB)/8, $bswap128hi\nDATA bswap128+48(SB)/8, $bswap128lo\nDATA bswap128+56(SB)/8, $bswap128hi\nGLOBL bswap128(SB), RODATA|NOPTR, $64\n\nDATA indexes128+0(SB)/8, $0\nDATA indexes128+8(SB)/8, $0\nDATA indexes128+16(SB)/8, $1\nDATA indexes128+24(SB)/8, $1\nDATA indexes128+32(SB)/8, $2\nDATA indexes128+40(SB)/8, $2\nDATA indexes128+48(SB)/8, $3\nDATA indexes128+56(SB)/8, $3\nGLOBL indexes128(SB), RODATA|NOPTR, $64\n\nDATA swap64+0(SB)/8, $4\nDATA swap64+8(SB)/8, $5\nDATA swap64+16(SB)/8, $6\nDATA swap64+24(SB)/8, $7\nDATA swap64+32(SB)/8, $2\nDATA swap64+40(SB)/8, $3\nDATA swap64+48(SB)/8, $0\nDATA swap64+56(SB)/8, $1\nGLOBL swap64(SB), RODATA|NOPTR, $64\n\nDATA swap32+0(SB)/4, $8\nDATA swap32+4(SB)/4, $9\nDATA swap32+8(SB)/4, $10\nDATA swap32+12(SB)/4, $11\nDATA swap32+16(SB)/4, $12\nDATA swap32+20(SB)/4, $13\nDATA swap32+24(SB)/4, $14\nDATA swap32+28(SB)/4, $15\nDATA swap32+32(SB)/4, $4\nDATA swap32+36(SB)/4, $5\nDATA swap32+40(SB)/4, $6\nDATA swap32+44(SB)/4, $7\nDATA swap32+48(SB)/4, $2\nDATA swap32+52(SB)/4, $3\nDATA swap32+56(SB)/4, $0\nDATA swap32+60(SB)/4, $1\nGLOBL swap32(SB), RODATA|NOPTR, $64\n\n// func combinedBoundsInt32(data []int32) (min, max int32)\nTEXT ·combinedBoundsInt32(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ R8, R8\n    XORQ R9, R9\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVLQZX (AX), R8 // min\n    MOVLQZX (AX), R9 // max\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\n    VPBROADCASTD (AX), Z3\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    
VMOVDQU32 64(AX)(SI*4), Z2\n    VPMINSD Z1, Z0, Z0\n    VPMINSD Z2, Z0, Z0\n    VPMAXSD Z1, Z3, Z3\n    VPMAXSD Z2, Z3, Z3\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VMOVDQU32 swap32+0(SB), Z2\n    VPERMI2D Z0, Z0, Z1\n    VPERMI2D Z3, Z3, Z2\n    VPMINSD Y1, Y0, Y0\n    VPMAXSD Y2, Y3, Y3\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VMOVDQU32 swap32+32(SB), Y2\n    VPERMI2D Y0, Y0, Y1\n    VPERMI2D Y3, Y3, Y2\n    VPMINSD X1, X0, X0\n    VPMAXSD X2, X3, X3\n\n    VMOVDQU32 swap32+48(SB), X1\n    VMOVDQU32 swap32+48(SB), X2\n    VPERMI2D X0, X0, X1\n    VPERMI2D X3, X3, X2\n    VPMINSD X1, X0, X0\n    VPMAXSD X2, X3, X3\n    VZEROUPPER\n\n    MOVQ X0, BX\n    MOVQ X3, DX\n    MOVL BX, R8\n    MOVL DX, R9\n    SHRQ $32, BX\n    SHRQ $32, DX\n    CMPL BX, R8\n    CMOVLLT BX, R8\n    CMPL DX, R9\n    CMOVLGT DX, R9\n\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    CMPL DX, R8\n    CMOVLLT DX, R8\n    CMPL DX, R9\n    CMOVLGT DX, R9\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL R8, min+24(FP)\n    MOVL R9, max+28(FP)\n    RET\n\n// func combinedBoundsInt64(data []int64) (min, max int64)\nTEXT ·combinedBoundsInt64(SB), NOSPLIT, $-40\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ R8, R8\n    XORQ R9, R9\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVQ (AX), R8 // min\n    MOVQ (AX), R9 // max\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $16\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $4, DI\n    SHLQ $4, DI\n    VPBROADCASTQ (AX), Z0\n    VPBROADCASTQ (AX), Z3\nloop16:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VPMINSQ Z1, Z0, Z0\n    VPMINSQ Z2, Z0, Z0\n    VPMAXSQ Z1, Z3, Z3\n    VPMAXSQ Z2, Z3, Z3\n    ADDQ $16, SI\n    CMPQ SI, DI\n    JNE loop16\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VMOVDQU32 swap32+0(SB), Z2\n    VPERMI2D Z0, Z0, Z1\n    VPERMI2D Z3, Z3, Z2\n    VPMINSQ Y1, Y0, Y0\n    VPMAXSQ Y2, Y3, Y3\n\n    
VMOVDQU32 swap32+32(SB), Y1\n    VMOVDQU32 swap32+32(SB), Y2\n    VPERMI2D Y0, Y0, Y1\n    VPERMI2D Y3, Y3, Y2\n    VPMINSQ X1, X0, X0\n    VPMAXSQ X2, X3, X3\n\n    VMOVDQU32 swap32+48(SB), X1\n    VMOVDQU32 swap32+48(SB), X2\n    VPERMI2D X0, X0, X1\n    VPERMI2D X3, X3, X2\n    VPMINSQ X1, X0, X0\n    VPMAXSQ X2, X3, X3\n    VZEROUPPER\n\n    MOVQ X0, R8\n    MOVQ X3, R9\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    CMPQ DX, R8\n    CMOVQLT DX, R8\n    CMPQ DX, R9\n    CMOVQGT DX, R9\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ R8, min+24(FP)\n    MOVQ R9, max+32(FP)\n    RET\n\n// func combinedBoundsUint32(data []uint32) (min, max uint32)\nTEXT ·combinedBoundsUint32(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ R8, R8\n    XORQ R9, R9\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVLQZX (AX), R8 // min\n    MOVLQZX (AX), R9 // max\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\n    VPBROADCASTD (AX), Z3\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VPMINUD Z1, Z0, Z0\n    VPMINUD Z2, Z0, Z0\n    VPMAXUD Z1, Z3, Z3\n    VPMAXUD Z2, Z3, Z3\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VMOVDQU32 swap32+0(SB), Z2\n    VPERMI2D Z0, Z0, Z1\n    VPERMI2D Z3, Z3, Z2\n    VPMINUD Y1, Y0, Y0\n    VPMAXUD Y2, Y3, Y3\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VMOVDQU32 swap32+32(SB), Y2\n    VPERMI2D Y0, Y0, Y1\n    VPERMI2D Y3, Y3, Y2\n    VPMINUD X1, X0, X0\n    VPMAXUD X2, X3, X3\n\n    VMOVDQU32 swap32+48(SB), X1\n    VMOVDQU32 swap32+48(SB), X2\n    VPERMI2D X0, X0, X1\n    VPERMI2D X3, X3, X2\n    VPMINUD X1, X0, X0\n    VPMAXUD X2, X3, X3\n    VZEROUPPER\n\n    MOVQ X0, BX\n    MOVQ X3, DX\n    MOVL BX, R8\n    MOVL DX, R9\n    SHRQ $32, BX\n    SHRQ $32, DX\n    CMPL BX, R8\n    CMOVLCS BX, R8\n    
CMPL DX, R9\n    CMOVLHI DX, R9\n\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    CMPL DX, R8\n    CMOVLCS DX, R8\n    CMPL DX, R9\n    CMOVLHI DX, R9\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL R8, min+24(FP)\n    MOVL R9, max+28(FP)\n    RET\n\n// func combinedBoundsUint64(data []uint64) (min, max uint64)\nTEXT ·combinedBoundsUint64(SB), NOSPLIT, $-40\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ R8, R8\n    XORQ R9, R9\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVQ (AX), R8 // min\n    MOVQ (AX), R9 // max\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $16\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $4, DI\n    SHLQ $4, DI\n    VPBROADCASTQ (AX), Z0\n    VPBROADCASTQ (AX), Z3\nloop16:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VPMINUQ Z1, Z0, Z0\n    VPMINUQ Z2, Z0, Z0\n    VPMAXUQ Z1, Z3, Z3\n    VPMAXUQ Z2, Z3, Z3\n    ADDQ $16, SI\n    CMPQ SI, DI\n    JNE loop16\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VMOVDQU32 swap32+0(SB), Z2\n    VPERMI2D Z0, Z0, Z1\n    VPERMI2D Z3, Z3, Z2\n    VPMINUQ Y1, Y0, Y0\n    VPMAXUQ Y2, Y3, Y3\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VMOVDQU32 swap32+32(SB), Y2\n    VPERMI2D Y0, Y0, Y1\n    VPERMI2D Y3, Y3, Y2\n    VPMINUQ X1, X0, X0\n    VPMAXUQ X2, X3, X3\n\n    VMOVDQU32 swap32+48(SB), X1\n    VMOVDQU32 swap32+48(SB), X2\n    VPERMI2D X0, X0, X1\n    VPERMI2D X3, X3, X2\n    VPMINUQ X1, X0, X0\n    VPMAXUQ X2, X3, X3\n    VZEROUPPER\n\n    MOVQ X0, R8\n    MOVQ X3, R9\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    CMPQ DX, R8\n    CMOVQCS DX, R8\n    CMPQ DX, R9\n    CMOVQHI DX, R9\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ R8, min+24(FP)\n    MOVQ R9, max+32(FP)\n    RET\n\n// func combinedBoundsFloat32(data []float32) (min, max float32)\nTEXT ·combinedBoundsFloat32(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ R8, R8\n    XORQ R9, R9\n\n 
   CMPQ CX, $0\n    JE done\n    XORPS X0, X0\n    XORPS X1, X1\n    XORQ SI, SI\n    MOVLQZX (AX), R8 // min\n    MOVLQZX (AX), R9 // max\n    MOVQ R8, X0\n    MOVQ R9, X1\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\n    VPBROADCASTD (AX), Z3\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VMINPS Z1, Z0, Z0\n    VMINPS Z2, Z0, Z0\n    VMAXPS Z1, Z3, Z3\n    VMAXPS Z2, Z3, Z3\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VMOVDQU32 swap32+0(SB), Z2\n    VPERMI2D Z0, Z0, Z1\n    VPERMI2D Z3, Z3, Z2\n    VMINPS Y1, Y0, Y0\n    VMAXPS Y2, Y3, Y3\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VMOVDQU32 swap32+32(SB), Y2\n    VPERMI2D Y0, Y0, Y1\n    VPERMI2D Y3, Y3, Y2\n    VMINPS X1, X0, X0\n    VMAXPS X2, X3, X3\n\n    VMOVDQU32 swap32+48(SB), X1\n    VMOVDQU32 swap32+48(SB), X2\n    VPERMI2D X0, X0, X1\n    VPERMI2D X3, X3, X2\n    VMINPS X1, X0, X0\n    VMAXPS X2, X3, X3\n    VZEROUPPER\n\n    MOVAPS X0, X1\n    MOVAPS X3, X2\n\n    PSRLQ $32, X1\n    MOVQ X0, R8\n    MOVQ X1, R10\n    UCOMISS X0, X1\n    CMOVLCS R10, R8\n\n    PSRLQ $32, X2\n    MOVQ X3, R9\n    MOVQ X2, R11\n    UCOMISS X3, X2\n    CMOVLHI R11, R9\n\n    CMPQ SI, CX\n    JE done\n    MOVQ R8, X0\n    MOVQ R9, X1\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    MOVQ DX, X2\n    UCOMISS X0, X2\n    CMOVLCS DX, R8\n    UCOMISS X1, X2\n    CMOVLHI DX, R9\n    MOVQ R8, X0\n    MOVQ R9, X1\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL R8, min+24(FP)\n    MOVL R9, max+28(FP)\n    RET\n\n// func combinedBoundsFloat64(data []float64) (min, max float64)\nTEXT ·combinedBoundsFloat64(SB), NOSPLIT, $-40\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ R8, R8\n    XORQ R9, R9\n\n    CMPQ CX, $0\n    JE done\n    XORPD X0, X0\n    XORPD X1, X1\n    XORQ SI, SI\n    MOVQ (AX), R8 // min\n    MOVQ (AX), R9 // 
max\n    MOVQ R8, X0\n    MOVQ R9, X1\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $16\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $4, DI\n    SHLQ $4, DI\n    VPBROADCASTQ (AX), Z0\n    VPBROADCASTQ (AX), Z3\nloop16:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMINPD Z1, Z0, Z0\n    VMINPD Z2, Z0, Z0\n    VMAXPD Z1, Z3, Z3\n    VMAXPD Z2, Z3, Z3\n    ADDQ $16, SI\n    CMPQ SI, DI\n    JNE loop16\n\n    VMOVDQU64 swap32+0(SB), Z1\n    VMOVDQU64 swap32+0(SB), Z2\n    VPERMI2D Z0, Z0, Z1\n    VPERMI2D Z3, Z3, Z2\n    VMINPD Y1, Y0, Y0\n    VMAXPD Y2, Y3, Y3\n\n    VMOVDQU64 swap32+32(SB), Y1\n    VMOVDQU64 swap32+32(SB), Y2\n    VPERMI2D Y0, Y0, Y1\n    VPERMI2D Y3, Y3, Y2\n    VMINPD X1, X0, X0\n    VMAXPD X2, X3, X3\n\n    VMOVDQU64 swap32+48(SB), X1\n    VMOVDQU64 swap32+48(SB), X2\n    VPERMI2D X0, X0, X1\n    VPERMI2D X3, X3, X2\n    VMINPD X1, X0, X0\n    VMAXPD X2, X3, X1\n    VZEROUPPER\n\n    MOVQ X0, R8\n    MOVQ X1, R9\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    MOVQ DX, X2\n    UCOMISD X0, X2\n    CMOVQCS DX, R8\n    UCOMISD X1, X2\n    CMOVQHI DX, R9\n    MOVQ R8, X0\n    MOVQ R9, X1\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ R8, min+24(FP)\n    MOVQ R9, max+32(FP)\n    RET\n"
  },
  {
    "path": "page_bounds_purego.go",
    "content": "//go:build purego || !amd64\n\npackage parquet\n\nimport (\n\t\"encoding/binary\"\n)\n\nfunc boundsInt32(data []int32) (min, max int32) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\n\t\tfor _, v := range data[1:] {\n\t\t\tif v < min {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif v > max {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc boundsInt64(data []int64) (min, max int64) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\n\t\tfor _, v := range data[1:] {\n\t\t\tif v < min {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif v > max {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc boundsUint32(data []uint32) (min, max uint32) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\n\t\tfor _, v := range data[1:] {\n\t\t\tif v < min {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif v > max {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc boundsUint64(data []uint64) (min, max uint64) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\n\t\tfor _, v := range data[1:] {\n\t\t\tif v < min {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif v > max {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc boundsFloat32(data []float32) (min, max float32) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\n\t\tfor _, v := range data[1:] {\n\t\t\tif v < min {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif v > max {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc boundsFloat64(data []float64) (min, max float64) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\t\tmax = data[0]\n\n\t\tfor _, v := range data[1:] {\n\t\t\tif v < min {\n\t\t\t\tmin = v\n\t\t\t}\n\t\t\tif v > max {\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\t}\n\treturn min, max\n}\n\nfunc boundsBE128(data [][16]byte) (min, max []byte) {\n\tif len(data) > 0 {\n\t\tminHi := binary.BigEndian.Uint64(data[0][:8])\n\t\tmaxHi := minHi\n\t\tminIndex := 0\n\t\tmaxIndex := 0\n\t\tfor i := 1; i < len(data); i++ 
{\n\t\t\thi := binary.BigEndian.Uint64(data[i][:8])\n\t\t\tlo := binary.BigEndian.Uint64(data[i][8:])\n\t\t\tswitch {\n\t\t\tcase hi < minHi:\n\t\t\t\tminHi, minIndex = hi, i\n\t\t\tcase hi == minHi:\n\t\t\t\tminLo := binary.BigEndian.Uint64(data[minIndex][8:])\n\t\t\t\tif lo < minLo {\n\t\t\t\t\tminHi, minIndex = hi, i\n\t\t\t\t}\n\t\t\t}\n\t\t\tswitch {\n\t\t\tcase hi > maxHi:\n\t\t\t\tmaxHi, maxIndex = hi, i\n\t\t\tcase hi == maxHi:\n\t\t\t\tmaxLo := binary.BigEndian.Uint64(data[maxIndex][8:])\n\t\t\t\tif lo > maxLo {\n\t\t\t\t\tmaxHi, maxIndex = hi, i\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tmin = data[minIndex][:]\n\t\tmax = data[maxIndex][:]\n\t}\n\treturn min, max\n}\n"
  },
  {
    "path": "page_bounds_test.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nvar benchmarkBufferSizes = [...]int{\n\t4 * 1024,\n\t256 * 1024,\n\t2048 * 1024,\n}\n\nfunc forEachBenchmarkBufferSize(b *testing.B, f func(*testing.B, int)) {\n\tfor _, bufferSize := range benchmarkBufferSizes {\n\t\tb.Run(fmt.Sprintf(\"%dKiB\", bufferSize/1024), func(b *testing.B) {\n\t\t\tb.SetBytes(int64(bufferSize))\n\t\t\tf(b, bufferSize)\n\t\t})\n\t}\n}\n\nfunc TestBoundsInt32(t *testing.T) {\n\terr := quick.Check(func(values []int32) bool {\n\t\tmin := int32(0)\n\t\tmax := int32(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsInt32(values)\n\t\treturn min == minValue && max == maxValue\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBoundsInt64(t *testing.T) {\n\terr := quick.Check(func(values []int64) bool {\n\t\tmin := int64(0)\n\t\tmax := int64(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsInt64(values)\n\t\treturn min == minValue && max == maxValue\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBoundsUint32(t *testing.T) {\n\terr := quick.Check(func(values []uint32) bool {\n\t\tmin := uint32(0)\n\t\tmax := uint32(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsUint32(values)\n\t\treturn min == minValue && 
max == maxValue\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBoundsUint64(t *testing.T) {\n\terr := quick.Check(func(values []uint64) bool {\n\t\tmin := uint64(0)\n\t\tmax := uint64(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsUint64(values)\n\t\treturn min == minValue && max == maxValue\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBoundsFloat32(t *testing.T) {\n\terr := quick.Check(func(values []float32) bool {\n\t\tmin := float32(0)\n\t\tmax := float32(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsFloat32(values)\n\t\treturn min == minValue && max == maxValue\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBoundsFloat64(t *testing.T) {\n\terr := quick.Check(func(values []float64) bool {\n\t\tmin := float64(0)\n\t\tmax := float64(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsFloat64(values)\n\t\treturn min == minValue && max == maxValue\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBE128MinMaxSimilar(t *testing.T) {\n\tvar min [16]byte\n\n\t// Test values:\n\t//   [1 1 ... 1 1]\n\t//   [0 1 ... 1 1]\n\t//   ...\n\t//   [0 0 ... 0 1]\n\t//   [0 0 ... 0 0]\n\tfor i := 0; i < 17; i++ {\n\t\tvar max [16]byte\n\t\tfor j := i; j < 16; j++ {\n\t\t\tmax[j] = 1\n\t\t}\n\t\ttestBE182MinMaxPerm(t, min, max)\n\t}\n\n\t// Test values:\n\t//   [0 0 ... 
0 0]\n\t//   [1 0 ... 0 0]\n\t//   ...\n\t//   [1 1 ... 1 0]\n\t//   [1 1 ... 1 1]\n\tfor i := 0; i < 17; i++ {\n\t\tvar max [16]byte\n\t\tfor j := 0; j < i; j++ {\n\t\t\tmax[j] = 1\n\t\t}\n\t\ttestBE182MinMaxPerm(t, min, max)\n\t}\n}\n\nfunc testBE182MinMaxPerm(t *testing.T, min, max [16]byte) {\n\ttestBE128MinMax(t, min[:], max[:], [][16]byte{min, max})\n\ttestBE128MinMax(t, min[:], max[:], [][16]byte{max, min})\n}\n\nfunc testBE128MinMax(t *testing.T, min, max []byte, data [][16]byte) {\n\tbmin := minBE128(data)\n\tif !reflect.DeepEqual(bmin, min[:]) {\n\t\tt.Errorf(\"unexpected min value\\nexpected %v\\n     got %v\", min, bmin)\n\t}\n\n\tbmax := maxBE128(data)\n\tif !reflect.DeepEqual(bmax, max[:]) {\n\t\tt.Errorf(\"unexpected max value\\nexpected %v\\n     got %v\", max, bmax)\n\t}\n}\n\nfunc TestBoundsBE128(t *testing.T) {\n\terr := quick.Check(func(values [][16]byte) bool {\n\t\tmin := [16]byte{}\n\t\tmax := [16]byte{}\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif bytes.Compare(v[:], min[:]) < 0 {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t\tif bytes.Compare(v[:], max[:]) > 0 {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsBE128(values)\n\t\treturn (len(values) == 0 && minValue == nil && maxValue == nil) ||\n\t\t\t(bytes.Equal(min[:], minValue) && bytes.Equal(max[:], maxValue))\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBoundsFixedLenByteArray(t *testing.T) {\n\terr := quick.Check(func(values []byte) bool {\n\t\tmin := [1]byte{}\n\t\tmax := [1]byte{}\n\t\tif len(values) > 0 {\n\t\t\tmin[0] = values[0]\n\t\t\tmax[0] = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min[0] {\n\t\t\t\t\tmin[0] = v\n\t\t\t\t}\n\t\t\t\tif v > max[0] {\n\t\t\t\t\tmax[0] = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tminValue, maxValue := boundsFixedLenByteArray(values, 1)\n\t\treturn (len(values) == 0 && minValue == nil && maxValue == nil) 
||\n\t\t\t(bytes.Equal(min[:], minValue) && bytes.Equal(max[:], maxValue))\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc BenchmarkBoundsInt32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Int31()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsInt32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsInt64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Int63()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsInt64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsUint32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Uint32()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsUint32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsUint64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Uint64()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsUint64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsFloat32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Float32()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsFloat32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsFloat64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float64, 
bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Float64()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsFloat64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsBE128(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([][16]byte, bufferSize)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tprng.Read(values[i][:])\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsBE128(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkBoundsFixedLenByteArray(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]byte, bufferSize)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tprng.Read(values)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tboundsFixedLenByteArray(values, 32)\n\t\t}\n\t})\n}\n"
  },
  {
    "path": "page_header.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\n// PageHeader is an interface implemented by parquet page headers.\ntype PageHeader interface {\n\t// Returns the number of values in the page (including nulls).\n\tNumValues() int64\n\n\t// Returns the page encoding.\n\tEncoding() format.Encoding\n\n\t// Returns the parquet format page type.\n\tPageType() format.PageType\n}\n\n// DataPageHeader is a specialization of the PageHeader interface implemented by\n// data pages.\ntype DataPageHeader interface {\n\tPageHeader\n\n\t// Returns the encoding of the repetition level section.\n\tRepetitionLevelEncoding() format.Encoding\n\n\t// Returns the encoding of the definition level section.\n\tDefinitionLevelEncoding() format.Encoding\n\n\t// Returns the number of null values in the page.\n\tNullCount() int64\n\n\t// Returns the minimum value in the page based on the ordering rules of the\n\t// column's logical type.\n\t//\n\t// As an optimization, the method may return the same slice across multiple\n\t// calls. Programs must treat the returned value as immutable to prevent\n\t// unpredictable behaviors.\n\t//\n\t// If the page only contains only null values, an empty slice is returned.\n\tMinValue() []byte\n\n\t// Returns the maximum value in the page based on the ordering rules of the\n\t// column's logical type.\n\t//\n\t// As an optimization, the method may return the same slice across multiple\n\t// calls. 
Programs must treat the returned value as immutable to prevent\n\t// unpredictable behaviors.\n\t//\n\t// If the page only contains null values, an empty slice is returned.\n\tMaxValue() []byte\n}\n\n// DictionaryPageHeader is an implementation of the PageHeader interface\n// representing dictionary pages.\ntype DictionaryPageHeader struct {\n\theader *format.DictionaryPageHeader\n}\n\nfunc (dict DictionaryPageHeader) NumValues() int64 {\n\treturn int64(dict.header.NumValues)\n}\n\nfunc (dict DictionaryPageHeader) Encoding() format.Encoding {\n\treturn dict.header.Encoding\n}\n\nfunc (dict DictionaryPageHeader) PageType() format.PageType {\n\treturn format.DictionaryPage\n}\n\nfunc (dict DictionaryPageHeader) IsSorted() bool {\n\treturn dict.header.IsSorted\n}\n\nfunc (dict DictionaryPageHeader) String() string {\n\treturn fmt.Sprintf(\"DICTIONARY_PAGE_HEADER{NumValues=%d,Encoding=%s,IsSorted=%t}\",\n\t\tdict.header.NumValues,\n\t\tdict.header.Encoding,\n\t\tdict.header.IsSorted)\n}\n\n// DataPageHeaderV1 is an implementation of the DataPageHeader interface\n// representing data pages version 1.\ntype DataPageHeaderV1 struct {\n\theader *format.DataPageHeader\n}\n\nfunc (v1 DataPageHeaderV1) NumValues() int64 {\n\treturn int64(v1.header.NumValues)\n}\n\nfunc (v1 DataPageHeaderV1) RepetitionLevelEncoding() format.Encoding {\n\treturn v1.header.RepetitionLevelEncoding\n}\n\nfunc (v1 DataPageHeaderV1) DefinitionLevelEncoding() format.Encoding {\n\treturn v1.header.DefinitionLevelEncoding\n}\n\nfunc (v1 DataPageHeaderV1) Encoding() format.Encoding {\n\treturn v1.header.Encoding\n}\n\nfunc (v1 DataPageHeaderV1) PageType() format.PageType {\n\treturn format.DataPage\n}\n\nfunc (v1 DataPageHeaderV1) NullCount() int64 {\n\treturn v1.header.Statistics.NullCount\n}\n\nfunc (v1 DataPageHeaderV1) MinValue() []byte {\n\treturn v1.header.Statistics.MinValue\n}\n\nfunc (v1 DataPageHeaderV1) MaxValue() []byte {\n\treturn v1.header.Statistics.MaxValue\n}\n\nfunc (v1 
DataPageHeaderV1) String() string {\n\treturn fmt.Sprintf(\"DATA_PAGE_HEADER{NumValues=%d,Encoding=%s}\",\n\t\tv1.header.NumValues,\n\t\tv1.header.Encoding)\n}\n\n// DataPageHeaderV2 is an implementation of the DataPageHeader interface\n// representing data pages version 2.\ntype DataPageHeaderV2 struct {\n\theader *format.DataPageHeaderV2\n}\n\nfunc (v2 DataPageHeaderV2) NumValues() int64 {\n\treturn int64(v2.header.NumValues)\n}\n\nfunc (v2 DataPageHeaderV2) NumNulls() int64 {\n\treturn int64(v2.header.NumNulls)\n}\n\nfunc (v2 DataPageHeaderV2) NumRows() int64 {\n\treturn int64(v2.header.NumRows)\n}\n\nfunc (v2 DataPageHeaderV2) RepetitionLevelsByteLength() int64 {\n\treturn int64(v2.header.RepetitionLevelsByteLength)\n}\n\nfunc (v2 DataPageHeaderV2) DefinitionLevelsByteLength() int64 {\n\treturn int64(v2.header.DefinitionLevelsByteLength)\n}\n\nfunc (v2 DataPageHeaderV2) RepetitionLevelEncoding() format.Encoding {\n\treturn format.RLE\n}\n\nfunc (v2 DataPageHeaderV2) DefinitionLevelEncoding() format.Encoding {\n\treturn format.RLE\n}\n\nfunc (v2 DataPageHeaderV2) Encoding() format.Encoding {\n\treturn v2.header.Encoding\n}\n\nfunc (v2 DataPageHeaderV2) PageType() format.PageType {\n\treturn format.DataPageV2\n}\n\nfunc (v2 DataPageHeaderV2) NullCount() int64 {\n\treturn v2.header.Statistics.NullCount\n}\n\nfunc (v2 DataPageHeaderV2) MinValue() []byte {\n\treturn v2.header.Statistics.MinValue\n}\n\nfunc (v2 DataPageHeaderV2) MaxValue() []byte {\n\treturn v2.header.Statistics.MaxValue\n}\n\nfunc (v2 DataPageHeaderV2) IsCompressed() bool {\n\treturn v2.header.IsCompressed == nil || *v2.header.IsCompressed\n}\n\nfunc (v2 DataPageHeaderV2) String() string {\n\treturn fmt.Sprintf(\"DATA_PAGE_HEADER_V2{NumValues=%d,NumNulls=%d,NumRows=%d,Encoding=%s,IsCompressed=%t}\",\n\t\tv2.header.NumValues,\n\t\tv2.header.NumNulls,\n\t\tv2.header.NumRows,\n\t\tv2.header.Encoding,\n\t\tv2.IsCompressed())\n}\n\ntype unknownPageHeader struct {\n\theader *format.PageHeader\n}\n\nfunc 
(u unknownPageHeader) NumValues() int64 {\n\treturn 0\n}\n\nfunc (u unknownPageHeader) Encoding() format.Encoding {\n\treturn -1\n}\n\nfunc (u unknownPageHeader) PageType() format.PageType {\n\treturn u.header.Type\n}\n\nfunc (u unknownPageHeader) String() string {\n\treturn fmt.Sprintf(\"UNKNOWN_PAGE_HEADER{Type=%d}\", u.header.Type)\n}\n\nvar (\n\t_ PageHeader     = DictionaryPageHeader{}\n\t_ DataPageHeader = DataPageHeaderV1{}\n\t_ DataPageHeader = DataPageHeaderV2{}\n\t_ PageHeader     = unknownPageHeader{}\n)\n"
  },
  {
    "path": "page_max.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n)\n\nfunc maxFixedLenByteArray(data []byte, size int) (max []byte) {\n\tif len(data) > 0 {\n\t\tmax = data[:size]\n\n\t\tfor i, j := size, 2*size; j <= len(data); {\n\t\t\titem := data[i:j]\n\n\t\t\tif bytes.Compare(item, max) > 0 {\n\t\t\t\tmax = item\n\t\t\t}\n\n\t\t\ti += size\n\t\t\tj += size\n\t\t}\n\t}\n\treturn max\n}\n"
  },
  {
    "path": "page_max_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\n//go:noescape\nfunc maxInt32(data []int32) int32\n\n//go:noescape\nfunc maxInt64(data []int64) int64\n\n//go:noescape\nfunc maxUint32(data []uint32) uint32\n\n//go:noescape\nfunc maxUint64(data []uint64) uint64\n\n//go:noescape\nfunc maxFloat32(data []float32) float32\n\n//go:noescape\nfunc maxFloat64(data []float64) float64\n\n//go:noescape\nfunc maxBE128(data [][16]byte) []byte\n"
  },
  {
    "path": "page_max_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func maxInt32(data []int32) int32\nTEXT ·maxInt32(SB), NOSPLIT, $-28\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVLQZX (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VPMAXSD Z1, Z0, Z0\n    VPMAXSD Z2, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMAXSD Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMAXSD X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMAXSD X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, DX\n    MOVL DX, BX\n    SHRQ $32, DX\n    CMPL DX, BX\n    CMOVLGT DX, BX\n\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    CMPL DX, BX\n    CMOVLGT DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL BX, ret+24(FP)\n    RET\n\n// func maxInt64(data []int64) int64\nTEXT ·maxInt64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVQ (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTQ (AX), Z0\nloop32:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMOVDQU64 128(AX)(SI*8), Z3\n    VMOVDQU64 192(AX)(SI*8), Z4\n    VPMAXSQ Z1, Z2, Z5\n    VPMAXSQ Z3, Z4, Z6\n    VPMAXSQ Z5, Z6, Z1\n    VPMAXSQ Z1, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMAXSQ Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMAXSQ X1, X0, X0\n\n    
VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMAXSQ X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, BX\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    CMPQ DX, BX\n    CMOVQGT DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ BX, ret+24(FP)\n    RET\n\n// func maxUint32(data []uint32) uint32\nTEXT ·maxUint32(SB), NOSPLIT, $-28\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVLQZX (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VPMAXUD Z1, Z0, Z0\n    VPMAXUD Z2, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMAXUD Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMAXUD X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMAXUD X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, DX\n    MOVL DX, BX\n    SHRQ $32, DX\n    CMPL DX, BX\n    CMOVLHI DX, BX\n\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    CMPL DX, BX\n    CMOVLHI DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL BX, ret+24(FP)\n    RET\n\n// func maxUint64(data []uint64) uint64\nTEXT ·maxUint64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVQ (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTQ (AX), Z0\nloop32:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMOVDQU64 128(AX)(SI*8), Z3\n    VMOVDQU64 192(AX)(SI*8), Z4\n    VPMAXUQ Z1, Z2, Z5\n    VPMAXUQ Z3, Z4, Z6\n    VPMAXUQ Z5, Z6, Z1\n    VPMAXUQ 
Z1, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMAXUQ Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMAXUQ X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMAXUQ X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, BX\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    CMPQ DX, BX\n    CMOVQHI DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ BX, ret+24(FP)\n    RET\n\n// func maxFloat32(data []float32) float32\nTEXT ·maxFloat32(SB), NOSPLIT, $-28\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORPS X0, X0\n    XORPS X1, X1\n    XORQ SI, SI\n    MOVLQZX (AX), BX\n    MOVQ BX, X0\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $64\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $6, DI\n    SHLQ $6, DI\n    VPBROADCASTD (AX), Z0\nloop64:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VMOVDQU32 128(AX)(SI*4), Z3\n    VMOVDQU32 192(AX)(SI*4), Z4\n    VMAXPS Z1, Z2, Z5\n    VMAXPS Z3, Z4, Z6\n    VMAXPS Z5, Z6, Z1\n    VMAXPS Z1, Z0, Z0\n    ADDQ $64, SI\n    CMPQ SI, DI\n    JNE loop64\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VMAXPS Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VMAXPS X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VMAXPS X1, X0, X0\n    VZEROUPPER\n\n    MOVAPS X0, X1\n    PSRLQ $32, X1\n    MOVQ X0, BX\n    MOVQ X1, DX\n    UCOMISS X0, X1\n    CMOVLHI DX, BX\n\n    CMPQ SI, CX\n    JE done\n    MOVQ BX, X0\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    MOVQ DX, X1\n    UCOMISS X0, X1\n    CMOVLHI DX, BX\n    MOVQ BX, X0\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL BX, ret+24(FP)\n    RET\n\n// func maxFloat64(data []float64) float64\nTEXT ·maxFloat64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ 
data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORPD X0, X0\n    XORPD X1, X1\n    XORQ SI, SI\n    MOVQ (AX), BX\n    MOVQ BX, X0\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTQ (AX), Z0\nloop32:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMOVDQU64 128(AX)(SI*8), Z3\n    VMOVDQU64 192(AX)(SI*8), Z4\n    VMAXPD Z1, Z2, Z5\n    VMAXPD Z3, Z4, Z6\n    VMAXPD Z5, Z6, Z1\n    VMAXPD Z1, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU64 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VMAXPD Y1, Y0, Y0\n\n    VMOVDQU64 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VMAXPD X1, X0, X0\n\n    VMOVDQU64 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VMAXPD X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, BX\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    MOVQ DX, X1\n    UCOMISD X0, X1\n    CMOVQHI DX, BX\n    MOVQ BX, X0\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ BX, ret+24(FP)\n    RET\n\n// vpmaxu128 is a macro comparing unsigned 128 bits values held in the\n// `srcValues` and `maxValues` vectors. The `srcIndexes` and `maxIndexes`\n// vectors contain the indexes of elements in the value vectors. Remaining\n// K and R arguments are mask and general purpose registers needed to hold\n// temporary values during the computation. The last M argument is a mask\n// generated by vpmaxu128mask.\n//\n// The routine uses AVX-512 instructions (VPCMPUQ, VPBLENDMQ) to implement\n// the comparison of 128 bits values. The values are expected to be stored\n// in the vectors as a little-endian pair of two consecutive quad words.\n//\n// The results are written to the `maxValues` and `maxIndexes` vectors,\n// overwriting the inputs. 
`srcValues` and `srcIndexes` are read-only\n// parameters.\n//\n// At a high level, for two pairs of quad words forming two 128 bits values\n// A and B, the test implemented by this macro is:\n//\n//   A[1] > B[1] || (A[1] == B[1] && A[0] > B[0])\n//\n// Values in the source vector that evaluate to true on this expression are\n// written to the vector of maximum values, and their indexes are written to\n// the vector of indexes.\n#define vpmaxu128(srcValues, srcIndexes, maxValues, maxIndexes, K1, K2, R1, R2, R3, M) \\\n    VPCMPUQ $0, maxValues, srcValues, K1 \\\n    VPCMPUQ $6, maxValues, srcValues, K2 \\\n    KMOVB K1, R1 \\\n    KMOVB K2, R2 \\\n    MOVB R2, R3 \\\n    SHLB $1, R3 \\\n    ANDB R3, R1 \\\n    ORB R2, R1 \\\n    ANDB M, R1 \\\n    MOVB R1, R2 \\\n    SHRB $1, R2 \\\n    ORB R2, R1 \\\n    KMOVB R1, K1 \\\n    VPBLENDMQ srcValues, maxValues, K1, maxValues \\\n    VPBLENDMQ srcIndexes, maxIndexes, K1, maxIndexes\n\n// vpmaxu128mask is a macro used to initialize the mask passed as last argument\n// to vpmaxu128. The argument M is intended to be a general purpose register.\n//\n// The bit mask is used to merge the results of the \"greater than\" and \"equal\"\n// comparison that are performed on each lane of maximum vectors. 
The upper bits\n// are used to compute results of the operation to determine which of the pairs\n// of quad words representing the 128 bits elements are the maximums.\n#define vpmaxu128mask(M) MOVB $0b10101010, M\n\n// func maxBE128(data [][16]byte) []byte\nTEXT ·maxBE128(SB), NOSPLIT, $-48\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    CMPQ CX, $0\n    JE null\n\n    SHLQ $4, CX\n    MOVQ CX, DX // len\n    MOVQ AX, BX // max\n    ADDQ AX, CX // end\n\n    CMPQ DX, $256\n    JB loop\n\n    CMPB ·hasAVX512MinMaxBE128(SB), $0\n    JE loop\n\n    // Z19 holds a vector of the count by which we increment the vectors of\n    // indexes at each loop iteration.\n    MOVQ $16, DI\n    VPBROADCASTQ DI, Z19\n\n    // Z31 holds the shuffle mask used to convert 128 bits elements from big to\n    // little endian so we can apply vectorized comparison instructions.\n    VMOVDQU64 bswap128(SB), Z31\n\n    // These vectors hold four lanes of maximum values found in the input.\n    VBROADCASTI64X2 (AX), Z0\n    VPSHUFB Z31, Z0, Z0\n    VMOVDQU64 Z0, Z5\n    VMOVDQU64 Z0, Z10\n    VMOVDQU64 Z0, Z15\n\n    // These vectors hold four lanes of indexes of maximum values.\n    //\n    // We initialize them at zero because we broadcast the first value of the\n    // input in the vectors that track the maximums of each lane; in other\n    // words, we assume the maximum value is at the first offset and work our\n    // way up from there.\n    VPXORQ Z2, Z2, Z2\n    VPXORQ Z7, Z7, Z7\n    VPXORQ Z12, Z12, Z12\n    VPXORQ Z17, Z17, Z17\n\n    // These vectors are used to compute the indexes of maximum values held\n    // in [Z1, Z5, Z10, Z15]. Each vector holds a contiguous sequence of\n    // indexes; for example, Z3 is initialized with [0, 1, 2, 3]. 
At each\n    // loop iteration, the indexes are incremented by the number of elements\n    // consumed from the input (4x4=16).\n    VMOVDQU64 indexes128(SB), Z3\n    VPXORQ Z8, Z8, Z8\n    VPXORQ Z13, Z13, Z13\n    VPXORQ Z18, Z18, Z18\n    MOVQ $4, DI\n    VPBROADCASTQ DI, Z1\n    VPADDQ Z1, Z3, Z8\n    VPADDQ Z1, Z8, Z13\n    VPADDQ Z1, Z13, Z18\n\n    // This bit mask is used to merge the results of the \"greater than\" and \"equal\"\n    // comparison that we perform on each lane of maximum vectors. We use the\n    // upper bits to compute four results of the operation which determines\n    // which of the pair of quad words representing the 128 bits elements is the\n    // maximum.\n    vpmaxu128mask(DI)\n    SHRQ $8, DX\n    SHLQ $8, DX\n    ADDQ AX, DX\nloop16:\n    // Compute 4x4 maximum values in vector registers, along with their indexes\n    // in the input array.\n    VMOVDQU64 (AX), Z1\n    VMOVDQU64 64(AX), Z6\n    VMOVDQU64 128(AX), Z11\n    VMOVDQU64 192(AX), Z16\n    VPSHUFB Z31, Z1, Z1\n    VPSHUFB Z31, Z6, Z6\n    VPSHUFB Z31, Z11, Z11\n    VPSHUFB Z31, Z16, Z16\n    vpmaxu128(Z1, Z3, Z0, Z2, K1, K2, R8, R9, R10, DI)\n    vpmaxu128(Z6, Z8, Z5, Z7, K3, K4, R11, R12, R13, DI)\n    vpmaxu128(Z11, Z13, Z10, Z12, K1, K2, R8, R9, R10, DI)\n    vpmaxu128(Z16, Z18, Z15, Z17, K3, K4, R11, R12, R13, DI)\n    VPADDQ Z19, Z3, Z3\n    VPADDQ Z19, Z8, Z8\n    VPADDQ Z19, Z13, Z13\n    VPADDQ Z19, Z18, Z18\n    ADDQ $256, AX\n    CMPQ AX, DX\n    JB loop16\n\n    // After the loop completed, we need to merge the lanes that each contain\n    // 4 maximum values (so 16 total candidates at this stage). 
The results are\n    // reduced into 4 candidates in Z0, with their indexes in Z2.\n    vpmaxu128(Z10, Z12, Z0, Z2, K1, K2, R8, R9, R10, DI)\n    vpmaxu128(Z15, Z17, Z5, Z7, K3, K4, R11, R12, R13, DI)\n    vpmaxu128(Z5, Z7, Z0, Z2, K1, K2, R8, R9, R10, DI)\n\n    // Further reduce the results by swapping the upper and lower parts of the\n    // vector registers, and comparing them to determine which values are the\n    // greatest. We compare 2x2 values at this step, then 2x1 values at the next\n    // to find the index of the maximum.\n    VMOVDQU64 swap64+0(SB), Z1\n    VMOVDQU64 swap64+0(SB), Z3\n    VPERMI2Q Z0, Z0, Z1\n    VPERMI2Q Z2, Z2, Z3\n    vpmaxu128(Y1, Y3, Y0, Y2, K1, K2, R8, R9, R10, DI)\n\n    VMOVDQU64 swap64+32(SB), Y1\n    VMOVDQU64 swap64+32(SB), Y3\n    VPERMI2Q Y0, Y0, Y1\n    VPERMI2Q Y2, Y2, Y3\n    vpmaxu128(X1, X3, X0, X2, K1, K2, R8, R9, R10, DI)\n    VZEROUPPER\n\n    // Extract the index of the maximum value computed in the lower 64 bits of\n    // X2 and position the BX pointer at the index of the maximum value.\n    MOVQ X2, DX\n    SHLQ $4, DX\n    ADDQ DX, BX\n    CMPQ AX, CX\n    JE done\n\n    // Unless the input was aligned on 256 bytes, we need to perform a few more\n    // iterations on the remaining elements.\n    //\n    // This loop is also taken if the CPU has no support for AVX-512.\nloop:\n    MOVQ (AX), R8\n    MOVQ (BX), R9\n    BSWAPQ R8\n    BSWAPQ R9\n    CMPQ R8, R9\n    JA more\n    JB next\n    MOVQ 8(AX), R8\n    MOVQ 8(BX), R9\n    BSWAPQ R8\n    BSWAPQ R9\n    CMPQ R8, R9\n    JBE next\nmore:\n    MOVQ AX, BX\nnext:\n    ADDQ $16, AX\n    CMPQ AX, CX\n    JB loop\ndone:\n    MOVQ BX, ret_base+24(FP)\n    MOVQ $16, ret_len+32(FP)\n    MOVQ $16, ret_cap+40(FP)\n    RET\nnull:\n    XORQ BX, BX\n    MOVQ BX, ret_base+24(FP)\n    MOVQ BX, ret_len+32(FP)\n    MOVQ BX, ret_cap+40(FP)\n    RET\n\n"
  },
  {
    "path": "page_max_purego.go",
    "content": "//go:build purego || !amd64\n\npackage parquet\n\nimport \"encoding/binary\"\n\n// -----------------------------------------------------------------------------\n// TODO: use generics versions of the these functions to reduce the amount of\n// code to maintain when we drop compatilibty with Go version older than 1.18.\n// -----------------------------------------------------------------------------\n\nfunc maxInt32(data []int32) (max int32) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value > max {\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxInt64(data []int64) (max int64) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value > max {\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxUint32(data []uint32) (max uint32) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value > max {\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxUint64(data []uint64) (max uint64) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value > max {\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxFloat32(data []float32) (max float32) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value > max {\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxFloat64(data []float64) (max float64) {\n\tif len(data) > 0 {\n\t\tmax = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value > max {\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t}\n\treturn max\n}\n\nfunc maxBE128(data [][16]byte) (min []byte) {\n\tif len(data) > 0 {\n\t\tm := binary.BigEndian.Uint64(data[0][:8])\n\t\tj := 0\n\t\tfor i := 1; i < len(data); i++ {\n\t\t\tx := binary.BigEndian.Uint64(data[i][:8])\n\t\t\tswitch {\n\t\t\tcase x > m:\n\t\t\t\tm, j = x, i\n\t\t\tcase x == m:\n\t\t\t\ty := 
binary.BigEndian.Uint64(data[i][8:])\n\t\t\t\tn := binary.BigEndian.Uint64(data[j][8:])\n\t\t\t\tif y > n {\n\t\t\t\t\tm, j = x, i\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tmin = data[j][:]\n\t}\n\treturn min\n}\n"
  },
  {
    "path": "page_max_test.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"math/rand\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nfunc TestMaxInt32(t *testing.T) {\n\terr := quick.Check(func(values []int32) bool {\n\t\tmax := int32(0)\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn max == maxInt32(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxInt64(t *testing.T) {\n\terr := quick.Check(func(values []int64) bool {\n\t\tmax := int64(0)\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn max == maxInt64(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxUint32(t *testing.T) {\n\terr := quick.Check(func(values []uint32) bool {\n\t\tmax := uint32(0)\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn max == maxUint32(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxUint64(t *testing.T) {\n\terr := quick.Check(func(values []uint64) bool {\n\t\tmax := uint64(0)\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn max == maxUint64(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxFloat32(t *testing.T) {\n\terr := quick.Check(func(values []float32) bool {\n\t\tmax := float32(0)\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn max == maxFloat32(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxFloat64(t *testing.T) {\n\terr := quick.Check(func(values []float64) 
bool {\n\t\tmax := float64(0)\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn max == maxFloat64(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxBE128(t *testing.T) {\n\terr := quick.Check(func(values [][16]byte) bool {\n\t\tmax := [16]byte{}\n\t\tif len(values) > 0 {\n\t\t\tmax = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif bytes.Compare(v[:], max[:]) > 0 {\n\t\t\t\t\tmax = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tret := maxBE128(values)\n\t\treturn (len(values) == 0 && ret == nil) || bytes.Equal(max[:], ret)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMaxFixedLenByteArray(t *testing.T) {\n\terr := quick.Check(func(values []byte) bool {\n\t\tmax := [1]byte{}\n\t\tif len(values) > 0 {\n\t\t\tmax[0] = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v > max[0] {\n\t\t\t\t\tmax[0] = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tret := maxFixedLenByteArray(values, 1)\n\t\treturn (len(values) == 0 && ret == nil) || bytes.Equal(max[:], ret)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc BenchmarkMaxInt32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Int31()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxInt32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxInt64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Int63()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxInt64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxUint32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint32, 
bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Uint32()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxUint32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxUint64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Uint64()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxUint64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxFloat32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Float32()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxFloat32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxFloat64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Float64()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxFloat64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxBE128(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([][16]byte, bufferSize)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tprng.Read(values[i][:])\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxBE128(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMaxFixedLenByteArray(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]byte, bufferSize)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tprng.Read(values)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tmaxFixedLenByteArray(values, 32)\n\t\t}\n\t})\n}\n"
  },
  {
    "path": "page_min.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n)\n\nfunc minFixedLenByteArray(data []byte, size int) (min []byte) {\n\tif len(data) > 0 {\n\t\tmin = data[:size]\n\n\t\tfor i, j := size, 2*size; j <= len(data); {\n\t\t\titem := data[i:j]\n\n\t\t\tif bytes.Compare(item, min) < 0 {\n\t\t\t\tmin = item\n\t\t\t}\n\n\t\t\ti += size\n\t\t\tj += size\n\t\t}\n\t}\n\treturn min\n}\n"
  },
  {
    "path": "page_min_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\n//go:noescape\nfunc minInt32(data []int32) int32\n\n//go:noescape\nfunc minInt64(data []int64) int64\n\n//go:noescape\nfunc minUint32(data []uint32) uint32\n\n//go:noescape\nfunc minUint64(data []uint64) uint64\n\n//go:noescape\nfunc minFloat32(data []float32) float32\n\n//go:noescape\nfunc minFloat64(data []float64) float64\n\n//go:noescape\nfunc minBE128(data [][16]byte) []byte\n"
  },
  {
    "path": "page_min_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func minInt32(data []int32) int32\nTEXT ·minInt32(SB), NOSPLIT, $-28\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVLQZX (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VPMINSD Z1, Z0, Z0\n    VPMINSD Z2, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMINSD Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMINSD X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMINSD X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, DX\n    MOVL DX, BX\n    SHRQ $32, DX\n    CMPL DX, BX\n    CMOVLLT DX, BX\n\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    CMPL DX, BX\n    CMOVLLT DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL BX, ret+24(FP)\n    RET\n\n// func minInt64(data []int64) int64\nTEXT ·minInt64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVQ (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTQ (AX), Z0\nloop32:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMOVDQU64 128(AX)(SI*8), Z3\n    VMOVDQU64 192(AX)(SI*8), Z4\n    VPMINSQ Z1, Z2, Z5\n    VPMINSQ Z3, Z4, Z6\n    VPMINSQ Z5, Z6, Z1\n    VPMINSQ Z1, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMINSQ Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMINSQ X1, X0, X0\n\n    
VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMINSQ X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, BX\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    CMPQ DX, BX\n    CMOVQLT DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ BX, ret+24(FP)\n    RET\n\n// func minUint32(data []uint32) uint32\nTEXT ·minUint32(SB), NOSPLIT, $-28\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVLQZX (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTD (AX), Z0\nloop32:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VPMINUD Z1, Z0, Z0\n    VPMINUD Z2, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMINUD Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMINUD X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMINUD X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, DX\n    MOVL DX, BX\n    SHRQ $32, DX\n    CMPL DX, BX\n    CMOVLCS DX, BX\n\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    CMPL DX, BX\n    CMOVLCS DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL BX, ret+24(FP)\n    RET\n\n// func minUint64(data []uint64) uint64\nTEXT ·minUint64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORQ SI, SI\n    MOVQ (AX), BX\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTQ (AX), Z0\nloop32:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMOVDQU64 128(AX)(SI*8), Z3\n    VMOVDQU64 192(AX)(SI*8), Z4\n    VPMINUQ Z1, Z2, Z5\n    VPMINUQ Z3, Z4, Z6\n    VPMINUQ Z5, Z6, Z1\n    VPMINUQ 
Z1, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VPMINUQ Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VPMINUQ X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VPMINUQ X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, BX\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    CMPQ DX, BX\n    CMOVQCS DX, BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ BX, ret+24(FP)\n    RET\n\n// func minFloat32(data []float32) float32\nTEXT ·minFloat32(SB), NOSPLIT, $-28\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORPS X0, X0\n    XORPS X1, X1\n    XORQ SI, SI\n    MOVLQZX (AX), BX\n    MOVQ BX, X0\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $64\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $6, DI\n    SHLQ $6, DI\n    VPBROADCASTD (AX), Z0\nloop64:\n    VMOVDQU32 (AX)(SI*4), Z1\n    VMOVDQU32 64(AX)(SI*4), Z2\n    VMOVDQU32 128(AX)(SI*4), Z3\n    VMOVDQU32 192(AX)(SI*4), Z4\n    VMINPS Z1, Z2, Z5\n    VMINPS Z3, Z4, Z6\n    VMINPS Z5, Z6, Z1\n    VMINPS Z1, Z0, Z0\n    ADDQ $64, SI\n    CMPQ SI, DI\n    JNE loop64\n\n    VMOVDQU32 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VMINPS Y1, Y0, Y0\n\n    VMOVDQU32 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VMINPS X1, X0, X0\n\n    VMOVDQU32 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VMINPS X1, X0, X0\n    VZEROUPPER\n\n    MOVAPS X0, X1\n    PSRLQ $32, X1\n    MOVQ X0, BX\n    MOVQ X1, DX\n    UCOMISS X0, X1\n    CMOVLCS DX, BX\n\n    CMPQ SI, CX\n    JE done\n    MOVQ BX, X0\nloop:\n    MOVLQZX (AX)(SI*4), DX\n    MOVQ DX, X1\n    UCOMISS X0, X1\n    CMOVLCS DX, BX\n    MOVQ BX, X0\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVL BX, ret+24(FP)\n    RET\n\n// func minFloat64(data []float64) float64\nTEXT ·minFloat64(SB), NOSPLIT, $-32\n    MOVQ data_base+0(FP), AX\n    MOVQ 
data_len+8(FP), CX\n    XORQ BX, BX\n\n    CMPQ CX, $0\n    JE done\n    XORPD X0, X0\n    XORPD X1, X1\n    XORQ SI, SI\n    MOVQ (AX), BX\n    MOVQ BX, X0\n\n    CMPB ·hasAVX512VL(SB), $0\n    JE loop\n\n    CMPQ CX, $32\n    JB loop\n\n    MOVQ CX, DI\n    SHRQ $5, DI\n    SHLQ $5, DI\n    VPBROADCASTQ (AX), Z0\nloop32:\n    VMOVDQU64 (AX)(SI*8), Z1\n    VMOVDQU64 64(AX)(SI*8), Z2\n    VMOVDQU64 128(AX)(SI*8), Z3\n    VMOVDQU64 192(AX)(SI*8), Z4\n    VMINPD Z1, Z2, Z5\n    VMINPD Z3, Z4, Z6\n    VMINPD Z5, Z6, Z1\n    VMINPD Z1, Z0, Z0\n    ADDQ $32, SI\n    CMPQ SI, DI\n    JNE loop32\n\n    VMOVDQU64 swap32+0(SB), Z1\n    VPERMI2D Z0, Z0, Z1\n    VMINPD Y1, Y0, Y0\n\n    VMOVDQU64 swap32+32(SB), Y1\n    VPERMI2D Y0, Y0, Y1\n    VMINPD X1, X0, X0\n\n    VMOVDQU64 swap32+48(SB), X1\n    VPERMI2D X0, X0, X1\n    VMINPD X1, X0, X0\n    VZEROUPPER\n\n    MOVQ X0, BX\n    CMPQ SI, CX\n    JE done\nloop:\n    MOVQ (AX)(SI*8), DX\n    MOVQ DX, X1\n    UCOMISD X0, X1\n    CMOVQCS DX, BX\n    MOVQ BX, X0\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\ndone:\n    MOVQ BX, ret+24(FP)\n    RET\n\n// vpminu128 is a macro comparing unsigned 128 bits values held in the\n// `srcValues` and `minValues` vectors. The `srcIndexes` and `minIndexes`\n// vectors contain the indexes of elements in the value vectors. Remaining\n// K and R arguments are mask and general purpose registers needed to hold\n// temporary values during the computation. The last M argument is a mask\n// generated by vpminu128mask.\n//\n// The routine uses AVX-512 instructions (VPCMPUQ, VPBLENDMQ) to implement\n// the comparison of 128 bits values. The values are expected to be stored\n// in the vectors as a little-endian pair of two consecutive quad words.\n//\n// The results are written to the `minValues` and `minIndexes` vectors,\n// overwriting the inputs. 
`srcValues` and `srcIndexes` are read-only\n// parameters.\n//\n// At a high level, for two pairs of quad words forming two 128 bits values\n// A and B, the test implemented by this macro is:\n//\n//   A[1] < B[1] || (A[1] == B[1] && A[0] < B[0])\n//\n// Values in the source vector that evalute to true on this expression are\n// written to the vector of minimum values, and their indexes are written to\n// the vector of indexes.\n#define vpminu128(srcValues, srcIndexes, minValues, minIndexes, K1, K2, R1, R2, R3, M) \\\n    VPCMPUQ $0, minValues, srcValues, K1 \\\n    VPCMPUQ $1, minValues, srcValues, K2 \\\n    KMOVB K1, R1 \\\n    KMOVB K2, R2 \\\n    MOVB R2, R3 \\\n    SHLB $1, R3 \\\n    ANDB R3, R1 \\\n    ORB R2, R1 \\\n    ANDB M, R1 \\\n    MOVB R1, R2 \\\n    SHRB $1, R2 \\\n    ORB R2, R1 \\\n    KMOVB R1, K1 \\\n    VPBLENDMQ srcValues, minValues, K1, minValues \\\n    VPBLENDMQ srcIndexes, minIndexes, K1, minIndexes\n\n// vpminu128mask is a macro used to initialize the mask passed as last argument\n// to vpminu128. The argument M is intended to be a general purpose register.\n//\n// The bit mask is used to merge the results of the \"less than\" and \"equal\"\n// comparison that are performed on each lane of minimum vectors. 
The upper bits\n// are used to compute results of the operation to determines which of the pairs\n// of quad words representing the 128 bits elements are the minimums.\n#define vpminu128mask(M) MOVB $0b10101010, M\n\n// func minBE128(data [][16]byte) []byte\nTEXT ·minBE128(SB), NOSPLIT, $-48\n    MOVQ data_base+0(FP), AX\n    MOVQ data_len+8(FP), CX\n    CMPQ CX, $0\n    JE null\n\n    SHLQ $4, CX\n    MOVQ CX, DX // len\n    MOVQ AX, BX // min\n    ADDQ AX, CX // end\n\n    CMPQ DX, $256\n    JB loop\n\n    CMPB ·hasAVX512MinMaxBE128(SB), $0\n    JE loop\n\n    // Z19 holds a vector of the count by which we increment the vectors of\n    // swap at each loop iteration.\n    MOVQ $16, DI\n    VPBROADCASTQ DI, Z19\n\n    // Z31 holds the shuffle mask used to convert 128 bits elements from big to\n    // little endian so we can apply vectorized comparison instructions.\n    VMOVDQU64 bswap128(SB), Z31\n\n    // These vectors hold four lanes of minimum values found in the input.\n    VBROADCASTI64X2 (AX), Z0\n    VPSHUFB Z31, Z0, Z0\n    VMOVDQU64 Z0, Z5\n    VMOVDQU64 Z0, Z10\n    VMOVDQU64 Z0, Z15\n\n    // These vectors hold four lanes of swap of minimum values.\n    //\n    // We initialize them at zero because we broadcast the first value of the\n    // input in the vectors that track the minimums of each lane; in other\n    // words, we assume the minimum value is at the first offset and work our\n    // way up from there.\n    VPXORQ Z2, Z2, Z2\n    VPXORQ Z7, Z7, Z7\n    VPXORQ Z12, Z12, Z12\n    VPXORQ Z17, Z17, Z17\n\n    // These vectors are used to compute the swap of minimum values held\n    // in [Z1, Z5, Z10, Z15]. Each vector holds a contiguous sequence of\n    // swap; for example, Z3 is initialized with [0, 1, 2, 3]. 
At each\n    // loop iteration, the swap are incremented by the number of elements\n    // consumed from the input (4x4=16).\n    VMOVDQU64 indexes128(SB), Z3\n    VPXORQ Z8, Z8, Z8\n    VPXORQ Z13, Z13, Z13\n    VPXORQ Z18, Z18, Z18\n    MOVQ $4, DI\n    VPBROADCASTQ DI, Z1\n    VPADDQ Z1, Z3, Z8\n    VPADDQ Z1, Z8, Z13\n    VPADDQ Z1, Z13, Z18\n\n    vpminu128mask(DI)\n    SHRQ $8, DX\n    SHLQ $8, DX\n    ADDQ AX, DX\nloop16:\n    // Compute 4x4 minimum values in vector registers, along with their swap\n    // in the input array.\n    VMOVDQU64 (AX), Z1\n    VMOVDQU64 64(AX), Z6\n    VMOVDQU64 128(AX), Z11\n    VMOVDQU64 192(AX), Z16\n    VPSHUFB Z31, Z1, Z1\n    VPSHUFB Z31, Z6, Z6\n    VPSHUFB Z31, Z11, Z11\n    VPSHUFB Z31, Z16, Z16\n    vpminu128(Z1, Z3, Z0, Z2, K1, K2, R8, R9, R10, DI)\n    vpminu128(Z6, Z8, Z5, Z7, K3, K4, R11, R12, R13, DI)\n    vpminu128(Z11, Z13, Z10, Z12, K1, K2, R8, R9, R10, DI)\n    vpminu128(Z16, Z18, Z15, Z17, K3, K4, R11, R12, R13, DI)\n    VPADDQ Z19, Z3, Z3\n    VPADDQ Z19, Z8, Z8\n    VPADDQ Z19, Z13, Z13\n    VPADDQ Z19, Z18, Z18\n    ADDQ $256, AX\n    CMPQ AX, DX\n    JB loop16\n\n    // After the loop completed, we need to merge the lanes that each contain\n    // 4 minimum values (so 16 total candidate at this stage). The results are\n    // reduced into 4 candidates in Z0, with their swap in Z2.\n    vpminu128(Z10, Z12, Z0, Z2, K1, K2, R8, R9, R10, DI)\n    vpminu128(Z15, Z17, Z5, Z7, K3, K4, R11, R12, R13, DI)\n    vpminu128(Z5, Z7, Z0, Z2, K1, K2, R8, R9, R10, DI)\n\n    // Further reduce the results by swapping the upper and lower parts of the\n    // vector registers, and comparing them to determine which values are the\n    // smallest. 
We compare 2x2 values at this step, then 2x1 values at the next\n    // to find the index of the minimum.\n    VMOVDQU64 swap64+0(SB), Z1\n    VMOVDQU64 swap64+0(SB), Z3\n    VPERMI2Q Z0, Z0, Z1\n    VPERMI2Q Z2, Z2, Z3\n    vpminu128(Y1, Y3, Y0, Y2, K1, K2, R8, R9, R10, DI)\n\n    VMOVDQU64 swap64+32(SB), Y1\n    VMOVDQU64 swap64+32(SB), Y3\n    VPERMI2Q Y0, Y0, Y1\n    VPERMI2Q Y2, Y2, Y3\n    vpminu128(X1, X3, X0, X2, K1, K2, R8, R9, R10, DI)\n    VZEROUPPER\n\n    // Extract the index of the minimum value computed in the lower 64 bits of\n    // X2 and position the BX pointer at the index of the minimum value.\n    MOVQ X2, DX\n    SHLQ $4, DX\n    ADDQ DX, BX\n    CMPQ AX, CX\n    JE done\n\n    // Unless the input was aligned on 256 bytes, we need to perform a few more\n    // iterations on the remaining elements.\n    //\n    // This loop is also taken if the CPU has no support for AVX-512.\nloop:\n    MOVQ (AX), R8\n    MOVQ (BX), R9\n    BSWAPQ R8\n    BSWAPQ R9\n    CMPQ R8, R9\n    JB less\n    JA next\n    MOVQ 8(AX), R8\n    MOVQ 8(BX), R9\n    BSWAPQ R8\n    BSWAPQ R9\n    CMPQ R8, R9\n    JAE next\nless:\n    MOVQ AX, BX\nnext:\n    ADDQ $16, AX\n    CMPQ AX, CX\n    JB loop\ndone:\n    MOVQ BX, ret_base+24(FP)\n    MOVQ $16, ret_len+32(FP)\n    MOVQ $16, ret_cap+40(FP)\n    RET\nnull:\n    XORQ BX, BX\n    MOVQ BX, ret_base+24(FP)\n    MOVQ BX, ret_len+32(FP)\n    MOVQ BX, ret_cap+40(FP)\n    RET\n"
  },
  {
    "path": "page_min_purego.go",
    "content": "//go:build purego || !amd64\n\npackage parquet\n\nimport \"encoding/binary\"\n\n// -----------------------------------------------------------------------------\n// TODO: use generics versions of the these functions to reduce the amount of\n// code to maintain when we drop compatilibty with Go version older than 1.18.\n// -----------------------------------------------------------------------------\n\nfunc minInt32(data []int32) (min int32) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value < min {\n\t\t\t\tmin = value\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc minInt64(data []int64) (min int64) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value < min {\n\t\t\t\tmin = value\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc minUint32(data []uint32) (min uint32) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value < min {\n\t\t\t\tmin = value\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc minUint64(data []uint64) (min uint64) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value < min {\n\t\t\t\tmin = value\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc minFloat32(data []float32) (min float32) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value < min {\n\t\t\t\tmin = value\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc minFloat64(data []float64) (min float64) {\n\tif len(data) > 0 {\n\t\tmin = data[0]\n\n\t\tfor _, value := range data {\n\t\t\tif value < min {\n\t\t\t\tmin = value\n\t\t\t}\n\t\t}\n\t}\n\treturn min\n}\n\nfunc minBE128(data [][16]byte) (min []byte) {\n\tif len(data) > 0 {\n\t\tm := binary.BigEndian.Uint64(data[0][:8])\n\t\tj := 0\n\t\tfor i := 1; i < len(data); i++ {\n\t\t\tx := binary.BigEndian.Uint64(data[i][:8])\n\t\t\tswitch {\n\t\t\tcase x < m:\n\t\t\t\tm, j = x, i\n\t\t\tcase x == m:\n\t\t\t\ty := 
binary.BigEndian.Uint64(data[i][8:])\n\t\t\t\tn := binary.BigEndian.Uint64(data[j][8:])\n\t\t\t\tif y < n {\n\t\t\t\t\tm, j = x, i\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tmin = data[j][:]\n\t}\n\treturn min\n}\n"
  },
  {
    "path": "page_min_test.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"math/rand\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nfunc TestMinInt32(t *testing.T) {\n\terr := quick.Check(func(values []int32) bool {\n\t\tmin := int32(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn min == minInt32(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinInt64(t *testing.T) {\n\terr := quick.Check(func(values []int64) bool {\n\t\tmin := int64(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn min == minInt64(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinUint32(t *testing.T) {\n\terr := quick.Check(func(values []uint32) bool {\n\t\tmin := uint32(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn min == minUint32(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinUint64(t *testing.T) {\n\terr := quick.Check(func(values []uint64) bool {\n\t\tmin := uint64(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn min == minUint64(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinFloat32(t *testing.T) {\n\terr := quick.Check(func(values []float32) bool {\n\t\tmin := float32(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn min == minFloat32(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinFloat64(t *testing.T) {\n\terr := quick.Check(func(values []float64) 
bool {\n\t\tmin := float64(0)\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn min == minFloat64(values)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinBE128(t *testing.T) {\n\terr := quick.Check(func(values [][16]byte) bool {\n\t\tmin := [16]byte{}\n\t\tif len(values) > 0 {\n\t\t\tmin = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif bytes.Compare(v[:], min[:]) < 0 {\n\t\t\t\t\tmin = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tret := minBE128(values)\n\t\treturn (len(values) == 0 && ret == nil) || bytes.Equal(min[:], ret)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestMinFixedLenByteArray(t *testing.T) {\n\terr := quick.Check(func(values []byte) bool {\n\t\tmin := [1]byte{}\n\t\tif len(values) > 0 {\n\t\t\tmin[0] = values[0]\n\t\t\tfor _, v := range values[1:] {\n\t\t\t\tif v < min[0] {\n\t\t\t\t\tmin[0] = v\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tret := minFixedLenByteArray(values, 1)\n\t\treturn (len(values) == 0 && ret == nil) || bytes.Equal(min[:], ret)\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc BenchmarkMinInt32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Int31()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminInt32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinInt64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]int64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Int63()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminInt64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinUint32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint32, 
bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Uint32()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminUint32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinUint64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]uint64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Uint64()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminUint64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinFloat32(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float32, bufferSize/4)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Float32()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminFloat32(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinFloat64(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]float64, bufferSize/8)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tvalues[i] = prng.Float64()\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminFloat64(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinBE128(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([][16]byte, bufferSize)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tfor i := range values {\n\t\t\tprng.Read(values[i][:])\n\t\t}\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminBE128(values)\n\t\t}\n\t})\n}\n\nfunc BenchmarkMinFixedLenByteArray(b *testing.B) {\n\tforEachBenchmarkBufferSize(b, func(b *testing.B, bufferSize int) {\n\t\tvalues := make([]byte, bufferSize)\n\t\tprng := rand.New(rand.NewSource(1))\n\t\tprng.Read(values)\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\tminFixedLenByteArray(values, 32)\n\t\t}\n\t})\n}\n"
  },
  {
    "path": "page_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nfunc TestPage(t *testing.T) {\n\tt.Run(\"BOOLEAN\", testPageBoolean)\n\tt.Run(\"INT32\", testPageInt32)\n\tt.Run(\"INT64\", testPageInt64)\n\tt.Run(\"INT96\", testPageInt96)\n\tt.Run(\"FLOAT\", testPageFloat)\n\tt.Run(\"DOUBLE\", testPageDouble)\n\tt.Run(\"BYTE_ARRAY\", testPageByteArray)\n\tt.Run(\"FIXED_LEN_BYTE_ARRAY\", testPageFixedLenByteArray)\n}\n\nfunc testPageBoolean(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value bool }{})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := make([]bool, 50_000)\n\t\t\t\tfor i := range values {\n\t\t\t\t\tvalues[i] = i%2 == 0\n\t\t\t\t}\n\t\t\t\tn, err := w.(parquet.BooleanWriter).WriteBooleans(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]bool, 50_000)\n\t\t\t\tn, err := r.(parquet.BooleanReader).ReadBooleans(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageInt32(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value int32 }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(io.Writer).Write(unsafecast.Int32ToBytes(values))\n\t\t\t\treturn values[:n/4], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]int32, 10)\n\t\t\t\tn, err := r.(io.Reader).Read(unsafecast.Int32ToBytes(values))\n\t\t\t\treturn values[:n/4], 
err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(parquet.Int32Writer).WriteInt32s(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]int32, 10)\n\t\t\t\tn, err := r.(parquet.Int32Reader).ReadInt32s(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageInt64(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value int64 }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(io.Writer).Write(unsafecast.Int64ToBytes(values))\n\t\t\t\treturn values[:n/8], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]int64, 10)\n\t\t\t\tn, err := r.(io.Reader).Read(unsafecast.Int64ToBytes(values))\n\t\t\t\treturn values[:n/8], err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(parquet.Int64Writer).WriteInt64s(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]int64, 10)\n\t\t\t\tn, err := r.(parquet.Int64Reader).ReadInt64s(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageInt96(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value deprecated.Int96 }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues 
:= []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}}\n\t\t\t\tn, err := w.(io.Writer).Write(deprecated.Int96ToBytes(values))\n\t\t\t\treturn values[:n/12], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]deprecated.Int96, 3)\n\t\t\t\tn, err := r.(io.Reader).Read(deprecated.Int96ToBytes(values))\n\t\t\t\treturn values[:n/12], err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}}\n\t\t\t\tn, err := w.(parquet.Int96Writer).WriteInt96s(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]deprecated.Int96, 3)\n\t\t\t\tn, err := r.(parquet.Int96Reader).ReadInt96s(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageFloat(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value float32 }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(io.Writer).Write(unsafecast.Float32ToBytes(values))\n\t\t\t\treturn values[:n/4], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]float32, 10)\n\t\t\t\tn, err := r.(io.Reader).Read(unsafecast.Float32ToBytes(values))\n\t\t\t\treturn values[:n/4], err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(parquet.FloatWriter).WriteFloats(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := 
make([]float32, 10)\n\t\t\t\tn, err := r.(parquet.FloatReader).ReadFloats(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageDouble(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value float64 }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(io.Writer).Write(unsafecast.Float64ToBytes(values))\n\t\t\t\treturn values[:n/8], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]float64, 10)\n\t\t\t\tn, err := r.(io.Reader).Read(unsafecast.Float64ToBytes(values))\n\t\t\t\treturn values[:n/8], err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}\n\t\t\t\tn, err := w.(parquet.DoubleWriter).WriteDoubles(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]float64, 10)\n\t\t\t\tn, err := r.(parquet.DoubleReader).ReadDoubles(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageByteArray(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value []byte }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []byte{}\n\t\t\t\tvalues = plain.AppendByteArray(values, []byte(\"A\"))\n\t\t\t\tvalues = plain.AppendByteArray(values, []byte(\"B\"))\n\t\t\t\tvalues = plain.AppendByteArray(values, []byte(\"C\"))\n\t\t\t\tn, err := w.(io.Writer).Write(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]byte, 
3+3*plain.ByteArrayLengthSize)\n\t\t\t\tn, err := r.(io.Reader).Read(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []byte{}\n\t\t\t\tvalues = plain.AppendByteArray(values, []byte(\"A\"))\n\t\t\t\tvalues = plain.AppendByteArray(values, []byte(\"B\"))\n\t\t\t\tvalues = plain.AppendByteArray(values, []byte(\"C\"))\n\t\t\t\t_, err := w.(parquet.ByteArrayWriter).WriteByteArrays(values)\n\t\t\t\treturn values, err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]byte, 3+3*plain.ByteArrayLengthSize)\n\t\t\t\tn, err := r.(parquet.ByteArrayReader).ReadByteArrays(values)\n\t\t\t\treturn values[:n+n*plain.ByteArrayLengthSize], err\n\t\t\t},\n\t\t})\n\t})\n}\n\nfunc testPageFixedLenByteArray(t *testing.T) {\n\tschema := parquet.SchemaOf(struct{ Value [3]byte }{})\n\n\tt.Run(\"io\", func(t *testing.T) {\n\t\ttestBufferPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []byte(\"123456789\")\n\t\t\t\tn, err := w.(io.Writer).Write(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]byte, 3*3)\n\t\t\t\tn, err := r.(io.Reader).Read(values)\n\t\t\t\treturn values[:n], err\n\t\t\t},\n\t\t})\n\t})\n\n\tt.Run(\"parquet\", func(t *testing.T) {\n\t\ttestPage(t, schema, pageTest{\n\t\t\twrite: func(w parquet.ValueWriter) (interface{}, error) {\n\t\t\t\tvalues := []byte(\"123456789\")\n\t\t\t\tn, err := w.(parquet.FixedLenByteArrayWriter).WriteFixedLenByteArrays(values)\n\t\t\t\treturn values[:3*n], err\n\t\t\t},\n\n\t\t\tread: func(r parquet.ValueReader) (interface{}, error) {\n\t\t\t\tvalues := make([]byte, 3*3)\n\t\t\t\tn, err := r.(parquet.FixedLenByteArrayReader).ReadFixedLenByteArrays(values)\n\t\t\t\treturn 
values[:3*n], err\n\t\t\t},\n\t\t})\n\t})\n}\n\ntype pageTest struct {\n\twrite func(parquet.ValueWriter) (interface{}, error)\n\tread  func(parquet.ValueReader) (interface{}, error)\n}\n\nfunc testPage(t *testing.T, schema *parquet.Schema, test pageTest) {\n\tt.Run(\"buffer\", func(t *testing.T) { testBufferPage(t, schema, test) })\n\tt.Run(\"file\", func(t *testing.T) { testFilePage(t, schema, test) })\n}\n\nfunc testBufferPage(t *testing.T, schema *parquet.Schema, test pageTest) {\n\tbuffer := parquet.NewBuffer(schema)\n\tcolumn := buffer.ColumnBuffers()[0]\n\n\tw, err := test.write(column)\n\tif err != nil {\n\t\tt.Fatal(\"writing page values:\", err)\n\t}\n\n\tr, err := test.read(column.Page().Values())\n\tif err != io.EOF {\n\t\tt.Errorf(\"expected io.EOF after reading all values but got %v\", err)\n\t}\n\tif !reflect.DeepEqual(w, r) {\n\t\tt.Errorf(\"wrong values read from the page: got=%+v want=%+v\", r, w)\n\t}\n}\n\nfunc testFilePage(t *testing.T, schema *parquet.Schema, test pageTest) {\n\tbuffer := parquet.NewBuffer(schema)\n\tcolumn := buffer.ColumnBuffers()[0]\n\n\tw, err := test.write(column)\n\tif err != nil {\n\t\tt.Fatal(\"writing page values:\", err)\n\t}\n\n\toutput := new(bytes.Buffer)\n\twriter := parquet.NewWriter(output)\n\tn, err := writer.WriteRowGroup(buffer)\n\tif err != nil {\n\t\tt.Fatal(\"writing parquet file:\", err)\n\t}\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(\"writing parquet file:\", err)\n\t}\n\tif n != buffer.NumRows() {\n\t\tt.Fatalf(\"number of rows written mismatch: got=%d want=%d\", n, buffer.NumRows())\n\t}\n\n\treader := bytes.NewReader(output.Bytes())\n\tf, err := parquet.OpenFile(reader, reader.Size())\n\tif err != nil {\n\t\tt.Fatal(\"opening parquet file:\", err)\n\t}\n\n\tpages := f.RowGroups()[0].ColumnChunks()[0].Pages()\n\tdefer pages.Close()\n\n\tp, err := pages.ReadPage()\n\tif err != nil {\n\t\tt.Fatal(\"reading parquet page:\", err)\n\t}\n\tdefer parquet.Release(p)\n\n\tvalues := 
p.Values()\n\tr, err := test.read(values)\n\tif err != io.EOF && err != nil {\n\t\tt.Errorf(\"expected io.EOF after reading all values but got %v\", err)\n\t}\n\tif !reflect.DeepEqual(w, r) {\n\t\tt.Errorf(\"wrong values read from the page: got=%+v want=%+v\", r, w)\n\t}\n\tif r, err := test.read(values); reflect.ValueOf(r).Len() != 0 || err != io.EOF {\n\t\tt.Errorf(\"expected no data and io.EOF after reading all values but got %d and %v\", r, err)\n\t}\n}\n\ntype testStruct struct {\n\tValue *string\n}\n\nfunc TestOptionalPageTrailingNulls(t *testing.T) {\n\tschema := parquet.SchemaOf(&testStruct{})\n\tbuffer := parquet.NewBuffer(schema)\n\n\tstr := \"test\"\n\trows := []testStruct{{\n\t\tValue: nil,\n\t}, {\n\t\tValue: &str,\n\t}, {\n\t\tValue: nil,\n\t}}\n\n\tfor _, row := range rows {\n\t\t_, err := buffer.WriteRows([]parquet.Row{schema.Deconstruct(nil, row)})\n\t\tif err != nil {\n\t\t\tt.Fatal(\"writing row:\", err)\n\t\t}\n\t}\n\n\tresultRows := make([]parquet.Row, 0, len(rows))\n\tbufferRows := make([]parquet.Row, 10)\n\treader := buffer.Rows()\n\tdefer reader.Close()\n\tfor {\n\t\tn, err := reader.ReadRows(bufferRows)\n\t\tresultRows = append(resultRows, bufferRows[:n]...)\n\t\tif err != nil {\n\t\t\tif err == io.EOF {\n\t\t\t\tbreak\n\t\t\t}\n\t\t\tt.Fatal(\"reading rows:\", err)\n\t\t}\n\t}\n\n\tif len(resultRows) != len(rows) {\n\t\tt.Errorf(\"wrong number of rows read: got=%d want=%d\", len(resultRows), len(rows))\n\t}\n}\n\nfunc TestOptionalPagePreserveIndex(t *testing.T) {\n\tschema := parquet.SchemaOf(&testStruct{})\n\tbuffer := parquet.NewBuffer(schema)\n\n\t_, err := buffer.WriteRows([]parquet.Row{\n\t\tschema.Deconstruct(nil, &testStruct{Value: nil}),\n\t})\n\tif err != nil {\n\t\tt.Fatal(\"writing row:\", err)\n\t}\n\n\trows := buffer.Rows()\n\tdefer rows.Close()\n\n\trowbuf := make([]parquet.Row, 2)\n\n\tn, err := rows.ReadRows(rowbuf)\n\tif err != nil && err != io.EOF {\n\t\tt.Fatal(\"reading rows:\", err)\n\t}\n\tif n != 1 
{\n\t\tt.Fatal(\"wrong number of rows returned:\", n)\n\t}\n\tif rowbuf[0][0].Column() != 0 {\n\t\tt.Errorf(\"wrong index: got=%d want=%d\", rowbuf[0][0].Column(), 0)\n\t}\n\n\tn, err = rows.ReadRows(rowbuf)\n\tif err != io.EOF {\n\t\tt.Fatal(\"reading EOF:\", err)\n\t}\n\tif n != 0 {\n\t\tt.Fatal(\"expected no more rows after EOF:\", n)\n\t}\n}\n\nfunc TestRepeatedPageTrailingNulls(t *testing.T) {\n\ttype testStruct struct {\n\t\tA []string `parquet:\"a\"`\n\t}\n\n\ts := parquet.SchemaOf(&testStruct{})\n\n\trecords := []*testStruct{\n\t\t{A: nil},\n\t\t{A: []string{\"test\"}},\n\t\t{A: nil},\n\t}\n\n\tbuf := parquet.NewBuffer(s)\n\tfor _, rec := range records {\n\t\trow := s.Deconstruct(nil, rec)\n\t\t_, err := buf.WriteRows([]parquet.Row{row})\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t}\n\t}\n\n\trows := make([]parquet.Row, len(records)+1)\n\treader := buf.Rows()\n\tdefer reader.Close()\n\n\tn, err := reader.ReadRows(rows)\n\tif err != nil && err != io.EOF {\n\t\tt.Fatal(\"reading rows:\", err)\n\t}\n\n\tif n != len(records) {\n\t\tt.Errorf(\"wrong number of rows read: got=%d want=%d\", n, len(records))\n\t}\n}\n"
  },
  {
    "path": "page_values.go",
    "content": "package parquet\n\nimport (\n\t\"io\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\ntype optionalPageValues struct {\n\tpage   *optionalPage\n\tvalues ValueReader\n\toffset int\n}\n\nfunc (r *optionalPageValues) ReadValues(values []Value) (n int, err error) {\n\tmaxDefinitionLevel := r.page.maxDefinitionLevel\n\tdefinitionLevels := r.page.definitionLevels\n\tcolumnIndex := ^int16(r.page.Column())\n\n\tfor n < len(values) && r.offset < len(definitionLevels) {\n\t\tfor n < len(values) && r.offset < len(definitionLevels) && definitionLevels[r.offset] != maxDefinitionLevel {\n\t\t\tvalues[n] = Value{\n\t\t\t\tdefinitionLevel: definitionLevels[r.offset],\n\t\t\t\tcolumnIndex:     columnIndex,\n\t\t\t}\n\t\t\tr.offset++\n\t\t\tn++\n\t\t}\n\n\t\ti := n\n\t\tj := r.offset\n\t\tfor i < len(values) && j < len(definitionLevels) && definitionLevels[j] == maxDefinitionLevel {\n\t\t\ti++\n\t\t\tj++\n\t\t}\n\n\t\tif n < i {\n\t\t\tfor j, err = r.values.ReadValues(values[n:i]); j > 0; j-- {\n\t\t\t\tvalues[n].definitionLevel = maxDefinitionLevel\n\t\t\t\tr.offset++\n\t\t\t\tn++\n\t\t\t}\n\t\t\t// Do not return on an io.EOF here as we may still have null values to read.\n\t\t\tif err != nil && err != io.EOF {\n\t\t\t\treturn n, err\n\t\t\t}\n\t\t\terr = nil\n\t\t}\n\t}\n\n\tif r.offset == len(definitionLevels) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype repeatedPageValues struct {\n\tpage   *repeatedPage\n\tvalues ValueReader\n\toffset int\n}\n\nfunc (r *repeatedPageValues) ReadValues(values []Value) (n int, err error) {\n\tmaxDefinitionLevel := r.page.maxDefinitionLevel\n\tdefinitionLevels := r.page.definitionLevels\n\trepetitionLevels := r.page.repetitionLevels\n\tcolumnIndex := ^int16(r.page.Column())\n\n\t// While we haven't exceeded the output buffer and we haven't exceeded the page size.\n\tfor n < len(values) && r.offset < 
len(definitionLevels) {\n\n\t\t// While we haven't exceeded the output buffer and we haven't exceeded the\n\t\t// page size AND the current element's definitionLevel is not the\n\t\t// maxDefinitionLevel (this is a null value), Create the zero values to be\n\t\t// returned in this run.\n\t\tfor n < len(values) && r.offset < len(definitionLevels) && definitionLevels[r.offset] != maxDefinitionLevel {\n\t\t\tvalues[n] = Value{\n\t\t\t\trepetitionLevel: repetitionLevels[r.offset],\n\t\t\t\tdefinitionLevel: definitionLevels[r.offset],\n\t\t\t\tcolumnIndex:     columnIndex,\n\t\t\t}\n\t\t\tr.offset++\n\t\t\tn++\n\t\t}\n\n\t\ti := n\n\t\tj := r.offset\n\t\t// Get the length of the run of non-zero values to be copied.\n\t\tfor i < len(values) && j < len(definitionLevels) && definitionLevels[j] == maxDefinitionLevel {\n\t\t\ti++\n\t\t\tj++\n\t\t}\n\n\t\t// Copy all the non-zero values in this run.\n\t\tif n < i {\n\t\t\tfor j, err = r.values.ReadValues(values[n:i]); j > 0; j-- {\n\t\t\t\tvalues[n].repetitionLevel = repetitionLevels[r.offset]\n\t\t\t\tvalues[n].definitionLevel = maxDefinitionLevel\n\t\t\t\tr.offset++\n\t\t\t\tn++\n\t\t\t}\n\t\t\tif err != nil && err != io.EOF {\n\t\t\t\treturn n, err\n\t\t\t}\n\t\t\terr = nil\n\t\t}\n\t}\n\n\tif r.offset == len(definitionLevels) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype booleanPageValues struct {\n\tpage   *booleanPage\n\toffset int\n}\n\nfunc (r *booleanPageValues) ReadBooleans(values []bool) (n int, err error) {\n\tfor n < len(values) && r.offset < int(r.page.numValues) {\n\t\tvalues[n] = r.page.valueAt(r.offset)\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == int(r.page.numValues) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *booleanPageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < int(r.page.numValues) {\n\t\tvalues[n] = r.page.makeValue(r.page.valueAt(r.offset))\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == int(r.page.numValues) {\n\t\terr = 
io.EOF\n\t}\n\treturn n, err\n}\n\ntype int32PageValues struct {\n\tpage   *int32Page\n\toffset int\n}\n\nfunc (r *int32PageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadInt32s(unsafecast.BytesToInt32(b))\n\treturn 4 * n, err\n}\n\nfunc (r *int32PageValues) ReadInt32s(values []int32) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *int32PageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype int64PageValues struct {\n\tpage   *int64Page\n\toffset int\n}\n\nfunc (r *int64PageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadInt64s(unsafecast.BytesToInt64(b))\n\treturn 8 * n, err\n}\n\nfunc (r *int64PageValues) ReadInt64s(values []int64) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *int64PageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype int96PageValues struct {\n\tpage   *int96Page\n\toffset int\n}\n\nfunc (r *int96PageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadInt96s(deprecated.BytesToInt96(b))\n\treturn 12 * n, err\n}\n\nfunc (r *int96PageValues) ReadInt96s(values []deprecated.Int96) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *int96PageValues) 
ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype floatPageValues struct {\n\tpage   *floatPage\n\toffset int\n}\n\nfunc (r *floatPageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadFloats(unsafecast.BytesToFloat32(b))\n\treturn 4 * n, err\n}\n\nfunc (r *floatPageValues) ReadFloats(values []float32) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *floatPageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype doublePageValues struct {\n\tpage   *doublePage\n\toffset int\n}\n\nfunc (r *doublePageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadDoubles(unsafecast.BytesToFloat64(b))\n\treturn 8 * n, err\n}\n\nfunc (r *doublePageValues) ReadDoubles(values []float64) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *doublePageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype byteArrayPageValues struct {\n\tpage   *byteArrayPage\n\toffset int\n}\n\nfunc (r *byteArrayPageValues) Read(b []byte) (int, error) {\n\t_, n, err := r.readByteArrays(b)\n\treturn n, err\n}\n\nfunc (r 
*byteArrayPageValues) ReadRequired(values []byte) (int, error) {\n\treturn r.ReadByteArrays(values)\n}\n\nfunc (r *byteArrayPageValues) ReadByteArrays(values []byte) (int, error) {\n\tn, _, err := r.readByteArrays(values)\n\treturn n, err\n}\n\nfunc (r *byteArrayPageValues) readByteArrays(values []byte) (c, n int, err error) {\n\tnumValues := r.page.len()\n\tfor r.offset < numValues {\n\t\tb := r.page.index(r.offset)\n\t\tk := plain.ByteArrayLengthSize + len(b)\n\t\tif k > (len(values) - n) {\n\t\t\tbreak\n\t\t}\n\t\tplain.PutByteArrayLength(values[n:], len(b))\n\t\tn += plain.ByteArrayLengthSize\n\t\tn += copy(values[n:], b)\n\t\tr.offset++\n\t\tc++\n\t}\n\tif r.offset == numValues {\n\t\terr = io.EOF\n\t} else if n == 0 && len(values) > 0 {\n\t\terr = io.ErrShortBuffer\n\t}\n\treturn c, n, err\n}\n\nfunc (r *byteArrayPageValues) ReadValues(values []Value) (n int, err error) {\n\tnumValues := r.page.len()\n\tfor n < len(values) && r.offset < numValues {\n\t\tvalues[n] = r.page.makeValueBytes(r.page.index(r.offset))\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == numValues {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype fixedLenByteArrayPageValues struct {\n\tpage   *fixedLenByteArrayPage\n\toffset int\n}\n\nfunc (r *fixedLenByteArrayPageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadFixedLenByteArrays(b)\n\treturn n * r.page.size, err\n}\n\nfunc (r *fixedLenByteArrayPageValues) ReadRequired(values []byte) (int, error) {\n\treturn r.ReadFixedLenByteArrays(values)\n}\n\nfunc (r *fixedLenByteArrayPageValues) ReadFixedLenByteArrays(values []byte) (n int, err error) {\n\tn = copy(values, r.page.data[r.offset:]) / r.page.size\n\tr.offset += n * r.page.size\n\tif r.offset == len(r.page.data) {\n\t\terr = io.EOF\n\t} else if n == 0 && len(values) > 0 {\n\t\terr = io.ErrShortBuffer\n\t}\n\treturn n, err\n}\n\nfunc (r *fixedLenByteArrayPageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.data) 
{\n\t\tvalues[n] = r.page.makeValueBytes(r.page.data[r.offset : r.offset+r.page.size])\n\t\tr.offset += r.page.size\n\t\tn++\n\t}\n\tif r.offset == len(r.page.data) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype uint32PageValues struct {\n\tpage   *uint32Page\n\toffset int\n}\n\nfunc (r *uint32PageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadUint32s(unsafecast.BytesToUint32(b))\n\treturn 4 * n, err\n}\n\nfunc (r *uint32PageValues) ReadUint32s(values []uint32) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *uint32PageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype uint64PageValues struct {\n\tpage   *uint64Page\n\toffset int\n}\n\nfunc (r *uint64PageValues) Read(b []byte) (n int, err error) {\n\tn, err = r.ReadUint64s(unsafecast.BytesToUint64(b))\n\treturn 8 * n, err\n}\n\nfunc (r *uint64PageValues) ReadUint64s(values []uint64) (n int, err error) {\n\tn = copy(values, r.page.values[r.offset:])\n\tr.offset += n\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\nfunc (r *uint64PageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = r.page.makeValue(r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype be128PageValues struct {\n\tpage   *be128Page\n\toffset int\n}\n\nfunc (r *be128PageValues) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.offset < len(r.page.values) {\n\t\tvalues[n] = 
r.page.makeValue(&r.page.values[r.offset])\n\t\tr.offset++\n\t\tn++\n\t}\n\tif r.offset == len(r.page.values) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype nullPageValues struct {\n\tcolumn int\n\tremain int\n}\n\nfunc (r *nullPageValues) ReadValues(values []Value) (n int, err error) {\n\tcolumnIndex := ^int16(r.column)\n\tvalues = values[:min(r.remain, len(values))]\n\tfor i := range values {\n\t\tvalues[i] = Value{columnIndex: columnIndex}\n\t}\n\tr.remain -= len(values)\n\tif r.remain == 0 {\n\t\terr = io.EOF\n\t}\n\treturn len(values), err\n}\n"
  },
  {
    "path": "parquet.go",
    "content": "// Copyright 2022 Twilio Inc.\n\n// Package parquet is a library for working with parquet files. For an overview\n// of Parquet's qualities as a storage format, see this blog post:\n// https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet\n//\n// Or see the Parquet documentation: https://parquet.apache.org/docs/\npackage parquet\n\nimport \"reflect\"\n\nfunc atLeastOne(size int) int {\n\treturn atLeast(size, 1)\n}\n\nfunc atLeast(size, least int) int {\n\tif size < least {\n\t\treturn least\n\t}\n\treturn size\n}\n\nfunc min(a, b int) int {\n\tif a < b {\n\t\treturn a\n\t}\n\treturn b\n}\n\nfunc max(a, b int) int {\n\tif a > b {\n\t\treturn a\n\t}\n\treturn b\n}\n\nfunc typeNameOf(t reflect.Type) string {\n\ts1 := t.String()\n\ts2 := t.Kind().String()\n\tif s1 == s2 {\n\t\treturn s1\n\t}\n\treturn s1 + \" (\" + s2 + \")\"\n}\n\nfunc isZero(b []byte) bool {\n\tfor _, c := range b {\n\t\tif c != 0 {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n"
  },
  {
    "path": "parquet_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\nimport \"golang.org/x/sys/cpu\"\n\nvar (\n\t// This variable is used in x86 assembly source files to gate the use of\n\t// AVX2 instructions depending on whether the CPU supports it.\n\thasAVX2     = cpu.X86.HasAVX2\n\thasAVX512F  = cpu.X86.HasAVX512F\n\thasAVX512VL = cpu.X86.HasAVX512F && cpu.X86.HasAVX512VL\n\t// For min/max functions over big-endian 128 bits values, we need the\n\t// follwing instructions from the DQ set:\n\t// * VPBROADCASTQ (with 64 bits source register)\n\t// * VBROADCASTI64X2\n\thasAVX512MinMaxBE128 = cpu.X86.HasAVX512F && cpu.X86.HasAVX512DQ\n)\n"
  },
  {
    "path": "parquet_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"io\"\n\t\"os\"\n)\n\n// Read reads and returns rows from the parquet file in the given reader.\n//\n// The type T defines the type of rows read from r. T must be compatible with\n// the file's schema or an error will be returned. The row type might represent\n// a subset of the full schema, in which case only a subset of the columns will\n// be loaded from r.\n//\n// This function is provided for convenience to facilitate reading of parquet\n// files from arbitrary locations in cases where the data set fit in memory.\nfunc Read[T any](r io.ReaderAt, size int64, options ...ReaderOption) (rows []T, err error) {\n\tconfig, err := NewReaderConfig(options...)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\tfile, err := OpenFile(r, size)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\trows = make([]T, file.NumRows())\n\treader := NewGenericReader[T](file, config)\n\tn, err := reader.Read(rows)\n\tif err == io.EOF {\n\t\terr = nil\n\t}\n\treader.Close()\n\treturn rows[:n], err\n}\n\n// ReadFile reads rows of the parquet file at the given path.\n//\n// The type T defines the type of rows read from r. T must be compatible with\n// the file's schema or an error will be returned. 
The row type might represent\n// a subset of the full schema, in which case only a subset of the columns will\n// be loaded from the file.\n//\n// This function is provided for convenience to facilitate reading of parquet\n// files from the file system in cases where the data set fit in memory.\nfunc ReadFile[T any](path string, options ...ReaderOption) (rows []T, err error) {\n\tf, err := os.Open(path)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\tdefer f.Close()\n\ts, err := f.Stat()\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\treturn Read[T](f, s.Size())\n}\n\n// Write writes the given list of rows to a parquet file written to w.\n//\n// This function is provided for convenience to facilitate the creation of\n// parquet files.\nfunc Write[T any](w io.Writer, rows []T, options ...WriterOption) error {\n\tconfig, err := NewWriterConfig(options...)\n\tif err != nil {\n\t\treturn err\n\t}\n\twriter := NewGenericWriter[T](w, config)\n\tif _, err := writer.Write(rows); err != nil {\n\t\treturn err\n\t}\n\treturn writer.Close()\n}\n\n// Write writes the given list of rows to a parquet file written to w.\n//\n// This function is provided for convenience to facilitate writing parquet\n// files to the file system.\nfunc WriteFile[T any](path string, rows []T, options ...WriterOption) error {\n\tf, err := os.Create(path)\n\tif err != nil {\n\t\treturn err\n\t}\n\tdefer f.Close()\n\treturn Write(f, rows, options...)\n}\n"
  },
  {
    "path": "parquet_go18_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"io\"\n\t\"log\"\n\t\"os\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"google.golang.org/protobuf/types/known/structpb\"\n)\n\nfunc ExampleReadFile() {\n\ttype Row struct {\n\t\tID   int64  `parquet:\"id\"`\n\t\tName string `parquet:\"name,zstd\"`\n\t}\n\n\tExampleWriteFile()\n\n\trows, err := parquet.ReadFile[Row](\"/tmp/file.parquet\")\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tfor _, row := range rows {\n\t\tfmt.Printf(\"%d: %q\\n\", row.ID, row.Name)\n\t}\n\n\t// Output:\n\t// 0: \"Bob\"\n\t// 1: \"Alice\"\n\t// 2: \"Franky\"\n}\n\nfunc ExampleWriteFile() {\n\ttype Row struct {\n\t\tID   int64  `parquet:\"id\"`\n\t\tName string `parquet:\"name,zstd\"`\n\t}\n\n\tif err := parquet.WriteFile(\"/tmp/file.parquet\", []Row{\n\t\t{ID: 0, Name: \"Bob\"},\n\t\t{ID: 1, Name: \"Alice\"},\n\t\t{ID: 2, Name: \"Franky\"},\n\t}); err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\t// Output:\n}\n\nfunc ExampleRead_any() {\n\ttype Row struct{ FirstName, LastName string }\n\n\tbuf := new(bytes.Buffer)\n\terr := parquet.Write(buf, []Row{\n\t\t{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t{FirstName: \"R2\", LastName: \"D2\"},\n\t})\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tfile := bytes.NewReader(buf.Bytes())\n\n\trows, err := parquet.Read[any](file, file.Size())\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tfor _, row := range rows {\n\t\tfmt.Printf(\"%q\\n\", row)\n\t}\n\n\t// Output:\n\t// map[\"FirstName\":\"Luke\" \"LastName\":\"Skywalker\"]\n\t// map[\"FirstName\":\"Han\" \"LastName\":\"Solo\"]\n\t// map[\"FirstName\":\"R2\" \"LastName\":\"D2\"]\n}\n\nfunc ExampleWrite_any() {\n\tschema := parquet.SchemaOf(struct {\n\t\tFirstName string\n\t\tLastName  string\n\t}{})\n\n\tbuf := new(bytes.Buffer)\n\terr := parquet.Write[any](\n\t\tbuf,\n\t\t[]any{\n\t\t\tmap[string]string{\"FirstName\": 
\"Luke\", \"LastName\": \"Skywalker\"},\n\t\t\tmap[string]string{\"FirstName\": \"Han\", \"LastName\": \"Solo\"},\n\t\t\tmap[string]string{\"FirstName\": \"R2\", \"LastName\": \"D2\"},\n\t\t},\n\t\tschema,\n\t)\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tfile := bytes.NewReader(buf.Bytes())\n\n\trows, err := parquet.Read[any](file, file.Size())\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\tfor _, row := range rows {\n\t\tfmt.Printf(\"%q\\n\", row)\n\t}\n\n\t// Output:\n\t// map[\"FirstName\":\"Luke\" \"LastName\":\"Skywalker\"]\n\t// map[\"FirstName\":\"Han\" \"LastName\":\"Solo\"]\n\t// map[\"FirstName\":\"R2\" \"LastName\":\"D2\"]\n}\n\nfunc ExampleSearch() {\n\ttype Row struct{ FirstName, LastName string }\n\n\tbuf := new(bytes.Buffer)\n\t// The column being searched should be sorted to avoid a full scan of the\n\t// column. See the section of the readme on sorting for how to sort on\n\t// insertion into the parquet file using parquet.SortingColumns\n\trows := []Row{\n\t\t{FirstName: \"C\", LastName: \"3PO\"},\n\t\t{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t{FirstName: \"Leia\", LastName: \"Organa\"},\n\t\t{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t{FirstName: \"R2\", LastName: \"D2\"},\n\t}\n\t// The tiny page buffer size ensures we get multiple pages out of the example above.\n\tw := parquet.NewGenericWriter[Row](buf, parquet.PageBufferSize(12), parquet.WriteBufferSize(0))\n\t// Need to write 1 row at a time here as writing many at once disregards PageBufferSize option.\n\tfor _, row := range rows {\n\t\t_, err := w.Write([]Row{row})\n\t\tif err != nil {\n\t\t\tlog.Fatal(err)\n\t\t}\n\t}\n\terr := w.Close()\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\treader := bytes.NewReader(buf.Bytes())\n\tfile, err := parquet.OpenFile(reader, reader.Size())\n\tif err != nil {\n\t\tlog.Fatal(err)\n\t}\n\n\t// Search is scoped to a single RowGroup/ColumnChunk\n\trowGroup := file.RowGroups()[0]\n\tfirstNameColChunk := 
rowGroup.ColumnChunks()[0]\n\n\tfound := parquet.Search(firstNameColChunk.ColumnIndex(), parquet.ValueOf(\"Luke\"), parquet.ByteArrayType)\n\toffsetIndex := firstNameColChunk.OffsetIndex()\n\tfmt.Printf(\"numPages: %d\\n\", offsetIndex.NumPages())\n\tfmt.Printf(\"result found in page: %d\\n\", found)\n\tif found < offsetIndex.NumPages() {\n\t\tr := parquet.NewGenericReader[Row](file)\n\t\tdefer r.Close()\n\t\t// Seek to the first row in the page the result was found\n\t\tr.SeekToRow(offsetIndex.FirstRowIndex(found))\n\t\tresult := make([]Row, 2)\n\t\t_, _ = r.Read(result)\n\t\t// Leia is in index 0 for the page.\n\t\tfor _, row := range result {\n\t\t\tif row.FirstName == \"Luke\" {\n\t\t\t\tfmt.Printf(\"%q\\n\", row)\n\t\t\t}\n\t\t}\n\t}\n\n\t// Output:\n\t// numPages: 3\n\t// result found in page: 1\n\t// {\"Luke\" \"Skywalker\"}\n}\n\nfunc TestIssue360(t *testing.T) {\n\ttype TestType struct {\n\t\tKey []int\n\t}\n\n\tschema := parquet.SchemaOf(TestType{})\n\tbuffer := parquet.NewGenericBuffer[any](schema)\n\n\tdata := make([]any, 1)\n\tdata[0] = TestType{Key: []int{1}}\n\t_, err := buffer.Write(data)\n\tif err != nil {\n\t\tfmt.Println(\"Exiting with error: \", err)\n\t\treturn\n\t}\n\n\tvar out bytes.Buffer\n\twriter := parquet.NewGenericWriter[any](&out, schema)\n\n\t_, err = parquet.CopyRows(writer, buffer.Rows())\n\tif err != nil {\n\t\tfmt.Println(\"Exiting with error: \", err)\n\t\treturn\n\t}\n\twriter.Close()\n\n\tbr := bytes.NewReader(out.Bytes())\n\trows, _ := parquet.Read[any](br, br.Size())\n\n\texpect := []any{\n\t\tmap[string]any{\n\t\t\t\"Key\": []any{\n\t\t\t\tint64(1),\n\t\t\t},\n\t\t},\n\t}\n\n\tassertRowsEqual(t, expect, rows)\n}\n\nfunc TestIssue362ParquetReadFromGenericReaders(t *testing.T) {\n\tpath := \"testdata/dms_test_table_LOAD00000001.parquet\"\n\tfp, err := os.Open(path)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tdefer fp.Close()\n\n\tr1 := parquet.NewGenericReader[any](fp)\n\trows1 := make([]any, r1.NumRows())\n\t_, err = 
r1.Read(rows1)\n\tif err != nil && err != io.EOF {\n\t\tt.Fatal(err)\n\t}\n\n\tr2 := parquet.NewGenericReader[any](fp)\n\trows2 := make([]any, r2.NumRows())\n\t_, err = r2.Read(rows2)\n\tif err != nil && err != io.EOF {\n\t\tt.Fatal(err)\n\t}\n}\n\nfunc TestIssue362ParquetReadFile(t *testing.T) {\n\trows1, err := parquet.ReadFile[any](\"testdata/dms_test_table_LOAD00000001.parquet\")\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\trows2, err := parquet.ReadFile[any](\"testdata/dms_test_table_LOAD00000001.parquet\")\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertRowsEqual(t, rows1, rows2)\n}\n\nfunc TestIssue368(t *testing.T) {\n\tf, err := os.Open(\"testdata/issue368.parquet\")\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tdefer f.Close()\n\n\tinfo, err := f.Stat()\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tpf, err := parquet.OpenFile(f, info.Size())\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\treader := parquet.NewGenericReader[any](pf)\n\tdefer reader.Close()\n\n\ttrs := make([]any, 1)\n\tfor {\n\t\t_, err := reader.Read(trs)\n\t\tif err != nil {\n\t\t\tbreak\n\t\t}\n\t}\n}\n\nfunc TestIssue377(t *testing.T) {\n\ttype People struct {\n\t\tName string\n\t\tAge  int\n\t}\n\n\ttype Nested struct {\n\t\tP  []People\n\t\tF  string\n\t\tGF string\n\t}\n\trow1 := Nested{P: []People{\n\t\t{\n\t\t\tName: \"Bob\",\n\t\t\tAge:  10,\n\t\t}}}\n\tods := []Nested{\n\t\trow1,\n\t}\n\tbuf := new(bytes.Buffer)\n\tw := parquet.NewGenericWriter[Nested](buf)\n\t_, err := w.Write(ods)\n\tif err != nil {\n\t\tt.Fatal(\"write error: \", err)\n\t}\n\tw.Close()\n\n\tfile := bytes.NewReader(buf.Bytes())\n\trows, err := parquet.Read[Nested](file, file.Size())\n\tif err != nil {\n\t\tt.Fatal(\"read error: \", err)\n\t}\n\n\tassertRowsEqual(t, rows, ods)\n}\n\nfunc TestIssue423(t *testing.T) {\n\ttype Inner struct {\n\t\tValue string `parquet:\",\"`\n\t}\n\ttype Outer struct {\n\t\tLabel string  `parquet:\",\"`\n\t\tInner Inner   `parquet:\",json\"`\n\t\tSlice []Inner 
`parquet:\",json\"`\n\t\t// This is the only tricky situation. Because we're delegating to json Marshaler/Unmarshaler\n\t\t// We use the json tags for optionality.\n\t\tPtr *Inner `json:\",omitempty\" parquet:\",json\"`\n\n\t\t// This tests BC behavior that slices of bytes and json strings still get written/read in a BC way.\n\t\tString        string                     `parquet:\",json\"`\n\t\tBytes         []byte                     `parquet:\",json\"`\n\t\tMapOfStructPb map[string]*structpb.Value `parquet:\",json\"`\n\t\tStructPB      *structpb.Value            `parquet:\",json\"`\n\t}\n\n\twriteRows := []Outer{\n\t\t{\n\t\t\tLabel: \"welp\",\n\t\t\tInner: Inner{\n\t\t\t\tValue: \"this is a string\",\n\t\t\t},\n\t\t\tSlice: []Inner{\n\t\t\t\t{\n\t\t\t\t\tValue: \"in a slice\",\n\t\t\t\t},\n\t\t\t},\n\t\t\tPtr:    nil,\n\t\t\tString: `{\"hello\":\"world\"}`,\n\t\t\tBytes:  []byte(`{\"goodbye\":\"world\"}`),\n\t\t\tMapOfStructPb: map[string]*structpb.Value{\n\t\t\t\t\"answer\": structpb.NewNumberValue(42.00),\n\t\t\t},\n\t\t\tStructPB: structpb.NewBoolValue(true),\n\t\t},\n\t\t{\n\t\t\tLabel: \"foxes\",\n\t\t\tInner: Inner{\n\t\t\t\tValue: \"the quick brown fox jumped over the yellow lazy dog.\",\n\t\t\t},\n\t\t\tSlice: []Inner{\n\t\t\t\t{\n\t\t\t\t\tValue: \"in a slice\",\n\t\t\t\t},\n\t\t\t},\n\t\t\tPtr: &Inner{\n\t\t\t\tValue: \"not nil\",\n\t\t\t},\n\t\t\tString: `{\"hello\":\"world\"}`,\n\t\t\tBytes:  []byte(`{\"goodbye\":\"world\"}`),\n\t\t\tMapOfStructPb: map[string]*structpb.Value{\n\t\t\t\t\"doubleAnswer\": structpb.NewNumberValue(84.00),\n\t\t\t},\n\t\t\tStructPB: structpb.NewBoolValue(false),\n\t\t},\n\t}\n\n\tschema := parquet.SchemaOf(new(Outer))\n\tfmt.Println(schema.String())\n\tbuf := new(bytes.Buffer)\n\tw := parquet.NewGenericWriter[Outer](buf, schema)\n\t_, err := w.Write(writeRows)\n\tif err != nil {\n\t\tt.Fatal(\"write error: \", err)\n\t}\n\tw.Close()\n\n\tfile := bytes.NewReader(buf.Bytes())\n\treadRows, err := parquet.Read[Outer](file, 
file.Size())\n\tif err != nil {\n\t\tt.Fatal(\"read error: \", err)\n\t}\n\n\tassertRowsEqual(t, writeRows, readRows)\n}\n\nfunc TestReadFileGenericMultipleRowGroupsMultiplePages(t *testing.T) {\n\ttype MyRow struct {\n\t\tID    [16]byte `parquet:\"id,delta,uuid\"`\n\t\tFile  string   `parquet:\"file,dict,zstd\"`\n\t\tIndex int64    `parquet:\"index,delta,zstd\"`\n\t}\n\n\tnumRows := 20_000\n\tmaxPageBytes := 5000\n\n\ttmp, err := os.CreateTemp(\"/tmp\", \"*.parquet\")\n\tif err != nil {\n\t\tt.Fatal(\"os.CreateTemp: \", err)\n\t}\n\tpath := tmp.Name()\n\tdefer os.Remove(path)\n\tt.Log(\"file:\", path)\n\n\t// The page buffer size ensures we get multiple pages out of this example.\n\tw := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes))\n\t// Need to write 1 row at a time here as writing many at once disregards PageBufferSize option.\n\tfor i := 0; i < numRows; i++ {\n\t\trow := MyRow{\n\t\t\tID:    [16]byte{15: byte(i)},\n\t\t\tFile:  \"hi\" + fmt.Sprint(i),\n\t\t\tIndex: int64(i),\n\t\t}\n\t\t_, err := w.Write([]MyRow{row})\n\t\tif err != nil {\n\t\t\tt.Fatal(\"w.Write: \", err)\n\t\t}\n\t\t// Flush writes rows as row group. 
4 total (20k/5k) in this file.\n\t\tif (i+1)%maxPageBytes == 0 {\n\t\t\terr = w.Flush()\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(\"w.Flush: \", err)\n\t\t\t}\n\t\t}\n\t}\n\terr = w.Close()\n\tif err != nil {\n\t\tt.Fatal(\"w.Close: \", err)\n\t}\n\terr = tmp.Close()\n\tif err != nil {\n\t\tt.Fatal(\"tmp.Close: \", err)\n\t}\n\n\trows, err := parquet.ReadFile[MyRow](path)\n\tif err != nil {\n\t\tt.Fatal(\"parquet.ReadFile: \", err)\n\t}\n\n\tif len(rows) != numRows {\n\t\tt.Fatalf(\"not enough values were read: want=%d got=%d\", len(rows), numRows)\n\t}\n\tfor i, row := range rows {\n\t\tid := [16]byte{15: byte(i)}\n\t\tfile := \"hi\" + fmt.Sprint(i)\n\t\tindex := int64(i)\n\n\t\tif row.ID != id || row.File != file || row.Index != index {\n\t\t\tt.Fatalf(\"rows mismatch at index: %d got: %+v\", i, row)\n\t\t}\n\t}\n}\n\nfunc assertRowsEqual[T any](t *testing.T, rows1, rows2 []T) {\n\tif !reflect.DeepEqual(rows1, rows2) {\n\t\tt.Error(\"rows mismatch\")\n\n\t\tt.Log(\"want:\")\n\t\tlogRows(t, rows1)\n\n\t\tt.Log(\"got:\")\n\t\tlogRows(t, rows2)\n\t}\n}\n\nfunc logRows[T any](t *testing.T, rows []T) {\n\tfor _, row := range rows {\n\t\tt.Logf(\". %#v\\n\", row)\n\t}\n}\n"
  },
  {
    "path": "parquet_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"io\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"strings\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/google/uuid\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nconst (\n\tbenchmarkNumRows     = 10_000\n\tbenchmarkRowsPerStep = 1000\n)\n\ntype benchmarkRowType struct {\n\tID    [16]byte `parquet:\"id,uuid\"`\n\tValue float64  `parquet:\"value\"`\n}\n\nfunc (row benchmarkRowType) generate(prng *rand.Rand) benchmarkRowType {\n\tprng.Read(row.ID[:])\n\trow.Value = prng.Float64()\n\treturn row\n}\n\ntype paddedBooleanColumn struct {\n\tValue bool\n\t_     [3]byte\n}\n\nfunc (row paddedBooleanColumn) generate(prng *rand.Rand) paddedBooleanColumn {\n\treturn paddedBooleanColumn{Value: prng.Int()%2 == 0}\n}\n\ntype booleanColumn struct {\n\tValue bool\n}\n\nfunc (row booleanColumn) generate(prng *rand.Rand) booleanColumn {\n\treturn booleanColumn{Value: prng.Int()%2 == 0}\n}\n\ntype int32Column struct {\n\tValue int32 `parquet:\",delta\"`\n}\n\nfunc (row int32Column) generate(prng *rand.Rand) int32Column {\n\treturn int32Column{Value: prng.Int31n(100)}\n}\n\ntype int64Column struct {\n\tValue int64 `parquet:\",delta\"`\n}\n\nfunc (row int64Column) generate(prng *rand.Rand) int64Column {\n\treturn int64Column{Value: prng.Int63n(100)}\n}\n\ntype int96Column struct {\n\tValue deprecated.Int96\n}\n\nfunc (row int96Column) generate(prng *rand.Rand) int96Column {\n\trow.Value[0] = prng.Uint32()\n\trow.Value[1] = prng.Uint32()\n\trow.Value[2] = prng.Uint32()\n\treturn row\n}\n\ntype floatColumn struct {\n\tValue float32\n}\n\nfunc (row floatColumn) generate(prng *rand.Rand) floatColumn {\n\treturn floatColumn{Value: prng.Float32()}\n}\n\ntype doubleColumn struct {\n\tValue float64\n}\n\nfunc (row doubleColumn) generate(prng *rand.Rand) doubleColumn {\n\treturn doubleColumn{Value: 
prng.Float64()}\n}\n\ntype byteArrayColumn struct {\n\tValue []byte\n}\n\nfunc (row byteArrayColumn) generate(prng *rand.Rand) byteArrayColumn {\n\trow.Value = make([]byte, prng.Intn(10))\n\tprng.Read(row.Value)\n\treturn row\n}\n\ntype fixedLenByteArrayColumn struct {\n\tValue [10]byte\n}\n\nfunc (row fixedLenByteArrayColumn) generate(prng *rand.Rand) fixedLenByteArrayColumn {\n\tprng.Read(row.Value[:])\n\treturn row\n}\n\ntype stringColumn struct {\n\tValue string\n}\n\nfunc (row stringColumn) generate(prng *rand.Rand) stringColumn {\n\treturn stringColumn{Value: generateString(prng, 10)}\n}\n\ntype indexedStringColumn struct {\n\tValue string `parquet:\",dict\"`\n}\n\nfunc (row indexedStringColumn) generate(prng *rand.Rand) indexedStringColumn {\n\treturn indexedStringColumn{Value: generateString(prng, 10)}\n}\n\ntype uuidColumn struct {\n\tValue uuid.UUID `parquet:\",delta\"`\n}\n\nfunc (row uuidColumn) generate(prng *rand.Rand) uuidColumn {\n\tprng.Read(row.Value[:])\n\treturn row\n}\n\ntype timeColumn struct {\n\tValue time.Time\n}\n\nfunc (row timeColumn) generate(prng *rand.Rand) timeColumn {\n\tt := time.Unix(0, prng.Int63()).UTC()\n\treturn timeColumn{Value: t}\n}\n\ntype timeInMillisColumn struct {\n\tValue time.Time `parquet:\",timestamp(millisecond)\"`\n}\n\nfunc (row timeInMillisColumn) generate(prng *rand.Rand) timeInMillisColumn {\n\tt := time.Unix(0, prng.Int63()).UTC()\n\treturn timeInMillisColumn{Value: t}\n}\n\ntype decimalColumn struct {\n\tValue int64 `parquet:\",decimal(0:3)\"`\n}\n\nfunc (row decimalColumn) generate(prng *rand.Rand) decimalColumn {\n\treturn decimalColumn{Value: prng.Int63()}\n}\n\ntype mapColumn struct {\n\tValue map[utf8string]int\n}\n\nfunc (row mapColumn) generate(prng *rand.Rand) mapColumn {\n\tn := prng.Intn(10)\n\trow.Value = make(map[utf8string]int, n)\n\tfor i := 0; i < n; i++ {\n\t\trow.Value[utf8string(generateString(prng, 8))] = prng.Intn(100)\n\t}\n\treturn row\n}\n\ntype addressBook struct {\n\tOwner            
 utf8string   `parquet:\",plain\"`\n\tOwnerPhoneNumbers []utf8string `parquet:\",plain\"`\n\tContacts          []contact\n}\n\ntype contact struct {\n\tName        utf8string `parquet:\",plain\"`\n\tPhoneNumber utf8string `parquet:\",plain\"`\n}\n\nfunc (row contact) generate(prng *rand.Rand) contact {\n\treturn contact{\n\t\tName:        utf8string(generateString(prng, 16)),\n\t\tPhoneNumber: utf8string(generateString(prng, 10)),\n\t}\n}\n\ntype optionalInt32Column struct {\n\tValue int32 `parquet:\",optional\"`\n}\n\nfunc (row optionalInt32Column) generate(prng *rand.Rand) optionalInt32Column {\n\treturn optionalInt32Column{Value: prng.Int31n(100)}\n}\n\ntype repeatedInt32Column struct {\n\tValues []int32\n}\n\nfunc (row repeatedInt32Column) generate(prng *rand.Rand) repeatedInt32Column {\n\trow.Values = make([]int32, prng.Intn(10))\n\tfor i := range row.Values {\n\t\trow.Values[i] = prng.Int31n(10)\n\t}\n\treturn row\n}\n\ntype listColumn2 struct {\n\tValue utf8string `parquet:\",optional\"`\n}\n\ntype listColumn1 struct {\n\tList2 []listColumn2 `parquet:\",list\"`\n}\n\ntype listColumn0 struct {\n\tList1 []listColumn1 `parquet:\",list\"`\n}\n\ntype nestedListColumn1 struct {\n\tLevel3 []utf8string `parquet:\"level3\"`\n}\n\ntype nestedListColumn struct {\n\tLevel1 []nestedListColumn1 `parquet:\"level1\"`\n\tLevel2 []utf8string        `parquet:\"level2\"`\n}\n\ntype utf8string string\n\nfunc (utf8string) Generate(rand *rand.Rand, size int) reflect.Value {\n\tconst characters = \"abcdefghijklmnopqrstuvwxyz1234567890\"\n\tconst maxSize = 10\n\tif size > maxSize {\n\t\tsize = maxSize\n\t}\n\tn := rand.Intn(size)\n\tb := make([]byte, n)\n\tfor i := range b {\n\t\tb[i] = characters[rand.Intn(len(characters))]\n\t}\n\treturn reflect.ValueOf(utf8string(b))\n}\n\ntype Contact struct {\n\tName        string `parquet:\"name\"`\n\tPhoneNumber string `parquet:\"phoneNumber,optional,zstd\"`\n}\n\ntype AddressBook struct {\n\tOwner             string    
`parquet:\"owner,zstd\"`\n\tOwnerPhoneNumbers []string  `parquet:\"ownerPhoneNumbers,gzip\"`\n\tContacts          []Contact `parquet:\"contacts\"`\n}\n\nfunc forEachLeafColumn(col *parquet.Column, do func(*parquet.Column) error) error {\n\tchildren := col.Columns()\n\n\tif len(children) == 0 {\n\t\treturn do(col)\n\t}\n\n\tfor _, child := range children {\n\t\tif err := forEachLeafColumn(child, do); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\treturn nil\n}\n\nfunc forEachPage(pages parquet.PageReader, do func(parquet.Page) error) error {\n\tdoAndReleasePage := func(page parquet.Page) error {\n\t\tdefer parquet.Release(page)\n\t\treturn do(page)\n\t}\n\n\tfor {\n\t\tp, err := pages.ReadPage()\n\t\tif err != nil {\n\t\t\tif err == io.EOF {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn err\n\t\t}\n\t\tif err := doAndReleasePage(p); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n}\n\nfunc forEachValue(values parquet.ValueReader, do func(parquet.Value) error) error {\n\tbuffer := [3]parquet.Value{}\n\tfor {\n\t\tn, err := values.ReadValues(buffer[:])\n\t\tfor _, v := range buffer[:n] {\n\t\t\tif err := do(v); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t\tif err != nil {\n\t\t\tif err == io.EOF {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn err\n\t\t}\n\t}\n}\n\nfunc forEachColumnPage(col *parquet.Column, do func(*parquet.Column, parquet.Page) error) error {\n\treturn forEachLeafColumn(col, func(leaf *parquet.Column) error {\n\t\tpages := leaf.Pages()\n\t\tdefer pages.Close()\n\t\treturn forEachPage(pages, func(page parquet.Page) error { return do(leaf, page) })\n\t})\n}\n\nfunc forEachColumnValue(col *parquet.Column, do func(*parquet.Column, parquet.Value) error) error {\n\treturn forEachColumnPage(col, func(leaf *parquet.Column, page parquet.Page) error {\n\t\treturn forEachValue(page.Values(), func(value parquet.Value) error { return do(leaf, value) })\n\t})\n}\n\nfunc forEachColumnChunk(file *parquet.File, do func(*parquet.Column, parquet.ColumnChunk) error) error 
{\n\treturn forEachLeafColumn(file.Root(), func(leaf *parquet.Column) error {\n\t\tfor _, rowGroup := range file.RowGroups() {\n\t\t\tif err := do(leaf, rowGroup.ColumnChunks()[leaf.Index()]); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t\treturn nil\n\t})\n}\n\nfunc createParquetFile(rows rows, options ...parquet.WriterOption) (*parquet.File, error) {\n\tbuffer := new(bytes.Buffer)\n\n\tif err := writeParquetFile(buffer, rows, options...); err != nil {\n\t\treturn nil, err\n\t}\n\n\treader := bytes.NewReader(buffer.Bytes())\n\treturn parquet.OpenFile(reader, reader.Size())\n}\n\nfunc writeParquetFile(w io.Writer, rows rows, options ...parquet.WriterOption) error {\n\twriter := parquet.NewWriter(w, options...)\n\n\tfor _, row := range rows {\n\t\tif err := writer.Write(row); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\treturn writer.Close()\n}\n\nfunc writeParquetFileWithBuffer(w io.Writer, rows rows, options ...parquet.WriterOption) error {\n\tbuffer := parquet.NewBuffer()\n\tfor _, row := range rows {\n\t\tif err := buffer.Write(row); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\twriter := parquet.NewWriter(w, options...)\n\tnumRows, err := copyRowsAndClose(writer, buffer.Rows())\n\tif err != nil {\n\t\treturn err\n\t}\n\tif numRows != int64(len(rows)) {\n\t\treturn fmt.Errorf(\"wrong number of rows written from buffer to file: want=%d got=%d\", len(rows), numRows)\n\t}\n\treturn writer.Close()\n}\n\ntype rows []interface{}\n\nfunc makeRows(any interface{}) rows {\n\tif v, ok := any.([]interface{}); ok {\n\t\treturn rows(v)\n\t}\n\tvalue := reflect.ValueOf(any)\n\tslice := make([]interface{}, value.Len())\n\tfor i := range slice {\n\t\tslice[i] = value.Index(i).Interface()\n\t}\n\treturn rows(slice)\n}\n\nfunc randValueFuncOf(t parquet.Type) func(*rand.Rand) parquet.Value {\n\tswitch k := t.Kind(); k {\n\tcase parquet.Boolean:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\treturn parquet.ValueOf(r.Float64() < 0.5)\n\t\t}\n\n\tcase 
parquet.Int32:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\treturn parquet.ValueOf(r.Int31())\n\t\t}\n\n\tcase parquet.Int64:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\treturn parquet.ValueOf(r.Int63())\n\t\t}\n\n\tcase parquet.Int96:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\treturn parquet.ValueOf(deprecated.Int96{\n\t\t\t\t0: r.Uint32(),\n\t\t\t\t1: r.Uint32(),\n\t\t\t\t2: r.Uint32(),\n\t\t\t})\n\t\t}\n\n\tcase parquet.Float:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\treturn parquet.ValueOf(r.Float32())\n\t\t}\n\n\tcase parquet.Double:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\treturn parquet.ValueOf(r.Float64())\n\t\t}\n\n\tcase parquet.ByteArray:\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\tn := r.Intn(49) + 1\n\t\t\tb := make([]byte, n)\n\t\t\tconst characters = \"1234567890qwertyuiopasdfghjklzxcvbnm \"\n\t\t\tfor i := range b {\n\t\t\t\tb[i] = characters[r.Intn(len(characters))]\n\t\t\t}\n\t\t\treturn parquet.ValueOf(b)\n\t\t}\n\n\tcase parquet.FixedLenByteArray:\n\t\tarrayType := reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0)))\n\t\treturn func(r *rand.Rand) parquet.Value {\n\t\t\tb := make([]byte, arrayType.Len())\n\t\t\tr.Read(b)\n\t\t\tv := reflect.New(arrayType).Elem()\n\t\t\treflect.Copy(v, reflect.ValueOf(b))\n\t\t\treturn parquet.ValueOf(v.Interface())\n\t\t}\n\n\tdefault:\n\t\tpanic(\"NOT IMPLEMENTED\")\n\t}\n}\n\nfunc copyRowsAndClose(w parquet.RowWriter, r parquet.Rows) (int64, error) {\n\tdefer r.Close()\n\treturn parquet.CopyRows(w, r)\n}\n\nfunc benchmarkRowsPerSecond(b *testing.B, f func() int) {\n\tb.ResetTimer()\n\tstart := time.Now()\n\tnumRows := int64(0)\n\n\tfor i := 0; i < b.N; i++ {\n\t\tn := f()\n\t\tnumRows += int64(n)\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(numRows)/seconds, \"row/s\")\n}\n\nfunc generateString(r *rand.Rand, n int) string {\n\tconst characters = \"1234567890qwertyuiopasdfghjklzxcvbnm\"\n\tb := 
new(strings.Builder)\n\tfor i := 0; i < n; i++ {\n\t\tb.WriteByte(characters[r.Intn(len(characters))])\n\t}\n\treturn b.String()\n}\n\nvar quickCheckConfig = quick.Config{\n\tSizes: []int{\n\t\t0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n\t\t10, 20, 30, 40, 50, 123,\n\t\t4096 + 1,\n\t},\n}\n\nfunc quickCheck(f interface{}) error {\n\treturn quickCheckConfig.Check(f)\n}\n"
  },
  {
    "path": "print.go",
    "content": "package parquet\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"strconv\"\n\t\"strings\"\n\n\t\"github.com/olekukonko/tablewriter\"\n)\n\nfunc PrintSchema(w io.Writer, name string, node Node) error {\n\treturn PrintSchemaIndent(w, name, node, \"\\t\", \"\\n\")\n}\n\nfunc PrintSchemaIndent(w io.Writer, name string, node Node, pattern, newline string) error {\n\tpw := &printWriter{writer: w}\n\tpi := &printIndent{}\n\n\tif node.Leaf() {\n\t\tprintSchemaWithIndent(pw, \"\", node, pi)\n\t} else {\n\t\tpw.WriteString(\"message \")\n\n\t\tif name == \"\" {\n\t\t\tpw.WriteString(\"{\")\n\t\t} else {\n\t\t\tpw.WriteString(name)\n\t\t\tpw.WriteString(\" {\")\n\t\t}\n\n\t\tpi.pattern = pattern\n\t\tpi.newline = newline\n\t\tpi.repeat = 1\n\t\tpi.writeNewLine(pw)\n\n\t\tfor _, field := range node.Fields() {\n\t\t\tprintSchemaWithIndent(pw, field.Name(), field, pi)\n\t\t\tpi.writeNewLine(pw)\n\t\t}\n\n\t\tpw.WriteString(\"}\")\n\t}\n\n\treturn pw.err\n}\n\nfunc printSchemaWithIndent(w io.StringWriter, name string, node Node, indent *printIndent) {\n\tindent.writeTo(w)\n\n\tswitch {\n\tcase node.Optional():\n\t\tw.WriteString(\"optional \")\n\tcase node.Repeated():\n\t\tw.WriteString(\"repeated \")\n\tdefault:\n\t\tw.WriteString(\"required \")\n\t}\n\n\tif node.Leaf() {\n\t\tt := node.Type()\n\t\tswitch t.Kind() {\n\t\tcase Boolean:\n\t\t\tw.WriteString(\"boolean\")\n\t\tcase Int32:\n\t\t\tw.WriteString(\"int32\")\n\t\tcase Int64:\n\t\t\tw.WriteString(\"int64\")\n\t\tcase Int96:\n\t\t\tw.WriteString(\"int96\")\n\t\tcase Float:\n\t\t\tw.WriteString(\"float\")\n\t\tcase Double:\n\t\t\tw.WriteString(\"double\")\n\t\tcase ByteArray:\n\t\t\tw.WriteString(\"binary\")\n\t\tcase FixedLenByteArray:\n\t\t\tw.WriteString(\"fixed_len_byte_array(\")\n\t\t\tw.WriteString(strconv.Itoa(t.Length()))\n\t\t\tw.WriteString(\")\")\n\t\tdefault:\n\t\t\tw.WriteString(\"<?>\")\n\t\t}\n\n\t\tif name != \"\" {\n\t\t\tw.WriteString(\" \")\n\t\t\tw.WriteString(name)\n\t\t}\n\n\t\tif 
annotation := annotationOf(node); annotation != \"\" {\n\t\t\tw.WriteString(\" (\")\n\t\t\tw.WriteString(annotation)\n\t\t\tw.WriteString(\")\")\n\t\t}\n\n\t\tw.WriteString(\";\")\n\t} else {\n\t\tw.WriteString(\"group\")\n\n\t\tif name != \"\" {\n\t\t\tw.WriteString(\" \")\n\t\t\tw.WriteString(name)\n\t\t}\n\n\t\tif annotation := annotationOf(node); annotation != \"\" {\n\t\t\tw.WriteString(\" (\")\n\t\t\tw.WriteString(annotation)\n\t\t\tw.WriteString(\")\")\n\t\t}\n\n\t\tw.WriteString(\" {\")\n\t\tindent.writeNewLine(w)\n\t\tindent.push()\n\n\t\tfor _, field := range node.Fields() {\n\t\t\tprintSchemaWithIndent(w, field.Name(), field, indent)\n\t\t\tindent.writeNewLine(w)\n\t\t}\n\n\t\tindent.pop()\n\t\tindent.writeTo(w)\n\t\tw.WriteString(\"}\")\n\t}\n}\n\nfunc annotationOf(node Node) string {\n\tif logicalType := node.Type().LogicalType(); logicalType != nil {\n\t\treturn logicalType.String()\n\t}\n\treturn \"\"\n}\n\ntype printIndent struct {\n\tpattern string\n\tnewline string\n\trepeat  int\n}\n\nfunc (i *printIndent) push() {\n\ti.repeat++\n}\n\nfunc (i *printIndent) pop() {\n\ti.repeat--\n}\n\nfunc (i *printIndent) writeTo(w io.StringWriter) {\n\tif i.pattern != \"\" {\n\t\tfor n := i.repeat; n > 0; n-- {\n\t\t\tw.WriteString(i.pattern)\n\t\t}\n\t}\n}\n\nfunc (i *printIndent) writeNewLine(w io.StringWriter) {\n\tif i.newline != \"\" {\n\t\tw.WriteString(i.newline)\n\t}\n}\n\ntype printWriter struct {\n\twriter io.Writer\n\terr    error\n}\n\nfunc (w *printWriter) Write(b []byte) (int, error) {\n\tif w.err != nil {\n\t\treturn 0, w.err\n\t}\n\tn, err := w.writer.Write(b)\n\tif err != nil {\n\t\tw.err = err\n\t}\n\treturn n, err\n}\n\nfunc (w *printWriter) WriteString(s string) (int, error) {\n\tif w.err != nil {\n\t\treturn 0, w.err\n\t}\n\tn, err := io.WriteString(w.writer, s)\n\tif err != nil {\n\t\tw.err = err\n\t}\n\treturn n, err\n}\n\nvar (\n\t_ io.StringWriter = (*printWriter)(nil)\n)\n\nfunc sprint(name string, node Node) string {\n\ts := 
new(strings.Builder)\n\tPrintSchema(s, name, node)\n\treturn s.String()\n}\n\nfunc PrintRowGroup(w io.Writer, rowGroup RowGroup) error {\n\tschema := rowGroup.Schema()\n\tpw := &printWriter{writer: w}\n\ttw := tablewriter.NewWriter(pw)\n\n\tcolumns := schema.Columns()\n\theader := make([]string, len(columns))\n\tfooter := make([]string, len(columns))\n\talignment := make([]int, len(columns))\n\n\tfor i, column := range columns {\n\t\tleaf, _ := schema.Lookup(column...)\n\t\tcolumnType := leaf.Node.Type()\n\n\t\theader[i] = strings.Join(column, \".\")\n\t\tfooter[i] = columnType.String()\n\n\t\tswitch columnType.Kind() {\n\t\tcase ByteArray:\n\t\t\talignment[i] = tablewriter.ALIGN_LEFT\n\t\tdefault:\n\t\t\talignment[i] = tablewriter.ALIGN_RIGHT\n\t\t}\n\t}\n\n\trowbuf := make([]Row, defaultRowBufferSize)\n\tcells := make([]string, 0, len(columns))\n\trows := rowGroup.Rows()\n\tdefer rows.Close()\n\n\tfor {\n\t\tn, err := rows.ReadRows(rowbuf)\n\n\t\tfor _, row := range rowbuf[:n] {\n\t\t\tcells = cells[:0]\n\n\t\t\tfor _, value := range row {\n\t\t\t\tcolumnIndex := value.Column()\n\n\t\t\t\tfor len(cells) <= columnIndex {\n\t\t\t\t\tcells = append(cells, \"\")\n\t\t\t\t}\n\n\t\t\t\tif cells[columnIndex] == \"\" {\n\t\t\t\t\tcells[columnIndex] = value.String()\n\t\t\t\t} else {\n\t\t\t\t\tcells[columnIndex] += \",\" + value.String()\n\t\t\t\t\talignment[columnIndex] = tablewriter.ALIGN_LEFT\n\t\t\t\t}\n\t\t\t}\n\n\t\t\ttw.Append(cells)\n\t\t}\n\n\t\tif err != nil {\n\t\t\tif errors.Is(err, io.EOF) {\n\t\t\t\tbreak\n\t\t\t}\n\t\t\treturn err\n\t\t}\n\t}\n\n\ttw.SetAutoFormatHeaders(false)\n\ttw.SetColumnAlignment(alignment)\n\ttw.SetHeaderAlignment(tablewriter.ALIGN_LEFT)\n\ttw.SetFooterAlignment(tablewriter.ALIGN_LEFT)\n\ttw.SetHeader(header)\n\ttw.SetFooter(footer)\n\ttw.Render()\n\n\tfmt.Fprintf(pw, \"%d rows\\n\\n\", rowGroup.NumRows())\n\treturn pw.err\n}\n\nfunc PrintColumnChunk(w io.Writer, columnChunk ColumnChunk) error {\n\tpw := &printWriter{writer: 
w}\n\tpw.WriteString(columnChunk.Type().String())\n\tpw.WriteString(\"\\n--------------------------------------------------------------------------------\\n\")\n\n\tvalues := [42]Value{}\n\tpages := columnChunk.Pages()\n\tnumPages, numValues := int64(0), int64(0)\n\n\tdefer pages.Close()\n\tfor {\n\t\tp, err := pages.ReadPage()\n\t\tif err != nil {\n\t\t\tif !errors.Is(err, io.EOF) {\n\t\t\t\treturn err\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\n\t\tnumPages++\n\t\tn := p.NumValues()\n\t\tif n == 0 {\n\t\t\tfmt.Fprintf(pw, \"*** page %d, no values ***\\n\", numPages)\n\t\t} else {\n\t\t\tfmt.Fprintf(pw, \"*** page %d, values %d to %d ***\\n\", numPages, numValues+1, numValues+n)\n\t\t\tprintPage(w, p, values[:], numValues+1)\n\t\t\tnumValues += n\n\t\t}\n\n\t\tpw.WriteString(\"\\n\")\n\t}\n\n\treturn pw.err\n}\n\nfunc PrintPage(w io.Writer, page Page) error {\n\treturn printPage(w, page, make([]Value, 42), 0)\n}\n\nfunc printPage(w io.Writer, page Page, values []Value, numValues int64) error {\n\tr := page.Values()\n\tfor {\n\t\tn, err := r.ReadValues(values[:])\n\t\tfor i, v := range values[:n] {\n\t\t\t_, err := fmt.Fprintf(w, \"value %d: %+v\\n\", numValues+int64(i), v)\n\t\t\tif err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t\tif err != nil {\n\t\t\tif errors.Is(err, io.EOF) {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn err\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "print_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestPrintSchema(t *testing.T) {\n\ttests := []struct {\n\t\tnode  parquet.Node\n\t\tprint string\n\t}{\n\t\t{\n\t\t\tnode: parquet.Group{\"on\": parquet.Leaf(parquet.BooleanType)},\n\t\t\tprint: `message Test {\n\trequired boolean on;\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"name\": parquet.String()},\n\t\t\tprint: `message Test {\n\trequired binary name (STRING);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"uuid\": parquet.UUID()},\n\t\t\tprint: `message Test {\n\trequired fixed_len_byte_array(16) uuid (UUID);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"enum\": parquet.Enum()},\n\t\t\tprint: `message Test {\n\trequired binary enum (ENUM);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"json\": parquet.JSON()},\n\t\t\tprint: `message Test {\n\trequired binary json (JSON);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"bson\": parquet.BSON()},\n\t\t\tprint: `message Test {\n\trequired binary bson (BSON);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"name\": parquet.Optional(parquet.String())},\n\t\t\tprint: `message Test {\n\toptional binary name (STRING);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"name\": parquet.Repeated(parquet.String())},\n\t\t\tprint: `message Test {\n\trepeated binary name (STRING);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Int(8)},\n\t\t\tprint: `message Test {\n\trequired int32 age (INT(8,true));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Int(16)},\n\t\t\tprint: `message Test {\n\trequired int32 age (INT(16,true));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Int(32)},\n\t\t\tprint: `message Test {\n\trequired int32 age (INT(32,true));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Int(64)},\n\t\t\tprint: `message Test {\n\trequired int64 age 
(INT(64,true));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Uint(8)},\n\t\t\tprint: `message Test {\n\trequired int32 age (INT(8,false));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Uint(16)},\n\t\t\tprint: `message Test {\n\trequired int32 age (INT(16,false));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Uint(32)},\n\t\t\tprint: `message Test {\n\trequired int32 age (INT(32,false));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"age\": parquet.Uint(64)},\n\t\t\tprint: `message Test {\n\trequired int64 age (INT(64,false));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"ratio\": parquet.Leaf(parquet.FloatType)},\n\t\t\tprint: `message Test {\n\trequired float ratio;\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"ratio\": parquet.Leaf(parquet.DoubleType)},\n\t\t\tprint: `message Test {\n\trequired double ratio;\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"cost\": parquet.Decimal(0, 9, parquet.Int32Type)},\n\t\t\tprint: `message Test {\n\trequired int32 cost (DECIMAL(9,0));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"cost\": parquet.Decimal(0, 18, parquet.Int64Type)},\n\t\t\tprint: `message Test {\n\trequired int64 cost (DECIMAL(18,0));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"date\": parquet.Date()},\n\t\t\tprint: `message Test {\n\trequired int32 date (DATE);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"time\": parquet.Time(parquet.Millisecond)},\n\t\t\tprint: `message Test {\n\trequired int32 time (TIME(isAdjustedToUTC=true,unit=MILLIS));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"time\": parquet.Time(parquet.Microsecond)},\n\t\t\tprint: `message Test {\n\trequired int64 time (TIME(isAdjustedToUTC=true,unit=MICROS));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"time\": parquet.Time(parquet.Nanosecond)},\n\t\t\tprint: `message Test {\n\trequired int64 time (TIME(isAdjustedToUTC=true,unit=NANOS));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: 
parquet.Group{\"timestamp\": parquet.Timestamp(parquet.Millisecond)},\n\t\t\tprint: `message Test {\n\trequired int64 timestamp (TIMESTAMP(isAdjustedToUTC=true,unit=MILLIS));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"timestamp\": parquet.Timestamp(parquet.Microsecond)},\n\t\t\tprint: `message Test {\n\trequired int64 timestamp (TIMESTAMP(isAdjustedToUTC=true,unit=MICROS));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"timestamp\": parquet.Timestamp(parquet.Nanosecond)},\n\t\t\tprint: `message Test {\n\trequired int64 timestamp (TIMESTAMP(isAdjustedToUTC=true,unit=NANOS));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\"names\": parquet.List(parquet.String())},\n\t\t\tprint: `message Test {\n\trequired group names (LIST) {\n\t\trepeated group list {\n\t\t\trequired binary element (STRING);\n\t\t}\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\n\t\t\t\t\"keys\": parquet.List(\n\t\t\t\t\tparquet.Group{\n\t\t\t\t\t\t\"key\":   parquet.String(),\n\t\t\t\t\t\t\"value\": parquet.String(),\n\t\t\t\t\t},\n\t\t\t\t),\n\t\t\t},\n\t\t\tprint: `message Test {\n\trequired group keys (LIST) {\n\t\trepeated group list {\n\t\t\trequired group element {\n\t\t\t\trequired binary key (STRING);\n\t\t\t\trequired binary value (STRING);\n\t\t\t}\n\t\t}\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tnode: parquet.Group{\n\t\t\t\t\"pairs\": parquet.Map(\n\t\t\t\t\tparquet.String(),\n\t\t\t\t\tparquet.String(),\n\t\t\t\t),\n\t\t\t},\n\t\t\tprint: `message Test {\n\trequired group pairs (MAP) {\n\t\trepeated group key_value {\n\t\t\trequired binary key (STRING);\n\t\t\trequired binary value (STRING);\n\t\t}\n\t}\n}`,\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tbuf := new(strings.Builder)\n\n\t\t\tif err := parquet.PrintSchema(buf, \"Test\", test.node); err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tif buf.String() != test.print {\n\t\t\t\tt.Errorf(\"\\nexpected:\\n\\n%s\\n\\nfound:\\n\\n%s\\n\", test.print, 
buf)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "reader.go",
    "content": "package parquet\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"reflect\"\n)\n\n// Deprecated: A Reader reads Go values from parquet files.\n//\n// This example showcases a typical use of parquet readers:\n//\n//\treader := parquet.NewReader(file)\n//\trows := []RowType{}\n//\tfor {\n//\t\trow := RowType{}\n//\t\terr := reader.Read(&row)\n//\t\tif err != nil {\n//\t\t\tif err == io.EOF {\n//\t\t\t\tbreak\n//\t\t\t}\n//\t\t\t...\n//\t\t}\n//\t\trows = append(rows, row)\n//\t}\n//\tif err := reader.Close(); err != nil {\n//\t\t...\n//\t}\n//\n// For programs building with Go 1.18 or later, the GenericReader[T] type\n// supersedes this one.\ntype Reader struct {\n\tseen     reflect.Type\n\tfile     reader\n\tread     reader\n\trowIndex int64\n\trowbuf   []Row\n}\n\n// NewReader constructs a parquet reader reading rows from the given\n// io.ReaderAt.\n//\n// In order to read parquet rows, the io.ReaderAt must be converted to a\n// parquet.File. If r is already a parquet.File it is used directly; otherwise,\n// the io.ReaderAt value is expected to either have a `Size() int64` method or\n// implement io.Seeker in order to determine its size.\n//\n// The function panics if the reader configuration is invalid. 
Programs that\n// cannot guarantee the validity of the options passed to NewReader should\n// construct the reader configuration independently prior to calling this\n// function:\n//\n//\tconfig, err := parquet.NewReaderConfig(options...)\n//\tif err != nil {\n//\t\t// handle the configuration error\n//\t\t...\n//\t} else {\n//\t\t// this call to create a reader is guaranteed not to panic\n//\t\treader := parquet.NewReader(input, config)\n//\t\t...\n//\t}\nfunc NewReader(input io.ReaderAt, options ...ReaderOption) *Reader {\n\tc, err := NewReaderConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tf, err := openFile(input)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tr := &Reader{\n\t\tfile: reader{\n\t\t\tschema:   f.schema,\n\t\t\trowGroup: fileRowGroupOf(f),\n\t\t},\n\t}\n\n\tif c.Schema != nil {\n\t\tr.file.schema = c.Schema\n\t\tr.file.rowGroup = convertRowGroupTo(r.file.rowGroup, c.Schema)\n\t}\n\n\tr.read.init(r.file.schema, r.file.rowGroup)\n\treturn r\n}\n\nfunc openFile(input io.ReaderAt) (*File, error) {\n\tf, _ := input.(*File)\n\tif f != nil {\n\t\treturn f, nil\n\t}\n\tn, err := sizeOf(input)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\treturn OpenFile(input, n)\n}\n\nfunc fileRowGroupOf(f *File) RowGroup {\n\tswitch rowGroups := f.RowGroups(); len(rowGroups) {\n\tcase 0:\n\t\treturn newEmptyRowGroup(f.Schema())\n\tcase 1:\n\t\treturn rowGroups[0]\n\tdefault:\n\t\t// TODO: should we attempt to merge the row groups via MergeRowGroups\n\t\t// to preserve the global order of sorting columns within the file?\n\t\treturn newMultiRowGroup(f.config.ReadMode, rowGroups...)\n\t}\n}\n\n// NewRowGroupReader constructs a new Reader which reads rows from the RowGroup\n// passed as argument.\nfunc NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader {\n\tc, err := NewReaderConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tif c.Schema != nil {\n\t\trowGroup = convertRowGroupTo(rowGroup, c.Schema)\n\t}\n\n\tr := 
&Reader{\n\t\tfile: reader{\n\t\t\tschema:   rowGroup.Schema(),\n\t\t\trowGroup: rowGroup,\n\t\t},\n\t}\n\n\tr.read.init(r.file.schema, r.file.rowGroup)\n\treturn r\n}\n\nfunc convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup {\n\tif rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) {\n\t\tconv, err := Convert(schema, rowGroupSchema)\n\t\tif err != nil {\n\t\t\t// TODO: this looks like something we should not be panicking on,\n\t\t\t// but the current NewReader API does not offer a mechanism to\n\t\t\t// report errors.\n\t\t\tpanic(err)\n\t\t}\n\t\trowGroup = ConvertRowGroup(rowGroup, conv)\n\t}\n\treturn rowGroup\n}\n\nfunc sizeOf(r io.ReaderAt) (int64, error) {\n\tswitch f := r.(type) {\n\tcase interface{ Size() int64 }:\n\t\treturn f.Size(), nil\n\tcase io.Seeker:\n\t\toff, err := f.Seek(0, io.SeekCurrent)\n\t\tif err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\tend, err := f.Seek(0, io.SeekEnd)\n\t\tif err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\t_, err = f.Seek(off, io.SeekStart)\n\t\treturn end, err\n\tdefault:\n\t\treturn 0, fmt.Errorf(\"cannot determine length of %T\", r)\n\t}\n}\n\n// Reset repositions the reader at the beginning of the underlying parquet file.\nfunc (r *Reader) Reset() {\n\tr.file.Reset()\n\tr.read.Reset()\n\tr.rowIndex = 0\n\tclearRows(r.rowbuf)\n}\n\n// Read reads the next row from r. 
The type of the row must match the schema\n// of the underlying parquet file or an error will be returned.\n//\n// The method returns io.EOF when no more rows can be read from r.\nfunc (r *Reader) Read(row interface{}) error {\n\tif rowType := dereference(reflect.TypeOf(row)); rowType.Kind() == reflect.Struct {\n\t\tif r.seen != rowType {\n\t\t\tif err := r.updateReadSchema(rowType); err != nil {\n\t\t\t\treturn fmt.Errorf(\"cannot read parquet row into go value of type %T: %w\", row, err)\n\t\t\t}\n\t\t}\n\t}\n\n\tif err := r.read.SeekToRow(r.rowIndex); err != nil {\n\t\tif errors.Is(err, io.ErrClosedPipe) {\n\t\t\treturn io.EOF\n\t\t}\n\t\treturn fmt.Errorf(\"seeking reader to row %d: %w\", r.rowIndex, err)\n\t}\n\n\tif cap(r.rowbuf) == 0 {\n\t\tr.rowbuf = make([]Row, 1)\n\t} else {\n\t\tr.rowbuf = r.rowbuf[:1]\n\t}\n\n\tn, err := r.read.ReadRows(r.rowbuf[:])\n\tif n == 0 {\n\t\treturn err\n\t}\n\n\tr.rowIndex++\n\treturn r.read.schema.Reconstruct(row, r.rowbuf[0])\n}\n\nfunc (r *Reader) updateReadSchema(rowType reflect.Type) error {\n\tschema := schemaOf(rowType)\n\n\tif nodesAreEqual(schema, r.file.schema) {\n\t\tr.read.init(schema, r.file.rowGroup)\n\t} else {\n\t\tconv, err := Convert(schema, r.file.schema)\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\t\tr.read.init(schema, ConvertRowGroup(r.file.rowGroup, conv))\n\t}\n\n\tr.seen = rowType\n\treturn nil\n}\n\n// ReadRows reads the next rows from r into the given Row buffer.\n//\n// The returned values are laid out in the order expected by the\n// parquet.(*Schema).Reconstruct method.\n//\n// The method returns io.EOF when no more rows can be read from r.\nfunc (r *Reader) ReadRows(rows []Row) (int, error) {\n\tif err := r.file.SeekToRow(r.rowIndex); err != nil {\n\t\treturn 0, err\n\t}\n\tn, err := r.file.ReadRows(rows)\n\tr.rowIndex += int64(n)\n\treturn n, err\n}\n\n// Schema returns the schema of rows read by r.\nfunc (r *Reader) Schema() *Schema { return r.file.schema }\n\n// NumRows returns the number 
of rows that can be read from r.\nfunc (r *Reader) NumRows() int64 { return r.file.rowGroup.NumRows() }\n\n// SeekToRow positions r at the given row index.\nfunc (r *Reader) SeekToRow(rowIndex int64) error {\n\tif err := r.file.SeekToRow(rowIndex); err != nil {\n\t\treturn err\n\t}\n\tr.rowIndex = rowIndex\n\treturn nil\n}\n\n// Close closes the reader, preventing more rows from being read.\nfunc (r *Reader) Close() error {\n\tif err := r.read.Close(); err != nil {\n\t\treturn err\n\t}\n\tif err := r.file.Close(); err != nil {\n\t\treturn err\n\t}\n\treturn nil\n}\n\n// reader is a subtype used in the implementation of Reader to support the two\n// use cases of either reading rows calling the ReadRow method (where full rows\n// are read from the underlying parquet file), or calling the Read method to\n// read rows into Go values, potentially doing partial reads on a subset of the\n// columns due to using a converted row group view.\ntype reader struct {\n\tschema   *Schema\n\trowGroup RowGroup\n\trows     Rows\n\trowIndex int64\n}\n\nfunc (r *reader) init(schema *Schema, rowGroup RowGroup) {\n\tr.schema = schema\n\tr.rowGroup = rowGroup\n\tr.Reset()\n}\n\nfunc (r *reader) Reset() {\n\tr.rowIndex = 0\n\n\tif rows, ok := r.rows.(interface{ Reset() }); ok {\n\t\t// This optimization works for the common case where the underlying type\n\t\t// of the Rows instance is rowGroupRows, which should be true in most\n\t\t// cases since even external implementations of the RowGroup interface\n\t\t// can construct values of this type via the NewRowGroupRowReader\n\t\t// function.\n\t\t//\n\t\t// Foreign implementations of the Rows interface may also define a Reset\n\t\t// method in order to participate in this optimization.\n\t\trows.Reset()\n\t\treturn\n\t}\n\n\tif r.rows != nil {\n\t\tr.rows.Close()\n\t\tr.rows = nil\n\t}\n}\n\nfunc (r *reader) ReadRows(rows []Row) (int, error) {\n\tif r.rowGroup == nil {\n\t\treturn 0, io.EOF\n\t}\n\tif r.rows == nil {\n\t\tr.rows = 
r.rowGroup.Rows()\n\t\tif r.rowIndex > 0 {\n\t\t\tif err := r.rows.SeekToRow(r.rowIndex); err != nil {\n\t\t\t\treturn 0, err\n\t\t\t}\n\t\t}\n\t}\n\tn, err := r.rows.ReadRows(rows)\n\tr.rowIndex += int64(n)\n\treturn n, err\n}\n\nfunc (r *reader) SeekToRow(rowIndex int64) error {\n\tif r.rowGroup == nil {\n\t\treturn io.ErrClosedPipe\n\t}\n\tif rowIndex != r.rowIndex {\n\t\tif r.rows != nil {\n\t\t\tif err := r.rows.SeekToRow(rowIndex); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t\tr.rowIndex = rowIndex\n\t}\n\treturn nil\n}\n\nfunc (r *reader) Close() (err error) {\n\tr.rowGroup = nil\n\tif r.rows != nil {\n\t\terr = r.rows.Close()\n\t}\n\treturn err\n}\n\nvar (\n\t_ Rows                = (*Reader)(nil)\n\t_ RowReaderWithSchema = (*Reader)(nil)\n\n\t_ RowReader = (*reader)(nil)\n\t_ RowSeeker = (*reader)(nil)\n)\n"
  },
  {
    "path": "reader_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"io\"\n\t\"reflect\"\n)\n\n// GenericReader is similar to a Reader but uses a type parameter to define the\n// Go type representing the schema of rows being read.\n//\n// See GenericWriter for details about the benefits over the classic Reader API.\ntype GenericReader[T any] struct {\n\tbase Reader\n\tread readFunc[T]\n}\n\n// NewGenericReader is like NewReader but returns GenericReader[T] suited to write\n// rows of Go type T.\n//\n// The type parameter T should be a map, struct, or any. Any other types will\n// cause a panic at runtime. Type checking is a lot more effective when the\n// generic parameter is a struct type, using map and interface types is somewhat\n// similar to using a Writer.\n//\n// If the option list may explicitly declare a schema, it must be compatible\n// with the schema generated from T.\nfunc NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *GenericReader[T] {\n\tc, err := NewReaderConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tf, err := openFile(input)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\trowGroup := fileRowGroupOf(f)\n\n\tt := typeOf[T]()\n\tif c.Schema == nil {\n\t\tif t == nil {\n\t\t\tc.Schema = rowGroup.Schema()\n\t\t} else {\n\t\t\tc.Schema = schemaOf(dereference(t))\n\t\t}\n\t}\n\n\tr := &GenericReader[T]{\n\t\tbase: Reader{\n\t\t\tfile: reader{\n\t\t\t\tschema:   c.Schema,\n\t\t\t\trowGroup: rowGroup,\n\t\t\t},\n\t\t},\n\t}\n\n\tif !nodesAreEqual(c.Schema, f.schema) {\n\t\tr.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema)\n\t}\n\n\tr.base.read.init(r.base.file.schema, r.base.file.rowGroup)\n\tr.read = readFuncOf[T](t, r.base.file.schema)\n\treturn r\n}\n\nfunc NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) *GenericReader[T] {\n\tc, err := NewReaderConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tt := typeOf[T]()\n\tif c.Schema == nil {\n\t\tif t == nil 
{\n\t\t\tc.Schema = rowGroup.Schema()\n\t\t} else {\n\t\t\tc.Schema = schemaOf(dereference(t))\n\t\t}\n\t}\n\n\tr := &GenericReader[T]{\n\t\tbase: Reader{\n\t\t\tfile: reader{\n\t\t\t\tschema:   c.Schema,\n\t\t\t\trowGroup: rowGroup,\n\t\t\t},\n\t\t},\n\t}\n\n\tif !nodesAreEqual(c.Schema, rowGroup.Schema()) {\n\t\tr.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema)\n\t}\n\n\tr.base.read.init(r.base.file.schema, r.base.file.rowGroup)\n\tr.read = readFuncOf[T](t, r.base.file.schema)\n\treturn r\n}\n\nfunc (r *GenericReader[T]) Reset() {\n\tr.base.Reset()\n}\n\n// Read reads the next rows from the reader into the given rows slice up to len(rows).\n//\n// The returned values are safe to reuse across Read calls and do not share\n// memory with the reader's underlying page buffers.\n//\n// The method returns the number of rows read and io.EOF when no more rows\n// can be read from the reader.\nfunc (r *GenericReader[T]) Read(rows []T) (int, error) {\n\treturn r.read(r, rows)\n}\n\nfunc (r *GenericReader[T]) ReadRows(rows []Row) (int, error) {\n\treturn r.base.ReadRows(rows)\n}\n\nfunc (r *GenericReader[T]) Schema() *Schema {\n\treturn r.base.Schema()\n}\n\nfunc (r *GenericReader[T]) NumRows() int64 {\n\treturn r.base.NumRows()\n}\n\nfunc (r *GenericReader[T]) SeekToRow(rowIndex int64) error {\n\treturn r.base.SeekToRow(rowIndex)\n}\n\nfunc (r *GenericReader[T]) Close() error {\n\treturn r.base.Close()\n}\n\n// readRows reads the next rows from the reader into the given rows slice up to len(rows).\n//\n// The returned values are safe to reuse across readRows calls and do not share\n// memory with the reader's underlying page buffers.\n//\n// The method returns the number of rows read and io.EOF when no more rows\n// can be read from the reader.\nfunc (r *GenericReader[T]) readRows(rows []T) (int, error) {\n\tnRequest := len(rows)\n\tif cap(r.base.rowbuf) < nRequest {\n\t\tr.base.rowbuf = make([]Row, nRequest)\n\t} else {\n\t\tr.base.rowbuf = 
r.base.rowbuf[:nRequest]\n\t}\n\n\tvar n, nTotal int\n\tvar err error\n\tfor {\n\t\t// ReadRows reads the minimum remaining rows in a column page across all columns\n\t\t// of the underlying reader, unless the length of the slice passed to it is smaller.\n\t\t// In that case, ReadRows will read the number of rows equal to the length of the\n\t\t// given slice argument. We limit that length to never be more than requested\n\t\t// because sequential reads can cross page boundaries.\n\t\tn, err = r.base.ReadRows(r.base.rowbuf[:nRequest-nTotal])\n\t\tif n > 0 {\n\t\t\tschema := r.base.Schema()\n\n\t\t\tfor i, row := range r.base.rowbuf[:n] {\n\t\t\t\tif err2 := schema.Reconstruct(&rows[nTotal+i], row); err2 != nil {\n\t\t\t\t\treturn nTotal + i, err2\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tnTotal += n\n\t\tif n == 0 || nTotal == nRequest || err != nil {\n\t\t\tbreak\n\t\t}\n\t}\n\n\treturn nTotal, err\n}\n\nvar (\n\t_ Rows                = (*GenericReader[any])(nil)\n\t_ RowReaderWithSchema = (*Reader)(nil)\n\n\t_ Rows                = (*GenericReader[struct{}])(nil)\n\t_ RowReaderWithSchema = (*GenericReader[struct{}])(nil)\n\n\t_ Rows                = (*GenericReader[map[struct{}]struct{}])(nil)\n\t_ RowReaderWithSchema = (*GenericReader[map[struct{}]struct{}])(nil)\n)\n\ntype readFunc[T any] func(*GenericReader[T], []T) (int, error)\n\nfunc readFuncOf[T any](t reflect.Type, schema *Schema) readFunc[T] {\n\tif t == nil {\n\t\treturn (*GenericReader[T]).readRows\n\t}\n\tswitch t.Kind() {\n\tcase reflect.Interface, reflect.Map:\n\t\treturn (*GenericReader[T]).readRows\n\n\tcase reflect.Struct:\n\t\treturn (*GenericReader[T]).readRows\n\n\tcase reflect.Pointer:\n\t\tif e := t.Elem(); e.Kind() == reflect.Struct {\n\t\t\treturn (*GenericReader[T]).readRows\n\t\t}\n\t}\n\tpanic(\"cannot create reader for values of type \" + t.String())\n}\n"
  },
  {
    "path": "reader_go18_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"bytes\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"math/rand\"\n\t\"os\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestGenericReader(t *testing.T) {\n\ttestGenericReader[booleanColumn](t)\n\ttestGenericReader[int32Column](t)\n\ttestGenericReader[int64Column](t)\n\ttestGenericReader[int96Column](t)\n\ttestGenericReader[floatColumn](t)\n\ttestGenericReader[doubleColumn](t)\n\ttestGenericReader[byteArrayColumn](t)\n\ttestGenericReader[fixedLenByteArrayColumn](t)\n\ttestGenericReader[stringColumn](t)\n\ttestGenericReader[indexedStringColumn](t)\n\ttestGenericReader[uuidColumn](t)\n\ttestGenericReader[timeColumn](t)\n\ttestGenericReader[timeInMillisColumn](t)\n\ttestGenericReader[mapColumn](t)\n\ttestGenericReader[decimalColumn](t)\n\ttestGenericReader[addressBook](t)\n\ttestGenericReader[contact](t)\n\ttestGenericReader[listColumn2](t)\n\ttestGenericReader[listColumn1](t)\n\ttestGenericReader[listColumn0](t)\n\ttestGenericReader[nestedListColumn1](t)\n\ttestGenericReader[nestedListColumn](t)\n\ttestGenericReader[*contact](t)\n\ttestGenericReader[paddedBooleanColumn](t)\n\ttestGenericReader[optionalInt32Column](t)\n\ttestGenericReader[repeatedInt32Column](t)\n}\n\nfunc testGenericReader[Row any](t *testing.T) {\n\tvar model Row\n\tt.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {\n\t\terr := quickCheck(func(rows []Row) bool {\n\t\t\tif len(rows) == 0 {\n\t\t\t\treturn true // TODO: fix support for parquet files with zero rows\n\t\t\t}\n\t\t\tif err := testGenericReaderRows(rows); err != nil {\n\t\t\t\tt.Error(err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\treturn true\n\t\t})\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t}\n\t})\n}\n\nfunc testGenericReaderRows[Row any](rows []Row) error {\n\tsetNullPointers(rows)\n\tbuffer := new(bytes.Buffer)\n\twriter := parquet.NewGenericWriter[Row](buffer)\n\t_, err := writer.Write(rows)\n\tif err != nil {\n\t\treturn 
err\n\t}\n\tif err := writer.Close(); err != nil {\n\t\treturn err\n\t}\n\treader := parquet.NewGenericReader[Row](bytes.NewReader(buffer.Bytes()))\n\tresult := make([]Row, len(rows))\n\tn, err := reader.Read(result)\n\tif err != nil && !errors.Is(err, io.EOF) {\n\t\treturn err\n\t}\n\tif n < len(rows) {\n\t\treturn fmt.Errorf(\"not enough values were read: want=%d got=%d\", len(rows), n)\n\t}\n\tif !reflect.DeepEqual(rows, result) {\n\t\treturn fmt.Errorf(\"rows mismatch:\\nwant: %+v\\ngot: %+v\", rows, result)\n\t}\n\treturn nil\n}\n\nfunc TestIssue400(t *testing.T) {\n\ttype B struct {\n\t\tName string\n\t}\n\ttype A struct {\n\t\tB []B `parquet:\",optional\"`\n\t}\n\n\tb := new(bytes.Buffer)\n\tw := parquet.NewGenericWriter[A](b)\n\texpect := []A{\n\t\t{\n\t\t\tB: []B{\n\t\t\t\t{\n\t\t\t\t\t// 32 bytes random so we can see in the binary parquet if we\n\t\t\t\t\t// actually wrote the value\n\t\t\t\t\tName: \"9e7eb1f0-bbcc-43ec-bfad-a9fac1bb0feb\",\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t}\n\t_, err := w.Write(expect)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif err = w.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tr := parquet.NewGenericReader[A](bytes.NewReader(b.Bytes()))\n\tvalues := make([]A, 1)\n\t_, err = r.Read(values)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif !reflect.DeepEqual(expect[0], values[0]) {\n\t\tt.Errorf(\"want %q got %q\", values[0], expect[0])\n\t}\n}\n\nfunc TestReadMinPageSize(t *testing.T) {\n\t// NOTE: min page size is 307 for MyRow schema\n\tt.Run(\"test read less than min page size\", func(t *testing.T) { testReadMinPageSize(128, t) })\n\tt.Run(\"test read equal to min page size\", func(t *testing.T) { testReadMinPageSize(307, t) })\n\tt.Run(\"test read more than min page size\", func(t *testing.T) { testReadMinPageSize(384, t) })\n\t// NOTE: num rows is 20,000\n\tt.Run(\"test read equal to num rows\", func(t *testing.T) { testReadMinPageSize(20_000, t) })\n\tt.Run(\"test read more than num rows\", func(t *testing.T) { 
testReadMinPageSize(25_000, t) })\n}\n\nfunc testReadMinPageSize(readSize int, t *testing.T) {\n\ttype MyRow struct {\n\t\tID    [16]byte `parquet:\"id,delta,uuid\"`\n\t\tFile  string   `parquet:\"file,dict,zstd\"`\n\t\tIndex int64    `parquet:\"index,delta,zstd\"`\n\t}\n\n\tnumRows := 20_000\n\tmaxPageBytes := 5000\n\n\ttmp, err := os.CreateTemp(\"/tmp\", \"*.parquet\")\n\tif err != nil {\n\t\tt.Fatal(\"os.CreateTemp: \", err)\n\t}\n\tpath := tmp.Name()\n\tdefer os.Remove(path)\n\tt.Log(\"file:\", path)\n\n\t// The page buffer size ensures we get multiple pages out of this example.\n\tw := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes))\n\t// Need to write 1 row at a time here as writing many at once disregards PageBufferSize option.\n\tfor i := 0; i < numRows; i++ {\n\t\trow := MyRow{\n\t\t\tID:    [16]byte{15: byte(i)},\n\t\t\tFile:  \"hi\" + fmt.Sprint(i),\n\t\t\tIndex: int64(i),\n\t\t}\n\t\t_, err := w.Write([]MyRow{row})\n\t\tif err != nil {\n\t\t\tt.Fatal(\"w.Write: \", err)\n\t\t}\n\t\t// Flush writes rows as row group. 
4 total (20k/5k) in this file.\n\t\tif (i+1)%maxPageBytes == 0 {\n\t\t\terr = w.Flush()\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(\"w.Flush: \", err)\n\t\t\t}\n\t\t}\n\t}\n\terr = w.Close()\n\tif err != nil {\n\t\tt.Fatal(\"w.Close: \", err)\n\t}\n\terr = tmp.Close()\n\tif err != nil {\n\t\tt.Fatal(\"tmp.Close: \", err)\n\t}\n\n\tfile, err := os.Open(path)\n\tif err != nil {\n\t\tt.Fatal(\"os.Open\", err)\n\t}\n\treader := parquet.NewGenericReader[MyRow](file)\n\tread := int64(0)\n\tnRows := reader.NumRows()\n\trows := make([]MyRow, 0, nRows)\n\tbuf := make([]MyRow, readSize) // NOTE: min page size is 307 for MyRow schema\n\n\tfor read < nRows {\n\t\tnum, err := reader.Read(buf)\n\t\tread += int64(num)\n\t\tif err != nil && !errors.Is(err, io.EOF) {\n\t\t\tt.Fatal(\"Read:\", err)\n\t\t}\n\t\trows = append(rows, buf...)\n\t}\n\n\tif err := reader.Close(); err != nil {\n\t\tt.Fatal(\"Close\", err)\n\t}\n\n\tif len(rows) < numRows {\n\t\tt.Fatalf(\"not enough values were read: want=%d got=%d\", len(rows), numRows)\n\t}\n\tfor i, row := range rows[:numRows] {\n\t\tid := [16]byte{15: byte(i)}\n\t\tfile := \"hi\" + fmt.Sprint(i)\n\t\tindex := int64(i)\n\n\t\tif row.ID != id || row.File != file || row.Index != index {\n\t\t\tt.Fatalf(\"rows mismatch at index: %d got: %+v\", i, row)\n\t\t}\n\t}\n}\n\nfunc BenchmarkGenericReader(b *testing.B) 
{\n\tbenchmarkGenericReader[benchmarkRowType](b)\n\tbenchmarkGenericReader[booleanColumn](b)\n\tbenchmarkGenericReader[int32Column](b)\n\tbenchmarkGenericReader[int64Column](b)\n\tbenchmarkGenericReader[floatColumn](b)\n\tbenchmarkGenericReader[doubleColumn](b)\n\tbenchmarkGenericReader[byteArrayColumn](b)\n\tbenchmarkGenericReader[fixedLenByteArrayColumn](b)\n\tbenchmarkGenericReader[stringColumn](b)\n\tbenchmarkGenericReader[indexedStringColumn](b)\n\tbenchmarkGenericReader[uuidColumn](b)\n\tbenchmarkGenericReader[timeColumn](b)\n\tbenchmarkGenericReader[timeInMillisColumn](b)\n\tbenchmarkGenericReader[mapColumn](b)\n\tbenchmarkGenericReader[decimalColumn](b)\n\tbenchmarkGenericReader[contact](b)\n\tbenchmarkGenericReader[paddedBooleanColumn](b)\n\tbenchmarkGenericReader[optionalInt32Column](b)\n}\n\nfunc benchmarkGenericReader[Row generator[Row]](b *testing.B) {\n\tvar model Row\n\tb.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {\n\t\tprng := rand.New(rand.NewSource(0))\n\t\trows := make([]Row, benchmarkNumRows)\n\t\tfor i := range rows {\n\t\t\trows[i] = rows[i].generate(prng)\n\t\t}\n\n\t\trowbuf := make([]Row, benchmarkRowsPerStep)\n\t\tbuffer := parquet.NewGenericBuffer[Row]()\n\t\tbuffer.Write(rows)\n\n\t\tb.Run(\"go1.17\", func(b *testing.B) {\n\t\t\treader := parquet.NewRowGroupReader(buffer)\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tfor i := range rowbuf {\n\t\t\t\t\tif err := reader.Read(&rowbuf[i]); err != nil {\n\t\t\t\t\t\tif err != io.EOF {\n\t\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\treader.Reset()\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn len(rowbuf)\n\t\t\t})\n\t\t})\n\n\t\tb.Run(\"go1.18\", func(b *testing.B) {\n\t\t\treader := parquet.NewGenericRowGroupReader[Row](buffer)\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tn, err := reader.Read(rowbuf)\n\t\t\t\tif err != nil {\n\t\t\t\t\tif err != io.EOF {\n\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t} else 
{\n\t\t\t\t\t\treader.Reset()\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn n\n\t\t\t})\n\t\t})\n\t})\n}\n"
  },
  {
    "path": "reader_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"io\"\n\t\"math\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/internal/quick\"\n)\n\nfunc rowsOf(numRows int, model interface{}) rows {\n\tprng := rand.New(rand.NewSource(0))\n\treturn randomRowsOf(prng, numRows, model)\n}\n\nfunc randomRowsOf(prng *rand.Rand, numRows int, model interface{}) rows {\n\ttyp := reflect.TypeOf(model)\n\trows := make(rows, numRows)\n\tmakeValue := quick.MakeValueFuncOf(typ)\n\tfor i := range rows {\n\t\tv := reflect.New(typ).Elem()\n\t\tmakeValue(v, prng)\n\t\trows[i] = v.Interface()\n\t}\n\treturn rows\n}\n\nvar readerTests = []struct {\n\tscenario string\n\tmodel    interface{}\n}{\n\t{\n\t\tscenario: \"BOOLEAN\",\n\t\tmodel:    booleanColumn{},\n\t},\n\n\t{\n\t\tscenario: \"INT32\",\n\t\tmodel:    int32Column{},\n\t},\n\n\t{\n\t\tscenario: \"INT64\",\n\t\tmodel:    int64Column{},\n\t},\n\n\t{\n\t\tscenario: \"INT96\",\n\t\tmodel:    int96Column{},\n\t},\n\n\t{\n\t\tscenario: \"FLOAT\",\n\t\tmodel:    floatColumn{},\n\t},\n\n\t{\n\t\tscenario: \"DOUBLE\",\n\t\tmodel:    doubleColumn{},\n\t},\n\n\t{\n\t\tscenario: \"BYTE_ARRAY\",\n\t\tmodel:    byteArrayColumn{},\n\t},\n\n\t{\n\t\tscenario: \"FIXED_LEN_BYTE_ARRAY\",\n\t\tmodel:    fixedLenByteArrayColumn{},\n\t},\n\n\t{\n\t\tscenario: \"STRING\",\n\t\tmodel:    stringColumn{},\n\t},\n\n\t{\n\t\tscenario: \"STRING (dict)\",\n\t\tmodel:    indexedStringColumn{},\n\t},\n\n\t{\n\t\tscenario: \"UUID\",\n\t\tmodel:    uuidColumn{},\n\t},\n\n\t{\n\t\tscenario: \"time.Time\",\n\t\tmodel:    timeColumn{},\n\t},\n\n\t{\n\t\tscenario: \"time.Time in ms\",\n\t\tmodel:    timeInMillisColumn{},\n\t},\n\n\t{\n\t\tscenario: \"DECIMAL\",\n\t\tmodel:    decimalColumn{},\n\t},\n\n\t{\n\t\tscenario: \"AddressBook\",\n\t\tmodel:    addressBook{},\n\t},\n\n\t{\n\t\tscenario: \"one optional level\",\n\t\tmodel:    
listColumn2{},\n\t},\n\n\t{\n\t\tscenario: \"one repeated level\",\n\t\tmodel:    listColumn1{},\n\t},\n\n\t{\n\t\tscenario: \"two repeated levels\",\n\t\tmodel:    listColumn0{},\n\t},\n\n\t{\n\t\tscenario: \"three repeated levels\",\n\t\tmodel:    listColumn0{},\n\t},\n\n\t{\n\t\tscenario: \"nested lists\",\n\t\tmodel:    nestedListColumn{},\n\t},\n\n\t{\n\t\tscenario: \"key-value pairs\",\n\t\tmodel: struct {\n\t\t\tKeyValuePairs map[utf8string]utf8string\n\t\t}{},\n\t},\n\n\t{\n\t\tscenario: \"multiple key-value pairs\",\n\t\tmodel: struct {\n\t\t\tKeyValuePairs0 map[utf8string]utf8string\n\t\t\tKeyValuePairs1 map[utf8string]utf8string\n\t\t\tKeyValuePairs2 map[utf8string]utf8string\n\t\t}{},\n\t},\n\n\t{\n\t\tscenario: \"repeated key-value pairs\",\n\t\tmodel: struct {\n\t\t\tRepeatedKeyValuePairs []map[utf8string]utf8string\n\t\t}{},\n\t},\n\n\t{\n\t\tscenario: \"map of repeated values\",\n\t\tmodel: struct {\n\t\t\tMapOfRepeated map[utf8string][]utf8string\n\t\t}{},\n\t},\n}\n\nfunc TestReader(t *testing.T) {\n\tbuf := new(bytes.Buffer)\n\tfile := bytes.NewReader(nil)\n\n\tfor _, test := range readerTests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tconst N = 42\n\n\t\t\trowType := reflect.TypeOf(test.model)\n\t\t\trowPtr := reflect.New(rowType)\n\t\t\trowZero := reflect.Zero(rowType)\n\t\t\trowValue := rowPtr.Elem()\n\n\t\t\tfor n := 1; n < N; n++ {\n\t\t\t\tt.Run(fmt.Sprintf(\"N=%d\", n), func(t *testing.T) {\n\t\t\t\t\tdefer buf.Reset()\n\t\t\t\t\trows := rowsOf(n, test.model)\n\n\t\t\t\t\tif err := writeParquetFileWithBuffer(buf, rows); err != nil {\n\t\t\t\t\t\tt.Fatal(err)\n\t\t\t\t\t}\n\n\t\t\t\t\tfile.Reset(buf.Bytes())\n\t\t\t\t\tr := parquet.NewReader(file, parquet.SchemaOf(test.model))\n\n\t\t\t\t\tfor i, v := range rows {\n\t\t\t\t\t\tif err := r.Read(rowPtr.Interface()); err != nil {\n\t\t\t\t\t\t\tt.Fatal(err)\n\t\t\t\t\t\t}\n\t\t\t\t\t\tif !reflect.DeepEqual(rowValue.Interface(), v) {\n\t\t\t\t\t\t\tt.Errorf(\"row mismatch at index 
%d\\nwant = %+v\\ngot  = %+v\", i, v, rowValue.Interface())\n\t\t\t\t\t\t}\n\t\t\t\t\t\trowValue.Set(rowZero)\n\t\t\t\t\t}\n\n\t\t\t\t\tif err := r.Read(rowPtr.Interface()); err != io.EOF {\n\t\t\t\t\t\tt.Errorf(\"expected EOF after reading all values but got: %v\", err)\n\t\t\t\t\t}\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkReaderReadType(b *testing.B) {\n\tbuf := new(bytes.Buffer)\n\tfile := bytes.NewReader(nil)\n\n\tfor _, test := range readerTests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tdefer buf.Reset()\n\t\t\trows := rowsOf(benchmarkNumRows, test.model)\n\n\t\t\tif err := writeParquetFile(buf, rows); err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t}\n\t\t\tfile.Reset(buf.Bytes())\n\t\t\tf, err := parquet.OpenFile(file, file.Size())\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t}\n\n\t\t\trowType := reflect.TypeOf(test.model)\n\t\t\trowPtr := reflect.New(rowType)\n\t\t\trowZero := reflect.Zero(rowType)\n\t\t\trowValue := rowPtr.Elem()\n\n\t\t\tr := parquet.NewReader(f)\n\t\t\tp := rowPtr.Interface()\n\n\t\t\tbenchmarkRowsPerSecond(b, func() (n int) {\n\t\t\t\tfor i := 0; i < benchmarkRowsPerStep; i++ {\n\t\t\t\t\tif err := r.Read(p); err != nil {\n\t\t\t\t\t\tif err == io.EOF {\n\t\t\t\t\t\t\tr.Reset()\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\trowValue.Set(rowZero)\n\t\t\t\treturn benchmarkRowsPerStep\n\t\t\t})\n\n\t\t\tb.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows)))\n\t\t})\n\t}\n}\n\nfunc BenchmarkReaderReadRow(b *testing.B) {\n\tbuf := new(bytes.Buffer)\n\tfile := bytes.NewReader(nil)\n\n\tfor _, test := range readerTests {\n\t\tb.Run(test.scenario, func(b *testing.B) {\n\t\t\tdefer buf.Reset()\n\t\t\trows := rowsOf(benchmarkNumRows, test.model)\n\n\t\t\tif err := writeParquetFile(buf, rows); err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t}\n\t\t\tfile.Reset(buf.Bytes())\n\t\t\tf, err := parquet.OpenFile(file, file.Size())\n\t\t\tif 
err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t}\n\n\t\t\tr := parquet.NewReader(f)\n\t\t\trowbuf := make([]parquet.Row, benchmarkRowsPerStep)\n\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tn, err := r.ReadRows(rowbuf)\n\t\t\t\tif err != nil {\n\t\t\t\t\tif err == io.EOF {\n\t\t\t\t\t\tr.Reset()\n\t\t\t\t\t} else {\n\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn n\n\t\t\t})\n\n\t\t\tb.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows)))\n\t\t})\n\t}\n}\n\nfunc TestReaderReadSubset(t *testing.T) {\n\t// In this example we'll write 3 columns to the file - X, Y, and Z, but\n\t// we'll only read out the X and Y columns. Returns true if all writes\n\t// and reads were successful, and false otherwise.\n\ttype Point3D struct{ X, Y, Z int64 }\n\ttype Point2D struct{ X, Y int64 }\n\n\terr := quickCheck(func(points3D []Point3D) bool {\n\t\tif len(points3D) == 0 {\n\t\t\treturn true\n\t\t}\n\t\tbuf := new(bytes.Buffer)\n\t\terr := writeParquetFile(buf, makeRows(points3D))\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\treader := parquet.NewReader(bytes.NewReader(buf.Bytes()))\n\t\tfor i := 0; ; i++ {\n\t\t\trow := Point2D{}\n\t\t\terr := reader.Read(&row)\n\t\t\tif err != nil {\n\t\t\t\tif err == io.EOF && i == len(points3D) {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t\tt.Error(err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\tif row != (Point2D{X: points3D[i].X, Y: points3D[i].Y}) {\n\t\t\t\tt.Errorf(\"points mismatch at row index %d: want=%v got=%v\", i, points3D[i], row)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestReaderSeekToRow(t *testing.T) {\n\ttype rowType struct {\n\t\tName utf8string `parquet:\",dict\"`\n\t}\n\n\trows := rowsOf(10, rowType{})\n\tbuf := new(bytes.Buffer)\n\terr := writeParquetFile(buf, rows)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\treader := parquet.NewReader(bytes.NewReader(buf.Bytes()))\n\tfor i := 
0; i < 10; i++ {\n\t\tif err := reader.SeekToRow(int64(i)); err != nil {\n\t\t\tt.Fatalf(\"seek to row %d: %v\", i, err)\n\t\t}\n\n\t\trow := new(rowType)\n\t\terr := reader.Read(row)\n\t\tif err != nil {\n\t\t\tt.Fatalf(\"reading row %d: %v\", i, err)\n\t\t}\n\n\t\tif *row != rows[i] {\n\t\t\tt.Fatalf(\"row %d mismatch: got=%+v want=%+v\", i, *row, rows[i])\n\t\t}\n\t}\n}\n\nfunc TestSeekToRowNoDict(t *testing.T) {\n\ttype rowType struct {\n\t\tName utf8string `parquet:\",\"` // no dictionary encoding\n\t}\n\n\t// write samples to in-memory buffer\n\tbuf := new(bytes.Buffer)\n\tschema := parquet.SchemaOf(new(rowType))\n\tw := parquet.NewWriter(buf, schema)\n\tsample := rowType{\n\t\tName: \"foo1\",\n\t}\n\t// write two rows\n\tw.Write(sample)\n\tsample.Name = \"foo2\"\n\tw.Write(sample)\n\tw.Close()\n\n\t// create reader\n\tr := parquet.NewReader(bytes.NewReader(buf.Bytes()))\n\n\t// read second row\n\tr.SeekToRow(1)\n\trow := new(rowType)\n\terr := r.Read(row)\n\tif err != nil {\n\t\tt.Fatalf(\"reading row: %v\", err)\n\t}\n\t// fmt.Println(&sample, row)\n\tif *row != sample {\n\t\tt.Fatalf(\"read != write\")\n\t}\n}\n\nfunc TestSeekToRowReadAll(t *testing.T) {\n\ttype rowType struct {\n\t\tName utf8string `parquet:\",dict\"`\n\t}\n\n\t// write samples to in-memory buffer\n\tbuf := new(bytes.Buffer)\n\tschema := parquet.SchemaOf(new(rowType))\n\tw := parquet.NewWriter(buf, schema)\n\tsample := rowType{\n\t\tName: \"foo1\",\n\t}\n\t// write two rows\n\tw.Write(sample)\n\tsample.Name = \"foo2\"\n\tw.Write(sample)\n\tw.Close()\n\n\t// create reader\n\tr := parquet.NewReader(bytes.NewReader(buf.Bytes()))\n\n\t// read first row\n\tr.SeekToRow(0)\n\trow := new(rowType)\n\terr := r.Read(row)\n\tif err != nil {\n\t\tt.Fatalf(\"reading row: %v\", err)\n\t}\n\t// read second row\n\tr.SeekToRow(1)\n\trow = new(rowType)\n\terr = r.Read(row)\n\tif err != nil {\n\t\tt.Fatalf(\"reading row: %v\", err)\n\t}\n\t// fmt.Println(&sample, row)\n\tif *row != sample 
{\n\t\tt.Fatalf(\"read != write\")\n\t}\n}\n\nfunc TestSeekToRowDictReadSecond(t *testing.T) {\n\ttype rowType struct {\n\t\tName utf8string `parquet:\",dict\"`\n\t}\n\n\t// write samples to in-memory buffer\n\tbuf := new(bytes.Buffer)\n\tschema := parquet.SchemaOf(new(rowType))\n\tw := parquet.NewWriter(buf, schema)\n\tsample := rowType{\n\t\tName: \"foo1\",\n\t}\n\t// write two rows\n\tw.Write(sample)\n\tsample.Name = \"foo2\"\n\tw.Write(sample)\n\tw.Close()\n\n\t// create reader\n\tr := parquet.NewReader(bytes.NewReader(buf.Bytes()))\n\n\t// read second row\n\tr.SeekToRow(1)\n\trow := new(rowType)\n\terr := r.Read(row)\n\tif err != nil {\n\t\tt.Fatalf(\"reading row: %v\", err)\n\t}\n\t// fmt.Println(&sample, row)\n\tif *row != sample {\n\t\tt.Fatalf(\"read != write\")\n\t}\n}\n\nfunc TestSeekToRowDictReadMultiplePages(t *testing.T) {\n\ttype rowType struct {\n\t\tName utf8string `parquet:\",dict\"`\n\t}\n\n\t// write samples to in-memory buffer\n\tbuf := new(bytes.Buffer)\n\tschema := parquet.SchemaOf(new(rowType))\n\tw := parquet.NewWriter(buf, schema, &parquet.WriterConfig{\n\t\tPageBufferSize: 10,\n\t})\n\tsample := rowType{\n\t\tName: \"foo1\",\n\t}\n\n\t// write enough rows to spill over a single page\n\tfor i := 0; i < 10; i++ {\n\t\tw.Write(sample)\n\t}\n\tsample.Name = \"foo2\"\n\tw.Write(sample)\n\tw.Close()\n\n\t// create reader\n\tr := parquet.NewReader(bytes.NewReader(buf.Bytes()))\n\n\t// read 11th row\n\tr.SeekToRow(10)\n\trow := new(rowType)\n\terr := r.Read(row)\n\tif err != nil {\n\t\tt.Fatalf(\"reading row: %v\", err)\n\t}\n\tif *row != sample {\n\t\tt.Fatalf(\"read != write\")\n\t}\n}\n"
  },
  {
    "path": "row.go",
    "content": "package parquet\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"reflect\"\n)\n\nconst (\n\tdefaultRowBufferSize = 42\n)\n\n// Row represents a parquet row as a slice of values.\n//\n// Each value should embed a column index, repetition level, and definition\n// level allowing the program to determine how to reconstruct the original\n// object from the row.\ntype Row []Value\n\n// MakeRow constructs a Row from a list of column values.\n//\n// The function panics if the column indexes of values in each column do not\n// match their position in the argument list.\nfunc MakeRow(columns ...[]Value) Row { return AppendRow(nil, columns...) }\n\n// AppendRow appends to row the given list of column values.\n//\n// AppendRow can be used to construct a Row value from columns, while retaining\n// the underlying memory buffer to avoid reallocation; for example:\n//\n// The function panics if the column indexes of values in each column do not\n// match their position in the argument list.\nfunc AppendRow(row Row, columns ...[]Value) Row {\n\tnumValues := 0\n\n\tfor expectedColumnIndex, column := range columns {\n\t\tnumValues += len(column)\n\n\t\tfor _, value := range column {\n\t\t\tif value.columnIndex != ^int16(expectedColumnIndex) {\n\t\t\t\tpanic(fmt.Sprintf(\"value of column %d has column index %d\", expectedColumnIndex, value.Column()))\n\t\t\t}\n\t\t}\n\t}\n\n\tif capacity := cap(row) - len(row); capacity < numValues {\n\t\trow = append(make(Row, 0, len(row)+numValues), row...)\n\t}\n\n\treturn appendRow(row, columns)\n}\n\nfunc appendRow(row Row, columns [][]Value) Row {\n\tfor _, column := range columns {\n\t\trow = append(row, column...)\n\t}\n\treturn row\n}\n\n// Clone creates a copy of the row which shares no pointers.\n//\n// This method is useful to capture rows after a call to RowReader.ReadRows when\n// values need to be retained before the next call to ReadRows or after the lifespan\n// of the reader.\nfunc (row Row) Clone() Row {\n\tclone 
:= make(Row, len(row))\n\tfor i := range row {\n\t\tclone[i] = row[i].Clone()\n\t}\n\treturn clone\n}\n\n// Equal returns true if row and other contain the same sequence of values.\nfunc (row Row) Equal(other Row) bool {\n\tif len(row) != len(other) {\n\t\treturn false\n\t}\n\tfor i := range row {\n\t\tif !Equal(row[i], other[i]) {\n\t\t\treturn false\n\t\t}\n\t\tif row[i].repetitionLevel != other[i].repetitionLevel {\n\t\t\treturn false\n\t\t}\n\t\tif row[i].definitionLevel != other[i].definitionLevel {\n\t\t\treturn false\n\t\t}\n\t\tif row[i].columnIndex != other[i].columnIndex {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\n// Range calls f for each column of row.\nfunc (row Row) Range(f func(columnIndex int, columnValues []Value) bool) {\n\tcolumnIndex := 0\n\n\tfor i := 0; i < len(row); {\n\t\tj := i + 1\n\n\t\tfor j < len(row) && row[j].columnIndex == ^int16(columnIndex) {\n\t\t\tj++\n\t\t}\n\n\t\tif !f(columnIndex, row[i:j:j]) {\n\t\t\tbreak\n\t\t}\n\n\t\tcolumnIndex++\n\t\ti = j\n\t}\n}\n\n// RowSeeker is an interface implemented by readers of parquet rows which can be\n// positioned at a specific row index.\ntype RowSeeker interface {\n\t// Positions the stream on the given row index.\n\t//\n\t// Some implementations of the interface may only allow seeking forward.\n\t//\n\t// The method returns io.ErrClosedPipe if the stream had already been closed.\n\tSeekToRow(int64) error\n}\n\n// RowReader reads a sequence of parquet rows.\ntype RowReader interface {\n\t// ReadRows reads rows from the reader, returning the number of rows read\n\t// into the buffer, and any error that occurred. Note that the rows read\n\t// into the buffer are not safe for reuse after a subsequent call to\n\t// ReadRows. Callers that want to reuse rows must copy the rows using Clone.\n\t//\n\t// When all rows have been read, the reader returns io.EOF to indicate the\n\t// end of the sequence. 
It is valid for the reader to return both a non-zero\n\t// number of rows and a non-nil error (including io.EOF).\n\t//\n\t// The buffer of rows passed as argument will be used to store values of\n\t// each row read from the reader. If the rows are not nil, the backing array\n\t// of the slices will be used as an optimization to avoid re-allocating new\n\t// arrays.\n\t//\n\t// The application is expected to handle the case where ReadRows returns\n\t// less rows than requested and no error, by looking at the first returned\n\t// value from ReadRows, which is the number of rows that were read.\n\tReadRows([]Row) (int, error)\n}\n\n// RowReaderFrom reads parquet rows from reader.\ntype RowReaderFrom interface {\n\tReadRowsFrom(RowReader) (int64, error)\n}\n\n// RowReaderWithSchema is an extension of the RowReader interface which\n// advertises the schema of rows returned by ReadRow calls.\ntype RowReaderWithSchema interface {\n\tRowReader\n\tSchema() *Schema\n}\n\n// RowReadSeeker is an interface implemented by row readers which support\n// seeking to arbitrary row positions.\ntype RowReadSeeker interface {\n\tRowReader\n\tRowSeeker\n}\n\n// RowWriter writes parquet rows to an underlying medium.\ntype RowWriter interface {\n\t// Writes rows to the writer, returning the number of rows written and any\n\t// error that occurred.\n\t//\n\t// Because columnar operations operate on independent columns of values,\n\t// writes of rows may not be atomic operations, and could result in some\n\t// rows being partially written. The method returns the number of rows that\n\t// were successfully written, but if an error occurs, values of the row(s)\n\t// that failed to be written may have been partially committed to their\n\t// columns. 
For that reason, applications should consider a write error as\n\t// fatal and assume that they need to discard the state, they cannot retry\n\t// the write nor recover the underlying file.\n\tWriteRows([]Row) (int, error)\n}\n\n// RowWriterTo writes parquet rows to a writer.\ntype RowWriterTo interface {\n\tWriteRowsTo(RowWriter) (int64, error)\n}\n\n// RowWriterWithSchema is an extension of the RowWriter interface which\n// advertises the schema of rows expected to be passed to WriteRow calls.\ntype RowWriterWithSchema interface {\n\tRowWriter\n\tSchema() *Schema\n}\n\n// RowReaderFunc is a function type implementing the RowReader interface.\ntype RowReaderFunc func([]Row) (int, error)\n\nfunc (f RowReaderFunc) ReadRows(rows []Row) (int, error) { return f(rows) }\n\n// RowWriterFunc is a function type implementing the RowWriter interface.\ntype RowWriterFunc func([]Row) (int, error)\n\nfunc (f RowWriterFunc) WriteRows(rows []Row) (int, error) { return f(rows) }\n\n// MultiRowWriter constructs a RowWriter which dispatches writes to all the\n// writers passed as arguments.\n//\n// When writing rows, if any of the writers returns an error, the operation is\n// aborted and the error returned. 
If one of the writers did not error, but did\n// not write all the rows, the operation is aborted and io.ErrShortWrite is\n// returned.\n//\n// Rows are written sequentially to each writer in the order they are given to\n// this function.\nfunc MultiRowWriter(writers ...RowWriter) RowWriter {\n\tm := &multiRowWriter{writers: make([]RowWriter, len(writers))}\n\tcopy(m.writers, writers)\n\treturn m\n}\n\ntype multiRowWriter struct{ writers []RowWriter }\n\nfunc (m *multiRowWriter) WriteRows(rows []Row) (int, error) {\n\tfor _, w := range m.writers {\n\t\tn, err := w.WriteRows(rows)\n\t\tif err != nil {\n\t\t\treturn n, err\n\t\t}\n\t\tif n != len(rows) {\n\t\t\treturn n, io.ErrShortWrite\n\t\t}\n\t}\n\treturn len(rows), nil\n}\n\ntype forwardRowSeeker struct {\n\trows  RowReader\n\tseek  int64\n\tindex int64\n}\n\nfunc (r *forwardRowSeeker) ReadRows(rows []Row) (int, error) {\n\tfor {\n\t\tn, err := r.rows.ReadRows(rows)\n\n\t\tif n > 0 && r.index < r.seek {\n\t\t\tskip := r.seek - r.index\n\t\t\tr.index += int64(n)\n\t\t\tif skip >= int64(n) {\n\t\t\t\tcontinue\n\t\t\t}\n\n\t\t\tfor i, j := 0, int(skip); j < n; i++ {\n\t\t\t\trows[i] = append(rows[i][:0], rows[j]...)\n\t\t\t}\n\n\t\t\tn -= int(skip)\n\t\t}\n\n\t\treturn n, err\n\t}\n}\n\nfunc (r *forwardRowSeeker) SeekToRow(rowIndex int64) error {\n\tif rowIndex >= r.index {\n\t\tr.seek = rowIndex\n\t\treturn nil\n\t}\n\treturn fmt.Errorf(\n\t\t\"SeekToRow: %T does not implement parquet.RowSeeker: cannot seek backward from row %d to %d\",\n\t\tr.rows,\n\t\tr.index,\n\t\trowIndex,\n\t)\n}\n\n// CopyRows copies rows from src to dst.\n//\n// The underlying types of src and dst are tested to determine if they expose\n// information about the schema of rows that are read and expected to be\n// written. 
If the schema information are available but do not match, the\n// function will attempt to automatically convert the rows from the source\n// schema to the destination.\n//\n// As an optimization, the src argument may implement RowWriterTo to bypass\n// the default row copy logic and provide its own. The dst argument may also\n// implement RowReaderFrom for the same purpose.\n//\n// The function returns the number of rows written, or any error encountered\n// other than io.EOF.\nfunc CopyRows(dst RowWriter, src RowReader) (int64, error) {\n\treturn copyRows(dst, src, nil)\n}\n\nfunc copyRows(dst RowWriter, src RowReader, buf []Row) (written int64, err error) {\n\ttargetSchema := targetSchemaOf(dst)\n\tsourceSchema := sourceSchemaOf(src)\n\n\tif targetSchema != nil && sourceSchema != nil {\n\t\tif !nodesAreEqual(targetSchema, sourceSchema) {\n\t\t\tconv, err := Convert(targetSchema, sourceSchema)\n\t\t\tif err != nil {\n\t\t\t\treturn 0, err\n\t\t\t}\n\t\t\t// The conversion effectively disables a potential optimization\n\t\t\t// if the source reader implemented RowWriterTo. 
It is a trade off\n\t\t\t// we are making to optimize for safety rather than performance.\n\t\t\t//\n\t\t\t// Entering this code path should not be the common case tho, it is\n\t\t\t// most often used when parquet schemas are evolving, but we expect\n\t\t\t// that the majority of files of an application to be sharing a\n\t\t\t// common schema.\n\t\t\tsrc = ConvertRowReader(src, conv)\n\t\t}\n\t}\n\n\tif wt, ok := src.(RowWriterTo); ok {\n\t\treturn wt.WriteRowsTo(dst)\n\t}\n\n\tif rf, ok := dst.(RowReaderFrom); ok {\n\t\treturn rf.ReadRowsFrom(src)\n\t}\n\n\tif len(buf) == 0 {\n\t\tbuf = make([]Row, defaultRowBufferSize)\n\t}\n\n\tdefer clearRows(buf)\n\n\tfor {\n\t\trn, err := src.ReadRows(buf)\n\n\t\tif rn > 0 {\n\t\t\twn, err := dst.WriteRows(buf[:rn])\n\t\t\tif err != nil {\n\t\t\t\treturn written, err\n\t\t\t}\n\n\t\t\twritten += int64(wn)\n\t\t}\n\n\t\tif err != nil {\n\t\t\tif errors.Is(err, io.EOF) {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn written, err\n\t\t}\n\n\t\tif rn == 0 {\n\t\t\treturn written, io.ErrNoProgress\n\t\t}\n\t}\n}\n\nfunc makeRows(n int) []Row {\n\tbuf := make([]Value, n)\n\trow := make([]Row, n)\n\tfor i := range row {\n\t\trow[i] = buf[i : i : i+1]\n\t}\n\treturn row\n}\n\nfunc clearRows(rows []Row) {\n\tfor i, values := range rows {\n\t\tclearValues(values)\n\t\trows[i] = values[:0]\n\t}\n}\n\nfunc sourceSchemaOf(r RowReader) *Schema {\n\tif rrs, ok := r.(RowReaderWithSchema); ok {\n\t\treturn rrs.Schema()\n\t}\n\treturn nil\n}\n\nfunc targetSchemaOf(w RowWriter) *Schema {\n\tif rws, ok := w.(RowWriterWithSchema); ok {\n\t\treturn rws.Schema()\n\t}\n\treturn nil\n}\n\n// =============================================================================\n// Functions returning closures are marked with \"go:noinline\" below to prevent\n// losing naming information of the closure in stack traces.\n//\n// Because some of the functions are very short (simply return a closure), the\n// compiler inlines when at their call site, which result in 
the closure being\n// named something like parquet.deconstructFuncOf.func2 instead of the original\n// parquet.deconstructFuncOfLeaf.func1; the latter being much more meaningful\n// when reading CPU or memory profiles.\n// =============================================================================\n\ntype levels struct {\n\trepetitionDepth byte\n\trepetitionLevel byte\n\tdefinitionLevel byte\n}\n\n// deconstructFunc accepts a row, the current levels, the value to deserialize\n// the current column onto, and returns the row minus the deserialied value(s)\n// It recurses until it hits a leaf node, then deserializes that value\n// individually as the base case.\ntype deconstructFunc func([][]Value, levels, reflect.Value)\n\nfunc deconstructFuncOf(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tswitch {\n\tcase node.Optional():\n\t\treturn deconstructFuncOfOptional(columnIndex, node)\n\tcase node.Repeated():\n\t\treturn deconstructFuncOfRepeated(columnIndex, node)\n\tcase isList(node):\n\t\treturn deconstructFuncOfList(columnIndex, node)\n\tcase isMap(node):\n\t\treturn deconstructFuncOfMap(columnIndex, node)\n\tdefault:\n\t\treturn deconstructFuncOfRequired(columnIndex, node)\n\t}\n}\n\n//go:noinline\nfunc deconstructFuncOfOptional(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tcolumnIndex, deconstruct := deconstructFuncOf(columnIndex, Required(node))\n\treturn columnIndex, func(columns [][]Value, levels levels, value reflect.Value) {\n\t\tif value.IsValid() {\n\t\t\tif value.IsZero() {\n\t\t\t\tvalue = reflect.Value{}\n\t\t\t} else {\n\t\t\t\tif value.Kind() == reflect.Ptr {\n\t\t\t\t\tvalue = value.Elem()\n\t\t\t\t}\n\t\t\t\tlevels.definitionLevel++\n\t\t\t}\n\t\t}\n\t\tdeconstruct(columns, levels, value)\n\t}\n}\n\n//go:noinline\nfunc deconstructFuncOfRepeated(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tcolumnIndex, deconstruct := deconstructFuncOf(columnIndex, Required(node))\n\treturn columnIndex, func(columns [][]Value, 
levels levels, value reflect.Value) {\n\t\tif !value.IsValid() || value.Len() == 0 {\n\t\t\tdeconstruct(columns, levels, reflect.Value{})\n\t\t\treturn\n\t\t}\n\n\t\tlevels.repetitionDepth++\n\t\tlevels.definitionLevel++\n\n\t\tfor i, n := 0, value.Len(); i < n; i++ {\n\t\t\tdeconstruct(columns, levels, value.Index(i))\n\t\t\tlevels.repetitionLevel = levels.repetitionDepth\n\t\t}\n\t}\n}\n\nfunc deconstructFuncOfRequired(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tswitch {\n\tcase node.Leaf():\n\t\treturn deconstructFuncOfLeaf(columnIndex, node)\n\tdefault:\n\t\treturn deconstructFuncOfGroup(columnIndex, node)\n\t}\n}\n\nfunc deconstructFuncOfList(columnIndex int16, node Node) (int16, deconstructFunc) {\n\treturn deconstructFuncOf(columnIndex, Repeated(listElementOf(node)))\n}\n\n//go:noinline\nfunc deconstructFuncOfMap(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tkeyValue := mapKeyValueOf(node)\n\tkeyValueType := keyValue.GoType()\n\tkeyValueElem := keyValueType.Elem()\n\tkeyType := keyValueElem.Field(0).Type\n\tvalueType := keyValueElem.Field(1).Type\n\tnextColumnIndex, deconstruct := deconstructFuncOf(columnIndex, schemaOf(keyValueElem))\n\treturn nextColumnIndex, func(columns [][]Value, levels levels, mapValue reflect.Value) {\n\t\tif !mapValue.IsValid() || mapValue.Len() == 0 {\n\t\t\tdeconstruct(columns, levels, reflect.Value{})\n\t\t\treturn\n\t\t}\n\n\t\tlevels.repetitionDepth++\n\t\tlevels.definitionLevel++\n\n\t\telem := reflect.New(keyValueElem).Elem()\n\t\tk := elem.Field(0)\n\t\tv := elem.Field(1)\n\n\t\tfor _, key := range mapValue.MapKeys() {\n\t\t\tk.Set(key.Convert(keyType))\n\t\t\tv.Set(mapValue.MapIndex(key).Convert(valueType))\n\t\t\tdeconstruct(columns, levels, elem)\n\t\t\tlevels.repetitionLevel = levels.repetitionDepth\n\t\t}\n\t}\n}\n\n//go:noinline\nfunc deconstructFuncOfGroup(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tfields := node.Fields()\n\tfuncs := make([]deconstructFunc, 
len(fields))\n\tfor i, field := range fields {\n\t\tcolumnIndex, funcs[i] = deconstructFuncOf(columnIndex, field)\n\t}\n\treturn columnIndex, func(columns [][]Value, levels levels, value reflect.Value) {\n\t\tif value.IsValid() {\n\t\t\tfor i, f := range funcs {\n\t\t\t\tf(columns, levels, fields[i].Value(value))\n\t\t\t}\n\t\t} else {\n\t\t\tfor _, f := range funcs {\n\t\t\t\tf(columns, levels, value)\n\t\t\t}\n\t\t}\n\t}\n}\n\n//go:noinline\nfunc deconstructFuncOfLeaf(columnIndex int16, node Node) (int16, deconstructFunc) {\n\tif columnIndex > MaxColumnIndex {\n\t\tpanic(\"row cannot be deconstructed because it has more than 127 columns\")\n\t}\n\ttyp := node.Type()\n\tkind := typ.Kind()\n\tlt := typ.LogicalType()\n\tvalueColumnIndex := ^columnIndex\n\treturn columnIndex + 1, func(columns [][]Value, levels levels, value reflect.Value) {\n\t\tv := Value{}\n\n\t\tif value.IsValid() {\n\t\t\tv = makeValue(kind, lt, value)\n\t\t}\n\n\t\tv.repetitionLevel = levels.repetitionLevel\n\t\tv.definitionLevel = levels.definitionLevel\n\t\tv.columnIndex = valueColumnIndex\n\n\t\tcolumns[columnIndex] = append(columns[columnIndex], v)\n\t}\n}\n\n// \"reconstructX\" turns a Go value into a Go representation of a Parquet series\n// of values\n\ntype reconstructFunc func(reflect.Value, levels, [][]Value) error\n\nfunc reconstructFuncOf(columnIndex int16, node Node) (int16, reconstructFunc) {\n\tswitch {\n\tcase node.Optional():\n\t\treturn reconstructFuncOfOptional(columnIndex, node)\n\tcase node.Repeated():\n\t\treturn reconstructFuncOfRepeated(columnIndex, node)\n\tcase isList(node):\n\t\treturn reconstructFuncOfList(columnIndex, node)\n\tcase isMap(node):\n\t\treturn reconstructFuncOfMap(columnIndex, node)\n\tdefault:\n\t\treturn reconstructFuncOfRequired(columnIndex, node)\n\t}\n}\n\n//go:noinline\nfunc reconstructFuncOfOptional(columnIndex int16, node Node) (int16, reconstructFunc) {\n\t// We convert the optional func to required so that we eventually reach the\n\t// leaf 
base-case.  We're still using the heuristics of optional in the\n\t// returned closure (see levels.definitionLevel++), but we don't actually do\n\t// deserialization here, that happens in the leaf function, hence this line.\n\tnextColumnIndex, reconstruct := reconstructFuncOf(columnIndex, Required(node))\n\n\treturn nextColumnIndex, func(value reflect.Value, levels levels, columns [][]Value) error {\n\t\tlevels.definitionLevel++\n\n\t\tif columns[0][0].definitionLevel < levels.definitionLevel {\n\t\t\tvalue.Set(reflect.Zero(value.Type()))\n\t\t\treturn nil\n\t\t}\n\n\t\tif value.Kind() == reflect.Ptr {\n\t\t\tif value.IsNil() {\n\t\t\t\tvalue.Set(reflect.New(value.Type().Elem()))\n\t\t\t}\n\t\t\tvalue = value.Elem()\n\t\t}\n\n\t\treturn reconstruct(value, levels, columns)\n\t}\n}\n\nfunc setMakeSlice(v reflect.Value, n int) reflect.Value {\n\tt := v.Type()\n\tif t.Kind() == reflect.Interface {\n\t\tt = reflect.TypeOf(([]interface{})(nil))\n\t}\n\ts := reflect.MakeSlice(t, n, n)\n\tv.Set(s)\n\treturn s\n}\n\n//go:noinline\nfunc reconstructFuncOfRepeated(columnIndex int16, node Node) (int16, reconstructFunc) {\n\tnextColumnIndex, reconstruct := reconstructFuncOf(columnIndex, Required(node))\n\treturn nextColumnIndex, func(value reflect.Value, levels levels, columns [][]Value) error {\n\t\tlevels.repetitionDepth++\n\t\tlevels.definitionLevel++\n\n\t\tif columns[0][0].definitionLevel < levels.definitionLevel {\n\t\t\tsetMakeSlice(value, 0)\n\t\t\treturn nil\n\t\t}\n\n\t\tvalues := make([][]Value, len(columns))\n\t\tcolumn := columns[0]\n\t\tn := 0\n\n\t\tfor i, column := range columns {\n\t\t\tvalues[i] = column[0:0:len(column)]\n\t\t}\n\n\t\tfor i := 0; i < len(column); {\n\t\t\ti++\n\t\t\tn++\n\n\t\t\tfor i < len(column) && column[i].repetitionLevel > levels.repetitionDepth {\n\t\t\t\ti++\n\t\t\t}\n\t\t}\n\n\t\tvalue = setMakeSlice(value, n)\n\n\t\tfor i := 0; i < n; i++ {\n\t\t\tfor j, column := range values {\n\t\t\t\tcolumn = column[:cap(column)]\n\t\t\t\tif 
len(column) == 0 {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\n\t\t\t\tk := 1\n\t\t\t\tfor k < len(column) && column[k].repetitionLevel > levels.repetitionDepth {\n\t\t\t\t\tk++\n\t\t\t\t}\n\n\t\t\t\tvalues[j] = column[:k]\n\t\t\t}\n\n\t\t\tif err := reconstruct(value.Index(i), levels, values); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\n\t\t\tfor j, column := range values {\n\t\t\t\tvalues[j] = column[len(column):len(column):cap(column)]\n\t\t\t}\n\n\t\t\tlevels.repetitionLevel = levels.repetitionDepth\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\nfunc reconstructFuncOfRequired(columnIndex int16, node Node) (int16, reconstructFunc) {\n\tswitch {\n\tcase node.Leaf():\n\t\treturn reconstructFuncOfLeaf(columnIndex, node)\n\tdefault:\n\t\treturn reconstructFuncOfGroup(columnIndex, node)\n\t}\n}\n\nfunc reconstructFuncOfList(columnIndex int16, node Node) (int16, reconstructFunc) {\n\treturn reconstructFuncOf(columnIndex, Repeated(listElementOf(node)))\n}\n\n//go:noinline\nfunc reconstructFuncOfMap(columnIndex int16, node Node) (int16, reconstructFunc) {\n\tkeyValue := mapKeyValueOf(node)\n\tkeyValueType := keyValue.GoType()\n\tkeyValueElem := keyValueType.Elem()\n\tkeyValueZero := reflect.Zero(keyValueElem)\n\tnextColumnIndex, reconstruct := reconstructFuncOf(columnIndex, schemaOf(keyValueElem))\n\treturn nextColumnIndex, func(value reflect.Value, levels levels, columns [][]Value) error {\n\t\tlevels.repetitionDepth++\n\t\tlevels.definitionLevel++\n\n\t\tif columns[0][0].definitionLevel < levels.definitionLevel {\n\t\t\tvalue.Set(reflect.MakeMap(value.Type()))\n\t\t\treturn nil\n\t\t}\n\n\t\tvalues := make([][]Value, len(columns))\n\t\tcolumn := columns[0]\n\t\tt := value.Type()\n\t\tk := t.Key()\n\t\tv := t.Elem()\n\t\tn := 0\n\n\t\tfor i, column := range columns {\n\t\t\tvalues[i] = column[0:0:len(column)]\n\t\t}\n\n\t\tfor i := 0; i < len(column); {\n\t\t\ti++\n\t\t\tn++\n\n\t\t\tfor i < len(column) && column[i].repetitionLevel > levels.repetitionDepth 
{\n\t\t\t\ti++\n\t\t\t}\n\t\t}\n\n\t\tif value.IsNil() {\n\t\t\tvalue.Set(reflect.MakeMapWithSize(t, n))\n\t\t}\n\n\t\telem := reflect.New(keyValueElem).Elem()\n\t\tfor i := 0; i < n; i++ {\n\t\t\tfor j, column := range values {\n\t\t\t\tcolumn = column[:cap(column)]\n\t\t\t\tk := 1\n\n\t\t\t\tfor k < len(column) && column[k].repetitionLevel > levels.repetitionDepth {\n\t\t\t\t\tk++\n\t\t\t\t}\n\n\t\t\t\tvalues[j] = column[:k]\n\t\t\t}\n\n\t\t\tif err := reconstruct(elem, levels, values); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\n\t\t\tfor j, column := range values {\n\t\t\t\tvalues[j] = column[len(column):len(column):cap(column)]\n\t\t\t}\n\n\t\t\tvalue.SetMapIndex(elem.Field(0).Convert(k), elem.Field(1).Convert(v))\n\t\t\telem.Set(keyValueZero)\n\t\t\tlevels.repetitionLevel = levels.repetitionDepth\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\n//go:noinline\nfunc reconstructFuncOfGroup(columnIndex int16, node Node) (int16, reconstructFunc) {\n\tfields := node.Fields()\n\tfuncs := make([]reconstructFunc, len(fields))\n\tcolumnOffsets := make([]int16, len(fields))\n\tfirstColumnIndex := columnIndex\n\n\tfor i, field := range fields {\n\t\tcolumnIndex, funcs[i] = reconstructFuncOf(columnIndex, field)\n\t\tcolumnOffsets[i] = columnIndex - firstColumnIndex\n\t}\n\n\treturn columnIndex, func(value reflect.Value, levels levels, columns [][]Value) error {\n\t\tif value.Kind() == reflect.Interface {\n\t\t\tvalue.Set(reflect.MakeMap(reflect.TypeOf((map[string]interface{})(nil))))\n\t\t\tvalue = value.Elem()\n\t\t}\n\n\t\tif value.Kind() == reflect.Map {\n\t\t\telemType := value.Type().Elem()\n\t\t\tname := reflect.New(reflect.TypeOf(\"\")).Elem()\n\t\t\telem := reflect.New(elemType).Elem()\n\t\t\tzero := reflect.Zero(elemType)\n\n\t\t\tif value.Len() > 0 {\n\t\t\t\tvalue.Set(reflect.MakeMap(value.Type()))\n\t\t\t}\n\n\t\t\toff := int16(0)\n\n\t\t\tfor i, f := range funcs {\n\t\t\t\tname.SetString(fields[i].Name())\n\t\t\t\tend := columnOffsets[i]\n\t\t\t\terr := f(elem, levels, 
columns[off:end:end])\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"%s → %w\", name, err)\n\t\t\t\t}\n\t\t\t\toff = end\n\t\t\t\tvalue.SetMapIndex(name, elem)\n\t\t\t\telem.Set(zero)\n\t\t\t}\n\t\t} else {\n\t\t\toff := int16(0)\n\n\t\t\tfor i, f := range funcs {\n\t\t\t\tend := columnOffsets[i]\n\t\t\t\terr := f(fields[i].Value(value), levels, columns[off:end:end])\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"%s → %w\", fields[i].Name(), err)\n\t\t\t\t}\n\t\t\t\toff = end\n\t\t\t}\n\t\t}\n\n\t\treturn nil\n\t}\n}\n\n//go:noinline\nfunc reconstructFuncOfLeaf(columnIndex int16, node Node) (int16, reconstructFunc) {\n\ttyp := node.Type()\n\treturn columnIndex + 1, func(value reflect.Value, _ levels, columns [][]Value) error {\n\t\tcolumn := columns[0]\n\t\tif len(column) == 0 {\n\t\t\treturn fmt.Errorf(\"no values found in parquet row for column %d\", columnIndex)\n\t\t}\n\t\treturn typ.AssignValue(value, column[0])\n\t}\n}\n"
  },
  {
    "path": "row_buffer.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"io\"\n\t\"sort\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n)\n\n// RowBuffer is an implementation of the RowGroup interface which stores parquet\n// rows in memory.\n//\n// Unlike GenericBuffer which uses a column layout to store values in memory\n// buffers, RowBuffer uses a row layout. The use of row layout provides greater\n// efficiency when sorting the buffer, which is the primary use case for the\n// RowBuffer type. Applications which intend to sort rows prior to writing them\n// to a parquet file will often see lower CPU utilization from using a RowBuffer\n// than a GenericBuffer.\n//\n// RowBuffer values are not safe to use concurrently from multiple goroutines.\ntype RowBuffer[T any] struct {\n\talloc   rowAllocator\n\tschema  *Schema\n\tsorting []SortingColumn\n\trows    []Row\n\tvalues  []Value\n\tcompare func(Row, Row) int\n}\n\n// NewRowBuffer constructs a new row buffer.\nfunc NewRowBuffer[T any](options ...RowGroupOption) *RowBuffer[T] {\n\tconfig := DefaultRowGroupConfig()\n\tconfig.Apply(options...)\n\tif err := config.Validate(); err != nil {\n\t\tpanic(err)\n\t}\n\n\tt := typeOf[T]()\n\tif config.Schema == nil && t != nil {\n\t\tconfig.Schema = schemaOf(dereference(t))\n\t}\n\n\tif config.Schema == nil {\n\t\tpanic(\"row buffer must be instantiated with schema or concrete type.\")\n\t}\n\n\treturn &RowBuffer[T]{\n\t\tschema:  config.Schema,\n\t\tsorting: config.Sorting.SortingColumns,\n\t\tcompare: config.Schema.Comparator(config.Sorting.SortingColumns...),\n\t}\n}\n\n// Reset clears the content of the buffer without releasing its memory.\nfunc (buf *RowBuffer[T]) Reset() {\n\tfor i := range buf.rows {\n\t\tbuf.rows[i] = nil\n\t}\n\tfor i := range buf.values {\n\t\tbuf.values[i] = Value{}\n\t}\n\tbuf.rows = buf.rows[:0]\n\tbuf.values = buf.values[:0]\n\tbuf.alloc.reset()\n}\n\n// NumRows returns the number of rows 
currently written to the buffer.\nfunc (buf *RowBuffer[T]) NumRows() int64 { return int64(len(buf.rows)) }\n\n// ColumnChunks returns a view of the buffer's columns.\n//\n// Note that reading columns of a RowBuffer will be less efficient than reading\n// columns of a GenericBuffer since the latter uses a column layout. This method\n// is mainly exposed to satisfy the RowGroup interface, applications which need\n// compute-efficient column scans on in-memory buffers should likely use a\n// GenericBuffer instead.\n//\n// The returned column chunks are snapshots at the time the method is called,\n// they remain valid until the next call to Reset on the buffer.\nfunc (buf *RowBuffer[T]) ColumnChunks() []ColumnChunk {\n\tcolumns := buf.schema.Columns()\n\tchunks := make([]rowBufferColumnChunk, len(columns))\n\n\tfor i, column := range columns {\n\t\tleafColumn, _ := buf.schema.Lookup(column...)\n\t\tchunks[i] = rowBufferColumnChunk{\n\t\t\tpage: rowBufferPage{\n\t\t\t\trows:               buf.rows,\n\t\t\t\ttyp:                leafColumn.Node.Type(),\n\t\t\t\tcolumn:             leafColumn.ColumnIndex,\n\t\t\t\tmaxRepetitionLevel: byte(leafColumn.MaxRepetitionLevel),\n\t\t\t\tmaxDefinitionLevel: byte(leafColumn.MaxDefinitionLevel),\n\t\t\t},\n\t\t}\n\t}\n\n\tcolumnChunks := make([]ColumnChunk, len(chunks))\n\tfor i := range chunks {\n\t\tcolumnChunks[i] = &chunks[i]\n\t}\n\treturn columnChunks\n}\n\n// SortingColumns returns the list of columns that rows are expected to be\n// sorted by.\n//\n// The list of sorting columns is configured when the buffer is created and used\n// when it is sorted.\n//\n// Note that unless the buffer is explicitly sorted, there are no guarantees\n// that the rows it contains will be in the order specified by the sorting\n// columns.\nfunc (buf *RowBuffer[T]) SortingColumns() []SortingColumn { return buf.sorting }\n\n// Schema returns the schema of rows in the buffer.\nfunc (buf *RowBuffer[T]) Schema() *Schema { return buf.schema }\n\n// Len 
returns the number of rows in the buffer.\n//\n// The method contributes to satisfying sort.Interface.\nfunc (buf *RowBuffer[T]) Len() int { return len(buf.rows) }\n\n// Less compares the rows at index i and j according to the sorting columns\n// configured on the buffer.\n//\n// The method contributes to satisfying sort.Interface.\nfunc (buf *RowBuffer[T]) Less(i, j int) bool {\n\treturn buf.compare(buf.rows[i], buf.rows[j]) < 0\n}\n\n// Swap exchanges the rows at index i and j in the buffer.\n//\n// The method contributes to satisfying sort.Interface.\nfunc (buf *RowBuffer[T]) Swap(i, j int) {\n\tbuf.rows[i], buf.rows[j] = buf.rows[j], buf.rows[i]\n}\n\n// Rows returns a Rows instance exposing rows stored in the buffer.\n//\n// The rows returned are a snapshot at the time the method is called.\n// The returned rows and values read from it remain valid until the next call\n// to Reset on the buffer.\nfunc (buf *RowBuffer[T]) Rows() Rows {\n\treturn &rowBufferRows{rows: buf.rows, schema: buf.schema}\n}\n\n// Write writes rows to the buffer, returning the number of rows written.\nfunc (buf *RowBuffer[T]) Write(rows []T) (int, error) {\n\tfor i := range rows {\n\t\toff := len(buf.values)\n\t\tbuf.values = buf.schema.Deconstruct(buf.values, &rows[i])\n\t\tend := len(buf.values)\n\t\trow := buf.values[off:end:end]\n\t\tbuf.alloc.capture(row)\n\t\tbuf.rows = append(buf.rows, row)\n\t}\n\treturn len(rows), nil\n}\n\n// WriteRows writes parquet rows to the buffer, returing the number of rows\n// written.\nfunc (buf *RowBuffer[T]) WriteRows(rows []Row) (int, error) {\n\tfor i := range rows {\n\t\toff := len(buf.values)\n\t\tbuf.values = append(buf.values, rows[i]...)\n\t\tend := len(buf.values)\n\t\trow := buf.values[off:end:end]\n\t\tbuf.alloc.capture(row)\n\t\tbuf.rows = append(buf.rows, row)\n\t}\n\treturn len(rows), nil\n}\n\ntype rowBufferColumnChunk struct{ page rowBufferPage }\n\nfunc (c *rowBufferColumnChunk) Type() Type { return c.page.Type() }\n\nfunc (c 
*rowBufferColumnChunk) Column() int { return c.page.Column() }\n\nfunc (c *rowBufferColumnChunk) Pages() Pages { return onePage(&c.page) }\n\nfunc (c *rowBufferColumnChunk) ColumnIndex() ColumnIndex { return nil }\n\nfunc (c *rowBufferColumnChunk) OffsetIndex() OffsetIndex { return nil }\n\nfunc (c *rowBufferColumnChunk) BloomFilter() BloomFilter { return nil }\n\nfunc (c *rowBufferColumnChunk) NumValues() int64 { return c.page.NumValues() }\n\ntype rowBufferPage struct {\n\trows               []Row\n\ttyp                Type\n\tcolumn             int\n\tmaxRepetitionLevel byte\n\tmaxDefinitionLevel byte\n}\n\nfunc (p *rowBufferPage) Type() Type { return p.typ }\n\nfunc (p *rowBufferPage) Column() int { return p.column }\n\nfunc (p *rowBufferPage) Dictionary() Dictionary { return nil }\n\nfunc (p *rowBufferPage) NumRows() int64 { return int64(len(p.rows)) }\n\nfunc (p *rowBufferPage) NumValues() int64 {\n\tnumValues := int64(0)\n\tp.scan(func(value Value) {\n\t\tif !value.isNull() {\n\t\t\tnumValues++\n\t\t}\n\t})\n\treturn numValues\n}\n\nfunc (p *rowBufferPage) NumNulls() int64 {\n\tnumNulls := int64(0)\n\tp.scan(func(value Value) {\n\t\tif value.isNull() {\n\t\t\tnumNulls++\n\t\t}\n\t})\n\treturn numNulls\n}\n\nfunc (p *rowBufferPage) Bounds() (min, max Value, ok bool) {\n\tp.scan(func(value Value) {\n\t\tif !value.IsNull() {\n\t\t\tswitch {\n\t\t\tcase !ok:\n\t\t\t\tmin, max, ok = value, value, true\n\t\t\tcase p.typ.Compare(value, min) < 0:\n\t\t\t\tmin = value\n\t\t\tcase p.typ.Compare(value, max) > 0:\n\t\t\t\tmax = value\n\t\t\t}\n\t\t}\n\t})\n\treturn min, max, ok\n}\n\nfunc (p *rowBufferPage) Size() int64 { return 0 }\n\nfunc (p *rowBufferPage) Values() ValueReader {\n\treturn &rowBufferPageValueReader{\n\t\tpage:        p,\n\t\tcolumnIndex: ^int16(p.column),\n\t}\n}\n\nfunc (p *rowBufferPage) Clone() Page {\n\trows := make([]Row, len(p.rows))\n\tfor i := range rows {\n\t\trows[i] = p.rows[i].Clone()\n\t}\n\treturn &rowBufferPage{\n\t\trows:   
rows,\n\t\ttyp:    p.typ,\n\t\tcolumn: p.column,\n\t}\n}\n\nfunc (p *rowBufferPage) Slice(i, j int64) Page {\n\treturn &rowBufferPage{\n\t\trows:   p.rows[i:j],\n\t\ttyp:    p.typ,\n\t\tcolumn: p.column,\n\t}\n}\n\nfunc (p *rowBufferPage) RepetitionLevels() (repetitionLevels []byte) {\n\tif p.maxRepetitionLevel != 0 {\n\t\trepetitionLevels = make([]byte, 0, len(p.rows))\n\t\tp.scan(func(value Value) {\n\t\t\trepetitionLevels = append(repetitionLevels, value.repetitionLevel)\n\t\t})\n\t}\n\treturn repetitionLevels\n}\n\nfunc (p *rowBufferPage) DefinitionLevels() (definitionLevels []byte) {\n\tif p.maxDefinitionLevel != 0 {\n\t\tdefinitionLevels = make([]byte, 0, len(p.rows))\n\t\tp.scan(func(value Value) {\n\t\t\tdefinitionLevels = append(definitionLevels, value.definitionLevel)\n\t\t})\n\t}\n\treturn definitionLevels\n}\n\nfunc (p *rowBufferPage) Data() encoding.Values {\n\tswitch p.typ.Kind() {\n\tcase Boolean:\n\t\tvalues := make([]byte, (len(p.rows)+7)/8)\n\t\tnumValues := 0\n\t\tp.scanNonNull(func(value Value) {\n\t\t\tif value.boolean() {\n\t\t\t\ti := uint(numValues) / 8\n\t\t\t\tj := uint(numValues) % 8\n\t\t\t\tvalues[i] |= 1 << j\n\t\t\t}\n\t\t\tnumValues++\n\t\t})\n\t\treturn encoding.BooleanValues(values[:(numValues+7)/8])\n\n\tcase Int32:\n\t\tvalues := make([]int32, 0, len(p.rows))\n\t\tp.scanNonNull(func(value Value) { values = append(values, value.int32()) })\n\t\treturn encoding.Int32Values(values)\n\n\tcase Int64:\n\t\tvalues := make([]int64, 0, len(p.rows))\n\t\tp.scanNonNull(func(value Value) { values = append(values, value.int64()) })\n\t\treturn encoding.Int64Values(values)\n\n\tcase Int96:\n\t\tvalues := make([]deprecated.Int96, 0, len(p.rows))\n\t\tp.scanNonNull(func(value Value) { values = append(values, value.int96()) })\n\t\treturn encoding.Int96Values(values)\n\n\tcase Float:\n\t\tvalues := make([]float32, 0, len(p.rows))\n\t\tp.scanNonNull(func(value Value) { values = append(values, value.float()) })\n\t\treturn 
encoding.FloatValues(values)\n\n\tcase Double:\n\t\tvalues := make([]float64, 0, len(p.rows))\n\t\tp.scanNonNull(func(value Value) { values = append(values, value.double()) })\n\t\treturn encoding.DoubleValues(values)\n\n\tcase ByteArray:\n\t\tvalues := make([]byte, 0, p.typ.EstimateSize(len(p.rows)))\n\t\toffsets := make([]uint32, 0, len(p.rows))\n\t\tp.scanNonNull(func(value Value) {\n\t\t\toffsets = append(offsets, uint32(len(values)))\n\t\t\tvalues = append(values, value.byteArray()...)\n\t\t})\n\t\toffsets = append(offsets, uint32(len(values)))\n\t\treturn encoding.ByteArrayValues(values, offsets)\n\n\tcase FixedLenByteArray:\n\t\tlength := p.typ.Length()\n\t\tvalues := make([]byte, 0, length*len(p.rows))\n\t\tp.scanNonNull(func(value Value) { values = append(values, value.byteArray()...) })\n\t\treturn encoding.FixedLenByteArrayValues(values, length)\n\n\tdefault:\n\t\treturn encoding.Values{}\n\t}\n}\n\nfunc (p *rowBufferPage) scan(f func(Value)) {\n\tcolumnIndex := ^int16(p.column)\n\n\tfor _, row := range p.rows {\n\t\tfor _, value := range row {\n\t\t\tif value.columnIndex == columnIndex {\n\t\t\t\tf(value)\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc (p *rowBufferPage) scanNonNull(f func(Value)) {\n\tp.scan(func(value Value) {\n\t\tif !value.isNull() {\n\t\t\tf(value)\n\t\t}\n\t})\n}\n\ntype rowBufferPageValueReader struct {\n\tpage        *rowBufferPage\n\trowIndex    int\n\tvalueIndex  int\n\tcolumnIndex int16\n}\n\nfunc (r *rowBufferPageValueReader) ReadValues(values []Value) (n int, err error) {\n\tfor n < len(values) && r.rowIndex < len(r.page.rows) {\n\t\tfor n < len(values) && r.valueIndex < len(r.page.rows[r.rowIndex]) {\n\t\t\tif v := r.page.rows[r.rowIndex][r.valueIndex]; v.columnIndex == r.columnIndex {\n\t\t\t\tvalues[n] = v\n\t\t\t\tn++\n\t\t\t}\n\t\t\tr.valueIndex++\n\t\t}\n\t\tr.rowIndex++\n\t\tr.valueIndex = 0\n\t}\n\tif r.rowIndex == len(r.page.rows) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n\ntype rowBufferRows struct {\n\trows   []Row\n\tindex  
int\n\tschema *Schema\n}\n\nfunc (r *rowBufferRows) Close() error {\n\tr.index = -1\n\treturn nil\n}\n\nfunc (r *rowBufferRows) Schema() *Schema {\n\treturn r.schema\n}\n\nfunc (r *rowBufferRows) SeekToRow(rowIndex int64) error {\n\tif rowIndex < 0 {\n\t\treturn ErrSeekOutOfRange\n\t}\n\n\tif r.index < 0 {\n\t\treturn io.ErrClosedPipe\n\t}\n\n\tmaxRowIndex := int64(len(r.rows))\n\tif rowIndex > maxRowIndex {\n\t\trowIndex = maxRowIndex\n\t}\n\n\tr.index = int(rowIndex)\n\treturn nil\n}\n\nfunc (r *rowBufferRows) ReadRows(rows []Row) (n int, err error) {\n\tif r.index < 0 {\n\t\treturn 0, io.EOF\n\t}\n\n\tif n = len(r.rows) - r.index; n > len(rows) {\n\t\tn = len(rows)\n\t}\n\n\tfor i, row := range r.rows[r.index : r.index+n] {\n\t\trows[i] = append(rows[i][:0], row...)\n\t}\n\n\tif r.index += n; r.index == len(r.rows) {\n\t\terr = io.EOF\n\t}\n\n\treturn n, err\n}\n\nfunc (r *rowBufferRows) WriteRowsTo(w RowWriter) (int64, error) {\n\tn, err := w.WriteRows(r.rows[r.index:])\n\tr.index += n\n\treturn int64(n), err\n}\n\nvar (\n\t_ RowGroup       = (*RowBuffer[any])(nil)\n\t_ RowWriter      = (*RowBuffer[any])(nil)\n\t_ sort.Interface = (*RowBuffer[any])(nil)\n\n\t_ RowWriterTo = (*rowBufferRows)(nil)\n)\n"
  },
  {
    "path": "row_buffer_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"bytes\"\n\t\"encoding/binary\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n)\n\nfunc TestRowBuffer(t *testing.T) {\n\ttestRowBuffer[booleanColumn](t)\n\ttestRowBuffer[int32Column](t)\n\ttestRowBuffer[int64Column](t)\n\ttestRowBuffer[int96Column](t)\n\ttestRowBuffer[floatColumn](t)\n\ttestRowBuffer[doubleColumn](t)\n\ttestRowBuffer[byteArrayColumn](t)\n\ttestRowBuffer[fixedLenByteArrayColumn](t)\n\ttestRowBuffer[stringColumn](t)\n\ttestRowBuffer[indexedStringColumn](t)\n\ttestRowBuffer[uuidColumn](t)\n\ttestRowBuffer[timeColumn](t)\n\ttestRowBuffer[timeInMillisColumn](t)\n\ttestRowBuffer[mapColumn](t)\n\ttestRowBuffer[decimalColumn](t)\n\ttestRowBuffer[addressBook](t)\n\ttestRowBuffer[contact](t)\n\ttestRowBuffer[listColumn2](t)\n\ttestRowBuffer[listColumn1](t)\n\ttestRowBuffer[listColumn0](t)\n\ttestRowBuffer[nestedListColumn1](t)\n\ttestRowBuffer[nestedListColumn](t)\n\ttestRowBuffer[*contact](t)\n\ttestRowBuffer[paddedBooleanColumn](t)\n\ttestRowBuffer[optionalInt32Column](t)\n\ttestRowBuffer[repeatedInt32Column](t)\n\n\tfor _, test := range bufferTests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tfor _, mod := range [...]struct {\n\t\t\t\tscenario string\n\t\t\t\tfunction func(parquet.Node) parquet.Node\n\t\t\t}{\n\t\t\t\t{scenario: \"optional\", function: parquet.Optional},\n\t\t\t\t{scenario: \"repeated\", function: parquet.Repeated},\n\t\t\t\t{scenario: \"required\", function: parquet.Required},\n\t\t\t} {\n\t\t\t\tt.Run(mod.scenario, func(t *testing.T) {\n\t\t\t\t\tfor _, ordering := range [...]struct {\n\t\t\t\t\t\tscenario string\n\t\t\t\t\t\tsorting  parquet.SortingColumn\n\t\t\t\t\t\tsortFunc func(parquet.Type, []parquet.Value)\n\t\t\t\t\t}{\n\t\t\t\t\t\t{scenario: \"unordered\", sorting: nil, sortFunc: 
unordered},\n\t\t\t\t\t\t{scenario: \"ascending\", sorting: parquet.Ascending(\"data\"), sortFunc: ascending},\n\t\t\t\t\t\t{scenario: \"descending\", sorting: parquet.Descending(\"data\"), sortFunc: descending},\n\t\t\t\t\t} {\n\t\t\t\t\t\tt.Run(ordering.scenario, func(t *testing.T) {\n\t\t\t\t\t\t\tschema := parquet.NewSchema(\"test\", parquet.Group{\n\t\t\t\t\t\t\t\t\"data\": mod.function(parquet.Leaf(test.typ)),\n\t\t\t\t\t\t\t})\n\n\t\t\t\t\t\t\toptions := []parquet.RowGroupOption{\n\t\t\t\t\t\t\t\tschema,\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tif ordering.sorting != nil {\n\t\t\t\t\t\t\t\toptions = append(options,\n\t\t\t\t\t\t\t\t\tparquet.SortingRowGroupConfig(\n\t\t\t\t\t\t\t\t\t\tparquet.SortingColumns(ordering.sorting),\n\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tcontent := new(bytes.Buffer)\n\t\t\t\t\t\t\tbuffer := parquet.NewRowBuffer[any](options...)\n\n\t\t\t\t\t\t\tfor _, values := range test.values {\n\t\t\t\t\t\t\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\t\t\t\t\t\t\tdefer content.Reset()\n\t\t\t\t\t\t\t\t\tdefer buffer.Reset()\n\t\t\t\t\t\t\t\t\tfields := schema.Fields()\n\t\t\t\t\t\t\t\t\ttestRowBufferAny(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc)\n\t\t\t\t\t\t\t\t})\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t})\n\t\t\t\t\t}\n\t\t\t\t})\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc testRowBuffer[Row any](t *testing.T) {\n\tvar model Row\n\tt.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {\n\t\terr := quickCheck(func(rows []Row) bool {\n\t\t\tif len(rows) == 0 {\n\t\t\t\treturn true // TODO: fix support for parquet files with zero rows\n\t\t\t}\n\t\t\tif err := testRowBufferRows(rows); err != nil {\n\t\t\t\tt.Error(err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\treturn true\n\t\t})\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t}\n\t})\n}\n\nfunc testRowBufferRows[Row any](rows []Row) error {\n\tsetNullPointers(rows)\n\tbuffer := parquet.NewRowBuffer[Row]()\n\t_, err := buffer.Write(rows)\n\tif err != nil {\n\t\treturn 
err\n\t}\n\treader := parquet.NewGenericRowGroupReader[Row](buffer)\n\tresult := make([]Row, len(rows))\n\tn, err := reader.Read(result)\n\tif err != nil && !errors.Is(err, io.EOF) {\n\t\treturn err\n\t}\n\tif n < len(rows) {\n\t\treturn fmt.Errorf(\"not enough values were read: want=%d got=%d\", len(rows), n)\n\t}\n\tif !reflect.DeepEqual(rows, result) {\n\t\treturn fmt.Errorf(\"rows mismatch:\\nwant: %#v\\ngot:  %#v\", rows, result)\n\t}\n\treturn nil\n}\n\nfunc testRowBufferAny(t *testing.T, node parquet.Node, buffer *parquet.RowBuffer[any], encoding encoding.Encoding, values []any, sortFunc sortFunc) {\n\trepetitionLevel := 0\n\tdefinitionLevel := 0\n\tif !node.Required() {\n\t\tdefinitionLevel = 1\n\t}\n\n\tminValue := parquet.Value{}\n\tmaxValue := parquet.Value{}\n\tbatch := make([]parquet.Value, len(values))\n\tfor i := range values {\n\t\tbatch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0)\n\t}\n\n\tfor i := range batch {\n\t\t_, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]})\n\t\tif err != nil {\n\t\t\tt.Fatalf(\"writing value to row group: %v\", err)\n\t\t}\n\t}\n\n\tnumRows := buffer.NumRows()\n\tif numRows != int64(len(batch)) {\n\t\tt.Fatalf(\"number of rows mismatch: want=%d got=%d\", len(batch), numRows)\n\t}\n\n\ttyp := node.Type()\n\tfor _, value := range batch {\n\t\tif minValue.IsNull() || typ.Compare(value, minValue) < 0 {\n\t\t\tminValue = value\n\t\t}\n\t\tif maxValue.IsNull() || typ.Compare(value, maxValue) > 0 {\n\t\t\tmaxValue = value\n\t\t}\n\t}\n\n\tsortFunc(typ, batch)\n\tsort.Sort(buffer)\n\n\tpages := buffer.ColumnChunks()[0].Pages()\n\tpage, err := pages.ReadPage()\n\tdefer pages.Close()\n\n\tif err == io.EOF {\n\t\tif numRows != 0 {\n\t\t\tt.Fatalf(\"no pages found in row buffer despite having %d rows\", numRows)\n\t\t} else {\n\t\t\treturn\n\t\t}\n\t}\n\n\tnumValues := page.NumValues()\n\tif numValues != int64(len(batch)) {\n\t\tt.Fatalf(\"number of values mistmatch: want=%d got=%d\", 
len(batch), numValues)\n\t}\n\n\tnumNulls := page.NumNulls()\n\tif numNulls != 0 {\n\t\tt.Fatalf(\"number of nulls mismatch: want=0 got=%d\", numNulls)\n\t}\n\n\tmin, max, hasBounds := page.Bounds()\n\tif !hasBounds && numRows > 0 {\n\t\tt.Fatal(\"page bounds are missing\")\n\t}\n\tif !parquet.Equal(min, minValue) {\n\t\tt.Fatalf(\"min value mismatch: want=%v got=%v\", minValue, min)\n\t}\n\tif !parquet.Equal(max, maxValue) {\n\t\tt.Fatalf(\"max value mismatch: want=%v got=%v\", maxValue, max)\n\t}\n\n\t// We write a single value per row, so num values = num rows for all pages\n\t// including repeated ones, which makes it OK to slice the pages using the\n\t// number of values as a proxy for the row indexes.\n\thalfValues := numValues / 2\n\n\tfor _, test := range [...]struct {\n\t\tscenario string\n\t\tvalues   []parquet.Value\n\t\treader   parquet.ValueReader\n\t}{\n\t\t{\"page\", batch, page.Values()},\n\t\t{\"head\", batch[:halfValues], page.Slice(0, halfValues).Values()},\n\t\t{\"tail\", batch[halfValues:], page.Slice(halfValues, numValues).Values()},\n\t} {\n\t\tv := [1]parquet.Value{}\n\t\ti := 0\n\n\t\tfor {\n\t\t\tn, err := test.reader.ReadValues(v[:])\n\t\t\tif n > 0 {\n\t\t\t\tif n != 1 {\n\t\t\t\t\tt.Fatalf(\"reading value from %q reader returned the wrong count: want=1 got=%d\", test.scenario, n)\n\t\t\t\t}\n\t\t\t\tif i < len(test.values) {\n\t\t\t\t\tif !parquet.Equal(v[0], test.values[i]) {\n\t\t\t\t\t\tt.Fatalf(\"%q value at index %d mismatches: want=%v got=%v\", test.scenario, i, test.values[i], v[0])\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\ti++\n\t\t\t}\n\t\t\tif err != nil {\n\t\t\t\tif err == io.EOF {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t\tt.Fatalf(\"reading value from %q reader: %v\", test.scenario, err)\n\t\t\t}\n\t\t}\n\n\t\tif i != len(test.values) {\n\t\t\tt.Errorf(\"wrong number of values read from %q reader: want=%d got=%d\", test.scenario, len(test.values), i)\n\t\t}\n\t}\n}\n\nfunc BenchmarkSortRowBuffer(b *testing.B) {\n\ttype Row struct 
{\n\t\tI0 int64\n\t\tI1 int64\n\t\tI2 int64\n\t\tI3 int64\n\t\tI4 int64\n\t\tI5 int64\n\t\tI6 int64\n\t\tI7 int64\n\t\tI8 int64\n\t\tI9 int64\n\t\tID [16]byte\n\t}\n\n\tbuf := parquet.NewRowBuffer[Row](\n\t\tparquet.SortingRowGroupConfig(\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(\"ID\"),\n\t\t\t),\n\t\t),\n\t)\n\n\trows := make([]Row, 10e3)\n\tprng := rand.New(rand.NewSource(0))\n\n\tfor i := range rows {\n\t\tbinary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i))\n\t\tbinary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i))\n\t}\n\n\tbuf.Write(rows)\n\tb.ResetTimer()\n\n\tfor i := 0; i < b.N; i++ {\n\t\tfor j := 0; j < 10; j++ {\n\t\t\tbuf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows)))\n\t\t}\n\n\t\tsort.Sort(buf)\n\t}\n}\n\nfunc BenchmarkMergeRowBuffers(b *testing.B) {\n\ttype Row struct {\n\t\tID int64 `parquet:\"id\"`\n\t}\n\n\tconst (\n\t\tnumBuffers       = 100\n\t\tnumRowsPerBuffer = 10e3\n\t)\n\n\trows := [numBuffers][numRowsPerBuffer]Row{}\n\tnextID := int64(0)\n\tfor i := 0; i < numRowsPerBuffer; i++ {\n\t\tfor j := 0; j < numBuffers; j++ {\n\t\t\trows[j][i].ID = nextID\n\t\t\tnextID++\n\t\t}\n\t}\n\n\toptions := []parquet.RowGroupOption{\n\t\tparquet.SortingRowGroupConfig(\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(\"id\"),\n\t\t\t),\n\t\t),\n\t}\n\n\trowGroups := make([]parquet.RowGroup, numBuffers)\n\tfor i := range rowGroups {\n\t\tbuffer := parquet.NewRowBuffer[Row](options...)\n\t\tbuffer.Write(rows[i][:])\n\t\trowGroups[i] = buffer\n\t}\n\n\tmerge, err := parquet.MergeRowGroups(rowGroups, options...)\n\tif err != nil {\n\t\tb.Fatal(err)\n\t}\n\n\tb.ResetTimer()\n\n\tfor i := 0; i < b.N; i++ {\n\t\trows := merge.Rows()\n\t\t_, err := parquet.CopyRows(discardRows{}, rows)\n\t\trows.Close()\n\t\tif err != nil {\n\t\t\tb.Fatal(err)\n\t\t}\n\t}\n}\n\ntype discardRows struct{}\n\nfunc (discardRows) WriteRows(rows []parquet.Row) (int, error) {\n\treturn len(rows), nil\n}\n"
  },
  {
    "path": "row_builder.go",
    "content": "package parquet\n\n// RowBuilder is a type which helps build parquet rows incrementally by adding\n// values to columns.\ntype RowBuilder struct {\n\tcolumns [][]Value\n\tmodels  []Value\n\tlevels  []columnLevel\n\tgroups  []*columnGroup\n}\n\ntype columnLevel struct {\n\trepetitionDepth byte\n\trepetitionLevel byte\n\tdefinitionLevel byte\n}\n\ntype columnGroup struct {\n\tbaseColumn      []Value\n\tmembers         []int16\n\tstartIndex      int16\n\tendIndex        int16\n\trepetitionLevel byte\n\tdefinitionLevel byte\n}\n\n// NewRowBuilder constructs a RowBuilder which builds rows for the parquet\n// schema passed as argument.\nfunc NewRowBuilder(schema Node) *RowBuilder {\n\tif schema.Leaf() {\n\t\tpanic(\"schema of row builder must be a group\")\n\t}\n\tn := numLeafColumnsOf(schema)\n\tb := &RowBuilder{\n\t\tcolumns: make([][]Value, n),\n\t\tmodels:  make([]Value, n),\n\t\tlevels:  make([]columnLevel, n),\n\t}\n\tbuffers := make([]Value, len(b.columns))\n\tfor i := range b.columns {\n\t\tb.columns[i] = buffers[i : i : i+1]\n\t}\n\ttopGroup := &columnGroup{baseColumn: []Value{{}}}\n\tendIndex := b.configure(schema, 0, columnLevel{}, topGroup)\n\ttopGroup.endIndex = endIndex\n\tb.groups = append(b.groups, topGroup)\n\treturn b\n}\n\nfunc (b *RowBuilder) configure(node Node, columnIndex int16, level columnLevel, group *columnGroup) (endIndex int16) {\n\tswitch {\n\tcase node.Optional():\n\t\tlevel.definitionLevel++\n\t\tendIndex = b.configure(Required(node), columnIndex, level, group)\n\n\t\tfor i := columnIndex; i < endIndex; i++ {\n\t\t\tb.models[i].kind = 0 // null if not set\n\t\t\tb.models[i].ptr = nil\n\t\t\tb.models[i].u64 = 0\n\t\t}\n\n\tcase node.Repeated():\n\t\tlevel.definitionLevel++\n\n\t\tgroup = &columnGroup{\n\t\t\tstartIndex:      columnIndex,\n\t\t\trepetitionLevel: level.repetitionDepth,\n\t\t\tdefinitionLevel: level.definitionLevel,\n\t\t}\n\n\t\tlevel.repetitionDepth++\n\t\tendIndex = b.configure(Required(node), columnIndex, 
level, group)\n\n\t\tfor i := columnIndex; i < endIndex; i++ {\n\t\t\tb.models[i].kind = 0 // null if not set\n\t\t\tb.models[i].ptr = nil\n\t\t\tb.models[i].u64 = 0\n\t\t}\n\n\t\tgroup.endIndex = endIndex\n\t\tb.groups = append(b.groups, group)\n\n\tcase node.Leaf():\n\t\ttyp := node.Type()\n\t\tkind := typ.Kind()\n\t\tmodel := makeValueKind(kind)\n\t\tmodel.repetitionLevel = level.repetitionLevel\n\t\tmodel.definitionLevel = level.definitionLevel\n\t\t// FIXED_LEN_BYTE_ARRAY is the only type which needs to be given a\n\t\t// non-nil zero-value if the field is required.\n\t\tif kind == FixedLenByteArray {\n\t\t\tzero := make([]byte, typ.Length())\n\t\t\tmodel.ptr = &zero[0]\n\t\t\tmodel.u64 = uint64(len(zero))\n\t\t}\n\t\tgroup.members = append(group.members, columnIndex)\n\t\tb.models[columnIndex] = model\n\t\tb.levels[columnIndex] = level\n\t\tendIndex = columnIndex + 1\n\n\tdefault:\n\t\tendIndex = columnIndex\n\n\t\tfor _, field := range node.Fields() {\n\t\t\tendIndex = b.configure(field, endIndex, level, group)\n\t\t}\n\t}\n\treturn endIndex\n}\n\n// Add adds columnValue to the column at columnIndex.\nfunc (b *RowBuilder) Add(columnIndex int, columnValue Value) {\n\tlevel := &b.levels[columnIndex]\n\tcolumnValue.repetitionLevel = level.repetitionLevel\n\tcolumnValue.definitionLevel = level.definitionLevel\n\tcolumnValue.columnIndex = ^int16(columnIndex)\n\tlevel.repetitionLevel = level.repetitionDepth\n\tb.columns[columnIndex] = append(b.columns[columnIndex], columnValue)\n}\n\n// Next must be called to indicate the start of a new repeated record for the\n// column at the given index.\n//\n// If the column index is part of a repeated group, the builder automatically\n// starts a new record for all adjacent columns, the application does not need\n// to call this method for each column of the repeated group.\n//\n// Next must be called after adding a sequence of records.\nfunc (b *RowBuilder) Next(columnIndex int) {\n\tfor _, group := range b.groups {\n\t\tif 
group.startIndex <= int16(columnIndex) && int16(columnIndex) < group.endIndex {\n\t\t\tfor i := group.startIndex; i < group.endIndex; i++ {\n\t\t\t\tif level := &b.levels[i]; level.repetitionLevel != 0 {\n\t\t\t\t\tlevel.repetitionLevel = group.repetitionLevel\n\t\t\t\t}\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\t}\n}\n\n// Reset clears the internal state of b, making it possible to reuse while\n// retaining the internal buffers.\nfunc (b *RowBuilder) Reset() {\n\tfor i, column := range b.columns {\n\t\tclearValues(column)\n\t\tb.columns[i] = column[:0]\n\t}\n\tfor i := range b.levels {\n\t\tb.levels[i].repetitionLevel = 0\n\t}\n}\n\n// Row materializes the current state of b into a parquet row.\nfunc (b *RowBuilder) Row() Row {\n\tnumValues := 0\n\tfor _, column := range b.columns {\n\t\tnumValues += len(column)\n\t}\n\treturn b.AppendRow(make(Row, 0, numValues))\n}\n\n// AppendRow appends the current state of b to row and returns it.\nfunc (b *RowBuilder) AppendRow(row Row) Row {\n\tfor _, group := range b.groups {\n\t\tmaxColumn := group.baseColumn\n\n\t\tfor _, columnIndex := range group.members {\n\t\t\tif column := b.columns[columnIndex]; len(column) > len(maxColumn) {\n\t\t\t\tmaxColumn = column\n\t\t\t}\n\t\t}\n\n\t\tif len(maxColumn) != 0 {\n\t\t\tcolumns := b.columns[group.startIndex:group.endIndex]\n\n\t\t\tfor i, column := range columns {\n\t\t\t\tif len(column) < len(maxColumn) {\n\t\t\t\t\tn := len(column)\n\t\t\t\t\tcolumn = append(column, maxColumn[n:]...)\n\n\t\t\t\t\tcolumnIndex := group.startIndex + int16(i)\n\t\t\t\t\tmodel := b.models[columnIndex]\n\n\t\t\t\t\tfor n < len(column) {\n\t\t\t\t\t\tv := &column[n]\n\t\t\t\t\t\tv.kind = model.kind\n\t\t\t\t\t\tv.ptr = model.ptr\n\t\t\t\t\t\tv.u64 = model.u64\n\t\t\t\t\t\tv.definitionLevel = group.definitionLevel\n\t\t\t\t\t\tv.columnIndex = ^columnIndex\n\t\t\t\t\t\tn++\n\t\t\t\t\t}\n\n\t\t\t\t\tcolumns[i] = column\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\treturn appendRow(row, b.columns)\n}\n"
  },
  {
    "path": "row_builder_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc ExampleRowBuilder() {\n\tbuilder := parquet.NewRowBuilder(parquet.Group{\n\t\t\"birth_date\": parquet.Optional(parquet.Date()),\n\t\t\"first_name\": parquet.String(),\n\t\t\"last_name\":  parquet.String(),\n\t})\n\n\tbuilder.Add(1, parquet.ByteArrayValue([]byte(\"Luke\")))\n\tbuilder.Add(2, parquet.ByteArrayValue([]byte(\"Skywalker\")))\n\n\trow := builder.Row()\n\trow.Range(func(columnIndex int, columnValues []parquet.Value) bool {\n\t\tfmt.Printf(\"%+v\\n\", columnValues[0])\n\t\treturn true\n\t})\n\n\t// Output:\n\t// C:0 D:0 R:0 V:<null>\n\t// C:1 D:0 R:0 V:Luke\n\t// C:2 D:0 R:0 V:Skywalker\n}\n\nfunc TestRowBuilder(t *testing.T) {\n\ttype (\n\t\toperation  = func(*parquet.RowBuilder)\n\t\toperations = []operation\n\t)\n\n\tadd := func(columnIndex int, columnValue parquet.Value) operation {\n\t\treturn func(b *parquet.RowBuilder) { b.Add(columnIndex, columnValue) }\n\t}\n\n\tnext := func(columnIndex int) operation {\n\t\treturn func(b *parquet.RowBuilder) { b.Next(columnIndex) }\n\t}\n\n\ttests := []struct {\n\t\tscenario   string\n\t\toperations operations\n\t\twant       parquet.Row\n\t\tschema     parquet.Node\n\t}{\n\t\t{\n\t\t\tscenario: \"add missing required column value\",\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(0).Level(0, 0, 0),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\": parquet.Int(64),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"set required column value\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.Int64Value(1)),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(1).Level(0, 0, 0),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\": parquet.Int(64),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"set repeated column values\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.Int64Value(1)),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`1`))),\n\t\t\t\tadd(1, 
parquet.ByteArrayValue([]byte(`2`))),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`3`))),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(1).Level(0, 0, 0),\n\t\t\t\tparquet.ByteArrayValue([]byte(`1`)).Level(0, 1, 1),\n\t\t\t\tparquet.ByteArrayValue([]byte(`2`)).Level(1, 1, 1),\n\t\t\t\tparquet.ByteArrayValue([]byte(`3`)).Level(1, 1, 1),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\":    parquet.Int(64),\n\t\t\t\t\"names\": parquet.Repeated(parquet.String()),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"add missing repeated column value\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.Int64Value(1)),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(1).Level(0, 0, 0),\n\t\t\t\tparquet.NullValue().Level(0, 0, 1),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\":    parquet.Int(64),\n\t\t\t\t\"names\": parquet.Repeated(parquet.String()),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"add missing optional column value\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.Int64Value(1)),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(1).Level(0, 0, 0),\n\t\t\t\tparquet.NullValue().Level(0, 0, 1),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\":   parquet.Int(64),\n\t\t\t\t\"name\": parquet.Optional(parquet.String()),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"add missing nested column values\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.Int64Value(1)),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(1).Level(0, 0, 0),\n\t\t\t\tparquet.NullValue().Level(0, 0, 1),\n\t\t\t\tparquet.ByteArrayValue(nil).Level(0, 0, 2),\n\t\t\t\tparquet.ByteArrayValue(nil).Level(0, 0, 3),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\": parquet.Int(64),\n\t\t\t\t\"profile\": parquet.Group{\n\t\t\t\t\t\"first_name\": parquet.String(),\n\t\t\t\t\t\"last_name\":  parquet.String(),\n\t\t\t\t\t\"birth_date\": 
parquet.Optional(parquet.Date()),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"add missing repeated column group\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.Int64Value(1)),\n\t\t\t\tadd(2, parquet.ByteArrayValue([]byte(`me`))),\n\t\t\t\tadd(1, parquet.Int32Value(0)),\n\t\t\t\tadd(1, parquet.Int32Value(123456)),\n\t\t\t\tadd(2, parquet.ByteArrayValue([]byte(`you`))),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Int64Value(1).Level(0, 0, 0),\n\n\t\t\t\tparquet.Int32Value(0).Level(0, 2, 1),\n\t\t\t\tparquet.Int32Value(123456).Level(1, 2, 1),\n\n\t\t\t\tparquet.ByteArrayValue([]byte(`me`)).Level(0, 1, 2),\n\t\t\t\tparquet.ByteArrayValue([]byte(`you`)).Level(1, 1, 2),\n\n\t\t\t\tparquet.NullValue().Level(0, 1, 3),\n\t\t\t\tparquet.NullValue().Level(1, 1, 3),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"id\": parquet.Int(64),\n\t\t\t\t\"profiles\": parquet.Repeated(parquet.Group{\n\t\t\t\t\t\"first_name\": parquet.String(),\n\t\t\t\t\t\"last_name\":  parquet.String(),\n\t\t\t\t\t\"birth_date\": parquet.Optional(parquet.Date()),\n\t\t\t\t}),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"empty map\",\n\t\t\twant: parquet.Row{\n\t\t\t\tparquet.Value{}.Level(0, 0, 0),\n\t\t\t\tparquet.Value{}.Level(0, 0, 1),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"map\": parquet.Repeated(parquet.Group{\n\t\t\t\t\t\"key_value\": parquet.Group{\n\t\t\t\t\t\t\"key\":   parquet.String(),\n\t\t\t\t\t\t\"value\": parquet.Optional(parquet.String()),\n\t\t\t\t\t},\n\t\t\t\t}),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"one nested maps\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.ByteArrayValue([]byte(`A`))),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`1`))),\n\t\t\t\tadd(0, parquet.ByteArrayValue([]byte(`B`))),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`2`))),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\t// objects.attributes.key_value.key\n\t\t\t\tparquet.ByteArrayValue([]byte(`A`)).Level(0, 2, 
0),\n\t\t\t\tparquet.ByteArrayValue([]byte(`B`)).Level(2, 2, 0),\n\t\t\t\t// objects.attributes.key_value.value\n\t\t\t\tparquet.ByteArrayValue([]byte(`1`)).Level(0, 3, 1),\n\t\t\t\tparquet.ByteArrayValue([]byte(`2`)).Level(2, 3, 1),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"objects\": parquet.Repeated(parquet.Group{\n\t\t\t\t\t\"attributes\": parquet.Repeated(parquet.Group{\n\t\t\t\t\t\t\"key_value\": parquet.Group{\n\t\t\t\t\t\t\t\"key\":   parquet.String(),\n\t\t\t\t\t\t\t\"value\": parquet.Optional(parquet.String()),\n\t\t\t\t\t\t},\n\t\t\t\t\t}),\n\t\t\t\t}),\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"multiple nested maps\",\n\t\t\toperations: operations{\n\t\t\t\tadd(0, parquet.ByteArrayValue([]byte(`A`))),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`1`))),\n\t\t\t\tadd(0, parquet.ByteArrayValue([]byte(`B`))),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`2`))),\n\t\t\t\tnext(1), // same as next(0) because the columns are in the same group\n\t\t\t\tadd(0, parquet.ByteArrayValue([]byte(`C`))),\n\t\t\t\tadd(1, parquet.ByteArrayValue([]byte(`3`))),\n\t\t\t},\n\t\t\twant: parquet.Row{\n\t\t\t\t// objects.attributes.key_value.key\n\t\t\t\tparquet.ByteArrayValue([]byte(`A`)).Level(0, 2, 0),\n\t\t\t\tparquet.ByteArrayValue([]byte(`B`)).Level(2, 2, 0),\n\t\t\t\tparquet.ByteArrayValue([]byte(`C`)).Level(1, 2, 0),\n\t\t\t\t// objects.attributes.key_value.value\n\t\t\t\tparquet.ByteArrayValue([]byte(`1`)).Level(0, 3, 1),\n\t\t\t\tparquet.ByteArrayValue([]byte(`2`)).Level(2, 3, 1),\n\t\t\t\tparquet.ByteArrayValue([]byte(`3`)).Level(1, 3, 1),\n\t\t\t},\n\t\t\tschema: parquet.Group{\n\t\t\t\t\"objects\": parquet.Repeated(parquet.Group{\n\t\t\t\t\t\"attributes\": parquet.Repeated(parquet.Group{\n\t\t\t\t\t\t\"key_value\": parquet.Group{\n\t\t\t\t\t\t\t\"key\":   parquet.String(),\n\t\t\t\t\t\t\t\"value\": parquet.Optional(parquet.String()),\n\t\t\t\t\t\t},\n\t\t\t\t\t}),\n\t\t\t\t}),\n\t\t\t},\n\t\t},\n\t}\n\n\tfor _, test := range tests 
{\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tb := parquet.NewRowBuilder(test.schema)\n\n\t\t\tfor i := 0; i < 2; i++ {\n\t\t\t\tfor _, op := range test.operations {\n\t\t\t\t\top(b)\n\t\t\t\t}\n\n\t\t\t\tif got := b.Row(); !got.Equal(test.want) {\n\t\t\t\t\tt.Fatalf(\"test %d: rows are not equal\\nwant = %+v\\ngot  = %+v\", i+1, test.want, got)\n\t\t\t\t}\n\n\t\t\t\tb.Reset()\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc BenchmarkRowBuilderAdd(b *testing.B) {\n\tbuilder := parquet.NewRowBuilder(parquet.Group{\n\t\t\"ids\": parquet.Repeated(parquet.Int(64)),\n\t})\n\n\tfor i := 0; i < b.N; i++ {\n\t\tbuilder.Add(0, parquet.Int64Value(int64(i)))\n\n\t\tif (i % 128) == 0 {\n\t\t\tbuilder.Reset() // so don't run out of memory ;)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "row_group.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\t\"io\"\n\n\t\"github.com/segmentio/parquet-go/internal/debug\"\n)\n\n// RowGroup is an interface representing a parquet row group. From the Parquet\n// docs, a RowGroup is \"a logical horizontal partitioning of the data into rows.\n// There is no physical structure that is guaranteed for a row group. A row\n// group consists of a column chunk for each column in the dataset.\"\n//\n// https://github.com/apache/parquet-format#glossary\ntype RowGroup interface {\n\t// Returns the number of rows in the group.\n\tNumRows() int64\n\n\t// Returns the list of column chunks in this row group. The chunks are\n\t// ordered in the order of leaf columns from the row group's schema.\n\t//\n\t// If the underlying implementation is not read-only, the returned\n\t// parquet.ColumnChunk may implement other interfaces: for example,\n\t// parquet.ColumnBuffer if the chunk is backed by an in-memory buffer,\n\t// or typed writer interfaces like parquet.Int32Writer depending on the\n\t// underlying type of values that can be written to the chunk.\n\t//\n\t// As an optimization, the row group may return the same slice across\n\t// multiple calls to this method. 
Applications should treat the returned\n\t// slice as read-only.\n\tColumnChunks() []ColumnChunk\n\n\t// Returns the schema of rows in the group.\n\tSchema() *Schema\n\n\t// Returns the list of sorting columns describing how rows are sorted in the\n\t// group.\n\t//\n\t// The method will return an empty slice if the rows are not sorted.\n\tSortingColumns() []SortingColumn\n\n\t// Returns a reader exposing the rows of the row group.\n\t//\n\t// As an optimization, the returned parquet.Rows object may implement\n\t// parquet.RowWriterTo, and test the RowWriter it receives for an\n\t// implementation of the parquet.RowGroupWriter interface.\n\t//\n\t// This optimization mechanism is leveraged by the parquet.CopyRows function\n\t// to skip the generic row-by-row copy algorithm and delegate the copy logic\n\t// to the parquet.Rows object.\n\tRows() Rows\n}\n\n// Rows is an interface implemented by row readers returned by calling the Rows\n// method of RowGroup instances.\n//\n// Applications should call Close when they are done using a Rows instance in\n// order to release the underlying resources held by the row sequence.\n//\n// After calling Close, all attempts to read more rows will return io.EOF.\ntype Rows interface {\n\tRowReaderWithSchema\n\tRowSeeker\n\tio.Closer\n}\n\n// RowGroupReader is an interface implemented by types that expose sequences of\n// row groups to the application.\ntype RowGroupReader interface {\n\tReadRowGroup() (RowGroup, error)\n}\n\n// RowGroupWriter is an interface implemented by types that allow the program\n// to write row groups.\ntype RowGroupWriter interface {\n\tWriteRowGroup(RowGroup) (int64, error)\n}\n\n// SortingColumn represents a column by which a row group is sorted.\ntype SortingColumn interface {\n\t// Returns the path of the column in the row group schema, omitting the name\n\t// of the root node.\n\tPath() []string\n\n\t// Returns true if the column will sort values in descending order.\n\tDescending() bool\n\n\t// 
Returns true if the column will put null values at the beginning.\n\tNullsFirst() bool\n}\n\n// Ascending constructs a SortingColumn value which dictates to sort the column\n// at the path given as argument in ascending order.\nfunc Ascending(path ...string) SortingColumn { return ascending(path) }\n\n// Descending constructs a SortingColumn value which dictates to sort the column\n// at the path given as argument in descending order.\nfunc Descending(path ...string) SortingColumn { return descending(path) }\n\n// NullsFirst wraps the SortingColumn passed as argument so that it instructs\n// the row group to place null values first in the column.\nfunc NullsFirst(sortingColumn SortingColumn) SortingColumn { return nullsFirst{sortingColumn} }\n\ntype ascending []string\n\nfunc (asc ascending) String() string   { return fmt.Sprintf(\"ascending(%s)\", columnPath(asc)) }\nfunc (asc ascending) Path() []string   { return asc }\nfunc (asc ascending) Descending() bool { return false }\nfunc (asc ascending) NullsFirst() bool { return false }\n\ntype descending []string\n\nfunc (desc descending) String() string   { return fmt.Sprintf(\"descending(%s)\", columnPath(desc)) }\nfunc (desc descending) Path() []string   { return desc }\nfunc (desc descending) Descending() bool { return true }\nfunc (desc descending) NullsFirst() bool { return false }\n\ntype nullsFirst struct{ SortingColumn }\n\nfunc (nf nullsFirst) String() string   { return fmt.Sprintf(\"nulls_first+%s\", nf.SortingColumn) }\nfunc (nf nullsFirst) NullsFirst() bool { return true }\n\nfunc searchSortingColumn(sortingColumns []SortingColumn, path columnPath) int {\n\t// There are usually a few sorting columns in a row group, so the linear\n\t// scan is the fastest option and works whether the sorting column list\n\t// is sorted or not. 
Please revisit this decision if this code path ends\n\t// up being more costly than necessary.\n\tfor i, sorting := range sortingColumns {\n\t\tif path.equal(sorting.Path()) {\n\t\t\treturn i\n\t\t}\n\t}\n\treturn len(sortingColumns)\n}\n\nfunc sortingColumnsHavePrefix(sortingColumns, prefix []SortingColumn) bool {\n\tif len(sortingColumns) < len(prefix) {\n\t\treturn false\n\t}\n\tfor i, sortingColumn := range prefix {\n\t\tif !sortingColumnsAreEqual(sortingColumns[i], sortingColumn) {\n\t\t\treturn false\n\t\t}\n\t}\n\treturn true\n}\n\nfunc sortingColumnsAreEqual(s1, s2 SortingColumn) bool {\n\tpath1 := columnPath(s1.Path())\n\tpath2 := columnPath(s2.Path())\n\treturn path1.equal(path2) && s1.Descending() == s2.Descending() && s1.NullsFirst() == s2.NullsFirst()\n}\n\ntype rowGroup struct {\n\tschema  *Schema\n\tnumRows int64\n\tcolumns []ColumnChunk\n\tsorting []SortingColumn\n}\n\nfunc (r *rowGroup) NumRows() int64                  { return r.numRows }\nfunc (r *rowGroup) ColumnChunks() []ColumnChunk     { return r.columns }\nfunc (r *rowGroup) SortingColumns() []SortingColumn { return r.sorting }\nfunc (r *rowGroup) Schema() *Schema                 { return r.schema }\nfunc (r *rowGroup) Rows() Rows                      { return newRowGroupRows(r, ReadModeSync) }\n\nfunc NewRowGroupRowReader(rowGroup RowGroup) Rows {\n\treturn newRowGroupRows(rowGroup, ReadModeSync)\n}\n\ntype rowGroupRows struct {\n\trowGroup     RowGroup\n\tbuffers      []Value\n\treaders      []Pages\n\tcolumns      []columnChunkRows\n\tinited       bool\n\tclosed       bool\n\tdone         chan<- struct{}\n\tpageReadMode ReadMode\n}\n\ntype columnChunkRows struct {\n\trows   int64\n\toffset int32\n\tlength int32\n\tpage   Page\n\tvalues ValueReader\n}\n\nconst columnBufferSize = defaultValueBufferSize\n\nfunc (r *rowGroupRows) buffer(i int) []Value {\n\tj := (i + 0) * columnBufferSize\n\tk := (i + 1) * columnBufferSize\n\treturn r.buffers[j:k:k]\n}\n\nfunc newRowGroupRows(rowGroup 
RowGroup, pageReadMode ReadMode) *rowGroupRows {\n\treturn &rowGroupRows{\n\t\trowGroup:     rowGroup,\n\t\tpageReadMode: pageReadMode,\n\t}\n}\n\nfunc (r *rowGroupRows) init() {\n\tcolumns := r.rowGroup.ColumnChunks()\n\n\tr.buffers = make([]Value, len(columns)*columnBufferSize)\n\tr.readers = make([]Pages, len(columns))\n\tr.columns = make([]columnChunkRows, len(columns))\n\n\tswitch r.pageReadMode {\n\tcase ReadModeAsync:\n\t\tdone := make(chan struct{})\n\t\tr.done = done\n\t\treaders := make([]asyncPages, len(columns))\n\t\tfor i, column := range columns {\n\t\t\treaders[i].init(column.Pages(), done)\n\t\t\tr.readers[i] = &readers[i]\n\t\t}\n\tcase ReadModeSync:\n\t\tfor i, column := range columns {\n\t\t\tr.readers[i] = column.Pages()\n\t\t}\n\tdefault:\n\t\tpanic(fmt.Sprintf(\"parquet: invalid page read mode: %d\", r.pageReadMode))\n\t}\n\n\tr.inited = true\n\t// This finalizer is used to ensure that the goroutines started by calling\n\t// init on the underlying page readers will be shutdown in the event that\n\t// Close isn't called and the rowGroupRows object is garbage collected.\n\tdebug.SetFinalizer(r, func(r *rowGroupRows) { r.Close() })\n}\n\nfunc (r *rowGroupRows) clear() {\n\tfor i := range r.columns {\n\t\tRelease(r.columns[i].page)\n\t}\n\n\tfor i := range r.columns {\n\t\tr.columns[i] = columnChunkRows{}\n\t}\n\n\tfor i := range r.buffers {\n\t\tr.buffers[i] = Value{}\n\t}\n}\n\nfunc (r *rowGroupRows) Reset() {\n\tfor i := range r.readers {\n\t\t// Ignore errors because we are resetting the reader, if the error\n\t\t// persists we will see it on the next read, and otherwise we can\n\t\t// read back from the beginning.\n\t\tr.readers[i].SeekToRow(0)\n\t}\n\tr.clear()\n}\n\nfunc (r *rowGroupRows) Close() error {\n\tvar lastErr error\n\n\tif r.done != nil {\n\t\tclose(r.done)\n\t\tr.done = nil\n\t}\n\n\tfor i := range r.readers {\n\t\tif err := r.readers[i].Close(); err != nil {\n\t\t\tlastErr = err\n\t\t}\n\t}\n\n\tr.clear()\n\tr.inited = 
true\n\tr.closed = true\n\treturn lastErr\n}\n\nfunc (r *rowGroupRows) SeekToRow(rowIndex int64) error {\n\tvar lastErr error\n\n\tif r.closed {\n\t\treturn io.ErrClosedPipe\n\t}\n\n\tif !r.inited {\n\t\tr.init()\n\t}\n\n\tfor i := range r.readers {\n\t\tif err := r.readers[i].SeekToRow(rowIndex); err != nil {\n\t\t\tlastErr = err\n\t\t}\n\t}\n\n\tr.clear()\n\treturn lastErr\n}\n\nfunc (r *rowGroupRows) ReadRows(rows []Row) (int, error) {\n\tif r.closed {\n\t\treturn 0, io.EOF\n\t}\n\n\tif !r.inited {\n\t\tr.init()\n\t}\n\n\t// Limit the number of rows that we read to the smallest number of rows\n\t// remaining in the current page of each column. This is necessary because\n\t// the pointers exposed to the returned rows need to remain valid until the\n\t// next call to ReadRows, SeekToRow, Reset, or Close. If we release one of\n\t// the columns' page, the rows that were already read during the ReadRows\n\t// call would be invalidated, and might reference memory locations that have\n\t// been reused due to pooling of page buffers.\n\tnumRows := int64(len(rows))\n\n\tfor i := range r.columns {\n\t\tc := &r.columns[i]\n\t\t// When all rows of the current page of a column have been consumed we\n\t\t// have to read the next page. 
This will effectively invalidate all\n\t\t// pointers of values previously held in the page, which is valid if\n\t\t// the application respects the RowReader interface and does not retain\n\t\t// parquet values without cloning them first.\n\t\tfor c.rows == 0 {\n\t\t\tvar err error\n\t\t\tclearValues(r.buffer(i))\n\n\t\t\tc.offset = 0\n\t\t\tc.length = 0\n\t\t\tc.values = nil\n\t\t\tRelease(c.page)\n\n\t\t\tc.page, err = r.readers[i].ReadPage()\n\t\t\tif err != nil {\n\t\t\t\tif err != io.EOF {\n\t\t\t\t\treturn 0, err\n\t\t\t\t}\n\t\t\t\tbreak\n\t\t\t}\n\n\t\t\tc.rows = c.page.NumRows()\n\t\t\tc.values = c.page.Values()\n\t\t}\n\n\t\tif c.rows < numRows {\n\t\t\tnumRows = c.rows\n\t\t}\n\t}\n\n\tfor i := range rows {\n\t\trows[i] = rows[i][:0]\n\t}\n\n\tif numRows == 0 {\n\t\treturn 0, io.EOF\n\t}\n\n\tn, err := r.readRows(rows[:numRows])\n\n\tfor i := range r.columns {\n\t\tr.columns[i].rows -= int64(n)\n\t}\n\n\treturn n, err\n}\n\nfunc (r *rowGroupRows) Schema() *Schema {\n\treturn r.rowGroup.Schema()\n}\n\nfunc (r *rowGroupRows) readRows(rows []Row) (int, error) {\n\tfor i := range rows {\n\treadColumns:\n\t\tfor columnIndex := range r.columns {\n\t\t\tcol := &r.columns[columnIndex]\n\t\t\tbuf := r.buffer(columnIndex)\n\n\t\t\tskip := int32(1)\n\t\t\tfor {\n\t\t\t\tif col.offset == col.length {\n\t\t\t\t\tn, err := col.values.ReadValues(buf)\n\t\t\t\t\tif n == 0 {\n\t\t\t\t\t\tswitch err {\n\t\t\t\t\t\tcase nil:\n\t\t\t\t\t\t\terr = io.ErrNoProgress\n\t\t\t\t\t\tcase io.EOF:\n\t\t\t\t\t\t\tcontinue readColumns\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn i, err\n\t\t\t\t\t}\n\t\t\t\t\tcol.offset = 0\n\t\t\t\t\tcol.length = int32(n)\n\t\t\t\t}\n\n\t\t\t\t_ = buf[:col.offset]\n\t\t\t\t_ = buf[:col.length]\n\t\t\t\tendOffset := col.offset + skip\n\n\t\t\t\tfor endOffset < col.length && buf[endOffset].repetitionLevel != 0 {\n\t\t\t\t\tendOffset++\n\t\t\t\t}\n\n\t\t\t\trows[i] = append(rows[i], buf[col.offset:endOffset]...)\n\n\t\t\t\tif col.offset = endOffset; col.offset < 
col.length {\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t\tskip = 0\n\t\t\t}\n\t\t}\n\t}\n\treturn len(rows), nil\n}\n\ntype seekRowGroup struct {\n\tbase    RowGroup\n\tseek    int64\n\tcolumns []ColumnChunk\n}\n\nfunc (g *seekRowGroup) NumRows() int64 {\n\treturn g.base.NumRows() - g.seek\n}\n\nfunc (g *seekRowGroup) ColumnChunks() []ColumnChunk {\n\treturn g.columns\n}\n\nfunc (g *seekRowGroup) Schema() *Schema {\n\treturn g.base.Schema()\n}\n\nfunc (g *seekRowGroup) SortingColumns() []SortingColumn {\n\treturn g.base.SortingColumns()\n}\n\nfunc (g *seekRowGroup) Rows() Rows {\n\trows := g.base.Rows()\n\trows.SeekToRow(g.seek)\n\treturn rows\n}\n\ntype seekColumnChunk struct {\n\tbase ColumnChunk\n\tseek int64\n}\n\nfunc (c *seekColumnChunk) Type() Type {\n\treturn c.base.Type()\n}\n\nfunc (c *seekColumnChunk) Column() int {\n\treturn c.base.Column()\n}\n\nfunc (c *seekColumnChunk) Pages() Pages {\n\tpages := c.base.Pages()\n\tpages.SeekToRow(c.seek)\n\treturn pages\n}\n\nfunc (c *seekColumnChunk) ColumnIndex() ColumnIndex {\n\treturn c.base.ColumnIndex()\n}\n\nfunc (c *seekColumnChunk) OffsetIndex() OffsetIndex {\n\treturn c.base.OffsetIndex()\n}\n\nfunc (c *seekColumnChunk) BloomFilter() BloomFilter {\n\treturn c.base.BloomFilter()\n}\n\nfunc (c *seekColumnChunk) NumValues() int64 {\n\treturn c.base.NumValues()\n}\n\ntype emptyRowGroup struct {\n\tschema  *Schema\n\tcolumns []ColumnChunk\n}\n\nfunc newEmptyRowGroup(schema *Schema) *emptyRowGroup {\n\tcolumns := schema.Columns()\n\trowGroup := &emptyRowGroup{\n\t\tschema:  schema,\n\t\tcolumns: make([]ColumnChunk, len(columns)),\n\t}\n\temptyColumnChunks := make([]emptyColumnChunk, len(columns))\n\tfor i, column := range schema.Columns() {\n\t\tleaf, _ := schema.Lookup(column...)\n\t\temptyColumnChunks[i].typ = leaf.Node.Type()\n\t\temptyColumnChunks[i].column = int16(leaf.ColumnIndex)\n\t\trowGroup.columns[i] = &emptyColumnChunks[i]\n\t}\n\treturn rowGroup\n}\n\nfunc (g *emptyRowGroup) NumRows() int64                  
{ return 0 }\nfunc (g *emptyRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }\nfunc (g *emptyRowGroup) Schema() *Schema                 { return g.schema }\nfunc (g *emptyRowGroup) SortingColumns() []SortingColumn { return nil }\nfunc (g *emptyRowGroup) Rows() Rows                      { return emptyRows{g.schema} }\n\ntype emptyColumnChunk struct {\n\ttyp    Type\n\tcolumn int16\n}\n\nfunc (c *emptyColumnChunk) Type() Type               { return c.typ }\nfunc (c *emptyColumnChunk) Column() int              { return int(c.column) }\nfunc (c *emptyColumnChunk) Pages() Pages             { return emptyPages{} }\nfunc (c *emptyColumnChunk) ColumnIndex() ColumnIndex { return emptyColumnIndex{} }\nfunc (c *emptyColumnChunk) OffsetIndex() OffsetIndex { return emptyOffsetIndex{} }\nfunc (c *emptyColumnChunk) BloomFilter() BloomFilter { return emptyBloomFilter{} }\nfunc (c *emptyColumnChunk) NumValues() int64         { return 0 }\n\ntype emptyBloomFilter struct{}\n\nfunc (emptyBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, io.EOF }\nfunc (emptyBloomFilter) Size() int64                       { return 0 }\nfunc (emptyBloomFilter) Check(Value) (bool, error)         { return false, nil }\n\ntype emptyRows struct{ schema *Schema }\n\nfunc (r emptyRows) Close() error                         { return nil }\nfunc (r emptyRows) Schema() *Schema                      { return r.schema }\nfunc (r emptyRows) ReadRows([]Row) (int, error)          { return 0, io.EOF }\nfunc (r emptyRows) SeekToRow(int64) error                { return nil }\nfunc (r emptyRows) WriteRowsTo(RowWriter) (int64, error) { return 0, nil }\n\ntype emptyPages struct{}\n\nfunc (emptyPages) ReadPage() (Page, error) { return nil, io.EOF }\nfunc (emptyPages) SeekToRow(int64) error   { return nil }\nfunc (emptyPages) Close() error            { return nil }\n\nvar (\n\t_ RowReaderWithSchema = (*rowGroupRows)(nil)\n\t//_ RowWriterTo         = (*rowGroupRows)(nil)\n\n\t_ RowReaderWithSchema = 
emptyRows{}\n\t_ RowWriterTo         = emptyRows{}\n)\n"
  },
  {
    "path": "row_group_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"reflect\"\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc sortedRowGroup(options []parquet.RowGroupOption, rows ...interface{}) parquet.RowGroup {\n\tbuf := parquet.NewBuffer(options...)\n\tfor _, row := range rows {\n\t\tbuf.Write(row)\n\t}\n\tsort.Stable(buf)\n\treturn buf\n}\n\ntype Person struct {\n\tFirstName utf8string\n\tLastName  utf8string\n\tAge       int\n}\n\ntype LastNameOnly struct {\n\tLastName utf8string\n}\n\nfunc newPeopleBuffer(people []Person) parquet.RowGroup {\n\tbuffer := parquet.NewBuffer()\n\tfor i := range people {\n\t\tbuffer.Write(&people[i])\n\t}\n\treturn buffer\n}\n\nfunc newPeopleFile(people []Person) parquet.RowGroup {\n\tbuffer := new(bytes.Buffer)\n\twriter := parquet.NewWriter(buffer)\n\tfor i := range people {\n\t\twriter.Write(&people[i])\n\t}\n\twriter.Close()\n\treader := bytes.NewReader(buffer.Bytes())\n\tf, err := parquet.OpenFile(reader, reader.Size())\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\treturn f.RowGroups()[0]\n}\n\nfunc TestSeekToRow(t *testing.T) {\n\tfor _, config := range []struct {\n\t\tname        string\n\t\tnewRowGroup func([]Person) parquet.RowGroup\n\t}{\n\t\t{name: \"buffer\", newRowGroup: newPeopleBuffer},\n\t\t{name: \"file\", newRowGroup: newPeopleFile},\n\t} {\n\t\tt.Run(config.name, func(t *testing.T) { testSeekToRow(t, config.newRowGroup) })\n\t}\n}\n\nfunc testSeekToRow(t *testing.T, newRowGroup func([]Person) parquet.RowGroup) {\n\terr := quickCheck(func(people []Person) bool {\n\t\tif len(people) == 0 { // TODO: fix creation of empty parquet files\n\t\t\treturn true\n\t\t}\n\t\trowGroup := newRowGroup(people)\n\t\trows := rowGroup.Rows()\n\t\trbuf := make([]parquet.Row, 1)\n\t\tpers := Person{}\n\t\tschema := parquet.SchemaOf(&pers)\n\t\tdefer rows.Close()\n\n\t\tfor i := range people {\n\t\t\tif err := rows.SeekToRow(int64(i)); err != nil {\n\t\t\t\tt.Errorf(\"seeking to row %d: %+v\", i, 
err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\tif _, err := rows.ReadRows(rbuf); err != nil {\n\t\t\t\tt.Errorf(\"reading row %d: %+v\", i, err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\tif err := schema.Reconstruct(&pers, rbuf[0]); err != nil {\n\t\t\t\tt.Errorf(\"deconstructing row %d: %+v\", i, err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t\tif !reflect.DeepEqual(&pers, &people[i]) {\n\t\t\t\tt.Errorf(\"row %d mismatch\", i)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\n\t\treturn true\n\t})\n\tif err != nil {\n\t\tt.Error(err)\n\t}\n}\n\nfunc selfRowGroup(rowGroup parquet.RowGroup) parquet.RowGroup {\n\treturn rowGroup\n}\n\nfunc fileRowGroup(rowGroup parquet.RowGroup) parquet.RowGroup {\n\tbuffer := new(bytes.Buffer)\n\twriter := parquet.NewWriter(buffer)\n\tif _, err := writer.WriteRowGroup(rowGroup); err != nil {\n\t\tpanic(err)\n\t}\n\tif err := writer.Close(); err != nil {\n\t\tpanic(err)\n\t}\n\treader := bytes.NewReader(buffer.Bytes())\n\tf, err := parquet.OpenFile(reader, reader.Size())\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\treturn f.RowGroups()[0]\n}\n\nfunc TestWriteRowGroupClosesRows(t *testing.T) {\n\tvar rows []*wrappedRows\n\trg := wrappedRowGroup{\n\t\tRowGroup: newPeopleFile([]Person{{}}),\n\t\trowsCallback: func(r parquet.Rows) parquet.Rows {\n\t\t\twrapped := &wrappedRows{Rows: r}\n\t\t\trows = append(rows, wrapped)\n\t\t\treturn wrapped\n\t\t},\n\t}\n\twriter := parquet.NewWriter(io.Discard)\n\tif _, err := writer.WriteRowGroup(rg); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tfor _, r := range rows {\n\t\tif !r.closed {\n\t\t\tt.Fatal(\"rows not closed\")\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "row_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"io\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/segmentio/parquet-go\"\n)\n\ntype bufferedRows struct {\n\trows []parquet.Row\n}\n\nfunc (r *bufferedRows) ReadRows(rows []parquet.Row) (int, error) {\n\tfor i := range rows {\n\t\tif len(r.rows) == 0 {\n\t\t\treturn i, io.EOF\n\t\t}\n\t\trows[i] = append(rows[i][:0], r.rows[0]...)\n\t\tr.rows = r.rows[1:]\n\t}\n\treturn len(rows), nil\n}\n\nfunc (w *bufferedRows) WriteRows(rows []parquet.Row) (int, error) {\n\tfor _, row := range rows {\n\t\tw.rows = append(w.rows, row.Clone())\n\t}\n\treturn len(rows), nil\n}\n\nfunc TestMultiRowWriter(t *testing.T) {\n\tb1 := new(bufferedRows)\n\tb2 := new(bufferedRows)\n\tmw := parquet.MultiRowWriter(b1, b2)\n\n\trows := []parquet.Row{\n\t\t{\n\t\t\tparquet.Int32Value(10).Level(0, 0, 0),\n\t\t\tparquet.Int32Value(11).Level(0, 0, 1),\n\t\t\tparquet.Int32Value(12).Level(0, 0, 2),\n\t\t},\n\t\t{\n\t\t\tparquet.Int32Value(20).Level(0, 0, 0),\n\t\t\tparquet.Int32Value(21).Level(0, 0, 1),\n\t\t\tparquet.Int32Value(22).Level(0, 0, 2),\n\t\t},\n\t}\n\n\tn, err := mw.WriteRows(rows)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif n != len(rows) {\n\t\tt.Fatalf(\"number of rows written mismatch: got=%d want=%d\", n, len(rows))\n\t}\n\n\tassertEqualRows(t, rows, b1.rows)\n\tassertEqualRows(t, rows, b2.rows)\n}\n\nfunc TestRowClone(t *testing.T) {\n\trow := parquet.Row{\n\t\tparquet.ValueOf(42).Level(0, 1, 0),\n\t\tparquet.ValueOf(\"Hello World\").Level(1, 1, 1),\n\t}\n\tif clone := row.Clone(); !row.Equal(clone) {\n\t\tt.Error(\"row and its clone are not equal\")\n\t}\n}\n\nfunc TestDeconstructionReconstruction(t *testing.T) {\n\ttype Person struct {\n\t\tFirstName string\n\t\tLastName  string\n\t\tAge       int     `parquet:\",optional\"`\n\t\tWeight    float64 `parquet:\",optional\"`\n\t}\n\n\ttype Details struct {\n\t\tPerson *Person\n\t}\n\n\ttype Friend struct {\n\t\tID      [16]byte 
`parquet:\",uuid\"`\n\t\tDetails *Details\n\t}\n\n\ttype User struct {\n\t\tID      [16]byte `parquet:\",uuid\"`\n\t\tDetails *Details\n\t\tFriends []Friend `parquet:\",list,optional\"`\n\t}\n\n\ttype List2 struct {\n\t\tValue string `parquet:\",optional\"`\n\t}\n\n\ttype List1 struct {\n\t\tList2 []List2 `parquet:\",list\"`\n\t}\n\n\ttype List0 struct {\n\t\tList1 []List1 `parquet:\",list\"`\n\t}\n\n\ttype nestedListsLevel1 struct {\n\t\tLevel2 []string `parquet:\"level2\"`\n\t}\n\n\ttype nestedLists struct {\n\t\tLevel1 []nestedListsLevel1 `parquet:\"level1\"`\n\t}\n\n\ttests := []struct {\n\t\tscenario string\n\t\tinput    interface{}\n\t\tvalues   [][]parquet.Value\n\t}{\n\t\t{\n\t\t\tscenario: \"single field\",\n\t\t\tinput: struct {\n\t\t\t\tName string\n\t\t\t}{Name: \"Luke\"},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {parquet.ValueOf(\"Luke\").Level(0, 0, 0)},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"multiple fields\",\n\t\t\tinput: Person{\n\t\t\t\tFirstName: \"Han\",\n\t\t\t\tLastName:  \"Solo\",\n\t\t\t\tAge:       42,\n\t\t\t\tWeight:    81.5,\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {parquet.ValueOf(\"Han\").Level(0, 0, 0)},\n\t\t\t\t1: {parquet.ValueOf(\"Solo\").Level(0, 0, 1)},\n\t\t\t\t2: {parquet.ValueOf(42).Level(0, 1, 2)},\n\t\t\t\t3: {parquet.ValueOf(81.5).Level(0, 1, 3)},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"empty repeated field\",\n\t\t\tinput: struct {\n\t\t\t\tSymbols []string\n\t\t\t}{\n\t\t\t\tSymbols: []string{},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"single repeated field\",\n\t\t\tinput: struct {\n\t\t\t\tSymbols []string\n\t\t\t}{\n\t\t\t\tSymbols: []string{\"EUR\", \"USD\", \"GBP\", \"JPY\"},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {\n\t\t\t\t\tparquet.ValueOf(\"EUR\").Level(0, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"USD\").Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"GBP\").Level(1, 1, 
0),\n\t\t\t\t\tparquet.ValueOf(\"JPY\").Level(1, 1, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"multiple repeated field\",\n\t\t\tinput: struct {\n\t\t\t\tSymbols []string\n\t\t\t\tValues  []float32\n\t\t\t}{\n\t\t\t\tSymbols: []string{\"EUR\", \"USD\", \"GBP\", \"JPY\"},\n\t\t\t\tValues:  []float32{0.1, 0.2, 0.3, 0.4},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {\n\t\t\t\t\tparquet.ValueOf(\"EUR\").Level(0, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"USD\").Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"GBP\").Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"JPY\").Level(1, 1, 0),\n\t\t\t\t},\n\t\t\t\t1: {\n\t\t\t\t\tparquet.ValueOf(float32(0.1)).Level(0, 1, 0),\n\t\t\t\t\tparquet.ValueOf(float32(0.2)).Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(float32(0.3)).Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(float32(0.4)).Level(1, 1, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"top level nil pointer field\",\n\t\t\tinput: struct {\n\t\t\t\tPerson *Person\n\t\t\t}{\n\t\t\t\tPerson: nil,\n\t\t\t},\n\t\t\t// Here there are four nil values because the Person type has four\n\t\t\t// fields but it is nil.\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t1: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t2: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t3: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"top level slice pointer\",\n\t\t\tinput: struct {\n\t\t\t\tList []*List2\n\t\t\t}{\n\t\t\t\tList: []*List2{\n\t\t\t\t\t{Value: \"foo\"},\n\t\t\t\t\t{Value: \"bar\"},\n\t\t\t\t},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {\n\t\t\t\t\tparquet.ValueOf(\"foo\").Level(0, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"bar\").Level(1, 2, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"sub level nil pointer field\",\n\t\t\tinput: User{\n\t\t\t\tID: uuid.MustParse(\"A65B576D-9299-4769-9D93-04BE0583F027\"),\n\t\t\t\tDetails: &Details{\n\t\t\t\t\tPerson: 
nil,\n\t\t\t\t},\n\t\t\t},\n\t\t\t// Here there are four nil values because the Person type has four\n\t\t\t// fields but it is nil.\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t// User.ID\n\t\t\t\t0: {parquet.ValueOf(uuid.MustParse(\"A65B576D-9299-4769-9D93-04BE0583F027\"))},\n\t\t\t\t// User.Details.Person\n\t\t\t\t1: {parquet.ValueOf(nil).Level(0, 1, 0)},\n\t\t\t\t2: {parquet.ValueOf(nil).Level(0, 1, 0)},\n\t\t\t\t3: {parquet.ValueOf(nil).Level(0, 1, 0)},\n\t\t\t\t4: {parquet.ValueOf(nil).Level(0, 1, 0)},\n\t\t\t\t// User.Friends.ID\n\t\t\t\t5: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t// User.Friends.Details.Person\n\t\t\t\t6: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t7: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t8: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t\t9: {parquet.ValueOf(nil).Level(0, 0, 0)},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"deeply nested structure\",\n\t\t\tinput: struct {\n\t\t\t\tUser User\n\t\t\t}{\n\t\t\t\tUser: User{\n\t\t\t\t\tID: uuid.MustParse(\"A65B576D-9299-4769-9D93-04BE0583F027\"),\n\t\t\t\t\tDetails: &Details{\n\t\t\t\t\t\tPerson: &Person{\n\t\t\t\t\t\t\tFirstName: \"Luke\",\n\t\t\t\t\t\t\tLastName:  \"Skywalker\",\n\t\t\t\t\t\t},\n\t\t\t\t\t},\n\t\t\t\t\tFriends: []Friend{\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tID: uuid.MustParse(\"1B76F8D0-82C6-403F-A104-DCDA69207220\"),\n\t\t\t\t\t\t\tDetails: &Details{\n\t\t\t\t\t\t\t\tPerson: &Person{\n\t\t\t\t\t\t\t\t\tFirstName: \"Han\",\n\t\t\t\t\t\t\t\t\tLastName:  \"Solo\",\n\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t},\n\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tID: uuid.MustParse(\"C43C8852-CCE5-40E6-B0DF-7212A5633346\"),\n\t\t\t\t\t\t\tDetails: &Details{\n\t\t\t\t\t\t\t\tPerson: &Person{\n\t\t\t\t\t\t\t\t\tFirstName: \"Leia\",\n\t\t\t\t\t\t\t\t\tLastName:  \"Skywalker\",\n\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t},\n\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tID: uuid.MustParse(\"E78642A8-0931-4D5F-918F-24DC8FF445B0\"),\n\t\t\t\t\t\t\tDetails: &Details{\n\t\t\t\t\t\t\t\tPerson: 
&Person{\n\t\t\t\t\t\t\t\t\tFirstName: \"C3PO\",\n\t\t\t\t\t\t\t\t\tLastName:  \"Droid\",\n\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t},\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t// User.ID\n\t\t\t\t0: {parquet.ValueOf(uuid.MustParse(\"A65B576D-9299-4769-9D93-04BE0583F027\"))},\n\n\t\t\t\t// User.Details\n\t\t\t\t1: {parquet.ValueOf(\"Luke\").Level(0, 2, 0)},\n\t\t\t\t2: {parquet.ValueOf(\"Skywalker\").Level(0, 2, 0)},\n\t\t\t\t3: {parquet.ValueOf(nil).Level(0, 2, 0)},\n\t\t\t\t4: {parquet.ValueOf(nil).Level(0, 2, 0)},\n\n\t\t\t\t5: { // User.Friends.ID\n\t\t\t\t\tparquet.ValueOf(uuid.MustParse(\"1B76F8D0-82C6-403F-A104-DCDA69207220\")).Level(0, 2, 0),\n\t\t\t\t\tparquet.ValueOf(uuid.MustParse(\"C43C8852-CCE5-40E6-B0DF-7212A5633346\")).Level(1, 2, 0),\n\t\t\t\t\tparquet.ValueOf(uuid.MustParse(\"E78642A8-0931-4D5F-918F-24DC8FF445B0\")).Level(1, 2, 0),\n\t\t\t\t},\n\n\t\t\t\t6: { // User.Friends.Details.Person.FirstName\n\t\t\t\t\tparquet.ValueOf(\"Han\").Level(0, 4, 0),\n\t\t\t\t\tparquet.ValueOf(\"Leia\").Level(1, 4, 0),\n\t\t\t\t\tparquet.ValueOf(\"C3PO\").Level(1, 4, 0),\n\t\t\t\t},\n\n\t\t\t\t7: { // User.Friends.Details.Person.LastName\n\t\t\t\t\tparquet.ValueOf(\"Solo\").Level(0, 4, 0),\n\t\t\t\t\tparquet.ValueOf(\"Skywalker\").Level(1, 4, 0),\n\t\t\t\t\tparquet.ValueOf(\"Droid\").Level(1, 4, 0),\n\t\t\t\t},\n\n\t\t\t\t8: { // User.Friends.Details.Person.Age\n\t\t\t\t\tparquet.ValueOf(nil).Level(0, 4, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 4, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 4, 0),\n\t\t\t\t},\n\n\t\t\t\t9: { // User.Friends.Details.Person.Weight\n\t\t\t\t\tparquet.ValueOf(nil).Level(0, 4, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 4, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 4, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"multiple repeated levels\",\n\t\t\tinput: List0{\n\t\t\t\tList1: []List1{\n\t\t\t\t\t{List2: []List2{{Value: \"A\"}, {Value: \"B\"}}},\n\t\t\t\t\t{List2: 
[]List2{}}, // parquet doesn't differentiate between empty repeated and a nil list\n\t\t\t\t\t{List2: []List2{{Value: \"C\"}}},\n\t\t\t\t\t{List2: []List2{}},\n\t\t\t\t\t{List2: []List2{{Value: \"D\"}, {Value: \"E\"}, {Value: \"F\"}}},\n\t\t\t\t\t{List2: []List2{{Value: \"G\"}, {Value: \"H\"}, {Value: \"I\"}}},\n\t\t\t\t},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t{\n\t\t\t\t\tparquet.ValueOf(\"A\").Level(0, 3, 0),\n\t\t\t\t\tparquet.ValueOf(\"B\").Level(2, 3, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"C\").Level(1, 3, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"D\").Level(1, 3, 0),\n\t\t\t\t\tparquet.ValueOf(\"E\").Level(2, 3, 0),\n\t\t\t\t\tparquet.ValueOf(\"F\").Level(2, 3, 0),\n\t\t\t\t\tparquet.ValueOf(\"G\").Level(1, 3, 0),\n\t\t\t\t\tparquet.ValueOf(\"H\").Level(2, 3, 0),\n\t\t\t\t\tparquet.ValueOf(\"I\").Level(2, 3, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t// https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet\n\n\t\t// message nestedLists {\n\t\t//   repeated group level1 {\n\t\t//     repeated string level2;\n\t\t//   }\n\t\t// }\n\t\t// ---\n\t\t// {\n\t\t//   level1: {\n\t\t//     level2: a\n\t\t//     level2: b\n\t\t//     level2: c\n\t\t//   },\n\t\t//   level1: {\n\t\t//     level2: d\n\t\t//     level2: e\n\t\t//     level2: f\n\t\t//     level2: g\n\t\t//   }\n\t\t// }\n\t\t//\n\t\t{\n\t\t\tscenario: \"twitter blog example 1\",\n\t\t\tinput: nestedLists{\n\t\t\t\tLevel1: []nestedListsLevel1{\n\t\t\t\t\t{Level2: []string{\"a\", \"b\", \"c\"}},\n\t\t\t\t\t{Level2: []string{\"d\", \"e\", \"f\", \"g\"}},\n\t\t\t\t},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {\n\t\t\t\t\tparquet.ValueOf(\"a\").Level(0, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"b\").Level(2, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"c\").Level(2, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"d\").Level(1, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"e\").Level(2, 2, 
0),\n\t\t\t\t\tparquet.ValueOf(\"f\").Level(2, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"g\").Level(2, 2, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t// message nestedLists {\n\t\t//   repeated group level1 {\n\t\t//     repeated string level2;\n\t\t//   }\n\t\t// }\n\t\t// ---\n\t\t// {\n\t\t//   level1: {\n\t\t//     level2: h\n\t\t//   },\n\t\t//   level1: {\n\t\t//     level2: i\n\t\t//     level2: j\n\t\t//   }\n\t\t// }\n\t\t//\n\t\t{\n\t\t\tscenario: \"twitter blog example 2\",\n\t\t\tinput: nestedLists{\n\t\t\t\tLevel1: []nestedListsLevel1{\n\t\t\t\t\t{Level2: []string{\"h\"}},\n\t\t\t\t\t{Level2: []string{\"i\", \"j\"}},\n\t\t\t\t},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: {\n\t\t\t\t\tparquet.ValueOf(\"h\").Level(0, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"i\").Level(1, 2, 0),\n\t\t\t\t\tparquet.ValueOf(\"j\").Level(2, 2, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\n\t\t// message AddressBook {\n\t\t//   required string owner;\n\t\t//   repeated string ownerPhoneNumbers;\n\t\t//   repeated group contacts {\n\t\t//     required string name;\n\t\t//     optional string phoneNumber;\n\t\t//   }\n\t\t// }\n\t\t// ---\n\t\t// AddressBook {\n\t\t//   owner: \"Julien Le Dem\",\n\t\t//   ownerPhoneNumbers: \"555 123 4567\",\n\t\t//   ownerPhoneNumbers: \"555 666 1337\",\n\t\t//   contacts: {\n\t\t//     name: \"Dmitriy Ryaboy\",\n\t\t//     phoneNumber: \"555 987 6543\",\n\t\t//   },\n\t\t//   contacts: {\n\t\t//     name: \"Chris Aniszczyk\"\n\t\t//   }\n\t\t// }\n\t\t{\n\t\t\tscenario: \"twitter blog example 3\",\n\t\t\tinput: AddressBook{\n\t\t\t\tOwner: \"Julien Le Dem\",\n\t\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t\"555 666 1337\",\n\t\t\t\t},\n\t\t\t\tContacts: []Contact{\n\t\t\t\t\t{\n\t\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t\tvalues: [][]parquet.Value{\n\t\t\t\t0: { // 
AddressBook.owner\n\t\t\t\t\tparquet.ValueOf(\"Julien Le Dem\").Level(0, 0, 0),\n\t\t\t\t},\n\t\t\t\t1: { // AddressBook.ownerPhoneNumbers\n\t\t\t\t\tparquet.ValueOf(\"555 123 4567\").Level(0, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"555 666 1337\").Level(1, 1, 0),\n\t\t\t\t},\n\t\t\t\t2: { // AddressBook.contacts.name\n\t\t\t\t\tparquet.ValueOf(\"Dmitriy Ryaboy\").Level(0, 1, 0),\n\t\t\t\t\tparquet.ValueOf(\"Chris Aniszczyk\").Level(1, 1, 0),\n\t\t\t\t},\n\t\t\t\t3: { // AddressBook.contacts.phoneNumber\n\t\t\t\t\tparquet.ValueOf(\"555 987 6543\").Level(0, 2, 0),\n\t\t\t\t\tparquet.ValueOf(nil).Level(1, 1, 0),\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tschema := parquet.SchemaOf(test.input)\n\t\t\trow := schema.Deconstruct(nil, test.input)\n\t\t\tvalues := columnsOf(row)\n\n\t\t\tt.Logf(\"\\n%s\", schema)\n\n\t\t\tfor columnIndex, expect := range test.values {\n\t\t\t\tassertEqualValues(t, columnIndex, expect, values[columnIndex])\n\t\t\t}\n\n\t\t\tnewValue := reflect.New(reflect.TypeOf(test.input))\n\t\t\tif err := schema.Reconstruct(newValue.Interface(), row); err != nil {\n\t\t\t\tt.Errorf(\"reconstruction of the parquet row into a go value failed:\\n\\t%v\", err)\n\t\t\t} else if !reflect.DeepEqual(newValue.Elem().Interface(), test.input) {\n\t\t\t\tt.Errorf(\"reconstruction of the parquet row into a go value produced the wrong output:\\nwant = %#v\\ngot  = %#v\", test.input, newValue.Elem())\n\t\t\t}\n\n\t\t\tfor columnIndex := range test.values {\n\t\t\t\tvalues[columnIndex] = nil\n\t\t\t}\n\n\t\t\tfor columnIndex, unexpected := range values {\n\t\t\t\tif unexpected != nil {\n\t\t\t\t\tt.Errorf(\"unexpected column index %d found with %d values in it\", columnIndex, len(unexpected))\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc columnsOf(row parquet.Row) [][]parquet.Value {\n\tcolumns := make([][]parquet.Value, 0)\n\trow.Range(func(_ int, c []parquet.Value) bool {\n\t\tcolumns = 
append(columns, c)\n\t\treturn true\n\t})\n\treturn columns\n}\n\nfunc assertEqualRows(t *testing.T, want, got []parquet.Row) {\n\tif len(want) != len(got) {\n\t\tt.Errorf(\"number of rows mismatch: want=%d got=%d\", len(want), len(got))\n\t\treturn\n\t}\n\n\tfor i := range want {\n\t\trow1, row2 := want[i], got[i]\n\n\t\tif len(row1) != len(row2) {\n\t\t\tt.Errorf(\"number of values in row %d mismatch: want=%d got=%d\", i, len(row1), len(row2))\n\t\t\tcontinue\n\t\t}\n\n\t\tfor j := range row1 {\n\t\t\tif value1, value2 := row1[j], row2[j]; !parquet.DeepEqual(value1, value2) {\n\t\t\t\tt.Errorf(\"values of row %d at index %d mismatch: want=%+v got=%+v\", i, j, value1, value2)\n\t\t\t}\n\t\t}\n\t}\n}\n\nfunc assertEqualValues(t *testing.T, columnIndex int, want, got []parquet.Value) {\n\tn := len(want)\n\n\tif len(want) != len(got) {\n\t\tt.Errorf(\"wrong number of values in column %d: want=%d got=%d\", columnIndex, len(want), len(got))\n\t\tif len(want) > len(got) {\n\t\t\tn = len(got)\n\t\t}\n\t}\n\n\tfor i := 0; i < n; i++ {\n\t\tv1, v2 := want[i], got[i]\n\n\t\tif !parquet.Equal(v1, v2) {\n\t\t\tt.Errorf(\"values at index %d mismatch in column %d: want=%#v got=%#v\", i, columnIndex, v1, v2)\n\t\t}\n\t\tif columnIndex != int(v2.Column()) {\n\t\t\tt.Errorf(\"column index mismatch in column %d: want=%d got=%#v\", i, columnIndex, v2)\n\t\t}\n\t\tif v1.RepetitionLevel() != v2.RepetitionLevel() {\n\t\t\tt.Errorf(\"repetition levels at index %d mismatch in column %d: want=%#v got=%#v\", i, columnIndex, v1, v2)\n\t\t}\n\t\tif v1.DefinitionLevel() != v2.DefinitionLevel() {\n\t\t\tt.Errorf(\"definition levels at index %d mismatch in column %d: want=%#v got=%#v\", i, columnIndex, v1, v2)\n\t\t}\n\t}\n}\n\nfunc BenchmarkDeconstruct(b *testing.B) {\n\trow := &AddressBook{\n\t\tOwner: \"Julien Le Dem\",\n\t\tOwnerPhoneNumbers: []string{\n\t\t\t\"555 123 4567\",\n\t\t\t\"555 666 1337\",\n\t\t},\n\t\tContacts: []Contact{\n\t\t\t{\n\t\t\t\tName:        \"Dmitriy 
Ryaboy\",\n\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t},\n\t\t\t{\n\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t},\n\t\t},\n\t}\n\n\tschema := parquet.SchemaOf(row)\n\tbuffer := parquet.Row{}\n\n\tfor i := 0; i < b.N; i++ {\n\t\tbuffer = schema.Deconstruct(buffer[:0], row)\n\t}\n}\n\nfunc BenchmarkReconstruct(b *testing.B) {\n\trow := &AddressBook{\n\t\tOwner: \"Julien Le Dem\",\n\t\tOwnerPhoneNumbers: []string{\n\t\t\t\"555 123 4567\",\n\t\t\t\"555 666 1337\",\n\t\t},\n\t\tContacts: []Contact{\n\t\t\t{\n\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t},\n\t\t\t{\n\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t},\n\t\t},\n\t}\n\n\tschema := parquet.SchemaOf(row)\n\tvalues := schema.Deconstruct(nil, row)\n\tbuffer := AddressBook{}\n\n\tfor i := 0; i < b.N; i++ {\n\t\tbuffer = AddressBook{}\n\n\t\tif err := schema.Reconstruct(&buffer, values); err != nil {\n\t\t\tb.Fatal(err)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "scan.go",
    "content": "package parquet\n\nimport \"io\"\n\n// ScanRowReader constructs a RowReader which exposes rows from reader until\n// the predicate returns false for one of the rows, or EOF is reached.\nfunc ScanRowReader(reader RowReader, predicate func(Row, int64) bool) RowReader {\n\treturn &scanRowReader{reader: reader, predicate: predicate}\n}\n\ntype scanRowReader struct {\n\treader    RowReader\n\tpredicate func(Row, int64) bool\n\trowIndex  int64\n}\n\nfunc (s *scanRowReader) ReadRows(rows []Row) (int, error) {\n\tif s.rowIndex < 0 {\n\t\treturn 0, io.EOF\n\t}\n\n\tn, err := s.reader.ReadRows(rows)\n\n\tfor i, row := range rows[:n] {\n\t\tif !s.predicate(row, s.rowIndex) {\n\t\t\ts.rowIndex = -1\n\t\t\treturn i, io.EOF\n\t\t}\n\t\ts.rowIndex++\n\t}\n\n\treturn n, err\n}\n"
  },
  {
    "path": "scan_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestScanRowReader(t *testing.T) {\n\trows := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(2)},\n\t\t{parquet.Int64Value(3)},\n\t\t{parquet.Int64Value(4)},\n\t}\n\n\twant := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(2)},\n\t}\n\n\treader := parquet.ScanRowReader(&bufferedRows{rows: rows},\n\t\tfunc(row parquet.Row, _ int64) bool {\n\t\t\treturn row[0].Int64() < 3\n\t\t},\n\t)\n\n\twriter := &bufferedRows{}\n\t_, err := parquet.CopyRows(writer, reader)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertEqualRows(t, want, writer.rows)\n}\n"
  },
  {
    "path": "schema.go",
    "content": "package parquet\n\nimport (\n\t\"fmt\"\n\t\"math\"\n\t\"reflect\"\n\t\"strconv\"\n\t\"strings\"\n\t\"sync\"\n\t\"time\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n)\n\n// Schema represents a parquet schema created from a Go value.\n//\n// Schema implements the Node interface to represent the root node of a parquet\n// schema.\ntype Schema struct {\n\tname        string\n\troot        Node\n\tdeconstruct deconstructFunc\n\treconstruct reconstructFunc\n\tmapping     columnMapping\n\tcolumns     [][]string\n}\n\n// SchemaOf constructs a parquet schema from a Go value.\n//\n// The function can construct parquet schemas from struct or pointer-to-struct\n// values only. A panic is raised if a Go value of a different type is passed\n// to this function.\n//\n// When creating a parquet Schema from a Go value, the struct fields may contain\n// a \"parquet\" tag to describe properties of the parquet node. 
The \"parquet\" tag\n// follows the conventional format of Go struct tags: a comma-separated list of\n// values describe the options, with the first one defining the name of the\n// parquet column.\n//\n// The following options are also supported in the \"parquet\" struct tag:\n//\n//\toptional  | make the parquet column optional\n//\tsnappy    | sets the parquet column compression codec to snappy\n//\tgzip      | sets the parquet column compression codec to gzip\n//\tbrotli    | sets the parquet column compression codec to brotli\n//\tlz4       | sets the parquet column compression codec to lz4\n//\tzstd      | sets the parquet column compression codec to zstd\n//\tplain     | enables the plain encoding (no-op default)\n//\tdict      | enables dictionary encoding on the parquet column\n//\tdelta     | enables delta encoding on the parquet column\n//\tlist      | for slice types, use the parquet LIST logical type\n//\tenum      | for string types, use the parquet ENUM logical type\n//\tuuid      | for string and [16]byte types, use the parquet UUID logical type\n//\tdecimal   | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type\n//\tdate      | for int32 types use the DATE logical type\n//\ttimestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision\n//\tsplit     | for float32/float64, use the BYTE_STREAM_SPLIT encoding\n//\n// # The date logical type is an int32 value of the number of days since the unix epoch\n//\n// The timestamp precision can be changed by defining which precision to use as an argument.\n// Supported precisions are: nanosecond, millisecond and microsecond. 
Example:\n//\n//\ttype Message struct {\n//\t  TimestampMicros int64 `parquet:\"timestamp_micros,timestamp(microsecond)\"`\n//\t}\n//\n// The decimal tag must be followed by two integer parameters, the first integer\n// representing the scale and the second the precision; for example:\n//\n//\ttype Item struct {\n//\t\tCost int64 `parquet:\"cost,decimal(0:3)\"`\n//\t}\n//\n// Invalid combination of struct tags and Go types, or repeating options will\n// cause the function to panic.\n//\n// As a special case, if the field tag is \"-\", the field is omitted from the schema\n// and the data will not be written into the parquet file(s).\n// Note that a field with name \"-\" can still be generated using the tag \"-,\".\n//\n// The configuration of Parquet maps is done via two tags:\n//   - The `parquet-key` tag allows to configure the key of a map.\n//   - The parquet-value tag allows users to configure a map's values, for example to declare their native Parquet types.\n//\n// When configuring a Parquet map, the `parquet` tag will configure the map itself.\n//\n// For example, the following will set the int64 key of the map to be a timestamp:\n//\n//\ttype Actions struct {\n//\t  Action map[int64]string `parquet:\",\" parquet-key:\",timestamp\"`\n//\t}\n//\n// The schema name is the Go type name of the value.\nfunc SchemaOf(model interface{}) *Schema {\n\treturn schemaOf(dereference(reflect.TypeOf(model)))\n}\n\nvar cachedSchemas sync.Map // map[reflect.Type]*Schema\n\nfunc schemaOf(model reflect.Type) *Schema {\n\tcached, _ := cachedSchemas.Load(model)\n\tschema, _ := cached.(*Schema)\n\tif schema != nil {\n\t\treturn schema\n\t}\n\tif model.Kind() != reflect.Struct {\n\t\tpanic(\"cannot construct parquet schema from value of type \" + model.String())\n\t}\n\tschema = NewSchema(model.Name(), nodeOf(model, nil))\n\tif actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded {\n\t\tschema = actual.(*Schema)\n\t}\n\treturn schema\n}\n\n// NewSchema constructs a 
new Schema object with the given name and root node.\n//\n// The function panics if Node contains more leaf columns than supported by the\n// package (see parquet.MaxColumnIndex).\nfunc NewSchema(name string, root Node) *Schema {\n\tmapping, columns := columnMappingOf(root)\n\treturn &Schema{\n\t\tname:        name,\n\t\troot:        root,\n\t\tdeconstruct: makeDeconstructFunc(root),\n\t\treconstruct: makeReconstructFunc(root),\n\t\tmapping:     mapping,\n\t\tcolumns:     columns,\n\t}\n}\n\nfunc dereference(t reflect.Type) reflect.Type {\n\tfor t.Kind() == reflect.Ptr {\n\t\tt = t.Elem()\n\t}\n\treturn t\n}\n\nfunc makeDeconstructFunc(node Node) (deconstruct deconstructFunc) {\n\tif schema, _ := node.(*Schema); schema != nil {\n\t\treturn schema.deconstruct\n\t}\n\tif !node.Leaf() {\n\t\t_, deconstruct = deconstructFuncOf(0, node)\n\t}\n\treturn deconstruct\n}\n\nfunc makeReconstructFunc(node Node) (reconstruct reconstructFunc) {\n\tif schema, _ := node.(*Schema); schema != nil {\n\t\treturn schema.reconstruct\n\t}\n\tif !node.Leaf() {\n\t\t_, reconstruct = reconstructFuncOf(0, node)\n\t}\n\treturn reconstruct\n}\n\n// ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema\n// instances to be passed to row group constructors to pre-declare the schema of\n// the output parquet file.\nfunc (s *Schema) ConfigureRowGroup(config *RowGroupConfig) { config.Schema = s }\n\n// ConfigureReader satisfies the ReaderOption interface, allowing Schema\n// instances to be passed to NewReader to pre-declare the schema of rows\n// read from the reader.\nfunc (s *Schema) ConfigureReader(config *ReaderConfig) { config.Schema = s }\n\n// ConfigureWriter satisfies the WriterOption interface, allowing Schema\n// instances to be passed to NewWriter to pre-declare the schema of the\n// output parquet file.\nfunc (s *Schema) ConfigureWriter(config *WriterConfig) { config.Schema = s }\n\n// String returns a parquet schema representation of s.\nfunc (s *Schema) String() 
string { return sprint(s.name, s.root) }\n\n// Name returns the name of s.\nfunc (s *Schema) Name() string { return s.name }\n\n// Type returns the parquet type of s.\nfunc (s *Schema) Type() Type { return s.root.Type() }\n\n// Optional returns false since the root node of a parquet schema is always required.\nfunc (s *Schema) Optional() bool { return s.root.Optional() }\n\n// Repeated returns false since the root node of a parquet schema is always required.\nfunc (s *Schema) Repeated() bool { return s.root.Repeated() }\n\n// Required returns true since the root node of a parquet schema is always required.\nfunc (s *Schema) Required() bool { return s.root.Required() }\n\n// Leaf returns true if the root node of the parquet schema is a leaf column.\nfunc (s *Schema) Leaf() bool { return s.root.Leaf() }\n\n// Fields returns the list of fields on the root node of the parquet schema.\nfunc (s *Schema) Fields() []Field { return s.root.Fields() }\n\n// Encoding returns the encoding set on the root node of the parquet schema.\nfunc (s *Schema) Encoding() encoding.Encoding { return s.root.Encoding() }\n\n// Compression returns the compression codec set on the root node of the parquet\n// schema.\nfunc (s *Schema) Compression() compress.Codec { return s.root.Compression() }\n\n// GoType returns the Go type that best represents the schema.\nfunc (s *Schema) GoType() reflect.Type { return s.root.GoType() }\n\n// Deconstruct deconstructs a Go value and appends it to a row.\n//\n// The method panics if the structure of the go value does not match the\n// parquet schema.\nfunc (s *Schema) Deconstruct(row Row, value interface{}) Row {\n\tcolumns := make([][]Value, len(s.columns))\n\tvalues := make([]Value, len(s.columns))\n\n\tfor i := range columns {\n\t\tcolumns[i] = values[i : i : i+1]\n\t}\n\n\ts.deconstructValueToColumns(columns, reflect.ValueOf(value))\n\treturn appendRow(row, columns)\n}\n\nfunc (s *Schema) deconstructValueToColumns(columns [][]Value, value reflect.Value) 
{\n\tfor value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface {\n\t\tif value.IsNil() {\n\t\t\tvalue = reflect.Value{}\n\t\t\tbreak\n\t\t}\n\t\tvalue = value.Elem()\n\t}\n\ts.deconstruct(columns, levels{}, value)\n}\n\n// Reconstruct reconstructs a Go value from a row.\n//\n// The go value passed as first argument must be a non-nil pointer for the\n// row to be decoded into.\n//\n// The method panics if the structure of the go value and parquet row do not\n// match.\nfunc (s *Schema) Reconstruct(value interface{}, row Row) error {\n\tv := reflect.ValueOf(value)\n\tif !v.IsValid() {\n\t\tpanic(\"cannot reconstruct row into go value of type <nil>\")\n\t}\n\tif v.Kind() != reflect.Ptr {\n\t\tpanic(\"cannot reconstruct row into go value of non-pointer type \" + v.Type().String())\n\t}\n\tif v.IsNil() {\n\t\tpanic(\"cannot reconstruct row into nil pointer of type \" + v.Type().String())\n\t}\n\tfor v.Kind() == reflect.Ptr {\n\t\tif v.IsNil() {\n\t\t\tv.Set(reflect.New(v.Type().Elem()))\n\t\t}\n\t\tv = v.Elem()\n\t}\n\n\tcolumns := make([][]Value, len(s.columns))\n\trow.Range(func(columnIndex int, columnValues []Value) bool {\n\t\tif columnIndex < len(columns) {\n\t\t\tcolumns[columnIndex] = columnValues\n\t\t}\n\t\treturn true\n\t})\n\n\treturn s.reconstruct(v, levels{}, columns)\n}\n\n// Lookup returns the leaf column at the given path.\n//\n// The path is the sequence of column names identifying a leaf column (not\n// including the root).\n//\n// If the path was not found in the mapping, or if it did not represent a\n// leaf column of the parquet schema, the boolean will be false.\nfunc (s *Schema) Lookup(path ...string) (LeafColumn, bool) {\n\tleaf := s.mapping.lookup(path)\n\treturn LeafColumn{\n\t\tNode:               leaf.node,\n\t\tPath:               leaf.path,\n\t\tColumnIndex:        int(leaf.columnIndex),\n\t\tMaxRepetitionLevel: int(leaf.maxRepetitionLevel),\n\t\tMaxDefinitionLevel: int(leaf.maxDefinitionLevel),\n\t}, leaf.node != nil\n}\n\n// 
Columns returns the list of column paths available in the schema.\n//\n// The method always returns the same slice value across calls to ColumnPaths,\n// applications should treat it as immutable.\nfunc (s *Schema) Columns() [][]string {\n\treturn s.columns\n}\n\n// Comparator constructs a comparator function which orders rows according to\n// the list of sorting columns passed as arguments.\nfunc (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int {\n\treturn compareRowsFuncOf(s, sortingColumns)\n}\n\nfunc (s *Schema) forEachNode(do func(name string, node Node)) {\n\tforEachNodeOf(s.Name(), s, do)\n}\n\ntype structNode struct {\n\tgotype reflect.Type\n\tfields []structField\n}\n\nfunc structNodeOf(t reflect.Type) *structNode {\n\t// Collect struct fields first so we can order them before generating the\n\t// column indexes.\n\tfields := structFieldsOf(t)\n\n\ts := &structNode{\n\t\tgotype: t,\n\t\tfields: make([]structField, len(fields)),\n\t}\n\n\tfor i := range fields {\n\t\tfield := structField{name: fields[i].Name, index: fields[i].Index}\n\t\tfield.Node = makeNodeOf(fields[i].Type, fields[i].Name, []string{\n\t\t\tfields[i].Tag.Get(\"parquet\"),\n\t\t\tfields[i].Tag.Get(\"parquet-key\"),\n\t\t\tfields[i].Tag.Get(\"parquet-value\"),\n\t\t})\n\t\ts.fields[i] = field\n\t}\n\n\treturn s\n}\n\nfunc structFieldsOf(t reflect.Type) []reflect.StructField {\n\tfields := appendStructFields(t, nil, nil, 0)\n\n\tfor i := range fields {\n\t\tf := &fields[i]\n\n\t\tif tag := f.Tag.Get(\"parquet\"); tag != \"\" {\n\t\t\tname, _ := split(tag)\n\t\t\tif name != \"\" {\n\t\t\t\tf.Name = name\n\t\t\t}\n\t\t}\n\t}\n\n\treturn fields\n}\n\nfunc appendStructFields(t reflect.Type, fields []reflect.StructField, index []int, offset uintptr) []reflect.StructField {\n\tfor i, n := 0, t.NumField(); i < n; i++ {\n\t\tf := t.Field(i)\n\t\tif tag := f.Tag.Get(\"parquet\"); tag != \"\" {\n\t\t\tname, _ := split(tag)\n\t\t\tif tag != \"-,\" && name == \"-\" 
{\n\t\t\t\tcontinue\n\t\t\t}\n\t\t}\n\n\t\tfieldIndex := index[:len(index):len(index)]\n\t\tfieldIndex = append(fieldIndex, i)\n\n\t\tf.Offset += offset\n\n\t\tif f.Anonymous {\n\t\t\tfields = appendStructFields(f.Type, fields, fieldIndex, f.Offset)\n\t\t} else if f.IsExported() {\n\t\t\tf.Index = fieldIndex\n\t\t\tfields = append(fields, f)\n\t\t}\n\t}\n\treturn fields\n}\n\nfunc (s *structNode) Optional() bool { return false }\n\nfunc (s *structNode) Repeated() bool { return false }\n\nfunc (s *structNode) Required() bool { return true }\n\nfunc (s *structNode) Leaf() bool { return false }\n\nfunc (s *structNode) Encoding() encoding.Encoding { return nil }\n\nfunc (s *structNode) Compression() compress.Codec { return nil }\n\nfunc (s *structNode) GoType() reflect.Type { return s.gotype }\n\nfunc (s *structNode) String() string { return sprint(\"\", s) }\n\nfunc (s *structNode) Type() Type { return groupType{} }\n\nfunc (s *structNode) Fields() []Field {\n\tfields := make([]Field, len(s.fields))\n\tfor i := range s.fields {\n\t\tfields[i] = &s.fields[i]\n\t}\n\treturn fields\n}\n\n// fieldByIndex is like reflect.Value.FieldByIndex but returns the zero-value of\n// reflect.Value if one of the fields was a nil pointer instead of panicking.\nfunc fieldByIndex(v reflect.Value, index []int) reflect.Value {\n\tfor _, i := range index {\n\t\tif v = v.Field(i); v.Kind() == reflect.Ptr || v.Kind() == reflect.Interface {\n\t\t\tif v.IsNil() {\n\t\t\t\tv = reflect.Value{}\n\t\t\t\tbreak\n\t\t\t} else {\n\t\t\t\tv = v.Elem()\n\t\t\t}\n\t\t}\n\t}\n\treturn v\n}\n\ntype structField struct {\n\tNode\n\tname  string\n\tindex []int\n}\n\nfunc (f *structField) Name() string { return f.name }\n\nfunc (f *structField) Value(base reflect.Value) reflect.Value {\n\tswitch base.Kind() {\n\tcase reflect.Map:\n\t\treturn base.MapIndex(reflect.ValueOf(&f.name).Elem())\n\tcase reflect.Ptr:\n\t\tif base.IsNil() {\n\t\t\tbase.Set(reflect.New(base.Type().Elem()))\n\t\t}\n\t\treturn 
fieldByIndex(base.Elem(), f.index)\n\tdefault:\n\t\tif len(f.index) == 1 {\n\t\t\treturn base.Field(f.index[0])\n\t\t} else {\n\t\t\treturn fieldByIndex(base, f.index)\n\t\t}\n\t}\n}\n\nfunc nodeString(t reflect.Type, name string, tag ...string) string {\n\treturn fmt.Sprintf(\"%s %s %v\", name, t.String(), tag)\n}\n\nfunc throwInvalidTag(t reflect.Type, name string, tag string) {\n\tpanic(tag + \" is an invalid parquet tag: \" + nodeString(t, name, tag))\n}\n\nfunc throwUnknownTag(t reflect.Type, name string, tag string) {\n\tpanic(tag + \" is an unrecognized parquet tag: \" + nodeString(t, name, tag))\n}\n\nfunc throwInvalidNode(t reflect.Type, msg, name string, tag ...string) {\n\tpanic(msg + \": \" + nodeString(t, name, tag...))\n}\n\n// FixedLenByteArray decimals are sized based on precision\n// this function calculates the necessary byte array size.\nfunc decimalFixedLenByteArraySize(precision int) int {\n\treturn int(math.Ceil((math.Log10(2) + float64(precision)) / math.Log10(256)))\n}\n\nfunc forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, option, args string)) {\n\tif tag := sf.Tag.Get(\"parquet\"); tag != \"\" {\n\t\t_, tag = split(tag) // skip the field name\n\t\tfor tag != \"\" {\n\t\t\toption := \"\"\n\t\t\targs := \"\"\n\t\t\toption, tag = split(tag)\n\t\t\toption, args = splitOptionArgs(option)\n\t\t\tft := sf.Type\n\t\t\tif ft.Kind() == reflect.Ptr {\n\t\t\t\tft = ft.Elem()\n\t\t\t}\n\t\t\tdo(ft, option, args)\n\t\t}\n\t}\n}\n\nfunc nodeOf(t reflect.Type, tag []string) Node {\n\tswitch t {\n\tcase reflect.TypeOf(deprecated.Int96{}):\n\t\treturn Leaf(Int96Type)\n\tcase reflect.TypeOf(uuid.UUID{}):\n\t\treturn UUID()\n\tcase reflect.TypeOf(time.Time{}):\n\t\treturn Timestamp(Nanosecond)\n\t}\n\n\tvar n Node\n\tswitch t.Kind() {\n\tcase reflect.Bool:\n\t\tn = Leaf(BooleanType)\n\n\tcase reflect.Int, reflect.Int64:\n\t\tn = Int(64)\n\n\tcase reflect.Int8, reflect.Int16, reflect.Int32:\n\t\tn = Int(t.Bits())\n\n\tcase reflect.Uint, 
reflect.Uintptr, reflect.Uint64:\n\t\tn = Uint(64)\n\n\tcase reflect.Uint8, reflect.Uint16, reflect.Uint32:\n\t\tn = Uint(t.Bits())\n\n\tcase reflect.Float32:\n\t\tn = Leaf(FloatType)\n\n\tcase reflect.Float64:\n\t\tn = Leaf(DoubleType)\n\n\tcase reflect.String:\n\t\tn = String()\n\n\tcase reflect.Ptr:\n\t\tn = Optional(nodeOf(t.Elem(), nil))\n\n\tcase reflect.Slice:\n\t\tif elem := t.Elem(); elem.Kind() == reflect.Uint8 { // []byte?\n\t\t\tn = Leaf(ByteArrayType)\n\t\t} else {\n\t\t\tn = Repeated(nodeOf(elem, nil))\n\t\t}\n\n\tcase reflect.Array:\n\t\tif t.Elem().Kind() == reflect.Uint8 {\n\t\t\tn = Leaf(FixedLenByteArrayType(t.Len()))\n\t\t}\n\n\tcase reflect.Map:\n\t\tvar mapTag, valueTag, keyTag string\n\t\tif len(tag) > 0 {\n\t\t\tmapTag = tag[0]\n\t\t\tif len(tag) > 1 {\n\t\t\t\tkeyTag = tag[1]\n\t\t\t}\n\t\t\tif len(tag) >= 2 {\n\t\t\t\tvalueTag = tag[2]\n\t\t\t}\n\t\t}\n\n\t\tif strings.Contains(mapTag, \"json\") {\n\t\t\tn = JSON()\n\t\t} else {\n\t\t\tn = Map(\n\t\t\t\tmakeNodeOf(t.Key(), t.Name(), []string{keyTag}),\n\t\t\t\tmakeNodeOf(t.Elem(), t.Name(), []string{valueTag}),\n\t\t\t)\n\t\t}\n\n\t\tforEachTagOption([]string{mapTag}, func(option, args string) {\n\t\t\tswitch option {\n\t\t\tcase \"\", \"json\":\n\t\t\t\treturn\n\t\t\tcase \"optional\":\n\t\t\t\tn = Optional(n)\n\t\t\tdefault:\n\t\t\t\tthrowUnknownTag(t, \"map\", option)\n\t\t\t}\n\t\t})\n\n\tcase reflect.Struct:\n\t\treturn structNodeOf(t)\n\t}\n\n\tif n == nil {\n\t\tpanic(\"cannot create parquet node from go value of type \" + t.String())\n\t}\n\n\treturn &goNode{Node: n, gotype: t}\n}\n\nfunc split(s string) (head, tail string) {\n\tif i := strings.IndexByte(s, ','); i < 0 {\n\t\thead = s\n\t} else {\n\t\thead, tail = s[:i], s[i+1:]\n\t}\n\treturn\n}\n\nfunc splitOptionArgs(s string) (option, args string) {\n\tif i := strings.IndexByte(s, '('); i >= 0 {\n\t\toption = s[:i]\n\t\targs = s[i:]\n\t} else {\n\t\toption = s\n\t\targs = \"()\"\n\t}\n\treturn\n}\n\nfunc parseDecimalArgs(args 
string) (scale, precision int, err error) {\n\tif !strings.HasPrefix(args, \"(\") || !strings.HasSuffix(args, \")\") {\n\t\treturn 0, 0, fmt.Errorf(\"malformed decimal args: %s\", args)\n\t}\n\targs = strings.TrimPrefix(args, \"(\")\n\targs = strings.TrimSuffix(args, \")\")\n\tparts := strings.Split(args, \":\")\n\tif len(parts) != 2 {\n\t\treturn 0, 0, fmt.Errorf(\"malformed decimal args: (%s)\", args)\n\t}\n\ts, err := strconv.ParseInt(parts[0], 10, 32)\n\tif err != nil {\n\t\treturn 0, 0, err\n\t}\n\tp, err := strconv.ParseInt(parts[1], 10, 32)\n\tif err != nil {\n\t\treturn 0, 0, err\n\t}\n\treturn int(s), int(p), nil\n}\n\nfunc parseTimestampArgs(args string) (TimeUnit, error) {\n\tif !strings.HasPrefix(args, \"(\") || !strings.HasSuffix(args, \")\") {\n\t\treturn nil, fmt.Errorf(\"malformed timestamp args: %s\", args)\n\t}\n\n\targs = strings.TrimPrefix(args, \"(\")\n\targs = strings.TrimSuffix(args, \")\")\n\n\tif len(args) == 0 {\n\t\treturn Millisecond, nil\n\t}\n\n\tswitch args {\n\tcase \"millisecond\":\n\t\treturn Millisecond, nil\n\tcase \"microsecond\":\n\t\treturn Microsecond, nil\n\tcase \"nanosecond\":\n\t\treturn Nanosecond, nil\n\tdefault:\n\t}\n\n\treturn nil, fmt.Errorf(\"unknown time unit: %s\", args)\n}\n\ntype goNode struct {\n\tNode\n\tgotype reflect.Type\n}\n\nfunc (n *goNode) GoType() reflect.Type { return n.gotype }\n\nvar (\n\t_ RowGroupOption = (*Schema)(nil)\n\t_ ReaderOption   = (*Schema)(nil)\n\t_ WriterOption   = (*Schema)(nil)\n)\n\nfunc makeNodeOf(t reflect.Type, name string, tag []string) Node {\n\tvar (\n\t\tnode       Node\n\t\toptional   bool\n\t\tlist       bool\n\t\tencoded    encoding.Encoding\n\t\tcompressed compress.Codec\n\t)\n\n\tsetNode := func(n Node) {\n\t\tif node != nil {\n\t\t\tthrowInvalidNode(t, \"struct field has multiple logical parquet types declared\", name, tag...)\n\t\t}\n\t\tnode = n\n\t}\n\n\tsetOptional := func() {\n\t\tif optional {\n\t\t\tthrowInvalidNode(t, \"struct field has multiple declaration of 
the optional tag\", name, tag...)\n\t\t}\n\t\toptional = true\n\t}\n\n\tsetList := func() {\n\t\tif list {\n\t\t\tthrowInvalidNode(t, \"struct field has multiple declaration of the list tag\", name, tag...)\n\t\t}\n\t\tlist = true\n\t}\n\n\tsetEncoding := func(e encoding.Encoding) {\n\t\tif encoded != nil {\n\t\t\tthrowInvalidNode(t, \"struct field has encoding declared multiple time\", name, tag...)\n\t\t}\n\t\tencoded = e\n\t}\n\n\tsetCompression := func(c compress.Codec) {\n\t\tif compressed != nil {\n\t\t\tthrowInvalidNode(t, \"struct field has compression codecs declared multiple times\", name, tag...)\n\t\t}\n\t\tcompressed = c\n\t}\n\n\tforEachTagOption(tag, func(option, args string) {\n\t\tif t.Kind() == reflect.Map {\n\t\t\tnode = nodeOf(t, tag)\n\t\t\treturn\n\t\t}\n\t\tswitch option {\n\t\tcase \"\":\n\t\t\treturn\n\t\tcase \"optional\":\n\t\t\tsetOptional()\n\n\t\tcase \"snappy\":\n\t\t\tsetCompression(&Snappy)\n\n\t\tcase \"gzip\":\n\t\t\tsetCompression(&Gzip)\n\n\t\tcase \"brotli\":\n\t\t\tsetCompression(&Brotli)\n\n\t\tcase \"lz4\":\n\t\t\tsetCompression(&Lz4Raw)\n\n\t\tcase \"zstd\":\n\t\t\tsetCompression(&Zstd)\n\n\t\tcase \"uncompressed\":\n\t\t\tsetCompression(&Uncompressed)\n\n\t\tcase \"plain\":\n\t\t\tsetEncoding(&Plain)\n\n\t\tcase \"dict\":\n\t\t\tsetEncoding(&RLEDictionary)\n\n\t\tcase \"json\":\n\t\t\tsetNode(JSON())\n\n\t\tcase \"delta\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Int, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint32, reflect.Uint64:\n\t\t\t\tsetEncoding(&DeltaBinaryPacked)\n\t\t\tcase reflect.String:\n\t\t\t\tsetEncoding(&DeltaByteArray)\n\t\t\tcase reflect.Slice:\n\t\t\t\tif t.Elem().Kind() == reflect.Uint8 { // []byte?\n\t\t\t\t\tsetEncoding(&DeltaByteArray)\n\t\t\t\t} else {\n\t\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t\t}\n\t\t\tcase reflect.Array:\n\t\t\t\tif t.Elem().Kind() == reflect.Uint8 { // [N]byte?\n\t\t\t\t\tsetEncoding(&DeltaByteArray)\n\t\t\t\t} else {\n\t\t\t\t\tthrowInvalidTag(t, name, 
option)\n\t\t\t\t}\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\n\t\tcase \"split\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Float32, reflect.Float64:\n\t\t\t\tsetEncoding(&ByteStreamSplit)\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\n\t\tcase \"list\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Slice:\n\t\t\t\telement := nodeOf(t.Elem(), nil)\n\t\t\t\tsetNode(element)\n\t\t\t\tsetList()\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\n\t\tcase \"enum\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.String:\n\t\t\t\tsetNode(Enum())\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\n\t\tcase \"uuid\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Array:\n\t\t\t\tif t.Elem().Kind() != reflect.Uint8 || t.Len() != 16 {\n\t\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t\t}\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\n\t\tcase \"decimal\":\n\t\t\tscale, precision, err := parseDecimalArgs(args)\n\t\t\tif err != nil {\n\t\t\t\tthrowInvalidTag(t, name, option+args)\n\t\t\t}\n\t\t\tvar baseType Type\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Int32:\n\t\t\t\tbaseType = Int32Type\n\t\t\tcase reflect.Int64:\n\t\t\t\tbaseType = Int64Type\n\t\t\tcase reflect.Array, reflect.Slice:\n\t\t\t\tbaseType = FixedLenByteArrayType(decimalFixedLenByteArraySize(precision))\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\n\t\t\tsetNode(Decimal(scale, precision, baseType))\n\t\tcase \"date\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Int32:\n\t\t\t\tsetNode(Date())\n\t\t\tdefault:\n\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t}\n\t\tcase \"timestamp\":\n\t\t\tswitch t.Kind() {\n\t\t\tcase reflect.Int64:\n\t\t\t\ttimeUnit, err := parseTimestampArgs(args)\n\t\t\t\tif err != nil {\n\t\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t\t}\n\t\t\t\tsetNode(Timestamp(timeUnit))\n\t\t\tdefault:\n\t\t\t\tswitch t {\n\t\t\t\tcase 
reflect.TypeOf(time.Time{}):\n\t\t\t\t\ttimeUnit, err := parseTimestampArgs(args)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t\t\t}\n\t\t\t\t\tsetNode(Timestamp(timeUnit))\n\t\t\t\tdefault:\n\t\t\t\t\tthrowInvalidTag(t, name, option)\n\t\t\t\t}\n\t\t\t}\n\t\tdefault:\n\t\t\tthrowUnknownTag(t, name, option)\n\t\t}\n\t})\n\n\t// Special case: an \"optional\" struct tag on a slice applies to the\n\t// individual items, not the overall list. The least messy way to\n\t// deal with this is at this level, instead of passing down optional\n\t// information into the nodeOf function, and then passing back whether an\n\t// optional tag was applied.\n\tif node == nil && t.Kind() == reflect.Slice {\n\t\tisUint8 := t.Elem().Kind() == reflect.Uint8\n\t\t// Note for strings \"optional\" applies only to the entire BYTE_ARRAY and\n\t\t// not each individual byte.\n\t\tif optional && !isUint8 {\n\t\t\tnode = Repeated(Optional(nodeOf(t.Elem(), tag)))\n\t\t\t// Don't also apply \"optional\" to the whole list.\n\t\t\toptional = false\n\t\t}\n\t}\n\n\tif node == nil {\n\t\tnode = nodeOf(t, tag)\n\t}\n\n\tif compressed != nil {\n\t\tnode = Compressed(node, compressed)\n\t}\n\n\tif encoded != nil {\n\t\tnode = Encoded(node, encoded)\n\t}\n\n\tif list {\n\t\tnode = List(node)\n\t}\n\n\tif node.Repeated() && !list {\n\t\telemKind := node.GoType().Elem().Kind()\n\t\tif elemKind == reflect.Slice {\n\t\t\tpanic(\"unhandled nested slice on parquet schema without list tag\")\n\t\t}\n\t}\n\n\tif optional {\n\t\tnode = Optional(node)\n\t}\n\n\treturn node\n}\n\nfunc forEachTagOption(tags []string, do func(option, args string)) {\n\tfor _, tag := range tags {\n\t\t_, tag = split(tag) // skip the field name\n\t\tfor tag != \"\" {\n\t\t\toption := \"\"\n\t\t\toption, tag = split(tag)\n\t\t\tvar args string\n\t\t\toption, args = splitOptionArgs(option)\n\t\t\tdo(option, args)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "schema_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestSchemaOf(t *testing.T) {\n\ttests := []struct {\n\t\tvalue interface{}\n\t\tprint string\n\t}{\n\t\t{\n\t\t\tvalue: new(struct{ Name string }),\n\t\t\tprint: `message {\n\trequired binary Name (STRING);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tX int\n\t\t\t\tY int\n\t\t\t}),\n\t\t\tprint: `message {\n\trequired int64 X (INT(64,true));\n\trequired int64 Y (INT(64,true));\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tX float32\n\t\t\t\tY float32\n\t\t\t}),\n\t\t\tprint: `message {\n\trequired float X;\n\trequired float Y;\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tInner struct {\n\t\t\t\t\tFirstName string `parquet:\"first_name\"`\n\t\t\t\t\tLastName  string `parquet:\"last_name\"`\n\t\t\t\t} `parquet:\"inner,optional\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\toptional group inner {\n\t\trequired binary first_name (STRING);\n\t\trequired binary last_name (STRING);\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tShort float32 `parquet:\"short,split\"`\n\t\t\t\tLong  float64 `parquet:\"long,split\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\trequired float short;\n\trequired double long;\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tInner struct {\n\t\t\t\t\tFirstName          string `parquet:\"first_name\"`\n\t\t\t\t\tShouldNotBePresent string `parquet:\"-\"`\n\t\t\t\t} `parquet:\"inner,optional\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\toptional group inner {\n\t\trequired binary first_name (STRING);\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tInner struct {\n\t\t\t\t\tFirstName    string `parquet:\"first_name\"`\n\t\t\t\t\tMyNameIsDash string `parquet:\"-,\"`\n\t\t\t\t} `parquet:\"inner,optional\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\toptional group inner {\n\t\trequired binary first_name (STRING);\n\t\trequired binary - 
(STRING);\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tInner struct {\n\t\t\t\t\tTimestampMillis int64 `parquet:\"timestamp_millis,timestamp\"`\n\t\t\t\t\tTimestampMicros int64 `parquet:\"timestamp_micros,timestamp(microsecond)\"`\n\t\t\t\t} `parquet:\"inner,optional\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\toptional group inner {\n\t\trequired int64 timestamp_millis (TIMESTAMP(isAdjustedToUTC=true,unit=MILLIS));\n\t\trequired int64 timestamp_micros (TIMESTAMP(isAdjustedToUTC=true,unit=MICROS));\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tName string `parquet:\",json\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\trequired binary Name (JSON);\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tA map[int64]string `parquet:\",\" parquet-key:\",timestamp\"`\n\t\t\t\tB map[int64]string\n\t\t\t}),\n\t\t\tprint: `message {\n\trequired group A (MAP) {\n\t\trepeated group key_value {\n\t\t\trequired int64 key (TIMESTAMP(isAdjustedToUTC=true,unit=MILLIS));\n\t\t\trequired binary value (STRING);\n\t\t}\n\t}\n\trequired group B (MAP) {\n\t\trepeated group key_value {\n\t\t\trequired int64 key (INT(64,true));\n\t\t\trequired binary value (STRING);\n\t\t}\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tA map[int64]string `parquet:\",optional\" parquet-value:\",json\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\toptional group A (MAP) {\n\t\trepeated group key_value {\n\t\t\trequired int64 key (INT(64,true));\n\t\t\trequired binary value (JSON);\n\t\t}\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tA map[int64]string `parquet:\",optional\"`\n\t\t\t}),\n\t\t\tprint: `message {\n\toptional group A (MAP) {\n\t\trepeated group key_value {\n\t\t\trequired int64 key (INT(64,true));\n\t\t\trequired binary value (STRING);\n\t\t}\n\t}\n}`,\n\t\t},\n\n\t\t{\n\t\t\tvalue: new(struct {\n\t\t\t\tA map[int64]string `parquet:\",optional\" parquet-value:\",json\" parquet-key:\",timestamp(microsecond)\"`\n\t\t\t}),\n\t\t\tprint: 
`message {\n\toptional group A (MAP) {\n\t\trepeated group key_value {\n\t\t\trequired int64 key (TIMESTAMP(isAdjustedToUTC=true,unit=MICROS));\n\t\t\trequired binary value (JSON);\n\t\t}\n\t}\n}`,\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(\"\", func(t *testing.T) {\n\t\t\tschema := parquet.SchemaOf(test.value)\n\n\t\t\tif s := schema.String(); s != test.print {\n\t\t\t\tt.Errorf(\"\\nexpected:\\n\\n%s\\n\\nfound:\\n\\n%s\\n\", test.print, s)\n\t\t\t}\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "search.go",
    "content": "package parquet\n\n// Search is like Find, but uses the default ordering of the given type. Search\n// and Find are scoped to a given ColumnChunk and find the pages within a\n// ColumnChunk which might contain the result.  See Find for more details.\nfunc Search(index ColumnIndex, value Value, typ Type) int {\n\treturn Find(index, value, CompareNullsLast(typ.Compare))\n}\n\n// Find uses the ColumnIndex passed as argument to find the page in a column\n// chunk (determined by the given ColumnIndex) that the given value is expected\n// to be found in.\n//\n// The function returns the index of the first page that might contain the\n// value. If the function determines that the value does not exist in the\n// index, NumPages is returned.\n//\n// If you want to search the entire parquet file, you must iterate over the\n// RowGroups and search each one individually, if there are multiple in the\n// file. If you call writer.Flush before closing the file, then you will have\n// multiple RowGroups to iterate over, otherwise Flush is called once on Close.\n//\n// The comparison function passed as last argument is used to determine the\n// relative order of values. 
This should generally be the Compare method of\n// the column type, but can sometimes be customized to modify how null values\n// are interpreted, for example:\n//\n//\tpageIndex := parquet.Find(columnIndex, value,\n//\t\tparquet.CompareNullsFirst(typ.Compare),\n//\t)\nfunc Find(index ColumnIndex, value Value, cmp func(Value, Value) int) int {\n\tswitch {\n\tcase index.IsAscending():\n\t\treturn binarySearch(index, value, cmp)\n\tdefault:\n\t\treturn linearSearch(index, value, cmp)\n\t}\n}\n\nfunc binarySearch(index ColumnIndex, value Value, cmp func(Value, Value) int) int {\n\tn := index.NumPages()\n\tcurIdx := 0\n\ttopIdx := n\n\n\t// while there's at least one more page to check\n\tfor (topIdx - curIdx) > 1 {\n\n\t\t// nextIdx is set to halfway between curIdx and topIdx\n\t\tnextIdx := ((topIdx - curIdx) / 2) + curIdx\n\n\t\tsmallerThanMin := cmp(value, index.MinValue(nextIdx))\n\n\t\tswitch {\n\t\t// search below pages[nextIdx]\n\t\tcase smallerThanMin < 0:\n\t\t\ttopIdx = nextIdx\n\t\t// search pages[nextIdx] and above\n\t\tcase smallerThanMin > 0:\n\t\t\tcurIdx = nextIdx\n\t\tcase smallerThanMin == 0:\n\t\t\t// this case is hit when minValue == value of nextIdx\n\t\t\t// we must check below this index to find if there's\n\t\t\t// another page before this.\n\t\t\t// e.g. 
searching for first page 3 is in:\n\t\t\t// [1,2,3]\n\t\t\t// [3,4,5]\n\t\t\t// [6,7,8]\n\n\t\t\t// if the page proceeding this has a maxValue matching the value we're\n\t\t\t// searching, continue the search.\n\t\t\t// otherwise, we can return early\n\t\t\t//\n\t\t\t// cases covered by else block\n\t\t\t// if cmp(value, index.MaxValue(nextIdx-1)) < 0: the value is only in this page\n\t\t\t// if cmp(value, index.MaxValue(nextIdx-1)) > 0: we've got a sorting problem with overlapping pages\n\t\t\t//\n\t\t\t// bounds check not needed for nextIdx-1 because nextIdx is guaranteed to be at least curIdx + 1\n\t\t\t// line 82 & 85 above\n\t\t\tif cmp(value, index.MaxValue(nextIdx-1)) == 0 {\n\t\t\t\ttopIdx = nextIdx\n\t\t\t} else {\n\t\t\t\treturn nextIdx\n\t\t\t}\n\t\t}\n\t}\n\n\t// last page check, if it wasn't explicitly found above\n\tif curIdx < n {\n\n\t\t// check pages[curIdx] for value\n\t\tmin := index.MinValue(curIdx)\n\t\tmax := index.MaxValue(curIdx)\n\n\t\t// if value is not in pages[curIdx], then it's not in this columnChunk\n\t\tif cmp(value, min) < 0 || cmp(value, max) > 0 {\n\t\t\tcurIdx = n\n\t\t}\n\t}\n\n\treturn curIdx\n}\n\nfunc linearSearch(index ColumnIndex, value Value, cmp func(Value, Value) int) int {\n\tn := index.NumPages()\n\n\tfor i := 0; i < n; i++ {\n\t\tmin := index.MinValue(i)\n\t\tmax := index.MaxValue(i)\n\n\t\tif cmp(min, value) <= 0 && cmp(value, max) <= 0 {\n\t\t\treturn i\n\t\t}\n\t}\n\n\treturn n\n}\n"
  },
  {
    "path": "search_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestSearchBinary(t *testing.T) {\n\ttestSearch(t, [][]int32{\n\t\t{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},\n\t\t{10, 10, 10, 10},\n\t\t{21, 22, 24, 25, 30},\n\t\t{30, 30},\n\t\t{30, 31},\n\t\t{32},\n\t\t{42, 43, 44, 45, 46, 47, 48, 49},\n\t}, [][]int{\n\t\t{10, 1},\n\t\t{0, 0},\n\t\t{9, 0},\n\t\t// non-existant, but would be in this page\n\t\t{23, 2},\n\t\t// ensure we find the first page\n\t\t{30, 2},\n\t\t{31, 4},\n\t\t// out of bounds\n\t\t{99, 7},\n\t\t// out of bounds\n\t\t{-1, 7},\n\t})\n}\n\nfunc TestSearchLinear(t *testing.T) {\n\ttestSearch(t, [][]int32{\n\t\t{10, 10, 10, 10},\n\t\t{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},\n\t\t{21, 22, 23, 24, 25},\n\t\t{19, 18, 17, 16, 14, 13, 12, 11},\n\t\t{42, 43, 44, 45, 46, 47, 48, 49},\n\t}, [][]int{\n\t\t{10, 0},\n\t\t{0, 1},\n\t\t{9, 1},\n\t\t{48, 4},\n\t\t// non-existant, but could be in this page\n\t\t{15, 3},\n\t\t// out of bounds\n\t\t{99, 5},\n\t\t// out of bounds\n\t\t{-1, 5},\n\t})\n}\n\nfunc testSearch(t *testing.T, pages [][]int32, expectIndex [][]int) {\n\tindexer := parquet.Int32Type.NewColumnIndexer(0)\n\n\tfor _, values := range pages {\n\t\tmin := values[0]\n\t\tmax := values[0]\n\n\t\tfor _, v := range values[1:] {\n\t\t\tswitch {\n\t\t\tcase v < min:\n\t\t\t\tmin = v\n\t\t\tcase v > max:\n\t\t\t\tmax = v\n\t\t\t}\n\t\t}\n\n\t\tindexer.IndexPage(int64(len(values)), 0,\n\t\t\tparquet.ValueOf(min),\n\t\t\tparquet.ValueOf(max),\n\t\t)\n\t}\n\n\tformatIndex := indexer.ColumnIndex()\n\tcolumnIndex := parquet.NewColumnIndex(parquet.Int32, &formatIndex)\n\n\tfor _, values := range expectIndex {\n\t\tv := parquet.ValueOf(values[0])\n\t\tj := parquet.Search(columnIndex, v, parquet.Int32Type)\n\n\t\tif values[1] != j {\n\t\t\tt.Errorf(\"searching for value %v: got=%d want=%d\", v, j, values[1])\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "sorting.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"io\"\n\t\"sort\"\n)\n\n// SortingWriter is a type similar to GenericWriter but it ensures that rows\n// are sorted according to the sorting columns configured on the writer.\n//\n// The writer accumulates rows in an in-memory buffer which is sorted when it\n// reaches the target number of rows, then written to a temporary row group.\n// When the writer is flushed or closed, the temporary row groups are merged\n// into a row group in the output file, ensuring that rows remain sorted in the\n// final row group.\n//\n// Because row groups get encoded and compressed, they hold a lot less memory\n// than if all rows were retained in memory. Sorting then merging rows chunks\n// also tends to be a lot more efficient than sorting all rows in memory as it\n// results in better CPU cache utilization since sorting multi-megabyte arrays\n// causes a lot of cache misses since the data set cannot be held in CPU caches.\ntype SortingWriter[T any] struct {\n\trowbuf  *RowBuffer[T]\n\twriter  *GenericWriter[T]\n\toutput  *GenericWriter[T]\n\tbuffer  io.ReadWriteSeeker\n\tmaxRows int64\n\tnumRows int64\n\tsorting SortingConfig\n\tdedupe  dedupe\n}\n\n// NewSortingWriter constructs a new sorting writer which writes a parquet file\n// where rows of each row group are ordered according to the sorting columns\n// configured on the writer.\n//\n// The sortRowCount argument defines the target number of rows that will be\n// sorted in memory before being written to temporary row groups. The greater\n// this value the more memory is needed to buffer rows in memory. 
Choosing a\n// value that is too small limits the maximum number of rows that can exist in\n// the output file since the writer cannot create more than 32K temporary row\n// groups to hold the sorted row chunks.\nfunc NewSortingWriter[T any](output io.Writer, sortRowCount int64, options ...WriterOption) *SortingWriter[T] {\n\tconfig, err := NewWriterConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\treturn &SortingWriter[T]{\n\t\trowbuf: NewRowBuffer[T](&RowGroupConfig{\n\t\t\tSchema:  config.Schema,\n\t\t\tSorting: config.Sorting,\n\t\t}),\n\t\twriter: NewGenericWriter[T](io.Discard, &WriterConfig{\n\t\t\tCreatedBy:            config.CreatedBy,\n\t\t\tColumnPageBuffers:    config.ColumnPageBuffers,\n\t\t\tColumnIndexSizeLimit: config.ColumnIndexSizeLimit,\n\t\t\tPageBufferSize:       config.PageBufferSize,\n\t\t\tWriteBufferSize:      config.WriteBufferSize,\n\t\t\tDataPageVersion:      config.DataPageVersion,\n\t\t\tSchema:               config.Schema,\n\t\t\tCompression:          config.Compression,\n\t\t\tSorting:              config.Sorting,\n\t\t}),\n\t\toutput:  NewGenericWriter[T](output, config),\n\t\tmaxRows: sortRowCount,\n\t\tsorting: config.Sorting,\n\t}\n}\n\nfunc (w *SortingWriter[T]) Close() error {\n\tif err := w.Flush(); err != nil {\n\t\treturn err\n\t}\n\treturn w.output.Close()\n}\n\nfunc (w *SortingWriter[T]) Flush() error {\n\tdefer w.resetSortingBuffer()\n\n\tif err := w.sortAndWriteBufferedRows(); err != nil {\n\t\treturn err\n\t}\n\n\tif w.numRows == 0 {\n\t\treturn nil\n\t}\n\n\tif err := w.writer.Close(); err != nil {\n\t\treturn err\n\t}\n\n\tsize, err := w.buffer.Seek(0, io.SeekCurrent)\n\tif err != nil {\n\t\treturn err\n\t}\n\n\tf, err := OpenFile(newReaderAt(w.buffer), size,\n\t\t&FileConfig{\n\t\t\tSkipPageIndex:    true,\n\t\t\tSkipBloomFilters: true,\n\t\t\tReadBufferSize:   defaultReadBufferSize,\n\t\t},\n\t)\n\tif err != nil {\n\t\treturn err\n\t}\n\n\tm, err := 
MergeRowGroups(f.RowGroups(),\n\t\t&RowGroupConfig{\n\t\t\tSchema:  w.Schema(),\n\t\t\tSorting: w.sorting,\n\t\t},\n\t)\n\tif err != nil {\n\t\treturn err\n\t}\n\n\trows := m.Rows()\n\tdefer rows.Close()\n\n\treader := RowReader(rows)\n\tif w.sorting.DropDuplicatedRows {\n\t\treader = DedupeRowReader(rows, w.rowbuf.compare)\n\t}\n\n\tif _, err := CopyRows(w.output, reader); err != nil {\n\t\treturn err\n\t}\n\n\treturn w.output.Flush()\n}\n\nfunc (w *SortingWriter[T]) Reset(output io.Writer) {\n\tw.output.Reset(output)\n\tw.rowbuf.Reset()\n\tw.resetSortingBuffer()\n}\n\nfunc (w *SortingWriter[T]) resetSortingBuffer() {\n\tw.writer.Reset(io.Discard)\n\tw.numRows = 0\n\n\tif w.buffer != nil {\n\t\tw.sorting.SortingBuffers.PutBuffer(w.buffer)\n\t\tw.buffer = nil\n\t}\n}\n\nfunc (w *SortingWriter[T]) Write(rows []T) (int, error) {\n\treturn w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.Write(rows[i:j]) })\n}\n\nfunc (w *SortingWriter[T]) WriteRows(rows []Row) (int, error) {\n\treturn w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.WriteRows(rows[i:j]) })\n}\n\nfunc (w *SortingWriter[T]) writeRows(numRows int, writeRows func(i, j int) (int, error)) (int, error) {\n\twn := 0\n\n\tfor wn < numRows {\n\t\tif w.rowbuf.NumRows() >= w.maxRows {\n\t\t\tif err := w.sortAndWriteBufferedRows(); err != nil {\n\t\t\t\treturn wn, err\n\t\t\t}\n\t\t}\n\n\t\tn := int(w.maxRows - w.rowbuf.NumRows())\n\t\tn += wn\n\t\tif n > numRows {\n\t\t\tn = numRows\n\t\t}\n\n\t\tn, err := writeRows(wn, n)\n\t\twn += n\n\n\t\tif err != nil {\n\t\t\treturn wn, err\n\t\t}\n\t}\n\n\treturn wn, nil\n}\n\nfunc (w *SortingWriter[T]) SetKeyValueMetadata(key, value string) {\n\tw.output.SetKeyValueMetadata(key, value)\n}\n\nfunc (w *SortingWriter[T]) Schema() *Schema {\n\treturn w.output.Schema()\n}\n\nfunc (w *SortingWriter[T]) sortAndWriteBufferedRows() error {\n\tif w.rowbuf.Len() == 0 {\n\t\treturn nil\n\t}\n\n\tdefer 
w.rowbuf.Reset()\n\tsort.Sort(w.rowbuf)\n\n\tif w.sorting.DropDuplicatedRows {\n\t\tw.rowbuf.rows = w.rowbuf.rows[:w.dedupe.deduplicate(w.rowbuf.rows, w.rowbuf.compare)]\n\t\tdefer w.dedupe.reset()\n\t}\n\n\trows := w.rowbuf.Rows()\n\tdefer rows.Close()\n\n\tif w.buffer == nil {\n\t\tw.buffer = w.sorting.SortingBuffers.GetBuffer()\n\t\tw.writer.Reset(w.buffer)\n\t}\n\n\tn, err := CopyRows(w.writer, rows)\n\tif err != nil {\n\t\treturn err\n\t}\n\n\tif err := w.writer.Flush(); err != nil {\n\t\treturn err\n\t}\n\n\tw.numRows += n\n\treturn nil\n}\n"
  },
  {
    "path": "sorting_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"bytes\"\n\t\"math/rand\"\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestSortingWriter(t *testing.T) {\n\ttype Row struct {\n\t\tValue int32 `parquet:\"value\"`\n\t}\n\n\trows := make([]Row, 1000)\n\tfor i := range rows {\n\t\trows[i].Value = int32(i)\n\t}\n\n\tprng := rand.New(rand.NewSource(0))\n\tprng.Shuffle(len(rows), func(i, j int) {\n\t\trows[i], rows[j] = rows[j], rows[i]\n\t})\n\n\tbuffer := bytes.NewBuffer(nil)\n\twriter := parquet.NewSortingWriter[Row](buffer, 99,\n\t\tparquet.SortingWriterConfig(\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(\"value\"),\n\t\t\t),\n\t\t),\n\t)\n\n\t_, err := writer.Write(rows)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tread, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tsort.Slice(rows, func(i, j int) bool {\n\t\treturn rows[i].Value < rows[j].Value\n\t})\n\n\tassertRowsEqual(t, rows, read)\n}\n\nfunc TestSortingWriterDropDuplicatedRows(t *testing.T) {\n\ttype Row struct {\n\t\tValue int32 `parquet:\"value\"`\n\t}\n\n\trows := make([]Row, 1000)\n\tfor i := range rows {\n\t\trows[i].Value = int32(i / 2)\n\t}\n\n\tprng := rand.New(rand.NewSource(0))\n\tprng.Shuffle(len(rows), func(i, j int) {\n\t\trows[i], rows[j] = rows[j], rows[i]\n\t})\n\n\tbuffer := bytes.NewBuffer(nil)\n\twriter := parquet.NewSortingWriter[Row](buffer, 99,\n\t\tparquet.SortingWriterConfig(\n\t\t\tparquet.SortingBuffers(\n\t\t\t\tparquet.NewFileBufferPool(\"\", \"buffers.*\"),\n\t\t\t),\n\t\t\tparquet.SortingColumns(\n\t\t\t\tparquet.Ascending(\"value\"),\n\t\t\t),\n\t\t\tparquet.DropDuplicatedRows(true),\n\t\t),\n\t)\n\n\t_, err := writer.Write(rows)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tread, err := 
parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tsort.Slice(rows, func(i, j int) bool {\n\t\treturn rows[i].Value < rows[j].Value\n\t})\n\n\tn := len(rows) / 2\n\tfor i := range rows[:n] {\n\t\trows[i] = rows[2*i]\n\t}\n\n\tassertRowsEqual(t, rows[:n], read)\n}\n"
  },
  {
    "path": "sparse/array.go",
    "content": "package sparse\n\nimport (\n\t\"time\"\n\t\"unsafe\"\n)\n\ntype Array struct{ array }\n\nfunc UnsafeArray(base unsafe.Pointer, length int, offset uintptr) Array {\n\treturn Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Array) Len() int                   { return int(a.len) }\nfunc (a Array) Index(i int) unsafe.Pointer { return a.index(i) }\nfunc (a Array) Slice(i, j int) Array       { return Array{a.slice(i, j)} }\nfunc (a Array) Offset(off uintptr) Array   { return Array{a.offset(off)} }\nfunc (a Array) BoolArray() BoolArray       { return BoolArray{a.array} }\nfunc (a Array) Int8Array() Int8Array       { return Int8Array{a.array} }\nfunc (a Array) Int16Array() Int16Array     { return Int16Array{a.array} }\nfunc (a Array) Int32Array() Int32Array     { return Int32Array{a.array} }\nfunc (a Array) Int64Array() Int64Array     { return Int64Array{a.array} }\nfunc (a Array) Float32Array() Float32Array { return Float32Array{a.array} }\nfunc (a Array) Float64Array() Float64Array { return Float64Array{a.array} }\nfunc (a Array) Uint8Array() Uint8Array     { return Uint8Array{a.array} }\nfunc (a Array) Uint16Array() Uint16Array   { return Uint16Array{a.array} }\nfunc (a Array) Uint32Array() Uint32Array   { return Uint32Array{a.array} }\nfunc (a Array) Uint64Array() Uint64Array   { return Uint64Array{a.array} }\nfunc (a Array) Uint128Array() Uint128Array { return Uint128Array{a.array} }\nfunc (a Array) StringArray() StringArray   { return StringArray{a.array} }\nfunc (a Array) TimeArray() TimeArray       { return TimeArray{a.array} }\n\ntype array struct {\n\tptr unsafe.Pointer\n\tlen uintptr\n\toff uintptr\n}\n\nfunc makeArray(base unsafe.Pointer, length, offset uintptr) array {\n\treturn array{ptr: base, len: length, off: offset}\n}\n\nfunc (a array) index(i int) unsafe.Pointer {\n\tif uintptr(i) >= a.len {\n\t\tpanic(\"index out of bounds\")\n\t}\n\treturn unsafe.Add(a.ptr, a.off*uintptr(i))\n}\n\nfunc (a array) slice(i, j int) array 
{\n\tif uintptr(i) > a.len || uintptr(j) > a.len || i > j {\n\t\tpanic(\"slice index out of bounds\")\n\t}\n\treturn array{\n\t\tptr: unsafe.Add(a.ptr, a.off*uintptr(i)),\n\t\tlen: uintptr(j - i),\n\t\toff: a.off,\n\t}\n}\n\nfunc (a array) offset(off uintptr) array {\n\tif a.ptr == nil {\n\t\tpanic(\"offset of nil array\")\n\t}\n\treturn array{\n\t\tptr: unsafe.Add(a.ptr, off),\n\t\tlen: a.len,\n\t\toff: a.off,\n\t}\n}\n\ntype BoolArray struct{ array }\n\nfunc MakeBoolArray(values []bool) BoolArray {\n\treturn BoolArray{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 1)}\n}\n\nfunc UnsafeBoolArray(base unsafe.Pointer, length int, offset uintptr) BoolArray {\n\treturn BoolArray{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a BoolArray) Len() int                 { return int(a.len) }\nfunc (a BoolArray) Index(i int) bool         { return *(*byte)(a.index(i)) != 0 }\nfunc (a BoolArray) Slice(i, j int) BoolArray { return BoolArray{a.slice(i, j)} }\nfunc (a BoolArray) Uint8Array() Uint8Array   { return Uint8Array{a.array} }\nfunc (a BoolArray) UnsafeArray() Array       { return Array{a.array} }\n\ntype Int8Array struct{ array }\n\nfunc MakeInt8Array(values []int8) Int8Array {\n\treturn Int8Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeInt8Array(base unsafe.Pointer, length int, offset uintptr) Int8Array {\n\treturn Int8Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Int8Array) Len() int                 { return int(a.len) }\nfunc (a Int8Array) Index(i int) int8         { return *(*int8)(a.index(i)) }\nfunc (a Int8Array) Slice(i, j int) Int8Array { return Int8Array{a.slice(i, j)} }\nfunc (a Int8Array) Uint8Array() Uint8Array   { return Uint8Array{a.array} }\nfunc (a Int8Array) UnsafeArray() Array       { return Array{a.array} }\n\ntype Int16Array struct{ array }\n\nfunc MakeInt16Array(values []int16) Int16Array {\n\treturn 
Int16Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeInt16Array(base unsafe.Pointer, length int, offset uintptr) Int16Array {\n\treturn Int16Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Int16Array) Len() int                  { return int(a.len) }\nfunc (a Int16Array) Index(i int) int16         { return *(*int16)(a.index(i)) }\nfunc (a Int16Array) Slice(i, j int) Int16Array { return Int16Array{a.slice(i, j)} }\nfunc (a Int16Array) Int8Array() Int8Array      { return Int8Array{a.array} }\nfunc (a Int16Array) Uint8Array() Uint8Array    { return Uint8Array{a.array} }\nfunc (a Int16Array) Uint16Array() Uint16Array  { return Uint16Array{a.array} }\nfunc (a Int16Array) UnsafeArray() Array        { return Array{a.array} }\n\ntype Int32Array struct{ array }\n\nfunc MakeInt32Array(values []int32) Int32Array {\n\treturn Int32Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 4)}\n}\n\nfunc UnsafeInt32Array(base unsafe.Pointer, length int, offset uintptr) Int32Array {\n\treturn Int32Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Int32Array) Len() int                  { return int(a.len) }\nfunc (a Int32Array) Index(i int) int32         { return *(*int32)(a.index(i)) }\nfunc (a Int32Array) Slice(i, j int) Int32Array { return Int32Array{a.slice(i, j)} }\nfunc (a Int32Array) Int8Array() Int8Array      { return Int8Array{a.array} }\nfunc (a Int32Array) Int16Array() Int16Array    { return Int16Array{a.array} }\nfunc (a Int32Array) Uint8Array() Uint8Array    { return Uint8Array{a.array} }\nfunc (a Int32Array) Uint16Array() Uint16Array  { return Uint16Array{a.array} }\nfunc (a Int32Array) Uint32Array() Uint32Array  { return Uint32Array{a.array} }\nfunc (a Int32Array) UnsafeArray() Array        { return Array{a.array} }\n\ntype Int64Array struct{ array }\n\nfunc MakeInt64Array(values []int64) Int64Array {\n\treturn 
Int64Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeInt64Array(base unsafe.Pointer, length int, offset uintptr) Int64Array {\n\treturn Int64Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Int64Array) Len() int                  { return int(a.len) }\nfunc (a Int64Array) Index(i int) int64         { return *(*int64)(a.index(i)) }\nfunc (a Int64Array) Slice(i, j int) Int64Array { return Int64Array{a.slice(i, j)} }\nfunc (a Int64Array) Int8Array() Int8Array      { return Int8Array{a.array} }\nfunc (a Int64Array) Int16Array() Int16Array    { return Int16Array{a.array} }\nfunc (a Int64Array) Int32Array() Int32Array    { return Int32Array{a.array} }\nfunc (a Int64Array) Uint8Array() Uint8Array    { return Uint8Array{a.array} }\nfunc (a Int64Array) Uint16Array() Uint16Array  { return Uint16Array{a.array} }\nfunc (a Int64Array) Uint32Array() Uint32Array  { return Uint32Array{a.array} }\nfunc (a Int64Array) Uint64Array() Uint64Array  { return Uint64Array{a.array} }\nfunc (a Int64Array) UnsafeArray() Array        { return Array{a.array} }\n\ntype Float32Array struct{ array }\n\nfunc MakeFloat32Array(values []float32) Float32Array {\n\treturn Float32Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 4)}\n}\n\nfunc UnsafeFloat32Array(base unsafe.Pointer, length int, offset uintptr) Float32Array {\n\treturn Float32Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Float32Array) Len() int                    { return int(a.len) }\nfunc (a Float32Array) Index(i int) float32         { return *(*float32)(a.index(i)) }\nfunc (a Float32Array) Slice(i, j int) Float32Array { return Float32Array{a.slice(i, j)} }\nfunc (a Float32Array) Array() Array                { return Array{a.array} }\nfunc (a Float32Array) Uint32Array() Uint32Array    { return Uint32Array{a.array} }\nfunc (a Float32Array) UnsafeArray() Array          { return Array{a.array} }\n\ntype Float64Array struct{ 
array }\n\nfunc MakeFloat64Array(values []float64) Float64Array {\n\treturn Float64Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeFloat64Array(base unsafe.Pointer, length int, offset uintptr) Float64Array {\n\treturn Float64Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Float64Array) Len() int                    { return int(a.len) }\nfunc (a Float64Array) Index(i int) float64         { return *(*float64)(a.index(i)) }\nfunc (a Float64Array) Slice(i, j int) Float64Array { return Float64Array{a.slice(i, j)} }\nfunc (a Float64Array) Uint64Array() Uint64Array    { return Uint64Array{a.array} }\nfunc (a Float64Array) UnsafeArray() Array          { return Array{a.array} }\n\ntype Uint8Array struct{ array }\n\nfunc MakeUint8Array(values []uint8) Uint8Array {\n\treturn Uint8Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeUint8Array(base unsafe.Pointer, length int, offset uintptr) Uint8Array {\n\treturn Uint8Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Uint8Array) Len() int                  { return int(a.len) }\nfunc (a Uint8Array) Index(i int) uint8         { return *(*uint8)(a.index(i)) }\nfunc (a Uint8Array) Slice(i, j int) Uint8Array { return Uint8Array{a.slice(i, j)} }\nfunc (a Uint8Array) UnsafeArray() Array        { return Array{a.array} }\n\ntype Uint16Array struct{ array }\n\nfunc MakeUint16Array(values []uint16) Uint16Array {\n\treturn Uint16Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeUint16Array(base unsafe.Pointer, length int, offset uintptr) Uint16Array {\n\treturn Uint16Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Uint16Array) Len() int                   { return int(a.len) }\nfunc (a Uint16Array) Index(i int) uint16         { return *(*uint16)(a.index(i)) }\nfunc (a Uint16Array) Slice(i, j int) Uint16Array { return Uint16Array{a.slice(i, 
j)} }\nfunc (a Uint16Array) Uint8Array() Uint8Array     { return Uint8Array{a.array} }\nfunc (a Uint16Array) UnsafeArray() Array         { return Array{a.array} }\n\ntype Uint32Array struct{ array }\n\nfunc MakeUint32Array(values []uint32) Uint32Array {\n\treturn Uint32Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 4)}\n}\n\nfunc UnsafeUint32Array(base unsafe.Pointer, length int, offset uintptr) Uint32Array {\n\treturn Uint32Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Uint32Array) Len() int                   { return int(a.len) }\nfunc (a Uint32Array) Index(i int) uint32         { return *(*uint32)(a.index(i)) }\nfunc (a Uint32Array) Slice(i, j int) Uint32Array { return Uint32Array{a.slice(i, j)} }\nfunc (a Uint32Array) Uint8Array() Uint8Array     { return Uint8Array{a.array} }\nfunc (a Uint32Array) Uint16Array() Uint16Array   { return Uint16Array{a.array} }\nfunc (a Uint32Array) UnsafeArray() Array         { return Array{a.array} }\n\ntype Uint64Array struct{ array }\n\nfunc MakeUint64Array(values []uint64) Uint64Array {\n\treturn Uint64Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 8)}\n}\n\nfunc UnsafeUint64Array(base unsafe.Pointer, length int, offset uintptr) Uint64Array {\n\treturn Uint64Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Uint64Array) Len() int                   { return int(a.len) }\nfunc (a Uint64Array) Index(i int) uint64         { return *(*uint64)(a.index(i)) }\nfunc (a Uint64Array) Slice(i, j int) Uint64Array { return Uint64Array{a.slice(i, j)} }\nfunc (a Uint64Array) Uint8Array() Uint8Array     { return Uint8Array{a.array} }\nfunc (a Uint64Array) Uint16Array() Uint16Array   { return Uint16Array{a.array} }\nfunc (a Uint64Array) Uint32Array() Uint32Array   { return Uint32Array{a.array} }\nfunc (a Uint64Array) UnsafeArray() Array         { return Array{a.array} }\n\ntype Uint128Array struct{ array }\n\nfunc MakeUint128Array(values 
[][16]byte) Uint128Array {\n\treturn Uint128Array{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), 16)}\n}\n\nfunc UnsafeUint128Array(base unsafe.Pointer, length int, offset uintptr) Uint128Array {\n\treturn Uint128Array{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a Uint128Array) Len() int                    { return int(a.len) }\nfunc (a Uint128Array) Index(i int) [16]byte        { return *(*[16]byte)(a.index(i)) }\nfunc (a Uint128Array) Slice(i, j int) Uint128Array { return Uint128Array{a.slice(i, j)} }\nfunc (a Uint128Array) Uint8Array() Uint8Array      { return Uint8Array{a.array} }\nfunc (a Uint128Array) Uint16Array() Uint16Array    { return Uint16Array{a.array} }\nfunc (a Uint128Array) Uint32Array() Uint32Array    { return Uint32Array{a.array} }\nfunc (a Uint128Array) Uint64Array() Uint64Array    { return Uint64Array{a.array} }\nfunc (a Uint128Array) UnsafeArray() Array          { return Array{a.array} }\n\ntype StringArray struct{ array }\n\nfunc MakeStringArray(values []string) StringArray {\n\tconst sizeOfString = unsafe.Sizeof(\"\")\n\treturn StringArray{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), sizeOfString)}\n}\n\nfunc UnsafeStringArray(base unsafe.Pointer, length int, offset uintptr) StringArray {\n\treturn StringArray{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a StringArray) Len() int                   { return int(a.len) }\nfunc (a StringArray) Index(i int) string         { return *(*string)(a.index(i)) }\nfunc (a StringArray) Slice(i, j int) StringArray { return StringArray{a.slice(i, j)} }\nfunc (a StringArray) UnsafeArray() Array         { return Array{a.array} }\n\ntype TimeArray struct{ array }\n\nfunc MakeTimeArray(values []time.Time) TimeArray {\n\tconst sizeOfTime = unsafe.Sizeof(time.Time{})\n\treturn TimeArray{makeArray(*(*unsafe.Pointer)(unsafe.Pointer(&values)), uintptr(len(values)), sizeOfTime)}\n}\n\nfunc UnsafeTimeArray(base unsafe.Pointer, length int, 
offset uintptr) TimeArray {\n\treturn TimeArray{makeArray(base, uintptr(length), offset)}\n}\n\nfunc (a TimeArray) Len() int                 { return int(a.len) }\nfunc (a TimeArray) Index(i int) time.Time    { return *(*time.Time)(a.index(i)) }\nfunc (a TimeArray) Slice(i, j int) TimeArray { return TimeArray{a.slice(i, j)} }\nfunc (a TimeArray) UnsafeArray() Array       { return Array{a.array} }\n"
  },
  {
    "path": "sparse/gather.go",
    "content": "package sparse\n\nimport \"unsafe\"\n\nfunc GatherInt32(dst []int32, src Int32Array) int {\n\treturn GatherUint32(*(*[]uint32)(unsafe.Pointer(&dst)), src.Uint32Array())\n}\n\nfunc GatherInt64(dst []int64, src Int64Array) int {\n\treturn GatherUint64(*(*[]uint64)(unsafe.Pointer(&dst)), src.Uint64Array())\n}\n\nfunc GatherFloat32(dst []float32, src Float32Array) int {\n\treturn GatherUint32(*(*[]uint32)(unsafe.Pointer(&dst)), src.Uint32Array())\n}\n\nfunc GatherFloat64(dst []float64, src Float64Array) int {\n\treturn GatherUint64(*(*[]uint64)(unsafe.Pointer(&dst)), src.Uint64Array())\n}\n\nfunc GatherBits(dst []byte, src Uint8Array) int { return gatherBits(dst, src) }\n\nfunc GatherUint32(dst []uint32, src Uint32Array) int { return gather32(dst, src) }\n\nfunc GatherUint64(dst []uint64, src Uint64Array) int { return gather64(dst, src) }\n\nfunc GatherUint128(dst [][16]byte, src Uint128Array) int { return gather128(dst, src) }\n\nfunc GatherString(dst []string, src StringArray) int {\n\tn := min(len(dst), src.Len())\n\n\tfor i := range dst[:n] {\n\t\tdst[i] = src.Index(i)\n\t}\n\n\treturn n\n}\n\nfunc min(a, b int) int {\n\tif a < b {\n\t\treturn a\n\t}\n\treturn b\n}\n"
  },
  {
    "path": "sparse/gather_amd64.go",
    "content": "//go:build !purego\n\npackage sparse\n\nimport (\n\t\"golang.org/x/sys/cpu\"\n)\n\nfunc gatherBits(dst []byte, src Uint8Array) int {\n\tn := min(len(dst)*8, src.Len())\n\ti := 0\n\n\tif n >= 8 {\n\t\ti = (n / 8) * 8\n\t\t// Make sure `offset` is at least 4 bytes, otherwise VPGATHERDD may read\n\t\t// data beyond the end of the program memory and trigger a fault.\n\t\t//\n\t\t// If the boolean values do not have enough padding we must fallback to\n\t\t// the scalar algorithm to be able to load single bytes from memory.\n\t\tif src.off >= 4 && cpu.X86.HasAVX2 {\n\t\t\tgatherBitsAVX2(dst, src.Slice(0, i))\n\t\t} else {\n\t\t\tgatherBitsDefault(dst, src.Slice(0, i))\n\t\t}\n\t}\n\n\tfor i < n {\n\t\tx := i / 8\n\t\ty := i % 8\n\t\tb := src.Index(i)\n\t\tdst[x] = ((b & 1) << y) | (dst[x] & ^(1 << y))\n\t\ti++\n\t}\n\n\treturn n\n}\n\nfunc gather32(dst []uint32, src Uint32Array) int {\n\tn := min(len(dst), src.Len())\n\ti := 0\n\n\tif n >= 16 && cpu.X86.HasAVX2 {\n\t\ti = (n / 8) * 8\n\t\tgather32AVX2(dst[:i:i], src)\n\t}\n\n\tfor i < n {\n\t\tdst[i] = src.Index(i)\n\t\ti++\n\t}\n\n\treturn n\n}\n\nfunc gather64(dst []uint64, src Uint64Array) int {\n\tn := min(len(dst), src.Len())\n\ti := 0\n\n\tif n >= 8 && cpu.X86.HasAVX2 {\n\t\ti = (n / 4) * 4\n\t\tgather64AVX2(dst[:i:i], src)\n\t}\n\n\tfor i < n {\n\t\tdst[i] = src.Index(i)\n\t\ti++\n\t}\n\n\treturn n\n}\n\n//go:noescape\nfunc gatherBitsAVX2(dst []byte, src Uint8Array)\n\n//go:noescape\nfunc gatherBitsDefault(dst []byte, src Uint8Array)\n\n//go:noescape\nfunc gather32AVX2(dst []uint32, src Uint32Array)\n\n//go:noescape\nfunc gather64AVX2(dst []uint64, src Uint64Array)\n\n//go:noescape\nfunc gather128(dst [][16]byte, src Uint128Array) int\n"
  },
  {
    "path": "sparse/gather_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n// func gatherBitsAVX2(dst []byte, src Uint8Array)\nTEXT ·gatherBitsAVX2(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_array_ptr+24(FP), BX\n    MOVQ src_array_len+32(FP), CX\n    MOVQ src_array_off+40(FP), DX\n    XORQ SI, SI\n    SHRQ $3, CX\n\n    VPBROADCASTD src_array_off+40(FP), Y0\n    VPMULLD range0n7<>(SB), Y0, Y0\n    VPCMPEQD Y1, Y1, Y1\n    VPCMPEQD Y2, Y2, Y2\nloop:\n    VPGATHERDD Y1, (BX)(Y0*1), Y3\n    VMOVDQU Y2, Y1\n    VPSLLD $31, Y3, Y3\n    VMOVMSKPS Y3, DI\n\n    MOVB DI, (AX)(SI*1)\n\n    LEAQ (BX)(DX*8), BX\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\n    VZEROUPPER\n    RET\n\n// func gatherBitsDefault(dst []byte, src Uint8Array)\nTEXT ·gatherBitsDefault(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ src_array_ptr+24(FP), BX\n    MOVQ src_array_len+32(FP), CX\n    MOVQ src_array_off+40(FP), DX\n    XORQ SI, SI\n    SHRQ $3, CX\nloop:\n    LEAQ (BX)(DX*2), DI\n    MOVBQZX (BX), R8\n    MOVBQZX (BX)(DX*1), R9\n    MOVBQZX (DI), R10\n    MOVBQZX (DI)(DX*1), R11\n    LEAQ (BX)(DX*4), BX\n    LEAQ (DI)(DX*4), DI\n    MOVBQZX (BX), R12\n    MOVBQZX (BX)(DX*1), R13\n    MOVBQZX (DI), R14\n    MOVBQZX (DI)(DX*1), R15\n    LEAQ (BX)(DX*4), BX\n\n    ANDQ $1, R8\n    ANDQ $1, R9\n    ANDQ $1, R10\n    ANDQ $1, R11\n    ANDQ $1, R12\n    ANDQ $1, R13\n    ANDQ $1, R14\n    ANDQ $1, R15\n\n    SHLQ $1, R9\n    SHLQ $2, R10\n    SHLQ $3, R11\n    SHLQ $4, R12\n    SHLQ $5, R13\n    SHLQ $6, R14\n    SHLQ $7, R15\n\n    ORQ R9, R8\n    ORQ R11, R10\n    ORQ R13, R12\n    ORQ R15, R14\n    ORQ R10, R8\n    ORQ R12, R8\n    ORQ R14, R8\n\n    MOVB R8, (AX)(SI*1)\n\n    INCQ SI\n    CMPQ SI, CX\n    JNE loop\n    RET\n\n// func gather32AVX2(dst []uint32, src Uint32Array)\nTEXT ·gather32AVX2(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), CX\n    MOVQ src_array_ptr+24(FP), BX\n    MOVQ src_array_off+40(FP), DX\n    XORQ SI, SI\n\n    
VPBROADCASTD src_array_off+40(FP), Y0\n    VPMULLD range0n7<>(SB), Y0, Y0\n    VPCMPEQD Y1, Y1, Y1\n    VPCMPEQD Y2, Y2, Y2\nloop:\n    VPGATHERDD Y1, (BX)(Y0*1), Y3\n    VMOVDQU Y3, (AX)(SI*4)\n    VMOVDQU Y2, Y1\n\n    LEAQ (BX)(DX*8), BX\n    ADDQ $8, SI\n    CMPQ SI, CX\n    JNE loop\n    VZEROUPPER\n    RET\n\n// func gather64AVX2(dst []uint64, src Uint64Array)\nTEXT ·gather64AVX2(SB), NOSPLIT, $0-48\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), CX\n    MOVQ src_array_ptr+24(FP), BX\n    MOVQ src_array_off+40(FP), DX\n    XORQ SI, SI\n\n    VPBROADCASTQ src_array_off+40(FP), Y0\n    VPMULLD range0n3<>(SB), Y0, Y0\n    VPCMPEQQ Y1, Y1, Y1\n    VPCMPEQQ Y2, Y2, Y2\nloop:\n    VPGATHERQQ Y1, (BX)(Y0*1), Y3\n    VMOVDQU Y3, (AX)(SI*8)\n    VMOVDQU Y2, Y1\n\n    LEAQ (BX)(DX*4), BX\n    ADDQ $4, SI\n    CMPQ SI, CX\n    JNE loop\n    VZEROUPPER\n    RET\n\n// func gather128(dst [][16]byte, src Uint128Array) int\nTEXT ·gather128(SB), NOSPLIT, $0-56\n    MOVQ dst_base+0(FP), AX\n    MOVQ dst_len+8(FP), CX\n    MOVQ src_array_ptr+24(FP), BX\n    MOVQ src_array_len+32(FP), DI\n    MOVQ src_array_off+40(FP), DX\n    XORQ SI, SI\n\n    CMPQ DI, CX\n    CMOVQLT DI, CX\n\n    CMPQ CX, $0\n    JE done\n\n    CMPQ CX, $1\n    JE tail\n\n    XORQ SI, SI\n    MOVQ CX, DI\n    SHRQ $1, DI\n    SHLQ $1, DI\nloop:\n    MOVOU (BX), X0\n    MOVOU (BX)(DX*1), X1\n\n    MOVOU X0, (AX)\n    MOVOU X1, 16(AX)\n\n    LEAQ (BX)(DX*2), BX\n    ADDQ $32, AX\n    ADDQ $2, SI\n    CMPQ SI, DI\n    JNE loop\n\n    CMPQ SI, CX\n    JE done\ntail:\n    MOVOU (BX), X0\n    MOVOU X0, (AX)\ndone:\n    MOVQ CX, ret+48(FP)\n    RET\n\nGLOBL range0n3<>(SB), RODATA|NOPTR, $32\nDATA range0n3<>+0(SB)/8,  $0\nDATA range0n3<>+8(SB)/8,  $1\nDATA range0n3<>+16(SB)/8, $2\nDATA range0n3<>+24(SB)/8, $3\n\nGLOBL range0n7<>(SB), RODATA|NOPTR, $32\nDATA range0n7<>+0(SB)/4,  $0\nDATA range0n7<>+4(SB)/4,  $1\nDATA range0n7<>+8(SB)/4,  $2\nDATA range0n7<>+12(SB)/4, $3\nDATA range0n7<>+16(SB)/4, $4\nDATA 
range0n7<>+20(SB)/4, $5\nDATA range0n7<>+24(SB)/4, $6\nDATA range0n7<>+28(SB)/4, $7\n"
  },
  {
    "path": "sparse/gather_purego.go",
    "content": "//go:build purego || !amd64\n\npackage sparse\n\nfunc gatherBits(dst []byte, src Uint8Array) int {\n\tn := min(len(dst)*8, src.Len())\n\ti := 0\n\n\tif k := (n / 8) * 8; k > 0 {\n\t\tfor j := 0; i < k; j++ {\n\t\t\tb0 := src.Index(i + 0)\n\t\t\tb1 := src.Index(i + 1)\n\t\t\tb2 := src.Index(i + 2)\n\t\t\tb3 := src.Index(i + 3)\n\t\t\tb4 := src.Index(i + 4)\n\t\t\tb5 := src.Index(i + 5)\n\t\t\tb6 := src.Index(i + 6)\n\t\t\tb7 := src.Index(i + 7)\n\n\t\t\tdst[j] = (b0 & 1) |\n\t\t\t\t((b1 & 1) << 1) |\n\t\t\t\t((b2 & 1) << 2) |\n\t\t\t\t((b3 & 1) << 3) |\n\t\t\t\t((b4 & 1) << 4) |\n\t\t\t\t((b5 & 1) << 5) |\n\t\t\t\t((b6 & 1) << 6) |\n\t\t\t\t((b7 & 1) << 7)\n\n\t\t\ti += 8\n\t\t}\n\t}\n\n\tfor i < n {\n\t\tx := i / 8\n\t\ty := i % 8\n\t\tb := src.Index(i)\n\t\tdst[x] = ((b & 1) << y) | (dst[x] & ^(1 << y))\n\t\ti++\n\t}\n\n\treturn n\n}\n\nfunc gather32(dst []uint32, src Uint32Array) int {\n\tn := min(len(dst), src.Len())\n\n\tfor i := range dst[:n] {\n\t\tdst[i] = src.Index(i)\n\t}\n\n\treturn n\n}\n\nfunc gather64(dst []uint64, src Uint64Array) int {\n\tn := min(len(dst), src.Len())\n\n\tfor i := range dst[:n] {\n\t\tdst[i] = src.Index(i)\n\t}\n\n\treturn n\n}\n\nfunc gather128(dst [][16]byte, src Uint128Array) int {\n\tn := min(len(dst), src.Len())\n\n\tfor i := range dst[:n] {\n\t\tdst[i] = src.Index(i)\n\t}\n\n\treturn n\n}\n"
  },
  {
    "path": "sparse/gather_test.go",
    "content": "package sparse_test\n\nimport (\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"math\"\n\t\"strconv\"\n\t\"testing\"\n\t\"time\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/sparse\"\n)\n\nconst (\n\tbenchmarkGatherPerLoop = 1000\n)\n\nfunc ExampleGatherUint32() {\n\ttype point2D struct{ X, Y uint32 }\n\n\tbuf := make([]point2D, 10)\n\tdst := make([]uint32, 10)\n\tsrc := sparse.UnsafeUint32Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := range buf {\n\t\tbuf[i].X = math.MaxUint32\n\t\tbuf[i].Y = uint32(2 * i)\n\t}\n\n\tn := sparse.GatherUint32(dst, src)\n\n\tfor i, v := range dst[:n] {\n\t\tfmt.Printf(\"points[%d].Y = %d\\n\", i, v)\n\t}\n\n\t// Output:\n\t// points[0].Y = 0\n\t// points[1].Y = 2\n\t// points[2].Y = 4\n\t// points[3].Y = 6\n\t// points[4].Y = 8\n\t// points[5].Y = 10\n\t// points[6].Y = 12\n\t// points[7].Y = 14\n\t// points[8].Y = 16\n\t// points[9].Y = 18\n}\n\nfunc ExampleGatherUint64() {\n\ttype point2D struct{ X, Y uint64 }\n\n\tbuf := make([]point2D, 10)\n\tdst := make([]uint64, 10)\n\tsrc := sparse.UnsafeUint64Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := range buf {\n\t\tbuf[i].X = math.MaxUint64\n\t\tbuf[i].Y = uint64(2 * i)\n\t}\n\n\tn := sparse.GatherUint64(dst, src)\n\n\tfor i, v := range dst[:n] {\n\t\tfmt.Printf(\"points[%d].Y = %v\\n\", i, v)\n\t}\n\n\t// Output:\n\t// points[0].Y = 0\n\t// points[1].Y = 2\n\t// points[2].Y = 4\n\t// points[3].Y = 6\n\t// points[4].Y = 8\n\t// points[5].Y = 10\n\t// points[6].Y = 12\n\t// points[7].Y = 14\n\t// points[8].Y = 16\n\t// points[9].Y = 18\n}\n\nfunc ExampleGatherUint128() {\n\ttype point2D struct{ X, Y [16]byte }\n\n\tbuf := make([]point2D, 10)\n\tdst := make([][16]byte, 10)\n\tsrc := sparse.UnsafeUint128Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := range buf {\n\t\tx := uint64(math.MaxUint64)\n\t\ty := uint64(2 * i)\n\t\tbinary.LittleEndian.PutUint64(buf[i].X[:], 
x)\n\t\tbinary.LittleEndian.PutUint64(buf[i].Y[:], y)\n\t}\n\n\tn := sparse.GatherUint128(dst, src)\n\n\tfor i, v := range dst[:n] {\n\t\tfmt.Printf(\"points[%d].Y = %v\\n\", i, binary.LittleEndian.Uint64(v[:]))\n\t}\n\n\t// Output:\n\t// points[0].Y = 0\n\t// points[1].Y = 2\n\t// points[2].Y = 4\n\t// points[3].Y = 6\n\t// points[4].Y = 8\n\t// points[5].Y = 10\n\t// points[6].Y = 12\n\t// points[7].Y = 14\n\t// points[8].Y = 16\n\t// points[9].Y = 18\n}\n\nfunc ExampleGatherString() {\n\tbuf := make([][2]string, 10)\n\tdst := make([]string, 10)\n\tsrc := sparse.UnsafeStringArray(unsafe.Pointer(&buf[0][1]), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := range buf {\n\t\tbuf[i][0] = \"-\"\n\t\tbuf[i][1] = strconv.Itoa(i)\n\t}\n\n\tn := sparse.GatherString(dst, src)\n\n\tfor i, v := range dst[:n] {\n\t\tfmt.Printf(\"points[%d].Y = %v\\n\", i, v)\n\t}\n\n\t// Output:\n\t// points[0].Y = 0\n\t// points[1].Y = 1\n\t// points[2].Y = 2\n\t// points[3].Y = 3\n\t// points[4].Y = 4\n\t// points[5].Y = 5\n\t// points[6].Y = 6\n\t// points[7].Y = 7\n\t// points[8].Y = 8\n\t// points[9].Y = 9\n}\n\nfunc TestGatherUint32(t *testing.T) {\n\ttype point2D struct{ X, Y uint32 }\n\n\tconst N = 100\n\tbuf := make([]point2D, N+1)\n\tdst := make([]uint32, N)\n\tsrc := sparse.UnsafeUint32Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := range buf {\n\t\tbuf[i].X = math.MaxUint32\n\t\tbuf[i].Y = uint32(2 * i)\n\t}\n\n\tif n := sparse.GatherUint32(dst, src); n != N {\n\t\tt.Errorf(\"wrong number of values gathered: want=%d got=%d\", N, n)\n\t}\n\n\tfor i, v := range dst {\n\t\tif v != uint32(2*i) {\n\t\t\tt.Errorf(\"wrong value gathered at index %d: want=%d got=%d\", i, 2*i, v)\n\t\t}\n\t}\n}\n\nfunc TestGatherUint64(t *testing.T) {\n\ttype point2D struct{ X, Y uint64 }\n\n\tconst N = 100\n\tbuf := make([]point2D, N+1)\n\tdst := make([]uint64, N)\n\tsrc := sparse.UnsafeUint64Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := 
range buf {\n\t\tbuf[i].X = math.MaxUint64\n\t\tbuf[i].Y = uint64(2 * i)\n\t}\n\n\tif n := sparse.GatherUint64(dst, src); n != N {\n\t\tt.Errorf(\"wrong number of values gathered: want=%d got=%d\", N, n)\n\t}\n\n\tfor i, v := range dst {\n\t\tif v != uint64(2*i) {\n\t\t\tt.Errorf(\"wrong value gathered at index %d: want=%d got=%d\", i, 2*i, v)\n\t\t}\n\t}\n}\n\nfunc TestGatherUint128(t *testing.T) {\n\ttype point2D struct{ X, Y [16]byte }\n\n\tconst N = 100\n\tbuf := make([]point2D, N+1)\n\tdst := make([][16]byte, N)\n\tsrc := sparse.UnsafeUint128Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tfor i := range buf {\n\t\tx := uint64(math.MaxUint64)\n\t\ty := uint64(2 * i)\n\t\tbinary.LittleEndian.PutUint64(buf[i].X[:], x)\n\t\tbinary.LittleEndian.PutUint64(buf[i].Y[:], y)\n\t}\n\n\tif n := sparse.GatherUint128(dst, src); n != N {\n\t\tt.Errorf(\"wrong number of values gathered: want=%d got=%d\", N, n)\n\t}\n\n\tfor i, v := range dst {\n\t\tif y := binary.LittleEndian.Uint64(v[:]); y != uint64(2*i) {\n\t\t\tt.Errorf(\"wrong value gathered at index %d: want=%d got=%d\", i, 2*i, y)\n\t\t}\n\t}\n}\n\nfunc BenchmarkGather32(b *testing.B) {\n\ttype point2D struct{ X, Y uint32 }\n\n\tbuf := make([]point2D, benchmarkGatherPerLoop)\n\tdst := make([]uint32, benchmarkGatherPerLoop)\n\tsrc := sparse.UnsafeUint32Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tb.SetBytes(4 * benchmarkGatherPerLoop)\n\tbenchmarkThroughput(b, \"gather\", func() int {\n\t\treturn sparse.GatherUint32(dst, src)\n\t})\n}\n\nfunc BenchmarkGather64(b *testing.B) {\n\ttype point2D struct{ X, Y uint64 }\n\n\tbuf := make([]point2D, benchmarkGatherPerLoop)\n\tdst := make([]uint64, benchmarkGatherPerLoop)\n\tsrc := sparse.UnsafeUint64Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tb.SetBytes(8 * benchmarkGatherPerLoop)\n\tbenchmarkThroughput(b, \"gather\", func() int {\n\t\treturn sparse.GatherUint64(dst, src)\n\t})\n}\n\nfunc 
BenchmarkGather128(b *testing.B) {\n\ttype point2D struct{ X, Y [16]byte }\n\n\tbuf := make([]point2D, benchmarkGatherPerLoop)\n\tdst := make([][16]byte, benchmarkGatherPerLoop)\n\tsrc := sparse.UnsafeUint128Array(unsafe.Pointer(&buf[0].Y), len(buf), unsafe.Sizeof(buf[0]))\n\n\tb.SetBytes(16 * benchmarkGatherPerLoop)\n\tbenchmarkThroughput(b, \"gather\", func() int {\n\t\treturn sparse.GatherUint128(dst, src)\n\t})\n}\n\nfunc benchmarkThroughput(b *testing.B, m string, f func() int) {\n\tstart := time.Now()\n\tcount := 0\n\n\tfor i := 0; i < b.N; i++ {\n\t\tcount += f()\n\t}\n\n\tseconds := time.Since(start).Seconds()\n\tb.ReportMetric(float64(count)/seconds, m+\"/s\")\n}\n"
  },
  {
    "path": "sparse/sparse.go",
    "content": "// Package sparse contains abstractions to help work on arrays of values in\n// sparse memory locations.\n//\n// Conversion between array types is supported when converting integers to a\n// lower size (e.g. int32 to int16, or uint64 to uint8), or converting from\n// signed integers to unsigned. Float types can also be converted to unsigned\n// integers of the same size, in which case the conversion is similar to using\n// the standard library's math.Float32bits and math.Float64bits functions.\n//\n// All array types can be converted to a generic Array type that can be used to erase\n// type information and bypass type conversion rules. This conversion is similar\n// to using Go's unsafe package to bypass Go's type system and should usually be\n// avoided and a sign that the application is attempting to break type safety\n// boundaries.\n//\n// The package provides Gather* functions which retrieve values from sparse\n// arrays into contiguous memory buffers. On platforms that support it, these\n// operations are implemented using SIMD gather instructions (e.g. VPGATHER on\n// Intel CPUs).\npackage sparse\n"
  },
  {
    "path": "transform.go",
    "content": "package parquet\n\n// TransformRowReader constructs a RowReader which applies the given transform\n// to each row read from reader.\n//\n// The transformation function appends the transformed src row to dst, returning\n// dst and any error that occurred during the transformation. If dst is returned\n// unchanged, the row is skipped.\nfunc TransformRowReader(reader RowReader, transform func(dst, src Row) (Row, error)) RowReader {\n\treturn &transformRowReader{reader: reader, transform: transform}\n}\n\ntype transformRowReader struct {\n\treader    RowReader\n\ttransform func(Row, Row) (Row, error)\n\trows      []Row\n\toffset    int\n\tlength    int\n}\n\nfunc (t *transformRowReader) ReadRows(rows []Row) (n int, err error) {\n\tif len(t.rows) == 0 {\n\t\tt.rows = makeRows(len(rows))\n\t}\n\n\tfor {\n\t\tfor n < len(rows) && t.offset < t.length {\n\t\t\tdst := rows[n][:0]\n\t\t\tsrc := t.rows[t.offset]\n\t\t\trows[n], err = t.transform(dst, src)\n\t\t\tif err != nil {\n\t\t\t\treturn n, err\n\t\t\t}\n\t\t\tclearValues(src)\n\t\t\tt.rows[t.offset] = src[:0]\n\t\t\tt.offset++\n\t\t\tn++\n\t\t}\n\n\t\tif n == len(rows) {\n\t\t\treturn n, nil\n\t\t}\n\n\t\tr, err := t.reader.ReadRows(t.rows)\n\t\tif r == 0 && err != nil {\n\t\t\treturn n, err\n\t\t}\n\t\tt.offset = 0\n\t\tt.length = r\n\t}\n}\n\ntype transformRowBuffer struct {\n\tbuffer []Row\n\toffset int32\n\tlength int32\n}\n\nfunc (b *transformRowBuffer) init(n int) {\n\tb.buffer = makeRows(n)\n\tb.offset = 0\n\tb.length = 0\n}\n\nfunc (b *transformRowBuffer) discard() {\n\trow := b.buffer[b.offset]\n\tclearValues(row)\n\tb.buffer[b.offset] = row[:0]\n\n\tif b.offset++; b.offset == b.length {\n\t\tb.reset(0)\n\t}\n}\n\nfunc (b *transformRowBuffer) reset(n int) {\n\tb.offset = 0\n\tb.length = int32(n)\n}\n\nfunc (b *transformRowBuffer) rows() []Row {\n\treturn b.buffer[b.offset:b.length]\n}\n\nfunc (b *transformRowBuffer) cap() int {\n\treturn len(b.buffer)\n}\n\nfunc (b *transformRowBuffer) len() int 
{\n\treturn int(b.length - b.offset)\n}\n\n// TransformRowWriter constructs a RowWriter which applies the given transform\n// to each row written to writer.\n//\n// The transformation function appends the transformed src row to dst, returning\n// dst and any error that occurred during the transformation. If dst is returned\n// unchanged, the row is skipped.\nfunc TransformRowWriter(writer RowWriter, transform func(dst, src Row) (Row, error)) RowWriter {\n\treturn &transformRowWriter{writer: writer, transform: transform}\n}\n\ntype transformRowWriter struct {\n\twriter    RowWriter\n\ttransform func(Row, Row) (Row, error)\n\trows      []Row\n}\n\nfunc (t *transformRowWriter) WriteRows(rows []Row) (n int, err error) {\n\tif len(t.rows) == 0 {\n\t\tt.rows = makeRows(len(rows))\n\t}\n\n\tfor n < len(rows) {\n\t\tnumRows := len(rows) - n\n\t\tif numRows > len(t.rows) {\n\t\t\tnumRows = len(t.rows)\n\t\t}\n\t\tif err := t.writeRows(rows[n : n+numRows]); err != nil {\n\t\t\treturn n, err\n\t\t}\n\t\tn += numRows\n\t}\n\n\treturn n, nil\n}\n\nfunc (t *transformRowWriter) writeRows(rows []Row) (err error) {\n\tnumRows := 0\n\tdefer func() { clearRows(t.rows[:numRows]) }()\n\n\tfor _, row := range rows {\n\t\tt.rows[numRows], err = t.transform(t.rows[numRows][:0], row)\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif len(t.rows[numRows]) != 0 {\n\t\t\tnumRows++\n\t\t}\n\t}\n\n\t_, err = t.writer.WriteRows(t.rows[:numRows])\n\treturn err\n}\n"
  },
  {
    "path": "transform_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc TestTransformRowReader(t *testing.T) {\n\trows := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(2)},\n\t\t{parquet.Int64Value(3)},\n\t\t{parquet.Int64Value(4)},\n\t}\n\n\twant := []parquet.Row{\n\t\t{parquet.Int64Value(0), parquet.Int64Value(0).Level(0, 0, 1)},\n\t\t{parquet.Int64Value(1), parquet.Int64Value(2).Level(0, 0, 1)},\n\t\t{parquet.Int64Value(2), parquet.Int64Value(4).Level(0, 0, 1)},\n\t\t{parquet.Int64Value(3), parquet.Int64Value(6).Level(0, 0, 1)},\n\t\t{parquet.Int64Value(4), parquet.Int64Value(8).Level(0, 0, 1)},\n\t}\n\n\treader := parquet.TransformRowReader(&bufferedRows{rows: rows},\n\t\tfunc(dst, src parquet.Row) (parquet.Row, error) {\n\t\t\tdst = append(dst, src[0])\n\t\t\tdst = append(dst, parquet.Int64Value(2*src[0].Int64()).Level(0, 0, 1))\n\t\t\treturn dst, nil\n\t\t},\n\t)\n\n\twriter := &bufferedRows{}\n\t_, err := parquet.CopyRows(writer, reader)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertEqualRows(t, want, writer.rows)\n}\n\nfunc TestTransformRowWriter(t *testing.T) {\n\trows := []parquet.Row{\n\t\t{parquet.Int64Value(0)},\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(2)},\n\t\t{parquet.Int64Value(3)},\n\t\t{parquet.Int64Value(4)},\n\t}\n\n\twant := []parquet.Row{\n\t\t{parquet.Int64Value(1)},\n\t\t{parquet.Int64Value(3)},\n\t}\n\n\tbuffer := &bufferedRows{}\n\twriter := parquet.TransformRowWriter(buffer,\n\t\tfunc(dst, src parquet.Row) (parquet.Row, error) {\n\t\t\tif (src[0].Int64() % 2) != 0 {\n\t\t\t\tdst = append(dst, src[0])\n\t\t\t}\n\t\t\treturn dst, nil\n\t\t},\n\t)\n\n\treader := &bufferedRows{rows: rows}\n\t_, err := parquet.CopyRows(writer, reader)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tassertEqualRows(t, want, buffer.rows)\n}\n"
  },
  {
    "path": "type.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"math/bits\"\n\t\"reflect\"\n\t\"time\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\n// Kind is an enumeration type representing the physical types supported by the\n// parquet type system.\ntype Kind int8\n\nconst (\n\tBoolean           Kind = Kind(format.Boolean)\n\tInt32             Kind = Kind(format.Int32)\n\tInt64             Kind = Kind(format.Int64)\n\tInt96             Kind = Kind(format.Int96)\n\tFloat             Kind = Kind(format.Float)\n\tDouble            Kind = Kind(format.Double)\n\tByteArray         Kind = Kind(format.ByteArray)\n\tFixedLenByteArray Kind = Kind(format.FixedLenByteArray)\n)\n\n// String returns a human-readable representation of the physical type.\nfunc (k Kind) String() string { return format.Type(k).String() }\n\n// Value constructs a value from k and v.\n//\n// The method panics if the data is not a valid representation of the value\n// kind; for example, if the kind is Int32 but the data is not 4 bytes long.\nfunc (k Kind) Value(v []byte) Value {\n\tx, err := parseValue(k, v)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\treturn x\n}\n\n// The Type interface represents logical types of the parquet type system.\n//\n// Types are immutable and therefore safe to access from multiple goroutines.\ntype Type interface {\n\t// Returns a human-readable representation of the parquet type.\n\tString() string\n\n\t// Returns the Kind value representing the underlying physical type.\n\t//\n\t// The method panics if it is called on a group type.\n\tKind() Kind\n\n\t// For integer and floating point physical types, the method returns the\n\t// size of values in bits.\n\t//\n\t// For fixed-length byte arrays, the method returns the size of elements\n\t// in bytes.\n\t//\n\t// 
For other types, the value is zero.\n\tLength() int\n\n\t// Returns an estimation of the number of bytes required to hold the given\n\t// number of values of this type in memory.\n\t//\n\t// The method returns zero for group types.\n\tEstimateSize(numValues int) int\n\n\t// Returns an estimation of the number of values of this type that can be\n\t// held in the given byte size.\n\t//\n\t// The method returns zero for group types.\n\tEstimateNumValues(size int) int\n\n\t// Compares two values and returns a negative integer if a < b, positive if\n\t// a > b, or zero if a == b.\n\t//\n\t// The values' Kind must match the type, otherwise the result is undefined.\n\t//\n\t// The method panics if it is called on a group type.\n\tCompare(a, b Value) int\n\n\t// ColumnOrder returns the type's column order. For group types, this method\n\t// returns nil.\n\t//\n\t// The order describes the comparison logic implemented by the Less method.\n\t//\n\t// As an optimization, the method may return the same pointer across\n\t// multiple calls. Applications must treat the returned value as immutable,\n\t// mutating the value will result in undefined behavior.\n\tColumnOrder() *format.ColumnOrder\n\n\t// Returns the physical type as a *format.Type value. For group types, this\n\t// method returns nil.\n\t//\n\t// As an optimization, the method may return the same pointer across\n\t// multiple calls. Applications must treat the returned value as immutable,\n\t// mutating the value will result in undefined behavior.\n\tPhysicalType() *format.Type\n\n\t// Returns the logical type as a *format.LogicalType value. When the logical\n\t// type is unknown, the method returns nil.\n\t//\n\t// As an optimization, the method may return the same pointer across\n\t// multiple calls. Applications must treat the returned value as immutable,\n\t// mutating the value will result in undefined behavior.\n\tLogicalType() *format.LogicalType\n\n\t// Returns the logical type's equivalent converted type. 
When there is\n\t// no equivalent converted type, the method returns nil.\n\t//\n\t// As an optimization, the method may return the same pointer across\n\t// multiple calls. Applications must treat the returned value as immutable,\n\t// mutating the value will result in undefined behavior.\n\tConvertedType() *deprecated.ConvertedType\n\n\t// Creates a column indexer for values of this type.\n\t//\n\t// The size limit is a hint to the column indexer that it is allowed to\n\t// truncate the page boundaries to the given size. Only BYTE_ARRAY and\n\t// FIXED_LEN_BYTE_ARRAY types currently take this value into account.\n\t//\n\t// A value of zero or less means no limits.\n\t//\n\t// The method panics if it is called on a group type.\n\tNewColumnIndexer(sizeLimit int) ColumnIndexer\n\n\t// Creates a row group buffer column for values of this type.\n\t//\n\t// Column buffers are created using the index of the column they are\n\t// accumulating values in memory for (relative to the parent schema),\n\t// and the size of their memory buffer.\n\t//\n\t// The application may give an estimate of the number of values it expects\n\t// to write to the buffer as second argument. This estimate helps set the\n\t// initial buffer capacity but is not a hard limit, the underlying memory\n\t// buffer will grow as needed to allow more values to be written. 
Programs\n\t// may use the Size method of the column buffer (or the parent row group,\n\t// when relevant) to determine how many bytes are being used, and perform a\n\t// flush of the buffers to a storage layer.\n\t//\n\t// The method panics if it is called on a group type.\n\tNewColumnBuffer(columnIndex, numValues int) ColumnBuffer\n\n\t// Creates a dictionary holding values of this type.\n\t//\n\t// The dictionary retains the data buffer, it does not make a copy of it.\n\t// If the application needs to share ownership of the memory buffer, it must\n\t// ensure that it will not be modified while the page is in use, or it must\n\t// make a copy of it prior to creating the dictionary.\n\t//\n\t// The method panics if the data type does not correspond to the parquet\n\t// type it is called on.\n\tNewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary\n\n\t// Creates a page belonging to a column at the given index, backed by the\n\t// data buffer.\n\t//\n\t// The page retains the data buffer, it does not make a copy of it. 
If the\n\t// application needs to share ownership of the memory buffer, it must ensure\n\t// that it will not be modified while the page is in use, or it must make a\n\t// copy of it prior to creating the page.\n\t//\n\t// The method panics if the data type does not correspond to the parquet\n\t// type it is called on.\n\tNewPage(columnIndex, numValues int, data encoding.Values) Page\n\n\t// Creates an encoding.Values instance backed by the given buffers.\n\t//\n\t// The offsets is only used by BYTE_ARRAY types, where it represents the\n\t// positions of each variable length value in the values buffer.\n\t//\n\t// The following expression creates an empty instance for any type:\n\t//\n\t//\t\tvalues := typ.NewValues(nil, nil)\n\t//\n\t// The method panics if it is called on group types.\n\tNewValues(values []byte, offsets []uint32) encoding.Values\n\n\t// Assuming the src buffer contains PLAIN encoded values of the type it is\n\t// called on, applies the given encoding and produces the output to the dst\n\t// buffer passed as first argument by dispatching the call to one of the\n\t// encoding methods.\n\tEncode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error)\n\n\t// Assuming the src buffer contains values encoding in the given encoding,\n\t// decodes the input and produces the encoded values into the dst output\n\t// buffer passed as first argument by dispatching the call to one of the\n\t// encoding methods.\n\tDecode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error)\n\n\t// Returns an estimation of the output size after decoding the values passed\n\t// as first argument with the given encoding.\n\t//\n\t// For most types, this is similar to calling EstimateSize with the known\n\t// number of encoded values. 
For variable size types, using this method may\n\t// provide a more precise result since it can inspect the input buffer.\n\tEstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int\n\n\t// Assigns a Parquet value to a Go value. Returns an error if assignment is\n\t// not possible. The source Value must be an expected logical type for the\n\t// receiver. This can be accomplished using ConvertValue.\n\tAssignValue(dst reflect.Value, src Value) error\n\n\t// Convert a Parquet Value of the given Type into a Parquet Value that is\n\t// compatible with the receiver. The returned Value is suitable to be passed\n\t// to AssignValue.\n\tConvertValue(val Value, typ Type) (Value, error)\n}\n\nvar (\n\tBooleanType   Type = booleanType{}\n\tInt32Type     Type = int32Type{}\n\tInt64Type     Type = int64Type{}\n\tInt96Type     Type = int96Type{}\n\tFloatType     Type = floatType{}\n\tDoubleType    Type = doubleType{}\n\tByteArrayType Type = byteArrayType{}\n)\n\n// In the current parquet version supported by this library, only type-defined\n// orders are supported.\nvar typeDefinedColumnOrder = format.ColumnOrder{\n\tTypeOrder: new(format.TypeDefinedOrder),\n}\n\nvar physicalTypes = [...]format.Type{\n\t0: format.Boolean,\n\t1: format.Int32,\n\t2: format.Int64,\n\t3: format.Int96,\n\t4: format.Float,\n\t5: format.Double,\n\t6: format.ByteArray,\n\t7: format.FixedLenByteArray,\n}\n\nvar convertedTypes = [...]deprecated.ConvertedType{\n\t0:  deprecated.UTF8,\n\t1:  deprecated.Map,\n\t2:  deprecated.MapKeyValue,\n\t3:  deprecated.List,\n\t4:  deprecated.Enum,\n\t5:  deprecated.Decimal,\n\t6:  deprecated.Date,\n\t7:  deprecated.TimeMillis,\n\t8:  deprecated.TimeMicros,\n\t9:  deprecated.TimestampMillis,\n\t10: deprecated.TimestampMicros,\n\t11: deprecated.Uint8,\n\t12: deprecated.Uint16,\n\t13: deprecated.Uint32,\n\t14: deprecated.Uint64,\n\t15: deprecated.Int8,\n\t16: deprecated.Int16,\n\t17: deprecated.Int32,\n\t18: deprecated.Int64,\n\t19: deprecated.Json,\n\t20: 
deprecated.Bson,\n\t21: deprecated.Interval,\n}\n\ntype booleanType struct{}\n\nfunc (t booleanType) String() string                           { return \"BOOLEAN\" }\nfunc (t booleanType) Kind() Kind                               { return Boolean }\nfunc (t booleanType) Length() int                              { return 1 }\nfunc (t booleanType) EstimateSize(n int) int                   { return (n + 7) / 8 }\nfunc (t booleanType) EstimateNumValues(n int) int              { return 8 * n }\nfunc (t booleanType) Compare(a, b Value) int                   { return compareBool(a.boolean(), b.boolean()) }\nfunc (t booleanType) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t booleanType) LogicalType() *format.LogicalType         { return nil }\nfunc (t booleanType) ConvertedType() *deprecated.ConvertedType { return nil }\nfunc (t booleanType) PhysicalType() *format.Type               { return &physicalTypes[Boolean] }\n\nfunc (t booleanType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newBooleanColumnIndexer()\n}\n\nfunc (t booleanType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newBooleanColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t booleanType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newBooleanDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t booleanType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newBooleanPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t booleanType) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.BooleanValues(values)\n}\n\nfunc (t booleanType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeBoolean(dst, src, enc)\n}\n\nfunc (t booleanType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) 
(encoding.Values, error) {\n\treturn encoding.DecodeBoolean(dst, src, enc)\n}\n\nfunc (t booleanType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t booleanType) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.boolean()\n\tswitch dst.Kind() {\n\tcase reflect.Bool:\n\t\tdst.SetBool(v)\n\tdefault:\n\t\tdst.Set(reflect.ValueOf(v))\n\t}\n\treturn nil\n}\n\nfunc (t booleanType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToBoolean(val)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn val, nil\n\tcase Int32:\n\t\treturn convertInt32ToBoolean(val)\n\tcase Int64:\n\t\treturn convertInt64ToBoolean(val)\n\tcase Int96:\n\t\treturn convertInt96ToBoolean(val)\n\tcase Float:\n\t\treturn convertFloatToBoolean(val)\n\tcase Double:\n\t\treturn convertDoubleToBoolean(val)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToBoolean(val)\n\tdefault:\n\t\treturn makeValueKind(Boolean), nil\n\t}\n}\n\ntype int32Type struct{}\n\nfunc (t int32Type) String() string                           { return \"INT32\" }\nfunc (t int32Type) Kind() Kind                               { return Int32 }\nfunc (t int32Type) Length() int                              { return 32 }\nfunc (t int32Type) EstimateSize(n int) int                   { return 4 * n }\nfunc (t int32Type) EstimateNumValues(n int) int              { return n / 4 }\nfunc (t int32Type) Compare(a, b Value) int                   { return compareInt32(a.int32(), b.int32()) }\nfunc (t int32Type) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t int32Type) LogicalType() *format.LogicalType         { return nil }\nfunc (t int32Type) ConvertedType() *deprecated.ConvertedType { return nil }\nfunc (t int32Type) PhysicalType() *format.Type               { return &physicalTypes[Int32] }\n\nfunc (t int32Type) 
NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newInt32ColumnIndexer()\n}\n\nfunc (t int32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t int32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t int32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t int32Type) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.Int32ValuesFromBytes(values)\n}\n\nfunc (t int32Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeInt32(dst, src, enc)\n}\n\nfunc (t int32Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeInt32(dst, src, enc)\n}\n\nfunc (t int32Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t int32Type) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.int32()\n\tswitch dst.Kind() {\n\tcase reflect.Int8, reflect.Int16, reflect.Int32:\n\t\tdst.SetInt(int64(v))\n\tcase reflect.Uint8, reflect.Uint16, reflect.Uint32:\n\t\tdst.SetUint(uint64(v))\n\tdefault:\n\t\tdst.Set(reflect.ValueOf(v))\n\t}\n\treturn nil\n}\n\nfunc (t int32Type) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToInt32(val)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToInt32(val)\n\tcase Int32:\n\t\treturn val, nil\n\tcase Int64:\n\t\treturn convertInt64ToInt32(val)\n\tcase Int96:\n\t\treturn convertInt96ToInt32(val)\n\tcase Float:\n\t\treturn convertFloatToInt32(val)\n\tcase 
Double:\n\t\treturn convertDoubleToInt32(val)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToInt32(val)\n\tdefault:\n\t\treturn makeValueKind(Int32), nil\n\t}\n}\n\ntype int64Type struct{}\n\nfunc (t int64Type) String() string                           { return \"INT64\" }\nfunc (t int64Type) Kind() Kind                               { return Int64 }\nfunc (t int64Type) Length() int                              { return 64 }\nfunc (t int64Type) EstimateSize(n int) int                   { return 8 * n }\nfunc (t int64Type) EstimateNumValues(n int) int              { return n / 8 }\nfunc (t int64Type) Compare(a, b Value) int                   { return compareInt64(a.int64(), b.int64()) }\nfunc (t int64Type) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t int64Type) LogicalType() *format.LogicalType         { return nil }\nfunc (t int64Type) ConvertedType() *deprecated.ConvertedType { return nil }\nfunc (t int64Type) PhysicalType() *format.Type               { return &physicalTypes[Int64] }\n\nfunc (t int64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newInt64ColumnIndexer()\n}\n\nfunc (t int64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t int64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t int64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t int64Type) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.Int64ValuesFromBytes(values)\n}\n\nfunc (t int64Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeInt64(dst, src, 
enc)\n}\n\nfunc (t int64Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeInt64(dst, src, enc)\n}\n\nfunc (t int64Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t int64Type) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.int64()\n\tswitch dst.Kind() {\n\tcase reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:\n\t\tdst.SetInt(v)\n\tcase reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint, reflect.Uintptr:\n\t\tdst.SetUint(uint64(v))\n\tdefault:\n\t\tdst.Set(reflect.ValueOf(v))\n\t}\n\treturn nil\n}\n\nfunc (t int64Type) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToInt64(val)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToInt64(val)\n\tcase Int32:\n\t\treturn convertInt32ToInt64(val)\n\tcase Int64:\n\t\treturn val, nil\n\tcase Int96:\n\t\treturn convertInt96ToInt64(val)\n\tcase Float:\n\t\treturn convertFloatToInt64(val)\n\tcase Double:\n\t\treturn convertDoubleToInt64(val)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToInt64(val)\n\tdefault:\n\t\treturn makeValueKind(Int64), nil\n\t}\n}\n\ntype int96Type struct{}\n\nfunc (t int96Type) String() string { return \"INT96\" }\n\nfunc (t int96Type) Kind() Kind                               { return Int96 }\nfunc (t int96Type) Length() int                              { return 96 }\nfunc (t int96Type) EstimateSize(n int) int                   { return 12 * n }\nfunc (t int96Type) EstimateNumValues(n int) int              { return n / 12 }\nfunc (t int96Type) Compare(a, b Value) int                   { return compareInt96(a.int96(), b.int96()) }\nfunc (t int96Type) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t int96Type) LogicalType() *format.LogicalType 
        { return nil }\nfunc (t int96Type) ConvertedType() *deprecated.ConvertedType { return nil }\nfunc (t int96Type) PhysicalType() *format.Type               { return &physicalTypes[Int96] }\n\nfunc (t int96Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newInt96ColumnIndexer()\n}\n\nfunc (t int96Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newInt96ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t int96Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newInt96Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t int96Type) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newInt96Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t int96Type) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.Int96ValuesFromBytes(values)\n}\n\nfunc (t int96Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeInt96(dst, src, enc)\n}\n\nfunc (t int96Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeInt96(dst, src, enc)\n}\n\nfunc (t int96Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t int96Type) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.Int96()\n\tdst.Set(reflect.ValueOf(v))\n\treturn nil\n}\n\nfunc (t int96Type) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToInt96(val)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToInt96(val)\n\tcase Int32:\n\t\treturn convertInt32ToInt96(val)\n\tcase Int64:\n\t\treturn convertInt64ToInt96(val)\n\tcase Int96:\n\t\treturn val, nil\n\tcase Float:\n\t\treturn 
convertFloatToInt96(val)\n\tcase Double:\n\t\treturn convertDoubleToInt96(val)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToInt96(val)\n\tdefault:\n\t\treturn makeValueKind(Int96), nil\n\t}\n}\n\ntype floatType struct{}\n\nfunc (t floatType) String() string                           { return \"FLOAT\" }\nfunc (t floatType) Kind() Kind                               { return Float }\nfunc (t floatType) Length() int                              { return 32 }\nfunc (t floatType) EstimateSize(n int) int                   { return 4 * n }\nfunc (t floatType) EstimateNumValues(n int) int              { return n / 4 }\nfunc (t floatType) Compare(a, b Value) int                   { return compareFloat32(a.float(), b.float()) }\nfunc (t floatType) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t floatType) LogicalType() *format.LogicalType         { return nil }\nfunc (t floatType) ConvertedType() *deprecated.ConvertedType { return nil }\nfunc (t floatType) PhysicalType() *format.Type               { return &physicalTypes[Float] }\n\nfunc (t floatType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newFloatColumnIndexer()\n}\n\nfunc (t floatType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newFloatColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t floatType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newFloatDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t floatType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newFloatPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t floatType) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.FloatValuesFromBytes(values)\n}\n\nfunc (t floatType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn 
encoding.EncodeFloat(dst, src, enc)\n}\n\nfunc (t floatType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeFloat(dst, src, enc)\n}\n\nfunc (t floatType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t floatType) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.float()\n\tswitch dst.Kind() {\n\tcase reflect.Float32, reflect.Float64:\n\t\tdst.SetFloat(float64(v))\n\tdefault:\n\t\tdst.Set(reflect.ValueOf(v))\n\t}\n\treturn nil\n}\n\nfunc (t floatType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToFloat(val)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToFloat(val)\n\tcase Int32:\n\t\treturn convertInt32ToFloat(val)\n\tcase Int64:\n\t\treturn convertInt64ToFloat(val)\n\tcase Int96:\n\t\treturn convertInt96ToFloat(val)\n\tcase Float:\n\t\treturn val, nil\n\tcase Double:\n\t\treturn convertDoubleToFloat(val)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToFloat(val)\n\tdefault:\n\t\treturn makeValueKind(Float), nil\n\t}\n}\n\ntype doubleType struct{}\n\nfunc (t doubleType) String() string                           { return \"DOUBLE\" }\nfunc (t doubleType) Kind() Kind                               { return Double }\nfunc (t doubleType) Length() int                              { return 64 }\nfunc (t doubleType) EstimateSize(n int) int                   { return 8 * n }\nfunc (t doubleType) EstimateNumValues(n int) int              { return n / 8 }\nfunc (t doubleType) Compare(a, b Value) int                   { return compareFloat64(a.double(), b.double()) }\nfunc (t doubleType) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t doubleType) LogicalType() *format.LogicalType         { return nil }\nfunc (t doubleType) ConvertedType() *deprecated.ConvertedType { 
return nil }\nfunc (t doubleType) PhysicalType() *format.Type               { return &physicalTypes[Double] }\n\nfunc (t doubleType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newDoubleColumnIndexer()\n}\n\nfunc (t doubleType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newDoubleColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t doubleType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newDoubleDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t doubleType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newDoublePage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t doubleType) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.DoubleValuesFromBytes(values)\n}\n\nfunc (t doubleType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeDouble(dst, src, enc)\n}\n\nfunc (t doubleType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeDouble(dst, src, enc)\n}\n\nfunc (t doubleType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t doubleType) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.double()\n\tswitch dst.Kind() {\n\tcase reflect.Float32, reflect.Float64:\n\t\tdst.SetFloat(v)\n\tdefault:\n\t\tdst.Set(reflect.ValueOf(v))\n\t}\n\treturn nil\n}\n\nfunc (t doubleType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToDouble(val)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToDouble(val)\n\tcase Int32:\n\t\treturn convertInt32ToDouble(val)\n\tcase Int64:\n\t\treturn convertInt64ToDouble(val)\n\tcase Int96:\n\t\treturn 
convertInt96ToDouble(val)\n\tcase Float:\n\t\treturn convertFloatToDouble(val)\n\tcase Double:\n\t\treturn val, nil\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToDouble(val)\n\tdefault:\n\t\treturn makeValueKind(Double), nil\n\t}\n}\n\ntype byteArrayType struct{}\n\nfunc (t byteArrayType) String() string                           { return \"BYTE_ARRAY\" }\nfunc (t byteArrayType) Kind() Kind                               { return ByteArray }\nfunc (t byteArrayType) Length() int                              { return 0 }\nfunc (t byteArrayType) EstimateSize(n int) int                   { return estimatedSizeOfByteArrayValues * n }\nfunc (t byteArrayType) EstimateNumValues(n int) int              { return n / estimatedSizeOfByteArrayValues }\nfunc (t byteArrayType) Compare(a, b Value) int                   { return bytes.Compare(a.byteArray(), b.byteArray()) }\nfunc (t byteArrayType) ColumnOrder() *format.ColumnOrder         { return &typeDefinedColumnOrder }\nfunc (t byteArrayType) LogicalType() *format.LogicalType         { return nil }\nfunc (t byteArrayType) ConvertedType() *deprecated.ConvertedType { return nil }\nfunc (t byteArrayType) PhysicalType() *format.Type               { return &physicalTypes[ByteArray] }\n\nfunc (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newByteArrayColumnIndexer(sizeLimit)\n}\n\nfunc (t byteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t byteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t byteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t byteArrayType) NewValues(values 
[]byte, offsets []uint32) encoding.Values {\n\treturn encoding.ByteArrayValues(values, offsets)\n}\n\nfunc (t byteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeByteArray(dst, src, enc)\n}\n\nfunc (t byteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeByteArray(dst, src, enc)\n}\n\nfunc (t byteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn enc.EstimateDecodeByteArraySize(src)\n}\n\nfunc (t byteArrayType) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.byteArray()\n\tswitch dst.Kind() {\n\tcase reflect.String:\n\t\tdst.SetString(string(v))\n\tcase reflect.Slice:\n\t\tdst.SetBytes(copyBytes(v))\n\tdefault:\n\t\tval := reflect.ValueOf(string(v))\n\t\tdst.Set(val)\n\t}\n\treturn nil\n}\n\nfunc (t byteArrayType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToByteArray(val)\n\tcase Int32:\n\t\treturn convertInt32ToByteArray(val)\n\tcase Int64:\n\t\treturn convertInt64ToByteArray(val)\n\tcase Int96:\n\t\treturn convertInt96ToByteArray(val)\n\tcase Float:\n\t\treturn convertFloatToByteArray(val)\n\tcase Double:\n\t\treturn convertDoubleToByteArray(val)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn val, nil\n\tdefault:\n\t\treturn makeValueKind(ByteArray), nil\n\t}\n}\n\ntype fixedLenByteArrayType struct{ length int }\n\nfunc (t fixedLenByteArrayType) String() string {\n\treturn fmt.Sprintf(\"FIXED_LEN_BYTE_ARRAY(%d)\", t.length)\n}\n\nfunc (t fixedLenByteArrayType) Kind() Kind { return FixedLenByteArray }\n\nfunc (t fixedLenByteArrayType) Length() int { return t.length }\n\nfunc (t fixedLenByteArrayType) EstimateSize(n int) int { return t.length * n }\n\nfunc (t fixedLenByteArrayType) EstimateNumValues(n int) int { return n / t.length }\n\nfunc (t fixedLenByteArrayType) Compare(a, b Value) 
int {\n\treturn bytes.Compare(a.byteArray(), b.byteArray())\n}\n\nfunc (t fixedLenByteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }\n\nfunc (t fixedLenByteArrayType) LogicalType() *format.LogicalType { return nil }\n\nfunc (t fixedLenByteArrayType) ConvertedType() *deprecated.ConvertedType { return nil }\n\nfunc (t fixedLenByteArrayType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] }\n\nfunc (t fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newFixedLenByteArrayColumnIndexer(t.length, sizeLimit)\n}\n\nfunc (t fixedLenByteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t fixedLenByteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newFixedLenByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t fixedLenByteArrayType) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.FixedLenByteArrayValues(values, t.length)\n}\n\nfunc (t fixedLenByteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeFixedLenByteArray(dst, src, enc)\n}\n\nfunc (t fixedLenByteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeFixedLenByteArray(dst, src, enc)\n}\n\nfunc (t fixedLenByteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t fixedLenByteArrayType) AssignValue(dst reflect.Value, src Value) error {\n\tv := src.byteArray()\n\tswitch 
dst.Kind() {\n\tcase reflect.Array:\n\t\tif dst.Type().Elem().Kind() == reflect.Uint8 && dst.Len() == len(v) {\n\t\t\t// This code could be implemented as a call to reflect.Copy but\n\t\t\t// it would require creating a reflect.Value from v which causes\n\t\t\t// the heap allocation to pack the []byte value. To avoid this\n\t\t\t// overhead we instead convert the reflect.Value holding the\n\t\t\t// destination array into a byte slice which allows us to use\n\t\t\t// a more efficient call to copy.\n\t\t\td := unsafe.Slice((*byte)(unsafecast.PointerOfValue(dst)), len(v))\n\t\t\tcopy(d, v)\n\t\t\treturn nil\n\t\t}\n\tcase reflect.Slice:\n\t\tdst.SetBytes(copyBytes(v))\n\t\treturn nil\n\t}\n\n\tval := reflect.ValueOf(copyBytes(v))\n\tdst.Set(val)\n\treturn nil\n}\n\nfunc (t fixedLenByteArrayType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToFixedLenByteArray(val, t.length)\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToFixedLenByteArray(val, t.length)\n\tcase Int32:\n\t\treturn convertInt32ToFixedLenByteArray(val, t.length)\n\tcase Int64:\n\t\treturn convertInt64ToFixedLenByteArray(val, t.length)\n\tcase Int96:\n\t\treturn convertInt96ToFixedLenByteArray(val, t.length)\n\tcase Float:\n\t\treturn convertFloatToFixedLenByteArray(val, t.length)\n\tcase Double:\n\t\treturn convertDoubleToFixedLenByteArray(val, t.length)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn convertByteArrayToFixedLenByteArray(val, t.length)\n\tdefault:\n\t\treturn makeValueBytes(FixedLenByteArray, make([]byte, t.length)), nil\n\t}\n}\n\ntype uint32Type struct{ int32Type }\n\nfunc (t uint32Type) Compare(a, b Value) int {\n\treturn compareUint32(a.uint32(), b.uint32())\n}\n\nfunc (t uint32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newUint32ColumnIndexer()\n}\n\nfunc (t uint32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newUint32ColumnBuffer(t, 
makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t uint32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t uint32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\ntype uint64Type struct{ int64Type }\n\nfunc (t uint64Type) Compare(a, b Value) int {\n\treturn compareUint64(a.uint64(), b.uint64())\n}\n\nfunc (t uint64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newUint64ColumnIndexer()\n}\n\nfunc (t uint64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t uint64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t uint64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\n// BE128 stands for \"big-endian 128 bits\". This type is used as a special case\n// for fixed-length byte arrays of 16 bytes, which are commonly used to\n// represent columns of random unique identifiers such as UUIDs.\n//\n// Comparisons of BE128 values use the natural byte order, the zeroth byte is\n// the most significant byte.\n//\n// The special case is intended to provide optimizations based on the knowledge\n// that the values are 16 bytes long. 
Stronger type checking can also be applied\n// by the compiler when using [16]byte values rather than []byte, reducing the\n// risk of errors on these common code paths.\ntype be128Type struct{}\n\nfunc (t be128Type) String() string { return \"FIXED_LEN_BYTE_ARRAY(16)\" }\n\nfunc (t be128Type) Kind() Kind { return FixedLenByteArray }\n\nfunc (t be128Type) Length() int { return 16 }\n\nfunc (t be128Type) EstimateSize(n int) int { return 16 * n }\n\nfunc (t be128Type) EstimateNumValues(n int) int { return n / 16 }\n\nfunc (t be128Type) Compare(a, b Value) int { return compareBE128(a.be128(), b.be128()) }\n\nfunc (t be128Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }\n\nfunc (t be128Type) LogicalType() *format.LogicalType { return nil }\n\nfunc (t be128Type) ConvertedType() *deprecated.ConvertedType { return nil }\n\nfunc (t be128Type) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] }\n\nfunc (t be128Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newBE128ColumnIndexer()\n}\n\nfunc (t be128Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t be128Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t be128Type) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t be128Type) NewValues(values []byte, _ []uint32) encoding.Values {\n\treturn encoding.FixedLenByteArrayValues(values, 16)\n}\n\nfunc (t be128Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeFixedLenByteArray(dst, src, enc)\n}\n\nfunc (t be128Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) 
(encoding.Values, error) {\n\treturn encoding.DecodeFixedLenByteArray(dst, src, enc)\n}\n\nfunc (t be128Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.EstimateSize(numValues)\n}\n\nfunc (t be128Type) AssignValue(dst reflect.Value, src Value) error {\n\treturn fixedLenByteArrayType{length: 16}.AssignValue(dst, src)\n}\n\nfunc (t be128Type) ConvertValue(val Value, typ Type) (Value, error) {\n\treturn fixedLenByteArrayType{length: 16}.ConvertValue(val, typ)\n}\n\n// FixedLenByteArrayType constructs a type for fixed-length values of the given\n// size (in bytes).\nfunc FixedLenByteArrayType(length int) Type {\n\tswitch length {\n\tcase 16:\n\t\treturn be128Type{}\n\tdefault:\n\t\treturn fixedLenByteArrayType{length: length}\n\t}\n}\n\n// Int constructs a leaf node of signed integer logical type of the given bit\n// width.\n//\n// The bit width must be one of 8, 16, 32, 64, or the function will panic.\nfunc Int(bitWidth int) Node {\n\treturn Leaf(integerType(bitWidth, &signedIntTypes))\n}\n\n// Uint constructs a leaf node of unsigned integer logical type of the given\n// bit width.\n//\n// The bit width must be one of 8, 16, 32, 64, or the function will panic.\nfunc Uint(bitWidth int) Node {\n\treturn Leaf(integerType(bitWidth, &unsignedIntTypes))\n}\n\nfunc integerType(bitWidth int, types *[4]intType) *intType {\n\tswitch bitWidth {\n\tcase 8:\n\t\treturn &types[0]\n\tcase 16:\n\t\treturn &types[1]\n\tcase 32:\n\t\treturn &types[2]\n\tcase 64:\n\t\treturn &types[3]\n\tdefault:\n\t\tpanic(fmt.Sprintf(\"cannot create a %d bits parquet integer node\", bitWidth))\n\t}\n}\n\nvar signedIntTypes = [...]intType{\n\t{BitWidth: 8, IsSigned: true},\n\t{BitWidth: 16, IsSigned: true},\n\t{BitWidth: 32, IsSigned: true},\n\t{BitWidth: 64, IsSigned: true},\n}\n\nvar unsignedIntTypes = [...]intType{\n\t{BitWidth: 8, IsSigned: false},\n\t{BitWidth: 16, IsSigned: false},\n\t{BitWidth: 32, IsSigned: false},\n\t{BitWidth: 64, IsSigned: 
false},\n}\n\ntype intType format.IntType\n\nfunc (t *intType) baseType() Type {\n\tif t.IsSigned {\n\t\tif t.BitWidth == 64 {\n\t\t\treturn int64Type{}\n\t\t} else {\n\t\t\treturn int32Type{}\n\t\t}\n\t} else {\n\t\tif t.BitWidth == 64 {\n\t\t\treturn uint64Type{}\n\t\t} else {\n\t\t\treturn uint32Type{}\n\t\t}\n\t}\n}\n\nfunc (t *intType) String() string { return (*format.IntType)(t).String() }\n\nfunc (t *intType) Kind() Kind { return t.baseType().Kind() }\n\nfunc (t *intType) Length() int { return int(t.BitWidth) }\n\nfunc (t *intType) EstimateSize(n int) int { return (int(t.BitWidth) / 8) * n }\n\nfunc (t *intType) EstimateNumValues(n int) int { return n / (int(t.BitWidth) / 8) }\n\nfunc (t *intType) Compare(a, b Value) int {\n\t// This code is similar to t.baseType().Compare(a,b) but comparison methods\n\t// tend to be invoked a lot (e.g. when sorting) so avoiding the interface\n\t// indirection in this case yields much better throughput in some cases.\n\tif t.BitWidth == 64 {\n\t\ti1 := a.int64()\n\t\ti2 := b.int64()\n\t\tif t.IsSigned {\n\t\t\treturn compareInt64(i1, i2)\n\t\t} else {\n\t\t\treturn compareUint64(uint64(i1), uint64(i2))\n\t\t}\n\t} else {\n\t\ti1 := a.int32()\n\t\ti2 := b.int32()\n\t\tif t.IsSigned {\n\t\t\treturn compareInt32(i1, i2)\n\t\t} else {\n\t\t\treturn compareUint32(uint32(i1), uint32(i2))\n\t\t}\n\t}\n}\n\nfunc (t *intType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() }\n\nfunc (t *intType) PhysicalType() *format.Type { return t.baseType().PhysicalType() }\n\nfunc (t *intType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Integer: (*format.IntType)(t)}\n}\n\nfunc (t *intType) ConvertedType() *deprecated.ConvertedType {\n\tconvertedType := bits.Len8(uint8(t.BitWidth)/8) - 1 // 8=>0, 16=>1, 32=>2, 64=>3\n\tif t.IsSigned {\n\t\tconvertedType += int(deprecated.Int8)\n\t} else {\n\t\tconvertedType += int(deprecated.Uint8)\n\t}\n\treturn &convertedTypes[convertedType]\n}\n\nfunc (t *intType) 
NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn t.baseType().NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *intType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn t.baseType().NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *intType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn t.baseType().NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *intType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn t.baseType().NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *intType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn t.baseType().NewValues(values, offsets)\n}\n\nfunc (t *intType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn t.baseType().Encode(dst, src, enc)\n}\n\nfunc (t *intType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn t.baseType().Decode(dst, src, enc)\n}\n\nfunc (t *intType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.baseType().EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *intType) AssignValue(dst reflect.Value, src Value) error {\n\tif t.BitWidth == 64 {\n\t\treturn int64Type{}.AssignValue(dst, src)\n\t} else {\n\t\treturn int32Type{}.AssignValue(dst, src)\n\t}\n}\n\nfunc (t *intType) ConvertValue(val Value, typ Type) (Value, error) {\n\tif t.BitWidth == 64 {\n\t\treturn int64Type{}.ConvertValue(val, typ)\n\t} else {\n\t\treturn int32Type{}.ConvertValue(val, typ)\n\t}\n}\n\n// Decimal constructs a leaf node of decimal logical type with the given\n// scale, precision, and underlying type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal\nfunc Decimal(scale, precision int, typ Type) Node {\n\tswitch typ.Kind() {\n\tcase Int32, Int64, FixedLenByteArray:\n\tdefault:\n\t\tpanic(\"DECIMAL node must annotate Int32, Int64 or FixedLenByteArray 
but got \" + typ.String())\n\t}\n\treturn Leaf(&decimalType{\n\t\tdecimal: format.DecimalType{\n\t\t\tScale:     int32(scale),\n\t\t\tPrecision: int32(precision),\n\t\t},\n\t\tType: typ,\n\t})\n}\n\ntype decimalType struct {\n\tdecimal format.DecimalType\n\tType\n}\n\nfunc (t *decimalType) String() string { return t.decimal.String() }\n\nfunc (t *decimalType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Decimal: &t.decimal}\n}\n\nfunc (t *decimalType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.Decimal]\n}\n\n// String constructs a leaf node of UTF8 logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string\nfunc String() Node { return Leaf(&stringType{}) }\n\ntype stringType format.StringType\n\nfunc (t *stringType) String() string { return (*format.StringType)(t).String() }\n\nfunc (t *stringType) Kind() Kind { return ByteArray }\n\nfunc (t *stringType) Length() int { return 0 }\n\nfunc (t *stringType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) }\n\nfunc (t *stringType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) }\n\nfunc (t *stringType) Compare(a, b Value) int {\n\treturn bytes.Compare(a.byteArray(), b.byteArray())\n}\n\nfunc (t *stringType) ColumnOrder() *format.ColumnOrder {\n\treturn &typeDefinedColumnOrder\n}\n\nfunc (t *stringType) PhysicalType() *format.Type {\n\treturn &physicalTypes[ByteArray]\n}\n\nfunc (t *stringType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{UTF8: (*format.StringType)(t)}\n}\n\nfunc (t *stringType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.UTF8]\n}\n\nfunc (t *stringType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn newByteArrayColumnIndexer(sizeLimit)\n}\n\nfunc (t *stringType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn newByteArrayDictionary(t, 
makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t *stringType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t *stringType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)\n}\n\nfunc (t *stringType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn encoding.ByteArrayValues(values, offsets)\n}\n\nfunc (t *stringType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn encoding.EncodeByteArray(dst, src, enc)\n}\n\nfunc (t *stringType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn encoding.DecodeByteArray(dst, src, enc)\n}\n\nfunc (t *stringType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn byteArrayType{}.EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *stringType) AssignValue(dst reflect.Value, src Value) error {\n\treturn byteArrayType{}.AssignValue(dst, src)\n}\n\nfunc (t *stringType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch t2 := typ.(type) {\n\tcase *dateType:\n\t\treturn convertDateToString(val)\n\tcase *timeType:\n\t\ttz := t2.tz()\n\t\tif t2.Unit.Micros != nil {\n\t\t\treturn convertTimeMicrosToString(val, tz)\n\t\t} else {\n\t\t\treturn convertTimeMillisToString(val, tz)\n\t\t}\n\t}\n\tswitch typ.Kind() {\n\tcase Boolean:\n\t\treturn convertBooleanToString(val)\n\tcase Int32:\n\t\treturn convertInt32ToString(val)\n\tcase Int64:\n\t\treturn convertInt64ToString(val)\n\tcase Int96:\n\t\treturn convertInt96ToString(val)\n\tcase Float:\n\t\treturn convertFloatToString(val)\n\tcase Double:\n\t\treturn convertDoubleToString(val)\n\tcase ByteArray:\n\t\treturn val, nil\n\tcase FixedLenByteArray:\n\t\treturn 
convertFixedLenByteArrayToString(val)\n\tdefault:\n\t\treturn makeValueKind(ByteArray), nil\n\t}\n}\n\n// UUID constructs a leaf node of UUID logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid\nfunc UUID() Node { return Leaf(&uuidType{}) }\n\ntype uuidType format.UUIDType\n\nfunc (t *uuidType) String() string { return (*format.UUIDType)(t).String() }\n\nfunc (t *uuidType) Kind() Kind { return be128Type{}.Kind() }\n\nfunc (t *uuidType) Length() int { return be128Type{}.Length() }\n\nfunc (t *uuidType) EstimateSize(n int) int { return be128Type{}.EstimateSize(n) }\n\nfunc (t *uuidType) EstimateNumValues(n int) int { return be128Type{}.EstimateNumValues(n) }\n\nfunc (t *uuidType) Compare(a, b Value) int { return be128Type{}.Compare(a, b) }\n\nfunc (t *uuidType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }\n\nfunc (t *uuidType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] }\n\nfunc (t *uuidType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{UUID: (*format.UUIDType)(t)}\n}\n\nfunc (t *uuidType) ConvertedType() *deprecated.ConvertedType { return nil }\n\nfunc (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn be128Type{}.NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *uuidType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn be128Type{}.NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *uuidType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn be128Type{}.NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *uuidType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn be128Type{}.NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *uuidType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn be128Type{}.NewValues(values, offsets)\n}\n\nfunc (t *uuidType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) 
{\n\treturn be128Type{}.Encode(dst, src, enc)\n}\n\nfunc (t *uuidType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn be128Type{}.Decode(dst, src, enc)\n}\n\nfunc (t *uuidType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn be128Type{}.EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *uuidType) AssignValue(dst reflect.Value, src Value) error {\n\treturn be128Type{}.AssignValue(dst, src)\n}\n\nfunc (t *uuidType) ConvertValue(val Value, typ Type) (Value, error) {\n\treturn be128Type{}.ConvertValue(val, typ)\n}\n\n// Enum constructs a leaf node with a logical type representing enumerations.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum\nfunc Enum() Node { return Leaf(&enumType{}) }\n\ntype enumType format.EnumType\n\nfunc (t *enumType) String() string { return (*format.EnumType)(t).String() }\n\nfunc (t *enumType) Kind() Kind { return new(stringType).Kind() }\n\nfunc (t *enumType) Length() int { return new(stringType).Length() }\n\nfunc (t *enumType) EstimateSize(n int) int { return new(stringType).EstimateSize(n) }\n\nfunc (t *enumType) EstimateNumValues(n int) int { return new(stringType).EstimateNumValues(n) }\n\nfunc (t *enumType) Compare(a, b Value) int { return new(stringType).Compare(a, b) }\n\nfunc (t *enumType) ColumnOrder() *format.ColumnOrder { return new(stringType).ColumnOrder() }\n\nfunc (t *enumType) PhysicalType() *format.Type { return new(stringType).PhysicalType() }\n\nfunc (t *enumType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Enum: (*format.EnumType)(t)}\n}\n\nfunc (t *enumType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.Enum]\n}\n\nfunc (t *enumType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn new(stringType).NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *enumType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary 
{\n\treturn new(stringType).NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *enumType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn new(stringType).NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *enumType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn new(stringType).NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *enumType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn new(stringType).NewValues(values, offsets)\n}\n\nfunc (t *enumType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn new(stringType).Encode(dst, src, enc)\n}\n\nfunc (t *enumType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn new(stringType).Decode(dst, src, enc)\n}\n\nfunc (t *enumType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn new(stringType).EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *enumType) AssignValue(dst reflect.Value, src Value) error {\n\treturn new(stringType).AssignValue(dst, src)\n}\n\nfunc (t *enumType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *byteArrayType, *stringType, *enumType:\n\t\treturn val, nil\n\tdefault:\n\t\treturn val, invalidConversion(val, \"ENUM\", typ.String())\n\t}\n}\n\n// JSON constructs a leaf node of JSON logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json\nfunc JSON() Node { return Leaf(&jsonType{}) }\n\ntype jsonType format.JsonType\n\nfunc (t *jsonType) String() string { return (*format.JsonType)(t).String() }\n\nfunc (t *jsonType) Kind() Kind { return byteArrayType{}.Kind() }\n\nfunc (t *jsonType) Length() int { return byteArrayType{}.Length() }\n\nfunc (t *jsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) }\n\nfunc (t *jsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) 
}\n\nfunc (t *jsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) }\n\nfunc (t *jsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() }\n\nfunc (t *jsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() }\n\nfunc (t *jsonType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Json: (*format.JsonType)(t)}\n}\n\nfunc (t *jsonType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.Json]\n}\n\nfunc (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn byteArrayType{}.NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *jsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn byteArrayType{}.NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *jsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn byteArrayType{}.NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *jsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn byteArrayType{}.NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *jsonType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn byteArrayType{}.NewValues(values, offsets)\n}\n\nfunc (t *jsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn byteArrayType{}.Encode(dst, src, enc)\n}\n\nfunc (t *jsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn byteArrayType{}.Decode(dst, src, enc)\n}\n\nfunc (t *jsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn byteArrayType{}.EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *jsonType) AssignValue(dst reflect.Value, src Value) error {\n\t// Assign value using ByteArrayType for BC...\n\tswitch dst.Kind() {\n\tcase reflect.String:\n\t\treturn byteArrayType{}.AssignValue(dst, src)\n\tcase reflect.Slice:\n\t\tif 
dst.Type().Elem().Kind() == reflect.Uint8 {\n\t\t\treturn byteArrayType{}.AssignValue(dst, src)\n\t\t}\n\t}\n\n\t// Otherwise handle with json.Unmarshal\n\tb := src.byteArray()\n\tval := reflect.New(dst.Type()).Elem()\n\terr := json.Unmarshal(b, val.Addr().Interface())\n\tif err != nil {\n\t\treturn err\n\t}\n\tdst.Set(val)\n\treturn nil\n}\n\nfunc (t *jsonType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *byteArrayType, *stringType, *jsonType:\n\t\treturn val, nil\n\tdefault:\n\t\treturn val, invalidConversion(val, \"JSON\", typ.String())\n\t}\n}\n\n// BSON constructs a leaf node of BSON logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson\nfunc BSON() Node { return Leaf(&bsonType{}) }\n\ntype bsonType format.BsonType\n\nfunc (t *bsonType) String() string { return (*format.BsonType)(t).String() }\n\nfunc (t *bsonType) Kind() Kind { return byteArrayType{}.Kind() }\n\nfunc (t *bsonType) Length() int { return byteArrayType{}.Length() }\n\nfunc (t *bsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) }\n\nfunc (t *bsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) }\n\nfunc (t *bsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) }\n\nfunc (t *bsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() }\n\nfunc (t *bsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() }\n\nfunc (t *bsonType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Bson: (*format.BsonType)(t)}\n}\n\nfunc (t *bsonType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.Bson]\n}\n\nfunc (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn byteArrayType{}.NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *bsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn 
byteArrayType{}.NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *bsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn byteArrayType{}.NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *bsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn byteArrayType{}.NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *bsonType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn byteArrayType{}.NewValues(values, offsets)\n}\n\nfunc (t *bsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn byteArrayType{}.Encode(dst, src, enc)\n}\n\nfunc (t *bsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn byteArrayType{}.Decode(dst, src, enc)\n}\n\nfunc (t *bsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn byteArrayType{}.EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *bsonType) AssignValue(dst reflect.Value, src Value) error {\n\treturn byteArrayType{}.AssignValue(dst, src)\n}\n\nfunc (t *bsonType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch typ.(type) {\n\tcase *byteArrayType, *bsonType:\n\t\treturn val, nil\n\tdefault:\n\t\treturn val, invalidConversion(val, \"BSON\", typ.String())\n\t}\n}\n\n// Date constructs a leaf node of DATE logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date\nfunc Date() Node { return Leaf(&dateType{}) }\n\ntype dateType format.DateType\n\nfunc (t *dateType) String() string { return (*format.DateType)(t).String() }\n\nfunc (t *dateType) Kind() Kind { return int32Type{}.Kind() }\n\nfunc (t *dateType) Length() int { return int32Type{}.Length() }\n\nfunc (t *dateType) EstimateSize(n int) int { return int32Type{}.EstimateSize(n) }\n\nfunc (t *dateType) EstimateNumValues(n int) int { return int32Type{}.EstimateNumValues(n) }\n\nfunc (t *dateType) Compare(a, b Value) 
int { return int32Type{}.Compare(a, b) }\n\nfunc (t *dateType) ColumnOrder() *format.ColumnOrder { return int32Type{}.ColumnOrder() }\n\nfunc (t *dateType) PhysicalType() *format.Type { return int32Type{}.PhysicalType() }\n\nfunc (t *dateType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Date: (*format.DateType)(t)}\n}\n\nfunc (t *dateType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.Date]\n}\n\nfunc (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn int32Type{}.NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *dateType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn int32Type{}.NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn int32Type{}.NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *dateType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn int32Type{}.NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *dateType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn int32Type{}.NewValues(values, offsets)\n}\n\nfunc (t *dateType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn int32Type{}.Encode(dst, src, enc)\n}\n\nfunc (t *dateType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn int32Type{}.Decode(dst, src, enc)\n}\n\nfunc (t *dateType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn int32Type{}.EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *dateType) AssignValue(dst reflect.Value, src Value) error {\n\treturn int32Type{}.AssignValue(dst, src)\n}\n\nfunc (t *dateType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch src := typ.(type) {\n\tcase *stringType:\n\t\treturn convertStringToDate(val, time.UTC)\n\tcase *timestampType:\n\t\treturn 
convertTimestampToDate(val, src.Unit, src.tz())\n\t}\n\treturn int32Type{}.ConvertValue(val, typ)\n}\n\n// TimeUnit represents units of time in the parquet type system.\ntype TimeUnit interface {\n\t// Returns the precision of the time unit as a time.Duration value.\n\tDuration() time.Duration\n\t// Converts the TimeUnit value to its representation in the parquet thrift\n\t// format.\n\tTimeUnit() format.TimeUnit\n}\n\nvar (\n\tMillisecond TimeUnit = &millisecond{}\n\tMicrosecond TimeUnit = &microsecond{}\n\tNanosecond  TimeUnit = &nanosecond{}\n)\n\ntype millisecond format.MilliSeconds\n\nfunc (u *millisecond) Duration() time.Duration { return time.Millisecond }\nfunc (u *millisecond) TimeUnit() format.TimeUnit {\n\treturn format.TimeUnit{Millis: (*format.MilliSeconds)(u)}\n}\n\ntype microsecond format.MicroSeconds\n\nfunc (u *microsecond) Duration() time.Duration { return time.Microsecond }\nfunc (u *microsecond) TimeUnit() format.TimeUnit {\n\treturn format.TimeUnit{Micros: (*format.MicroSeconds)(u)}\n}\n\ntype nanosecond format.NanoSeconds\n\nfunc (u *nanosecond) Duration() time.Duration { return time.Nanosecond }\nfunc (u *nanosecond) TimeUnit() format.TimeUnit {\n\treturn format.TimeUnit{Nanos: (*format.NanoSeconds)(u)}\n}\n\n// Time constructs a leaf node of TIME logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time\nfunc Time(unit TimeUnit) Node {\n\treturn Leaf(&timeType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()})\n}\n\ntype timeType format.TimeType\n\nfunc (t *timeType) tz() *time.Location {\n\tif t.IsAdjustedToUTC {\n\t\treturn time.UTC\n\t} else {\n\t\treturn time.Local\n\t}\n}\n\nfunc (t *timeType) baseType() Type {\n\tif t.useInt32() {\n\t\treturn int32Type{}\n\t} else {\n\t\treturn int64Type{}\n\t}\n}\n\nfunc (t *timeType) useInt32() bool { return t.Unit.Millis != nil }\n\nfunc (t *timeType) useInt64() bool { return t.Unit.Micros != nil }\n\nfunc (t *timeType) String() string { return 
(*format.TimeType)(t).String() }\n\nfunc (t *timeType) Kind() Kind { return t.baseType().Kind() }\n\nfunc (t *timeType) Length() int { return t.baseType().Length() }\n\nfunc (t *timeType) EstimateSize(n int) int { return t.baseType().EstimateSize(n) }\n\nfunc (t *timeType) EstimateNumValues(n int) int { return t.baseType().EstimateNumValues(n) }\n\nfunc (t *timeType) Compare(a, b Value) int { return t.baseType().Compare(a, b) }\n\nfunc (t *timeType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() }\n\nfunc (t *timeType) PhysicalType() *format.Type { return t.baseType().PhysicalType() }\n\nfunc (t *timeType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Time: (*format.TimeType)(t)}\n}\n\nfunc (t *timeType) ConvertedType() *deprecated.ConvertedType {\n\tswitch {\n\tcase t.useInt32():\n\t\treturn &convertedTypes[deprecated.TimeMillis]\n\tcase t.useInt64():\n\t\treturn &convertedTypes[deprecated.TimeMicros]\n\tdefault:\n\t\treturn nil\n\t}\n}\n\nfunc (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn t.baseType().NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *timeType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn t.baseType().NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *timeType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn t.baseType().NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *timeType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn t.baseType().NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *timeType) NewValues(values []byte, offset []uint32) encoding.Values {\n\treturn t.baseType().NewValues(values, offset)\n}\n\nfunc (t *timeType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn t.baseType().Encode(dst, src, enc)\n}\n\nfunc (t *timeType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn 
t.baseType().Decode(dst, src, enc)\n}\n\nfunc (t *timeType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn t.baseType().EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *timeType) AssignValue(dst reflect.Value, src Value) error {\n\treturn t.baseType().AssignValue(dst, src)\n}\n\nfunc (t *timeType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch src := typ.(type) {\n\tcase *stringType:\n\t\ttz := t.tz()\n\t\tif t.Unit.Micros != nil {\n\t\t\treturn convertStringToTimeMicros(val, tz)\n\t\t} else {\n\t\t\treturn convertStringToTimeMillis(val, tz)\n\t\t}\n\tcase *timestampType:\n\t\ttz := t.tz()\n\t\tif t.Unit.Micros != nil {\n\t\t\treturn convertTimestampToTimeMicros(val, src.Unit, src.tz(), tz)\n\t\t} else {\n\t\t\treturn convertTimestampToTimeMillis(val, src.Unit, src.tz(), tz)\n\t\t}\n\t}\n\treturn t.baseType().ConvertValue(val, typ)\n}\n\n// Timestamp constructs of leaf node of TIMESTAMP logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp\nfunc Timestamp(unit TimeUnit) Node {\n\treturn Leaf(&timestampType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()})\n}\n\ntype timestampType format.TimestampType\n\nfunc (t *timestampType) tz() *time.Location {\n\tif t.IsAdjustedToUTC {\n\t\treturn time.UTC\n\t} else {\n\t\treturn time.Local\n\t}\n}\n\nfunc (t *timestampType) String() string { return (*format.TimestampType)(t).String() }\n\nfunc (t *timestampType) Kind() Kind { return int64Type{}.Kind() }\n\nfunc (t *timestampType) Length() int { return int64Type{}.Length() }\n\nfunc (t *timestampType) EstimateSize(n int) int { return int64Type{}.EstimateSize(n) }\n\nfunc (t *timestampType) EstimateNumValues(n int) int { return int64Type{}.EstimateNumValues(n) }\n\nfunc (t *timestampType) Compare(a, b Value) int { return int64Type{}.Compare(a, b) }\n\nfunc (t *timestampType) ColumnOrder() *format.ColumnOrder { return int64Type{}.ColumnOrder() }\n\nfunc (t *timestampType) 
PhysicalType() *format.Type { return int64Type{}.PhysicalType() }\n\nfunc (t *timestampType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Timestamp: (*format.TimestampType)(t)}\n}\n\nfunc (t *timestampType) ConvertedType() *deprecated.ConvertedType {\n\tswitch {\n\tcase t.Unit.Millis != nil:\n\t\treturn &convertedTypes[deprecated.TimestampMillis]\n\tcase t.Unit.Micros != nil:\n\t\treturn &convertedTypes[deprecated.TimestampMicros]\n\tdefault:\n\t\treturn nil\n\t}\n}\n\nfunc (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer {\n\treturn int64Type{}.NewColumnIndexer(sizeLimit)\n}\n\nfunc (t *timestampType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary {\n\treturn int64Type{}.NewDictionary(columnIndex, numValues, data)\n}\n\nfunc (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {\n\treturn int64Type{}.NewColumnBuffer(columnIndex, numValues)\n}\n\nfunc (t *timestampType) NewPage(columnIndex, numValues int, data encoding.Values) Page {\n\treturn int64Type{}.NewPage(columnIndex, numValues, data)\n}\n\nfunc (t *timestampType) NewValues(values []byte, offsets []uint32) encoding.Values {\n\treturn int64Type{}.NewValues(values, offsets)\n}\n\nfunc (t *timestampType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {\n\treturn int64Type{}.Encode(dst, src, enc)\n}\n\nfunc (t *timestampType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {\n\treturn int64Type{}.Decode(dst, src, enc)\n}\n\nfunc (t *timestampType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {\n\treturn int64Type{}.EstimateDecodeSize(numValues, src, enc)\n}\n\nfunc (t *timestampType) AssignValue(dst reflect.Value, src Value) error {\n\tswitch dst.Type() {\n\tcase reflect.TypeOf(time.Time{}):\n\t\tunit := Nanosecond.TimeUnit()\n\t\tlt := t.LogicalType()\n\t\tif lt != nil && lt.Timestamp != nil {\n\t\t\tunit = 
lt.Timestamp.Unit\n\t\t}\n\n\t\tnanos := src.int64()\n\t\tswitch {\n\t\tcase unit.Millis != nil:\n\t\t\tnanos = nanos * 1e6\n\t\tcase unit.Micros != nil:\n\t\t\tnanos = nanos * 1e3\n\t\t}\n\n\t\tval := time.Unix(0, nanos).UTC()\n\t\tdst.Set(reflect.ValueOf(val))\n\t\treturn nil\n\tdefault:\n\t\treturn int64Type{}.AssignValue(dst, src)\n\t}\n}\n\nfunc (t *timestampType) ConvertValue(val Value, typ Type) (Value, error) {\n\tswitch src := typ.(type) {\n\tcase *timestampType:\n\t\treturn convertTimestampToTimestamp(val, src.Unit, t.Unit)\n\tcase *dateType:\n\t\treturn convertDateToTimestamp(val, t.Unit, t.tz())\n\t}\n\treturn int64Type{}.ConvertValue(val, typ)\n}\n\n// List constructs a node of LIST logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists\nfunc List(of Node) Node {\n\treturn listNode{Group{\"list\": Repeated(Group{\"element\": of})}}\n}\n\ntype listNode struct{ Group }\n\nfunc (listNode) Type() Type { return &listType{} }\n\ntype listType format.ListType\n\nfunc (t *listType) String() string { return (*format.ListType)(t).String() }\n\nfunc (t *listType) Kind() Kind { panic(\"cannot call Kind on parquet LIST type\") }\n\nfunc (t *listType) Length() int { return 0 }\n\nfunc (t *listType) EstimateSize(int) int { return 0 }\n\nfunc (t *listType) EstimateNumValues(int) int { return 0 }\n\nfunc (t *listType) Compare(Value, Value) int { panic(\"cannot compare values on parquet LIST type\") }\n\nfunc (t *listType) ColumnOrder() *format.ColumnOrder { return nil }\n\nfunc (t *listType) PhysicalType() *format.Type { return nil }\n\nfunc (t *listType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{List: (*format.ListType)(t)}\n}\n\nfunc (t *listType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.List]\n}\n\nfunc (t *listType) NewColumnIndexer(int) ColumnIndexer {\n\tpanic(\"create create column indexer from parquet LIST type\")\n}\n\nfunc (t *listType) NewDictionary(int, 
int, encoding.Values) Dictionary {\n\tpanic(\"cannot create dictionary from parquet LIST type\")\n}\n\nfunc (t *listType) NewColumnBuffer(int, int) ColumnBuffer {\n\tpanic(\"cannot create column buffer from parquet LIST type\")\n}\n\nfunc (t *listType) NewPage(int, int, encoding.Values) Page {\n\tpanic(\"cannot create page from parquet LIST type\")\n}\n\nfunc (t *listType) NewValues(values []byte, _ []uint32) encoding.Values {\n\tpanic(\"cannot create values from parquet LIST type\")\n}\n\nfunc (t *listType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) {\n\tpanic(\"cannot encode parquet LIST type\")\n}\n\nfunc (t *listType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) {\n\tpanic(\"cannot decode parquet LIST type\")\n}\n\nfunc (t *listType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int {\n\tpanic(\"cannot estimate decode size of parquet LIST type\")\n}\n\nfunc (t *listType) AssignValue(reflect.Value, Value) error {\n\tpanic(\"cannot assign value to a parquet LIST type\")\n}\n\nfunc (t *listType) ConvertValue(Value, Type) (Value, error) {\n\tpanic(\"cannot convert value to a parquet LIST type\")\n}\n\n// Map constructs a node of MAP logical type.\n//\n// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps\nfunc Map(key, value Node) Node {\n\treturn mapNode{Group{\n\t\t\"key_value\": Repeated(Group{\n\t\t\t\"key\":   Required(key),\n\t\t\t\"value\": value,\n\t\t}),\n\t}}\n}\n\ntype mapNode struct{ Group }\n\nfunc (mapNode) Type() Type { return &mapType{} }\n\ntype mapType format.MapType\n\nfunc (t *mapType) String() string { return (*format.MapType)(t).String() }\n\nfunc (t *mapType) Kind() Kind { panic(\"cannot call Kind on parquet MAP type\") }\n\nfunc (t *mapType) Length() int { return 0 }\n\nfunc (t *mapType) EstimateSize(int) int { return 0 }\n\nfunc (t *mapType) EstimateNumValues(int) int { return 0 }\n\nfunc (t *mapType) Compare(Value, Value) int { 
panic(\"cannot compare values on parquet MAP type\") }\n\nfunc (t *mapType) ColumnOrder() *format.ColumnOrder { return nil }\n\nfunc (t *mapType) PhysicalType() *format.Type { return nil }\n\nfunc (t *mapType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Map: (*format.MapType)(t)}\n}\n\nfunc (t *mapType) ConvertedType() *deprecated.ConvertedType {\n\treturn &convertedTypes[deprecated.Map]\n}\n\nfunc (t *mapType) NewColumnIndexer(int) ColumnIndexer {\n\tpanic(\"create create column indexer from parquet MAP type\")\n}\n\nfunc (t *mapType) NewDictionary(int, int, encoding.Values) Dictionary {\n\tpanic(\"cannot create dictionary from parquet MAP type\")\n}\n\nfunc (t *mapType) NewColumnBuffer(int, int) ColumnBuffer {\n\tpanic(\"cannot create column buffer from parquet MAP type\")\n}\n\nfunc (t *mapType) NewPage(int, int, encoding.Values) Page {\n\tpanic(\"cannot create page from parquet MAP type\")\n}\n\nfunc (t *mapType) NewValues(values []byte, _ []uint32) encoding.Values {\n\tpanic(\"cannot create values from parquet MAP type\")\n}\n\nfunc (t *mapType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) {\n\tpanic(\"cannot encode parquet MAP type\")\n}\n\nfunc (t *mapType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) {\n\tpanic(\"cannot decode parquet MAP type\")\n}\n\nfunc (t *mapType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int {\n\tpanic(\"cannot estimate decode size of parquet MAP type\")\n}\n\nfunc (t *mapType) AssignValue(reflect.Value, Value) error {\n\tpanic(\"cannot assign value to a parquet MAP type\")\n}\n\nfunc (t *mapType) ConvertValue(Value, Type) (Value, error) {\n\tpanic(\"cannot convert value to a parquet MAP type\")\n}\n\ntype nullType format.NullType\n\nfunc (t *nullType) String() string { return (*format.NullType)(t).String() }\n\nfunc (t *nullType) Kind() Kind { return -1 }\n\nfunc (t *nullType) Length() int { return 0 }\n\nfunc (t *nullType) 
EstimateSize(int) int { return 0 }\n\nfunc (t *nullType) EstimateNumValues(int) int { return 0 }\n\nfunc (t *nullType) Compare(Value, Value) int { panic(\"cannot compare values on parquet NULL type\") }\n\nfunc (t *nullType) ColumnOrder() *format.ColumnOrder { return nil }\n\nfunc (t *nullType) PhysicalType() *format.Type { return nil }\n\nfunc (t *nullType) LogicalType() *format.LogicalType {\n\treturn &format.LogicalType{Unknown: (*format.NullType)(t)}\n}\n\nfunc (t *nullType) ConvertedType() *deprecated.ConvertedType { return nil }\n\nfunc (t *nullType) NewColumnIndexer(int) ColumnIndexer {\n\tpanic(\"create create column indexer from parquet NULL type\")\n}\n\nfunc (t *nullType) NewDictionary(int, int, encoding.Values) Dictionary {\n\tpanic(\"cannot create dictionary from parquet NULL type\")\n}\n\nfunc (t *nullType) NewColumnBuffer(int, int) ColumnBuffer {\n\tpanic(\"cannot create column buffer from parquet NULL type\")\n}\n\nfunc (t *nullType) NewPage(columnIndex, numValues int, _ encoding.Values) Page {\n\treturn newNullPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues))\n}\n\nfunc (t *nullType) NewValues(_ []byte, _ []uint32) encoding.Values {\n\treturn encoding.Values{}\n}\n\nfunc (t *nullType) Encode(dst []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) {\n\treturn dst[:0], nil\n}\n\nfunc (t *nullType) Decode(dst encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) {\n\treturn dst, nil\n}\n\nfunc (t *nullType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int {\n\treturn 0\n}\n\nfunc (t *nullType) AssignValue(reflect.Value, Value) error {\n\treturn nil\n}\n\nfunc (t *nullType) ConvertValue(val Value, _ Type) (Value, error) {\n\treturn val, nil\n}\n\ntype groupType struct{}\n\nfunc (groupType) String() string { return \"group\" }\n\nfunc (groupType) Kind() Kind {\n\tpanic(\"cannot call Kind on parquet group\")\n}\n\nfunc (groupType) Compare(Value, Value) int {\n\tpanic(\"cannot compare values on 
parquet group\")\n}\n\nfunc (groupType) NewColumnIndexer(int) ColumnIndexer {\n\tpanic(\"cannot create column indexer from parquet group\")\n}\n\nfunc (groupType) NewDictionary(int, int, encoding.Values) Dictionary {\n\tpanic(\"cannot create dictionary from parquet group\")\n}\n\nfunc (t groupType) NewColumnBuffer(int, int) ColumnBuffer {\n\tpanic(\"cannot create column buffer from parquet group\")\n}\n\nfunc (t groupType) NewPage(int, int, encoding.Values) Page {\n\tpanic(\"cannot create page from parquet group\")\n}\n\nfunc (t groupType) NewValues(_ []byte, _ []uint32) encoding.Values {\n\tpanic(\"cannot create values from parquet group\")\n}\n\nfunc (groupType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) {\n\tpanic(\"cannot encode parquet group\")\n}\n\nfunc (groupType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) {\n\tpanic(\"cannot decode parquet group\")\n}\n\nfunc (groupType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int {\n\tpanic(\"cannot estimate decode size of parquet group\")\n}\n\nfunc (groupType) AssignValue(reflect.Value, Value) error {\n\tpanic(\"cannot assign value to a parquet group\")\n}\n\nfunc (t groupType) ConvertValue(Value, Type) (Value, error) {\n\tpanic(\"cannot convert value to a parquet group\")\n}\n\nfunc (groupType) Length() int { return 0 }\n\nfunc (groupType) EstimateSize(int) int { return 0 }\n\nfunc (groupType) EstimateNumValues(int) int { return 0 }\n\nfunc (groupType) ColumnOrder() *format.ColumnOrder { return nil }\n\nfunc (groupType) PhysicalType() *format.Type { return nil }\n\nfunc (groupType) LogicalType() *format.LogicalType { return nil }\n\nfunc (groupType) ConvertedType() *deprecated.ConvertedType { return nil }\n\nfunc checkTypeKindEqual(to, from Type) error {\n\tif to.Kind() != from.Kind() {\n\t\treturn fmt.Errorf(\"cannot convert from parquet value of type %s to %s\", from, to)\n\t}\n\treturn nil\n}\n"
  },
  {
    "path": "value.go",
    "content": "package parquet\n\nimport (\n\t\"bytes\"\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"io\"\n\t\"math\"\n\t\"reflect\"\n\t\"strconv\"\n\t\"time\"\n\t\"unsafe\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n\t\"github.com/segmentio/parquet-go/format\"\n\t\"github.com/segmentio/parquet-go/internal/unsafecast\"\n)\n\nconst (\n\t// 170 x sizeof(Value) = 4KB\n\tdefaultValueBufferSize = 170\n)\n\n// The Value type is similar to the reflect.Value abstraction of Go values, but\n// for parquet values. Value instances wrap underlying Go values mapped to one\n// of the parquet physical types.\n//\n// Value instances are small, immutable objects, and usually passed by value\n// between function calls.\n//\n// The zero-value of Value represents the null parquet value.\ntype Value struct {\n\t// data\n\tptr *byte\n\tu64 uint64\n\t// type\n\tkind int8 // XOR(Kind) so the zero-value is <null>\n\t// levels\n\tdefinitionLevel byte\n\trepetitionLevel byte\n\tcolumnIndex     int16 // XOR so the zero-value is -1\n}\n\n// ValueReader is an interface implemented by types that support reading\n// batches of values.\ntype ValueReader interface {\n\t// Read values into the buffer passed as argument and return the number of\n\t// values read. 
When all values have been read, the error will be io.EOF.\n\tReadValues([]Value) (int, error)\n}\n\n// ValueReaderAt is an interface implemented by types that support reading\n// values at offsets specified by the application.\ntype ValueReaderAt interface {\n\tReadValuesAt([]Value, int64) (int, error)\n}\n\n// ValueReaderFrom is an interface implemented by value writers to read values\n// from a reader.\ntype ValueReaderFrom interface {\n\tReadValuesFrom(ValueReader) (int64, error)\n}\n\n// ValueWriter is an interface implemented by types that support reading\n// batches of values.\ntype ValueWriter interface {\n\t// Write values from the buffer passed as argument and returns the number\n\t// of values written.\n\tWriteValues([]Value) (int, error)\n}\n\n// ValueWriterTo is an interface implemented by value readers to write values to\n// a writer.\ntype ValueWriterTo interface {\n\tWriteValuesTo(ValueWriter) (int64, error)\n}\n\n// ValueReaderFunc is a function type implementing the ValueReader interface.\ntype ValueReaderFunc func([]Value) (int, error)\n\nfunc (f ValueReaderFunc) ReadValues(values []Value) (int, error) { return f(values) }\n\n// ValueWriterFunc is a function type implementing the ValueWriter interface.\ntype ValueWriterFunc func([]Value) (int, error)\n\nfunc (f ValueWriterFunc) WriteValues(values []Value) (int, error) { return f(values) }\n\n// CopyValues copies values from src to dst, returning the number of values\n// that were written.\n//\n// As an optimization, the reader and writer may choose to implement\n// ValueReaderFrom and ValueWriterTo to provide their own copy logic.\n//\n// The function returns any error it encounters reading or writing pages, except\n// for io.EOF from the reader which indicates that there were no more values to\n// read.\nfunc CopyValues(dst ValueWriter, src ValueReader) (int64, error) {\n\treturn copyValues(dst, src, nil)\n}\n\nfunc copyValues(dst ValueWriter, src ValueReader, buf []Value) (written int64, err 
error) {\n\tif wt, ok := src.(ValueWriterTo); ok {\n\t\treturn wt.WriteValuesTo(dst)\n\t}\n\n\tif rf, ok := dst.(ValueReaderFrom); ok {\n\t\treturn rf.ReadValuesFrom(src)\n\t}\n\n\tif len(buf) == 0 {\n\t\tbuf = make([]Value, defaultValueBufferSize)\n\t}\n\n\tdefer clearValues(buf)\n\n\tfor {\n\t\tn, err := src.ReadValues(buf)\n\n\t\tif n > 0 {\n\t\t\twn, werr := dst.WriteValues(buf[:n])\n\t\t\twritten += int64(wn)\n\t\t\tif werr != nil {\n\t\t\t\treturn written, werr\n\t\t\t}\n\t\t}\n\n\t\tif err != nil {\n\t\t\tif err == io.EOF {\n\t\t\t\terr = nil\n\t\t\t}\n\t\t\treturn written, err\n\t\t}\n\n\t\tif n == 0 {\n\t\t\treturn written, io.ErrNoProgress\n\t\t}\n\t}\n}\n\n// ValueOf constructs a parquet value from a Go value v.\n//\n// The physical type of the value is assumed from the Go type of v using the\n// following conversion table:\n//\n//\tGo type | Parquet physical type\n//\t------- | ---------------------\n//\tnil     | NULL\n//\tbool    | BOOLEAN\n//\tint8    | INT32\n//\tint16   | INT32\n//\tint32   | INT32\n//\tint64   | INT64\n//\tint     | INT64\n//\tuint8   | INT32\n//\tuint16  | INT32\n//\tuint32  | INT32\n//\tuint64  | INT64\n//\tuintptr | INT64\n//\tfloat32 | FLOAT\n//\tfloat64 | DOUBLE\n//\tstring  | BYTE_ARRAY\n//\t[]byte  | BYTE_ARRAY\n//\t[*]byte | FIXED_LEN_BYTE_ARRAY\n//\n// When converting a []byte or [*]byte value, the underlying byte array is not\n// copied; instead, the returned parquet value holds a reference to it.\n//\n// The repetition and definition levels of the returned value are both zero.\n//\n// The function panics if the Go value cannot be represented in parquet.\nfunc ValueOf(v interface{}) Value {\n\tk := Kind(-1)\n\tt := reflect.TypeOf(v)\n\n\tswitch value := v.(type) {\n\tcase nil:\n\t\treturn Value{}\n\tcase uuid.UUID:\n\t\treturn makeValueBytes(FixedLenByteArray, value[:])\n\tcase deprecated.Int96:\n\t\treturn makeValueInt96(value)\n\tcase time.Time:\n\t\tk = Int64\n\t}\n\n\tswitch t.Kind() {\n\tcase reflect.Bool:\n\t\tk = 
Boolean\n\tcase reflect.Int8, reflect.Int16, reflect.Int32, reflect.Uint8, reflect.Uint16, reflect.Uint32:\n\t\tk = Int32\n\tcase reflect.Int64, reflect.Int, reflect.Uint64, reflect.Uint, reflect.Uintptr:\n\t\tk = Int64\n\tcase reflect.Float32:\n\t\tk = Float\n\tcase reflect.Float64:\n\t\tk = Double\n\tcase reflect.String:\n\t\tk = ByteArray\n\tcase reflect.Slice:\n\t\tif t.Elem().Kind() == reflect.Uint8 {\n\t\t\tk = ByteArray\n\t\t}\n\tcase reflect.Array:\n\t\tif t.Elem().Kind() == reflect.Uint8 {\n\t\t\tk = FixedLenByteArray\n\t\t}\n\t}\n\n\tif k < 0 {\n\t\tpanic(\"cannot create parquet value from go value of type \" + t.String())\n\t}\n\n\treturn makeValue(k, nil, reflect.ValueOf(v))\n}\n\n// NullValue constructs a null value, which is the zero-value of the Value type.\nfunc NullValue() Value { return Value{} }\n\n// ZeroValue constructs a zero value of the given kind.\nfunc ZeroValue(kind Kind) Value { return makeValueKind(kind) }\n\n// BooleanValue constructs a BOOLEAN parquet value from the bool passed as\n// argument.\nfunc BooleanValue(value bool) Value { return makeValueBoolean(value) }\n\n// Int32Value constructs a INT32 parquet value from the int32 passed as\n// argument.\nfunc Int32Value(value int32) Value { return makeValueInt32(value) }\n\n// Int64Value constructs a INT64 parquet value from the int64 passed as\n// argument.\nfunc Int64Value(value int64) Value { return makeValueInt64(value) }\n\n// Int96Value constructs a INT96 parquet value from the deprecated.Int96 passed\n// as argument.\nfunc Int96Value(value deprecated.Int96) Value { return makeValueInt96(value) }\n\n// FloatValue constructs a FLOAT parquet value from the float32 passed as\n// argument.\nfunc FloatValue(value float32) Value { return makeValueFloat(value) }\n\n// DoubleValue constructs a DOUBLE parquet value from the float64 passed as\n// argument.\nfunc DoubleValue(value float64) Value { return makeValueDouble(value) }\n\n// ByteArrayValue constructs a BYTE_ARRAY parquet value 
from the byte slice\n// passed as argument.\nfunc ByteArrayValue(value []byte) Value { return makeValueBytes(ByteArray, value) }\n\n// FixedLenByteArrayValue constructs a FIXED_LEN_BYTE_ARRAY parquet value from the byte\n// slice passed as argument.\nfunc FixedLenByteArrayValue(value []byte) Value { return makeValueBytes(FixedLenByteArray, value) }\n\nfunc makeValue(k Kind, lt *format.LogicalType, v reflect.Value) Value {\n\tswitch v.Type() {\n\tcase reflect.TypeOf(time.Time{}):\n\t\tunit := Nanosecond.TimeUnit()\n\t\tif lt != nil && lt.Timestamp != nil {\n\t\t\tunit = lt.Timestamp.Unit\n\t\t}\n\n\t\tt := v.Interface().(time.Time)\n\t\tvar val int64\n\t\tswitch {\n\t\tcase unit.Millis != nil:\n\t\t\tval = t.UnixMilli()\n\t\tcase unit.Micros != nil:\n\t\t\tval = t.UnixMicro()\n\t\tdefault:\n\t\t\tval = t.UnixNano()\n\t\t}\n\t\treturn makeValueInt64(val)\n\t}\n\n\tswitch k {\n\tcase Boolean:\n\t\treturn makeValueBoolean(v.Bool())\n\n\tcase Int32:\n\t\tswitch v.Kind() {\n\t\tcase reflect.Int8, reflect.Int16, reflect.Int32:\n\t\t\treturn makeValueInt32(int32(v.Int()))\n\t\tcase reflect.Uint8, reflect.Uint16, reflect.Uint32:\n\t\t\treturn makeValueInt32(int32(v.Uint()))\n\t\t}\n\n\tcase Int64:\n\t\tswitch v.Kind() {\n\t\tcase reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:\n\t\t\treturn makeValueInt64(v.Int())\n\t\tcase reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint, reflect.Uintptr:\n\t\t\treturn makeValueUint64(v.Uint())\n\t\t}\n\n\tcase Int96:\n\t\tswitch v.Type() {\n\t\tcase reflect.TypeOf(deprecated.Int96{}):\n\t\t\treturn makeValueInt96(v.Interface().(deprecated.Int96))\n\t\t}\n\n\tcase Float:\n\t\tswitch v.Kind() {\n\t\tcase reflect.Float32:\n\t\t\treturn makeValueFloat(float32(v.Float()))\n\t\t}\n\n\tcase Double:\n\t\tswitch v.Kind() {\n\t\tcase reflect.Float32, reflect.Float64:\n\t\t\treturn makeValueDouble(v.Float())\n\t\t}\n\n\tcase ByteArray:\n\t\tswitch v.Kind() {\n\t\tcase reflect.String:\n\t\t\treturn 
makeValueString(k, v.String())\n\t\tcase reflect.Slice:\n\t\t\tif v.Type().Elem().Kind() == reflect.Uint8 {\n\t\t\t\treturn makeValueBytes(k, v.Bytes())\n\t\t\t}\n\t\t}\n\n\tcase FixedLenByteArray:\n\t\tswitch v.Kind() {\n\t\tcase reflect.String: // uuid\n\t\t\treturn makeValueString(k, v.String())\n\t\tcase reflect.Array:\n\t\t\tif v.Type().Elem().Kind() == reflect.Uint8 {\n\t\t\t\treturn makeValueFixedLenByteArray(v)\n\t\t\t}\n\t\tcase reflect.Slice:\n\t\t\tif v.Type().Elem().Kind() == reflect.Uint8 {\n\t\t\t\treturn makeValueBytes(k, v.Bytes())\n\t\t\t}\n\t\t}\n\t}\n\n\tpanic(\"cannot create parquet value of type \" + k.String() + \" from go value of type \" + v.Type().String())\n}\n\nfunc makeValueKind(kind Kind) Value {\n\treturn Value{kind: ^int8(kind)}\n}\n\nfunc makeValueBoolean(value bool) Value {\n\tv := Value{kind: ^int8(Boolean)}\n\tif value {\n\t\tv.u64 = 1\n\t}\n\treturn v\n}\n\nfunc makeValueInt32(value int32) Value {\n\treturn Value{\n\t\tkind: ^int8(Int32),\n\t\tu64:  uint64(value),\n\t}\n}\n\nfunc makeValueInt64(value int64) Value {\n\treturn Value{\n\t\tkind: ^int8(Int64),\n\t\tu64:  uint64(value),\n\t}\n}\n\nfunc makeValueInt96(value deprecated.Int96) Value {\n\t// TODO: this is highly inefficient because we need a heap allocation to\n\t// store the value; we don't expect INT96 to be used frequently since it\n\t// is a deprecated feature of parquet, and it helps keep the Value type\n\t// compact for all the other more common cases.\n\tbits := [12]byte{}\n\tbinary.LittleEndian.PutUint32(bits[0:4], value[0])\n\tbinary.LittleEndian.PutUint32(bits[4:8], value[1])\n\tbinary.LittleEndian.PutUint32(bits[8:12], value[2])\n\treturn Value{\n\t\tkind: ^int8(Int96),\n\t\tptr:  &bits[0],\n\t\tu64:  12, // set the length so we can use the ByteArray method\n\t}\n}\n\nfunc makeValueUint32(value uint32) Value {\n\treturn Value{\n\t\tkind: ^int8(Int32),\n\t\tu64:  uint64(value),\n\t}\n}\n\nfunc makeValueUint64(value uint64) Value {\n\treturn Value{\n\t\tkind: 
^int8(Int64),\n\t\tu64:  value,\n\t}\n}\n\nfunc makeValueFloat(value float32) Value {\n\treturn Value{\n\t\tkind: ^int8(Float),\n\t\tu64:  uint64(math.Float32bits(value)),\n\t}\n}\n\nfunc makeValueDouble(value float64) Value {\n\treturn Value{\n\t\tkind: ^int8(Double),\n\t\tu64:  math.Float64bits(value),\n\t}\n}\n\nfunc makeValueBytes(kind Kind, value []byte) Value {\n\treturn makeValueByteArray(kind, unsafecast.AddressOfBytes(value), len(value))\n}\n\nfunc makeValueString(kind Kind, value string) Value {\n\treturn makeValueByteArray(kind, unsafecast.AddressOfString(value), len(value))\n}\n\nfunc makeValueFixedLenByteArray(v reflect.Value) Value {\n\tt := v.Type()\n\t// When the array is addressable, we take advantage of this\n\t// condition to avoid the heap allocation otherwise needed\n\t// to pack the reference into an interface{} value.\n\tif v.CanAddr() {\n\t\tv = v.Addr()\n\t} else {\n\t\tu := reflect.New(t)\n\t\tu.Elem().Set(v)\n\t\tv = u\n\t}\n\treturn makeValueByteArray(FixedLenByteArray, (*byte)(unsafePointer(v)), t.Len())\n}\n\nfunc makeValueByteArray(kind Kind, data *byte, size int) Value {\n\treturn Value{\n\t\tkind: ^int8(kind),\n\t\tptr:  data,\n\t\tu64:  uint64(size),\n\t}\n}\n\n// These methods are internal versions of methods exported by the Value type,\n// they are usually inlined by the compiler and intended to be used inside the\n// parquet-go package because they tend to generate better code than their\n// exported counter part, which requires making a copy of the receiver.\nfunc (v *Value) isNull() bool            { return v.kind == 0 }\nfunc (v *Value) byte() byte              { return byte(v.u64) }\nfunc (v *Value) boolean() bool           { return v.u64 != 0 }\nfunc (v *Value) int32() int32            { return int32(v.u64) }\nfunc (v *Value) int64() int64            { return int64(v.u64) }\nfunc (v *Value) int96() deprecated.Int96 { return makeInt96(v.byteArray()) }\nfunc (v *Value) float() float32          { return 
math.Float32frombits(uint32(v.u64)) }\nfunc (v *Value) double() float64         { return math.Float64frombits(uint64(v.u64)) }\nfunc (v *Value) uint32() uint32          { return uint32(v.u64) }\nfunc (v *Value) uint64() uint64          { return v.u64 }\nfunc (v *Value) byteArray() []byte       { return unsafecast.Bytes(v.ptr, int(v.u64)) }\nfunc (v *Value) string() string          { return unsafecast.BytesToString(v.byteArray()) }\nfunc (v *Value) be128() *[16]byte        { return (*[16]byte)(unsafe.Pointer(v.ptr)) }\nfunc (v *Value) column() int             { return int(^v.columnIndex) }\n\nfunc (v Value) convertToBoolean(x bool) Value {\n\tv.kind = ^int8(Boolean)\n\tv.ptr = nil\n\tv.u64 = 0\n\tif x {\n\t\tv.u64 = 1\n\t}\n\treturn v\n}\n\nfunc (v Value) convertToInt32(x int32) Value {\n\tv.kind = ^int8(Int32)\n\tv.ptr = nil\n\tv.u64 = uint64(x)\n\treturn v\n}\n\nfunc (v Value) convertToInt64(x int64) Value {\n\tv.kind = ^int8(Int64)\n\tv.ptr = nil\n\tv.u64 = uint64(x)\n\treturn v\n}\n\nfunc (v Value) convertToInt96(x deprecated.Int96) Value {\n\ti96 := makeValueInt96(x)\n\tv.kind = i96.kind\n\tv.ptr = i96.ptr\n\tv.u64 = i96.u64\n\treturn v\n}\n\nfunc (v Value) convertToFloat(x float32) Value {\n\tv.kind = ^int8(Float)\n\tv.ptr = nil\n\tv.u64 = uint64(math.Float32bits(x))\n\treturn v\n}\n\nfunc (v Value) convertToDouble(x float64) Value {\n\tv.kind = ^int8(Double)\n\tv.ptr = nil\n\tv.u64 = math.Float64bits(x)\n\treturn v\n}\n\nfunc (v Value) convertToByteArray(x []byte) Value {\n\tv.kind = ^int8(ByteArray)\n\tv.ptr = unsafecast.AddressOfBytes(x)\n\tv.u64 = uint64(len(x))\n\treturn v\n}\n\nfunc (v Value) convertToFixedLenByteArray(x []byte) Value {\n\tv.kind = ^int8(FixedLenByteArray)\n\tv.ptr = unsafecast.AddressOfBytes(x)\n\tv.u64 = uint64(len(x))\n\treturn v\n}\n\n// Kind returns the kind of v, which represents its parquet physical type.\nfunc (v Value) Kind() Kind { return ^Kind(v.kind) }\n\n// IsNull returns true if v is the null value.\nfunc (v Value) IsNull() 
bool { return v.isNull() }\n\n// Byte returns v as a byte, which may truncate the underlying byte.\nfunc (v Value) Byte() byte { return v.byte() }\n\n// Boolean returns v as a bool, assuming the underlying type is BOOLEAN.\nfunc (v Value) Boolean() bool { return v.boolean() }\n\n// Int32 returns v as a int32, assuming the underlying type is INT32.\nfunc (v Value) Int32() int32 { return v.int32() }\n\n// Int64 returns v as a int64, assuming the underlying type is INT64.\nfunc (v Value) Int64() int64 { return v.int64() }\n\n// Int96 returns v as a int96, assuming the underlying type is INT96.\nfunc (v Value) Int96() deprecated.Int96 {\n\tvar val deprecated.Int96\n\tif !v.isNull() {\n\t\tval = v.int96()\n\t}\n\treturn val\n}\n\n// Float returns v as a float32, assuming the underlying type is FLOAT.\nfunc (v Value) Float() float32 { return v.float() }\n\n// Double returns v as a float64, assuming the underlying type is DOUBLE.\nfunc (v Value) Double() float64 { return v.double() }\n\n// Uint32 returns v as a uint32, assuming the underlying type is INT32.\nfunc (v Value) Uint32() uint32 { return v.uint32() }\n\n// Uint64 returns v as a uint64, assuming the underlying type is INT64.\nfunc (v Value) Uint64() uint64 { return v.uint64() }\n\n// ByteArray returns v as a []byte, assuming the underlying type is either\n// BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY.\n//\n// The application must treat the returned byte slice as a read-only value,\n// mutating the content will result in undefined behaviors.\nfunc (v Value) ByteArray() []byte { return v.byteArray() }\n\n// RepetitionLevel returns the repetition level of v.\nfunc (v Value) RepetitionLevel() int { return int(v.repetitionLevel) }\n\n// DefinitionLevel returns the definition level of v.\nfunc (v Value) DefinitionLevel() int { return int(v.definitionLevel) }\n\n// Column returns the column index within the row that v was created from.\n//\n// Returns -1 if the value does not carry a column index.\nfunc (v Value) Column() int { 
return v.column() }\n\n// Bytes returns the binary representation of v.\n//\n// If v is the null value, a nil byte slice is returned.\nfunc (v Value) Bytes() []byte {\n\tswitch v.Kind() {\n\tcase Boolean:\n\t\tbuf := [8]byte{}\n\t\tbinary.LittleEndian.PutUint32(buf[:4], v.uint32())\n\t\treturn buf[0:1]\n\tcase Int32, Float:\n\t\tbuf := [8]byte{}\n\t\tbinary.LittleEndian.PutUint32(buf[:4], v.uint32())\n\t\treturn buf[:4]\n\tcase Int64, Double:\n\t\tbuf := [8]byte{}\n\t\tbinary.LittleEndian.PutUint64(buf[:8], v.uint64())\n\t\treturn buf[:8]\n\tcase ByteArray, FixedLenByteArray, Int96:\n\t\treturn v.byteArray()\n\tdefault:\n\t\treturn nil\n\t}\n}\n\n// AppendBytes appends the binary representation of v to b.\n//\n// If v is the null value, b is returned unchanged.\nfunc (v Value) AppendBytes(b []byte) []byte {\n\tbuf := [8]byte{}\n\tswitch v.Kind() {\n\tcase Boolean:\n\t\tbinary.LittleEndian.PutUint32(buf[:4], v.uint32())\n\t\treturn append(b, buf[0])\n\tcase Int32, Float:\n\t\tbinary.LittleEndian.PutUint32(buf[:4], v.uint32())\n\t\treturn append(b, buf[:4]...)\n\tcase Int64, Double:\n\t\tbinary.LittleEndian.PutUint64(buf[:8], v.uint64())\n\t\treturn append(b, buf[:8]...)\n\tcase ByteArray, FixedLenByteArray, Int96:\n\t\treturn append(b, v.byteArray()...)\n\tdefault:\n\t\treturn b\n\t}\n}\n\n// Format outputs a human-readable representation of v to w, using r as the\n// formatting verb to describe how the value should be printed.\n//\n// The following formatting options are supported:\n//\n//\t%c\tprints the column index\n//\t%+c\tprints the column index, prefixed with \"C:\"\n//\t%d\tprints the definition level\n//\t%+d\tprints the definition level, prefixed with \"D:\"\n//\t%r\tprints the repetition level\n//\t%+r\tprints the repetition level, prefixed with \"R:\"\n//\t%q\tprints the quoted representation of v\n//\t%+q\tprints the quoted representation of v, prefixed with \"V:\"\n//\t%s\tprints the string representation of v\n//\t%+s\tprints the string 
representation of v, prefixed with \"V:\"\n//\t%v\tsame as %s\n//\t%+v\tprints a verbose representation of v\n//\t%#v\tprints a Go value representation of v\n//\n// Format satisfies the fmt.Formatter interface.\nfunc (v Value) Format(w fmt.State, r rune) {\n\tswitch r {\n\tcase 'c':\n\t\tif w.Flag('+') {\n\t\t\tio.WriteString(w, \"C:\")\n\t\t}\n\t\tfmt.Fprint(w, v.column())\n\n\tcase 'd':\n\t\tif w.Flag('+') {\n\t\t\tio.WriteString(w, \"D:\")\n\t\t}\n\t\tfmt.Fprint(w, v.definitionLevel)\n\n\tcase 'r':\n\t\tif w.Flag('+') {\n\t\t\tio.WriteString(w, \"R:\")\n\t\t}\n\t\tfmt.Fprint(w, v.repetitionLevel)\n\n\tcase 'q':\n\t\tif w.Flag('+') {\n\t\t\tio.WriteString(w, \"V:\")\n\t\t}\n\t\tswitch v.Kind() {\n\t\tcase ByteArray, FixedLenByteArray:\n\t\t\tfmt.Fprintf(w, \"%q\", v.byteArray())\n\t\tdefault:\n\t\t\tfmt.Fprintf(w, `\"%s\"`, v)\n\t\t}\n\n\tcase 's':\n\t\tif w.Flag('+') {\n\t\t\tio.WriteString(w, \"V:\")\n\t\t}\n\t\tswitch v.Kind() {\n\t\tcase Boolean:\n\t\t\tfmt.Fprint(w, v.boolean())\n\t\tcase Int32:\n\t\t\tfmt.Fprint(w, v.int32())\n\t\tcase Int64:\n\t\t\tfmt.Fprint(w, v.int64())\n\t\tcase Int96:\n\t\t\tfmt.Fprint(w, v.int96())\n\t\tcase Float:\n\t\t\tfmt.Fprint(w, v.float())\n\t\tcase Double:\n\t\t\tfmt.Fprint(w, v.double())\n\t\tcase ByteArray, FixedLenByteArray:\n\t\t\tw.Write(v.byteArray())\n\t\tdefault:\n\t\t\tio.WriteString(w, \"<null>\")\n\t\t}\n\n\tcase 'v':\n\t\tswitch {\n\t\tcase w.Flag('+'):\n\t\t\tfmt.Fprintf(w, \"%+[1]c %+[1]d %+[1]r %+[1]s\", v)\n\t\tcase w.Flag('#'):\n\t\t\tv.formatGoString(w)\n\t\tdefault:\n\t\t\tv.Format(w, 's')\n\t\t}\n\t}\n}\n\nfunc (v Value) formatGoString(w fmt.State) {\n\tio.WriteString(w, \"parquet.\")\n\tswitch v.Kind() {\n\tcase Boolean:\n\t\tfmt.Fprintf(w, \"BooleanValue(%t)\", v.boolean())\n\tcase Int32:\n\t\tfmt.Fprintf(w, \"Int32Value(%d)\", v.int32())\n\tcase Int64:\n\t\tfmt.Fprintf(w, \"Int64Value(%d)\", v.int64())\n\tcase Int96:\n\t\tfmt.Fprintf(w, \"Int96Value(%#v)\", v.int96())\n\tcase Float:\n\t\tfmt.Fprintf(w, 
\"FloatValue(%g)\", v.float())\n\tcase Double:\n\t\tfmt.Fprintf(w, \"DoubleValue(%g)\", v.double())\n\tcase ByteArray:\n\t\tfmt.Fprintf(w, \"ByteArrayValue(%q)\", v.byteArray())\n\tcase FixedLenByteArray:\n\t\tfmt.Fprintf(w, \"FixedLenByteArrayValue(%#v)\", v.byteArray())\n\tdefault:\n\t\tio.WriteString(w, \"Value{}\")\n\t\treturn\n\t}\n\tfmt.Fprintf(w, \".Level(%d,%d,%d)\",\n\t\tv.RepetitionLevel(),\n\t\tv.DefinitionLevel(),\n\t\tv.Column(),\n\t)\n}\n\n// String returns a string representation of v.\nfunc (v Value) String() string {\n\tswitch v.Kind() {\n\tcase Boolean:\n\t\treturn strconv.FormatBool(v.boolean())\n\tcase Int32:\n\t\treturn strconv.FormatInt(int64(v.int32()), 10)\n\tcase Int64:\n\t\treturn strconv.FormatInt(v.int64(), 10)\n\tcase Int96:\n\t\treturn v.Int96().String()\n\tcase Float:\n\t\treturn strconv.FormatFloat(float64(v.float()), 'g', -1, 32)\n\tcase Double:\n\t\treturn strconv.FormatFloat(v.double(), 'g', -1, 64)\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn string(v.byteArray())\n\tdefault:\n\t\treturn \"<null>\"\n\t}\n}\n\n// GoString returns a Go value string representation of v.\nfunc (v Value) GoString() string { return fmt.Sprintf(\"%#v\", v) }\n\n// Level returns v with the repetition level, definition level, and column index\n// set to the values passed as arguments.\n//\n// The method panics if either argument is negative.\nfunc (v Value) Level(repetitionLevel, definitionLevel, columnIndex int) Value {\n\tv.repetitionLevel = makeRepetitionLevel(repetitionLevel)\n\tv.definitionLevel = makeDefinitionLevel(definitionLevel)\n\tv.columnIndex = ^makeColumnIndex(columnIndex)\n\treturn v\n}\n\n// Clone returns a copy of v which does not share any pointers with it.\nfunc (v Value) Clone() Value {\n\tswitch k := v.Kind(); k {\n\tcase ByteArray, FixedLenByteArray:\n\t\tv.ptr = unsafecast.AddressOfBytes(copyBytes(v.byteArray()))\n\t}\n\treturn v\n}\n\nfunc makeInt96(bits []byte) (i96 deprecated.Int96) {\n\treturn deprecated.Int96{\n\t\t2: 
binary.LittleEndian.Uint32(bits[8:12]),\n\t\t1: binary.LittleEndian.Uint32(bits[4:8]),\n\t\t0: binary.LittleEndian.Uint32(bits[0:4]),\n\t}\n}\n\nfunc parseValue(kind Kind, data []byte) (val Value, err error) {\n\tswitch kind {\n\tcase Boolean:\n\t\tif len(data) == 1 {\n\t\t\tval = makeValueBoolean(data[0] != 0)\n\t\t}\n\tcase Int32:\n\t\tif len(data) == 4 {\n\t\t\tval = makeValueInt32(int32(binary.LittleEndian.Uint32(data)))\n\t\t}\n\tcase Int64:\n\t\tif len(data) == 8 {\n\t\t\tval = makeValueInt64(int64(binary.LittleEndian.Uint64(data)))\n\t\t}\n\tcase Int96:\n\t\tif len(data) == 12 {\n\t\t\tval = makeValueInt96(makeInt96(data))\n\t\t}\n\tcase Float:\n\t\tif len(data) == 4 {\n\t\t\tval = makeValueFloat(float32(math.Float32frombits(binary.LittleEndian.Uint32(data))))\n\t\t}\n\tcase Double:\n\t\tif len(data) == 8 {\n\t\t\tval = makeValueDouble(float64(math.Float64frombits(binary.LittleEndian.Uint64(data))))\n\t\t}\n\tcase ByteArray, FixedLenByteArray:\n\t\tval = makeValueBytes(kind, data)\n\t}\n\tif val.isNull() {\n\t\terr = fmt.Errorf(\"cannot decode %s value from input of length %d\", kind, len(data))\n\t}\n\treturn val, err\n}\n\nfunc copyBytes(b []byte) []byte {\n\tc := make([]byte, len(b))\n\tcopy(c, b)\n\treturn c\n}\n\n// Equal returns true if v1 and v2 are equal.\n//\n// Values are considered equal if they are of the same physical type and hold\n// the same Go values. 
For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY, the content of\n// the underlying byte arrays are tested for equality.\n//\n// Note that the repetition levels, definition levels, and column indexes are\n// not compared by this function, use DeepEqual instead.\nfunc Equal(v1, v2 Value) bool {\n\tif v1.kind != v2.kind {\n\t\treturn false\n\t}\n\tswitch ^Kind(v1.kind) {\n\tcase Boolean:\n\t\treturn v1.boolean() == v2.boolean()\n\tcase Int32:\n\t\treturn v1.int32() == v2.int32()\n\tcase Int64:\n\t\treturn v1.int64() == v2.int64()\n\tcase Int96:\n\t\treturn v1.int96() == v2.int96()\n\tcase Float:\n\t\treturn v1.float() == v2.float()\n\tcase Double:\n\t\treturn v1.double() == v2.double()\n\tcase ByteArray, FixedLenByteArray:\n\t\treturn bytes.Equal(v1.byteArray(), v2.byteArray())\n\tcase -1: // null\n\t\treturn true\n\tdefault:\n\t\treturn false\n\t}\n}\n\n// DeepEqual returns true if v1 and v2 are equal, including their repetition\n// levels, definition levels, and column indexes.\n//\n// See Equal for details about how value equality is determined.\nfunc DeepEqual(v1, v2 Value) bool {\n\treturn Equal(v1, v2) &&\n\t\tv1.repetitionLevel == v2.repetitionLevel &&\n\t\tv1.definitionLevel == v2.definitionLevel &&\n\t\tv1.columnIndex == v2.columnIndex\n}\n\nvar (\n\t_ fmt.Formatter = Value{}\n\t_ fmt.Stringer  = Value{}\n)\n\nfunc clearValues(values []Value) {\n\tfor i := range values {\n\t\tvalues[i] = Value{}\n\t}\n}\n\n// BooleanReader is an interface implemented by ValueReader instances which\n// expose the content of a column of boolean values.\ntype BooleanReader interface {\n\t// Read boolean values into the buffer passed as argument.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\tReadBooleans(values []bool) (int, error)\n}\n\n// BooleanWriter is an interface implemented by ValueWriter instances which\n// support writing columns of boolean values.\ntype BooleanWriter interface {\n\t// Write boolean values.\n\t//\n\t// The method returns the number of 
values written, and any error that\n\t// occurred while writing the values.\n\tWriteBooleans(values []bool) (int, error)\n}\n\n// Int32Reader is an interface implemented by ValueReader instances which expose\n// the content of a column of int32 values.\ntype Int32Reader interface {\n\t// Read 32 bits integer values into the buffer passed as argument.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\tReadInt32s(values []int32) (int, error)\n}\n\n// Int32Writer is an interface implemented by ValueWriter instances which\n// support writing columns of 32 bits signed integer values.\ntype Int32Writer interface {\n\t// Write 32 bits signed integer values.\n\t//\n\t// The method returns the number of values written, and any error that\n\t// occurred while writing the values.\n\tWriteInt32s(values []int32) (int, error)\n}\n\n// Int64Reader is an interface implemented by ValueReader instances which expose\n// the content of a column of int64 values.\ntype Int64Reader interface {\n\t// Read 64 bits integer values into the buffer passed as argument.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\tReadInt64s(values []int64) (int, error)\n}\n\n// Int64Writer is an interface implemented by ValueWriter instances which\n// support writing columns of 64 bits signed integer values.\ntype Int64Writer interface {\n\t// Write 64 bits signed integer values.\n\t//\n\t// The method returns the number of values written, and any error that\n\t// occurred while writing the values.\n\tWriteInt64s(values []int64) (int, error)\n}\n\n// Int96Reader is an interface implemented by ValueReader instances which expose\n// the content of a column of int96 values.\ntype Int96Reader interface {\n\t// Read 96 bits integer values into the buffer passed as argument.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\tReadInt96s(values []deprecated.Int96) (int, error)\n}\n\n// Int96Writer is an interface implemented by ValueWriter 
instances which\n// support writing columns of 96 bits signed integer values.\ntype Int96Writer interface {\n\t// Write 96 bits signed integer values.\n\t//\n\t// The method returns the number of values written, and any error that\n\t// occurred while writing the values.\n\tWriteInt96s(values []deprecated.Int96) (int, error)\n}\n\n// FloatReader is an interface implemented by ValueReader instances which expose\n// the content of a column of single-precision floating point values.\ntype FloatReader interface {\n\t// Read single-precision floating point values into the buffer passed as\n\t// argument.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\tReadFloats(values []float32) (int, error)\n}\n\n// FloatWriter is an interface implemented by ValueWriter instances which\n// support writing columns of single-precision floating point values.\ntype FloatWriter interface {\n\t// Write single-precision floating point values.\n\t//\n\t// The method returns the number of values written, and any error that\n\t// occurred while writing the values.\n\tWriteFloats(values []float32) (int, error)\n}\n\n// DoubleReader is an interface implemented by ValueReader instances which\n// expose the content of a column of double-precision float point values.\ntype DoubleReader interface {\n\t// Read double-precision floating point values into the buffer passed as\n\t// argument.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\tReadDoubles(values []float64) (int, error)\n}\n\n// DoubleWriter is an interface implemented by ValueWriter instances which\n// support writing columns of double-precision floating point values.\ntype DoubleWriter interface {\n\t// Write double-precision floating point values.\n\t//\n\t// The method returns the number of values written, and any error that\n\t// occurred while writing the values.\n\tWriteDoubles(values []float64) (int, error)\n}\n\n// ByteArrayReader is an interface implemented by ValueReader instances 
which\n// expose the content of a column of variable length byte array values.\ntype ByteArrayReader interface {\n\t// Read values into the byte buffer passed as argument, returning the number\n\t// of values written to the buffer (not the number of bytes). Values are\n\t// written using the PLAIN encoding, each byte array prefixed with its\n\t// length encoded as a 4 bytes little endian unsigned integer.\n\t//\n\t// The method returns io.EOF when all values have been read.\n\t//\n\t// If the buffer was not empty, but too small to hold at least one value,\n\t// io.ErrShortBuffer is returned.\n\tReadByteArrays(values []byte) (int, error)\n}\n\n// ByteArrayWriter is an interface implemented by ValueWriter instances which\n// support writing columns of variable length byte array values.\ntype ByteArrayWriter interface {\n\t// Write variable length byte array values.\n\t//\n\t// The values passed as input must be laid out using the PLAIN encoding,\n\t// with each byte array prefixed with the four bytes little endian unsigned\n\t// integer length.\n\t//\n\t// The method returns the number of values written to the underlying column\n\t// (not the number of bytes), or any error that occurred while attempting to\n\t// write the values.\n\tWriteByteArrays(values []byte) (int, error)\n}\n\n// FixedLenByteArrayReader is an interface implemented by ValueReader instances\n// which expose the content of a column of fixed length byte array values.\ntype FixedLenByteArrayReader interface {\n\t// Read values into the byte buffer passed as argument, returning the number\n\t// of values written to the buffer (not the number of bytes).\n\t//\n\t// The method returns io.EOF when all values have been read.\n\t//\n\t// If the buffer was not empty, but too small to hold at least one value,\n\t// io.ErrShortBuffer is returned.\n\tReadFixedLenByteArrays(values []byte) (int, error)\n}\n\n// FixedLenByteArrayWriter is an interface implemented by ValueWriter instances\n// which support writing 
columns of fixed length byte array values.\ntype FixedLenByteArrayWriter interface {\n\t// Writes the fixed length byte array values.\n\t//\n\t// The size of the values is assumed to be the same as the expected size of\n\t// items in the column. The method errors if the length of the input values\n\t// is not a multiple of the expected item size.\n\tWriteFixedLenByteArrays(values []byte) (int, error)\n}\n"
  },
  {
    "path": "value_amd64.go",
    "content": "//go:build !purego\n\npackage parquet\n\nimport \"golang.org/x/sys/cpu\"\n\n//go:noescape\nfunc memsetValuesAVX2(values []Value, model Value, _ uint64)\n\nfunc memsetValues(values []Value, model Value) {\n\tif cpu.X86.HasAVX2 {\n\t\tmemsetValuesAVX2(values, model, 0)\n\t} else {\n\t\tfor i := range values {\n\t\t\tvalues[i] = model\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "value_amd64.s",
    "content": "//go:build !purego\n\n#include \"textflag.h\"\n\n#define sizeOfValue 24\n\n// This function is an optimized implementation of the memsetValues function\n// which assigns the parquet.Value passed as second argument to all elements of\n// the first slice argument.\n//\n// The optimizations relies on the fact that we can pack 4 parquet.Value values\n// into 3 YMM registers (24 x 4 = 32 x 3 = 96).\n//\n// func memsetValuesAVX2(values []Value, model Value, _ uint64)\nTEXT ·memsetValuesAVX2(SB), NOSPLIT, $0-56 // 48 + padding to load model in YMM\n    MOVQ values_base+0(FP), AX\n    MOVQ values_len+8(FP), BX\n\n    MOVQ model_ptr+24(FP), R10\n    MOVQ model_u64+32(FP), R11\n    MOVQ model+40(FP), R12 // go vet complains about this line but it's OK\n\n    XORQ SI, SI // byte index\n    MOVQ BX, DI // byte count\n    IMULQ $sizeOfValue, DI\n\n    CMPQ BX, $4\n    JB test\n\n    MOVQ BX, R8\n    SHRQ $2, R8\n    SHLQ $2, R8\n    IMULQ $sizeOfValue, R8\n\n    VMOVDQU model+24(FP), Y0\n    VMOVDQU Y0, Y1\n    VMOVDQU Y0, Y2\n\n    VPERMQ $0b00100100, Y0, Y0\n    VPERMQ $0b01001001, Y1, Y1\n    VPERMQ $0b10010010, Y2, Y2\nloop4:\n    VMOVDQU Y0, 0(AX)(SI*1)\n    VMOVDQU Y1, 32(AX)(SI*1)\n    VMOVDQU Y2, 64(AX)(SI*1)\n    ADDQ $4*sizeOfValue, SI\n    CMPQ SI, R8\n    JNE loop4\n    VZEROUPPER\n    JMP test\nloop:\n    MOVQ R10, 0(AX)(SI*1)\n    MOVQ R11, 8(AX)(SI*1)\n    MOVQ R12, 16(AX)(SI*1)\n    ADDQ $sizeOfValue, SI\ntest:\n    CMPQ SI, DI\n    JNE loop\n    RET\n"
  },
  {
    "path": "value_go17.go",
    "content": "//go:build !go1.18\n\npackage parquet\n\nimport (\n\t\"reflect\"\n\t\"unsafe\"\n)\n\nfunc unsafePointer(v reflect.Value) unsafe.Pointer {\n\t// This may not have been a safe conversion but there were no better way\n\t// prior to Go 1.18 and the introduction of reflect.Value.UnsafePointer.\n\treturn unsafe.Pointer(v.Pointer())\n}\n"
  },
  {
    "path": "value_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"reflect\"\n\t\"unsafe\"\n)\n\n// This function exists for backward compatibility with the Go 1.17 build which\n// has a different implementation.\n//\n// TODO: remove when we drop support for Go versions prior to 1.18.\nfunc unsafePointer(v reflect.Value) unsafe.Pointer { return v.UnsafePointer() }\n"
  },
  {
    "path": "value_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"math\"\n\t\"testing\"\n\t\"time\"\n\t\"unsafe\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/deprecated\"\n)\n\nfunc TestSizeOfValue(t *testing.T) {\n\tt.Logf(\"sizeof(parquet.Value) = %d\", unsafe.Sizeof(parquet.Value{}))\n}\n\nfunc BenchmarkValueAppend(b *testing.B) {\n\tconst N = 1024\n\trow := make(parquet.Row, 0, N)\n\tval := parquet.ValueOf(42)\n\n\tfor i := 0; i < b.N; i++ {\n\t\trow = row[:0]\n\t\tfor j := 0; j < N; j++ {\n\t\t\trow = append(row, val)\n\t\t}\n\t}\n\n\tb.SetBytes(N * int64(unsafe.Sizeof(parquet.Value{})))\n}\n\nfunc TestValueClone(t *testing.T) {\n\ttests := []struct {\n\t\tscenario string\n\t\tvalues   []interface{}\n\t}{\n\t\t{\n\t\t\tscenario: \"BOOLEAN\",\n\t\t\tvalues:   []interface{}{false, true},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"INT32\",\n\t\t\tvalues:   []interface{}{int32(0), int32(1), int32(math.MinInt32), int32(math.MaxInt32)},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"INT64\",\n\t\t\tvalues:   []interface{}{int64(0), int64(1), int64(math.MinInt64), int64(math.MaxInt64)},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"FLOAT\",\n\t\t\tvalues:   []interface{}{float32(0), float32(1), float32(-1)},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"DOUBLE\",\n\t\t\tvalues:   []interface{}{float64(0), float64(1), float64(-1)},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"BYTE_ARRAY\",\n\t\t\tvalues:   []interface{}{\"\", \"A\", \"ABC\", \"Hello World!\"},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"FIXED_LEN_BYTE_ARRAY\",\n\t\t\tvalues:   []interface{}{[1]byte{42}, [16]byte{0: 1}},\n\t\t},\n\n\t\t{\n\t\t\tscenario: \"TIME\",\n\t\t\tvalues: []interface{}{\n\t\t\t\ttime.Date(2020, 1, 2, 3, 4, 5, 7, time.UTC),\n\t\t\t\ttime.Date(2021, 2, 3, 4, 5, 6, 8, time.UTC),\n\t\t\t},\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tfor _, value := range test.values {\n\t\t\t\tv := parquet.ValueOf(value)\n\t\t\t\tc := v.Clone()\n\n\t\t\t\tif 
!parquet.DeepEqual(v, c) {\n\t\t\t\t\tt.Errorf(\"cloned values are not equal: want=%#v got=%#v\", v, c)\n\t\t\t\t}\n\t\t\t\tif v.RepetitionLevel() != c.RepetitionLevel() {\n\t\t\t\t\tt.Error(\"cloned values do not have the same repetition level\")\n\t\t\t\t}\n\t\t\t\tif v.DefinitionLevel() != c.DefinitionLevel() {\n\t\t\t\t\tt.Error(\"cloned values do not have the same definition level\")\n\t\t\t\t}\n\t\t\t\tif v.Column() != c.Column() {\n\t\t\t\t\tt.Error(\"cloned values do not have the same column index\")\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc TestZeroValue(t *testing.T) {\n\tvar v parquet.Value\n\tif !v.IsNull() {\n\t\tt.Error(\"expected zero value parquet.Value to be null\")\n\t}\n\n\tif v.Byte() != byte(0) {\n\t\tt.Errorf(\"byte not zero value: got=%#v\", v.Byte())\n\t}\n\n\tif v.Boolean() != false {\n\t\tt.Errorf(\"boolean not zero value: got=%#v\", v.Boolean())\n\t}\n\n\tif v.Int32() != 0 {\n\t\tt.Errorf(\"int32 not zero value: got=%#v\", v.Int32())\n\t}\n\n\tif v.Int64() != 0 {\n\t\tt.Errorf(\"int64 not zero value: got=%#v\", v.Int64())\n\t}\n\n\tvar zeroInt96 deprecated.Int96\n\tif v.Int96() != zeroInt96 {\n\t\tt.Errorf(\"int96 not zero value: got=%#v\", zeroInt96)\n\t}\n\n\tif v.Float() != 0 {\n\t\tt.Errorf(\"float not zero value: got=%#v\", v.Float())\n\t}\n\n\tif v.Double() != 0 {\n\t\tt.Errorf(\"double not zero value: got=%#v\", v.Double())\n\t}\n\n\tif v.Uint32() != 0 {\n\t\tt.Errorf(\"uint32 not zero value: got=%#v\", v.Uint32())\n\t}\n\n\tif v.Uint64() != 0 {\n\t\tt.Errorf(\"uint64 not zero value: got=%#v\", v.Uint64())\n\t}\n\n\tvar zeroByte []byte\n\tif !bytes.Equal(v.ByteArray(), zeroByte) {\n\t\tt.Errorf(\"byte array not zero value: got=%#v\", v.ByteArray())\n\t}\n}\n"
  },
  {
    "path": "values_purego.go",
    "content": "//go:build purego || !amd64\n\npackage parquet\n\nfunc memsetValues(values []Value, model Value) {\n\tfor i := range values {\n\t\tvalues[i] = model\n\t}\n}\n"
  },
  {
    "path": "writer.go",
    "content": "package parquet\n\nimport (\n\t\"bufio\"\n\t\"bytes\"\n\t\"encoding/binary\"\n\t\"fmt\"\n\t\"hash/crc32\"\n\t\"io\"\n\t\"math/bits\"\n\t\"sort\"\n\n\t\"github.com/segmentio/encoding/thrift\"\n\t\"github.com/segmentio/parquet-go/compress\"\n\t\"github.com/segmentio/parquet-go/encoding\"\n\t\"github.com/segmentio/parquet-go/encoding/plain\"\n\t\"github.com/segmentio/parquet-go/format\"\n)\n\n// Deprecated: A Writer uses a parquet schema and sequence of Go values to\n// produce a parquet file to an io.Writer.\n//\n// This example showcases a typical use of parquet writers:\n//\n//\twriter := parquet.NewWriter(output)\n//\n//\tfor _, row := range rows {\n//\t\tif err := writer.Write(row); err != nil {\n//\t\t\t...\n//\t\t}\n//\t}\n//\n//\tif err := writer.Close(); err != nil {\n//\t\t...\n//\t}\n//\n// The Writer type optimizes for minimal memory usage, each page is written as\n// soon as it has been filled so only a single page per column needs to be held\n// in memory and as a result, there are no opportunities to sort rows within an\n// entire row group. Programs that need to produce parquet files with sorted\n// row groups should use the Buffer type to buffer and sort the rows prior to\n// writing them to a Writer.\n//\n// For programs building with Go 1.18 or later, the GenericWriter[T] type\n// supersedes this one.\ntype Writer struct {\n\toutput io.Writer\n\tconfig *WriterConfig\n\tschema *Schema\n\twriter *writer\n\trowbuf []Row\n}\n\n// NewWriter constructs a parquet writer writing a file to the given io.Writer.\n//\n// The function panics if the writer configuration is invalid. 
Programs that\n// cannot guarantee the validity of the options passed to NewWriter should\n// construct the writer configuration independently prior to calling this\n// function:\n//\n//\tconfig, err := parquet.NewWriterConfig(options...)\n//\tif err != nil {\n//\t\t// handle the configuration error\n//\t\t...\n//\t} else {\n//\t\t// this call to create a writer is guaranteed not to panic\n//\t\twriter := parquet.NewWriter(output, config)\n//\t\t...\n//\t}\nfunc NewWriter(output io.Writer, options ...WriterOption) *Writer {\n\tconfig, err := NewWriterConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\tw := &Writer{\n\t\toutput: output,\n\t\tconfig: config,\n\t}\n\tif config.Schema != nil {\n\t\tw.configure(config.Schema)\n\t}\n\treturn w\n}\n\nfunc (w *Writer) configure(schema *Schema) {\n\tif schema != nil {\n\t\tw.config.Schema = schema\n\t\tw.schema = schema\n\t\tw.writer = newWriter(w.output, w.config)\n\t}\n}\n\n// Close must be called after all values were produced to the writer in order to\n// flush all buffers and write the parquet footer.\nfunc (w *Writer) Close() error {\n\tif w.writer != nil {\n\t\treturn w.writer.close()\n\t}\n\treturn nil\n}\n\n// Flush flushes all buffers into a row group to the underlying io.Writer.\n//\n// Flush is called automatically on Close, it is only useful to call explicitly\n// if the application needs to limit the size of row groups or wants to produce\n// multiple row groups per file.\n//\n// If the writer attempts to create more than MaxRowGroups row groups the method\n// returns ErrTooManyRowGroups.\nfunc (w *Writer) Flush() error {\n\tif w.writer != nil {\n\t\treturn w.writer.flush()\n\t}\n\treturn nil\n}\n\n// Reset clears the state of the writer without flushing any of the buffers,\n// and setting the output to the io.Writer passed as argument, allowing the\n// writer to be reused to produce another parquet file.\n//\n// Reset may be called at any time, including after a writer was closed.\nfunc (w *Writer) 
Reset(output io.Writer) {\n\tif w.output = output; w.writer != nil {\n\t\tw.writer.reset(w.output)\n\t}\n}\n\n// Write is called to write another row to the parquet file.\n//\n// The method uses the parquet schema configured on w to traverse the Go value\n// and decompose it into a set of columns and values. If no schema were passed\n// to NewWriter, it is deducted from the Go type of the row, which then have to\n// be a struct or pointer to struct.\nfunc (w *Writer) Write(row interface{}) error {\n\tif w.schema == nil {\n\t\tw.configure(SchemaOf(row))\n\t}\n\tif cap(w.rowbuf) == 0 {\n\t\tw.rowbuf = make([]Row, 1)\n\t} else {\n\t\tw.rowbuf = w.rowbuf[:1]\n\t}\n\tdefer clearRows(w.rowbuf)\n\tw.rowbuf[0] = w.schema.Deconstruct(w.rowbuf[0][:0], row)\n\t_, err := w.WriteRows(w.rowbuf)\n\treturn err\n}\n\n// WriteRows is called to write rows to the parquet file.\n//\n// The Writer must have been given a schema when NewWriter was called, otherwise\n// the structure of the parquet file cannot be determined from the row only.\n//\n// The row is expected to contain values for each column of the writer's schema,\n// in the order produced by the parquet.(*Schema).Deconstruct method.\nfunc (w *Writer) WriteRows(rows []Row) (int, error) {\n\treturn w.writer.WriteRows(rows)\n}\n\n// WriteRowGroup writes a row group to the parquet file.\n//\n// Buffered rows will be flushed prior to writing rows from the group, unless\n// the row group was empty in which case nothing is written to the file.\n//\n// The content of the row group is flushed to the writer; after the method\n// returns successfully, the row group will be empty and in ready to be reused.\nfunc (w *Writer) WriteRowGroup(rowGroup RowGroup) (int64, error) {\n\trowGroupSchema := rowGroup.Schema()\n\tswitch {\n\tcase rowGroupSchema == nil:\n\t\treturn 0, ErrRowGroupSchemaMissing\n\tcase w.schema == nil:\n\t\tw.configure(rowGroupSchema)\n\tcase !nodesAreEqual(w.schema, rowGroupSchema):\n\t\treturn 0, 
ErrRowGroupSchemaMismatch\n\t}\n\tif err := w.writer.flush(); err != nil {\n\t\treturn 0, err\n\t}\n\tw.writer.configureBloomFilters(rowGroup.ColumnChunks())\n\trows := rowGroup.Rows()\n\tdefer rows.Close()\n\tn, err := CopyRows(w.writer, rows)\n\tif err != nil {\n\t\treturn n, err\n\t}\n\treturn w.writer.writeRowGroup(rowGroup.Schema(), rowGroup.SortingColumns())\n}\n\n// ReadRowsFrom reads rows from the reader passed as arguments and writes them\n// to w.\n//\n// This is similar to calling WriteRow repeatedly, but will be more efficient\n// if optimizations are supported by the reader.\nfunc (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error) {\n\tif w.schema == nil {\n\t\tif r, ok := rows.(RowReaderWithSchema); ok {\n\t\t\tw.configure(r.Schema())\n\t\t}\n\t}\n\tif cap(w.rowbuf) < defaultRowBufferSize {\n\t\tw.rowbuf = make([]Row, defaultRowBufferSize)\n\t} else {\n\t\tw.rowbuf = w.rowbuf[:cap(w.rowbuf)]\n\t}\n\treturn copyRows(w.writer, rows, w.rowbuf)\n}\n\n// Schema returns the schema of rows written by w.\n//\n// The returned value will be nil if no schema has yet been configured on w.\nfunc (w *Writer) Schema() *Schema { return w.schema }\n\n// SetKeyValueMetadata sets a key/value pair in the Parquet file metadata.\n//\n// Keys are assumed to be unique, if the same key is repeated multiple times the\n// last value is retained. While the parquet format does not require unique keys,\n// this design decision was made to optimize for the most common use case where\n// applications leverage this extension mechanism to associate single values to\n// keys. This may create incompatibilities with other parquet libraries, or may\n// cause some key/value pairs to be lost when open parquet files written with\n// repeated keys. 
We can revisit this decision if it ever becomes a blocker.\nfunc (w *Writer) SetKeyValueMetadata(key, value string) {\n\tfor i, kv := range w.writer.metadata {\n\t\tif kv.Key == key {\n\t\t\tkv.Value = value\n\t\t\tw.writer.metadata[i] = kv\n\t\t\treturn\n\t\t}\n\t}\n\tw.writer.metadata = append(w.writer.metadata, format.KeyValue{\n\t\tKey:   key,\n\t\tValue: value,\n\t})\n}\n\ntype writer struct {\n\tbuffer  *bufio.Writer\n\twriter  offsetTrackingWriter\n\tvalues  [][]Value\n\tnumRows int64\n\tmaxRows int64\n\n\tcreatedBy string\n\tmetadata  []format.KeyValue\n\n\tcolumns     []*writerColumn\n\tcolumnChunk []format.ColumnChunk\n\tcolumnIndex []format.ColumnIndex\n\toffsetIndex []format.OffsetIndex\n\n\tcolumnOrders   []format.ColumnOrder\n\tschemaElements []format.SchemaElement\n\trowGroups      []format.RowGroup\n\tcolumnIndexes  [][]format.ColumnIndex\n\toffsetIndexes  [][]format.OffsetIndex\n\tsortingColumns []format.SortingColumn\n}\n\nfunc newWriter(output io.Writer, config *WriterConfig) *writer {\n\tw := new(writer)\n\tif config.WriteBufferSize <= 0 {\n\t\tw.writer.Reset(output)\n\t} else {\n\t\tw.buffer = bufio.NewWriterSize(output, config.WriteBufferSize)\n\t\tw.writer.Reset(w.buffer)\n\t}\n\tw.maxRows = config.MaxRowsPerRowGroup\n\tw.createdBy = config.CreatedBy\n\tw.metadata = make([]format.KeyValue, 0, len(config.KeyValueMetadata))\n\tfor k, v := range config.KeyValueMetadata {\n\t\tw.metadata = append(w.metadata, format.KeyValue{Key: k, Value: v})\n\t}\n\tsortKeyValueMetadata(w.metadata)\n\tw.sortingColumns = make([]format.SortingColumn, len(config.Sorting.SortingColumns))\n\n\tconfig.Schema.forEachNode(func(name string, node Node) {\n\t\tnodeType := node.Type()\n\n\t\trepetitionType := (*format.FieldRepetitionType)(nil)\n\t\tif node != config.Schema { // the root has no repetition type\n\t\t\trepetitionType = fieldRepetitionTypePtrOf(node)\n\t\t}\n\n\t\t// For backward compatibility with older readers, the parquet specification\n\t\t// recommends to 
set the scale and precision on schema elements when the\n\t\t// column is of logical type decimal.\n\t\tlogicalType := nodeType.LogicalType()\n\t\tscale, precision := (*int32)(nil), (*int32)(nil)\n\t\tif logicalType != nil && logicalType.Decimal != nil {\n\t\t\tscale = &logicalType.Decimal.Scale\n\t\t\tprecision = &logicalType.Decimal.Precision\n\t\t}\n\n\t\ttypeLength := (*int32)(nil)\n\t\tif n := int32(nodeType.Length()); n > 0 {\n\t\t\ttypeLength = &n\n\t\t}\n\n\t\tw.schemaElements = append(w.schemaElements, format.SchemaElement{\n\t\t\tType:           nodeType.PhysicalType(),\n\t\t\tTypeLength:     typeLength,\n\t\t\tRepetitionType: repetitionType,\n\t\t\tName:           name,\n\t\t\tNumChildren:    int32(len(node.Fields())),\n\t\t\tConvertedType:  nodeType.ConvertedType(),\n\t\t\tScale:          scale,\n\t\t\tPrecision:      precision,\n\t\t\tLogicalType:    logicalType,\n\t\t})\n\t})\n\n\tdataPageType := format.DataPage\n\tif config.DataPageVersion == 2 {\n\t\tdataPageType = format.DataPageV2\n\t}\n\n\tdefaultCompression := config.Compression\n\tif defaultCompression == nil {\n\t\tdefaultCompression = &Uncompressed\n\t}\n\n\t// Those buffers are scratch space used to generate the page header and\n\t// content, they are shared by all column chunks because they are only\n\t// used during calls to writeDictionaryPage or writeDataPage, which are\n\t// not done concurrently.\n\tbuffers := new(writerBuffers)\n\n\tforEachLeafColumnOf(config.Schema, func(leaf leafColumn) {\n\t\tencoding := encodingOf(leaf.node)\n\t\tdictionary := Dictionary(nil)\n\t\tcolumnType := leaf.node.Type()\n\t\tcolumnIndex := int(leaf.columnIndex)\n\t\tcompression := leaf.node.Compression()\n\n\t\tif compression == nil {\n\t\t\tcompression = defaultCompression\n\t\t}\n\n\t\tif isDictionaryEncoding(encoding) {\n\t\t\tdictBuffer := columnType.NewValues(\n\t\t\t\tmake([]byte, 0, defaultDictBufferSize),\n\t\t\t\tnil,\n\t\t\t)\n\t\t\tdictionary = columnType.NewDictionary(columnIndex, 0, 
dictBuffer)\n\t\t\tcolumnType = dictionary.Type()\n\t\t}\n\n\t\tc := &writerColumn{\n\t\t\tbuffers:            buffers,\n\t\t\tpool:               config.ColumnPageBuffers,\n\t\t\tcolumnPath:         leaf.path,\n\t\t\tcolumnType:         columnType,\n\t\t\tcolumnIndex:        columnType.NewColumnIndexer(config.ColumnIndexSizeLimit),\n\t\t\tcolumnFilter:       searchBloomFilterColumn(config.BloomFilters, leaf.path),\n\t\t\tcompression:        compression,\n\t\t\tdictionary:         dictionary,\n\t\t\tdataPageType:       dataPageType,\n\t\t\tmaxRepetitionLevel: leaf.maxRepetitionLevel,\n\t\t\tmaxDefinitionLevel: leaf.maxDefinitionLevel,\n\t\t\tbufferIndex:        int32(leaf.columnIndex),\n\t\t\tbufferSize:         int32(float64(config.PageBufferSize) * 0.98),\n\t\t\twritePageStats:     config.DataPageStatistics,\n\t\t\tencodings:          make([]format.Encoding, 0, 3),\n\t\t\t// Data pages in version 2 can omit compression when dictionary\n\t\t\t// encoding is employed; only the dictionary page needs to be\n\t\t\t// compressed, the data pages are encoded with the hybrid\n\t\t\t// RLE/Bit-Pack encoding which doesn't benefit from an extra\n\t\t\t// compression layer.\n\t\t\tisCompressed: isCompressed(compression) && (dataPageType != format.DataPageV2 || dictionary == nil),\n\t\t}\n\n\t\tc.header.encoder.Reset(c.header.protocol.NewWriter(&buffers.header))\n\n\t\tif leaf.maxDefinitionLevel > 0 {\n\t\t\tc.encodings = addEncoding(c.encodings, format.RLE)\n\t\t}\n\n\t\tif isDictionaryEncoding(encoding) {\n\t\t\tc.encodings = addEncoding(c.encodings, format.Plain)\n\t\t}\n\n\t\tc.encoding = encoding\n\t\tc.encodings = addEncoding(c.encodings, c.encoding.Encoding())\n\t\tsortPageEncodings(c.encodings)\n\n\t\tw.columns = append(w.columns, c)\n\n\t\tif sortingIndex := searchSortingColumn(config.Sorting.SortingColumns, leaf.path); sortingIndex < len(w.sortingColumns) {\n\t\t\tw.sortingColumns[sortingIndex] = format.SortingColumn{\n\t\t\t\tColumnIdx:  
int32(leaf.columnIndex),\n\t\t\t\tDescending: config.Sorting.SortingColumns[sortingIndex].Descending(),\n\t\t\t\tNullsFirst: config.Sorting.SortingColumns[sortingIndex].NullsFirst(),\n\t\t\t}\n\t\t}\n\t})\n\n\t// Pre-allocate the backing array so that in most cases where the rows\n\t// contain a single value we will hit collocated memory areas when writing\n\t// rows to the writer. This won't benefit repeated columns much but in that\n\t// case we would just waste a bit of memory which we can afford.\n\tvalues := make([]Value, len(w.columns))\n\tw.values = make([][]Value, len(w.columns))\n\tfor i := range values {\n\t\tw.values[i] = values[i : i : i+1]\n\t}\n\n\tw.columnChunk = make([]format.ColumnChunk, len(w.columns))\n\tw.columnIndex = make([]format.ColumnIndex, len(w.columns))\n\tw.offsetIndex = make([]format.OffsetIndex, len(w.columns))\n\tw.columnOrders = make([]format.ColumnOrder, len(w.columns))\n\n\tfor i, c := range w.columns {\n\t\tw.columnChunk[i] = format.ColumnChunk{\n\t\t\tMetaData: format.ColumnMetaData{\n\t\t\t\tType:             format.Type(c.columnType.Kind()),\n\t\t\t\tEncoding:         c.encodings,\n\t\t\t\tPathInSchema:     c.columnPath,\n\t\t\t\tCodec:            c.compression.CompressionCodec(),\n\t\t\t\tKeyValueMetadata: nil, // TODO\n\t\t\t},\n\t\t}\n\t}\n\n\tfor i, c := range w.columns {\n\t\tc.columnChunk = &w.columnChunk[i]\n\t\tc.offsetIndex = &w.offsetIndex[i]\n\t}\n\n\tfor i, c := range w.columns {\n\t\tw.columnOrders[i] = *c.columnType.ColumnOrder()\n\t}\n\n\treturn w\n}\n\nfunc (w *writer) reset(writer io.Writer) {\n\tif w.buffer == nil {\n\t\tw.writer.Reset(writer)\n\t} else {\n\t\tw.buffer.Reset(writer)\n\t\tw.writer.Reset(w.buffer)\n\t}\n\tfor _, c := range w.columns {\n\t\tc.reset()\n\t}\n\tfor i := range w.rowGroups {\n\t\tw.rowGroups[i] = format.RowGroup{}\n\t}\n\tfor i := range w.columnIndexes {\n\t\tw.columnIndexes[i] = nil\n\t}\n\tfor i := range w.offsetIndexes {\n\t\tw.offsetIndexes[i] = nil\n\t}\n\tw.rowGroups = 
w.rowGroups[:0]\n\tw.columnIndexes = w.columnIndexes[:0]\n\tw.offsetIndexes = w.offsetIndexes[:0]\n}\n\nfunc (w *writer) close() error {\n\tif err := w.writeFileHeader(); err != nil {\n\t\treturn err\n\t}\n\tif err := w.flush(); err != nil {\n\t\treturn err\n\t}\n\tif err := w.writeFileFooter(); err != nil {\n\t\treturn err\n\t}\n\tif w.buffer != nil {\n\t\treturn w.buffer.Flush()\n\t}\n\treturn nil\n}\n\nfunc (w *writer) flush() error {\n\t_, err := w.writeRowGroup(nil, nil)\n\treturn err\n}\n\nfunc (w *writer) writeFileHeader() error {\n\tif w.writer.writer == nil {\n\t\treturn io.ErrClosedPipe\n\t}\n\tif w.writer.offset == 0 {\n\t\t_, err := w.writer.WriteString(\"PAR1\")\n\t\treturn err\n\t}\n\treturn nil\n}\n\nfunc (w *writer) configureBloomFilters(columnChunks []ColumnChunk) {\n\tfor i, c := range w.columns {\n\t\tif c.columnFilter != nil {\n\t\t\tc.resizeBloomFilter(columnChunks[i].NumValues())\n\t\t}\n\t}\n}\n\nfunc (w *writer) writeFileFooter() error {\n\t// The page index is composed of two sections: column and offset indexes.\n\t// They are written after the row groups, right before the footer (which\n\t// is written by the parent Writer.Close call).\n\t//\n\t// This section both writes the page index and generates the values of\n\t// ColumnIndexOffset, ColumnIndexLength, OffsetIndexOffset, and\n\t// OffsetIndexLength in the corresponding columns of the file metadata.\n\t//\n\t// Note: the page index is always written, even if we created data pages v1\n\t// because the parquet format is backward compatible in this case. 
Older\n\t// readers will simply ignore this section since they do not know how to\n\t// decode its content, nor have loaded any metadata to reference it.\n\tprotocol := new(thrift.CompactProtocol)\n\tencoder := thrift.NewEncoder(protocol.NewWriter(&w.writer))\n\n\tfor i, columnIndexes := range w.columnIndexes {\n\t\trowGroup := &w.rowGroups[i]\n\t\tfor j := range columnIndexes {\n\t\t\tcolumn := &rowGroup.Columns[j]\n\t\t\tcolumn.ColumnIndexOffset = w.writer.offset\n\t\t\tif err := encoder.Encode(&columnIndexes[j]); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t\tcolumn.ColumnIndexLength = int32(w.writer.offset - column.ColumnIndexOffset)\n\t\t}\n\t}\n\n\tfor i, offsetIndexes := range w.offsetIndexes {\n\t\trowGroup := &w.rowGroups[i]\n\t\tfor j := range offsetIndexes {\n\t\t\tcolumn := &rowGroup.Columns[j]\n\t\t\tcolumn.OffsetIndexOffset = w.writer.offset\n\t\t\tif err := encoder.Encode(&offsetIndexes[j]); err != nil {\n\t\t\t\treturn err\n\t\t\t}\n\t\t\tcolumn.OffsetIndexLength = int32(w.writer.offset - column.OffsetIndexOffset)\n\t\t}\n\t}\n\n\tnumRows := int64(0)\n\tfor rowGroupIndex := range w.rowGroups {\n\t\tnumRows += w.rowGroups[rowGroupIndex].NumRows\n\t}\n\n\tfooter, err := thrift.Marshal(new(thrift.CompactProtocol), &format.FileMetaData{\n\t\tVersion:          1,\n\t\tSchema:           w.schemaElements,\n\t\tNumRows:          numRows,\n\t\tRowGroups:        w.rowGroups,\n\t\tKeyValueMetadata: w.metadata,\n\t\tCreatedBy:        w.createdBy,\n\t\tColumnOrders:     w.columnOrders,\n\t})\n\tif err != nil {\n\t\treturn err\n\t}\n\n\tlength := len(footer)\n\tfooter = append(footer, 0, 0, 0, 0)\n\tfooter = append(footer, \"PAR1\"...)\n\tbinary.LittleEndian.PutUint32(footer[length:], uint32(length))\n\n\t_, err = w.writer.Write(footer)\n\treturn err\n}\n\nfunc (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns []SortingColumn) (int64, error) {\n\tnumRows := w.columns[0].totalRowCount()\n\tif numRows == 0 {\n\t\treturn 0, nil\n\t}\n\n\tif 
len(w.rowGroups) == MaxRowGroups {\n\t\treturn 0, ErrTooManyRowGroups\n\t}\n\n\tdefer func() {\n\t\tw.numRows = 0\n\t\tfor _, c := range w.columns {\n\t\t\tc.reset()\n\t\t}\n\t\tfor i := range w.columnIndex {\n\t\t\tw.columnIndex[i] = format.ColumnIndex{}\n\t\t}\n\t}()\n\n\tfor _, c := range w.columns {\n\t\tif err := c.flush(); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\tif err := c.flushFilterPages(); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t}\n\n\tif err := w.writeFileHeader(); err != nil {\n\t\treturn 0, err\n\t}\n\tfileOffset := w.writer.offset\n\n\tfor _, c := range w.columns {\n\t\tif len(c.filter) > 0 {\n\t\t\tc.columnChunk.MetaData.BloomFilterOffset = w.writer.offset\n\t\t\tif err := c.writeBloomFilter(&w.writer); err != nil {\n\t\t\t\treturn 0, err\n\t\t\t}\n\t\t}\n\t}\n\n\tfor i, c := range w.columns {\n\t\tw.columnIndex[i] = format.ColumnIndex(c.columnIndex.ColumnIndex())\n\n\t\tif c.dictionary != nil {\n\t\t\tc.columnChunk.MetaData.DictionaryPageOffset = w.writer.offset\n\t\t\tif err := c.writeDictionaryPage(&w.writer, c.dictionary); err != nil {\n\t\t\t\treturn 0, fmt.Errorf(\"writing dictionary page of row group colum %d: %w\", i, err)\n\t\t\t}\n\t\t}\n\n\t\tdataPageOffset := w.writer.offset\n\t\tc.columnChunk.MetaData.DataPageOffset = dataPageOffset\n\t\tfor j := range c.offsetIndex.PageLocations {\n\t\t\tc.offsetIndex.PageLocations[j].Offset += dataPageOffset\n\t\t}\n\n\t\tfor _, page := range c.pages {\n\t\t\tif _, err := io.Copy(&w.writer, page); err != nil {\n\t\t\t\treturn 0, fmt.Errorf(\"writing buffered pages of row group column %d: %w\", i, err)\n\t\t\t}\n\t\t}\n\t}\n\n\ttotalByteSize := int64(0)\n\ttotalCompressedSize := int64(0)\n\n\tfor i := range w.columnChunk {\n\t\tc := &w.columnChunk[i].MetaData\n\t\tsortPageEncodingStats(c.EncodingStats)\n\t\ttotalByteSize += int64(c.TotalUncompressedSize)\n\t\ttotalCompressedSize += int64(c.TotalCompressedSize)\n\t}\n\n\tsortingColumns := w.sortingColumns\n\tif len(sortingColumns) == 0 && 
len(rowGroupSortingColumns) > 0 {\n\t\tsortingColumns = make([]format.SortingColumn, 0, len(rowGroupSortingColumns))\n\t\tforEachLeafColumnOf(rowGroupSchema, func(leaf leafColumn) {\n\t\t\tif sortingIndex := searchSortingColumn(rowGroupSortingColumns, leaf.path); sortingIndex < len(sortingColumns) {\n\t\t\t\tsortingColumns[sortingIndex] = format.SortingColumn{\n\t\t\t\t\tColumnIdx:  int32(leaf.columnIndex),\n\t\t\t\t\tDescending: rowGroupSortingColumns[sortingIndex].Descending(),\n\t\t\t\t\tNullsFirst: rowGroupSortingColumns[sortingIndex].NullsFirst(),\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n\n\tcolumns := make([]format.ColumnChunk, len(w.columnChunk))\n\tcopy(columns, w.columnChunk)\n\n\tcolumnIndex := make([]format.ColumnIndex, len(w.columnIndex))\n\tcopy(columnIndex, w.columnIndex)\n\n\toffsetIndex := make([]format.OffsetIndex, len(w.offsetIndex))\n\tcopy(offsetIndex, w.offsetIndex)\n\n\tfor i := range columns {\n\t\tc := &columns[i]\n\t\tc.MetaData.EncodingStats = make([]format.PageEncodingStats, len(c.MetaData.EncodingStats))\n\t\tcopy(c.MetaData.EncodingStats, w.columnChunk[i].MetaData.EncodingStats)\n\t}\n\n\tfor i := range offsetIndex {\n\t\tc := &offsetIndex[i]\n\t\tc.PageLocations = make([]format.PageLocation, len(c.PageLocations))\n\t\tcopy(c.PageLocations, w.offsetIndex[i].PageLocations)\n\t}\n\n\tw.rowGroups = append(w.rowGroups, format.RowGroup{\n\t\tColumns:             columns,\n\t\tTotalByteSize:       totalByteSize,\n\t\tNumRows:             numRows,\n\t\tSortingColumns:      sortingColumns,\n\t\tFileOffset:          fileOffset,\n\t\tTotalCompressedSize: totalCompressedSize,\n\t\tOrdinal:             int16(len(w.rowGroups)),\n\t})\n\n\tw.columnIndexes = append(w.columnIndexes, columnIndex)\n\tw.offsetIndexes = append(w.offsetIndexes, offsetIndex)\n\treturn numRows, nil\n}\n\nfunc (w *writer) WriteRows(rows []Row) (int, error) {\n\treturn w.writeRows(len(rows), func(start, end int) (int, error) {\n\t\tdefer func() {\n\t\t\tfor i, values := range w.values 
{\n\t\t\t\tclearValues(values)\n\t\t\t\tw.values[i] = values[:0]\n\t\t\t}\n\t\t}()\n\n\t\t// TODO: if an error occurs in this method the writer may be left in an\n\t\t// partially functional state. Applications are not expected to continue\n\t\t// using the writer after getting an error, but maybe we could ensure that\n\t\t// we are preventing further use as well?\n\t\tfor _, row := range rows[start:end] {\n\t\t\trow.Range(func(columnIndex int, columnValues []Value) bool {\n\t\t\t\tw.values[columnIndex] = append(w.values[columnIndex], columnValues...)\n\t\t\t\treturn true\n\t\t\t})\n\t\t}\n\n\t\tfor i, values := range w.values {\n\t\t\tif len(values) > 0 {\n\t\t\t\tif err := w.columns[i].writeRows(values); err != nil {\n\t\t\t\t\treturn 0, err\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn end - start, nil\n\t})\n}\n\nfunc (w *writer) writeRows(numRows int, write func(i, j int) (int, error)) (int, error) {\n\twritten := 0\n\n\tfor written < numRows {\n\t\tremain := w.maxRows - w.numRows\n\t\tlength := numRows - written\n\n\t\tif remain == 0 {\n\t\t\tremain = w.maxRows\n\n\t\t\tif err := w.flush(); err != nil {\n\t\t\t\treturn written, err\n\t\t\t}\n\t\t}\n\n\t\tif remain < int64(length) {\n\t\t\tlength = int(remain)\n\t\t}\n\n\t\t// Since the writer cannot flush pages across row boundaries, calls to\n\t\t// WriteRows with very large slices can result in greatly exceeding the\n\t\t// target page size. 
To set a limit to the impact of these large writes\n\t\t// we chunk the input in slices of 64 rows.\n\t\t//\n\t\t// Note that this mechanism isn't perfect; for example, values may hold\n\t\t// large byte slices which could still cause the column buffers to grow\n\t\t// beyond the target page size.\n\t\tconst maxRowsPerWrite = 64\n\t\tif length > maxRowsPerWrite {\n\t\t\tlength = maxRowsPerWrite\n\t\t}\n\n\t\tn, err := write(written, written+length)\n\t\twritten += n\n\t\tw.numRows += int64(n)\n\t\tif err != nil {\n\t\t\treturn written, err\n\t\t}\n\t}\n\n\treturn written, nil\n}\n\n// The WriteValues method is intended to work in pair with WritePage to allow\n// programs to target writing values to specific columns of of the writer.\nfunc (w *writer) WriteValues(values []Value) (numValues int, err error) {\n\treturn w.columns[values[0].Column()].WriteValues(values)\n}\n\n// One writerBuffers is used by each writer instance, the memory buffers here\n// are shared by all columns of the writer because serialization is not done\n// concurrently, which helps keep memory utilization low, both in the total\n// footprint and GC cost.\n//\n// The type also exposes helper methods to facilitate the generation of parquet\n// pages. 
A scratch space is used when serialization requires combining multiple\n// buffers or compressing the page data, with double-buffering technique being\n// employed by swapping the scratch and page buffers to minimize memory copies.\ntype writerBuffers struct {\n\theader      bytes.Buffer // buffer where page headers are encoded\n\trepetitions []byte       // buffer used to encode repetition levels\n\tdefinitions []byte       // buffer used to encode definition levels\n\tpage        []byte       // page buffer holding the page data\n\tscratch     []byte       // scratch space used for compression\n}\n\nfunc (wb *writerBuffers) crc32() (checksum uint32) {\n\tchecksum = crc32.Update(checksum, crc32.IEEETable, wb.repetitions)\n\tchecksum = crc32.Update(checksum, crc32.IEEETable, wb.definitions)\n\tchecksum = crc32.Update(checksum, crc32.IEEETable, wb.page)\n\treturn checksum\n}\n\nfunc (wb *writerBuffers) size() int {\n\treturn len(wb.repetitions) + len(wb.definitions) + len(wb.page)\n}\n\nfunc (wb *writerBuffers) reset() {\n\twb.repetitions = wb.repetitions[:0]\n\twb.definitions = wb.definitions[:0]\n\twb.page = wb.page[:0]\n}\n\nfunc encodeLevels(dst, src []byte, maxLevel byte) ([]byte, error) {\n\tbitWidth := bits.Len8(maxLevel)\n\treturn levelEncodingsRLE[bitWidth-1].EncodeLevels(dst, src)\n}\n\nfunc (wb *writerBuffers) encodeRepetitionLevels(page Page, maxRepetitionLevel byte) (err error) {\n\twb.repetitions, err = encodeLevels(wb.repetitions, page.RepetitionLevels(), maxRepetitionLevel)\n\treturn\n}\n\nfunc (wb *writerBuffers) encodeDefinitionLevels(page Page, maxDefinitionLevel byte) (err error) {\n\twb.definitions, err = encodeLevels(wb.definitions, page.DefinitionLevels(), maxDefinitionLevel)\n\treturn\n}\n\nfunc (wb *writerBuffers) prependLevelsToDataPageV1(maxRepetitionLevel, maxDefinitionLevel byte) {\n\thasRepetitionLevels := maxRepetitionLevel > 0\n\thasDefinitionLevels := maxDefinitionLevel > 0\n\n\tif hasRepetitionLevels || hasDefinitionLevels 
{\n\t\twb.scratch = wb.scratch[:0]\n\t\t// In data pages v1, the repetition and definition levels are prefixed\n\t\t// with the 4 bytes length of the sections. While the parquet-format\n\t\t// documentation indicates that the length prefix is part of the hybrid\n\t\t// RLE/Bit-Pack encoding, this is the only condition where it is used\n\t\t// so we treat it as a special case rather than implementing it in the\n\t\t// encoding.\n\t\t//\n\t\t// Reference https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3\n\t\tif hasRepetitionLevels {\n\t\t\twb.scratch = plain.AppendInt32(wb.scratch, int32(len(wb.repetitions)))\n\t\t\twb.scratch = append(wb.scratch, wb.repetitions...)\n\t\t\twb.repetitions = wb.repetitions[:0]\n\t\t}\n\t\tif hasDefinitionLevels {\n\t\t\twb.scratch = plain.AppendInt32(wb.scratch, int32(len(wb.definitions)))\n\t\t\twb.scratch = append(wb.scratch, wb.definitions...)\n\t\t\twb.definitions = wb.definitions[:0]\n\t\t}\n\t\twb.scratch = append(wb.scratch, wb.page...)\n\t\twb.swapPageAndScratchBuffers()\n\t}\n}\n\nfunc (wb *writerBuffers) encode(page Page, enc encoding.Encoding) (err error) {\n\tpageType := page.Type()\n\tpageData := page.Data()\n\twb.page, err = pageType.Encode(wb.page[:0], pageData, enc)\n\treturn err\n}\n\nfunc (wb *writerBuffers) compress(codec compress.Codec) (err error) {\n\twb.scratch, err = codec.Encode(wb.scratch[:0], wb.page)\n\twb.swapPageAndScratchBuffers()\n\treturn err\n}\n\nfunc (wb *writerBuffers) swapPageAndScratchBuffers() {\n\twb.page, wb.scratch = wb.scratch, wb.page[:0]\n}\n\ntype writerColumn struct {\n\tpool  BufferPool\n\tpages []io.ReadWriteSeeker\n\n\tcolumnPath   columnPath\n\tcolumnType   Type\n\tcolumnIndex  ColumnIndexer\n\tcolumnBuffer ColumnBuffer\n\tcolumnFilter BloomFilterColumn\n\tencoding     encoding.Encoding\n\tcompression  compress.Codec\n\tdictionary   Dictionary\n\n\tdataPageType       format.PageType\n\tmaxRepetitionLevel 
byte\n\tmaxDefinitionLevel byte\n\n\tbuffers *writerBuffers\n\n\theader struct {\n\t\tprotocol thrift.CompactProtocol\n\t\tencoder  thrift.Encoder\n\t}\n\n\tfilter         []byte\n\tnumRows        int64\n\tbufferIndex    int32\n\tbufferSize     int32\n\twritePageStats bool\n\tisCompressed   bool\n\tencodings      []format.Encoding\n\n\tcolumnChunk *format.ColumnChunk\n\toffsetIndex *format.OffsetIndex\n}\n\nfunc (c *writerColumn) reset() {\n\tif c.columnBuffer != nil {\n\t\tc.columnBuffer.Reset()\n\t}\n\tif c.columnIndex != nil {\n\t\tc.columnIndex.Reset()\n\t}\n\tif c.dictionary != nil {\n\t\tc.dictionary.Reset()\n\t}\n\tfor _, page := range c.pages {\n\t\tc.pool.PutBuffer(page)\n\t}\n\tfor i := range c.pages {\n\t\tc.pages[i] = nil\n\t}\n\tc.pages = c.pages[:0]\n\t// Bloom filters may change in size between row groups, but we retain the\n\t// buffer to avoid reallocating large memory blocks.\n\tc.filter = c.filter[:0]\n\tc.numRows = 0\n\t// Reset the fields of column chunks that change between row groups,\n\t// but keep the ones that remain unchanged.\n\tc.columnChunk.MetaData.NumValues = 0\n\tc.columnChunk.MetaData.TotalUncompressedSize = 0\n\tc.columnChunk.MetaData.TotalCompressedSize = 0\n\tc.columnChunk.MetaData.DataPageOffset = 0\n\tc.columnChunk.MetaData.DictionaryPageOffset = 0\n\tc.columnChunk.MetaData.Statistics = format.Statistics{}\n\tc.columnChunk.MetaData.EncodingStats = c.columnChunk.MetaData.EncodingStats[:0]\n\tc.columnChunk.MetaData.BloomFilterOffset = 0\n\tc.offsetIndex.PageLocations = c.offsetIndex.PageLocations[:0]\n}\n\nfunc (c *writerColumn) totalRowCount() int64 {\n\tn := c.numRows\n\tif c.columnBuffer != nil {\n\t\tn += int64(c.columnBuffer.Len())\n\t}\n\treturn n\n}\n\nfunc (c *writerColumn) flush() (err error) {\n\tif c.columnBuffer.Len() > 0 {\n\t\tdefer c.columnBuffer.Reset()\n\t\t_, err = c.writeDataPage(c.columnBuffer.Page())\n\t}\n\treturn err\n}\n\nfunc (c *writerColumn) flushFilterPages() error {\n\tif c.columnFilter == nil 
{\n\t\treturn nil\n\t}\n\n\t// If there is a dictionary, it contains all the values that we need to\n\t// write to the filter.\n\tif dict := c.dictionary; dict != nil {\n\t\t// Need to always attempt to resize the filter, as the writer might\n\t\t// be reused after resetting which would have reset the length of\n\t\t// the filter to 0.\n\t\tc.resizeBloomFilter(int64(dict.Len()))\n\t\treturn c.writePageToFilter(dict.Page())\n\t}\n\n\t// When the filter was already allocated, pages have been written to it as\n\t// they were seen by the column writer.\n\tif len(c.filter) > 0 {\n\t\treturn nil\n\t}\n\n\t// When the filter was not allocated, the writer did not know how many\n\t// values were going to be seen and therefore could not properly size the\n\t// filter ahead of time. In this case, we read back all the pages that we\n\t// have encoded and copy their values back to the filter.\n\t//\n\t// A prior implementation of the column writer used to create in-memory\n\t// copies of the pages to avoid this decoding step; however, this unbounded\n\t// allocation caused memory exhaustion in production applications. 
CPU being\n\t// a somewhat more stretchable resource, we prefer spending time on this\n\t// decoding step than having to trigger incident response when production\n\t// systems are getting OOM-Killed.\n\tc.resizeBloomFilter(c.columnChunk.MetaData.NumValues)\n\n\tcolumn := &Column{\n\t\t// Set all the fields required by the decodeDataPage* methods.\n\t\ttyp:                c.columnType,\n\t\tencoding:           c.encoding,\n\t\tcompression:        c.compression,\n\t\tmaxRepetitionLevel: c.maxRepetitionLevel,\n\t\tmaxDefinitionLevel: c.maxDefinitionLevel,\n\t\tindex:              int16(c.bufferIndex),\n\t}\n\n\trbuf, pool := getBufioReader(nil, 1024)\n\tpbuf := (*buffer)(nil)\n\tdefer func() {\n\t\tputBufioReader(rbuf, pool)\n\t\tif pbuf != nil {\n\t\t\tpbuf.unref()\n\t\t}\n\t}()\n\n\tdecoder := thrift.NewDecoder(c.header.protocol.NewReader(rbuf))\n\n\tfor _, p := range c.pages {\n\t\trbuf.Reset(p)\n\n\t\theader := new(format.PageHeader)\n\t\tif err := decoder.Decode(header); err != nil {\n\t\t\treturn err\n\t\t}\n\n\t\tif pbuf != nil {\n\t\t\tpbuf.unref()\n\t\t}\n\t\tpbuf = buffers.get(int(header.CompressedPageSize))\n\t\tif _, err := io.ReadFull(rbuf, pbuf.data); err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif _, err := p.Seek(0, io.SeekStart); err != nil {\n\t\t\treturn err\n\t\t}\n\n\t\tvar page Page\n\t\tvar err error\n\n\t\tswitch header.Type {\n\t\tcase format.DataPage:\n\t\t\tpage, err = column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, pbuf, nil, header.UncompressedPageSize)\n\t\tcase format.DataPageV2:\n\t\t\tpage, err = column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, pbuf, nil, header.UncompressedPageSize)\n\t\t}\n\t\tif page != nil {\n\t\t\terr = c.writePageToFilter(page)\n\t\t\tRelease(page)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\n\treturn nil\n}\n\nfunc (c *writerColumn) resizeBloomFilter(numValues int64) {\n\tfilterSize := c.columnFilter.Size(numValues)\n\tif cap(c.filter) < filterSize 
{\n\t\tc.filter = make([]byte, filterSize)\n\t} else {\n\t\tc.filter = c.filter[:filterSize]\n\t\tfor i := range c.filter {\n\t\t\tc.filter[i] = 0\n\t\t}\n\t}\n}\n\nfunc (c *writerColumn) newColumnBuffer() ColumnBuffer {\n\tcolumn := c.columnType.NewColumnBuffer(int(c.bufferIndex), c.columnType.EstimateNumValues(int(c.bufferSize)))\n\tswitch {\n\tcase c.maxRepetitionLevel > 0:\n\t\tcolumn = newRepeatedColumnBuffer(column, c.maxRepetitionLevel, c.maxDefinitionLevel, nullsGoLast)\n\tcase c.maxDefinitionLevel > 0:\n\t\tcolumn = newOptionalColumnBuffer(column, c.maxDefinitionLevel, nullsGoLast)\n\t}\n\treturn column\n}\n\nfunc (c *writerColumn) writeRows(rows []Value) error {\n\tif c.columnBuffer == nil {\n\t\t// Lazily create the row group column so we don't need to allocate it if\n\t\t// rows are not written individually to the column.\n\t\tc.columnBuffer = c.newColumnBuffer()\n\t}\n\tif _, err := c.columnBuffer.WriteValues(rows); err != nil {\n\t\treturn err\n\t}\n\tif c.columnBuffer.Size() >= int64(c.bufferSize) {\n\t\treturn c.flush()\n\t}\n\treturn nil\n}\n\nfunc (c *writerColumn) WriteValues(values []Value) (numValues int, err error) {\n\tif c.columnBuffer == nil {\n\t\tc.columnBuffer = c.newColumnBuffer()\n\t}\n\treturn c.columnBuffer.WriteValues(values)\n}\n\nfunc (c *writerColumn) writeBloomFilter(w io.Writer) error {\n\te := thrift.NewEncoder(c.header.protocol.NewWriter(w))\n\th := bloomFilterHeader(c.columnFilter)\n\th.NumBytes = int32(len(c.filter))\n\tif err := e.Encode(&h); err != nil {\n\t\treturn err\n\t}\n\t_, err := w.Write(c.filter)\n\treturn err\n}\n\nfunc (c *writerColumn) writeDataPage(page Page) (int64, error) {\n\tnumValues := page.NumValues()\n\tif numValues == 0 {\n\t\treturn 0, nil\n\t}\n\n\tbuf := c.buffers\n\tbuf.reset()\n\n\tif c.maxRepetitionLevel > 0 {\n\t\tbuf.encodeRepetitionLevels(page, c.maxRepetitionLevel)\n\t}\n\tif c.maxDefinitionLevel > 0 {\n\t\tbuf.encodeDefinitionLevels(page, c.maxDefinitionLevel)\n\t}\n\n\tif err := 
buf.encode(page, c.encoding); err != nil {\n\t\treturn 0, fmt.Errorf(\"encoding parquet data page: %w\", err)\n\t}\n\tif c.dataPageType == format.DataPage {\n\t\tbuf.prependLevelsToDataPageV1(c.maxRepetitionLevel, c.maxDefinitionLevel)\n\t}\n\n\tuncompressedPageSize := buf.size()\n\tif c.isCompressed {\n\t\tif err := buf.compress(c.compression); err != nil {\n\t\t\treturn 0, fmt.Errorf(\"compressing parquet data page: %w\", err)\n\t\t}\n\t}\n\n\tif page.Dictionary() == nil && len(c.filter) > 0 {\n\t\t// When the writer knows the number of values in advance (e.g. when\n\t\t// writing a full row group), the filter encoding is set and the page\n\t\t// can be directly applied to the filter, which minimizes memory usage\n\t\t// since there is no need to buffer the values in order to determine\n\t\t// the size of the filter.\n\t\tif err := c.writePageToFilter(page); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t}\n\n\tstatistics := format.Statistics{}\n\tif c.writePageStats {\n\t\tstatistics = c.makePageStatistics(page)\n\t}\n\n\tpageHeader := &format.PageHeader{\n\t\tType:                 c.dataPageType,\n\t\tUncompressedPageSize: int32(uncompressedPageSize),\n\t\tCompressedPageSize:   int32(buf.size()),\n\t\tCRC:                  int32(buf.crc32()),\n\t}\n\n\tnumRows := page.NumRows()\n\tnumNulls := page.NumNulls()\n\tswitch c.dataPageType {\n\tcase format.DataPage:\n\t\tpageHeader.DataPageHeader = &format.DataPageHeader{\n\t\t\tNumValues:               int32(numValues),\n\t\t\tEncoding:                c.encoding.Encoding(),\n\t\t\tDefinitionLevelEncoding: format.RLE,\n\t\t\tRepetitionLevelEncoding: format.RLE,\n\t\t\tStatistics:              statistics,\n\t\t}\n\tcase format.DataPageV2:\n\t\tpageHeader.DataPageHeaderV2 = &format.DataPageHeaderV2{\n\t\t\tNumValues:                  int32(numValues),\n\t\t\tNumNulls:                   int32(numNulls),\n\t\t\tNumRows:                    int32(numRows),\n\t\t\tEncoding:                   
c.encoding.Encoding(),\n\t\t\tDefinitionLevelsByteLength: int32(len(buf.definitions)),\n\t\t\tRepetitionLevelsByteLength: int32(len(buf.repetitions)),\n\t\t\tIsCompressed:               &c.isCompressed,\n\t\t\tStatistics:                 statistics,\n\t\t}\n\t}\n\n\tbuf.header.Reset()\n\tif err := c.header.encoder.Encode(pageHeader); err != nil {\n\t\treturn 0, err\n\t}\n\n\tsize := int64(buf.header.Len()) +\n\t\tint64(len(buf.repetitions)) +\n\t\tint64(len(buf.definitions)) +\n\t\tint64(len(buf.page))\n\n\terr := c.writePageTo(size, func(output io.Writer) (written int64, err error) {\n\t\tfor _, data := range [...][]byte{\n\t\t\tbuf.header.Bytes(),\n\t\t\tbuf.repetitions,\n\t\t\tbuf.definitions,\n\t\t\tbuf.page,\n\t\t} {\n\t\t\twn, err := output.Write(data)\n\t\t\twritten += int64(wn)\n\t\t\tif err != nil {\n\t\t\t\treturn written, err\n\t\t\t}\n\t\t}\n\t\treturn written, nil\n\t})\n\tif err != nil {\n\t\treturn 0, err\n\t}\n\n\tc.recordPageStats(int32(buf.header.Len()), pageHeader, page)\n\treturn numValues, nil\n}\n\nfunc (c *writerColumn) writeDictionaryPage(output io.Writer, dict Dictionary) (err error) {\n\tbuf := c.buffers\n\tbuf.reset()\n\n\tif err := buf.encode(dict.Page(), &Plain); err != nil {\n\t\treturn fmt.Errorf(\"writing parquet dictionary page: %w\", err)\n\t}\n\n\tuncompressedPageSize := buf.size()\n\tif isCompressed(c.compression) {\n\t\tif err := buf.compress(c.compression); err != nil {\n\t\t\treturn fmt.Errorf(\"compressing parquet dictionary page: %w\", err)\n\t\t}\n\t}\n\n\tpageHeader := &format.PageHeader{\n\t\tType:                 format.DictionaryPage,\n\t\tUncompressedPageSize: int32(uncompressedPageSize),\n\t\tCompressedPageSize:   int32(buf.size()),\n\t\tCRC:                  int32(buf.crc32()),\n\t\tDictionaryPageHeader: &format.DictionaryPageHeader{\n\t\t\tNumValues: int32(dict.Len()),\n\t\t\tEncoding:  format.Plain,\n\t\t\tIsSorted:  false,\n\t\t},\n\t}\n\n\theader := &c.buffers.header\n\theader.Reset()\n\tif err := 
c.header.encoder.Encode(pageHeader); err != nil {\n\t\treturn err\n\t}\n\tif _, err := output.Write(header.Bytes()); err != nil {\n\t\treturn err\n\t}\n\tif _, err := output.Write(buf.page); err != nil {\n\t\treturn err\n\t}\n\tc.recordPageStats(int32(header.Len()), pageHeader, nil)\n\treturn nil\n}\n\nfunc (w *writerColumn) writePageToFilter(page Page) (err error) {\n\tpageType := page.Type()\n\tpageData := page.Data()\n\tw.filter, err = pageType.Encode(w.filter, pageData, w.columnFilter.Encoding())\n\treturn err\n}\n\nfunc (c *writerColumn) writePageTo(size int64, writeTo func(io.Writer) (int64, error)) error {\n\tbuffer := c.pool.GetBuffer()\n\tdefer func() {\n\t\tif buffer != nil {\n\t\t\tc.pool.PutBuffer(buffer)\n\t\t}\n\t}()\n\twritten, err := writeTo(buffer)\n\tif err != nil {\n\t\treturn err\n\t}\n\tif written != size {\n\t\treturn fmt.Errorf(\"writing parquet column page expected %dB but got %dB: %w\", size, written, io.ErrShortWrite)\n\t}\n\toffset, err := buffer.Seek(0, io.SeekStart)\n\tif err != nil {\n\t\treturn err\n\t}\n\tif offset != 0 {\n\t\treturn fmt.Errorf(\"resetting parquet page buffer to the start expected offset zero but got %d\", offset)\n\t}\n\tc.pages, buffer = append(c.pages, buffer), nil\n\treturn nil\n}\n\nfunc (c *writerColumn) makePageStatistics(page Page) format.Statistics {\n\tnumNulls := page.NumNulls()\n\tminValue, maxValue, _ := page.Bounds()\n\tminValueBytes := minValue.Bytes()\n\tmaxValueBytes := maxValue.Bytes()\n\treturn format.Statistics{\n\t\tMin:       minValueBytes, // deprecated\n\t\tMax:       maxValueBytes, // deprecated\n\t\tNullCount: numNulls,\n\t\tMinValue:  minValueBytes,\n\t\tMaxValue:  maxValueBytes,\n\t}\n}\n\nfunc (c *writerColumn) recordPageStats(headerSize int32, header *format.PageHeader, page Page) {\n\tuncompressedSize := headerSize + header.UncompressedPageSize\n\tcompressedSize := headerSize + header.CompressedPageSize\n\n\tif page != nil {\n\t\tnumNulls := page.NumNulls()\n\t\tnumValues := 
page.NumValues()\n\t\tminValue, maxValue, pageHasBounds := page.Bounds()\n\t\tc.columnIndex.IndexPage(numValues, numNulls, minValue, maxValue)\n\t\tc.columnChunk.MetaData.NumValues += numValues\n\t\tc.columnChunk.MetaData.Statistics.NullCount += numNulls\n\n\t\tif pageHasBounds {\n\t\t\tvar existingMaxValue, existingMinValue Value\n\n\t\t\tif c.columnChunk.MetaData.Statistics.MaxValue != nil && c.columnChunk.MetaData.Statistics.MinValue != nil {\n\t\t\t\texistingMaxValue = c.columnType.Kind().Value(c.columnChunk.MetaData.Statistics.MaxValue)\n\t\t\t\texistingMinValue = c.columnType.Kind().Value(c.columnChunk.MetaData.Statistics.MinValue)\n\t\t\t}\n\n\t\t\tif existingMaxValue.isNull() || c.columnType.Compare(maxValue, existingMaxValue) > 0 {\n\t\t\t\tc.columnChunk.MetaData.Statistics.MaxValue = maxValue.Bytes()\n\t\t\t}\n\n\t\t\tif existingMinValue.isNull() || c.columnType.Compare(minValue, existingMinValue) < 0 {\n\t\t\t\tc.columnChunk.MetaData.Statistics.MinValue = minValue.Bytes()\n\t\t\t}\n\t\t}\n\n\t\tc.offsetIndex.PageLocations = append(c.offsetIndex.PageLocations, format.PageLocation{\n\t\t\tOffset:             c.columnChunk.MetaData.TotalCompressedSize,\n\t\t\tCompressedPageSize: compressedSize,\n\t\t\tFirstRowIndex:      c.numRows,\n\t\t})\n\n\t\tc.numRows += page.NumRows()\n\t}\n\n\tpageType := header.Type\n\tencoding := format.Encoding(-1)\n\tswitch pageType {\n\tcase format.DataPageV2:\n\t\tencoding = header.DataPageHeaderV2.Encoding\n\tcase format.DataPage:\n\t\tencoding = header.DataPageHeader.Encoding\n\tcase format.DictionaryPage:\n\t\tencoding = header.DictionaryPageHeader.Encoding\n\t}\n\n\tc.columnChunk.MetaData.TotalUncompressedSize += int64(uncompressedSize)\n\tc.columnChunk.MetaData.TotalCompressedSize += int64(compressedSize)\n\tc.columnChunk.MetaData.EncodingStats = addPageEncodingStats(c.columnChunk.MetaData.EncodingStats, format.PageEncodingStats{\n\t\tPageType: pageType,\n\t\tEncoding: encoding,\n\t\tCount:    1,\n\t})\n}\n\nfunc 
addEncoding(encodings []format.Encoding, add format.Encoding) []format.Encoding {\n\tfor _, enc := range encodings {\n\t\tif enc == add {\n\t\t\treturn encodings\n\t\t}\n\t}\n\treturn append(encodings, add)\n}\n\nfunc addPageEncodingStats(stats []format.PageEncodingStats, pages ...format.PageEncodingStats) []format.PageEncodingStats {\naddPages:\n\tfor _, add := range pages {\n\t\tfor i, st := range stats {\n\t\t\tif st.PageType == add.PageType && st.Encoding == add.Encoding {\n\t\t\t\tstats[i].Count += add.Count\n\t\t\t\tcontinue addPages\n\t\t\t}\n\t\t}\n\t\tstats = append(stats, add)\n\t}\n\treturn stats\n}\n\nfunc sortPageEncodings(encodings []format.Encoding) {\n\tsort.Slice(encodings, func(i, j int) bool {\n\t\treturn encodings[i] < encodings[j]\n\t})\n}\n\nfunc sortPageEncodingStats(stats []format.PageEncodingStats) {\n\tsort.Slice(stats, func(i, j int) bool {\n\t\ts1 := &stats[i]\n\t\ts2 := &stats[j]\n\t\tif s1.PageType != s2.PageType {\n\t\t\treturn s1.PageType < s2.PageType\n\t\t}\n\t\treturn s1.Encoding < s2.Encoding\n\t})\n}\n\ntype offsetTrackingWriter struct {\n\twriter io.Writer\n\toffset int64\n}\n\nfunc (w *offsetTrackingWriter) Reset(writer io.Writer) {\n\tw.writer = writer\n\tw.offset = 0\n}\n\nfunc (w *offsetTrackingWriter) Write(b []byte) (int, error) {\n\tn, err := w.writer.Write(b)\n\tw.offset += int64(n)\n\treturn n, err\n}\n\nfunc (w *offsetTrackingWriter) WriteString(s string) (int, error) {\n\tn, err := io.WriteString(w.writer, s)\n\tw.offset += int64(n)\n\treturn n, err\n}\n\nfunc (w *offsetTrackingWriter) ReadFrom(r io.Reader) (int64, error) {\n\t// io.Copy will make use of io.ReaderFrom if w.writer implements it.\n\tn, err := io.Copy(w.writer, r)\n\tw.offset += n\n\treturn n, err\n}\n\nvar (\n\t_ RowWriterWithSchema = (*Writer)(nil)\n\t_ RowReaderFrom       = (*Writer)(nil)\n\t_ RowGroupWriter      = (*Writer)(nil)\n\n\t_ RowWriter   = (*writer)(nil)\n\t_ ValueWriter = (*writer)(nil)\n\n\t_ ValueWriter = (*writerColumn)(nil)\n\n\t_ 
io.ReaderFrom   = (*offsetTrackingWriter)(nil)\n\t_ io.StringWriter = (*offsetTrackingWriter)(nil)\n)\n"
  },
  {
    "path": "writer_go18.go",
    "content": "//go:build go1.18\n\npackage parquet\n\nimport (\n\t\"io\"\n\t\"reflect\"\n)\n\n// GenericWriter is similar to a Writer but uses a type parameter to define the\n// Go type representing the schema of rows being written.\n//\n// Using this type over Writer has multiple advantages:\n//\n//   - By leveraging type information, the Go compiler can provide greater\n//     guarantees that the code is correct. For example, the parquet.Writer.Write\n//     method accepts an argument of type interface{}, which delays type checking\n//     until runtime. The parquet.GenericWriter[T].Write method ensures at\n//     compile time that the values it receives will be of type T, reducing the\n//     risk of introducing errors.\n//\n//   - Since type information is known at compile time, the implementation of\n//     parquet.GenericWriter[T] can make safe assumptions, removing the need for\n//     runtime validation of how the parameters are passed to its methods.\n//     Optimizations relying on type information are more effective, some of the\n//     writer's state can be precomputed at initialization, which was not possible\n//     with parquet.Writer.\n//\n//   - The parquet.GenericWriter[T].Write method uses a data-oriented design,\n//     accepting an slice of T instead of a single value, creating more\n//     opportunities to amortize the runtime cost of abstractions.\n//     This optimization is not available for parquet.Writer because its Write\n//     method's argument would be of type []interface{}, which would require\n//     conversions back and forth from concrete types to empty interfaces (since\n//     a []T cannot be interpreted as []interface{} in Go), would make the API\n//     more difficult to use and waste compute resources in the type conversions,\n//     defeating the purpose of the optimization in the first place.\n//\n// Note that this type is only available when compiling with Go 1.18 or later.\ntype GenericWriter[T any] struct {\n\t// At 
this time GenericWriter is expressed in terms of Writer to reuse the\n\t// underlying logic. In the future, and if we accepted to break backward\n\t// compatibility on the Write method, we could modify Writer to be an alias\n\t// to GenericWriter with:\n\t//\n\t//\ttype Writer = GenericWriter[any]\n\t//\n\tbase Writer\n\t// This function writes rows of type T to the writer, it gets generated by\n\t// the NewGenericWriter function based on the type T and the underlying\n\t// schema of the parquet file.\n\twrite writeFunc[T]\n\t// This field is used to leverage the optimized writeRowsFunc algorithms.\n\tcolumns []ColumnBuffer\n}\n\n// NewGenericWriter is like NewWriter but returns a GenericWriter[T] suited to\n// write rows of Go type T.\n//\n// The type parameter T should be a map, struct, or any. Any other types will\n// cause a panic at runtime. Type checking is a lot more effective when the\n// generic parameter is a struct type, using map and interface types is somewhat\n// similar to using a Writer.\n//\n// If the option list may explicitly declare a schema, it must be compatible\n// with the schema generated from T.\n//\n// Sorting columns may be set on the writer to configure the generated row\n// groups metadata. However, rows are always written in the order they were\n// seen, no reordering is performed, the writer expects the application to\n// ensure proper correlation between the order of rows and the list of sorting\n// columns. 
See SortingWriter[T] for a writer which handles reordering rows\n// based on the configured sorting columns.\nfunc NewGenericWriter[T any](output io.Writer, options ...WriterOption) *GenericWriter[T] {\n\tconfig, err := NewWriterConfig(options...)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\tschema := config.Schema\n\tt := typeOf[T]()\n\n\tif schema == nil && t != nil {\n\t\tschema = schemaOf(dereference(t))\n\t\tconfig.Schema = schema\n\t}\n\n\tif config.Schema == nil {\n\t\tpanic(\"generic writer must be instantiated with schema or concrete type.\")\n\t}\n\n\treturn &GenericWriter[T]{\n\t\tbase: Writer{\n\t\t\toutput: output,\n\t\t\tconfig: config,\n\t\t\tschema: schema,\n\t\t\twriter: newWriter(output, config),\n\t\t},\n\t\twrite: writeFuncOf[T](t, config.Schema),\n\t}\n}\n\ntype writeFunc[T any] func(*GenericWriter[T], []T) (int, error)\n\nfunc writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] {\n\tif t == nil {\n\t\treturn (*GenericWriter[T]).writeAny\n\t}\n\tswitch t.Kind() {\n\tcase reflect.Interface, reflect.Map:\n\t\treturn (*GenericWriter[T]).writeRows\n\n\tcase reflect.Struct:\n\t\treturn makeWriteFunc[T](t, schema)\n\n\tcase reflect.Pointer:\n\t\tif e := t.Elem(); e.Kind() == reflect.Struct {\n\t\t\treturn makeWriteFunc[T](t, schema)\n\t\t}\n\t}\n\tpanic(\"cannot create writer for values of type \" + t.String())\n}\n\nfunc makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] {\n\twriteRows := writeRowsFuncOf(t, schema, nil)\n\treturn func(w *GenericWriter[T], rows []T) (n int, err error) {\n\t\tif w.columns == nil {\n\t\t\tw.columns = make([]ColumnBuffer, len(w.base.writer.columns))\n\t\t\tfor i, c := range w.base.writer.columns {\n\t\t\t\t// These fields are usually lazily initialized when writing rows,\n\t\t\t\t// we need them to exist now tho.\n\t\t\t\tc.columnBuffer = c.newColumnBuffer()\n\t\t\t\tw.columns[i] = c.columnBuffer\n\t\t\t}\n\t\t}\n\t\terr = writeRows(w.columns, makeArrayOf(rows), columnLevels{})\n\t\tif err == 
nil {\n\t\t\tn = len(rows)\n\t\t}\n\t\treturn n, err\n\t}\n}\n\nfunc (w *GenericWriter[T]) Close() error {\n\treturn w.base.Close()\n}\n\nfunc (w *GenericWriter[T]) Flush() error {\n\treturn w.base.Flush()\n}\n\nfunc (w *GenericWriter[T]) Reset(output io.Writer) {\n\tw.base.Reset(output)\n}\n\nfunc (w *GenericWriter[T]) Write(rows []T) (int, error) {\n\treturn w.base.writer.writeRows(len(rows), func(i, j int) (int, error) {\n\t\tn, err := w.write(w, rows[i:j:j])\n\t\tif err != nil {\n\t\t\treturn n, err\n\t\t}\n\n\t\tfor _, c := range w.base.writer.columns {\n\t\t\tif c.columnBuffer.Size() >= int64(c.bufferSize) {\n\t\t\t\tif err := c.flush(); err != nil {\n\t\t\t\t\treturn n, err\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\treturn n, nil\n\t})\n}\n\nfunc (w *GenericWriter[T]) WriteRows(rows []Row) (int, error) {\n\treturn w.base.WriteRows(rows)\n}\n\nfunc (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) {\n\treturn w.base.WriteRowGroup(rowGroup)\n}\n\n// SetKeyValueMetadata sets a key/value pair in the Parquet file metadata.\n//\n// Keys are assumed to be unique, if the same key is repeated multiple times the\n// last value is retained. While the parquet format does not require unique keys,\n// this design decision was made to optimize for the most common use case where\n// applications leverage this extension mechanism to associate single values to\n// keys. This may create incompatibilities with other parquet libraries, or may\n// cause some key/value pairs to be lost when open parquet files written with\n// repeated keys. 
We can revisit this decision if it ever becomes a blocker.\nfunc (w *GenericWriter[T]) SetKeyValueMetadata(key, value string) {\n\tw.base.SetKeyValueMetadata(key, value)\n}\n\nfunc (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) {\n\treturn w.base.ReadRowsFrom(rows)\n}\n\nfunc (w *GenericWriter[T]) Schema() *Schema {\n\treturn w.base.Schema()\n}\n\nfunc (w *GenericWriter[T]) writeRows(rows []T) (int, error) {\n\tif cap(w.base.rowbuf) < len(rows) {\n\t\tw.base.rowbuf = make([]Row, len(rows))\n\t} else {\n\t\tw.base.rowbuf = w.base.rowbuf[:len(rows)]\n\t}\n\tdefer clearRows(w.base.rowbuf)\n\n\tschema := w.base.Schema()\n\tfor i := range rows {\n\t\tw.base.rowbuf[i] = schema.Deconstruct(w.base.rowbuf[i], &rows[i])\n\t}\n\n\treturn w.base.WriteRows(w.base.rowbuf)\n}\n\nfunc (w *GenericWriter[T]) writeAny(rows []T) (n int, err error) {\n\tfor i := range rows {\n\t\tif err = w.base.Write(rows[i]); err != nil {\n\t\t\treturn n, err\n\t\t}\n\t\tn++\n\t}\n\treturn n, nil\n}\n\nvar (\n\t_ RowWriterWithSchema = (*GenericWriter[any])(nil)\n\t_ RowReaderFrom       = (*GenericWriter[any])(nil)\n\t_ RowGroupWriter      = (*GenericWriter[any])(nil)\n\n\t_ RowWriterWithSchema = (*GenericWriter[struct{}])(nil)\n\t_ RowReaderFrom       = (*GenericWriter[struct{}])(nil)\n\t_ RowGroupWriter      = (*GenericWriter[struct{}])(nil)\n\n\t_ RowWriterWithSchema = (*GenericWriter[map[struct{}]struct{}])(nil)\n\t_ RowReaderFrom       = (*GenericWriter[map[struct{}]struct{}])(nil)\n\t_ RowGroupWriter      = (*GenericWriter[map[struct{}]struct{}])(nil)\n)\n"
  },
  {
    "path": "writer_go18_test.go",
    "content": "//go:build go1.18\n\npackage parquet_test\n\nimport (\n\t\"bytes\"\n\t\"io\"\n\t\"math/rand\"\n\t\"reflect\"\n\t\"testing\"\n\n\t\"github.com/segmentio/parquet-go\"\n)\n\nfunc BenchmarkGenericWriter(b *testing.B) {\n\tbenchmarkGenericWriter[benchmarkRowType](b)\n\tbenchmarkGenericWriter[booleanColumn](b)\n\tbenchmarkGenericWriter[int32Column](b)\n\tbenchmarkGenericWriter[int64Column](b)\n\tbenchmarkGenericWriter[floatColumn](b)\n\tbenchmarkGenericWriter[doubleColumn](b)\n\tbenchmarkGenericWriter[byteArrayColumn](b)\n\tbenchmarkGenericWriter[fixedLenByteArrayColumn](b)\n\tbenchmarkGenericWriter[stringColumn](b)\n\tbenchmarkGenericWriter[indexedStringColumn](b)\n\tbenchmarkGenericWriter[uuidColumn](b)\n\tbenchmarkGenericWriter[timeColumn](b)\n\tbenchmarkGenericWriter[timeInMillisColumn](b)\n\tbenchmarkGenericWriter[mapColumn](b)\n\tbenchmarkGenericWriter[decimalColumn](b)\n\tbenchmarkGenericWriter[contact](b)\n\tbenchmarkGenericWriter[paddedBooleanColumn](b)\n\tbenchmarkGenericWriter[optionalInt32Column](b)\n\tbenchmarkGenericWriter[repeatedInt32Column](b)\n}\n\nfunc benchmarkGenericWriter[Row generator[Row]](b *testing.B) {\n\tvar model Row\n\tb.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {\n\t\tprng := rand.New(rand.NewSource(0))\n\t\trows := make([]Row, benchmarkNumRows)\n\t\tfor i := range rows {\n\t\t\trows[i] = rows[i].generate(prng)\n\t\t}\n\n\t\tb.Run(\"go1.17\", func(b *testing.B) {\n\t\t\twriter := parquet.NewWriter(io.Discard, parquet.SchemaOf(rows[0]))\n\t\t\ti := 0\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tfor j := 0; j < benchmarkRowsPerStep; j++ {\n\t\t\t\t\tif err := writer.Write(&rows[i]); err != nil {\n\t\t\t\t\t\tb.Fatal(err)\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\ti += benchmarkRowsPerStep\n\t\t\t\ti %= benchmarkNumRows\n\n\t\t\t\tif i == 0 {\n\t\t\t\t\twriter.Close()\n\t\t\t\t\twriter.Reset(io.Discard)\n\t\t\t\t}\n\t\t\t\treturn benchmarkRowsPerStep\n\t\t\t})\n\t\t})\n\n\t\tb.Run(\"go1.18\", func(b *testing.B) 
{\n\t\t\twriter := parquet.NewGenericWriter[Row](io.Discard)\n\t\t\ti := 0\n\t\t\tbenchmarkRowsPerSecond(b, func() int {\n\t\t\t\tn, err := writer.Write(rows[i : i+benchmarkRowsPerStep])\n\t\t\t\tif err != nil {\n\t\t\t\t\tb.Fatal(err)\n\t\t\t\t}\n\n\t\t\t\ti += benchmarkRowsPerStep\n\t\t\t\ti %= benchmarkNumRows\n\n\t\t\t\tif i == 0 {\n\t\t\t\t\twriter.Close()\n\t\t\t\t\twriter.Reset(io.Discard)\n\t\t\t\t}\n\t\t\t\treturn n\n\t\t\t})\n\t\t})\n\t})\n}\n\nfunc TestIssue272(t *testing.T) {\n\ttype T2 struct {\n\t\tX string `parquet:\",dict,optional\"`\n\t}\n\n\ttype T1 struct {\n\t\tTA *T2\n\t\tTB *T2\n\t}\n\n\ttype T struct {\n\t\tT1 *T1\n\t}\n\n\tconst nRows = 1\n\n\trow := T{\n\t\tT1: &T1{\n\t\t\tTA: &T2{\n\t\t\t\tX: \"abc\",\n\t\t\t},\n\t\t},\n\t}\n\n\trows := make([]T, nRows)\n\tfor i := range rows {\n\t\trows[i] = row\n\t}\n\n\tb := new(bytes.Buffer)\n\tw := parquet.NewGenericWriter[T](b)\n\n\tif _, err := w.Write(rows); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif err := w.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf := bytes.NewReader(b.Bytes())\n\tr := parquet.NewGenericReader[T](f)\n\n\tparquetRows := make([]parquet.Row, nRows)\n\tn, err := r.ReadRows(parquetRows)\n\tif err != nil && err != io.EOF {\n\t\tt.Fatal(err)\n\t}\n\tif n != nRows {\n\t\tt.Fatalf(\"wrong number of rows read: want=%d got=%d\", nRows, n)\n\t}\n\tfor _, r := range parquetRows {\n\t\tif d := r[0].DefinitionLevel(); d != 3 {\n\t\t\tt.Errorf(\"wrong definition level for column 0: %d\", d)\n\t\t}\n\t\tif d := r[1].DefinitionLevel(); d != 1 {\n\t\t\tt.Errorf(\"wrong definition level for column 1: %d\", d)\n\t\t}\n\t}\n}\n\nfunc TestIssue279(t *testing.T) {\n\ttype T2 struct {\n\t\tId   int    `parquet:\",plain,optional\"`\n\t\tName string `parquet:\",plain,optional\"`\n\t}\n\n\ttype T1 struct {\n\t\tTA []*T2\n\t}\n\n\ttype T struct {\n\t\tT1 *T1\n\t}\n\n\tconst nRows = 1\n\n\trow := T{\n\t\tT1: &T1{\n\t\t\tTA: []*T2{\n\t\t\t\t{\n\t\t\t\t\tId:   43,\n\t\t\t\t\tName: 
\"john\",\n\t\t\t\t},\n\t\t\t},\n\t\t},\n\t}\n\n\trows := make([]T, nRows)\n\tfor i := range rows {\n\t\trows[i] = row\n\t}\n\n\tb := new(bytes.Buffer)\n\tw := parquet.NewGenericWriter[T](b)\n\n\tif _, err := w.Write(rows); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif err := w.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf := bytes.NewReader(b.Bytes())\n\tr := parquet.NewGenericReader[T](f)\n\n\tparquetRows := make([]parquet.Row, nRows)\n\tn, err := r.ReadRows(parquetRows)\n\tif err != nil && err != io.EOF {\n\t\tt.Fatal(err)\n\t}\n\tif n != nRows {\n\t\tt.Fatalf(\"wrong number of rows read: want=%d got=%d\", nRows, n)\n\t}\n\tfor _, r := range parquetRows {\n\t\tif d := r[0].DefinitionLevel(); d != 3 {\n\t\t\tt.Errorf(\"wrong definition level for column 0: %d\", d)\n\t\t}\n\t\tif d := r[1].DefinitionLevel(); d != 3 {\n\t\t\tt.Errorf(\"wrong definition level for column 1: %d\", d)\n\t\t}\n\t}\n}\n\nfunc TestIssue302(t *testing.T) {\n\ttests := []struct {\n\t\tname string\n\t\tfn   func(t *testing.T)\n\t}{\n\t\t{\n\t\t\tname: \"SimpleMap\",\n\t\t\tfn: func(t *testing.T) {\n\t\t\t\ttype M map[string]int\n\n\t\t\t\ttype T struct {\n\t\t\t\t\tM M `parquet:\",\"`\n\t\t\t\t}\n\n\t\t\t\tb := new(bytes.Buffer)\n\t\t\t\t_ = parquet.NewGenericWriter[T](b)\n\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tname: \"MapWithValueTag\",\n\t\t\tfn: func(t *testing.T) {\n\t\t\t\ttype M map[string]int\n\n\t\t\t\ttype T struct {\n\t\t\t\t\tM M `parquet:\",\" parquet-value:\",zstd\"`\n\t\t\t\t}\n\n\t\t\t\tb := new(bytes.Buffer)\n\t\t\t\t_ = parquet.NewGenericWriter[T](b)\n\n\t\t\t},\n\t\t},\n\n\t\t{\n\t\t\tname: \"MapWithOptionalTag\",\n\t\t\tfn: func(t *testing.T) {\n\t\t\t\ttype M map[string]int\n\n\t\t\t\ttype T struct {\n\t\t\t\t\tM M `parquet:\",optional\"`\n\t\t\t\t}\n\n\t\t\t\tb := new(bytes.Buffer)\n\t\t\t\tw := parquet.NewGenericWriter[T](b)\n\t\t\t\texpect := []T{\n\t\t\t\t\t{\n\t\t\t\t\t\tM: M{\n\t\t\t\t\t\t\t\"Holden\": 1,\n\t\t\t\t\t\t\t\"Naomi\":  
2,\n\t\t\t\t\t\t},\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tM: nil,\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tM: M{\n\t\t\t\t\t\t\t\"Naomi\":  1,\n\t\t\t\t\t\t\t\"Holden\": 2,\n\t\t\t\t\t\t},\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t\t_, err := w.Write(expect)\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\tif err = w.Close(); err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\n\t\t\t\tbufReader := bytes.NewReader(b.Bytes())\n\t\t\t\tr := parquet.NewGenericReader[T](bufReader)\n\t\t\t\tvalues := make([]T, 3)\n\t\t\t\t_, err = r.Read(values)\n\t\t\t\tif !reflect.DeepEqual(expect, values) {\n\t\t\t\t\tt.Fatalf(\"values do not match.\\n\\texpect: %v\\n\\tactual: %v\", expect, values)\n\t\t\t\t}\n\t\t\t},\n\t\t},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(test.name, test.fn)\n\t}\n}\n\nfunc TestIssue347Writer(t *testing.T) {\n\ttype TestType struct {\n\t\tKey int\n\t}\n\n\tb := new(bytes.Buffer)\n\t// instantiating with concrete type shouldn't panic\n\t_ = parquet.NewGenericWriter[TestType](b)\n\n\t// instantiating with schema and interface type parameter shouldn't panic\n\tschema := parquet.SchemaOf(TestType{})\n\t_ = parquet.NewGenericWriter[any](b, schema)\n\n\tdefer func() {\n\t\tif r := recover(); r == nil {\n\t\t\tt.Errorf(\"instantiating generic buffer without schema and with interface \" +\n\t\t\t\t\"type parameter should panic\")\n\t\t}\n\t}()\n\t_ = parquet.NewGenericWriter[any](b)\n}\n\nfunc TestIssue375(t *testing.T) {\n\ttype Row struct{ FirstName, LastName string }\n\n\toutput := new(bytes.Buffer)\n\twriter := parquet.NewGenericWriter[Row](output, parquet.MaxRowsPerRowGroup(10))\n\n\trows := make([]Row, 100)\n\tfor i := range rows {\n\t\trows[i] = Row{\n\t\t\tFirstName: \"0123456789\"[i%10 : i%10+1],\n\t\t\tLastName:  \"foo\",\n\t\t}\n\t}\n\n\tn, err := writer.Write(rows)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif n != len(rows) {\n\t\tt.Fatal(\"wrong number of rows written:\", n)\n\t}\n\n\tif err := writer.Close(); err != nil 
{\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\trowGroups := f.RowGroups()\n\tif len(rowGroups) != 10 {\n\t\tt.Errorf(\"wrong number of row groups in parquet file: want=10 got=%d\", len(rowGroups))\n\t}\n}\n\nfunc TestGenericSetKeyValueMetadata(t *testing.T) {\n\ttestKey := \"test-key\"\n\ttestValue := \"test-value\"\n\n\ttype Row struct{ FirstName, LastName string }\n\n\toutput := new(bytes.Buffer)\n\twriter := parquet.NewGenericWriter[Row](output, parquet.MaxRowsPerRowGroup(10))\n\n\trows := []Row{\n\t\t{FirstName: \"First\", LastName: \"Last\"},\n\t}\n\n\t_, err := writer.Write(rows)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\twriter.SetKeyValueMetadata(testKey, testValue)\n\n\terr = writer.Close()\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tvalue, ok := f.Lookup(testKey)\n\tif !ok {\n\t\tt.Fatalf(\"key/value metadata should have included %q\", testKey)\n\t}\n\tif value != testValue {\n\t\tt.Errorf(\"expected %q, got %q\", testValue, value)\n\t}\n}\n"
  },
  {
    "path": "writer_test.go",
    "content": "package parquet_test\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"os\"\n\t\"os/exec\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/hexops/gotextdiff\"\n\t\"github.com/hexops/gotextdiff/myers\"\n\t\"github.com/hexops/gotextdiff/span\"\n\n\t\"github.com/segmentio/parquet-go\"\n\t\"github.com/segmentio/parquet-go/compress\"\n)\n\nconst (\n\tv1 = 1\n\tv2 = 2\n)\n\nfunc scanParquetFile(f *os.File) error {\n\ts, err := f.Stat()\n\tif err != nil {\n\t\treturn err\n\t}\n\n\tp, err := parquet.OpenFile(f, s.Size())\n\tif err != nil {\n\t\treturn err\n\t}\n\n\treturn scanParquetValues(p.Root())\n}\n\nfunc scanParquetValues(col *parquet.Column) error {\n\treturn forEachColumnValue(col, func(leaf *parquet.Column, value parquet.Value) error {\n\t\tfmt.Printf(\"%s > %+v\\n\", strings.Join(leaf.Path(), \".\"), value)\n\t\treturn nil\n\t})\n}\n\nfunc generateParquetFile(rows rows, options ...parquet.WriterOption) ([]byte, error) {\n\ttmp, err := os.CreateTemp(\"/tmp\", \"*.parquet\")\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\tdefer tmp.Close()\n\tpath := tmp.Name()\n\tdefer os.Remove(path)\n\t// fmt.Println(path)\n\n\twriterOptions := []parquet.WriterOption{parquet.PageBufferSize(20)}\n\twriterOptions = append(writerOptions, options...)\n\n\tif err := writeParquetFile(tmp, rows, writerOptions...); err != nil {\n\t\treturn nil, err\n\t}\n\n\tif err := scanParquetFile(tmp); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn parquetTools(\"dump\", path)\n}\n\ntype firstAndLastName struct {\n\tFirstName string `parquet:\"first_name,dict,zstd\"`\n\tLastName  string `parquet:\"last_name,delta,zstd\"`\n}\n\ntype timeseries struct {\n\tName      string  `parquet:\"name,dict\"`\n\tTimestamp int64   `parquet:\"timestamp,delta\"`\n\tValue     float64 `parquet:\"value\"`\n}\n\ntype event struct {\n\tName     string  `parquet:\"name,dict\"`\n\tType     string  `parquet:\"-\"`\n\tValue    float64 `parquet:\"value\"`\n\tCategory string  
`parquet:\"-\"`\n}\n\nvar writerTests = []struct {\n\tscenario string\n\tversion  int\n\tcodec    compress.Codec\n\trows     []interface{}\n\tdump     string\n}{\n\t{\n\t\tscenario: \"page v1 with dictionary encoding\",\n\t\tversion:  v1,\n\t\trows: []interface{}{\n\t\t\t&firstAndLastName{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t&firstAndLastName{FirstName: \"Leia\", LastName: \"Skywalker\"},\n\t\t\t&firstAndLastName{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t},\n\t\tdump: `row group 0\n--------------------------------------------------------------------------------\nfirst_name:  BINARY ZSTD DO:4 FPO:55 SZ:90/72/0.80 VC:3 ENC:RLE_DICTIONARY,PLAIN ST:[min: Han, max: Luke, num_nulls not defined]\nlast_name:   BINARY ZSTD DO:0 FPO:94 SZ:127/121/0.95 VC:3 ENC:DELTA_BYTE_ARRAY ST:[min: Skywalker, max: Solo, num_nulls not defined]\n\n    first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN\n    ----------------------------------------------------------------------------\n    page 0:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:7 VC:3\n\n    last_name TV=3 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:56 VC:2\n    page 1:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:19 VC:1\n\nBINARY first_name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:0 V:Han\nvalue 2: R:0 D:0 V:Leia\nvalue 3: R:0 D:0 V:Luke\n\nBINARY last_name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:0 V:Solo\nvalue 2: R:0 D:0 V:Skywalker\nvalue 3: R:0 D:0 V:Skywalker\n`,\n\t},\n\n\t{ // same as the previous test but uses page 
v2 where data pages aren't compressed\n\t\tscenario: \"page v2 with dictionary encoding\",\n\t\tversion:  v2,\n\t\trows: []interface{}{\n\t\t\t&firstAndLastName{FirstName: \"Han\", LastName: \"Solo\"},\n\t\t\t&firstAndLastName{FirstName: \"Leia\", LastName: \"Skywalker\"},\n\t\t\t&firstAndLastName{FirstName: \"Luke\", LastName: \"Skywalker\"},\n\t\t},\n\t\tdump: `row group 0\n--------------------------------------------------------------------------------\nfirst_name:  BINARY ZSTD DO:4 FPO:55 SZ:86/77/0.90 VC:3 ENC:PLAIN,RLE_DICTIONARY ST:[min: Han, max: Luke, num_nulls not defined]\nlast_name:   BINARY ZSTD DO:0 FPO:90 SZ:137/131/0.96 VC:3 ENC:DELTA_BYTE_ARRAY ST:[min: Skywalker, max: Solo, num_nulls not defined]\n\n    first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN\n    ----------------------------------------------------------------------------\n    page 0:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:7 VC:3\n\n    last_name TV=3 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2\n    page 1:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:19 VC:1\n\nBINARY first_name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:0 V:Han\nvalue 2: R:0 D:0 V:Leia\nvalue 3: R:0 D:0 V:Luke\n\nBINARY last_name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:0 V:Solo\nvalue 2: R:0 D:0 V:Skywalker\nvalue 3: R:0 D:0 V:Skywalker\n`,\n\t},\n\n\t{\n\t\tscenario: \"timeseries with delta encoding\",\n\t\tversion:  v2,\n\t\tcodec:    &parquet.Gzip,\n\t\trows: []interface{}{\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444033, Value: 
100},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444058, Value: 0},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444085, Value: 42},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444093, Value: 1},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444101, Value: 2},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444108, Value: 5},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444133, Value: 4},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444137, Value: 5},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444141, Value: 6},\n\t\t\ttimeseries{Name: \"http_request_total\", Timestamp: 1639444144, Value: 10},\n\t\t},\n\t\tdump: `row group 0\n--------------------------------------------------------------------------------\nname:       BINARY GZIP DO:4 FPO:70 SZ:126/101/0.80 VC:10 ENC:PLAIN,RLE_DICTIONARY ST:[min: http_request_total, max: http_request_total, num_nulls not defined]\ntimestamp:  INT64 GZIP DO:0 FPO:130 SZ:299/550/1.84 VC:10 ENC:DELTA_BINARY_PACKED ST:[min: 1639444033, max: 1639444144, num_nulls not defined]\nvalue:      DOUBLE GZIP DO:0 FPO:429 SZ:292/192/0.66 VC:10 ENC:PLAIN ST:[min: -0.0, max: 100.0, num_nulls not defined]\n\n    name TV=10 RL=0 DL=0 DS: 1 DE:PLAIN\n    ----------------------------------------------------------------------------\n    page 0:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5\n    page 1:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5\n\n    timestamp TV=10 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3\n    page 1:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3\n    page 2:                   
DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3\n    page 3:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:9 VC:1\n\n    value TV=10 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3\n    page 1:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3\n    page 2:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3\n    page 3:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:8 VC:1\n\nBINARY name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 10 ***\nvalue 1:  R:0 D:0 V:http_request_total\nvalue 2:  R:0 D:0 V:http_request_total\nvalue 3:  R:0 D:0 V:http_request_total\nvalue 4:  R:0 D:0 V:http_request_total\nvalue 5:  R:0 D:0 V:http_request_total\nvalue 6:  R:0 D:0 V:http_request_total\nvalue 7:  R:0 D:0 V:http_request_total\nvalue 8:  R:0 D:0 V:http_request_total\nvalue 9:  R:0 D:0 V:http_request_total\nvalue 10: R:0 D:0 V:http_request_total\n\nINT64 timestamp\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 10 ***\nvalue 1:  R:0 D:0 V:1639444033\nvalue 2:  R:0 D:0 V:1639444058\nvalue 3:  R:0 D:0 V:1639444085\nvalue 4:  R:0 D:0 V:1639444093\nvalue 5:  R:0 D:0 V:1639444101\nvalue 6:  R:0 D:0 V:1639444108\nvalue 7:  R:0 D:0 V:1639444133\nvalue 8:  R:0 D:0 V:1639444137\nvalue 9:  R:0 D:0 V:1639444141\nvalue 10: R:0 D:0 V:1639444144\n\nDOUBLE value\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 10 ***\nvalue 1:  R:0 D:0 V:100.0\nvalue 2:  R:0 D:0 V:0.0\nvalue 3:  R:0 D:0 V:42.0\nvalue 4:  R:0 D:0 V:1.0\nvalue 5:  R:0 D:0 V:2.0\nvalue 6:  R:0 D:0 V:5.0\nvalue 7: 
 R:0 D:0 V:4.0\nvalue 8:  R:0 D:0 V:5.0\nvalue 9:  R:0 D:0 V:6.0\nvalue 10: R:0 D:0 V:10.0\n`,\n\t},\n\n\t{\n\t\tscenario: \"example from the twitter blog (v1)\",\n\t\tversion:  v1,\n\t\trows: []interface{}{\n\t\t\tAddressBook{\n\t\t\t\tOwner: \"Julien Le Dem\",\n\t\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t\"555 666 1337\",\n\t\t\t\t},\n\t\t\t\tContacts: []Contact{\n\t\t\t\t\t{\n\t\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t\tAddressBook{\n\t\t\t\tOwner:             \"A. Nonymous\",\n\t\t\t\tOwnerPhoneNumbers: nil,\n\t\t\t},\n\t\t},\n\n\t\tdump: `row group 0\n--------------------------------------------------------------------------------\nowner:              BINARY ZSTD DO:0 FPO:4 SZ:81/73/0.90 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[min: A. Nonymous, max: Julien Le Dem, num_nulls not defined]\nownerPhoneNumbers:  BINARY GZIP DO:0 FPO:85 SZ:179/129/0.72 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 123 4567, max: 555 666 1337, num_nulls: 1]\ncontacts:\n.name:              BINARY UNCOMPRESSED DO:0 FPO:264 SZ:138/138/1.00 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: Chris Aniszczyk, max: Dmitriy Ryaboy, num_nulls: 1]\n.phoneNumber:       BINARY ZSTD DO:0 FPO:402 SZ:113/95/0.84 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 987 6543, max: 555 987 6543, num_nulls: 2]\n\n    owner TV=2 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:50 VC:2\n\n    ownerPhoneNumbers TV=3 RL=1 DL=1\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:64 VC:2\n    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY 
ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1\n\n    contacts.name TV=3 RL=1 DL=1\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:73 VC:2\n    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:17 VC:1\n\n    contacts.phoneNumber TV=3 RL=1 DL=2\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:33 VC:2\n    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1\n\nBINARY owner\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 2 ***\nvalue 1: R:0 D:0 V:Julien Le Dem\nvalue 2: R:0 D:0 V:A. Nonymous\n\nBINARY ownerPhoneNumbers\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:1 V:555 123 4567\nvalue 2: R:1 D:1 V:555 666 1337\nvalue 3: R:0 D:0 V:<null>\n\nBINARY contacts.name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:1 V:Dmitriy Ryaboy\nvalue 2: R:1 D:1 V:Chris Aniszczyk\nvalue 3: R:0 D:0 V:<null>\n\nBINARY contacts.phoneNumber\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:2 V:555 987 6543\nvalue 2: R:1 D:1 V:<null>\nvalue 3: R:0 D:0 V:<null>\n`,\n\t},\n\n\t{\n\t\tscenario: \"example from the twitter blog (v2)\",\n\t\tversion:  v2,\n\t\trows: []interface{}{\n\t\t\tAddressBook{\n\t\t\t\tOwner: \"Julien Le Dem\",\n\t\t\t\tOwnerPhoneNumbers: []string{\n\t\t\t\t\t\"555 123 4567\",\n\t\t\t\t\t\"555 666 
1337\",\n\t\t\t\t},\n\t\t\t\tContacts: []Contact{\n\t\t\t\t\t{\n\t\t\t\t\t\tName:        \"Dmitriy Ryaboy\",\n\t\t\t\t\t\tPhoneNumber: \"555 987 6543\",\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\tName: \"Chris Aniszczyk\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t},\n\t\t\tAddressBook{\n\t\t\t\tOwner:             \"A. Nonymous\",\n\t\t\t\tOwnerPhoneNumbers: nil,\n\t\t\t},\n\t\t},\n\n\t\tdump: `row group 0\n--------------------------------------------------------------------------------\nowner:              BINARY ZSTD DO:0 FPO:4 SZ:86/78/0.91 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[min: A. Nonymous, max: Julien Le Dem, num_nulls not defined]\nownerPhoneNumbers:  BINARY GZIP DO:0 FPO:90 SZ:172/122/0.71 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 123 4567, max: 555 666 1337, num_nulls: 1]\ncontacts:\n.name:              BINARY UNCOMPRESSED DO:0 FPO:262 SZ:132/132/1.00 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: Chris Aniszczyk, max: Dmitriy Ryaboy, num_nulls: 1]\n.phoneNumber:       BINARY ZSTD DO:0 FPO:394 SZ:108/90/0.83 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 987 6543, max: 555 987 6543, num_nulls: 2]\n\n    owner TV=2 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:50 VC:2\n\n    ownerPhoneNumbers TV=3 RL=1 DL=1\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2\n    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1\n\n    contacts.name TV=3 RL=1 DL=1\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:65 VC:2\n    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1\n\n    contacts.phoneNumber TV=3 
RL=1 DL=2\n    ----------------------------------------------------------------------------\n    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:25 VC:2\n    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1\n\nBINARY owner\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 2 ***\nvalue 1: R:0 D:0 V:Julien Le Dem\nvalue 2: R:0 D:0 V:A. Nonymous\n\nBINARY ownerPhoneNumbers\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:1 V:555 123 4567\nvalue 2: R:1 D:1 V:555 666 1337\nvalue 3: R:0 D:0 V:<null>\n\nBINARY contacts.name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:1 V:Dmitriy Ryaboy\nvalue 2: R:1 D:1 V:Chris Aniszczyk\nvalue 3: R:0 D:0 V:<null>\n\nBINARY contacts.phoneNumber\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 3 ***\nvalue 1: R:0 D:2 V:555 987 6543\nvalue 2: R:1 D:1 V:<null>\nvalue 3: R:0 D:0 V:<null>\n`,\n\t},\n\n\t{\n\t\tscenario: \"omit `-` fields\",\n\t\tversion:  v1,\n\t\trows: []interface{}{\n\t\t\t&event{Name: \"customer1\", Type: \"request\", Value: 42.0},\n\t\t\t&event{Name: \"customer2\", Type: \"access\", Value: 1.0},\n\t\t},\n\t\tdump: `row group 0\n--------------------------------------------------------------------------------\nname:   BINARY UNCOMPRESSED DO:4 FPO:49 SZ:73/73/1.00 VC:2 ENC:RLE_DICTIONARY,PLAIN ST:[min: customer1, max: customer2, num_nulls not defined]\nvalue:  DOUBLE UNCOMPRESSED DO:0 FPO:77 SZ:39/39/1.00 VC:2 ENC:PLAIN ST:[min: 1.0, max: 42.0, num_nulls not defined]\n\n    name TV=2 RL=0 DL=0 DS: 2 DE:PLAIN\n    ----------------------------------------------------------------------------\n    page 0:                  DLE:RLE 
RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[verified] SZ:5 VC:2\n\n    value TV=2 RL=0 DL=0\n    ----------------------------------------------------------------------------\n    page 0:                  DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] CRC:[verified] SZ:16 VC:2\n\nBINARY name\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 2 ***\nvalue 1: R:0 D:0 V:customer1\nvalue 2: R:0 D:0 V:customer2\n\nDOUBLE value\n--------------------------------------------------------------------------------\n*** row group 1 of 1, values 1 to 2 ***\nvalue 1: R:0 D:0 V:42.0\nvalue 2: R:0 D:0 V:1.0\n`,\n\t},\n}\n\nfunc TestWriter(t *testing.T) {\n\tif !hasParquetTools() {\n\t\tt.Skip(\"Skipping TestWriter writerTests because parquet-tools are not installed in Github CI. FIXME.\") // TODO\n\t}\n\n\tfor _, test := range writerTests {\n\t\tdataPageVersion := test.version\n\t\tcodec := test.codec\n\t\trows := test.rows\n\t\tdump := test.dump\n\n\t\tt.Run(test.scenario, func(t *testing.T) {\n\t\t\tt.Parallel()\n\n\t\t\tb, err := generateParquetFile(makeRows(rows),\n\t\t\t\tparquet.DataPageVersion(dataPageVersion),\n\t\t\t\tparquet.Compression(codec),\n\t\t\t)\n\t\t\tif err != nil {\n\t\t\t\tt.Logf(\"\\n%s\", string(b))\n\t\t\t\tt.Fatal(err)\n\t\t\t}\n\n\t\t\tif string(b) != dump {\n\t\t\t\tedits := myers.ComputeEdits(span.URIFromPath(\"want.txt\"), dump, string(b))\n\t\t\t\tdiff := fmt.Sprint(gotextdiff.ToUnified(\"want.txt\", \"got.txt\", dump, edits))\n\t\t\t\tt.Errorf(\"\\n%s\", diff)\n\t\t\t}\n\t\t})\n\t}\n}\n\nfunc hasParquetTools() bool {\n\t_, err := exec.LookPath(\"parquet-tools\")\n\treturn err == nil\n}\n\nfunc parquetTools(cmd, path string) ([]byte, error) {\n\tp := exec.Command(\"parquet-tools\", cmd, \"--debug\", \"--disable-crop\", path)\n\n\toutput, err := p.CombinedOutput()\n\tif err != nil {\n\t\treturn output, err\n\t}\n\n\t// parquet-tools has trailing spaces on some 
lines\n\tlines := bytes.Split(output, []byte(\"\\n\"))\n\n\tfor i, line := range lines {\n\t\tlines[i] = bytes.TrimRight(line, \" \")\n\t}\n\n\treturn bytes.Join(lines, []byte(\"\\n\")), nil\n}\n\nfunc TestWriterGenerateBloomFilters(t *testing.T) {\n\ttype Person struct {\n\t\tFirstName utf8string `parquet:\"first_name\"`\n\t\tLastName  utf8string `parquet:\"last_name\"`\n\t}\n\n\terr := quickCheck(func(rows []Person) bool {\n\t\tif len(rows) == 0 { // TODO: support writing files with no rows\n\t\t\treturn true\n\t\t}\n\n\t\tbuffer := new(bytes.Buffer)\n\t\twriter := parquet.NewWriter(buffer,\n\t\t\tparquet.BloomFilters(\n\t\t\t\tparquet.SplitBlockFilter(10, \"last_name\"),\n\t\t\t),\n\t\t)\n\t\tfor i := range rows {\n\t\t\tif err := writer.Write(&rows[i]); err != nil {\n\t\t\t\tt.Error(err)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\t\tif err := writer.Close(); err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\n\t\treader := bytes.NewReader(buffer.Bytes())\n\t\tf, err := parquet.OpenFile(reader, reader.Size())\n\t\tif err != nil {\n\t\t\tt.Error(err)\n\t\t\treturn false\n\t\t}\n\t\trowGroup := f.RowGroups()[0]\n\t\tcolumns := rowGroup.ColumnChunks()\n\t\tfirstName := columns[0]\n\t\tlastName := columns[1]\n\n\t\tif firstName.BloomFilter() != nil {\n\t\t\tt.Errorf(`\"first_name\" column has a bloom filter even though none were configured`)\n\t\t\treturn false\n\t\t}\n\n\t\tbloomFilter := lastName.BloomFilter()\n\t\tif bloomFilter == nil {\n\t\t\tt.Error(`\"last_name\" column has no bloom filter despite being configured to have one`)\n\t\t\treturn false\n\t\t}\n\n\t\tfor i, row := range rows {\n\t\t\tif ok, err := bloomFilter.Check(parquet.ValueOf(row.LastName)); err != nil {\n\t\t\t\tt.Errorf(\"unexpected error checking bloom filter: %v\", err)\n\t\t\t\treturn false\n\t\t\t} else if !ok {\n\t\t\t\tt.Errorf(\"bloom filter does not contain value %q of row %d\", row.LastName, i)\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\n\t\treturn true\n\t})\n\tif err != nil 
{\n\t\tt.Error(err)\n\t}\n}\n\nfunc TestBloomFilterForDict(t *testing.T) {\n\ttype testStruct struct {\n\t\tA string `parquet:\"a,dict\"`\n\t}\n\n\tschema := parquet.SchemaOf(&testStruct{})\n\n\tb := bytes.NewBuffer(nil)\n\tw := parquet.NewWriter(\n\t\tb,\n\t\tschema,\n\t\tparquet.BloomFilters(parquet.SplitBlockFilter(10, \"a\")),\n\t)\n\n\terr := w.Write(&testStruct{A: \"test\"})\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\terr = w.Close()\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tok, err := f.RowGroups()[0].ColumnChunks()[0].BloomFilter().Check(parquet.ValueOf(\"test\"))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif !ok {\n\t\tt.Error(\"bloom filter should have contained 'test'\")\n\t}\n}\n\nfunc TestWriterRepeatedUUIDDict(t *testing.T) {\n\tinputID := uuid.MustParse(\"123456ab-0000-0000-0000-000000000000\")\n\trecords := []struct {\n\t\tList []uuid.UUID `parquet:\"list,dict\"`\n\t}{{\n\t\t[]uuid.UUID{inputID},\n\t}}\n\tschema := parquet.SchemaOf(&records[0])\n\tb := bytes.NewBuffer(nil)\n\tw := parquet.NewWriter(b, schema)\n\tif err := w.Write(records[0]); err != nil {\n\t\tt.Fatal(err)\n\t}\n\tif err := w.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\trowbuf := make([]parquet.Row, 1)\n\trows := f.RowGroups()[0].Rows()\n\tdefer rows.Close()\n\tn, err := rows.ReadRows(rowbuf)\n\tif n == 0 {\n\t\tt.Fatalf(\"reading row from parquet file: %v\", err)\n\t}\n\tif len(rowbuf[0]) != 1 {\n\t\tt.Errorf(\"expected 1 value in row, got %d\", len(rowbuf[0]))\n\t}\n\tif !bytes.Equal(inputID[:], rowbuf[0][0].Bytes()) {\n\t\tt.Errorf(\"expected to get UUID %q back out, got %q\", inputID, rowbuf[0][0].Bytes())\n\t}\n}\n\nfunc TestWriterResetWithBloomFilters(t *testing.T) {\n\ttype Test struct {\n\t\tValue string 
`parquet:\"value,dict\"`\n\t}\n\n\twriter := parquet.NewWriter(new(bytes.Buffer),\n\t\tparquet.BloomFilters(\n\t\t\tparquet.SplitBlockFilter(10, \"value\"),\n\t\t),\n\t)\n\n\tif err := writer.Write(&Test{Value: \"foo\"}); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\twriter.Reset(new(bytes.Buffer))\n\n\tif err := writer.Write(&Test{Value: \"bar\"}); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n}\n\nfunc TestWriterMaxRowsPerRowGroup(t *testing.T) {\n\toutput := new(bytes.Buffer)\n\twriter := parquet.NewWriter(output, parquet.MaxRowsPerRowGroup(10))\n\n\tfor i := 0; i < 100; i++ {\n\t\terr := writer.Write(struct{ FirstName, LastName string }{\n\t\t\tFirstName: \"0123456789\"[i%10 : i%10+1],\n\t\t\tLastName:  \"foo\",\n\t\t})\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t}\n\t}\n\n\tif err := writer.Close(); err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\trowGroups := f.RowGroups()\n\tif len(rowGroups) != 10 {\n\t\tt.Errorf(\"wrong number of row groups in parquet file: want=10 got=%d\", len(rowGroups))\n\t}\n}\n\nfunc TestSetKeyValueMetadata(t *testing.T) {\n\ttestKey := \"test-key\"\n\ttestValue := \"test-value\"\n\n\ttype testStruct struct {\n\t\tA string `parquet:\"a,dict\"`\n\t}\n\n\tschema := parquet.SchemaOf(&testStruct{})\n\n\tb := bytes.NewBuffer(nil)\n\tw := parquet.NewWriter(\n\t\tb,\n\t\tschema,\n\t)\n\n\terr := w.Write(&testStruct{A: \"test\"})\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tw.SetKeyValueMetadata(testKey, testValue)\n\n\terr = w.Close()\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tvalue, ok := f.Lookup(testKey)\n\tif !ok {\n\t\tt.Fatalf(\"key/value metadata should have included %q\", 
testKey)\n\t}\n\tif value != testValue {\n\t\tt.Errorf(\"expected %q, got %q\", testValue, value)\n\t}\n}\n\nfunc TestSetKeyValueMetadataOverwritesExisting(t *testing.T) {\n\ttestKey := \"test-key\"\n\ttestValue := \"test-value\"\n\n\ttype testStruct struct {\n\t\tA string `parquet:\"a,dict\"`\n\t}\n\n\tschema := parquet.SchemaOf(&testStruct{})\n\n\tb := bytes.NewBuffer(nil)\n\tw := parquet.NewWriter(\n\t\tb,\n\t\tschema,\n\t\tparquet.KeyValueMetadata(testKey, \"original-value\"),\n\t)\n\n\terr := w.Write(&testStruct{A: \"test\"})\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tw.SetKeyValueMetadata(testKey, testValue)\n\n\terr = w.Close()\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tf, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\n\tvalue, ok := f.Lookup(testKey)\n\tif !ok {\n\t\tt.Fatalf(\"key/value metadata should have included %q\", testKey)\n\t}\n\tif value != testValue {\n\t\tt.Errorf(\"expected %q, got %q\", testValue, value)\n\t}\n}\n"
  }
]