Repository: google/riegeli
Branch: master
Commit: a0a8dac780d1
Files: 498
Total size: 5.0 MB
Directory structure:
gitextract_1atzokxc/
├── .bazelrc
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── MODULE.bazel
├── README.md
├── configure
├── doc/
│ ├── index.md
│ ├── record_writer_options.md
│ └── riegeli_records_file_format.md
├── python/
│ ├── BUILD
│ ├── MANIFEST.in
│ ├── README.md
│ ├── __init__.py
│ ├── build_pip_package.sh
│ ├── dummy_binary.py
│ ├── riegeli/
│ │ ├── BUILD
│ │ ├── BUILD.tpl
│ │ ├── __init__.py
│ │ ├── base/
│ │ │ ├── BUILD
│ │ │ ├── __init__.py
│ │ │ ├── riegeli_error.py
│ │ │ ├── utils.cc
│ │ │ └── utils.h
│ │ ├── bytes/
│ │ │ ├── BUILD
│ │ │ ├── python_reader.cc
│ │ │ ├── python_reader.h
│ │ │ ├── python_writer.cc
│ │ │ └── python_writer.h
│ │ ├── py_extension.bzl
│ │ ├── python_configure.bzl
│ │ ├── records/
│ │ │ ├── BUILD
│ │ │ ├── __init__.py
│ │ │ ├── examples/
│ │ │ │ ├── BUILD
│ │ │ │ ├── __init__.py
│ │ │ │ └── write_read_records.py
│ │ │ ├── record_position.cc
│ │ │ ├── record_position.h
│ │ │ ├── record_reader.cc
│ │ │ ├── record_writer.cc
│ │ │ ├── records_metadata.proto
│ │ │ ├── skipped_region.py
│ │ │ └── tests/
│ │ │ ├── BUILD
│ │ │ ├── __init__.py
│ │ │ ├── records_test.proto
│ │ │ └── records_test.py
│ │ └── tensorflow/
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── kernel_tests/
│ │ │ ├── __init__.py
│ │ │ └── riegeli_dataset_test.py
│ │ └── ops/
│ │ ├── __init__.py
│ │ └── riegeli_dataset_ops.py
│ └── setup.py
├── riegeli/
│ ├── .gitignore
│ ├── BUILD
│ ├── base/
│ │ ├── BUILD
│ │ ├── any.h
│ │ ├── any_initializer.h
│ │ ├── any_internal.h
│ │ ├── arithmetic.h
│ │ ├── assert.cc
│ │ ├── assert.h
│ │ ├── background_cleaning.cc
│ │ ├── background_cleaning.h
│ │ ├── binary_search.h
│ │ ├── buffer.cc
│ │ ├── buffer.h
│ │ ├── buffering.h
│ │ ├── byte_fill.cc
│ │ ├── byte_fill.h
│ │ ├── bytes_ref.h
│ │ ├── c_string_ref.h
│ │ ├── chain.cc
│ │ ├── chain.h
│ │ ├── chain_base.h
│ │ ├── chain_details.h
│ │ ├── closing_ptr.h
│ │ ├── compact_string.cc
│ │ ├── compact_string.h
│ │ ├── compare.h
│ │ ├── constexpr.h
│ │ ├── cord_iterator_span.cc
│ │ ├── cord_iterator_span.h
│ │ ├── cord_utils.cc
│ │ ├── cord_utils.h
│ │ ├── debug.cc
│ │ ├── debug.h
│ │ ├── dependency.h
│ │ ├── dependency_base.h
│ │ ├── dependency_manager.h
│ │ ├── errno_mapping.cc
│ │ ├── errno_mapping.h
│ │ ├── estimated_allocated_size.h
│ │ ├── external_data.cc
│ │ ├── external_data.h
│ │ ├── external_ref.h
│ │ ├── external_ref_base.h
│ │ ├── external_ref_support.h
│ │ ├── global.h
│ │ ├── hybrid_direct_common.h
│ │ ├── hybrid_direct_internal.h
│ │ ├── hybrid_direct_map.h
│ │ ├── hybrid_direct_set.h
│ │ ├── initializer.h
│ │ ├── initializer_internal.h
│ │ ├── intrusive_shared_ptr.h
│ │ ├── invoker.h
│ │ ├── iterable.h
│ │ ├── maker.h
│ │ ├── memory_estimator.cc
│ │ ├── memory_estimator.h
│ │ ├── moving_dependency.h
│ │ ├── new_aligned.h
│ │ ├── null_safe_memcpy.h
│ │ ├── object.cc
│ │ ├── object.h
│ │ ├── optional_compact_string.h
│ │ ├── options_parser.cc
│ │ ├── options_parser.h
│ │ ├── ownership.h
│ │ ├── parallelism.cc
│ │ ├── parallelism.h
│ │ ├── port.h
│ │ ├── recycling_pool.h
│ │ ├── ref_count.h
│ │ ├── reset.h
│ │ ├── shared_buffer.cc
│ │ ├── shared_buffer.h
│ │ ├── shared_ptr.h
│ │ ├── sized_shared_buffer.cc
│ │ ├── sized_shared_buffer.h
│ │ ├── stable_dependency.h
│ │ ├── status.cc
│ │ ├── status.h
│ │ ├── stream_utils.cc
│ │ ├── stream_utils.h
│ │ ├── string_ref.h
│ │ ├── string_utils.cc
│ │ ├── string_utils.h
│ │ ├── temporary_storage.h
│ │ ├── type_erased_ref.h
│ │ ├── type_id.h
│ │ ├── type_traits.h
│ │ ├── types.h
│ │ ├── unicode.cc
│ │ ├── unicode.h
│ │ └── uninitialized_vector.h
│ ├── brotli/
│ │ ├── BUILD
│ │ ├── brotli_allocator.cc
│ │ ├── brotli_allocator.h
│ │ ├── brotli_dictionary.cc
│ │ ├── brotli_dictionary.h
│ │ ├── brotli_reader.cc
│ │ ├── brotli_reader.h
│ │ ├── brotli_writer.cc
│ │ └── brotli_writer.h
│ ├── bytes/
│ │ ├── BUILD
│ │ ├── array_backward_writer.cc
│ │ ├── array_backward_writer.h
│ │ ├── array_writer.cc
│ │ ├── array_writer.h
│ │ ├── backward_writer.cc
│ │ ├── backward_writer.h
│ │ ├── buffer_options.cc
│ │ ├── buffer_options.h
│ │ ├── buffered_reader.cc
│ │ ├── buffered_reader.h
│ │ ├── buffered_writer.cc
│ │ ├── buffered_writer.h
│ │ ├── cfile_handle.cc
│ │ ├── cfile_handle.h
│ │ ├── cfile_internal.cc
│ │ ├── cfile_internal.h
│ │ ├── cfile_internal_for_cc.h
│ │ ├── cfile_reader.cc
│ │ ├── cfile_reader.h
│ │ ├── cfile_writer.cc
│ │ ├── cfile_writer.h
│ │ ├── chain_backward_writer.cc
│ │ ├── chain_backward_writer.h
│ │ ├── chain_reader.cc
│ │ ├── chain_reader.h
│ │ ├── chain_writer.cc
│ │ ├── chain_writer.h
│ │ ├── compact_string_writer.h
│ │ ├── copy_all.cc
│ │ ├── copy_all.h
│ │ ├── cord_backward_writer.cc
│ │ ├── cord_backward_writer.h
│ │ ├── cord_reader.cc
│ │ ├── cord_reader.h
│ │ ├── cord_writer.cc
│ │ ├── cord_writer.h
│ │ ├── fd_handle.cc
│ │ ├── fd_handle.h
│ │ ├── fd_internal.cc
│ │ ├── fd_internal.h
│ │ ├── fd_internal_for_cc.h
│ │ ├── fd_mmap_reader.cc
│ │ ├── fd_mmap_reader.h
│ │ ├── fd_reader.cc
│ │ ├── fd_reader.h
│ │ ├── fd_writer.cc
│ │ ├── fd_writer.h
│ │ ├── file_mode_string.cc
│ │ ├── file_mode_string.h
│ │ ├── iostream_internal.h
│ │ ├── istream_reader.cc
│ │ ├── istream_reader.h
│ │ ├── joining_reader.cc
│ │ ├── joining_reader.h
│ │ ├── limiting_backward_writer.cc
│ │ ├── limiting_backward_writer.h
│ │ ├── limiting_reader.cc
│ │ ├── limiting_reader.h
│ │ ├── limiting_writer.cc
│ │ ├── limiting_writer.h
│ │ ├── null_backward_writer.cc
│ │ ├── null_backward_writer.h
│ │ ├── null_writer.cc
│ │ ├── null_writer.h
│ │ ├── ostream_writer.cc
│ │ ├── ostream_writer.h
│ │ ├── path_ref.h
│ │ ├── position_shifting_backward_writer.cc
│ │ ├── position_shifting_backward_writer.h
│ │ ├── position_shifting_reader.cc
│ │ ├── position_shifting_reader.h
│ │ ├── position_shifting_writer.cc
│ │ ├── position_shifting_writer.h
│ │ ├── prefix_limiting_backward_writer.cc
│ │ ├── prefix_limiting_backward_writer.h
│ │ ├── prefix_limiting_reader.cc
│ │ ├── prefix_limiting_reader.h
│ │ ├── prefix_limiting_writer.cc
│ │ ├── prefix_limiting_writer.h
│ │ ├── pullable_reader.cc
│ │ ├── pullable_reader.h
│ │ ├── pushable_backward_writer.cc
│ │ ├── pushable_backward_writer.h
│ │ ├── pushable_writer.cc
│ │ ├── pushable_writer.h
│ │ ├── read_all.cc
│ │ ├── read_all.h
│ │ ├── reader.cc
│ │ ├── reader.h
│ │ ├── reader_cfile.cc
│ │ ├── reader_cfile.h
│ │ ├── reader_factory.cc
│ │ ├── reader_factory.h
│ │ ├── reader_istream.cc
│ │ ├── reader_istream.h
│ │ ├── resizable_writer.cc
│ │ ├── resizable_writer.h
│ │ ├── restricted_chain_writer.cc
│ │ ├── restricted_chain_writer.h
│ │ ├── splitting_writer.cc
│ │ ├── splitting_writer.h
│ │ ├── std_io.cc
│ │ ├── std_io.h
│ │ ├── string_reader.cc
│ │ ├── string_reader.h
│ │ ├── string_writer.cc
│ │ ├── string_writer.h
│ │ ├── stringify.h
│ │ ├── stringify_writer.h
│ │ ├── vector_writer.h
│ │ ├── wrapping_backward_writer.cc
│ │ ├── wrapping_backward_writer.h
│ │ ├── wrapping_reader.cc
│ │ ├── wrapping_reader.h
│ │ ├── wrapping_writer.cc
│ │ ├── wrapping_writer.h
│ │ ├── write.h
│ │ ├── write_int_internal.cc
│ │ ├── write_int_internal.h
│ │ ├── writer.cc
│ │ ├── writer.h
│ │ ├── writer_cfile.cc
│ │ ├── writer_cfile.h
│ │ ├── writer_ostream.cc
│ │ └── writer_ostream.h
│ ├── bzip2/
│ │ ├── BUILD
│ │ ├── bzip2_error.cc
│ │ ├── bzip2_error.h
│ │ ├── bzip2_reader.cc
│ │ ├── bzip2_reader.h
│ │ ├── bzip2_writer.cc
│ │ └── bzip2_writer.h
│ ├── chunk_encoding/
│ │ ├── BUILD
│ │ ├── README.md
│ │ ├── brotli_encoder_selection.cc
│ │ ├── brotli_encoder_selection.h
│ │ ├── chunk.cc
│ │ ├── chunk.h
│ │ ├── chunk_decoder.cc
│ │ ├── chunk_decoder.h
│ │ ├── chunk_encoder.cc
│ │ ├── chunk_encoder.h
│ │ ├── compressor.cc
│ │ ├── compressor.h
│ │ ├── compressor_options.cc
│ │ ├── compressor_options.h
│ │ ├── constants.h
│ │ ├── decompressor.cc
│ │ ├── decompressor.h
│ │ ├── deferred_encoder.cc
│ │ ├── deferred_encoder.h
│ │ ├── field_projection.h
│ │ ├── hash.cc
│ │ ├── hash.h
│ │ ├── simple_decoder.cc
│ │ ├── simple_decoder.h
│ │ ├── simple_encoder.cc
│ │ ├── simple_encoder.h
│ │ ├── transpose_decoder.cc
│ │ ├── transpose_decoder.h
│ │ ├── transpose_encoder.cc
│ │ ├── transpose_encoder.h
│ │ └── transpose_internal.h
│ ├── containers/
│ │ ├── BUILD
│ │ ├── chunked_sorted_string_set.cc
│ │ ├── chunked_sorted_string_set.h
│ │ ├── linear_sorted_string_set.cc
│ │ └── linear_sorted_string_set.h
│ ├── csv/
│ │ ├── BUILD
│ │ ├── csv_reader.cc
│ │ ├── csv_reader.h
│ │ ├── csv_record.cc
│ │ ├── csv_record.h
│ │ ├── csv_writer.cc
│ │ └── csv_writer.h
│ ├── digests/
│ │ ├── BUILD
│ │ ├── adler32_digester.cc
│ │ ├── adler32_digester.h
│ │ ├── crc32_digester.cc
│ │ ├── crc32_digester.h
│ │ ├── crc32c_digester.h
│ │ ├── digest_converter.h
│ │ ├── digester_handle.cc
│ │ ├── digester_handle.h
│ │ ├── digesting_reader.cc
│ │ ├── digesting_reader.h
│ │ ├── digesting_writer.cc
│ │ ├── digesting_writer.h
│ │ ├── highwayhash_digester.cc
│ │ ├── highwayhash_digester.h
│ │ ├── md5_digester.h
│ │ ├── openssl_digester.h
│ │ ├── sha1_digester.h
│ │ ├── sha256_digester.h
│ │ ├── sha512_256_digester.h
│ │ ├── sha512_digester.h
│ │ └── wrapping_digester.h
│ ├── endian/
│ │ ├── BUILD
│ │ ├── endian_reading.h
│ │ └── endian_writing.h
│ ├── gcs/
│ │ ├── BUILD
│ │ ├── gcs_internal.h
│ │ ├── gcs_object.cc
│ │ ├── gcs_object.h
│ │ ├── gcs_reader.cc
│ │ ├── gcs_reader.h
│ │ ├── gcs_writer.cc
│ │ └── gcs_writer.h
│ ├── lines/
│ │ ├── BUILD
│ │ ├── line_reading.cc
│ │ ├── line_reading.h
│ │ ├── line_writing.h
│ │ ├── newline.h
│ │ ├── text_reader.cc
│ │ ├── text_reader.h
│ │ ├── text_writer.cc
│ │ └── text_writer.h
│ ├── lz4/
│ │ ├── BUILD
│ │ ├── lz4_dictionary.cc
│ │ ├── lz4_dictionary.h
│ │ ├── lz4_reader.cc
│ │ ├── lz4_reader.h
│ │ ├── lz4_writer.cc
│ │ └── lz4_writer.h
│ ├── messages/
│ │ ├── BUILD
│ │ ├── context_projection.h
│ │ ├── dynamic_field_handler.h
│ │ ├── field_copier.h
│ │ ├── field_handler_map.h
│ │ ├── field_handlers.cc
│ │ ├── field_handlers.h
│ │ ├── map_entry_field.h
│ │ ├── message_wire_format.h
│ │ ├── parse_message.cc
│ │ ├── parse_message.h
│ │ ├── serialize_message.cc
│ │ ├── serialize_message.h
│ │ ├── serialized_message_assembler.cc
│ │ ├── serialized_message_assembler.h
│ │ ├── serialized_message_backward_writer.cc
│ │ ├── serialized_message_backward_writer.h
│ │ ├── serialized_message_internal.h
│ │ ├── serialized_message_reader.cc
│ │ ├── serialized_message_reader.h
│ │ ├── serialized_message_reader_internal.h
│ │ ├── serialized_message_writer.cc
│ │ ├── serialized_message_writer.h
│ │ ├── text_parse_message.cc
│ │ ├── text_parse_message.h
│ │ ├── text_print_message.cc
│ │ └── text_print_message.h
│ ├── ordered_varint/
│ │ ├── BUILD
│ │ ├── ordered_varint_internal.h
│ │ ├── ordered_varint_reading.cc
│ │ ├── ordered_varint_reading.h
│ │ ├── ordered_varint_writing.cc
│ │ └── ordered_varint_writing.h
│ ├── records/
│ │ ├── BUILD
│ │ ├── README.md
│ │ ├── block.h
│ │ ├── chunk_reader.cc
│ │ ├── chunk_reader.h
│ │ ├── chunk_writer.cc
│ │ ├── chunk_writer.h
│ │ ├── record_position.cc
│ │ ├── record_position.h
│ │ ├── record_reader.cc
│ │ ├── record_reader.h
│ │ ├── record_writer.cc
│ │ ├── record_writer.h
│ │ ├── records_metadata.proto
│ │ ├── skipped_region.cc
│ │ ├── skipped_region.h
│ │ └── tools/
│ │ ├── BUILD
│ │ ├── describe_riegeli_file.cc
│ │ ├── records_benchmark.cc
│ │ ├── riegeli_summary.proto
│ │ ├── tfrecord_recognizer.cc
│ │ └── tfrecord_recognizer.h
│ ├── snappy/
│ │ ├── BUILD
│ │ ├── framed/
│ │ │ ├── BUILD
│ │ │ ├── framed_snappy_reader.cc
│ │ │ ├── framed_snappy_reader.h
│ │ │ ├── framed_snappy_writer.cc
│ │ │ └── framed_snappy_writer.h
│ │ ├── hadoop/
│ │ │ ├── BUILD
│ │ │ ├── hadoop_snappy_reader.cc
│ │ │ ├── hadoop_snappy_reader.h
│ │ │ ├── hadoop_snappy_writer.cc
│ │ │ └── hadoop_snappy_writer.h
│ │ ├── snappy_reader.cc
│ │ ├── snappy_reader.h
│ │ ├── snappy_streams.cc
│ │ ├── snappy_streams.h
│ │ ├── snappy_writer.cc
│ │ └── snappy_writer.h
│ ├── tensorflow/
│ │ ├── BUILD
│ │ ├── io/
│ │ │ ├── BUILD
│ │ │ ├── file_reader.cc
│ │ │ ├── file_reader.h
│ │ │ ├── file_writer.cc
│ │ │ ├── file_writer.h
│ │ │ └── tstring_writer.h
│ │ ├── kernels/
│ │ │ └── riegeli_dataset_ops.cc
│ │ └── ops/
│ │ └── riegeli_dataset_ops.cc
│ ├── text/
│ │ ├── BUILD
│ │ ├── ascii_align.h
│ │ ├── concat.h
│ │ ├── join.h
│ │ ├── write_int.cc
│ │ └── write_int.h
│ ├── varint/
│ │ ├── BUILD
│ │ ├── varint_internal.h
│ │ ├── varint_reading.cc
│ │ ├── varint_reading.h
│ │ └── varint_writing.h
│ ├── xz/
│ │ ├── BUILD
│ │ ├── xz_error.cc
│ │ ├── xz_error.h
│ │ ├── xz_reader.cc
│ │ ├── xz_reader.h
│ │ ├── xz_writer.cc
│ │ └── xz_writer.h
│ ├── zlib/
│ │ ├── BUILD
│ │ ├── zlib_dictionary.h
│ │ ├── zlib_error.cc
│ │ ├── zlib_error.h
│ │ ├── zlib_reader.cc
│ │ ├── zlib_reader.h
│ │ ├── zlib_writer.cc
│ │ └── zlib_writer.h
│ └── zstd/
│ ├── BUILD
│ ├── zstd_dictionary.cc
│ ├── zstd_dictionary.h
│ ├── zstd_reader.cc
│ ├── zstd_reader.h
│ ├── zstd_writer.cc
│ └── zstd_writer.h
└── tf_dependency/
├── BUILD
├── BUILD.tpl
└── tf_configure.bzl
================================================
FILE CONTENTS
================================================
================================================
FILE: .bazelrc
================================================
# Enable Bzlmod by default.
common --enable_bzlmod
# Use C++17.
build --cxxopt=-std=c++17
build --host_cxxopt=-std=c++17
# Make Python protos faster by backing them with C++ protos.
# TODO: Reenable once protobuf releases
# https://github.com/protocolbuffers/protobuf/pull/22633
# i.e. in version > 32.0. Or possibly switch to upb.
# build --define=use_fast_cpp_protos=true
# Options from ./configure
# This is currently disabled because TensorFlow does not support bzlmod,
# hence Riegeli/TensorFlow bindings are broken anyway.
# import %workspace%/configure.bazelrc
================================================
FILE: CONTRIBUTING.md
================================================
# How to Contribute
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution,
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include .bazelrc
include *.md
include LICENSE
include MANIFEST.in
include WORKSPACE
include configure
recursive-include doc *
recursive-include python *
recursive-include riegeli *
recursive-include third_party *
================================================
FILE: MODULE.bazel
================================================
module(
name = "riegeli",
repo_name = "com_google_riegeli",
)
# Direct dependencies, resolved from the Bazel Central Registry. The repo_name
# attributes preserve the legacy WORKSPACE-era repository names that BUILD
# files in this project reference (e.g. @com_google_absl).
bazel_dep(
name = "abseil-cpp",
version = "20260107.0",
repo_name = "com_google_absl",
)
bazel_dep(
name = "abseil-py",
version = "2.1.0",
repo_name = "absl_py",
)
bazel_dep(
name = "bazel_skylib",
version = "1.7.1",
)
bazel_dep(
name = "boringssl",
version = "0.0.0-20240530-2db0eb3",
)
bazel_dep(
name = "brotli",
version = "1.1.0",
repo_name = "org_brotli",
)
bazel_dep(
name = "bzip2",
version = "1.0.8",
)
bazel_dep(
name = "highwayhash",
version = "0.0.0-20240305-5ad3bf8.bcr.1",
)
bazel_dep(
name = "lz4",
version = "1.9.4",
)
bazel_dep(
name = "platforms",
version = "0.0.9",
)
bazel_dep(
name = "protobuf",
version = "33.2",
repo_name = "com_google_protobuf",
)
bazel_dep(
name = "rules_cc",
version = "0.1.2",
)
bazel_dep(
name = "rules_python",
version = "0.36.0",
)
bazel_dep(
name = "snappy",
version = "1.2.0",
)
bazel_dep(
name = "xz",
version = "5.4.5.bcr.1",
)
bazel_dep(
name = "zlib",
version = "1.3.1.bcr.3",
)
bazel_dep(
name = "zstd",
version = "1.5.6",
repo_name = "net_zstd",
)
bazel_dep(
name = "google_cloud_cpp",
version = "3.0.0-rc1",
)
# Configure hermetic Python toolchain
SUPPORTED_PYTHON_VERSIONS = [
"3.8",
"3.9",
"3.10",
"3.11",
"3.12",
]
# The newest supported version acts as the default toolchain.
DEFAULT_PYTHON_VERSION = SUPPORTED_PYTHON_VERSIONS[-1]
python = use_extension("@rules_python//python/extensions:python.bzl", "python")
# Register one hermetic toolchain per supported Python version; only the
# DEFAULT_PYTHON_VERSION toolchain is marked as the default.
[
python.toolchain(
is_default = version == DEFAULT_PYTHON_VERSION,
python_version = version,
)
for version in SUPPORTED_PYTHON_VERSIONS
]
================================================
FILE: README.md
================================================
# Riegeli
*Riegeli/records* is a file format for storing a sequence of string records,
typically serialized protocol buffers. It supports dense compression, fast
decoding, seeking, detection and optional skipping of data corruption, filtering
of proto message fields for even faster decoding, and parallel encoding.
See [documentation](https://github.com/google/riegeli/blob/master/doc/index.md).
# Status
Riegeli file format will only change in a backward compatible way (i.e. future
readers will understand current files, but current readers might not understand
files using future features).
Riegeli C++ API might change in incompatible ways.
================================================
FILE: configure
================================================
#!/bin/bash
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Probes for an installed TensorFlow and writes configure.bazelrc with the
# environment variables needed to build the Riegeli/TensorFlow bindings.
set -e
# Interpreter used for the probe; empty if `python` is not on PATH.
PYTHON_BIN_PATH=`which python`
if [[ $PYTHON_BIN_PATH ]] && $PYTHON_BIN_PATH -c "import tensorflow" &>/dev/null; then
# TensorFlow is importable: query its compile and link flags, which look like
# "-I/path/to/include ..." and "-L/path/to/lib -l:libtensorflow_framework.so.N".
TF_CFLAGS=$($PYTHON_BIN_PATH -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))')
TF_LFLAGS=$($PYTHON_BIN_PATH -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))')
# First word of TF_CFLAGS, with its "-I" prefix stripped: the header directory.
TF_HEADER_DIR=${TF_CFLAGS%% *}
TF_HEADER_DIR=${TF_HEADER_DIR#-I}
# First word of TF_LFLAGS, with its "-L" prefix stripped: the library directory.
TF_SHARED_LIBRARY_DIR=${TF_LFLAGS%% *}
TF_SHARED_LIBRARY_DIR=${TF_SHARED_LIBRARY_DIR#-L}
# Everything after the last " -l:" in TF_LFLAGS: the shared library file name.
TF_SHARED_LIBRARY_NAME=${TF_LFLAGS##* -l:}
else
# TensorFlow is unavailable; emit empty values so the TensorFlow-dependent
# targets are configured without it.
TF_HEADER_DIR=
TF_SHARED_LIBRARY_DIR=
TF_SHARED_LIBRARY_NAME=
fi
# Record the discovered settings as Bazel --action_env options.
{
printf 'build --action_env PYTHON_BIN_PATH="%s"\n' "$PYTHON_BIN_PATH"
printf 'build --action_env TF_HEADER_DIR="%s"\n' "$TF_HEADER_DIR"
printf 'build --action_env TF_SHARED_LIBRARY_DIR="%s"\n' "$TF_SHARED_LIBRARY_DIR"
printf 'build --action_env TF_SHARED_LIBRARY_NAME="%s"\n' "$TF_SHARED_LIBRARY_NAME"
} >configure.bazelrc
echo "Set up configure.bazelrc. Make sure to include it in your .bazelrc file."
================================================
FILE: doc/index.md
================================================
# Riegeli
*Riegeli/records* is a file format for storing a sequence of string records,
typically serialized protocol buffers. It supports dense compression, fast
decoding, seeking, detection and optional skipping of data corruption, filtering
of proto message fields for even faster decoding, and parallel encoding.
* [Specification of Riegeli/records file format](riegeli_records_file_format.md).
* [Specifying options for writing Riegeli/records files](record_writer_options.md).
================================================
FILE: doc/record_writer_options.md
================================================
# Specifying options for writing Riegeli/records files
Options for writing Riegeli/records files can be specified as a string:
```data
options ::= option? ("," option?)*
option ::=
"default" |
"transpose" (":" ("true" | "false"))? |
"uncompressed" |
"brotli" (":" brotli_level)? |
"zstd" (":" zstd_level)? |
"snappy" (":" snappy_level)? |
"window_log" ":" window_log |
"brotli_encoder" ":" ("rbrotli_or_cbrotli" | "cbrotli" | "rbrotli") |
"chunk_size" ":" chunk_size |
"bucket_fraction" ":" bucket_fraction |
"padding" (":" padding)? |
"initial_padding" (":" padding)? |
"final_padding" (":" padding)? |
"parallelism" ":" parallelism
brotli_level ::= integer in the range [0..11] (default 6)
zstd_level ::= integer in the range [-131072..22] (default 3)
snappy_level ::= integer in the range [1..2] (default 1)
window_log ::= "auto" or integer in the range [10..31]
chunk_size ::= "auto" or positive integer expressed as real with optional
suffix [BkKMGTPE]
bucket_fraction ::= real in the range [0..1]
padding ::= positive integer expressed as real with optional suffix [BkKMGTPE]
(default 64K)
parallelism ::= non-negative integer
```
An empty string is the same as `default`.
## `transpose`
If `true` (`transpose` is the same as `transpose:true`), records should be
serialized proto messages (but nothing will break if they are not). A chunk of
records will be processed in a way which allows for better compression.
If `false`, a chunk of records will be stored in a simpler format, directly or
with compression.
Default: `false`.
## Compression algorithms
### `uncompressed`
Changes compression algorithm to Uncompressed (turns compression off).
### `brotli`
Changes compression algorithm to [Brotli](https://github.com/google/brotli).
Sets compression level which tunes the tradeoff between compression density and
compression speed (higher = better density but slower).
`brotli_level` must be between 0 and 11. Default: `6`.
This is the default compression algorithm.
### `zstd`
Changes compression algorithm to [Zstd](https://facebook.github.io/zstd/). Sets
compression level which tunes the tradeoff between compression density and
compression speed (higher = better density but slower).
`zstd_level` must be between -131072 and 22. Level 0 is currently equivalent to
3. Default: 3.
### `snappy`
Changes compression algorithm to [Snappy](https://google.github.io/snappy/).
`snappy_level` must be between 1 and 2. Default: 1.
## `window_log`
Logarithm of the LZ77 sliding window size. This tunes the tradeoff between
compression density and memory usage (higher = better density but more memory).
Special value `auto` means to keep the default (`brotli`: 22, `zstd`: derived
from compression level and chunk size).
For `uncompressed` and `snappy`, `window_log` must be `auto`. For `brotli`,
`window_log` must be `auto` or between 10 and 30. For `zstd`, `window_log` must
be `auto` or between 10 and 30 in 32-bit build, 31 in 64-bit build.
Default: `auto`.
## `chunk_size`
Sets the desired uncompressed size of a chunk which groups messages to be
transposed, compressed, and written together.
A larger chunk size improves compression density; a smaller chunk size allows to
read pieces of the file independently with finer granularity, and reduces memory
usage of both writer and reader.
Special value `auto` means to keep the default (compressed: 1M, uncompressed:
4k).
Default: `auto`.
## `bucket_fraction`
Sets the desired uncompressed size of a bucket which groups values of several
fields of the given wire type to be compressed together, relative to the desired
chunk size, on the scale between 0.0 (compress each field separately) and 1.0
(put all fields of the same wire type in the same bucket).
This is meaningful if transpose and compression are enabled. A larger bucket
size improves compression density; a smaller bucket size makes reading with
projection faster, allowing to skip decompression of values of fields which are
not included.
Default: 1.0.
## `padding`
If `padding > 1`, padding is written at the beginning, when flushing, and at the
end of the file, for the absolute position to reach a multiple of `padding`.
Consequences if `padding` is a multiple of 64KB:
1. Physical concatenation of separately written files yields a valid file
(setting metadata in subsequent files is wasteful but harmless).
2. Even if the existing file was corrupted or truncated, data appended to it
will be recoverable.
The cost is that up to `padding` bytes is wasted when padding is written.
`padding` is a shortcut for `set_initial_padding` with `set_final_padding`.
`padding` without the parameter assumes 64KB.
Default: 1 (no padding).
## `initial_padding`
If `initial_padding > 1`, padding is written at the beginning of the file, for
the absolute position to reach a multiple of `initial_padding`.
See `padding` for details.
`initial_padding` without the parameter assumes 64KB.
Default: 1 (no padding).
## `final_padding`
If `final_padding > 1`, padding is written when flushing and at the end of the
file, for the absolute position to reach a multiple of `final_padding`.
See `padding` for details.
`final_padding` without the parameter assumes 64KB.
Default: 1 (no padding).
## `parallelism`
Sets the maximum number of chunks being encoded in parallel in background.
Larger parallelism can increase throughput, up to a point where it no longer
matters; smaller parallelism reduces memory usage.
If `parallelism > 0`, chunks are written in background and reporting writing
errors is delayed.
Default: 0.
================================================
FILE: doc/riegeli_records_file_format.md
================================================
# Riegeli/records file format specification
## Summary
File contents are interpreted as a sequence of variable-sized *chunks,* where a
chunk encodes some number of *records.* A record can be any byte sequence but
Riegeli has special support for the common case where it is a serialized proto
message.
In order to support seeking and recovery after data corruption, the sequence of
chunks is interrupted by a *block header* at every multiple of the block size
which is 64 KiB. After the block header the interrupted chunk continues.
A record can be identified by the position of the chunk beginning and the index
of the record within the chunk. A record can also be identified by a number
resembling a file position, defined as the sum of the chunk beginning and the
record index.
## Conventions
Numbers in block headers and chunk headers are encoded as unsigned Little-Endian
integers.
Hashes are 64-bit [HighwayHash](https://github.com/google/highwayhash) values
with the key {0x2f696c6567656952, 0x0a7364726f636572, 0x2f696c6567656952,
0x0a7364726f636572} ('Riegeli/', 'records\n', 'Riegeli/', 'records\n').
## Block header
A block header allows to locate the chunk that the block header interrupts.
Block headers can interrupt a chunk at arbitrary points, including in the middle
of the chunk header.
If a block header lies exactly between chunks, it is considered to interrupt the
next chunk; this includes the situation at the beginning of the file. In this
case the chunk formally begins at the beginning of the block, even though it
contains no bytes before the block header.
* Block header (24 bytes):
* `header_hash` (8 bytes) — hash of the rest of the header
(`previous_chunk` and `next_chunk`)
* `previous_chunk` (8 bytes) — distance from the beginning of the chunk
interrupted by this block header to the beginning of the block
* `next_chunk` (8 bytes) — distance from the beginning of the block to the
end of the chunk interrupted by this block header
If `header_hash` does not match, then this block header is corrupted and must be
ignored. Block headers can be skipped during sequential file reading, they are
useful only for seeking and for error recovery.
## Chunk
A chunk must not begin inside nor immediately after a block header.
* Chunk header (40 bytes):
* `header_hash` (8 bytes) — hash of the rest of the header (`data_size` up
to and including `decoded_data_size`)
* `data_size` (8 bytes) — size of `data` (excluding intervening block
headers)
* `data_hash` (8 bytes) — hash of `data`
* `chunk_type` (1 byte) — determines how to interpret `data`
* `num_records` (7 bytes) — number of records after decoding
* `decoded_data_size` (8 bytes) — sum of record sizes after decoding
* `data` (`data_size` bytes) — encoded records or other data
* `padding` — ignored (usually filled with zeros by the encoder)
If `header_hash` does not match, header contents cannot be trusted; if skipping
over corruption is desired, a valid chunk should be located using block headers.
If `data_hash` does not match, `data` is corrupted; if skipping over corruption
is desired, the chunk must be ignored.
The size of `padding` is the minimum size which satisfies the following
constraints:
* The chunk (including chunk header, `data`, `padding`, and intervening block
headers) has at least as many bytes as `num_records`.
* The chunk does not end inside nor immediately after a block header.
If `num_records` is 0, `decoded_data_size` has a meaning depending on the chunk
type.
*Rationale:*
*The presence of `padding` allows to assign unique numbers resembling file
positions to records.*
*`decoded_data_size` is stored in the chunk header, instead of being implied by
or stored in `data`, to help decoders decide how many chunks to potentially read
ahead.*
## Chunk data
Some parts of chunk data are compressed. The compression format is generally
specified as `compression_type` (byte):
* 0 — none
* 0x62 ('b') — [Brotli](https://github.com/google/brotli)
* 0x7a ('z') — [Zstd](https://facebook.github.io/zstd/)
* 0x73 ('s') — [Snappy](https://google.github.io/snappy/)
Any compressed block is prefixed with its decompressed size (varint64) unless
`compression_type` is 0.
*Rationale:*
*Knowing the decompressed size can make it easier for the decoder to decompress
data into a preallocated array.*
### File signature
`chunk_type` is 0x73 ('s').
A file signature chunk must be present at the beginning of the file. It may also
be present elsewhere, in which case it encodes no records and is ignored.
`data_size`, `num_records`, and `decoded_data_size` must be 0.
This makes the first 64 bytes of a Riegeli/records file fixed:
```data
83 af 70 d1 0d 88 4a 3f 00 00 00 00 00 00 00 00
40 00 00 00 00 00 00 00 91 ba c2 3c 92 87 e1 a9
00 00 00 00 00 00 00 00 e1 9f 13 c0 e9 b1 c3 72
73 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
```
### File metadata
`chunk_type` is 0x6d ('m').
A file metadata chunk provides information describing the records. Metadata are
not necessary to read the records but might be helpful to interpret their
contents.
If present, metadata should be written immediately after file signature.
The chunk is encoded like a transposed chunk with a single record containing a
serialized `RecordsMetadata` proto message, except that `chunk_type` is
different and `num_records` is 0.
### Padding chunk
`chunk_type` is 0x70 ('p').
A padding chunk encodes no records and only occupies file space.
`num_records` and `decoded_data_size` must be 0. `data` is ignored (usually
filled with zeros by the encoder).
This can be used for more efficient file concatenation (bringing the file offset
modulo `kBlockSize` to 0 allows for physical concatenation of files without
examining their contents), or for syncing to a file system which requires a
particular file offset granularity in order for the sync to be effective.
### Simple chunk with records
`chunk_type` is 0x72 ('r').
Simple chunks store record sizes and concatenated record contents in two
buffers, possibly compressed.
The format:
* `compression_type` (byte) — compression type for sizes and values
* `compressed_sizes_size` (varint64) — size of `compressed_sizes`
* `compressed_sizes` (`compressed_sizes_size` bytes) — compressed buffer with
record sizes
* `compressed_values` (the rest of `data`) — compressed buffer with record
values
`compressed_sizes`, after decompression, contains `num_records` varint64s: the
size of each record.
`compressed_values`, after decompression, contains `decoded_data_size` bytes:
concatenation of record values.
### Transposed chunk with records
`chunk_type` is 0x74 ('t').
TODO: Document this.
## Properties of the file format
* Data corruption anywhere is detected whenever the hash allows this, and it
causes only a local data loss of up to a chunk (if chunk data are damaged)
or block (if chunk header is damaged).
* It is possible to open for append and write more records, even without
reading the original file contents; the original file size must be taken
into account though.
* Seeking to the chunk closest to the given file position requires a seek +
small read, then iterating through chunk headers in a block.
## Implementation notes
The following formulas clarify how certain field values and positions can be
computed.
Constants for fixed sizes:
```c++
kBlockSize = 1 << 16;
kBlockHeaderSize = 24;
kUsableBlockSize = kBlockSize - kBlockHeaderSize;
kChunkHeaderSize = 40;
```
Constraints for chunk boundary distances in a block header:
```c++
previous_chunk % kBlockSize < kUsableBlockSize &&
next_chunk > 0 &&
(next_chunk - 1) % kBlockSize >= kBlockHeaderSize
```
End position of a chunk which begins at `chunk_begin`:
```c++
NumOverheadBlocks(pos, size) =
(size + (pos + kUsableBlockSize - 1) % kBlockSize) / kUsableBlockSize;
AddWithOverhead(pos, size) =
pos + size + NumOverheadBlocks(pos, size) * kBlockHeaderSize;
// Equivalent implementation using unsigned arithmetic modulo 1 << 64:
// RemainingInBlock(pos) = (-pos) % kBlockSize;
RemainingInBlock(pos) = kBlockSize - 1 - (pos + kBlockSize - 1) % kBlockSize;
SaturatingSub(a, b) = a > b ? a - b : 0;
// 0 -> 0, 1..25 -> 25, 26 -> 26, ..., 64K -> 64K, 64K+1..64K+25 -> 64K+25 etc.
RoundUpToPossibleChunkBoundary(pos) =
pos + SaturatingSub(RemainingInBlock(pos), kUsableBlockSize - 1);
chunk_end = max(AddWithOverhead(chunk_begin, kChunkHeaderSize + data_size),
RoundUpToPossibleChunkBoundary(chunk_begin + num_records));
```
Fields of a block header at `block_begin` which interrupts a chunk at
`chunk_begin`:
```c++
prev_chunk = block_begin - chunk_begin;
next_chunk = chunk_end - block_begin;
```
================================================
FILE: python/BUILD
================================================
load("@rules_python//python:defs.bzl", "py_binary")
package(default_visibility = ["//visibility:private"])
licenses(["notice"])
# These dependencies are gathered in a py_binary, instead of directly in
# sh_binary data, so that bazel links __init__.py files to runfiles.
py_binary(
name = "dummy_binary",
srcs = ["dummy_binary.py"],
srcs_version = "PY3",
deps = [
"//python/riegeli",
"//python/riegeli/tensorflow:riegeli_dataset_ops",
],
)
# Entry point for building the riegeli pip package. The data files are made
# available in the script's runfiles, where build_pip_package.sh runs setup.py.
sh_binary(
    name = "build_pip_package",
    srcs = ["build_pip_package.sh"],
    data = [
        "MANIFEST.in",
        "README.md",
        "setup.py",
        ":dummy_binary",
    ],
)
================================================
FILE: python/MANIFEST.in
================================================
recursive-include riegeli *.py
================================================
FILE: python/README.md
================================================
# Riegeli
*Riegeli/records* is a file format for storing a sequence of string records,
typically serialized protocol buffers. It supports dense compression, fast
decoding, seeking, detection and optional skipping of data corruption, filtering
of proto message fields for even faster decoding, and parallel encoding.
See [documentation](https://github.com/google/riegeli/blob/master/doc/index.md).
# Status
Riegeli file format will only change in a backward compatible way (i.e. future
readers will understand current files, but current readers might not understand
files using future features).
Riegeli C++ API might change in incompatible ways.
================================================
FILE: python/__init__.py
================================================
================================================
FILE: python/build_pip_package.sh
================================================
#!/bin/bash
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds a pip package for riegeli.
#
# Usage (where DEST is a where to write the output, e.g. ~/riegeli-dist):
# $ bazel build -c opt python:build_pip_package
# $ bazel-bin/python/build_pip_package --dest DEST --sdist --bdist
set -e
# Succeeds (exit status 0) if $1 is an absolute path: either POSIX style
# ("/...") or Windows drive style ("C:/..." or "C:\...").
function is_absolute {
  case "$1" in
    /*) return 0 ;;
  esac
  [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
}
# Prints an absolute form of path $1 (no trailing newline): absolute paths
# are printed unchanged; relative paths are resolved against $PWD, with a
# leading "./" stripped first. The path is not canonicalized on disk.
function real_path() {
  local path=$1
  if is_absolute "$path"; then
    printf "%s" "$path"
  else
    printf "%s/%s" "$PWD" "${path#./}"
  fi
}
# Builds a source distribution into directory $1 by running the source tree's
# setup.py from the repository root.
function build_sdist() {
  local dest=$1
  python python/setup.py sdist --dist-dir "$dest"
}
# Builds a binary wheel into directory $1. Runs setup.py inside the bazel
# runfiles tree — presumably where the built extension modules live; confirm
# against setup.py. `cd -` returns to the previous directory afterwards
# (and echoes it, as usual for `cd -`).
function build_bdist() {
  local dest=$1
  cd bazel-bin/python/build_pip_package.runfiles/com_google_riegeli/python
  python setup.py bdist_wheel --dist-dir "$dest"
  cd -
}
# Parses command-line flags (--dest DIRECTORY, --sdist, --bdist), validates
# that a destination and at least one distribution kind were requested, then
# builds the requested distribution(s) into the destination directory.
function main() {
  local dest=
  local sdist=false
  local bdist=false
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dest)
        shift
        dest=$(real_path "$1")
        ;;
      --sdist)
        sdist=true
        ;;
      --bdist)
        bdist=true
        ;;
      *)
        printf "Unknown flag: %s\n" "$1" >&2
        exit 1
        ;;
    esac
    shift
  done
  if [[ -z $dest ]]; then
    printf "Missing required flag: --dest DIRECTORY\n" >&2
    exit 1
  fi
  if [[ $sdist != true ]] && [[ $bdist != true ]]; then
    printf "Nothing to do: missing --sdist or --bdist\n" >&2
    exit 1
  fi
  mkdir -p -- "$dest"
  if [[ $sdist = true ]]; then
    build_sdist "$dest"
  fi
  if [[ $bdist = true ]]; then
    build_bdist "$dest"
  fi
}
main "$@"
================================================
FILE: python/dummy_binary.py
================================================
================================================
FILE: python/riegeli/BUILD
================================================
# Riegeli, file format for storing a sequence of records.
load("@rules_python//python:defs.bzl", "py_library")
package(
default_visibility = ["//visibility:public"],
features = ["header_modules"],
)
licenses(["notice"])
exports_files(["LICENSE"])
py_library(
name = "riegeli",
srcs = ["__init__.py"],
imports = [".."],
deps = [
"//python/riegeli/base:riegeli_error",
"//python/riegeli/records:record_position",
"//python/riegeli/records:record_reader",
"//python/riegeli/records:record_writer",
"//python/riegeli/records:records_metadata_py_pb2",
"//python/riegeli/records:skipped_region",
],
)
================================================
FILE: python/riegeli/BUILD.tpl
================================================
load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair")
licenses(["restricted"])
package(
default_visibility = ["//visibility:public"],
features = ["header_modules"],
)
toolchain(
name = "toolchain",
toolchain = ":py_runtime_pair",
toolchain_type = "@bazel_tools//tools/python:toolchain_type",
)
# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
# See https://docs.python.org/3/extending/windows.html
cc_import(
name = "python_lib",
interface_library = select({
":windows": ":python_import_lib",
# A placeholder for Unix platforms which makes --no_build happy.
"//conditions:default": "not-existing.lib",
}),
system_provided = 1,
)
cc_library(
name = "python_headers",
hdrs = [":python_include"],
deps = select({
":windows": [":python_lib"],
"//conditions:default": [],
}),
includes = ["python_include"],
)
cc_library(
name = "numpy_headers",
hdrs = [":numpy_include"],
includes = ["numpy_include"],
)
config_setting(
name = "windows",
values = {"cpu": "x64_windows"},
visibility = ["//visibility:public"],
)
%{PYTHON_RUNTIME_PAIR}
%{PYTHON_INCLUDE_GENRULE}
%{NUMPY_INCLUDE_GENRULE}
%{PYTHON_IMPORT_LIB_GENRULE}
================================================
FILE: python/riegeli/__init__.py
================================================
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Writes or reads Riegeli/records files."""
from riegeli.base import riegeli_error
from riegeli.records import record_position
from riegeli.records import record_reader
from riegeli.records import record_writer
from riegeli.records import records_metadata_pb2
from riegeli.records import skipped_region
__all__ = (
'RiegeliError',
'CancelledError',
'UnknownError',
'InvalidArgumentError',
'DeadlineExceededError',
'NotFoundError',
'AlreadyExistsError',
'PermissionDeniedError',
'UnauthenticatedError',
'ResourceExhaustedError',
'FailedPreconditionError',
'AbortedError',
'OutOfRangeError',
'UnimplementedError',
'InternalError',
'UnavailableError',
'DataLossError',
'FlushType',
'RecordPosition',
'SkippedRegion',
'RecordsMetadata',
'set_record_type',
'RecordWriter',
'EXISTENCE_ONLY',
'get_record_type',
'RecordReader',
)
# pylint: disable=invalid-name
RiegeliError = riegeli_error.RiegeliError
CancelledError = riegeli_error.CancelledError
UnknownError = riegeli_error.UnknownError
InvalidArgumentError = riegeli_error.InvalidArgumentError
DeadlineExceededError = riegeli_error.DeadlineExceededError
NotFoundError = riegeli_error.NotFoundError
AlreadyExistsError = riegeli_error.AlreadyExistsError
PermissionDeniedError = riegeli_error.PermissionDeniedError
UnauthenticatedError = riegeli_error.UnauthenticatedError
ResourceExhaustedError = riegeli_error.ResourceExhaustedError
FailedPreconditionError = riegeli_error.FailedPreconditionError
AbortedError = riegeli_error.AbortedError
OutOfRangeError = riegeli_error.OutOfRangeError
UnimplementedError = riegeli_error.UnimplementedError
InternalError = riegeli_error.InternalError
UnavailableError = riegeli_error.UnavailableError
DataLossError = riegeli_error.DataLossError
RecordPosition = record_position.RecordPosition
SkippedRegion = skipped_region.SkippedRegion
RecordsMetadata = records_metadata_pb2.RecordsMetadata
FlushType = record_writer.FlushType
set_record_type = record_writer.set_record_type
RecordWriter = record_writer.RecordWriter
EXISTENCE_ONLY = record_reader.EXISTENCE_ONLY
get_record_type = record_reader.get_record_type
RecordReader = record_reader.RecordReader
================================================
FILE: python/riegeli/base/BUILD
================================================
load("@rules_cc//cc:defs.bzl", "cc_library")
load("@rules_python//python:defs.bzl", "py_library")
package(
default_visibility = ["//python/riegeli:__subpackages__"],
features = ["header_modules"],
)
licenses(["notice"])
cc_library(
name = "utils",
srcs = ["utils.cc"],
hdrs = ["utils.h"],
data = [":riegeli_error"], # Python module imported from C++.
# utils.cc has #define before #include to influence what the included
# files provide.
features = ["-use_header_modules"],
deps = [
"//riegeli/base:arithmetic",
"//riegeli/base:assert",
"//riegeli/base:chain",
"//riegeli/base:compare",
"//riegeli/base:types",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:string_view",
"@com_google_absl//absl/types:span",
"@rules_python//python/cc:current_py_cc_headers",
],
)
py_library(
name = "riegeli_error",
srcs = ["riegeli_error.py"],
)
================================================
FILE: python/riegeli/base/__init__.py
================================================
================================================
FILE: python/riegeli/base/riegeli_error.py
================================================
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = (
'RiegeliError',
'CancelledError',
'UnknownError',
'InvalidArgumentError',
'DeadlineExceededError',
'NotFoundError',
'AlreadyExistsError',
'PermissionDeniedError',
'UnauthenticatedError',
'ResourceExhaustedError',
'FailedPreconditionError',
'AbortedError',
'OutOfRangeError',
'UnimplementedError',
'InternalError',
'UnavailableError',
'DataLossError',
)
class RiegeliError(Exception):
  """Base class of errors reported by Riegeli.

  Sometimes multiple error codes may apply. Services should return the most
  specific error code that applies. For example, prefer `OutOfRangeError` over
  `FailedPreconditionError` if both codes apply. Similarly prefer
  `NotFoundError` or `AlreadyExistsError` over `FailedPreconditionError`.

  Attributes:
    code: Error code classifying the error, matching C++ StatusCode.
  """
class CancelledError(RiegeliError):
"""The operation was cancelled, typically by the caller."""
code = 1
class UnknownError(RiegeliError):
"""Unknown error.
For example, this error may be returned when a Status value received from
another address space belongs to an error-space that is not known in this
address space. Also errors raised by APIs that do not return enough error
information may be converted to this error.
"""
code = 2
class InvalidArgumentError(RiegeliError):
"""The client specified an invalid argument.
Note that this differs from `FailedPreconditionError`. `InvalidArgumentError`
indicates arguments that are problematic regardless of the state of the system
(e.g., a malformed file name).
"""
code = 3
class DeadlineExceededError(RiegeliError):
  """The deadline expired before the operation could complete.

  For operations that change the state of the system, this error may be
  returned even if the operation has completed successfully. For example, a
  successful response from a server could have been delayed long enough for
  the deadline to expire.
  """

  code = 4
class NotFoundError(RiegeliError):
"""Some requested entity (e.g., file or directory) was not found.
Note to server developers: if a request is denied for an entire class of
users, such as gradual feature rollout or undocumented allowlist,
`NotFoundError` may be used. If a request is denied for some users within a
class of users, such as user-based access control, `PermissionDeniedError`
must be used.
"""
code = 5
class AlreadyExistsError(RiegeliError):
"""The entity that a client attempted to create already exists."""
code = 6
class PermissionDeniedError(RiegeliError):
"""The caller does not have permission to execute the specified operation.
`PermissionDeniedError` must not be used for rejections caused by exhausting
some resource (use `ResourceExhaustedError` instead for those errors).
`PermissionDeniedError` must not be used if the caller can not be identified
(use `UnauthenticatedError` instead for those errors). This error code does
not imply the request is valid or the requested entity exists or satisfies
other pre-conditions.
"""
code = 7
class UnauthenticatedError(RiegeliError):
  """No valid authentication credentials for the operation."""

  # 16, not the next value in sequence: matches the numbering of the
  # corresponding C++ StatusCode (kUnauthenticated = 16).
  code = 16
class ResourceExhaustedError(RiegeliError):
"""Some resource has been exhausted.
Perhaps a per-user quota, or perhaps the entire file system is out of
space.
"""
code = 8
class FailedPreconditionError(RiegeliError):
"""Failed precondition.
The operation was rejected because the system is not in a state required for
the operation's execution. For example, the directory to be deleted is
non-empty, an rmdir operation is applied to a non-directory, etc.
A litmus test that may help a service implementor in deciding between
`FailedPreconditionError`, `AbortedError`, and `UnavailableError`:
(a) Use `UnavailableError` if the client can retry just the failing call.
(b) Use `AbortedError` if the client should retry at a higher-level (e.g.,
when a client-specified test-and-set fails, indicating the client should
restart a read-modify-write sequence).
(c) Use `FailedPreconditionError` if the client should not retry until the
system state has been explicitly fixed. E.g., if an "rmdir" fails because
the directory is non-empty, `FailedPreconditionError` should be returned
since the client should not retry unless the files are deleted from the
directory.
"""
code = 9
class AbortedError(RiegeliError):
"""The operation was aborted.
Typically due to a concurrency issue such as a sequencer check failure or
transaction abort.
See litmus test at `FailedPreconditionError` for deciding between
`FailedPreconditionError`, `AbortedError`, and `UnavailableError`.
"""
code = 10
class OutOfRangeError(RiegeliError):
"""The operation was attempted past the valid range.
E.g., seeking or reading past end-of-file.
Unlike `InvalidArgumentError`, this error indicates a problem that may be
fixed if the system state changes. For example, a 32-bit file system will
generate `InvalidArgumentError` if asked to read at an offset that is not in
the range [0,2^32-1], but it will generate `OutOfRangeError` if asked to read
from an offset past the current file size.
There is a fair bit of overlap between `FailedPreconditionError` and
`OutOfRangeError`. We recommend using `OutOfRangeError` (the more specific
error) when it applies so that callers who are iterating through a space can
easily look for an `OutOfRangeError` error to detect when they are done.
"""
code = 11
class UnimplementedError(RiegeliError):
"""The operation is not implemented.
Or is not supported/enabled in this service.
"""
code = 12
class InternalError(RiegeliError):
"""Internal errors.
This means that some invariants expected by the underlying system have been
broken. This error code is reserved for serious errors.
"""
code = 13
class UnavailableError(RiegeliError):
"""The service is currently unavailable.
This is most likely a transient condition, which can be corrected by retrying
with a backoff.
See litmus test at `FailedPreconditionError` for deciding between
`FailedPreconditionError`, `AbortedError`, and `UnavailableError`.
"""
code = 14
class DataLossError(RiegeliError):
"""Unrecoverable data loss or corruption."""
code = 15
================================================
FILE: python/riegeli/base/utils.cc
================================================
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#define PY_SSIZE_T_CLEAN
#include
// clang-format: do not reorder the above include.
#include "python/riegeli/base/utils.h"
// clang-format: do not reorder the above include.
#include
#include
#include
#include
#include
#include
#include "absl/base/attributes.h"
#include "absl/base/optimization.h"
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "riegeli/base/arithmetic.h"
#include "riegeli/base/assert.h"
#include "riegeli/base/chain.h"
#include "riegeli/base/compare.h"
#include "riegeli/base/types.h"
namespace riegeli::python {
// Copy assignment. Acquires the GIL (`PythonLock`) because Python reference
// counting is not thread-safe.
//
// Each reference is incremented before the corresponding member is reset, so
// that self-assignment is safe: the new reference is owned before the old one
// (possibly the same object) is released.
Exception& Exception::operator=(const Exception& that) noexcept {
  PythonLock lock;
  Py_XINCREF(that.type_.get());
  type_.reset(that.type_.get());
  Py_XINCREF(that.value_.get());
  value_.reset(that.value_.get());
  Py_XINCREF(that.traceback_.get());
  traceback_.reset(that.traceback_.get());
  return *this;
}
// Captures the currently raised Python exception into an `Exception`,
// clearing the Python error indicator. The GIL must be held by the caller.
Exception Exception::Fetch() {
  PythonLock::AssertHeld();
  PyObject* type;
  PyObject* value;
  PyObject* traceback;
  // `PyErr_Fetch()` transfers ownership of the three references to us.
  PyErr_Fetch(&type, &value, &traceback);
  // Ensure `value` is an instance of `type`; fetched exceptions may be in
  // unnormalized form.
  PyErr_NormalizeException(&type, &value, &traceback);
  return Exception(type, value, traceback);
}
// Re-raises the stored exception, keeping this `Exception` intact.
// Returns `nullptr` so callers can write `return exception.Restore();` from
// functions implementing the Python calling convention.
PyObject* Exception::Restore() const& {
  PythonLock::AssertHeld();
  // `PyErr_Restore()` steals references, so take additional ones to keep
  // ours alive.
  Py_XINCREF(type_.get());
  Py_XINCREF(value_.get());
  Py_XINCREF(traceback_.get());
  PyErr_Restore(type_.get(), value_.get(), traceback_.get());
  return nullptr;
}
// Re-raises the stored exception, transferring ownership of the stored
// references to the Python error indicator (this rvalue `Exception` is left
// empty). Returns `nullptr` for the Python calling convention.
PyObject* Exception::Restore() && {
  PythonLock::AssertHeld();
  PyErr_Restore(type_.release(), value_.release(), traceback_.release());
  return nullptr;
}
// Formats the stored exception for error messages as
// "ExceptionClassName: str(value)", or "OK" if no exception is stored.
// If `str(value)` fails or cannot be converted, falls back to the class name
// followed by ": ", clearing the secondary Python error.
std::string Exception::message() const {
  if (ok()) return "OK";
  PythonLock lock;
  RIEGELI_ASSERT(PyExceptionClass_Check(type_.get()))
      << "Expected an exception class, not " << Py_TYPE(type_.get())->tp_name;
  std::string message = PyExceptionClass_Name(type_.get());
  if (value_ == nullptr) return message;
  // `str(value)`.
  const PythonPtr str_result(PyObject_Str(value_.get()));
  if (ABSL_PREDICT_FALSE(str_result == nullptr)) {
    PyErr_Clear();
    absl::StrAppend(&message, ": ");
    return message;
  }
  StrOrBytes str;
  if (ABSL_PREDICT_FALSE(!str.FromPython(str_result.get()))) {
    PyErr_Clear();
    absl::StrAppend(&message, ": ");
    return message;
  }
  if (!absl::string_view(str).empty()) {
    absl::StrAppend(&message, ": ", absl::string_view(str));
  }
  return message;
}
// Raises the Python exception from `riegeli.base.riegeli_error` corresponding
// to `status.code()` (e.g. kInvalidArgument -> InvalidArgumentError), with
// `status.message()` as the exception message.
//
// Precondition: `!status.ok()`. The GIL must be held by the caller. If
// converting the message or importing the exception class fails, the Python
// error raised by that failure is left set instead.
void SetRiegeliError(const absl::Status& status) {
  RIEGELI_ASSERT(!status.ok())
      << "Failed precondition of SetRiegeliError(): status not failed";
  PythonLock::AssertHeld();
  PythonPtr message = StringToPython(status.message());
  if (ABSL_PREDICT_FALSE(message == nullptr)) return;
  PyObject* type;
  switch (status.code()) {
#define HANDLE_CODE(name)                                       \
  case absl::StatusCode::k##name: {                             \
    static constexpr ImportedConstant k##name##Error(           \
        "riegeli.base.riegeli_error", #name "Error");           \
    if (ABSL_PREDICT_FALSE(!k##name##Error.Verify())) return;   \
    type = k##name##Error.get();                                \
  } break
    // clang-format off
    HANDLE_CODE(Cancelled);
    // `default:` is deliberately placed here, attached to the Unknown case:
    // unrecognized status codes are reported as UnknownError.
    default:
    HANDLE_CODE(Unknown);
    HANDLE_CODE(InvalidArgument);
    HANDLE_CODE(DeadlineExceeded);
    HANDLE_CODE(NotFound);
    HANDLE_CODE(AlreadyExists);
    HANDLE_CODE(PermissionDenied);
    HANDLE_CODE(Unauthenticated);
    HANDLE_CODE(ResourceExhausted);
    HANDLE_CODE(FailedPrecondition);
    HANDLE_CODE(Aborted);
    HANDLE_CODE(OutOfRange);
    HANDLE_CODE(Unimplemented);
    HANDLE_CODE(Internal);
    HANDLE_CODE(Unavailable);
    HANDLE_CODE(DataLoss);
    // clang-format on
#undef HANDLE_CODE
  }
  // `PyErr_Restore()` steals references; `type` is a borrowed static, so take
  // a new reference for it.
  Py_INCREF(type);
  PyErr_Restore(type, message.release(), nullptr);
}
namespace py_internal {
namespace {
// A linked list of all objects of type `StaticObject` which have `value_`
// allocated, chained by their `next_` fields. This is used to free the objects
// on Python interpreter shutdown.
const StaticObject* all_static_objects = nullptr;
} // namespace
void FreeStaticObjectsImpl() {
const StaticObject* static_object =
std::exchange(all_static_objects, nullptr);
while (static_object != nullptr) {
static_object->value_ = nullptr;
static_object = std::exchange(static_object->next_, nullptr);
}
}
// `extern "C"` sets the C calling convention for compatibility with the Python
// API. `static` avoids making symbols public, as `extern "C"` trumps anonymous
// namespace.
extern "C" {
static void FreeStaticObjects() { FreeStaticObjectsImpl(); }
} // extern "C"
void StaticObject::RegisterThis() const {
PythonLock::AssertHeld();
if (all_static_objects == nullptr) {
// This is the first registered `StaticObject` since `Py_Initialize()`.
Py_AtExit(FreeStaticObjects);
}
next_ = std::exchange(all_static_objects, this);
}
bool ImportedCapsuleBase::ImportValue() const {
// For some reason `PyImport_ImportModule()` is sometimes required before
// `PyCapsule_Import()` for a module with a nested name.
const size_t dot = absl::string_view(capsule_name_).rfind('.');
RIEGELI_ASSERT_NE(dot, absl::string_view::npos)
<< "Capsule name does not contain a dot: " << capsule_name_;
RIEGELI_CHECK(
PyImport_ImportModule(std::string(capsule_name_, dot).c_str()) != nullptr)
<< Exception::Fetch().message();
value_ = PyCapsule_Import(capsule_name_, false);
return value_ != nullptr;
}
} // namespace py_internal
// Creates the interned `str` object for `name_`. Returns `false` on failure
// (with Python exception set).
bool Identifier::AllocateValue() const {
  value_ = StringToPython(name_).release();
  if (ABSL_PREDICT_FALSE(value_ == nullptr)) return false;
  // Interning may replace `value_` with the canonical copy of the string.
  PyUnicode_InternInPlace(&value_);
  // Arrange for `value_` to be dropped on interpreter shutdown.
  RegisterThis();
  return true;
}
// Imports `module_name_` and fetches its attribute `attr_name_` into `value_`.
// Returns `false` on failure (with Python exception set).
bool ImportedConstant::AllocateValue() const {
  const PythonPtr module_name = StringToPython(module_name_);
  if (ABSL_PREDICT_FALSE(module_name == nullptr)) return false;
  const PythonPtr module(PyImport_Import(module_name.get()));
  if (ABSL_PREDICT_FALSE(module == nullptr)) return false;
  const PythonPtr attr_name = StringToPython(attr_name_);
  if (ABSL_PREDICT_FALSE(attr_name == nullptr)) return false;
  // `value_` keeps the new reference returned by `PyObject_GetAttr()`.
  value_ = PyObject_GetAttr(module.get(), attr_name.get());
  if (ABSL_PREDICT_FALSE(value_ == nullptr)) return false;
  // Arrange for `value_` to be dropped on interpreter shutdown.
  RegisterThis();
  return true;
}
// Adds a capsule holding `ptr` to `module` under the attribute name encoded in
// `capsule_name` ("module_name.attr_name"). Returns `false` on failure (with
// Python exception set).
bool ExportCapsule(PyObject* module, const char* capsule_name,
                   const void* ptr) {
  // `PyCapsule_New()` takes a non-const pointer; the capsule only stores it.
  PythonPtr capsule(
      PyCapsule_New(const_cast<void*>(ptr), capsule_name, nullptr));
  if (ABSL_PREDICT_FALSE(capsule == nullptr)) return false;
  const size_t dot = absl::string_view(capsule_name).rfind('.');
  RIEGELI_ASSERT_NE(dot, absl::string_view::npos)
      << "Capsule name does not contain a dot: " << capsule_name;
  RIEGELI_ASSERT(PyModule_Check(module))
      << "Expected a module, not " << Py_TYPE(module)->tp_name;
  RIEGELI_ASSERT_EQ(absl::string_view(PyModule_GetName(module)),
                    absl::string_view(capsule_name, dot))
      << "Module name mismatch";
  // `PyModule_AddObject()` steals the capsule reference on success.
  if (ABSL_PREDICT_FALSE(PyModule_AddObject(module, capsule_name + dot + 1,
                                            capsule.release()) < 0)) {
    return false;
  }
  return true;
}
// Releases the `memoryview` if some other reference to it still exists, so
// that Python code cannot later touch the C++ memory it pointed at. The active
// Python exception, if any, is preserved across the `release()` call.
//
// Note: the variables are named to match the `PyErr_Fetch()` parameter order
// (type, value, traceback); the previous names were swapped and misleading.
MemoryView::~MemoryView() {
  if (object_ != nullptr && Py_REFCNT(object_.get()) > 1) {
    PyObject* type;
    PyObject* value;
    PyObject* traceback;
    PyErr_Fetch(&type, &value, &traceback);
    ReleaseInternal();
    PyErr_Restore(type, value, traceback);
  }
}
// Creates a read-only `memoryview` sharing `value`'s memory. Returns a
// borrowed pointer, or `nullptr` on failure (with Python exception set).
PyObject* MemoryView::ToPython(absl::string_view value) {
  RIEGELI_ASSERT_EQ(object_, nullptr)
      << "Failed precondition of MemoryView::ToPython(): "
         "called more than once";
  // `PyBUF_READ` makes the view read-only; the const_cast is safe because
  // Python code cannot write through it.
  object_.reset(PyMemoryView_FromMemory(const_cast<char*>(value.data()),
                                        IntCast<Py_ssize_t>(value.size()),
                                        PyBUF_READ));
  return object_.get();
}
// Creates a writable `memoryview` sharing `value`'s memory. Returns a
// borrowed pointer, or `nullptr` on failure (with Python exception set).
PyObject* MemoryView::MutableToPython(absl::Span<char> value) {
  RIEGELI_ASSERT_EQ(object_, nullptr)
      << "Failed precondition of MemoryView::MutableToPython(): "
         "called more than once";
  object_.reset(PyMemoryView_FromMemory(
      value.data(), IntCast<Py_ssize_t>(value.size()), PyBUF_WRITE));
  return object_.get();
}
// Invalidates the `memoryview` via `memoryview.release()` if any other
// reference to it exists, then drops our own reference. Returns `false` on
// failure (with Python exception set).
bool MemoryView::Release() {
  // `release()` is only needed when someone else still holds the memoryview.
  const bool release_ok = object_ == nullptr ||
                          Py_REFCNT(object_.get()) <= 1 || ReleaseInternal();
  object_.reset();
  return release_ok;
}
// Calls the Python method `release()` on the stored memoryview. Returns
// `false` on failure (with Python exception set).
inline bool MemoryView::ReleaseInternal() {
  static constexpr Identifier id_release("release");
  const PythonPtr release_result(
      PyObject_CallMethodObjArgs(object_.get(), id_release.get(), nullptr));
  return release_result != nullptr;
}
// Points `data_` at the text contents of `object`, which must be `str`
// (converted to UTF-8) or `bytes`. The referenced memory is owned by `object`.
// Returns `false` on failure (with Python exception set).
bool StrOrBytes::FromPython(PyObject* object ABSL_ATTRIBUTE_LIFETIME_BOUND) {
  RIEGELI_ASSERT_EQ(data_.data(), nullptr)
      << "Failed precondition of StrOrBytes::FromPython(): "
         "called more than once";
  if (PyUnicode_Check(object)) {
    // `str`: point at the UTF-8 representation cached inside `object`.
    Py_ssize_t length;
    const char* data = PyUnicode_AsUTF8AndSize(object, &length);
    if (ABSL_PREDICT_FALSE(data == nullptr)) return false;
    data_ = absl::string_view(data, IntCast<size_t>(length));
    return true;
  } else if (ABSL_PREDICT_FALSE(!PyBytes_Check(object))) {
    PyErr_Format(PyExc_TypeError, "Expected str or bytes, not %s",
                 Py_TYPE(object)->tp_name);
    return false;
  }
  // `bytes`: point directly at its internal buffer.
  data_ = absl::string_view(PyBytes_AS_STRING(object),
                            IntCast<size_t>(PyBytes_GET_SIZE(object)));
  return true;
}
// Copies a `Chain` into a new Python `bytes` object. Returns `nullptr` on
// failure (with Python exception set).
PythonPtr ChainToPython(const Chain& value) {
  // Allocate uninitialized `bytes` of the right size, then copy into it.
  PythonPtr bytes(
      PyBytes_FromStringAndSize(nullptr, IntCast<Py_ssize_t>(value.size())));
  if (ABSL_PREDICT_FALSE(bytes == nullptr)) return nullptr;
  value.CopyTo(PyBytes_AS_STRING(bytes.get()));
  return bytes;
}
std::optional ChainFromPython(PyObject* object) {
Py_buffer buffer;
if (ABSL_PREDICT_FALSE(PyObject_GetBuffer(object, &buffer, PyBUF_CONTIG_RO) <
0)) {
return std::nullopt;
}
Chain result(absl::string_view(static_cast(buffer.buf),
IntCast(buffer.len)));
PyBuffer_Release(&buffer);
return result;
}
// Converts C++ `size_t` to a Python `int` object. Returns `nullptr` on
// failure (with Python exception set).
PythonPtr SizeToPython(size_t value) {
  // Guard against platforms where `size_t` is wider than `unsigned long long`.
  if (ABSL_PREDICT_FALSE(value >
                         std::numeric_limits<unsigned long long>::max())) {
    PyErr_Format(PyExc_OverflowError, "Size out of range: %zu", value);
    return nullptr;
  }
  return PythonPtr(
      PyLong_FromUnsignedLongLong(IntCast<unsigned long long>(value)));
}
// Converts a Python object to C++ `size_t`, accepting `int` or anything
// supporting `__index__()`. Returns `std::nullopt` on failure (with Python
// exception set).
std::optional<size_t> SizeFromPython(PyObject* object) {
  const PythonPtr index(PyNumber_Index(object));
  if (ABSL_PREDICT_FALSE(index == nullptr)) return std::nullopt;
  RIEGELI_ASSERT(PyLong_Check(index.get()))
      << "PyNumber_Index() returned an unexpected type: "
      << Py_TYPE(index.get())->tp_name;
  // -1 is the error sentinel of `PyLong_AsUnsignedLongLong()`; distinguish a
  // genuine value of -1 from an error via `PyErr_Occurred()`.
  const unsigned long long index_value =
      PyLong_AsUnsignedLongLong(index.get());
  if (ABSL_PREDICT_FALSE(index_value ==
                         static_cast<unsigned long long>(-1)) &&
      PyErr_Occurred()) {
    return std::nullopt;
  }
  if (ABSL_PREDICT_FALSE(index_value > std::numeric_limits<size_t>::max())) {
    PyErr_Format(PyExc_OverflowError, "Size out of range: %llu", index_value);
    return std::nullopt;
  }
  return IntCast<size_t>(index_value);
}
// Converts C++ `Position` to a Python `int` object. Returns `nullptr` on
// failure (with Python exception set).
PythonPtr PositionToPython(Position value) {
  if (ABSL_PREDICT_FALSE(value >
                         std::numeric_limits<unsigned long long>::max())) {
    PyErr_Format(PyExc_OverflowError, "Position out of range: %ju",
                 uintmax_t{value});
    return nullptr;
  }
  return PythonPtr(
      PyLong_FromUnsignedLongLong(IntCast<unsigned long long>(value)));
}
std::optional PositionFromPython(PyObject* object) {
const PythonPtr index(PyNumber_Index(object));
if (ABSL_PREDICT_FALSE(index == nullptr)) return std::nullopt;
RIEGELI_ASSERT(PyLong_Check(index.get()))
<< "PyNumber_Index() returned an unexpected type: "
<< Py_TYPE(index.get())->tp_name;
const unsigned long long index_value = PyLong_AsUnsignedLongLong(index.get());
if (ABSL_PREDICT_FALSE(index_value == static_cast(-1)) &&
PyErr_Occurred()) {
return std::nullopt;
}
if (ABSL_PREDICT_FALSE(index_value > std::numeric_limits::max())) {
PyErr_Format(PyExc_OverflowError, "Position out of range: %llu",
index_value);
return std::nullopt;
}
return IntCast(index_value);
}
// Converts `PartialOrdering` to Python: `None` for `unordered`, otherwise an
// `int` of -1, 0, or 1. Returns `nullptr` on failure (with Python exception
// set).
PythonPtr PartialOrderingToPython(PartialOrdering ordering) {
  if (ordering == PartialOrdering::unordered) {
    Py_INCREF(Py_None);
    return PythonPtr(Py_None);
  }
  const long as_int = ordering < 0 ? -1 : ordering == 0 ? 0 : 1;
  return PythonPtr(PyLong_FromLong(as_int));
}
std::optional PartialOrderingFromPython(PyObject* object) {
if (object == Py_None) return PartialOrdering::unordered;
const long long_value = PyLong_AsLong(object);
if (ABSL_PREDICT_FALSE(long_value == -1) && PyErr_Occurred()) {
return std::nullopt;
}
return riegeli::Compare(long_value, 0);
}
} // namespace riegeli::python
================================================
FILE: python/riegeli/base/utils.h
================================================
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PYTHON_RIEGELI_BASE_UTILS_H_
#define PYTHON_RIEGELI_BASE_UTILS_H_
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#include
// clang-format: do not reorder the above include.
#include
#include
#include
#include
#include
#include
#include
#include "absl/base/attributes.h"
#include "absl/base/optimization.h"
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "riegeli/base/arithmetic.h"
#include "riegeli/base/assert.h"
#include "riegeli/base/chain.h"
#include "riegeli/base/compare.h"
#include "riegeli/base/types.h"
namespace riegeli::python {
// Ensures that Python GIL is locked. Reentrant.
//
// Same as `PyGILState_Ensure()` / `PyGILState_Release()`.
class PythonLock {
public:
static void AssertHeld() {
RIEGELI_ASSERT(PyGILState_Check()) << "Python GIL was assumed to be held";
}
PythonLock() { gstate_ = PyGILState_Ensure(); }
PythonLock(const PythonLock&) = delete;
PythonLock& operator=(const PythonLock&) = delete;
~PythonLock() { PyGILState_Release(gstate_); }
private:
PyGILState_STATE gstate_;
};
// Unlocks Python GIL, allowing non-Python threads to run.
//
// Same as `Py_BEGIN_ALLOW_THREADS` / `Py_END_ALLOW_THREADS`.
class PythonUnlock {
public:
PythonUnlock() {
PythonLock::AssertHeld();
tstate_ = PyEval_SaveThread();
}
PythonUnlock(const PythonUnlock&) = delete;
PythonUnlock& operator=(const PythonUnlock&) = delete;
~PythonUnlock() { PyEval_RestoreThread(tstate_); }
private:
PyThreadState* tstate_;
};
// Apply a function with Python GIL unlocked, allowing non-Python threads to
// run.
//
// Same as `Py_BEGIN_ALLOW_THREADS` / `Py_END_ALLOW_THREADS`.
template
std::invoke_result_t PythonUnlocked(Function&& f) {
PythonUnlock unlock;
return std::forward(f)();
}
// Owned `PyObject` which assumes that Python GIL is held.
struct Deleter {
template
void operator()(T* ptr) const {
PythonLock::AssertHeld();
Py_DECREF(ptr);
}
};
using PythonPtr = std::unique_ptr;
// Owned `PyObject` which does not assume that Python GIL is held.
struct LockingDeleter {
template
void operator()(T* ptr) const {
PythonLock lock;
Py_DECREF(ptr);
}
};
using PythonPtrLocking = std::unique_ptr;
// Allows a C++ object to be safely embedded in a Python object allocated with
// `PyType_GenericAlloc()`.
//
// `PythonWrapped` is similar to `std::optional`, but:
// * `PythonWrapped` is POD.
// * `PythonWrapped` supports only a subset of `std::optional` API.
// * `PythonWrapped` filled with zero bytes is valid and absent
// (`PyType_GenericAlloc()` fills the Python object with zero bytes).
// * `PythonWrapped` should be explicitly `reset()` in the implementation of
// `tp_dealloc` (there is no C++ destructor).
template
class PythonWrapped {
public:
static_assert(alignof(T) <= alignof(max_align_t),
"PythonWrapped does not support overaligned types");
template
ABSL_ATTRIBUTE_REINITIALIZES void emplace(Args&&... args) {
if (has_value_) {
get()->~T();
} else {
has_value_ = true;
}
new (storage_) T(std::forward(args)...);
}
ABSL_ATTRIBUTE_REINITIALIZES void reset() {
if (has_value_) {
get()->~T();
has_value_ = false;
}
}
bool has_value() const { return has_value_; }
T* get() ABSL_ATTRIBUTE_LIFETIME_BOUND {
RIEGELI_ASSERT(has_value_) << "Object uninitialized";
return std::launder(reinterpret_cast(storage_));
}
const T* get() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
RIEGELI_ASSERT(has_value_) << "Object uninitialized";
return std::launder(reinterpret_cast(storage_));
}
T& operator*() ABSL_ATTRIBUTE_LIFETIME_BOUND { return *get(); }
const T& operator*() const ABSL_ATTRIBUTE_LIFETIME_BOUND { return *get(); }
T* operator->() ABSL_ATTRIBUTE_LIFETIME_BOUND { return get(); }
const T* operator->() const ABSL_ATTRIBUTE_LIFETIME_BOUND { return get(); }
bool Verify() const {
PythonLock::AssertHeld();
if (ABSL_PREDICT_FALSE(!has_value())) {
PyErr_SetString(PyExc_ValueError, "Object uninitialized");
return false;
}
return true;
}
private:
bool has_value_;
alignas(T) char storage_[sizeof(T)];
};
// Represents an optional Python exception being raised.
//
// Stores the (type, value, traceback) triple with locking deleters, so an
// `Exception` may be destroyed without the GIL held.
class Exception {
 public:
  // No exception.
  Exception() = default;

  // Copying shares the underlying Python objects via extra references.
  Exception(const Exception& that) noexcept;
  Exception& operator=(const Exception& that) noexcept;

  Exception(Exception&& that) = default;
  Exception& operator=(Exception&& that) = default;

  // Fetches the active Python exception.
  static Exception Fetch();

  // Restores the active Python exception. The `const&` overload keeps this
  // object intact; the `&&` overload transfers ownership to the interpreter.
  // Both return `nullptr` for convenient use in `return` statements.
  PyObject* Restore() const&;
  PyObject* Restore() &&;

  // Returns `true` if no exception is stored.
  bool ok() const { return type_ == nullptr; }

  // Formats the exception as a human-readable message ("OK" if `ok()`).
  std::string message() const;

  // For implementing `tp_traverse` of objects containing `Exception`.
  int Traverse(visitproc visit, void* arg);

 private:
  // Steals references.
  explicit Exception(PyObject* type, PyObject* value, PyObject* traceback)
      : type_(type), value_(value), traceback_(traceback) {}

  PythonPtrLocking type_;
  PythonPtrLocking value_;
  PythonPtrLocking traceback_;
};
// Translate a failed status to the active Python exception, a class extending
// `RiegeliError`.
void SetRiegeliError(const absl::Status& status);
namespace py_internal {

// Lazily initialized pointer to a Python object, persisting until interpreter
// shutdown.
class StaticObject {
 protected:
  // The cached Python object; `nullptr` until allocated by a derived class.
  mutable PyObject* value_ = nullptr;
  // Next entry in the global list of registered `StaticObject`s.
  mutable const StaticObject* next_ = nullptr;

  // Register this object in a global list of static objects. This must be
  // called when `value_` is allocated, so that it can be freed on interpreter
  // shutdown.
  void RegisterThis() const;

 private:
  friend void FreeStaticObjectsImpl();
};

// Template parameter independent part of `ImportedCapsule`.
class ImportedCapsuleBase {
 public:
  // Forces importing the value, returning `false` on failures (with Python
  // exception set).
  //
  // If `Verify()` returns `true`, `get()` does not die.
  bool Verify() const {
    PythonLock::AssertHeld();
    if (ABSL_PREDICT_FALSE(value_ == nullptr)) return ImportValue();
    return true;
  }

 protected:
  // `capsule_name` must be "module_name.attr_name" and outlive this object.
  explicit constexpr ImportedCapsuleBase(const char* capsule_name)
      : capsule_name_(capsule_name) {}

  ~ImportedCapsuleBase() = default;

  // Imports the capsule and stores its pointer in `value_`.
  bool ImportValue() const;

  // The pointer stored in the capsule; `nullptr` until imported.
  mutable void* value_ = nullptr;

 private:
  const char* capsule_name_;
};

}  // namespace py_internal
// Creates a Python string (type `str`) which persists until interpreter
// shutdown. This is useful for attribute or method names in
// `PyObject_GetAttr()` or `PyObject_CallMethodObjArgs()`.
//
// An instance of `Identifier` should be allocated statically:
// ```
//   static constexpr Identifier id_write("write");
// ```
//
// Then `id_write.get()` is a borrowed reference to the Python object.
class Identifier : public py_internal::StaticObject {
 public:
  // `name` must outlive this object (a string literal in practice).
  explicit constexpr Identifier(absl::string_view name) : name_(name) {}

  // Forces allocating the value, returning `false` on failures (with Python
  // exception set).
  //
  // If `Verify()` returns `true`, `get()` does not die.
  bool Verify() const {
    PythonLock::AssertHeld();
    if (ABSL_PREDICT_FALSE(value_ == nullptr)) return AllocateValue();
    return true;
  }

  // Returns the value, allocating it on the first call. Dies on failure
  // (use `Verify()` to prevent this).
  PyObject* get() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
    PythonLock::AssertHeld();
    if (ABSL_PREDICT_FALSE(value_ == nullptr)) {
      RIEGELI_CHECK(AllocateValue()) << Exception::Fetch().message();
    }
    return value_;
  }

 private:
  // Creates and interns the `str` object in `value_`.
  bool AllocateValue() const;

  absl::string_view name_;
};
// Imports a Python module and gets its attribute, which persists until
// interpreter shutdown.
//
// An instance of `ImportedConstant` should be allocated statically:
// ```
//   static constexpr ImportedConstant kRiegeliError(
//       "riegeli.base.riegeli_error", "RiegeliError");
// ```
//
// Then `kRiegeliError.get()` is a borrowed reference to the Python object.
class ImportedConstant : public py_internal::StaticObject {
 public:
  // `module_name` and `attr_name` must outlive this object (string literals
  // in practice).
  explicit constexpr ImportedConstant(absl::string_view module_name,
                                      absl::string_view attr_name)
      : module_name_(module_name), attr_name_(attr_name) {}

  // Forces importing the value, returning `false` on failures (with Python
  // exception set).
  //
  // If `Verify()` returns `true`, `get()` does not die.
  bool Verify() const {
    PythonLock::AssertHeld();
    if (ABSL_PREDICT_FALSE(value_ == nullptr)) return AllocateValue();
    return true;
  }

  // Returns the value, importing it on the first call. Dies on failure
  // (use `Verify()` to prevent this).
  PyObject* get() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
    PythonLock::AssertHeld();
    if (ABSL_PREDICT_FALSE(value_ == nullptr)) {
      RIEGELI_CHECK(AllocateValue()) << Exception::Fetch().message();
    }
    return value_;
  }

 private:
  // Imports the module and stores the attribute in `value_`.
  bool AllocateValue() const;

  absl::string_view module_name_;
  absl::string_view attr_name_;
};
// Exports a Python capsule containing a C++ pointer, which should be valid
// forever, by adding it to the given module.
//
// `capsule_name` must be "module_name.attr_name" with `module_name`
// corresponding to `PyModule_GetName(module)`.
//
// Returns `false` on failure (with Python exception set).
bool ExportCapsule(PyObject* module, const char* capsule_name, const void* ptr);
// Imports a Python capsule and gets its stored pointer, which persists forever.
//
// `capsule_name` must be "module_name.attr_name".
//
// An instance of `ImportedCapsule<T>` should be allocated statically:
// ```
//   static constexpr ImportedCapsule<RecordPositionApi> kRecordPositionApi(
//       "riegeli.records.record_position._CPPAPI");
// ```
//
// Then `kRecordPositionApi.get()` is a pointer stored in the capsule.
template <typename T>
class ImportedCapsule : public py_internal::ImportedCapsuleBase {
 public:
  explicit constexpr ImportedCapsule(const char* capsule_name)
      : ImportedCapsuleBase(capsule_name) {}

  // Returns the value, importing it on the first call. Dies on failure
  // (use `Verify()` to prevent this).
  const T* get() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
    PythonLock::AssertHeld();
    if (ABSL_PREDICT_FALSE(value_ == nullptr)) {
      RIEGELI_CHECK(ImportValue()) << Exception::Fetch().message();
    }
    return static_cast<const T*>(value_);
  }

  const T& operator*() const ABSL_ATTRIBUTE_LIFETIME_BOUND { return *get(); }
  const T* operator->() const ABSL_ATTRIBUTE_LIFETIME_BOUND { return get(); }
};
// Converts C++ `long` to a Python `int` object.
//
// Returns `nullptr` on failure (with Python exception set).
inline PythonPtr IntToPython(long value) {
  PyObject* const result = PyLong_FromLong(value);
  return PythonPtr(result);
}
// Converts C++ `absl::string_view` to a Python `bytes` object (copying the
// data).
//
// Returns `nullptr` on failure (with Python exception set).
inline PythonPtr BytesToPython(absl::string_view value) {
  return PythonPtr(PyBytes_FromStringAndSize(
      value.data(), IntCast<Py_ssize_t>(value.size())));
}
// Converts C++ array of bytes to a Python `memoryview` object.
//
// Memory is shared. The C++ memory must be valid as long as the Python object
// is needed.
class MemoryView {
 public:
  MemoryView() = default;

  MemoryView(const MemoryView&) = delete;
  MemoryView& operator=(const MemoryView&) = delete;

  // Calls `Release()`, ignoring its result, without disturbing the Python
  // exception state.
  ~MemoryView();

  // Creates and returns a read-only `memoryview` object.
  //
  // Returns `nullptr` on failure (with Python exception set).
  //
  // `ToPython()` or `MutableToPython()` must be called at most once for each
  // `MemoryView` object.
  PyObject* ToPython(absl::string_view value);

  // Creates and returns a mutable `memoryview` object.
  //
  // Returns `nullptr` on failure (with Python exception set).
  //
  // `ToPython()` or `MutableToPython()` must be called at most once for each
  // `MemoryView` object.
  PyObject* MutableToPython(absl::Span<char> value);

  // If a reference to the `memoryview` has been stored elsewhere, calls
  // `memoryview.release()` to mark the `memoryview` as invalid.
  //
  // Returns `false` on failure (with Python exception set).
  bool Release();

 private:
  // Calls `memoryview.release()` on `object_`.
  bool ReleaseInternal();

  PythonPtr object_;
};
// Refers to internals of a Python `bytes`-like object, using the buffer
// protocol.
class BytesLike {
 public:
  BytesLike() noexcept { buffer_.obj = nullptr; }

  BytesLike(const BytesLike&) = delete;
  BytesLike& operator=(const BytesLike&) = delete;

  ~BytesLike() {
    PythonLock::AssertHeld();
    if (buffer_.obj != nullptr) PyBuffer_Release(&buffer_);
  }

  // Converts from a Python object.
  //
  // Returns `false` on failure (with Python exception set).
  //
  // Must be called at most once for each `BytesLike` object.
  bool FromPython(PyObject* object) {
    RIEGELI_ASSERT_EQ(buffer_.obj, nullptr)
        << "Failed precondition of BytesLike::FromPython(): "
           "called more than once";
    return PyObject_GetBuffer(object, &buffer_, PyBUF_CONTIG_RO) == 0;
  }

  // Returns the binary contents. Valid only while this `BytesLike` is alive.
  /*implicit*/ operator absl::string_view() const
      ABSL_ATTRIBUTE_LIFETIME_BOUND {
    return absl::string_view(static_cast<const char*>(buffer_.buf),
                             IntCast<size_t>(buffer_.len));
  }

 private:
  Py_buffer buffer_;
};
// Converts C++ `absl::string_view` to a Python `str` object. Unicode is
// converted from UTF-8.
//
// Returns `nullptr` on failure (with Python exception set).
inline PythonPtr StringToPython(absl::string_view value) {
  return PythonPtr(PyUnicode_FromStringAndSize(
      value.data(), IntCast<Py_ssize_t>(value.size())));
}
// Refers to internals of a Python object representing text. Valid Python
// objects are `str` or `bytes`. Unicode is converted to UTF-8.
class StrOrBytes {
 public:
  StrOrBytes() noexcept {}

  StrOrBytes(const StrOrBytes&) = delete;
  StrOrBytes& operator=(const StrOrBytes&) = delete;

  // Converts from a Python object.
  //
  // Returns `false` on failure (with Python exception set).
  //
  // Must be called at most once for each `StrOrBytes` object.
  bool FromPython(PyObject* object ABSL_ATTRIBUTE_LIFETIME_BOUND);

  // Returns the text contents. The referenced memory is owned by the Python
  // object passed to `FromPython()` and is valid only while it is alive.
  /*implicit*/ operator absl::string_view() const { return data_; }

 private:
  absl::string_view data_;
};
// Converts C++ `Chain` to a Python `bytes` object.
//
// Returns `nullptr` on failure (with Python exception set).
PythonPtr ChainToPython(const Chain& value);
// Converts a Python `bytes`-like object to C++ `Chain`, using the buffer
// protocol.
//
// Returns `std::nullopt` on failure (with Python exception set).
std::optional ChainFromPython(PyObject* object);
// Converts C++ `size_t` to a Python `int` object.
//
// Returns `nullptr` on failure (with Python exception set).
PythonPtr SizeToPython(size_t value);
// Converts a Python object to C++ `size_t`. Valid Python objects are the same
// as for slicing: `int` or objects supporting `__index__()`.
//
// Returns `std::nullopt` on failure (with Python exception set).
std::optional SizeFromPython(PyObject* object);
// Converts C++ `Position` to a Python `int` object.
//
// Returns `nullptr` on failure (with Python exception set).
PythonPtr PositionToPython(Position value);
// Converts a Python object to C++ `Position`. Valid Python objects are the same
// as for slicing: `int` or objects supporting `__index__()`.
//
// Returns `std::nullopt` on failure (with Python exception set).
std::optional PositionFromPython(PyObject* object);
// Converts C++ `PartialOrdering` to a Python `None` (for `unordered`) or `int`
// object (-1 for `less`, 0 for `equivalent`, or 1 for `greater`).
//
// Returns `nullptr` on failure (with Python exception set).
PythonPtr PartialOrderingToPython(PartialOrdering ordering);
// Converts a Python object to C++ `PartialOrdering`. Valid Python objects are
// `int` (compared with 0) or `None`.
//
// Returns `std::nullopt` on failure (with Python exception set).
std::optional PartialOrderingFromPython(PyObject* object);
// Implementation details follow.

// Copying delegates to the copy assignment defined in utils.cc, which adds
// references to the shared Python objects.
inline Exception::Exception(const Exception& that) noexcept { *this = that; }

// Reports the stored Python objects to the garbage collector; for use in
// `tp_traverse` of containing objects.
inline int Exception::Traverse(visitproc visit, void* arg) {
  Py_VISIT(type_.get());
  Py_VISIT(value_.get());
  Py_VISIT(traceback_.get());
  return 0;
}
} // namespace riegeli::python
#endif // PYTHON_RIEGELI_BASE_UTILS_H_
================================================
FILE: python/riegeli/bytes/BUILD
================================================
load("@rules_cc//cc:defs.bzl", "cc_library")
package(
default_visibility = ["//python/riegeli:__subpackages__"],
features = ["header_modules"],
)
licenses(["notice"])
# C++ `Reader` adapter over a Python binary IO object (see python_reader.h).
cc_library(
    name = "python_reader",
    srcs = ["python_reader.cc"],
    hdrs = ["python_reader.h"],
    # python_reader.cc has #define before #include to influence what the
    # included files provide.
    features = ["-use_header_modules"],
    deps = [
        "//python/riegeli/base:utils",
        "//riegeli/base:arithmetic",
        "//riegeli/base:assert",
        "//riegeli/base:global",
        "//riegeli/base:object",
        "//riegeli/base:types",
        "//riegeli/bytes:buffer_options",
        "//riegeli/bytes:buffered_reader",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/numeric:bits",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:string_view",
        "@com_google_absl//absl/types:span",
        "@rules_python//python/cc:current_py_cc_headers",
    ],
)
# C++ `Writer` adapter over a Python binary IO object (see python_writer.h).
cc_library(
    name = "python_writer",
    srcs = ["python_writer.cc"],
    hdrs = ["python_writer.h"],
    # python_writer.cc has #define before #include to influence what the
    # included files provide.
    features = ["-use_header_modules"],
    deps = [
        "//python/riegeli/base:utils",
        "//riegeli/base:arithmetic",
        "//riegeli/base:assert",
        "//riegeli/base:global",
        "//riegeli/base:object",
        "//riegeli/base:types",
        "//riegeli/bytes:buffer_options",
        "//riegeli/bytes:buffered_writer",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/numeric:bits",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:string_view",
        "@rules_python//python/cc:current_py_cc_headers",
    ],
)
================================================
FILE: python/riegeli/bytes/python_reader.cc
================================================
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#define PY_SSIZE_T_CLEAN
#include
// clang-format: do not reorder the above include.
#include "python/riegeli/bytes/python_reader.h"
// clang-format: do not reorder the above include.
#include
#include
#include
#include
#include "absl/base/optimization.h"
#include "absl/numeric/bits.h"
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "python/riegeli/base/utils.h"
#include "riegeli/base/arithmetic.h"
#include "riegeli/base/assert.h"
#include "riegeli/base/global.h"
#include "riegeli/base/types.h"
#include "riegeli/bytes/buffered_reader.h"
namespace riegeli::python {
// Wraps the Python binary IO object `src`. Determines whether random access is
// supported by calling `seekable()`, and takes the initial position from
// `tell()`; `Options::assumed_pos()` bypasses both.
PythonReader::PythonReader(PyObject* src, Options options)
    : BufferedReader(options.buffer_options()), owns_src_(options.owns_src()) {
  PythonLock::AssertHeld();
  // Share ownership of `src` with the caller.
  Py_INCREF(src);
  src_.reset(src);
  if (options.assumed_pos() != std::nullopt) {
    set_limit_pos(*options.assumed_pos());
    // `supports_random_access_` is left as `false`.
    random_access_status_ = Global([] {
      return absl::UnimplementedError(
          "PythonReader::Options::assumed_pos() excludes random access");
    });
  } else {
    // Ask the file whether it supports seeking.
    static constexpr Identifier id_seekable("seekable");
    const PythonPtr seekable_result(
        PyObject_CallMethodObjArgs(src_.get(), id_seekable.get(), nullptr));
    if (ABSL_PREDICT_FALSE(seekable_result == nullptr)) {
      FailOperation("seekable()");
      return;
    }
    const int seekable_is_true = PyObject_IsTrue(seekable_result.get());
    if (ABSL_PREDICT_FALSE(seekable_is_true < 0)) {
      FailOperation("PyObject_IsTrue() after seekable()");
      return;
    }
    if (seekable_is_true == 0) {
      // Random access is not supported. Assume 0 as the initial position.
      // `supports_random_access_` is left as `false`.
      random_access_status_ = Global([] {
        return absl::UnimplementedError(
            "seekable() is False which excludes random access");
      });
      return;
    }
    // Take the current file position as the initial position.
    static constexpr Identifier id_tell("tell");
    const PythonPtr tell_result(
        PyObject_CallMethodObjArgs(src_.get(), id_tell.get(), nullptr));
    if (ABSL_PREDICT_FALSE(tell_result == nullptr)) {
      FailOperation("tell()");
      return;
    }
    const std::optional<Position> file_pos =
        PositionFromPython(tell_result.get());
    if (ABSL_PREDICT_FALSE(file_pos == std::nullopt)) {
      FailOperation("PositionFromPython() after tell()");
      return;
    }
    set_limit_pos(*file_pos);
    supports_random_access_ = true;
  }
  BeginRun();
}
// Finishes reading and, if this object owns the file, calls its `close()`
// method, failing the reader if `close()` raises.
void PythonReader::Done() {
  BufferedReader::Done();
  random_access_status_ = absl::OkStatus();
  if (src_ == nullptr || !owns_src_) return;
  PythonLock lock;
  static constexpr Identifier id_close("close");
  const PythonPtr close_result(
      PyObject_CallMethodObjArgs(src_.get(), id_close.get(), nullptr));
  if (ABSL_PREDICT_FALSE(close_result == nullptr)) FailOperation("close()");
}
// Fails the reader with an `UnknownError` describing `operation` and the
// active Python exception, which is captured into `exception_`. Always returns
// `false`.
inline bool PythonReader::FailOperation(absl::string_view operation) {
  RIEGELI_ASSERT(is_open())
      << "Failed precondition of PythonReader::FailOperation(): "
         "Object closed";
  PythonLock::AssertHeld();
  if (ABSL_PREDICT_FALSE(!ok())) {
    // Ignore this error because `PythonReader` already failed.
    PyErr_Clear();
    return false;
  }
  exception_ = Exception::Fetch();
  return Fail(absl::UnknownError(
      absl::StrCat(operation, " failed: ", exception_.message())));
}
// Reads between `min_length` and `max_length` bytes into `dest` by calling the
// most capable read method the Python stream provides. Returns `true` once at
// least `min_length` bytes were read; returns `false` on end of stream or
// failure (the two are distinguished by `ok()`).
bool PythonReader::ReadInternal(size_t min_length, size_t max_length,
                                char* dest) {
  RIEGELI_ASSERT_GT(min_length, 0u)
      << "Failed precondition of BufferedReader::ReadInternal(): "
         "nothing to read";
  RIEGELI_ASSERT_GE(max_length, min_length)
      << "Failed precondition of BufferedReader::ReadInternal(): "
         "max_length < min_length";
  RIEGELI_ASSERT_OK(*this)
      << "Failed precondition of BufferedReader::ReadInternal()";
  PythonLock lock;
  // Find a read function to use, preferring in order: `readinto1()`,
  // `readinto()`, `read1()`, `read()`. The chosen bound method is cached in
  // `read_function_` so the lookup happens only on the first read.
  if (ABSL_PREDICT_FALSE(read_function_ == nullptr)) {
    static constexpr Identifier id_readinto1("readinto1");
    read_function_.reset(PyObject_GetAttr(src_.get(), id_readinto1.get()));
    read_function_name_ = "readinto1()";
    if (read_function_ == nullptr) {
      // Only `AttributeError` means "try the next candidate"; any other
      // exception is a real failure.
      if (ABSL_PREDICT_FALSE(!PyErr_ExceptionMatches(PyExc_AttributeError))) {
        return FailOperation(read_function_name_);
      }
      PyErr_Clear();
      static constexpr Identifier id_readinto("readinto");
      read_function_.reset(PyObject_GetAttr(src_.get(), id_readinto.get()));
      read_function_name_ = "readinto()";
      if (read_function_ == nullptr) {
        if (ABSL_PREDICT_FALSE(!PyErr_ExceptionMatches(PyExc_AttributeError))) {
          return FailOperation(read_function_name_);
        }
        PyErr_Clear();
        // Neither `readinto1()` nor `readinto()` exists: fall back to methods
        // which return a new bytes-like object instead of filling a buffer.
        use_bytes_ = true;
        static constexpr Identifier id_read1("read1");
        read_function_.reset(PyObject_GetAttr(src_.get(), id_read1.get()));
        read_function_name_ = "read1()";
        if (read_function_ == nullptr) {
          if (ABSL_PREDICT_FALSE(
                  !PyErr_ExceptionMatches(PyExc_AttributeError))) {
            return FailOperation(read_function_name_);
          }
          PyErr_Clear();
          static constexpr Identifier id_read("read");
          read_function_.reset(PyObject_GetAttr(src_.get(), id_read.get()));
          read_function_name_ = "read()";
          if (ABSL_PREDICT_FALSE(read_function_ == nullptr)) {
            return FailOperation(read_function_name_);
          }
        }
      }
    }
  }
  // A single Python call may read less than requested, so loop until
  // `min_length` is satisfied or the stream ends.
  for (;;) {
    if (ABSL_PREDICT_FALSE(limit_pos() ==
                           std::numeric_limits::max())) {
      return FailOverflow();
    }
    // Clamp the request so that the position cannot overflow and the length
    // fits in the Python size type (the `bit_floor` term keeps it a friendly
    // power of two at the extreme).
    const size_t length_to_read = UnsignedMin(
        max_length, std::numeric_limits::max() - limit_pos(),
        absl::bit_floor(size_t{std::numeric_limits::max()}));
    size_t length_read;
    if (!use_bytes_) {
      PythonPtr read_result;
      {
        // Prefer using `readinto1()` or `readinto()` to avoid copying memory.
        MemoryView memory_view;
        PyObject* const memory_view_object =
            memory_view.MutableToPython(absl::MakeSpan(dest, length_to_read));
        if (ABSL_PREDICT_FALSE(memory_view_object == nullptr)) {
          return FailOperation("MemoryView::MutableToPython()");
        }
        read_result.reset(PyObject_CallFunctionObjArgs(
            read_function_.get(), memory_view_object, nullptr));
        if (ABSL_PREDICT_FALSE(read_result == nullptr)) {
          return FailOperation(read_function_name_);
        }
        // Release the memoryview before leaving the scope; a failure here
        // means the stream kept a reference to the buffer.
        if (ABSL_PREDICT_FALSE(!memory_view.Release())) {
          return FailOperation("MemoryView::Release()");
        }
      }
      // `readinto*()` returns the number of bytes stored.
      const std::optional length_read_opt =
          SizeFromPython(read_result.get());
      if (ABSL_PREDICT_FALSE(length_read_opt == std::nullopt)) {
        return FailOperation(
            absl::StrCat("SizeFromPython() after ", read_function_name_));
      }
      length_read = *length_read_opt;
      // 0 bytes from `readinto*()` means end of stream.
      if (ABSL_PREDICT_FALSE(length_read == 0)) return false;
      if (ABSL_PREDICT_FALSE(length_read > max_length)) {
        return Fail(absl::InternalError(
            absl::StrCat(read_function_name_, " read more than requested")));
      }
    } else {
      // `read1()` / `read()` take the requested length and return a
      // bytes-like object, which must be copied into `dest`.
      const PythonPtr length(SizeToPython(length_to_read));
      if (ABSL_PREDICT_FALSE(length == nullptr)) {
        return FailOperation("SizeToPython()");
      }
      const PythonPtr read_result(PyObject_CallFunctionObjArgs(
          read_function_.get(), length.get(), nullptr));
      if (ABSL_PREDICT_FALSE(read_result == nullptr)) {
        return FailOperation(read_function_name_);
      }
      Py_buffer buffer;
      if (ABSL_PREDICT_FALSE(PyObject_GetBuffer(read_result.get(), &buffer,
                                                PyBUF_CONTIG_RO) < 0)) {
        return FailOperation(
            absl::StrCat("PyObject_GetBuffer() after ", read_function_name_));
      }
      // An empty result means end of stream. `buffer` must be released on
      // every path after a successful `PyObject_GetBuffer()`.
      if (ABSL_PREDICT_FALSE(buffer.len == 0)) {
        PyBuffer_Release(&buffer);
        return false;
      }
      if (ABSL_PREDICT_FALSE(IntCast(buffer.len) > max_length)) {
        PyBuffer_Release(&buffer);
        return Fail(absl::InternalError(
            absl::StrCat(read_function_name_, " read more than requested")));
      }
      std::memcpy(dest, buffer.buf, IntCast(buffer.len));
      length_read = IntCast(buffer.len);
      PyBuffer_Release(&buffer);
    }
    move_limit_pos(length_read);
    if (length_read >= min_length) return true;
    // Short read: advance the destination and try again for the remainder.
    dest += length_read;
    min_length -= length_read;
    max_length -= length_read;
  }
}
// Seeks to `new_pos` which lies outside the current buffer. Without random
// access support, seeking forwards falls back to reading and discarding data,
// while seeking backwards fails with the recorded reason.
bool PythonReader::SeekBehindBuffer(Position new_pos) {
  RIEGELI_ASSERT(new_pos < start_pos() || new_pos > limit_pos())
      << "Failed precondition of BufferedReader::SeekBehindBuffer(): "
         "position in the buffer, use Seek() instead";
  RIEGELI_ASSERT_EQ(start_to_limit(), 0u)
      << "Failed precondition of BufferedReader::SeekBehindBuffer(): "
         "buffer not empty";
  if (ABSL_PREDICT_FALSE(!PythonReader::SupportsRandomAccess())) {
    if (ABSL_PREDICT_FALSE(new_pos < start_pos())) {
      // Cannot go backwards without `seek()`; fail with the stored reason.
      if (ok()) Fail(random_access_status_);
      return false;
    }
    // Forward seek is emulated by `BufferedReader` reading ahead.
    return BufferedReader::SeekBehindBuffer(new_pos);
  }
  if (ABSL_PREDICT_FALSE(!ok())) return false;
  PythonLock lock;
  if (new_pos > limit_pos()) {
    // Seeking forwards.
    const std::optional size = SizeInternal();
    if (ABSL_PREDICT_FALSE(size == std::nullopt)) return false;
    if (ABSL_PREDICT_FALSE(new_pos > *size)) {
      // File ends.
      set_limit_pos(*size);
      return false;
    }
  }
  set_limit_pos(new_pos);
  const PythonPtr file_pos = PositionToPython(limit_pos());
  if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
    return FailOperation("PositionToPython()");
  }
  static constexpr Identifier id_seek("seek");
  const PythonPtr seek_result(PyObject_CallMethodObjArgs(
      src_.get(), id_seek.get(), file_pos.get(), nullptr));
  if (ABSL_PREDICT_FALSE(seek_result == nullptr)) {
    return FailOperation("seek()");
  }
  return true;
}
// Determines the stream size by seeking to the end (`seek(0, io.SEEK_END)`).
// Leaves the stream positioned at the end; callers are responsible for
// seeking back. Returns `std::nullopt` and fails the reader on error.
inline std::optional PythonReader::SizeInternal() {
  RIEGELI_ASSERT_OK(*this)
      << "Failed precondition of PythonReader::SizeInternal()";
  RIEGELI_ASSERT(PythonReader::SupportsRandomAccess())
      << "Failed precondition of PythonReader::SizeInternal(): "
         "random access not supported";
  PythonLock::AssertHeld();
  // Name of the Python call whose result is being interpreted, for error
  // messages.
  absl::string_view operation;
  const PythonPtr file_pos = PositionToPython(0);
  if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
    FailOperation("PositionToPython()");
    return std::nullopt;
  }
  const PythonPtr whence = IntToPython(2);  // `io.SEEK_END`
  if (ABSL_PREDICT_FALSE(whence == nullptr)) {
    FailOperation("IntToPython()");
    return std::nullopt;
  }
  static constexpr Identifier id_seek("seek");
  PythonPtr result(PyObject_CallMethodObjArgs(
      src_.get(), id_seek.get(), file_pos.get(), whence.get(), nullptr));
  if (result.get() == Py_None) {
    // Python2 `file.seek()` returns `None`, so `tell()` is needed to get the
    // new position. Python2 is dead, but some classes still behave like that.
    static constexpr Identifier id_tell("tell");
    result.reset(
        PyObject_CallMethodObjArgs(src_.get(), id_tell.get(), nullptr));
    operation = "tell()";
  } else {
    // `io.IOBase.seek()` returns the new position.
    operation = "seek()";
  }
  if (ABSL_PREDICT_FALSE(result == nullptr)) {
    FailOperation(operation);
    return std::nullopt;
  }
  const std::optional size = PositionFromPython(result.get());
  if (ABSL_PREDICT_FALSE(size == std::nullopt)) {
    FailOperation(absl::StrCat("PositionFromPython() after ", operation));
    return std::nullopt;
  }
  return *size;
}
// Returns the stream size, restoring the reading position afterwards
// (`SizeInternal()` leaves the stream at the end). Requires random access.
std::optional PythonReader::SizeImpl() {
  if (ABSL_PREDICT_FALSE(!PythonReader::SupportsRandomAccess())) {
    if (ok()) Fail(random_access_status_);
    return std::nullopt;
  }
  if (ABSL_PREDICT_FALSE(!ok())) return std::nullopt;
  PythonLock lock;
  const std::optional size = SizeInternal();
  if (ABSL_PREDICT_FALSE(size == std::nullopt)) return std::nullopt;
  // Seek back to the logical reading position.
  const PythonPtr file_pos = PositionToPython(limit_pos());
  if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
    FailOperation("PositionToPython()");
    return std::nullopt;
  }
  static constexpr Identifier id_seek("seek");
  const PythonPtr seek_result(PyObject_CallMethodObjArgs(
      src_.get(), id_seek.get(), file_pos.get(), nullptr));
  if (ABSL_PREDICT_FALSE(seek_result == nullptr)) {
    FailOperation("seek()");
    return std::nullopt;
  }
  return *size;
}
} // namespace riegeli::python
================================================
FILE: python/riegeli/bytes/python_reader.h
================================================
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PYTHON_RIEGELI_BYTES_PYTHON_READER_H_
#define PYTHON_RIEGELI_BYTES_PYTHON_READER_H_
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#include
// clang-format: do not reorder the above include.
#include
#include
#include
#include "absl/base/attributes.h"
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "python/riegeli/base/utils.h"
#include "riegeli/base/object.h"
#include "riegeli/base/types.h"
#include "riegeli/bytes/buffer_options.h"
#include "riegeli/bytes/buffered_reader.h"
namespace riegeli::python {
// A `Reader` which reads from a Python binary I/O stream.
//
// The stream must support:
// * `close()` - for `Close()` if `Options::owns_src()`
// * `readinto1(memoryview)` or
// `readinto(memoryview)` or
// `read1(int)` or
// `read(int)`
// * `seekable()`
// * `seek(int[, int])` - for `Seek()` or `Size()`
// * `tell()` - for `Seek()` or `Size()`
//
// `PythonReader` supports random access if
// `Options::assumed_pos() == std::nullopt` and the stream supports random
// access (this is checked by calling `seekable()`).
//
// Warning: if random access is not supported and the stream is not owned,
// it will have an unpredictable amount of extra data consumed because of
// buffering.
class PythonReader : public BufferedReader {
 public:
  class Options : public BufferOptionsBase {
   public:
    Options() noexcept {}

    // If `true`, `PythonReader::Close()` closes the stream.
    //
    // Default: `false`.
    Options& set_owns_src(bool owns_src) & ABSL_ATTRIBUTE_LIFETIME_BOUND {
      owns_src_ = owns_src;
      return *this;
    }
    Options&& set_owns_src(bool owns_src) && ABSL_ATTRIBUTE_LIFETIME_BOUND {
      return std::move(set_owns_src(owns_src));
    }
    bool owns_src() const { return owns_src_; }

    // If `std::nullopt`, the current position reported by `pos()` corresponds
    // to the current stream position if possible, otherwise 0 is assumed as the
    // initial position. Random access is supported if the stream supports
    // random access.
    //
    // If not `std::nullopt`, this position is assumed initially, to be reported
    // by `pos()`. It does not need to correspond to the current stream
    // position. Random access is not supported.
    //
    // Default: `std::nullopt`.
    Options& set_assumed_pos(std::optional assumed_pos) &
        ABSL_ATTRIBUTE_LIFETIME_BOUND {
      assumed_pos_ = assumed_pos;
      return *this;
    }
    Options&& set_assumed_pos(std::optional assumed_pos) &&
        ABSL_ATTRIBUTE_LIFETIME_BOUND {
      return std::move(set_assumed_pos(assumed_pos));
    }
    std::optional assumed_pos() const { return assumed_pos_; }

   private:
    bool owns_src_ = false;
    std::optional assumed_pos_;
  };

  // Creates a closed `PythonReader`.
  explicit PythonReader(Closed) noexcept : BufferedReader(kClosed) {}

  // Will read from `src`.
  explicit PythonReader(PyObject* src, Options options = Options());

  PythonReader(PythonReader&& that) noexcept;
  PythonReader& operator=(PythonReader&& that) noexcept;

  // Returns a borrowed reference to the stream being read from.
  PyObject* src() const ABSL_ATTRIBUTE_LIFETIME_BOUND { return src_.get(); }

  // The Python exception which caused the last failure, if any; can be
  // re-raised to Python callers.
  const Exception& exception() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
    return exception_;
  }

  // Reading ahead is harmless when the position can be restored with `seek()`.
  bool ToleratesReadingAhead() override {
    return BufferedReader::ToleratesReadingAhead() ||
           PythonReader::SupportsRandomAccess();
  }
  bool SupportsRandomAccess() override { return supports_random_access_; }

  // For implementing `tp_traverse` of objects containing `PythonReader`.
  int Traverse(visitproc visit, void* arg);

 protected:
  void Done() override;
  bool ReadInternal(size_t min_length, size_t max_length, char* dest) override;
  bool SeekBehindBuffer(Position new_pos) override;
  std::optional SizeImpl() override;

 private:
  ABSL_ATTRIBUTE_COLD bool FailOperation(absl::string_view operation);
  std::optional SizeInternal();

  // Owned reference to the Python stream (refcounting done under the GIL).
  PythonPtrLocking src_;
  bool owns_src_ = false;
  bool supports_random_access_ = false;
  // Why random access is unsupported; cleared in `Done()`.
  absl::Status random_access_status_;
  // Last Python exception which caused a failure.
  Exception exception_;
  // Cached bound read method of `src_`, chosen on the first read.
  PythonPtrLocking read_function_;
  // Name of `read_function_` for error messages; points to a string literal.
  absl::string_view read_function_name_;
  // `true` if the read method returns bytes instead of filling a buffer.
  bool use_bytes_ = false;
};
// Move constructor. `std::exchange` resets `supports_random_access_` in the
// moved-from object so that it behaves like a freshly closed reader.
inline PythonReader::PythonReader(PythonReader&& that) noexcept
    : BufferedReader(static_cast(that)),
      src_(std::move(that.src_)),
      owns_src_(that.owns_src_),
      supports_random_access_(
          std::exchange(that.supports_random_access_, false)),
      random_access_status_(std::move(that.random_access_status_)),
      exception_(std::move(that.exception_)),
      read_function_(std::move(that.read_function_)),
      read_function_name_(that.read_function_name_),
      use_bytes_(that.use_bytes_) {}
// Move assignment; mirrors the move constructor field by field.
inline PythonReader& PythonReader::operator=(PythonReader&& that) noexcept {
  BufferedReader::operator=(static_cast(that));
  src_ = std::move(that.src_);
  owns_src_ = that.owns_src_;
  supports_random_access_ = std::exchange(that.supports_random_access_, false);
  random_access_status_ = std::move(that.random_access_status_);
  exception_ = std::move(that.exception_);
  read_function_ = std::move(that.read_function_);
  read_function_name_ = that.read_function_name_;
  use_bytes_ = that.use_bytes_;
  return *this;
}
// Visits all Python objects held by this reader, for cyclic garbage
// collection (`tp_traverse`). `Py_VISIT` returns early on a nonzero result.
inline int PythonReader::Traverse(visitproc visit, void* arg) {
  Py_VISIT(src_.get());
  Py_VISIT(read_function_.get());
  return exception_.Traverse(visit, arg);
}
} // namespace riegeli::python
#endif // PYTHON_RIEGELI_BYTES_PYTHON_READER_H_
================================================
FILE: python/riegeli/bytes/python_writer.cc
================================================
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#define PY_SSIZE_T_CLEAN
#include
// clang-format: do not reorder the above include.
#include "python/riegeli/bytes/python_writer.h"
// clang-format: do not reorder the above include.
#include
#include
#include
#include "absl/base/attributes.h"
#include "absl/base/optimization.h"
#include "absl/numeric/bits.h"
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "python/riegeli/base/utils.h"
#include "riegeli/base/arithmetic.h"
#include "riegeli/base/assert.h"
#include "riegeli/base/global.h"
#include "riegeli/base/types.h"
#include "riegeli/bytes/buffered_writer.h"
namespace riegeli::python {
// Constructs a `PythonWriter` writing to `dest` (a Python binary I/O stream).
// Must be called with the GIL held. Determines whether random access is
// supported by calling `seekable()`, and initializes the starting position
// either from `options.assumed_pos()` or from `tell()`.
PythonWriter::PythonWriter(PyObject* dest, Options options)
    : BufferedWriter(options.buffer_options()),
      owns_dest_(options.owns_dest()) {
  PythonLock::AssertHeld();
  // Take a new strong reference to the stream.
  Py_INCREF(dest);
  dest_.reset(dest);
  if (options.assumed_pos() != std::nullopt) {
    set_start_pos(*options.assumed_pos());
    // `supports_random_access_` is left as `false`.
    random_access_status_ = Global([] {
      return absl::UnimplementedError(
          "PythonWriter::Options::assumed_pos() excludes random access");
    });
  } else {
    static constexpr Identifier id_seekable("seekable");
    const PythonPtr seekable_result(
        PyObject_CallMethodObjArgs(dest_.get(), id_seekable.get(), nullptr));
    if (ABSL_PREDICT_FALSE(seekable_result == nullptr)) {
      FailOperation("seekable()");
      return;
    }
    const int seekable_is_true = PyObject_IsTrue(seekable_result.get());
    if (ABSL_PREDICT_FALSE(seekable_is_true < 0)) {
      FailOperation("PyObject_IsTrue() after seekable()");
      return;
    }
    if (seekable_is_true == 0) {
      // Random access is not supported. Assume 0 as the initial position.
      // `supports_random_access_` is left as `false`.
      random_access_status_ = Global([] {
        return absl::UnimplementedError(
            "seekable() is False which excludes random access");
      });
      return;
    }
    // The stream is seekable: start from its current position.
    static constexpr Identifier id_tell("tell");
    const PythonPtr tell_result(
        PyObject_CallMethodObjArgs(dest_.get(), id_tell.get(), nullptr));
    if (ABSL_PREDICT_FALSE(tell_result == nullptr)) {
      FailOperation("tell()");
      return;
    }
    const std::optional file_pos =
        PositionFromPython(tell_result.get());
    if (ABSL_PREDICT_FALSE(file_pos == std::nullopt)) {
      FailOperation("PositionFromPython() after tell()");
      return;
    }
    set_start_pos(*file_pos);
    supports_random_access_ = true;
  }
  BeginRun();
}
// Finishes writing and, if this writer owns the stream, calls its `close()`
// method under the GIL. Any failure of `close()` fails the writer.
void PythonWriter::Done() {
  BufferedWriter::Done();
  // The stream can no longer be used, so the reason why random access is
  // unsupported is no longer interesting; free the status.
  random_access_status_ = absl::OkStatus();
  if (owns_dest_ && dest_ != nullptr) {
    PythonLock lock;
    static constexpr Identifier id_close("close");
    const PythonPtr close_result(
        PyObject_CallMethodObjArgs(dest_.get(), id_close.get(), nullptr));
    if (ABSL_PREDICT_FALSE(close_result == nullptr)) FailOperation("close()");
  }
}
// Turns the pending Python exception into a failure of this `PythonWriter`,
// recording `operation` in the status message. Must be called with the GIL
// held and a Python exception set. Always returns `false`, so callers can
// `return FailOperation(...)` directly.
inline bool PythonWriter::FailOperation(absl::string_view operation) {
  RIEGELI_ASSERT(is_open())
      << "Failed precondition of PythonWriter::FailOperation(): "
         "Object closed";
  PythonLock::AssertHeld();
  if (ABSL_PREDICT_FALSE(!ok())) {
    // Ignore this error because `PythonWriter` already failed.
    PyErr_Clear();
    return false;
  }
  // Capture the exception object so that it can later be re-raised to Python
  // callers via `exception()`, and summarize it in the writer's status.
  exception_ = Exception::Fetch();
  return Fail(absl::UnknownError(
      absl::StrCat(operation, " failed: ", exception_.message())));
}
// Writes all of `src` to the stream by calling its `write()` method, first
// with a `memoryview` to avoid copying, falling back to `bytes` if the stream
// rejects `memoryview` with `TypeError`. Returns `false` on failure.
bool PythonWriter::WriteInternal(absl::string_view src) {
  RIEGELI_ASSERT(!src.empty())
      << "Failed precondition of BufferedWriter::WriteInternal(): "
         "nothing to write";
  RIEGELI_ASSERT_OK(*this)
      << "Failed precondition of BufferedWriter::WriteInternal()";
  if (ABSL_PREDICT_FALSE(src.size() >
                         std::numeric_limits::max() - start_pos())) {
    return FailOverflow();
  }
  PythonLock lock;
  // Look up and cache the bound `write()` method on the first write.
  if (ABSL_PREDICT_FALSE(write_function_ == nullptr)) {
    static constexpr Identifier id_write("write");
    write_function_.reset(PyObject_GetAttr(dest_.get(), id_write.get()));
    if (ABSL_PREDICT_FALSE(write_function_ == nullptr)) {
      return FailOperation("write()");
    }
  }
  // `write()` may accept less than the whole request at once, so loop until
  // everything is written.
  do {
    // Clamp the chunk so its length fits in the Python size type.
    const size_t length_to_write = UnsignedMin(
        src.size(),
        absl::bit_floor(size_t{std::numeric_limits::max()}));
    size_t length_written;
    {
      PythonPtr write_result;
      if (!use_bytes_) {
        // Prefer passing a `memoryview` to avoid copying memory.
        MemoryView memory_view;
        PyObject* const memory_view_object = memory_view.ToPython(
            absl::string_view(src.data(), length_to_write));
        if (ABSL_PREDICT_FALSE(memory_view_object == nullptr)) {
          return FailOperation("MemoryView::ToPython()");
        }
        write_result.reset(PyObject_CallFunctionObjArgs(
            write_function_.get(), memory_view_object, nullptr));
        if (ABSL_PREDICT_FALSE(write_result == nullptr)) {
          // `TypeError` means the stream does not accept `memoryview`;
          // remember to use `bytes` from now on. Anything else is a failure.
          if (!PyErr_ExceptionMatches(PyExc_TypeError)) {
            return FailOperation("write()");
          }
          PyErr_Clear();
          use_bytes_ = true;
        }
        // Release the memoryview; a failure here means the stream kept a
        // reference to the buffer.
        if (ABSL_PREDICT_FALSE(!memory_view.Release())) {
          return FailOperation("MemoryView::Release()");
        }
      }
      if (use_bytes_) {
        // `write()` does not support `memoryview`. Use `bytes`.
        const PythonPtr bytes = BytesToPython(src.substr(0, length_to_write));
        if (ABSL_PREDICT_FALSE(bytes == nullptr)) {
          return FailOperation("BytesToPython()");
        }
        write_result.reset(PyObject_CallFunctionObjArgs(write_function_.get(),
                                                        bytes.get(), nullptr));
        if (ABSL_PREDICT_FALSE(write_result == nullptr)) {
          return FailOperation("write()");
        }
      }
      if (write_result.get() == Py_None) {
        // Python2 `file.write()` returns `None`, and would raise an exception
        // if less than the full length had been written. Python2 is dead, but
        // some classes still behave like that.
        length_written = length_to_write;
      } else {
        // `io.IOBase.write()` returns the length written.
        const std::optional length_written_opt =
            SizeFromPython(write_result.get());
        if (ABSL_PREDICT_FALSE(length_written_opt == std::nullopt)) {
          return FailOperation("SizeFromPython() after write()");
        }
        length_written = *length_written_opt;
      }
    }
    if (ABSL_PREDICT_FALSE(length_written > length_to_write)) {
      return Fail(absl::InternalError("write() wrote more than requested"));
    }
    move_start_pos(length_written);
    src.remove_prefix(length_written);
  } while (!src.empty());
  return true;
}
// Flushes buffered data and then calls the stream's `flush()` method.
// For `FlushType::kFromObject` the Python-level flush is skipped unless this
// writer owns the stream, since the stream owner may flush it themselves.
bool PythonWriter::FlushImpl(FlushType flush_type) {
  if (ABSL_PREDICT_FALSE(!BufferedWriter::FlushImpl(flush_type))) return false;
  switch (flush_type) {
    case FlushType::kFromObject:
      if (!owns_dest_) return true;
      ABSL_FALLTHROUGH_INTENDED;
    case FlushType::kFromProcess:
    case FlushType::kFromMachine:
      PythonLock lock;
      static constexpr Identifier id_flush("flush");
      const PythonPtr flush_result(
          PyObject_CallMethodObjArgs(dest_.get(), id_flush.get(), nullptr));
      if (ABSL_PREDICT_FALSE(flush_result == nullptr)) {
        return FailOperation("flush()");
      }
      return true;
  }
  RIEGELI_ASSUME_UNREACHABLE()
      << "Unknown flush type: " << static_cast(flush_type);
}
// Seeks to `new_pos`, which is outside the current buffer. Requires random
// access support; clamps to the file size when seeking past the end.
bool PythonWriter::SeekBehindBuffer(Position new_pos) {
  RIEGELI_ASSERT_NE(new_pos, pos())
      << "Failed precondition of BufferedWriter::SeekBehindBuffer(): "
         "position unchanged, use Seek() instead";
  RIEGELI_ASSERT_EQ(start_to_limit(), 0u)
      << "Failed precondition of BufferedWriter::SeekBehindBuffer(): "
         "buffer not empty";
  if (ABSL_PREDICT_FALSE(!PythonWriter::SupportsRandomAccess())) {
    if (ok()) Fail(random_access_status_);
    return false;
  }
  PythonLock lock;
  if (new_pos > start_pos()) {
    // Seeking forwards.
    const std::optional size = SizeInternal();
    if (ABSL_PREDICT_FALSE(size == std::nullopt)) return false;
    if (ABSL_PREDICT_FALSE(new_pos > *size)) {
      // File ends.
      set_start_pos(*size);
      return false;
    }
  }
  set_start_pos(new_pos);
  const PythonPtr file_pos = PositionToPython(start_pos());
  if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
    return FailOperation("PositionToPython()");
  }
  static constexpr Identifier id_seek("seek");
  const PythonPtr seek_result(PyObject_CallMethodObjArgs(
      dest_.get(), id_seek.get(), file_pos.get(), nullptr));
  if (ABSL_PREDICT_FALSE(seek_result == nullptr)) {
    return FailOperation("seek()");
  }
  return true;
}
// Determines the stream size by seeking to the end (`seek(0, io.SEEK_END)`).
// Leaves the stream positioned at the end; callers are responsible for
// seeking back. Returns `std::nullopt` and fails the writer on error.
inline std::optional PythonWriter::SizeInternal() {
  RIEGELI_ASSERT_OK(*this)
      << "Failed precondition of PythonWriter::SizeInternal()";
  RIEGELI_ASSERT(PythonWriter::SupportsRandomAccess())
      << "Failed precondition of PythonWriter::SizeInternal(): "
         "random access not supported";
  RIEGELI_ASSERT_EQ(start_to_limit(), 0u)
      << "Failed precondition of PythonWriter::SizeInternal(): "
         "buffer not empty";
  PythonLock::AssertHeld();
  // Name of the Python call whose result is being interpreted, for error
  // messages.
  absl::string_view operation;
  const PythonPtr file_pos = PositionToPython(0);
  if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
    FailOperation("PositionToPython()");
    return std::nullopt;
  }
  const PythonPtr whence = IntToPython(2);  // `io.SEEK_END`
  if (ABSL_PREDICT_FALSE(whence == nullptr)) {
    FailOperation("IntToPython()");
    return std::nullopt;
  }
  static constexpr Identifier id_seek("seek");
  PythonPtr result(PyObject_CallMethodObjArgs(
      dest_.get(), id_seek.get(), file_pos.get(), whence.get(), nullptr));
  if (result.get() == Py_None) {
    // Python2 `file.seek()` returns `None`. Python2 is dead, but some classes
    // still behave like that.
    static constexpr Identifier id_tell("tell");
    result.reset(
        PyObject_CallMethodObjArgs(dest_.get(), id_tell.get(), nullptr));
    operation = "tell()";
  } else {
    // `io.IOBase.seek()` returns the new position.
    operation = "seek()";
  }
  if (ABSL_PREDICT_FALSE(result == nullptr)) {
    FailOperation(operation);
    return std::nullopt;
  }
  const std::optional size = PositionFromPython(result.get());
  if (ABSL_PREDICT_FALSE(size == std::nullopt)) {
    FailOperation(absl::StrCat("PositionFromPython() after ", operation));
    return std::nullopt;
  }
  return *size;
}
// Returns the stream size, restoring the writing position afterwards
// (`SizeInternal()` leaves the stream at the end). Requires random access.
std::optional PythonWriter::SizeBehindBuffer() {
  RIEGELI_ASSERT_EQ(start_to_limit(), 0u)
      << "Failed precondition of BufferedWriter::SizeBehindBuffer(): "
         "buffer not empty";
  if (ABSL_PREDICT_FALSE(!PythonWriter::SupportsRandomAccess())) {
    if (ok()) Fail(random_access_status_);
    return std::nullopt;
  }
  if (ABSL_PREDICT_FALSE(!ok())) return std::nullopt;
  PythonLock lock;
  const std::optional size = SizeInternal();
  if (ABSL_PREDICT_FALSE(size == std::nullopt)) return std::nullopt;
  // Seek back to the logical writing position.
  const PythonPtr file_pos = PositionToPython(start_pos());
  if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
    FailOperation("PositionToPython()");
    return std::nullopt;
  }
  static constexpr Identifier id_seek("seek");
  const PythonPtr seek_result(PyObject_CallMethodObjArgs(
      dest_.get(), id_seek.get(), file_pos.get(), nullptr));
  if (ABSL_PREDICT_FALSE(seek_result == nullptr)) {
    FailOperation("seek()");
    return std::nullopt;
  }
  return *size;
}
// Truncates the stream to `new_size` by seeking there and calling
// `truncate()` (with no argument, so it truncates at the current position).
// Returns `false` if `new_size` is beyond the end of the stream or on error.
bool PythonWriter::TruncateBehindBuffer(Position new_size) {
  RIEGELI_ASSERT_EQ(start_to_limit(), 0u)
      << "Failed precondition of BufferedWriter::TruncateBehindBuffer(): "
         "buffer not empty";
  if (ABSL_PREDICT_FALSE(!PythonWriter::SupportsRandomAccess())) {
    if (ok()) Fail(random_access_status_);
    return false;
  }
  if (ABSL_PREDICT_FALSE(!ok())) return false;
  PythonLock lock;
  const std::optional size = SizeInternal();
  if (ABSL_PREDICT_FALSE(size == std::nullopt)) return false;
  if (ABSL_PREDICT_FALSE(new_size > *size)) {
    // File ends.
    set_start_pos(*size);
    return false;
  }
  {
    // Position the stream at the truncation point; argument-less `truncate()`
    // cuts at the current position.
    const PythonPtr file_pos = PositionToPython(new_size);
    if (ABSL_PREDICT_FALSE(file_pos == nullptr)) {
      return FailOperation("PositionToPython()");
    }
    static constexpr Identifier id_seek("seek");
    const PythonPtr seek_result(PyObject_CallMethodObjArgs(
        dest_.get(), id_seek.get(), file_pos.get(), nullptr));
    if (ABSL_PREDICT_FALSE(seek_result == nullptr)) {
      return FailOperation("seek()");
    }
  }
  set_start_pos(new_size);
  static constexpr Identifier id_truncate("truncate");
  const PythonPtr truncate_result(
      PyObject_CallMethodObjArgs(dest_.get(), id_truncate.get(), nullptr));
  if (ABSL_PREDICT_FALSE(truncate_result == nullptr)) {
    return FailOperation("truncate()");
  }
  return true;
}
} // namespace riegeli::python
================================================
FILE: python/riegeli/bytes/python_writer.h
================================================
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PYTHON_RIEGELI_BYTES_PYTHON_WRITER_H_
#define PYTHON_RIEGELI_BYTES_PYTHON_WRITER_H_
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#include
// clang-format: do not reorder the above include.
#include
#include
#include "absl/base/attributes.h"
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "python/riegeli/base/utils.h"
#include "riegeli/base/object.h"
#include "riegeli/base/types.h"
#include "riegeli/bytes/buffer_options.h"
#include "riegeli/bytes/buffered_writer.h"
namespace riegeli::python {
// A `Writer` which writes to a Python binary I/O stream.
//
// The stream must support:
// * `close()` - for `Close()` if `Options::owns_dest()`
// * `write(bytes)`
// * `flush()` - for `Flush()`
// * `seekable()`
// * `seek(int[, int])` - for `Seek()`, `Size()`, or `Truncate()`
// * `tell()` - for `Seek()`, `Size()`, or `Truncate()`
// * `truncate()` - for `Truncate()`
//
// `PythonWriter` supports random access if
// `Options::assumed_pos() == std::nullopt` and the stream supports random
// access (this is checked by calling `seekable()`).
class PythonWriter : public BufferedWriter {
 public:
  class Options : public BufferOptionsBase {
   public:
    Options() noexcept {}

    // If `true`, `PythonWriter::Close()` closes the stream, and
    // `PythonWriter::Flush(flush_type)` flushes the stream even if `flush_type`
    // is `FlushType::kFromObject`.
    //
    // Default: `false`.
    Options& set_owns_dest(bool owns_dest) & ABSL_ATTRIBUTE_LIFETIME_BOUND {
      owns_dest_ = owns_dest;
      return *this;
    }
    Options&& set_owns_dest(bool owns_dest) && ABSL_ATTRIBUTE_LIFETIME_BOUND {
      return std::move(set_owns_dest(owns_dest));
    }
    bool owns_dest() const { return owns_dest_; }

    // If `std::nullopt`, the current position reported by `pos()` corresponds
    // to the current stream position if possible, otherwise 0 is assumed as the
    // initial position. Random access is supported if the stream supports
    // random access.
    //
    // If not `std::nullopt`, this position is assumed initially, to be reported
    // by `pos()`. It does not need to correspond to the current stream
    // position. Random access is not supported.
    //
    // Default: `std::nullopt`.
    Options& set_assumed_pos(std::optional assumed_pos) &
        ABSL_ATTRIBUTE_LIFETIME_BOUND {
      assumed_pos_ = assumed_pos;
      return *this;
    }
    Options&& set_assumed_pos(std::optional assumed_pos) &&
        ABSL_ATTRIBUTE_LIFETIME_BOUND {
      return std::move(set_assumed_pos(assumed_pos));
    }
    std::optional assumed_pos() const { return assumed_pos_; }

   private:
    bool owns_dest_ = false;
    std::optional assumed_pos_;
  };

  // Creates a closed `PythonWriter`.
  explicit PythonWriter(Closed) noexcept : BufferedWriter(kClosed) {}

  // Will write to `dest`.
  explicit PythonWriter(PyObject* dest, Options options = Options());

  PythonWriter(PythonWriter&& that) noexcept;
  PythonWriter& operator=(PythonWriter&& that) noexcept;

  // Returns a borrowed reference to the stream being written to.
  PyObject* dest() const ABSL_ATTRIBUTE_LIFETIME_BOUND { return dest_.get(); }

  // The Python exception which caused the last failure, if any; can be
  // re-raised to Python callers.
  const Exception& exception() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
    return exception_;
  }
  bool SupportsRandomAccess() override { return supports_random_access_; }

  // For implementing `tp_traverse` of objects containing `PythonWriter`.
  int Traverse(visitproc visit, void* arg);

 protected:
  void Done() override;
  bool WriteInternal(absl::string_view src) override;
  bool FlushImpl(FlushType flush_type) override;
  bool SeekBehindBuffer(Position new_pos) override;
  std::optional SizeBehindBuffer() override;
  bool TruncateBehindBuffer(Position new_size) override;

 private:
  ABSL_ATTRIBUTE_COLD bool FailOperation(absl::string_view operation);
  std::optional SizeInternal();

  // Owned reference to the Python stream (refcounting done under the GIL).
  PythonPtrLocking dest_;
  bool owns_dest_ = false;
  bool supports_random_access_ = false;
  // Why random access is unsupported; cleared in `Done()`.
  absl::Status random_access_status_;
  // Last Python exception which caused a failure.
  Exception exception_;
  // Cached bound `write()` method of `dest_`, looked up on the first write.
  PythonPtrLocking write_function_;
  // `true` once the stream rejected `memoryview`, forcing `bytes` arguments.
  bool use_bytes_ = false;
};
// Move constructor. `std::exchange` resets `supports_random_access_` in the
// moved-from object so that it behaves like a freshly closed writer.
inline PythonWriter::PythonWriter(PythonWriter&& that) noexcept
    : BufferedWriter(static_cast(that)),
      dest_(std::move(that.dest_)),
      owns_dest_(that.owns_dest_),
      supports_random_access_(
          std::exchange(that.supports_random_access_, false)),
      random_access_status_(std::move(that.random_access_status_)),
      exception_(std::move(that.exception_)),
      write_function_(std::move(that.write_function_)),
      use_bytes_(that.use_bytes_) {}
// Move assignment; mirrors the move constructor field by field.
inline PythonWriter& PythonWriter::operator=(PythonWriter&& that) noexcept {
  BufferedWriter::operator=(static_cast(that));
  dest_ = std::move(that.dest_);
  owns_dest_ = that.owns_dest_;
  supports_random_access_ = std::exchange(that.supports_random_access_, false);
  random_access_status_ = std::move(that.random_access_status_);
  exception_ = std::move(that.exception_);
  write_function_ = std::move(that.write_function_);
  use_bytes_ = that.use_bytes_;
  return *this;
}
}
// Visits all Python objects held by this writer, for cyclic garbage
// collection (`tp_traverse`). `Py_VISIT` returns early on a nonzero result.
inline int PythonWriter::Traverse(visitproc visit, void* arg) {
  Py_VISIT(dest_.get());
  Py_VISIT(write_function_.get());
  return exception_.Traverse(visit, arg);
}
} // namespace riegeli::python
#endif // PYTHON_RIEGELI_BYTES_PYTHON_WRITER_H_
================================================
FILE: python/riegeli/py_extension.bzl
================================================
"""Supports writing Python modules in C++."""
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
load("@rules_python//python:defs.bzl", "py_library")
def py_extension(
        name = None,
        srcs = None,
        hdrs = None,
        data = None,
        features = None,
        visibility = None,
        deps = None):
    """Creates a Python module implemented in C++.

    Python modules can depend on a py_extension. Other py_extensions can
    depend on a generated C++ library named with a "_cc" suffix.

    Args:
      name: Name for this target.
      srcs: C++ source files.
      hdrs: C++ header files, for other py_extensions which depend on this.
      data: Files needed at runtime. This may include Python libraries.
      features: Passed to cc_library.
      visibility: Controls which rules can depend on this.
      deps: Other C++ libraries that this library depends upon.
    """
    library_target = name + "_cc"
    shared_object = name + ".so"

    # C++ library that other py_extensions may depend upon directly.
    cc_library(
        name = library_target,
        srcs = srcs,
        hdrs = hdrs,
        data = data,
        features = features,
        visibility = visibility,
        deps = deps,
        alwayslink = True,
    )

    # Shared object loadable by the Python interpreter.
    cc_binary(
        name = shared_object,
        linkshared = True,
        linkstatic = True,
        visibility = ["//visibility:private"],
        deps = [library_target],
    )

    # Python-facing target carrying the shared object as a data dependency.
    py_library(
        name = name,
        data = [shared_object],
        visibility = visibility,
    )
================================================
FILE: python/riegeli/python_configure.bzl
================================================
"""Repository rule for Python autoconfiguration.
`python_configure` depends on the following environment variables:
* `PYTHON_BIN_PATH`: Location of the python binary.
* `PYTHON_LIB_PATH`: Location of the python libraries.
"""
# Names of the environment variables consulted by the repository rule below.
_BAZEL_SH = "BAZEL_SH"  # Path to bash, used to run probe commands.
_PYTHON_BIN_PATH = "PYTHON_BIN_PATH"  # Location of the python binary.
_PYTHON_LIB_PATH = "PYTHON_LIB_PATH"  # Location of python libraries.
_TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO"  # Remotely configured repo.
def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
    """Instantiates a template from //python/riegeli into this repository.

    Args:
      repository_ctx: The repository_ctx object.
      tpl: Base name of the "<tpl>.tpl" template file.
      substitutions: Placeholder-to-value mapping applied to the template.
      out: Output path; defaults to the template base name.
    """
    destination = out or tpl
    template_label = Label("//python/riegeli:{}.tpl".format(tpl))
    repository_ctx.template(destination, template_label, substitutions)
def _fail(msg):
    """Aborts the build with a red-highlighted Python configuration error."""
    fail("{}Python Configuration Error:{} {}\n".format(
        "\033[0;31m",  # ANSI red.
        "\033[0m",  # ANSI reset.
        msg,
    ))
def _is_windows(repository_ctx):
    """Returns True if the host operating system is Windows."""
    host_os = repository_ctx.os.name.lower()
    return host_os.find("windows") != -1
def _execute(
        repository_ctx,
        cmdline,
        error_msg = None,
        error_details = None,
        empty_stdout_fine = False):
    """Executes an arbitrary shell command, aborting configuration on error.

    Args:
      repository_ctx: the repository_ctx object
      cmdline: list of strings, the command to execute
      error_msg: string, a summary of the error if the command fails
      error_details: string, details about the error or steps to fix it
      empty_stdout_fine: bool, if True, an empty stdout result is fine,
        otherwise it's an error

    Returns:
      the result of repository_ctx.execute(cmdline)
    """
    result = repository_ctx.execute(cmdline)
    succeeded = not result.stderr and (empty_stdout_fine or result.stdout)
    if not succeeded:
        summary = error_msg.strip() if error_msg else "Repository command failed"
        _fail("\n".join([
            summary,
            result.stderr.strip(),
            error_details or "",
        ]))
    return result
def _read_dir(repository_ctx, src_dir):
    """Returns the full paths of all files under src_dir, one per line.

    Traverses subdirectories and follows symlinks.  On Windows the listing
    comes from "dir /b /s /a-d" and backslashes are normalized to forward
    slashes so the result can be used in genrule outs.
    """
    if not _is_windows(repository_ctx):
        listing = _execute(
            repository_ctx,
            ["find", src_dir, "-follow", "-type", "f"],
            empty_stdout_fine = True,
        )
        return listing.stdout

    windows_dir = src_dir.replace("/", "\\")
    listing = _execute(
        repository_ctx,
        ["cmd.exe", "/c", "dir", windows_dir, "/b", "/s", "/a-d"],
        empty_stdout_fine = True,
    )

    # Paths are later used in genrule.outs, which require forward slashes.
    return listing.stdout.replace("\\", "/")
def _genrule(src_dir, genrule_name, command, outs):
    """Returns the text of a genrule running `command` and producing `outs`.

    `src_dir` is accepted for symmetry with the caller but is not used.
    """
    pieces = [
        "genrule(",
        ' name = "{}",'.format(genrule_name),
        " outs = [",
        outs,
        " ],",
        ' cmd = """',
        command,
        ' """,',
        ")",
    ]
    return "\n".join(pieces) + "\n"
def _norm_path(path):
    """Normalizes separators to '/' and strips any trailing slashes."""
    forward = path.replace("\\", "/")
    return forward.rstrip("/")
def _symlink_genrule_for_dir(
        repository_ctx,
        src_dir,
        dest_dir,
        genrule_name,
        src_files = [],
        dest_files = []):
    """Returns a genrule to symlink (or copy if on Windows) a set of files.

    If src_dir is passed, files will be read from the given directory; otherwise
    we assume files are in src_files and dest_files
    """
    if src_dir != None:
        src_dir = _norm_path(src_dir)
        dest_dir = _norm_path(dest_dir)
        files = "\n".join(
            sorted(_read_dir(repository_ctx, src_dir).splitlines()),
        )

        # Create a list with the src_dir stripped to use for outputs.
        # NOTE(review): str.replace removes *every* occurrence of src_dir,
        # not only the leading one -- presumably paths never repeat src_dir;
        # confirm.
        dest_files = files.replace(src_dir, "").splitlines()
        src_files = files.splitlines()
    command = []
    outs = []
    for i in range(len(dest_files)):
        if dest_files[i] != "":
            # If we have only one file to link we do not want to use the
            # dest_dir, as $(@D) will include the full path to the file.
            dest = "$(@D)/{}{}".format(
                dest_dir if len(dest_files) != 1 else "",
                dest_files[i],
            )

            # Copy the headers to create a sandboxable setup.
            cmd = "cp -f"
            command.append('{} "{}" "{}"'.format(cmd, src_files[i], dest))
            outs.append(' "{}{}",'.format(dest_dir, dest_files[i]))
    genrule = _genrule(
        src_dir,
        genrule_name,
        " && ".join(command),
        "\n".join(outs),
    )
    return genrule
def _get_python_bin(repository_ctx):
    """Gets the python bin path.

    Resolution order: the PYTHON_BIN_PATH environment variable, then
    "python" found on PATH.  Aborts configuration otherwise.
    """
    from_env = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
    if from_env != None:
        return from_env
    from_path = repository_ctx.which("python")
    if from_path != None:
        return str(from_path)
    _fail(("Cannot find python in PATH, please make sure " +
           "python is installed and add its directory in PATH, " +
           "or --define {}='/something/else'.\nPATH={}").format(
        _PYTHON_BIN_PATH,
        repository_ctx.os.environ.get("PATH", ""),
    ))
def _get_bash_bin(repository_ctx):
    """Gets the bash bin path.

    Resolution order: the BAZEL_SH environment variable, then "bash" found
    on PATH.  Aborts configuration otherwise.
    """
    from_env = repository_ctx.os.environ.get(_BAZEL_SH)
    if from_env != None:
        return from_env
    from_path = repository_ctx.which("bash")
    if from_path != None:
        return str(from_path)
    _fail(("Cannot find bash in PATH, please make sure " +
           "bash is installed and add its directory in PATH, " +
           "or --define {}='/path/to/bash'.\nPATH={}").format(
        _BAZEL_SH,
        repository_ctx.os.environ.get("PATH", ""),
    ))
def _get_python_runtime_pair(repository_ctx, python_bin):
    """Builds a py_runtime_pair definition for the given interpreter.

    `repository_ctx` is accepted for symmetry with the other helpers but is
    not referenced.
    """
    lines = [
        "py_runtime_pair(",
        ' name = "py_runtime_pair",',
        " py2_runtime = None,",
        ' py3_runtime = ":py3_runtime",',
        ")",
        "",
        "py_runtime(",
        ' name = "py3_runtime",',
        ' interpreter_path = "{}",'.format(python_bin),
        ' python_version = "PY3",',
        ")",
    ]
    return "\n".join(lines) + "\n"
def _get_python_lib(repository_ctx, python_bin):
    """Gets the python lib path.

    Prefers the PYTHON_LIB_PATH environment variable; otherwise runs the
    interpreter to discover a library path.
    """
    python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH)
    if python_lib != None:
        return python_lib

    # NOTE(review): the embedded script below appears truncated by
    # extraction (it starts mid-expression); presumably it was a heredoc
    # ending at "END" that printed the first candidate library path --
    # confirm against upstream before relying on this.
    print_lib = ("<= 1:\n" +
                 " print(paths[0])\n" +
                 "END")
    cmd = "{} - {}".format(python_bin, print_lib)
    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
    return result.stdout.strip("\n")
def _check_python_lib(repository_ctx, python_lib):
    """Aborts configuration unless python_lib is a traversable directory."""
    probe = 'test -d "{}" -a -x "{}"'.format(python_lib, python_lib)
    outcome = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", probe])
    if outcome.return_code == 1:
        _fail("Invalid python library path: {}".format(python_lib))
def _check_python_bin(repository_ctx, python_bin):
    """Aborts configuration unless python_bin is an executable file."""
    probe = '[[ -x "{}" ]] && [[ ! -d "{}" ]]'.format(python_bin, python_bin)
    outcome = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", probe])
    if outcome.return_code == 1:
        _fail(("--define {}='{}' is not executable. " +
               "Is it the python binary?").format(
            _PYTHON_BIN_PATH,
            python_bin,
        ))
def _get_python_include(repository_ctx, python_bin):
    """Gets the python include path by querying the interpreter.

    Uses distutils.sysconfig when available, falling back to sysconfig.
    """
    query = (
        "import importlib; " +
        "import importlib.util; " +
        "print(importlib.import_module('distutils.sysconfig').get_python_inc() " +
        "if importlib.util.find_spec('distutils.sysconfig') " +
        "else importlib.import_module('sysconfig').get_path('include'))"
    )
    result = _execute(
        repository_ctx,
        [python_bin, "-c", query],
        error_msg = "Problem getting python include path.",
        error_details = ("Is the Python binary path set up right? " +
                         "(See ./configure or {}.) " +
                         "Is distutils installed?").format(_PYTHON_BIN_PATH),
    )
    return result.stdout.splitlines()[0]
def _get_python_import_lib_name(repository_ctx, python_bin):
    """Gets Python import library name (pythonXY.lib) on Windows."""
    query = (
        "import sys; " +
        'print("python{}{}.lib".format(' +
        "sys.version_info.major, sys.version_info.minor))"
    )
    output = _execute(
        repository_ctx,
        [python_bin, "-c", query],
        error_msg = "Problem getting python import library.",
        error_details = ("Is the Python binary path set up right? " +
                         "(See ./configure or {}.) ").format(_PYTHON_BIN_PATH),
    )
    return output.stdout.splitlines()[0]
def _get_numpy_include(repository_ctx, python_bin):
    """Gets the numpy include path by querying the interpreter."""
    result = _execute(
        repository_ctx,
        [python_bin, "-c", "import numpy; print(numpy.get_include())"],
        error_msg = "Problem getting numpy include path.",
        error_details = "Is numpy installed?",
    )
    return result.stdout.splitlines()[0]
def _create_local_python_repository(repository_ctx):
    """Creates the repository containing files set up to build with Python.

    Locates and validates the interpreter, its headers, and numpy's headers,
    mirrors the headers into this repository via genrules, and instantiates
    the BUILD file template with the generated snippets.
    """
    python_bin = _get_python_bin(repository_ctx)
    _check_python_bin(repository_ctx, python_bin)
    python_runtime_pair = _get_python_runtime_pair(repository_ctx, python_bin)
    python_lib = _get_python_lib(repository_ctx, python_bin)
    _check_python_lib(repository_ctx, python_lib)
    python_include = _get_python_include(repository_ctx, python_bin)
    numpy_include = _get_numpy_include(repository_ctx, python_bin) + "/numpy"

    # Genrule mirroring the Python headers into the repository.
    python_include_rule = _symlink_genrule_for_dir(
        repository_ctx,
        python_include,
        "python_include",
        "python_include",
    )
    python_import_lib_genrule = ""

    # To build Python C/C++ extension on Windows, we need to link to python
    # import library pythonXY.lib
    # See https://docs.python.org/3/extending/windows.html
    if _is_windows(repository_ctx):
        python_include = _norm_path(python_include)
        python_import_lib_name = _get_python_import_lib_name(
            repository_ctx,
            python_bin,
        )
        # The import library lives in <prefix>/libs, a sibling of the
        # include directory.
        python_import_lib_src = "{}/libs/{}".format(
            python_include.rsplit("/", 1)[0],
            python_import_lib_name,
        )
        python_import_lib_genrule = _symlink_genrule_for_dir(
            repository_ctx,
            None,
            "",
            "python_import_lib",
            [python_import_lib_src],
            [python_import_lib_name],
        )

    # Genrule mirroring numpy's headers into the repository.
    numpy_include_rule = _symlink_genrule_for_dir(
        repository_ctx,
        numpy_include,
        "numpy_include/numpy",
        "numpy_include",
    )

    # Instantiate the BUILD file template with the generated snippets.
    _tpl(repository_ctx, "BUILD", {
        "%{PYTHON_RUNTIME_PAIR}": python_runtime_pair,
        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
        "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
    })
def _create_remote_python_repository(repository_ctx, remote_config_repo):
    """Creates pointers to a remotely configured repo set up to build with Python.
    """
    remote_build = Label(remote_config_repo + ":BUILD")
    repository_ctx.template("BUILD", remote_build, {})
def _python_autoconf_impl(repository_ctx):
    """Implementation of the python_autoconf repository rule.

    Delegates to the remote configuration when TF_PYTHON_CONFIG_REPO is set,
    and probes the local Python installation otherwise.
    """
    remote_repo = repository_ctx.os.environ.get(_TF_PYTHON_CONFIG_REPO)
    if remote_repo == None:
        _create_local_python_repository(repository_ctx)
    else:
        _create_remote_python_repository(repository_ctx, remote_repo)
# Repository rule that materializes a Python build configuration.
# Re-runs when any of the listed environment variables changes.
# Typical usage in WORKSPACE: python_configure(name = "local_config_python")
python_configure = repository_rule(
    implementation = _python_autoconf_impl,
    environ = [
        _BAZEL_SH,
        _PYTHON_BIN_PATH,
        _PYTHON_LIB_PATH,
        _TF_PYTHON_CONFIG_REPO,
    ],
)
"""Detects and configures the local Python.
Add the following to your WORKSPACE file:
```python
python_configure(name = "local_config_python")
```
Args:
name: A unique name for this workspace rule.
"""
================================================
FILE: python/riegeli/records/BUILD
================================================
load("@com_google_protobuf//bazel:proto_library.bzl", "proto_library")
load("@com_google_protobuf//bazel:py_proto_library.bzl", "py_proto_library")
load("@rules_python//python:defs.bzl", "py_library")
load("//python/riegeli:py_extension.bzl", "py_extension")

package(
    default_visibility = ["//python/riegeli:__subpackages__"],
    features = ["header_modules"],
)

licenses(["notice"])

# C++ extension module exposing Riegeli record reading to Python.
py_extension(
    name = "record_reader",
    srcs = ["record_reader.cc"],
    # Python modules imported from C++.
    data = [
        ":records_metadata_py_pb2",
        ":skipped_region",
        "@com_google_protobuf//:protobuf_python",
    ],
    # record_reader.cc has #define before #include to influence what the
    # included files provide.
    features = ["-use_header_modules"],
    deps = [
        ":record_position_cc",
        "//python/riegeli/base:utils",
        "//python/riegeli/bytes:python_reader",
        "//riegeli/base:arithmetic",
        "//riegeli/base:assert",
        "//riegeli/base:chain",
        "//riegeli/base:compare",
        "//riegeli/base:types",
        "//riegeli/chunk_encoding:field_projection",
        "//riegeli/records:record_position",
        "//riegeli/records:record_reader",
        "//riegeli/records:skipped_region",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings:string_view",
        "@rules_python//python/cc:current_py_cc_headers",
    ],
)

# C++ extension module exposing Riegeli record writing to Python.
py_extension(
    name = "record_writer",
    srcs = ["record_writer.cc"],
    # record_writer.cc has #define before #include to influence what the
    # included files provide.
    features = ["-use_header_modules"],
    deps = [
        ":record_position_cc",
        "//python/riegeli/base:utils",
        "//python/riegeli/bytes:python_writer",
        "//riegeli/base:assert",
        "//riegeli/base:chain",
        "//riegeli/base:types",
        "//riegeli/records:record_writer",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings:string_view",
        "@rules_python//python/cc:current_py_cc_headers",
    ],
)

# C++ extension module defining the RecordPosition Python type; its "_cc"
# library is a dependency of the reader and writer extensions above.
py_extension(
    name = "record_position",
    srcs = ["record_position.cc"],
    hdrs = ["record_position.h"],
    # record_position.cc has #define before #include to influence what the
    # included files provide.
    features = ["-use_header_modules"],
    deps = [
        "//python/riegeli/base:utils",
        "//riegeli/base:arithmetic",
        "//riegeli/base:types",
        "//riegeli/records:record_position",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/hash",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:string_view",
        "@rules_python//python/cc:current_py_cc_headers",
    ],
)

py_library(
    name = "skipped_region",
    srcs = ["skipped_region.py"],
)

# Metadata schema for Riegeli/records files, plus its Python bindings.
proto_library(
    name = "records_metadata_proto",
    srcs = ["records_metadata.proto"],
    deps = ["@com_google_protobuf//:descriptor_proto"],
)

py_proto_library(
    name = "records_metadata_py_pb2",
    deps = [":records_metadata_proto"],
)
================================================
FILE: python/riegeli/records/__init__.py
================================================
================================================
FILE: python/riegeli/records/examples/BUILD
================================================
load("@rules_python//python:defs.bzl", "py_binary")

package(features = ["header_modules"])

licenses(["notice"])

# Runnable example that writes and reads back a Riegeli/records file.
py_binary(
    name = "write_read_records",
    srcs = ["write_read_records.py"],
    deps = [
        "//python/riegeli",
        "//python/riegeli/records/tests:records_test_py_pb2",
    ],
)
================================================
FILE: python/riegeli/records/examples/__init__.py
================================================
================================================
FILE: python/riegeli/records/examples/write_read_records.py
================================================
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Simple example which writes and reads a Riegeli/records file."""
import io
import riegeli
from riegeli.records.tests import records_test_pb2
def sample_string(i, size):
  """Returns exactly `size` bytes built by repeating the pattern b"<i> "."""
  unit = f'{i} '.encode()
  repetitions = -(-size // len(unit))  # ceil(size / len(unit)), so len >= size.
  return (unit * repetitions)[:size]
def sample_message(i, size):
  """Returns a SimpleMessage with id `i` and a `size`-byte sample payload."""
  return records_test_pb2.SimpleMessage(id=i, payload=sample_string(i, size))
def write_records(filename):
  """Writes 100 sample SimpleMessages to `filename` as a transposed file.

  Record-type metadata is attached so readers can recover the schema.
  """
  print('Writing', filename)
  metadata = riegeli.RecordsMetadata()
  riegeli.set_record_type(metadata, records_test_pb2.SimpleMessage)
  dest = io.FileIO(filename, mode='wb')
  with riegeli.RecordWriter(
      dest, options='transpose', metadata=metadata
  ) as writer:
    messages = (sample_message(i, 100) for i in range(100))
    writer.write_messages(messages)
def read_records(filename):
  """Reads `filename` back, projecting only the `id` field, and prints ids."""
  print('Reading', filename)
  id_field_number = (
      records_test_pb2.SimpleMessage.DESCRIPTOR.fields_by_name['id'].number
  )
  source = io.FileIO(filename, mode='rb')
  with riegeli.RecordReader(
      source,
      field_projection=[[id_field_number]],
  ) as reader:
    ids = [
        str(record.id)
        for record in reader.read_messages(records_test_pb2.SimpleMessage)
    ]
    print(' '.join(ids))
def main():
  """Round-trips sample records through a file in /tmp."""
  path = '/tmp/riegeli_example'
  write_records(path)
  read_records(path)


if __name__ == '__main__':
  main()
================================================
FILE: python/riegeli/records/record_position.cc
================================================
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#define PY_SSIZE_T_CLEAN
#include
// clang-format: do not reorder the above include.
#include "python/riegeli/records/record_position.h"
// clang-format: do not reorder the above include.
#include
#include
#include
#include
#include
#include "absl/base/optimization.h"
#include "absl/hash/hash.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "python/riegeli/base/utils.h"
#include "riegeli/base/arithmetic.h"
#include "riegeli/base/types.h"
#include "riegeli/records/record_position.h"
namespace riegeli::python {
namespace {
// Instance layout of the Python `RecordPosition` type: the standard CPython
// object header followed by the wrapped C++ position.
struct PyRecordPositionObject {
  // clang-format off
  PyObject_HEAD
  static_assert(true, "");  // clang-format workaround.
  // clang-format on
  // NOTE(review): template argument appears stripped by extraction;
  // presumably `PythonWrapped<FutureRecordPosition>` -- confirm upstream.
  PythonWrapped record_position;
};
extern PyTypeObject PyRecordPosition_Type;
// `extern "C"` sets the C calling convention for compatibility with the Python
// API. `static` avoids making symbols public, as `extern "C"` trumps anonymous
// namespace.
extern "C" {
// `tp_dealloc`: resets the wrapped position inside `PythonUnlocked()`
// (which presumably releases the GIL while the C++ destructor runs), then
// frees the Python object via the type's `tp_free`.
static void RecordPositionDestructor(PyRecordPositionObject* self) {
  PythonUnlocked([&] { self->record_position.reset(); });
  Py_TYPE(self)->tp_free(self);
}
// `tp_new`: RecordPosition(chunk_begin, record_index).
//
// Parses the two integer arguments, rejects combinations whose sum would
// overflow the underlying position type (raising OverflowError), and
// constructs the wrapped position.
// NOTE(review): several template arguments below appear stripped by
// extraction (e.g. `std::optional<Position>`, `const_cast<char**>`,
// `std::numeric_limits<...>`, `IntCast<...>`) -- confirm upstream.
static PyRecordPositionObject* RecordPositionNew(PyTypeObject* cls,
                                                 PyObject* args,
                                                 PyObject* kwargs) {
  static constexpr const char* keywords[] = {"chunk_begin", "record_index",
                                             nullptr};
  PyObject* chunk_begin_arg;
  PyObject* record_index_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "OO:RecordPosition", const_cast(keywords),
          &chunk_begin_arg, &record_index_arg))) {
    return nullptr;
  }
  const std::optional chunk_begin =
      PositionFromPython(chunk_begin_arg);
  if (ABSL_PREDICT_FALSE(chunk_begin == std::nullopt)) return nullptr;
  const std::optional record_index =
      PositionFromPython(record_index_arg);
  if (ABSL_PREDICT_FALSE(record_index == std::nullopt)) return nullptr;
  // Reject values whose sum `chunk_begin + record_index` would overflow.
  if (ABSL_PREDICT_FALSE(*chunk_begin > std::numeric_limits::max()) ||
      ABSL_PREDICT_FALSE(*record_index >
                         std::numeric_limits::max() - *chunk_begin)) {
    PyErr_Format(PyExc_OverflowError, "RecordPosition overflow: %llu/%llu",
                 static_cast(*chunk_begin),
                 static_cast(*record_index));
    return nullptr;
  }
  std::unique_ptr self(
      reinterpret_cast(cls->tp_alloc(cls, 0)));
  if (ABSL_PREDICT_FALSE(self == nullptr)) return nullptr;
  self->record_position.emplace(RecordPosition(
      IntCast(*chunk_begin), IntCast(*record_index)));
  return self.release();
}
// Getter for the `chunk_begin` property: file position of the chunk
// containing the record.  Resolves the wrapped position in PythonUnlocked().
static PyObject* RecordPositionChunkBegin(PyRecordPositionObject* self,
                                          void* closure) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  return PositionToPython(pos.chunk_begin()).release();
}
// Getter for the `record_index` property: index of the record within its
// chunk.
static PyObject* RecordPositionRecordIndex(PyRecordPositionObject* self,
                                           void* closure) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  return PositionToPython(pos.record_index()).release();
}
// Getter for the `numeric` property: the position collapsed to a single
// integer (see the type docstring for its ordering guarantees).
static PyObject* RecordPositionNumeric(PyRecordPositionObject* self,
                                       void* closure) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  return PositionToPython(pos.numeric()).release();
}
// `tp_richcompare`: implements ==, !=, <, >, <=, >= by comparing the two
// wrapped positions.  Returns `Py_NotImplemented` when either operand is
// not a RecordPosition so Python can try the reflected operation.
static PyObject* RecordPositionCompare(PyObject* a, PyObject* b, int op) {
  if (ABSL_PREDICT_FALSE(!PyObject_TypeCheck(a, &PyRecordPosition_Type)) ||
      ABSL_PREDICT_FALSE(!PyObject_TypeCheck(b, &PyRecordPosition_Type))) {
    Py_INCREF(Py_NotImplemented);
    return Py_NotImplemented;
  }
  RecordPosition a_pos;
  RecordPosition b_pos;
  // Resolve both positions with PythonUnlocked() (get() may presumably
  // block on a pending write).  NOTE(review): cast targets appear stripped
  // by extraction; presumably `reinterpret_cast<PyRecordPositionObject*>`.
  PythonUnlocked([&] {
    a_pos =
        reinterpret_cast(a)->record_position->get();
    b_pos =
        reinterpret_cast(b)->record_position->get();
  });
  switch (op) {
    case Py_EQ:
      return PyBool_FromLong(a_pos == b_pos);
    case Py_NE:
      return PyBool_FromLong(a_pos != b_pos);
    case Py_LT:
      return PyBool_FromLong(a_pos < b_pos);
    case Py_GT:
      return PyBool_FromLong(a_pos > b_pos);
    case Py_LE:
      return PyBool_FromLong(a_pos <= b_pos);
    case Py_GE:
      return PyBool_FromLong(a_pos >= b_pos);
    default:
      Py_INCREF(Py_NotImplemented);
      return Py_NotImplemented;
  }
}
// `tp_hash`: hashes the wrapped position with `absl::Hash`.  CPython
// reserves -1 as the tp_hash error indicator, so that value is remapped
// to -2.
static Py_hash_t RecordPositionHash(PyRecordPositionObject* self) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  Py_hash_t hash = static_cast(absl::Hash()(pos));
  if (ABSL_PREDICT_FALSE(hash == -1)) hash = -2;
  return hash;
}
// `tp_str`: the text representation of the position (whatever
// `RecordPosition::ToString()` produces; parsed back by `from_str`).
static PyObject* RecordPositionStr(PyRecordPositionObject* self) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  return StringToPython(pos.ToString()).release();
}
// `from_str` classmethod: parses a RecordPosition from its text format and
// raises ValueError on parse failure.
// NOTE(review): `const_cast(keywords)` appears stripped by extraction;
// presumably `const_cast<char**>(keywords)` -- confirm upstream.
static PyRecordPositionObject* RecordPositionFromStr(PyTypeObject* cls,
                                                     PyObject* args,
                                                     PyObject* kwargs) {
  static constexpr const char* keywords[] = {"serialized", nullptr};
  PyObject* serialized_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:from_str", const_cast(keywords),
          &serialized_arg))) {
    return nullptr;
  }
  StrOrBytes serialized;
  if (ABSL_PREDICT_FALSE(!serialized.FromPython(serialized_arg))) {
    return nullptr;
  }
  RecordPosition pos;
  if (ABSL_PREDICT_FALSE(!pos.FromString(serialized))) {
    PyErr_SetString(PyExc_ValueError, "RecordPosition.from_str() failed");
    return nullptr;
  }
  // Allocate through `cls` so subclasses get instances of their own type.
  std::unique_ptr self(
      reinterpret_cast(cls->tp_alloc(cls, 0)));
  if (ABSL_PREDICT_FALSE(self == nullptr)) return nullptr;
  self->record_position.emplace(pos);
  return self.release();
}
// `tp_repr`: "RecordPosition(<chunk_begin>, <record_index>)", mirroring the
// constructor call that would recreate the value.
static PyObject* RecordPositionRepr(PyRecordPositionObject* self) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  return StringToPython(absl::StrCat("RecordPosition(", pos.chunk_begin(), ", ",
                                     pos.record_index(), ")"))
      .release();
}
// `to_bytes` method: the binary serialization of the position (parsed back
// by `from_bytes`).
static PyObject* RecordPositionToBytes(PyRecordPositionObject* self,
                                       PyObject* args) {
  const RecordPosition pos =
      PythonUnlocked([&] { return self->record_position->get(); });
  return BytesToPython(pos.ToBytes()).release();
}
// `from_bytes` classmethod: parses a RecordPosition from its binary format
// and raises ValueError on parse failure.
// NOTE(review): `const_cast(keywords)` appears stripped by extraction;
// presumably `const_cast<char**>(keywords)` -- confirm upstream.
static PyRecordPositionObject* RecordPositionFromBytes(PyTypeObject* cls,
                                                       PyObject* args,
                                                       PyObject* kwargs) {
  static constexpr const char* keywords[] = {"serialized", nullptr};
  PyObject* serialized_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:from_bytes", const_cast(keywords),
          &serialized_arg))) {
    return nullptr;
  }
  BytesLike serialized;
  if (ABSL_PREDICT_FALSE(!serialized.FromPython(serialized_arg))) {
    return nullptr;
  }
  RecordPosition pos;
  if (ABSL_PREDICT_FALSE(!pos.FromBytes(serialized))) {
    PyErr_SetString(PyExc_ValueError, "RecordPosition.from_bytes() failed");
    return nullptr;
  }
  // Allocate through `cls` so subclasses get instances of their own type.
  std::unique_ptr self(
      reinterpret_cast(cls->tp_alloc(cls, 0)));
  if (ABSL_PREDICT_FALSE(self == nullptr)) return nullptr;
  self->record_position.emplace(pos);
  return self.release();
}
} // extern "C"
// Method table for the `RecordPosition` type.  Each entry's raw string is
// exposed as the Python-level `__doc__` of the method.
// NOTE(review): `reinterpret_cast(...)` targets appear stripped by
// extraction; presumably `reinterpret_cast<PyCFunction>` -- confirm.
const PyMethodDef RecordPositionMethods[] = {
    {"from_str", reinterpret_cast(RecordPositionFromStr),
     METH_VARARGS | METH_KEYWORDS | METH_CLASS,
     R"doc(
from_str(type, serialized: str | bytes) -> RecordPosition
Parses RecordPosition from its text format.
Args:
serialized: Text string to parse.
)doc"},
    {"to_bytes", reinterpret_cast(RecordPositionToBytes),
     METH_NOARGS,
     R"doc(
to_bytes(self) -> bytes
Returns the RecordPosition serialized to its binary format.
Serialized byte strings have the same natural order as the corresponding
positions.
)doc"},
    {"from_bytes", reinterpret_cast(RecordPositionFromBytes),
     METH_VARARGS | METH_KEYWORDS | METH_CLASS, R"doc(
from_bytes(
type, serialized: bytes | bytearray | memoryview) -> RecordPosition
Parses RecordPosition from its binary format.
Serialized byte strings have the same natural order as the corresponding
positions.
Args:
serialized: Byte string to parse.
)doc"},
    // Sentinel terminating the table.
    {nullptr, nullptr, 0, nullptr},
};
// Read-only property table for the `RecordPosition` type; all setters are
// nullptr, so the properties cannot be assigned from Python.
const PyGetSetDef RecordPositionGetSet[] = {
    {const_cast("chunk_begin"),
     reinterpret_cast(RecordPositionChunkBegin), nullptr,
     const_cast(R"doc(
chunk_begin: int
File position of the beginning of the chunk containing the given record.
)doc"),
     nullptr},
    {const_cast("record_index"),
     reinterpret_cast(RecordPositionRecordIndex), nullptr,
     const_cast(R"doc(
record_index: int
Index of the record within the chunk.
)doc"),
     nullptr},
    {const_cast("numeric"),
     reinterpret_cast(RecordPositionNumeric), nullptr,
     const_cast(R"doc(
numeric: int
Converts RecordPosition to an integer scaled between 0 and file size.
Distinct RecordPositions of a valid file have distinct numeric values.
)doc"),
     nullptr},
    // Sentinel terminating the table.
    {nullptr, nullptr, nullptr, nullptr, nullptr}};
// Python type object for `RecordPosition`.  Slots are listed positionally
// in PyTypeObject declaration order; the trailing comments name each slot.
PyTypeObject PyRecordPosition_Type = {
    // clang-format off
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    // clang-format on
    "riegeli.records.record_position.RecordPosition",  // tp_name
    sizeof(PyRecordPositionObject),                    // tp_basicsize
    0,                                                 // tp_itemsize
    reinterpret_cast(RecordPositionDestructor),  // tp_dealloc
#if PY_VERSION_HEX >= 0x03080000
    0,        // tp_vectorcall_offset
#else
    nullptr,  // tp_print
#endif
    nullptr,  // tp_getattr
    nullptr,  // tp_setattr
    nullptr,  // tp_as_async
    reinterpret_cast(RecordPositionRepr),  // tp_repr
    nullptr,  // tp_as_number
    nullptr,  // tp_as_sequence
    nullptr,  // tp_as_mapping
    reinterpret_cast(RecordPositionHash),  // tp_hash
    nullptr,  // tp_call
    reinterpret_cast(RecordPositionStr),  // tp_str
    nullptr,  // tp_getattro
    nullptr,  // tp_setattro
    nullptr,  // tp_as_buffer
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  // tp_flags
    R"doc(
RecordPosition(chunk_begin: int, record_index: int) -> RecordPosition
Represents a position in a Riegeli/records file.
There are two ways of expressing positions, both strictly monotonic:
* RecordPosition - Faster for seeking.
* int - Scaled between 0 and file size.
RecordPosition can be converted to int by the numeric property.
Working with RecordPosition is recommended, unless it is needed to seek to an
approximate position interpolated along the file, e.g. for splitting the file
into shards, or unless the position must be expressed as an integer from the
range [0, file_size] in order to fit into a preexisting API.
Both RecordReader and RecordWriter return positions. A position from
RecordWriter can act as a future: accessing its contents for the first time
might block, waiting for pending operations to complete.
)doc",    // tp_doc
    nullptr,  // tp_traverse
    nullptr,  // tp_clear
    RecordPositionCompare,  // tp_richcompare
    0,        // tp_weaklistoffset
    nullptr,  // tp_iter
    nullptr,  // tp_iternext
    const_cast(RecordPositionMethods),  // tp_methods
    nullptr,  // tp_members
    const_cast(RecordPositionGetSet),  // tp_getset
    nullptr,  // tp_base
    nullptr,  // tp_dict
    nullptr,  // tp_descr_get
    nullptr,  // tp_descr_set
    0,        // tp_dictoffset
    nullptr,  // tp_init
    nullptr,  // tp_alloc
    reinterpret_cast(RecordPositionNew),  // tp_new
    nullptr,  // tp_free
    nullptr,  // tp_is_gc
    nullptr,  // tp_bases
    nullptr,  // tp_mro
    nullptr,  // tp_cache
    nullptr,  // tp_subclasses
    nullptr,  // tp_weaklist
    nullptr,  // tp_del
    0,        // tp_version_tag
    nullptr,  // tp_finalize
};
// Wraps a C++ `FutureRecordPosition` into a new Python RecordPosition
// object; returns nullptr with a Python error set on allocation failure.
// Exported through the capsule API below.
PythonPtr RecordPositionToPython(FutureRecordPosition value) {
  PythonPtr self(PyRecordPosition_Type.tp_alloc(&PyRecordPosition_Type, 0));
  if (ABSL_PREDICT_FALSE(self == nullptr)) return nullptr;
  reinterpret_cast(self.get())
      ->record_position.emplace(std::move(value));
  return self;
}
// Extracts the C++ position from a Python RecordPosition object; raises
// TypeError and returns nullopt when `object` has the wrong type.
// Exported through the capsule API below.
// NOTE(review): template arguments appear stripped by extraction
// (presumably `std::optional<RecordPosition>` and
// `reinterpret_cast<PyRecordPositionObject*>`) -- confirm upstream.
std::optional RecordPositionFromPython(PyObject* object) {
  if (ABSL_PREDICT_FALSE(!PyObject_TypeCheck(object, &PyRecordPosition_Type))) {
    PyErr_Format(PyExc_TypeError, "Expected RecordPosition, not %s",
                 Py_TYPE(object)->tp_name);
    return std::nullopt;
  }
  return PythonUnlocked([&] {
    return reinterpret_cast(object)
        ->record_position->get();
  });
}
// Fully qualified module name and docstring for the module definition.
const char* const kModuleName = "riegeli.records.record_position";
const char kModuleDoc[] =
    R"doc(Represents a position in a Riegeli/records file.)doc";

// Module definition: no module-level methods; all functionality is exposed
// through the `RecordPosition` type and the exported capsule.
PyModuleDef kModuleDef = {
    PyModuleDef_HEAD_INIT,
    kModuleName,  // m_name
    kModuleDoc,   // m_doc
    -1,           // m_size
    nullptr,      // m_methods
    nullptr,      // m_slots
    nullptr,      // m_traverse
    nullptr,      // m_clear
    nullptr,      // m_free
};
// Creates the `riegeli.records.record_position` module object.
//
// Readies the `RecordPosition` type, inserts it into the module, and exports
// the C++ conversion functions through a capsule so other extension modules
// can use them without Python-level calls.  Returns nullptr with a Python
// error set on failure.
PyObject* InitModule() {
  if (ABSL_PREDICT_FALSE(PyType_Ready(&PyRecordPosition_Type) < 0)) {
    return nullptr;
  }
  PythonPtr module(PyModule_Create(&kModuleDef));
  if (ABSL_PREDICT_FALSE(module == nullptr)) return nullptr;
  // On success `PyModule_AddObject()` steals this reference.
  Py_INCREF(&PyRecordPosition_Type);
  if (ABSL_PREDICT_FALSE(PyModule_AddObject(module.get(), "RecordPosition",
                                            reinterpret_cast<PyObject*>(
                                                &PyRecordPosition_Type)) < 0)) {
    // `PyModule_AddObject()` does not steal the reference on failure, so it
    // must be dropped here to avoid leaking the type.
    Py_DECREF(&PyRecordPosition_Type);
    return nullptr;
  }
  static constexpr RecordPositionApi kRecordPositionApi = {
      RecordPositionToPython,
      RecordPositionFromPython,
  };
  if (ABSL_PREDICT_FALSE(!ExportCapsule(
          module.get(), kRecordPositionCapsuleName, &kRecordPositionApi))) {
    return nullptr;
  }
  return module.release();
}
} // namespace
PyMODINIT_FUNC PyInit_record_position() { return InitModule(); }
} // namespace riegeli::python
================================================
FILE: python/riegeli/records/record_position.h
================================================
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PYTHON_RIEGELI_RECORDS_RECORD_POSITION_H_
#define PYTHON_RIEGELI_RECORDS_RECORD_POSITION_H_
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#include
// clang-format: do not reorder the above include.
#include
#include "python/riegeli/base/utils.h"
#include "riegeli/records/record_position.h"
namespace riegeli::python {

// Access the API thus:
// ```
// static constexpr ImportedCapsule kRecordPositionApi(
// kRecordPositionCapsuleName);
// ```
//
// Table of function pointers exported through a PyCapsule so that other
// extension modules can convert RecordPositions without importing
// Python-level symbols.
// NOTE(review): template arguments appear stripped by extraction
// (presumably `PythonPtr` is fine but the optional was
// `std::optional<RecordPosition>`) -- confirm upstream.
struct RecordPositionApi {
  PythonPtr (*RecordPositionToPython)(FutureRecordPosition value);
  std::optional (*RecordPositionFromPython)(PyObject* object);
};

// Name under which the capsule is registered by the record_position module.
inline constexpr const char* kRecordPositionCapsuleName =
    "riegeli.records.record_position._CPPAPI";

}  // namespace riegeli::python
#endif // PYTHON_RIEGELI_RECORDS_RECORD_POSITION_H_
================================================
FILE: python/riegeli/records/record_reader.cc
================================================
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// From https://docs.python.org/3/c-api/intro.html:
// Since Python may define some pre-processor definitions which affect the
// standard headers on some systems, you must include Python.h before any
// standard headers are included.
#define PY_SSIZE_T_CLEAN
#include <Python.h>
// clang-format: do not reorder the above include.
#include <stddef.h>
#include <functional>
#include <memory>
#include <optional>
#include <utility>
#include "absl/base/optimization.h"
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "python/riegeli/base/utils.h"
#include "python/riegeli/bytes/python_reader.h"
#include "python/riegeli/records/record_position.h"
#include "riegeli/base/arithmetic.h"
#include "riegeli/base/assert.h"
#include "riegeli/base/chain.h"
#include "riegeli/base/compare.h"
#include "riegeli/base/types.h"
#include "riegeli/chunk_encoding/field_projection.h"
#include "riegeli/records/record_position.h"
#include "riegeli/records/record_reader.h"
#include "riegeli/records/skipped_region.h"
namespace riegeli::python {
namespace {
// C++ API of the `record_position` module, imported from the capsule that
// `python/riegeli/records/record_position.cc` exports.
constexpr ImportedCapsule<RecordPositionApi> kRecordPositionApi(
    kRecordPositionCapsuleName);
// `extern "C"` sets the C calling convention for compatibility with the Python
// API. `static` avoids making symbols public, as `extern "C"` trumps anonymous
// namespace.
extern "C" {
// get_record_type(metadata) -> message class | None
//
// Builds a Python message class for the record type described by
// `metadata.record_type_name` and `metadata.file_descriptor`, or returns
// `None` if the metadata does not describe a record type.
static PyObject* GetRecordType(PyObject* self, PyObject* args,
                               PyObject* kwargs) {
  static constexpr const char* keywords[] = {"metadata", nullptr};
  PyObject* metadata_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:get_record_type", const_cast<char**>(keywords),
          &metadata_arg))) {
    return nullptr;
  }
  // record_type_name = metadata.record_type_name
  static constexpr Identifier id_record_type_name("record_type_name");
  const PythonPtr record_type_name(
      PyObject_GetAttr(metadata_arg, id_record_type_name.get()));
  if (ABSL_PREDICT_FALSE(record_type_name == nullptr)) return nullptr;
  // if not record_type_name: return None
  const int record_type_name_is_true = PyObject_IsTrue(record_type_name.get());
  if (ABSL_PREDICT_FALSE(record_type_name_is_true < 0)) return nullptr;
  if (record_type_name_is_true == 0) Py_RETURN_NONE;
  // file_descriptors = metadata.file_descriptor
  static constexpr Identifier id_file_descriptor("file_descriptor");
  const PythonPtr file_descriptors(
      PyObject_GetAttr(metadata_arg, id_file_descriptor.get()));
  if (ABSL_PREDICT_FALSE(file_descriptors == nullptr)) return nullptr;
  // if not file_descriptors: return None
  const int file_descriptors_is_true = PyObject_IsTrue(file_descriptors.get());
  if (ABSL_PREDICT_FALSE(file_descriptors_is_true < 0)) return nullptr;
  if (file_descriptors_is_true == 0) Py_RETURN_NONE;
  // pool = DescriptorPool()
  static constexpr ImportedConstant kDescriptorPool(
      "google.protobuf.descriptor_pool", "DescriptorPool");
  if (ABSL_PREDICT_FALSE(!kDescriptorPool.Verify())) return nullptr;
  const PythonPtr pool(
      PyObject_CallFunctionObjArgs(kDescriptorPool.get(), nullptr));
  if (ABSL_PREDICT_FALSE(pool == nullptr)) return nullptr;
  // for file_descriptor in file_descriptors:
  //   pool.Add(file_descriptor)
  const PythonPtr iter(PyObject_GetIter(file_descriptors.get()));
  if (ABSL_PREDICT_FALSE(iter == nullptr)) return nullptr;
  while (const PythonPtr file_descriptor{PyIter_Next(iter.get())}) {
    static constexpr Identifier id_Add("Add");
    const PythonPtr add_result(PyObject_CallMethodObjArgs(
        pool.get(), id_Add.get(), file_descriptor.get(), nullptr));
    if (ABSL_PREDICT_FALSE(add_result == nullptr)) return nullptr;
  }
  // `PyIter_Next()` returns `nullptr` both at the end and on error;
  // distinguish the two cases.
  if (ABSL_PREDICT_FALSE(PyErr_Occurred() != nullptr)) return nullptr;
  // message_descriptor = pool.FindMessageTypeByName(record_type_name)
  static constexpr Identifier id_FindMessageTypeByName("FindMessageTypeByName");
  const PythonPtr message_descriptor(
      PyObject_CallMethodObjArgs(pool.get(), id_FindMessageTypeByName.get(),
                                 record_type_name.get(), nullptr));
  if (ABSL_PREDICT_FALSE(message_descriptor == nullptr)) return nullptr;
  // return GetMessageClass(message_descriptor)
  const PythonPtr message_factory(
      PyImport_ImportModule("google.protobuf.message_factory"));
  if (ABSL_PREDICT_FALSE(message_factory == nullptr)) return nullptr;
  static constexpr Identifier id_GetMessageClass("GetMessageClass");
  return PyObject_CallMethodObjArgs(message_factory.get(),
                                    id_GetMessageClass.get(),
                                    message_descriptor.get(), nullptr);
}
} // extern "C"
// Python object layout backing `riegeli.records.record_reader.RecordReader`.
struct PyRecordReaderObject {
  // clang-format off
  PyObject_HEAD
  static_assert(true, "");  // clang-format workaround.
  // clang-format on
  // The wrapped C++ reader; absent before initialization and after clearing.
  PythonWrapped<RecordReader<PythonReader>> record_reader;
  // Owned reference to the user-supplied recovery callable, or `nullptr`.
  PyObject* recovery;
  // Python exception raised inside the recovery callback, saved so a later
  // call can re-raise it.
  PythonWrapped<Exception> recovery_exception;
};

extern PyTypeObject PyRecordReader_Type;
// Python object layout for the iterator returned by `read_records()` and
// `read_messages()`.
struct PyRecordIterObject {
  // clang-format off
  PyObject_HEAD
  static_assert(true, "");  // clang-format workaround.
  // clang-format on
  // Reads one record; returns a new reference, `None` at end of file, or
  // `nullptr` on error.
  PyObject* (*read_record)(PyRecordReaderObject* self, PyObject* args);
  // Owned reference to the reader being iterated over.
  PyRecordReaderObject* record_reader;
  // Owned reference to extra arguments for `read_record`, or `nullptr`.
  PyObject* args;
};

extern PyTypeObject PyRecordIter_Type;
// Returns `true` if a Python exception should be reported for this reader:
// either the recovery callback raised, or the reader itself failed.
bool RecordReaderHasException(PyRecordReaderObject* self) {
  return self->recovery_exception.has_value() || !self->record_reader->ok();
}
// Raises the Python exception corresponding to the reader's failure:
// prefers an exception saved by the recovery callback, then an exception
// propagated from the underlying Python source, and finally translates the
// reader's status into a `RiegeliError`.
void SetExceptionFromRecordReader(PyRecordReaderObject* self) {
  if (self->recovery_exception.has_value()) {
    self->recovery_exception->Restore();
    return;
  }
  RIEGELI_ASSERT(!self->record_reader->ok())
      << "Failed precondition of SetExceptionFromRecordReader(): "
         "RecordReader OK";
  if (!self->record_reader->src().exception().ok()) {
    self->record_reader->src().exception().Restore();
    return;
  }
  SetRiegeliError(self->record_reader->status());
}
std::optional VerifyFieldNumber(long field_number_value) {
static_assert(Field::kExistenceOnly == 0,
"VerifyFieldNumber() assumes that Field::kExistenceOnly == 0");
if (ABSL_PREDICT_FALSE(field_number_value < Field::kExistenceOnly ||
field_number_value > (1 << 29) - 1)) {
PyErr_Format(PyExc_OverflowError, "Field number out of range: %ld",
field_number_value);
return std::nullopt;
}
return IntCast(field_number_value);
}
// Converts a Python `int` to a verified field number; sets `TypeError` or
// `OverflowError` and returns `std::nullopt` on failure.
std::optional<int> FieldNumberFromPython(PyObject* object) {
  if (ABSL_PREDICT_FALSE(!PyLong_Check(object))) {
    PyErr_Format(PyExc_TypeError, "Expected int, not %s",
                 Py_TYPE(object)->tp_name);
    return std::nullopt;
  }
  const long field_number_value = PyLong_AsLong(object);
  // `PyLong_AsLong()` returns -1 both as a legitimate value and on error.
  if (ABSL_PREDICT_FALSE(field_number_value == -1) && PyErr_Occurred()) {
    return std::nullopt;
  }
  return VerifyFieldNumber(field_number_value);
}
std::optional FieldProjectionFromPython(PyObject* object) {
FieldProjection field_projection;
const PythonPtr field_iter(PyObject_GetIter(object));
if (ABSL_PREDICT_FALSE(field_iter == nullptr)) return std::nullopt;
while (const PythonPtr field_object{PyIter_Next(field_iter.get())}) {
Field field;
const PythonPtr field_number_iter(PyObject_GetIter(field_object.get()));
if (ABSL_PREDICT_FALSE(field_number_iter == nullptr)) return std::nullopt;
while (const PythonPtr field_number_object{
PyIter_Next(field_number_iter.get())}) {
const std::optional field_number =
FieldNumberFromPython(field_number_object.get());
if (ABSL_PREDICT_FALSE(field_number == std::nullopt)) return std::nullopt;
field.AddFieldNumber(*field_number);
}
if (ABSL_PREDICT_FALSE(PyErr_Occurred() != nullptr)) return std::nullopt;
field_projection.AddField(std::move(field));
}
if (ABSL_PREDICT_FALSE(PyErr_Occurred() != nullptr)) return std::nullopt;
return field_projection;
}
// `extern "C"` sets the C calling convention for compatibility with the Python
// API. `static` avoids making symbols public, as `extern "C"` trumps anonymous
// namespace.
extern "C" {
// tp_dealloc: destroys the C++ reader outside the Python lock (via
// `PythonUnlocked`, since destruction may close the file), then releases the
// remaining Python references.
static void RecordReaderDestructor(PyRecordReaderObject* self) {
  PyObject_GC_UnTrack(self);
#if PY_VERSION_HEX < 0x030D0000  // < 3.13
  Py_TRASHCAN_BEGIN(self, RecordReaderDestructor);
#endif
  PythonUnlocked([&] { self->record_reader.reset(); });
  Py_XDECREF(self->recovery);
  self->recovery_exception.reset();
  Py_TYPE(self)->tp_free(self);
#if PY_VERSION_HEX < 0x030D0000  // < 3.13
  Py_TRASHCAN_END;
#endif
}
// tp_traverse: visits Python objects reachable from this reader for the
// cyclic garbage collector.
static int RecordReaderTraverse(PyRecordReaderObject* self, visitproc visit,
                                void* arg) {
  Py_VISIT(self->recovery);
  if (self->recovery_exception.has_value()) {
    const int recovery_exception_result =
        self->recovery_exception->Traverse(visit, arg);
    if (ABSL_PREDICT_FALSE(recovery_exception_result != 0)) {
      return recovery_exception_result;
    }
  }
  if (self->record_reader.has_value()) {
    return self->record_reader->src().Traverse(visit, arg);
  }
  return 0;
}
// tp_clear: drops references that may participate in reference cycles.
// The C++ reader is destroyed outside the Python lock.
static int RecordReaderClear(PyRecordReaderObject* self) {
  PythonUnlocked([&] { self->record_reader.reset(); });
  Py_CLEAR(self->recovery);
  self->recovery_exception.reset();
  return 0;
}
// __init__(self, src, *, owns_src=True, assumed_pos=None,
//          min_buffer_size=None, max_buffer_size=None, buffer_size=None,
//          field_projection=None, recovery=None)
//
// Builds the `PythonReader` and `RecordReader` options from keyword
// arguments, installs an optional recovery callback, and constructs the
// reader outside the Python lock. Returns 0 on success, -1 with a Python
// exception set on failure.
static int RecordReaderInit(PyRecordReaderObject* self, PyObject* args,
                            PyObject* kwargs) {
  static constexpr const char* keywords[] = {"src",
                                             "owns_src",
                                             "assumed_pos",
                                             "min_buffer_size",
                                             "max_buffer_size",
                                             "buffer_size",
                                             "field_projection",
                                             "recovery",
                                             nullptr};
  PyObject* src_arg;
  PyObject* owns_src_arg = nullptr;
  PyObject* assumed_pos_arg = nullptr;
  PyObject* min_buffer_size_arg = nullptr;
  PyObject* max_buffer_size_arg = nullptr;
  PyObject* buffer_size_arg = nullptr;
  PyObject* field_projection_arg = nullptr;
  PyObject* recovery_arg = nullptr;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O|$OOOOOOO:RecordReader", const_cast<char**>(keywords),
          &src_arg, &owns_src_arg, &assumed_pos_arg, &min_buffer_size_arg,
          &max_buffer_size_arg, &buffer_size_arg, &field_projection_arg,
          &recovery_arg))) {
    return -1;
  }
  PythonReader::Options python_reader_options;
  python_reader_options.set_owns_src(true);
  if (owns_src_arg != nullptr) {
    const int owns_src_is_true = PyObject_IsTrue(owns_src_arg);
    if (ABSL_PREDICT_FALSE(owns_src_is_true < 0)) return -1;
    python_reader_options.set_owns_src(owns_src_is_true != 0);
  }
  if (assumed_pos_arg != nullptr && assumed_pos_arg != Py_None) {
    const std::optional<Position> assumed_pos =
        PositionFromPython(assumed_pos_arg);
    if (ABSL_PREDICT_FALSE(assumed_pos == std::nullopt)) return -1;
    python_reader_options.set_assumed_pos(*assumed_pos);
  }
  // `buffer_size` is a shorthand for setting both bounds.
  if (buffer_size_arg != nullptr && buffer_size_arg != Py_None) {
    min_buffer_size_arg = buffer_size_arg;
    max_buffer_size_arg = buffer_size_arg;
  }
  if (min_buffer_size_arg != nullptr) {
    const std::optional<size_t> min_buffer_size =
        SizeFromPython(min_buffer_size_arg);
    if (ABSL_PREDICT_FALSE(min_buffer_size == std::nullopt)) return -1;
    python_reader_options.set_min_buffer_size(*min_buffer_size);
  }
  if (max_buffer_size_arg != nullptr) {
    const std::optional<size_t> max_buffer_size =
        SizeFromPython(max_buffer_size_arg);
    if (ABSL_PREDICT_FALSE(max_buffer_size == std::nullopt)) return -1;
    python_reader_options.set_max_buffer_size(*max_buffer_size);
  }
  RecordReaderBase::Options record_reader_options;
  if (field_projection_arg != nullptr && field_projection_arg != Py_None) {
    std::optional<FieldProjection> field_projection =
        FieldProjectionFromPython(field_projection_arg);
    if (ABSL_PREDICT_FALSE(field_projection == std::nullopt)) return -1;
    record_reader_options.set_field_projection(*std::move(field_projection));
  }
  if (recovery_arg != nullptr && recovery_arg != Py_None) {
    Py_INCREF(recovery_arg);
    Py_XDECREF(self->recovery);
    self->recovery = recovery_arg;
    // The C++ recovery callback re-enters Python: it builds a
    // `SkippedRegion` object and calls the user-supplied callable. Any
    // Python exception is saved in `self->recovery_exception` to be
    // re-raised later; `StopIteration` cancels reading without an error.
    record_reader_options.set_recovery([self](
        const SkippedRegion& skipped_region,
        RecordReaderBase& record_reader) {
      PythonLock lock;
      const PythonPtr begin_object = PositionToPython(skipped_region.begin());
      if (ABSL_PREDICT_FALSE(begin_object == nullptr)) {
        self->recovery_exception.emplace(Exception::Fetch());
        return false;
      }
      const PythonPtr end_object = PositionToPython(skipped_region.end());
      if (ABSL_PREDICT_FALSE(end_object == nullptr)) {
        self->recovery_exception.emplace(Exception::Fetch());
        return false;
      }
      const PythonPtr message_object = StringToPython(skipped_region.message());
      if (ABSL_PREDICT_FALSE(message_object == nullptr)) {
        self->recovery_exception.emplace(Exception::Fetch());
        return false;
      }
      static constexpr ImportedConstant kSkippedRegion(
          "riegeli.records.skipped_region", "SkippedRegion");
      if (ABSL_PREDICT_FALSE(!kSkippedRegion.Verify())) {
        self->recovery_exception.emplace(Exception::Fetch());
        return false;
      }
      const PythonPtr skipped_region_object(PyObject_CallFunctionObjArgs(
          kSkippedRegion.get(), begin_object.get(), end_object.get(),
          message_object.get(), nullptr));
      if (ABSL_PREDICT_FALSE(skipped_region_object == nullptr)) {
        self->recovery_exception.emplace(Exception::Fetch());
        return false;
      }
      const PythonPtr recovery_result(PyObject_CallFunctionObjArgs(
          self->recovery, skipped_region_object.get(), nullptr));
      if (ABSL_PREDICT_FALSE(recovery_result == nullptr)) {
        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
          PyErr_Clear();
        } else {
          self->recovery_exception.emplace(Exception::Fetch());
        }
        return false;
      }
      return true;
    });
  }
  PythonReader python_reader(src_arg, std::move(python_reader_options));
  PythonUnlocked([&] {
    self->record_reader.emplace(std::move(python_reader),
                                std::move(record_reader_options));
  });
  if (ABSL_PREDICT_FALSE(!self->record_reader->ok())) {
    self->record_reader->src().Close();
    SetExceptionFromRecordReader(self);
    return -1;
  }
  return 0;
}
// Getter for the `src` property: the underlying Python file object, or
// `None` if the reader was never initialized or was cleared.
static PyObject* RecordReaderSrc(PyRecordReaderObject* self, void* closure) {
  PyObject* const src = ABSL_PREDICT_FALSE(!self->record_reader.has_value())
                            ? Py_None
                            : self->record_reader->src().src();
  Py_INCREF(src);
  return src;
}
// __repr__(self)
static PyObject* RecordReaderRepr(PyRecordReaderObject* self) {
  // NOTE(review): the format string was lost in extraction; restored to
  // match upstream riegeli — verify against the original file.
  const PythonPtr format = StringToPython("<RecordReader src={!r}>");
  if (ABSL_PREDICT_FALSE(format == nullptr)) return nullptr;
  // return format.format(self.src)
  PyObject* const src = ABSL_PREDICT_FALSE(!self->record_reader.has_value())
                            ? Py_None
                            : self->record_reader->src().src();
  static constexpr Identifier id_format("format");
  return PyObject_CallMethodObjArgs(format.get(), id_format.get(), src,
                                    nullptr);
}
// __enter__(self) -> RecordReader
static PyObject* RecordReaderEnter(PyObject* self, PyObject* args) {
  // return self
  Py_INCREF(self);
  return self;
}
// __exit__(self, exc_type, exc_value, traceback) -> False
static PyObject* RecordReaderExit(PyRecordReaderObject* self, PyObject* args) {
  PyObject* exc_type;
  PyObject* exc_value;
  PyObject* traceback;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type,
                                           &exc_value, &traceback))) {
    return nullptr;
  }
  // self.close(), suppressing exceptions if exc_type != None.
  if (ABSL_PREDICT_TRUE(self->record_reader.has_value())) {
    const bool close_ok =
        PythonUnlocked([&] { return self->record_reader->Close(); });
    if (ABSL_PREDICT_FALSE(!close_ok) && exc_type == Py_None) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
  }
  // Returning False never suppresses an in-flight exception.
  Py_RETURN_FALSE;
}
// close(self) -> None
//
// Closes the reader (outside the Python lock); raises on failure. Does
// nothing if the reader was never initialized or was cleared.
static PyObject* RecordReaderClose(PyRecordReaderObject* self, PyObject* args) {
  if (ABSL_PREDICT_TRUE(self->record_reader.has_value())) {
    const bool close_ok =
        PythonUnlocked([&] { return self->record_reader->Close(); });
    if (ABSL_PREDICT_FALSE(!close_ok)) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
  }
  Py_RETURN_NONE;
}
// check_file_format(self) -> bool
//
// Returns True if this looks like a Riegeli/records file, False if the file
// ends before this could be determined; raises on failure.
static PyObject* RecordReaderCheckFileFormat(PyRecordReaderObject* self,
                                             PyObject* args) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  const bool check_file_format_ok =
      PythonUnlocked([&] { return self->record_reader->CheckFileFormat(); });
  if (ABSL_PREDICT_FALSE(!check_file_format_ok)) {
    if (ABSL_PREDICT_FALSE(RecordReaderHasException(self))) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
// read_metadata(self) -> RecordsMetadata | None
//
// Reads the serialized file metadata and parses it into a `RecordsMetadata`
// message. Returns None at end of file; raises on failure.
static PyObject* RecordReaderReadMetadata(PyRecordReaderObject* self,
                                          PyObject* args) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  Chain metadata;
  const bool read_serialized_metadata_ok = PythonUnlocked(
      [&] { return self->record_reader->ReadSerializedMetadata(metadata); });
  if (ABSL_PREDICT_FALSE(!read_serialized_metadata_ok)) {
    if (ABSL_PREDICT_FALSE(RecordReaderHasException(self))) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
    // End of file: no metadata.
    Py_RETURN_NONE;
  }
  const PythonPtr serialized_metadata = ChainToPython(metadata);
  if (ABSL_PREDICT_FALSE(serialized_metadata == nullptr)) return nullptr;
  // return RecordsMetadata.FromString(serialized_metadata)
  static constexpr ImportedConstant kRecordsMetadata(
      "riegeli.records.records_metadata_pb2", "RecordsMetadata");
  if (ABSL_PREDICT_FALSE(!kRecordsMetadata.Verify())) return nullptr;
  static constexpr ImportedConstant kDecodeError("google.protobuf.message",
                                                 "DecodeError");
  if (ABSL_PREDICT_FALSE(!kDecodeError.Verify())) return nullptr;
  static constexpr Identifier id_FromString("FromString");
  PythonPtr metadata_object(
      PyObject_CallMethodObjArgs(kRecordsMetadata.get(), id_FromString.get(),
                                 serialized_metadata.get(), nullptr));
  if (ABSL_PREDICT_FALSE(metadata_object == nullptr)) {
    // A `DecodeError` is recoverable if a recovery function is set: the
    // metadata chunk is reported as a skipped region.
    if (self->record_reader->recovery() != nullptr &&
        PyErr_ExceptionMatches(kDecodeError.get())) {
      const Exception exception = Exception::Fetch();
      if (self->record_reader->recovery()(
              SkippedRegion(self->record_reader->last_pos().chunk_begin(),
                            self->record_reader->pos().numeric(),
                            exception.message()),
              *self->record_reader)) {
        // Recovered metadata decoding, assume empty `RecordsMetadata`.
        return PyObject_CallFunctionObjArgs(kRecordsMetadata.get(), nullptr);
      }
      if (ABSL_PREDICT_FALSE(self->recovery_exception.has_value())) {
        self->recovery_exception->Restore();
        return nullptr;
      }
      Py_RETURN_NONE;
    }
    return nullptr;
  }
  return metadata_object.release();
}
// read_serialized_metadata(self) -> bytes | None
//
// Reads the file metadata without parsing it. Returns None at end of file;
// raises on failure.
static PyObject* RecordReaderReadSerializedMetadata(PyRecordReaderObject* self,
                                                    PyObject* args) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  Chain metadata;
  const bool read_serialized_metadata_ok = PythonUnlocked(
      [&] { return self->record_reader->ReadSerializedMetadata(metadata); });
  if (ABSL_PREDICT_FALSE(!read_serialized_metadata_ok)) {
    if (ABSL_PREDICT_FALSE(RecordReaderHasException(self))) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
    Py_RETURN_NONE;
  }
  return ChainToPython(metadata).release();
}
// read_record(self) -> bytes | None
//
// Reads the next record. Returns None at end of file; raises on failure.
static PyObject* RecordReaderReadRecord(PyRecordReaderObject* self,
                                        PyObject* args) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  Chain record;
  const bool read_record_ok =
      PythonUnlocked([&] { return self->record_reader->ReadRecord(record); });
  if (ABSL_PREDICT_FALSE(!read_record_ok)) {
    if (ABSL_PREDICT_FALSE(RecordReaderHasException(self))) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
    Py_RETURN_NONE;
  }
  return ChainToPython(record).release();
}
// read_message(self, message_type) -> message | None
//
// Reads the next record and parses it as `message_type`. A parse failure is
// handed to the recovery function (if set), and on recovery the next record
// is tried. Returns None at end of file; raises on failure.
static PyObject* RecordReaderReadMessage(PyRecordReaderObject* self,
                                         PyObject* args, PyObject* kwargs) {
  static constexpr const char* keywords[] = {"message_type", nullptr};
  PyObject* message_type_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:read_message", const_cast<char**>(keywords),
          &message_type_arg))) {
    return nullptr;
  }
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  absl::string_view record;
  // Loop so that a recovered parse failure retries with the next record.
  for (;;) {
    const bool read_record_ok =
        PythonUnlocked([&] { return self->record_reader->ReadRecord(record); });
    if (ABSL_PREDICT_FALSE(!read_record_ok)) {
      if (ABSL_PREDICT_FALSE(RecordReaderHasException(self))) {
        SetExceptionFromRecordReader(self);
        return nullptr;
      }
      Py_RETURN_NONE;
    }
    // Expose `record` to Python without copying.
    MemoryView memory_view;
    PyObject* const record_object = memory_view.ToPython(record);
    if (ABSL_PREDICT_FALSE(record_object == nullptr)) return nullptr;
    static constexpr ImportedConstant kDecodeError("google.protobuf.message",
                                                   "DecodeError");
    if (ABSL_PREDICT_FALSE(!kDecodeError.Verify())) return nullptr;
    // return message_type.FromString(record)
    static constexpr Identifier id_FromString("FromString");
    PythonPtr message(PyObject_CallMethodObjArgs(
        message_type_arg, id_FromString.get(), record_object, nullptr));
    if (ABSL_PREDICT_FALSE(message == nullptr)) {
      if (self->record_reader->recovery() != nullptr &&
          PyErr_ExceptionMatches(kDecodeError.get())) {
        const Exception exception = Exception::Fetch();
        if (ABSL_PREDICT_FALSE(!memory_view.Release())) return nullptr;
        if (self->record_reader->recovery()(
                SkippedRegion(self->record_reader->last_pos().numeric(),
                              self->record_reader->pos().numeric(),
                              exception.message()),
                *self->record_reader)) {
          continue;
        }
        if (ABSL_PREDICT_FALSE(self->recovery_exception.has_value())) {
          self->recovery_exception->Restore();
          return nullptr;
        }
        Py_RETURN_NONE;
      }
      return nullptr;
    }
    if (ABSL_PREDICT_FALSE(!memory_view.Release())) return nullptr;
    return message.release();
  }
}
// read_records(self) -> iterator over bytes
//
// Returns an iterator which yields records by calling `read_record()`.
static PyRecordIterObject* RecordReaderReadRecords(PyRecordReaderObject* self,
                                                   PyObject* args) {
  // NOTE(review): the unique_ptr's deleter type was lost in extraction;
  // `Deleter` (Py_DECREF-based, from python/riegeli/base/utils.h) restored
  // to match upstream — verify.
  std::unique_ptr<PyRecordIterObject, Deleter> iter(
      PyObject_GC_New(PyRecordIterObject, &PyRecordIter_Type));
  if (ABSL_PREDICT_FALSE(iter == nullptr)) return nullptr;
  iter->read_record = [](PyRecordReaderObject* self, PyObject* args) {
    return RecordReaderReadRecord(self, args);
  };
  Py_INCREF(self);
  iter->record_reader = self;
  iter->args = nullptr;
  return iter.release();
}
// read_messages(self, message_type) -> iterator over messages
//
// Returns an iterator which yields parsed messages by calling
// `read_message(message_type)`.
static PyRecordIterObject* RecordReaderReadMessages(PyRecordReaderObject* self,
                                                    PyObject* args,
                                                    PyObject* kwargs) {
  static constexpr const char* keywords[] = {"message_type", nullptr};
  PyObject* message_type_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:read_messages", const_cast<char**>(keywords),
          &message_type_arg))) {
    return nullptr;
  }
  // NOTE(review): deleter type restored to match upstream — verify.
  std::unique_ptr<PyRecordIterObject, Deleter> iter(
      PyObject_GC_New(PyRecordIterObject, &PyRecordIter_Type));
  if (ABSL_PREDICT_FALSE(iter == nullptr)) return nullptr;
  iter->read_record = [](PyRecordReaderObject* self, PyObject* args) {
    return RecordReaderReadMessage(self, args, nullptr);
  };
  Py_INCREF(self);
  iter->record_reader = self;
  iter->args = PyTuple_Pack(1, message_type_arg);
  if (ABSL_PREDICT_FALSE(iter->args == nullptr)) return nullptr;
  return iter.release();
}
// set_field_projection(self, field_projection) -> None
//
// Changes the field projection for subsequently read records; `None` means
// reading all fields. Raises on failure.
static PyObject* RecordReaderSetFieldProjection(PyRecordReaderObject* self,
                                                PyObject* args,
                                                PyObject* kwargs) {
  static constexpr const char* keywords[] = {"field_projection", nullptr};
  PyObject* field_projection_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:set_field_projection", const_cast<char**>(keywords),
          &field_projection_arg))) {
    return nullptr;
  }
  std::optional<FieldProjection> field_projection;
  if (field_projection_arg == Py_None) {
    field_projection = FieldProjection::All();
  } else {
    field_projection = FieldProjectionFromPython(field_projection_arg);
    if (ABSL_PREDICT_FALSE(field_projection == std::nullopt)) return nullptr;
  }
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  const bool set_field_projection_ok = PythonUnlocked([&] {
    return self->record_reader->SetFieldProjection(
        *std::move(field_projection));
  });
  if (ABSL_PREDICT_FALSE(!set_field_projection_ok)) {
    SetExceptionFromRecordReader(self);
    return nullptr;
  }
  Py_RETURN_NONE;
}
// Getter for the `last_pos` property: the canonical position of the last
// record read. Raises RiegeliError if no record was read yet.
static PyObject* RecordReaderLastPos(PyRecordReaderObject* self,
                                     void* closure) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  if (ABSL_PREDICT_FALSE(!kRecordPositionApi.Verify())) return nullptr;
  if (ABSL_PREDICT_FALSE(!self->record_reader->last_record_is_valid())) {
    SetRiegeliError(absl::FailedPreconditionError("No record was read"));
    return nullptr;
  }
  return kRecordPositionApi
      ->RecordPositionToPython(self->record_reader->last_pos())
      .release();
}
// Getter for the `pos` property: the current position of the reader.
static PyObject* RecordReaderPos(PyRecordReaderObject* self, void* closure) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  if (ABSL_PREDICT_FALSE(!kRecordPositionApi.Verify())) return nullptr;
  return kRecordPositionApi->RecordPositionToPython(self->record_reader->pos())
      .release();
}
// Getter for the `supports_random_access` property.
static PyObject* RecordReaderSupportsRandomAccess(PyRecordReaderObject* self,
                                                  void* closure) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  return PyBool_FromLong(self->record_reader->SupportsRandomAccess());
}
// seek(self, pos: RecordPosition) -> None
//
// Seeks to a position previously obtained from `pos` / `last_pos`.
static PyObject* RecordReaderSeek(PyRecordReaderObject* self, PyObject* args,
                                  PyObject* kwargs) {
  static constexpr const char* keywords[] = {"pos", nullptr};
  PyObject* pos_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:seek", const_cast<char**>(keywords), &pos_arg))) {
    return nullptr;
  }
  if (ABSL_PREDICT_FALSE(!kRecordPositionApi.Verify())) return nullptr;
  const std::optional<RecordPosition> pos =
      kRecordPositionApi->RecordPositionFromPython(pos_arg);
  if (ABSL_PREDICT_FALSE(pos == std::nullopt)) return nullptr;
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  const bool seek_ok =
      PythonUnlocked([&] { return self->record_reader->Seek(*pos); });
  if (ABSL_PREDICT_FALSE(!seek_ok)) {
    SetExceptionFromRecordReader(self);
    return nullptr;
  }
  Py_RETURN_NONE;
}
// seek_numeric(self, pos: int) -> None
//
// Seeks to a numeric (byte-oriented) position.
static PyObject* RecordReaderSeekNumeric(PyRecordReaderObject* self,
                                         PyObject* args, PyObject* kwargs) {
  static constexpr const char* keywords[] = {"pos", nullptr};
  PyObject* pos_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:seek_numeric", const_cast<char**>(keywords),
          &pos_arg))) {
    return nullptr;
  }
  const std::optional<Position> pos = PositionFromPython(pos_arg);
  if (ABSL_PREDICT_FALSE(pos == std::nullopt)) return nullptr;
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  const bool seek_ok =
      PythonUnlocked([&] { return self->record_reader->Seek(*pos); });
  if (ABSL_PREDICT_FALSE(!seek_ok)) {
    SetExceptionFromRecordReader(self);
    return nullptr;
  }
  Py_RETURN_NONE;
}
// seek_back(self) -> bool
//
// Seeks back by one record. Returns False if the beginning was reached;
// raises on failure.
static PyObject* RecordReaderSeekBack(PyRecordReaderObject* self,
                                      PyObject* args) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  const bool seek_back_ok =
      PythonUnlocked([&] { return self->record_reader->SeekBack(); });
  if (ABSL_PREDICT_FALSE(!seek_back_ok)) {
    if (ABSL_PREDICT_FALSE(RecordReaderHasException(self))) {
      SetExceptionFromRecordReader(self);
      return nullptr;
    }
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
// size(self) -> int
//
// Returns the size of the file; raises on failure.
static PyObject* RecordReaderSize(PyRecordReaderObject* self, PyObject* args) {
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  const std::optional<Position> size =
      PythonUnlocked([&] { return self->record_reader->Size(); });
  if (ABSL_PREDICT_FALSE(size == std::nullopt)) {
    SetExceptionFromRecordReader(self);
    return nullptr;
  }
  return PositionToPython(*size).release();
}
// search(self, test) -> ordering | None
//
// Binary-searches the file; `test(self)` is called at probe positions and
// must return a value convertible to a partial ordering. Exceptions raised
// by `test` are captured and re-raised after the search unwinds.
static PyObject* RecordReaderSearch(PyRecordReaderObject* self, PyObject* args,
                                    PyObject* kwargs) {
  static constexpr const char* keywords[] = {"test", nullptr};
  PyObject* test_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:search", const_cast<char**>(keywords), &test_arg))) {
    return nullptr;
  }
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  std::optional<Exception> test_exception;
  const std::optional<PartialOrdering> result = PythonUnlocked([&] {
    return self->record_reader->Search(
        [&](RecordReaderBase&) -> std::optional<PartialOrdering> {
          // Re-acquire the Python lock only while calling back into Python.
          PythonLock lock;
          const PythonPtr test_result(
              PyObject_CallFunctionObjArgs(test_arg, self, nullptr));
          if (ABSL_PREDICT_FALSE(test_result == nullptr)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          const std::optional<PartialOrdering> ordering =
              PartialOrderingFromPython(test_result.get());
          if (ABSL_PREDICT_FALSE(ordering == std::nullopt)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          return *ordering;
        });
  });
  if (ABSL_PREDICT_FALSE(result == std::nullopt)) {
    if (test_exception != std::nullopt) {
      test_exception->Restore();
    } else {
      SetExceptionFromRecordReader(self);
    }
    return nullptr;
  }
  return PartialOrderingToPython(*result).release();
}
// search_for_record(self, test) -> ordering | None
//
// Binary-searches for a record; `test(record_bytes)` must return a value
// convertible to a partial ordering. Raising `StopIteration` from `test`
// cancels the search and returns None.
static PyObject* RecordReaderSearchForRecord(PyRecordReaderObject* self,
                                             PyObject* args, PyObject* kwargs) {
  static constexpr const char* keywords[] = {"test", nullptr};
  PyObject* test_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "O:search_for_record", const_cast<char**>(keywords),
          &test_arg))) {
    return nullptr;
  }
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  std::optional<Exception> test_exception;
  const std::optional<PartialOrdering> result = PythonUnlocked([&] {
    return self->record_reader->Search(
        [&](const Chain& record) -> std::optional<PartialOrdering> {
          PythonLock lock;
          const PythonPtr record_object = ChainToPython(record);
          if (ABSL_PREDICT_FALSE(record_object == nullptr)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          const PythonPtr test_result(PyObject_CallFunctionObjArgs(
              test_arg, record_object.get(), nullptr));
          if (ABSL_PREDICT_FALSE(test_result == nullptr)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          const std::optional<PartialOrdering> ordering =
              PartialOrderingFromPython(test_result.get());
          if (ABSL_PREDICT_FALSE(ordering == std::nullopt)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          return *ordering;
        });
  });
  if (ABSL_PREDICT_FALSE(result == std::nullopt)) {
    if (test_exception != std::nullopt) {
      test_exception->Restore();
      // `StopIteration` from `test` cancels the search without an error.
      if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
        PyErr_Clear();
        Py_RETURN_NONE;
      }
    } else {
      SetExceptionFromRecordReader(self);
    }
    return nullptr;
  }
  return PartialOrderingToPython(*result).release();
}
// search_for_message(self, message_type, test) -> ordering | None
//
// Binary-searches for a record parsed as `message_type`;
// `test(message)` must return a value convertible to a partial ordering.
// Unparsable records are handed to the recovery function (if set) and, on
// recovery, declared unordered. Raising `StopIteration` from `test` cancels
// the search and returns None.
static PyObject* RecordReaderSearchForMessage(PyRecordReaderObject* self,
                                              PyObject* args,
                                              PyObject* kwargs) {
  static constexpr const char* keywords[] = {"message_type", "test", nullptr};
  PyObject* message_type_arg;
  PyObject* test_arg;
  if (ABSL_PREDICT_FALSE(!PyArg_ParseTupleAndKeywords(
          args, kwargs, "OO:search_for_message", const_cast<char**>(keywords),
          &message_type_arg, &test_arg))) {
    return nullptr;
  }
  if (ABSL_PREDICT_FALSE(!self->record_reader.Verify())) return nullptr;
  static constexpr ImportedConstant kDecodeError("google.protobuf.message",
                                                 "DecodeError");
  if (ABSL_PREDICT_FALSE(!kDecodeError.Verify())) return nullptr;
  // `RecordReader::Search(test)` sets the recovery function to `nullptr`
  // while calling `test()`. Save it here to call it explicitly in `test()`.
  std::function<bool(const SkippedRegion&, RecordReaderBase&)> recovery =
      self->record_reader->recovery();
  std::optional<Exception> test_exception;
  const std::optional<PartialOrdering> result = PythonUnlocked([&] {
    return self->record_reader->Search(
        [&](absl::string_view record) -> std::optional<PartialOrdering> {
          PythonLock lock;
          // Expose `record` to Python without copying.
          MemoryView memory_view;
          PyObject* const record_object = memory_view.ToPython(record);
          if (ABSL_PREDICT_FALSE(record_object == nullptr)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          // message = message_type.FromString(record)
          static constexpr Identifier id_FromString("FromString");
          const PythonPtr message(PyObject_CallMethodObjArgs(
              message_type_arg, id_FromString.get(), record_object, nullptr));
          if (ABSL_PREDICT_FALSE(message == nullptr)) {
            if (recovery != nullptr &&
                PyErr_ExceptionMatches(kDecodeError.get())) {
              const Exception exception = Exception::Fetch();
              if (ABSL_PREDICT_FALSE(!memory_view.Release())) {
                test_exception.emplace(Exception::Fetch());
                return std::nullopt;
              }
              if (recovery(
                      SkippedRegion(self->record_reader->last_pos().numeric(),
                                    self->record_reader->pos().numeric(),
                                    exception.message()),
                      *self->record_reader)) {
                // Declare the skipped record unordered.
                return PartialOrdering::unordered;
              }
              if (ABSL_PREDICT_FALSE(self->recovery_exception.has_value())) {
                return std::nullopt;
              }
              // Cancel the search.
              PyErr_SetNone(PyExc_StopIteration);
            }
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          if (ABSL_PREDICT_FALSE(!memory_view.Release())) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          const PythonPtr test_result(
              PyObject_CallFunctionObjArgs(test_arg, message.get(), nullptr));
          if (ABSL_PREDICT_FALSE(test_result == nullptr)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          const std::optional<PartialOrdering> ordering =
              PartialOrderingFromPython(test_result.get());
          if (ABSL_PREDICT_FALSE(ordering == std::nullopt)) {
            test_exception.emplace(Exception::Fetch());
            return std::nullopt;
          }
          return *ordering;
        });
  });
  if (ABSL_PREDICT_FALSE(result == std::nullopt)) {
    if (test_exception != std::nullopt) {
      test_exception->Restore();
      // `StopIteration` from `test` (or a cancelled recovery) ends the
      // search without an error.
      if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
        PyErr_Clear();
        Py_RETURN_NONE;
      }
    } else {
      SetExceptionFromRecordReader(self);
    }
    return nullptr;
  }
  return PartialOrderingToPython(*result).release();
}
} // extern "C"
// Method table for the Python `RecordReader` type.
//
// Handlers registered with `METH_VARARGS` (optionally with `METH_KEYWORDS`)
// do not have the plain `PyCFunction` signature, so they are cast to
// `PyCFunction` for storage in `PyMethodDef::ml_meth`, as the Python C API
// requires. `METH_NOARGS` handlers with the plain signature need no cast.
const PyMethodDef RecordReaderMethods[] = {
    {"__enter__", RecordReaderEnter, METH_NOARGS,
     R"doc(
__enter__(self) -> RecordReader
Returns self.
)doc"},
    {"__exit__", reinterpret_cast<PyCFunction>(RecordReaderExit), METH_VARARGS,
     R"doc(
__exit__(self, exc_type, exc_value, traceback) -> bool
Calls close().
Suppresses exceptions from close() if an exception is already in flight.
Args:
  exc_type: None or exception in flight (type).
  exc_value: None or exception in flight (value).
  traceback: None or exception in flight (traceback).
)doc"},
    {"close", reinterpret_cast<PyCFunction>(RecordReaderClose), METH_NOARGS,
     R"doc(
close(self) -> None
Indicates that reading is done.
Verifies that the file is not truncated at the current position, i.e. that it
either has more data or ends cleanly. Marks the RecordReader as closed,
disallowing further reading.
If the RecordReader was failed, raises the same exception again.
If the RecordReader was not failed but already closed, does nothing.
)doc"},
    {"check_file_format",
     reinterpret_cast<PyCFunction>(RecordReaderCheckFileFormat), METH_NOARGS,
     R"doc(
check_file_format(self) -> bool
Ensures that the file looks like a valid Riegeli/Records file.
Reading functions already check the file format. check_file_format() can verify
the file format before (or instead of) performing other operations.
This ignores the recovery function. If invalid file contents are skipped, then
checking the file format is meaningless: any file can be read.
Returns:
  True if this looks like a Riegeli/records file. False if the file ends before
  this could be determined.
Raises:
  RiegeliError: If this is not a Riegeli/records file.
)doc"},
    {"read_metadata", reinterpret_cast<PyCFunction>(RecordReaderReadMetadata),
     METH_NOARGS, R"doc(
read_metadata(self) -> RecordsMetadata | None
Returns file metadata.
Record type in metadata can be conveniently interpreted by get_record_type().
read_metadata() must be called while the RecordReader is at the beginning of the
file (calling check_file_format() before is allowed).
Returns:
  File metadata as parsed RecordsMetadata message, or None at end of file.
)doc"},
    {"read_serialized_metadata",
     reinterpret_cast<PyCFunction>(RecordReaderReadSerializedMetadata),
     METH_NOARGS, R"doc(
read_serialized_metadata(self) -> bytes | None
Returns file metadata.
This is like read_metadata(), but metadata is returned in the serialized form.
This is faster if the caller needs metadata already serialized.
Returns:
  File metadata as serialized RecordsMetadata message, or None at end of file.
)doc"},
    {"read_record", reinterpret_cast<PyCFunction>(RecordReaderReadRecord),
     METH_NOARGS, R"doc(
read_record(self) -> bytes | None
Reads the next record.
Returns:
  The record read as bytes, or None at end of file.
)doc"},
    {"read_message", reinterpret_cast<PyCFunction>(RecordReaderReadMessage),
     METH_VARARGS | METH_KEYWORDS, R"doc(
read_message(self, message_type: type[Message]) -> Message | None
Reads the next record.
Args:
  message_type: Type of the message to parse the record as.
Returns:
  The record read as a parsed message, or None at end of file.
)doc"},
    {"read_records", reinterpret_cast<PyCFunction>(RecordReaderReadRecords),
     METH_NOARGS, R"doc(
read_records(self) -> Iterator[bytes]
Returns an iterator which reads all remaining records.
Yields:
  The next record read as bytes.
)doc"},
    {"read_messages", reinterpret_cast<PyCFunction>(RecordReaderReadMessages),
     METH_VARARGS | METH_KEYWORDS, R"doc(
read_messages(self, message_type: type[Message]) -> Iterator[Message]
Returns an iterator which reads all remaining records.
Yields:
  The next record read as parsed message.
)doc"},
    {"set_field_projection",
     reinterpret_cast<PyCFunction>(RecordReaderSetFieldProjection),
     METH_VARARGS | METH_KEYWORDS, R"doc(
set_field_projection(
    self, field_projection: Iterable[Iterable[int]] | None
) -> None
Like field_projection constructor argument, but can be done at any time.
Args:
  field_projection: If not None, the set of fields to be included in returned
    records, allowing to exclude the remaining fields (but does not guarantee
    that they will be excluded). Excluding data makes reading faster. Projection
    is effective if the file has been written with "transpose" in RecordWriter
    options. Additionally, "bucket_fraction" in RecordWriter options with a
    lower value can make reading with projection faster. A field projection is
    specified as an iterable of field paths. A field path is specified as an
    iterable of proto field numbers descending from the root message. A special
    field EXISTENCE_ONLY can be added to the end of the path; it preserves
    field existence but ignores its value; warning: for a repeated field this
    preserves the field count only if the field is not packed.
)doc"},
    {"seek", reinterpret_cast<PyCFunction>(RecordReaderSeek),
     METH_VARARGS | METH_KEYWORDS, R"doc(
seek(self, pos: RecordPosition) -> None
Seeks to a position.
The position should have been obtained by pos for the same file.
Args:
  pos: Seek target.
)doc"},
    {"seek_numeric", reinterpret_cast<PyCFunction>(RecordReaderSeekNumeric),
     METH_VARARGS | METH_KEYWORDS, R"doc(
seek_numeric(self, pos: int) -> None
Seeks to a position.
The position can be any integer between 0 and file size. If it points between
records, it is interpreted as the next record.
Args:
  pos: Seek target.
)doc"},
    {"seek_back", reinterpret_cast<PyCFunction>(RecordReaderSeekBack),
     METH_NOARGS, R"doc(
seek_back(self) -> bool
Seeks back by one record.
Returns:
  If successful, True. Returns False at the beginning of the file.
)doc"},
    {"size", reinterpret_cast<PyCFunction>(RecordReaderSize), METH_NOARGS,
     R"doc(
size(self) -> int
Returns the size of the file in bytes.
This is the position corresponding to its end.
)doc"},
    {"search", reinterpret_cast<PyCFunction>(RecordReaderSearch),
     METH_VARARGS | METH_KEYWORDS,
     R"doc(
search(self, test: Callable[[RecordReader], int | None]) -> None
Searches the file for a desired record, or for a desired position between
records, given that it is possible to determine whether a given record is before
or after the desired position.
The current position before calling search() does not matter.
Args:
  test: A function which takes the RecordReader as a parameter, seeked to some
    record, and returns an int or None:
     * < 0: The current record is before the desired position.
     * == 0: The current record is desired, searching can stop.
     * > 0: The current record is after the desired position.
     * None: It could not be determined which is the case. The current record
       will be skipped.
    It can also raise StopIteration to cancel the search.
Preconditions:
 * All < 0 records precede all == 0 records.
 * All == 0 records precede all > 0 records.
 * All < 0 records precede all > 0 records, even if there are no == 0 records.
Return values:
 * 0: There is some == 0 record, and search() points to some such record.
 * 1: There are no == 0 records but there is some > 0 record, and search()
   points to the earliest such record.
 * -1: There are no == 0 nor > 0 records, but there is some < 0 record, and
   search() points to the end of file.
 * None: All records are None, and search() points to the end of file,
   or search() was cancelled.
To find the earliest == 0 record instead of an arbitrary one, test() can be
changed to return > 0 in place of == 0.
Further guarantees:
 * If a test() returns == 0, search() points back to the record before test()
   and returns.
 * If a test() returns < 0, test() will not be called again at earlier
   positions.
 * If a test() returns > 0, test() will not be called again at later positions.
 * test() will not be called again at the same position.
It follows that if a test() returns == 0 or > 0, search() points to the record
before the last test() call with one of these results. This allows to
communicate additional context of a == 0 or > 0 result by a side effect of
test().
)doc"},
    {"search_for_record",
     reinterpret_cast<PyCFunction>(RecordReaderSearchForRecord),
     METH_VARARGS | METH_KEYWORDS,
     R"doc(
search_for_record(self, test: Callable[[bytes], int | None]) -> None
A variant of search() which reads a record before calling test(), instead of
letting test() read the record.
Args:
  test: A function which takes the record read as bytes as a parameter, and
    returns an int or None, like in search().
)doc"},
    {"search_for_message",
     reinterpret_cast<PyCFunction>(RecordReaderSearchForMessage),
     METH_VARARGS | METH_KEYWORDS,
     R"doc(
search_for_message(
    self, message_type: type[Message],
    test: Callable[[Message], int | None]
) -> None
A variant of search() which reads a record before calling test(), instead of
letting test() read the record.
Args:
  message_type: Type of the message to parse the record as.
  test: A function which takes the record read as a parsed message as a
    parameter, and returns an int or None, like in search().
)doc"},
    // Sentinel entry terminating the method table.
    {nullptr, nullptr, 0, nullptr},
};
const PyGetSetDef RecordReaderGetSet[] = {
{const_cast("src"), reinterpret_cast